cuda-cccl 0.1.3.1.0.dev1678__cp313-cp313-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cuda-cccl might be problematic. Click here for more details.

Files changed (1860):
  1. cuda/cccl/__init__.py +14 -0
  2. cuda/cccl/cooperative/__init__.py +3 -0
  3. cuda/cccl/cooperative/experimental/__init__.py +8 -0
  4. cuda/cccl/cooperative/experimental/_caching.py +48 -0
  5. cuda/cccl/cooperative/experimental/_common.py +273 -0
  6. cuda/cccl/cooperative/experimental/_nvrtc.py +91 -0
  7. cuda/cccl/cooperative/experimental/_scan_op.py +181 -0
  8. cuda/cccl/cooperative/experimental/_types.py +935 -0
  9. cuda/cccl/cooperative/experimental/_typing.py +107 -0
  10. cuda/cccl/cooperative/experimental/block/__init__.py +33 -0
  11. cuda/cccl/cooperative/experimental/block/_block_load_store.py +215 -0
  12. cuda/cccl/cooperative/experimental/block/_block_merge_sort.py +125 -0
  13. cuda/cccl/cooperative/experimental/block/_block_radix_sort.py +214 -0
  14. cuda/cccl/cooperative/experimental/block/_block_reduce.py +294 -0
  15. cuda/cccl/cooperative/experimental/block/_block_scan.py +983 -0
  16. cuda/cccl/cooperative/experimental/warp/__init__.py +9 -0
  17. cuda/cccl/cooperative/experimental/warp/_warp_merge_sort.py +98 -0
  18. cuda/cccl/cooperative/experimental/warp/_warp_reduce.py +153 -0
  19. cuda/cccl/cooperative/experimental/warp/_warp_scan.py +78 -0
  20. cuda/cccl/headers/__init__.py +7 -0
  21. cuda/cccl/headers/include/__init__.py +1 -0
  22. cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +261 -0
  23. cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +1181 -0
  24. cuda/cccl/headers/include/cub/agent/agent_for.cuh +84 -0
  25. cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +925 -0
  26. cuda/cccl/headers/include/cub/agent/agent_merge.cuh +227 -0
  27. cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +753 -0
  28. cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +766 -0
  29. cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +286 -0
  30. cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +704 -0
  31. cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +557 -0
  32. cuda/cccl/headers/include/cub/agent/agent_reduce.cuh +678 -0
  33. cuda/cccl/headers/include/cub/agent/agent_reduce_by_key.cuh +804 -0
  34. cuda/cccl/headers/include/cub/agent/agent_rle.cuh +997 -0
  35. cuda/cccl/headers/include/cub/agent/agent_scan.cuh +561 -0
  36. cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +473 -0
  37. cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +292 -0
  38. cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +1032 -0
  39. cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +342 -0
  40. cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +592 -0
  41. cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +614 -0
  42. cuda/cccl/headers/include/cub/agent/single_pass_scan_operators.cuh +1346 -0
  43. cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +965 -0
  44. cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +1217 -0
  45. cuda/cccl/headers/include/cub/block/block_exchange.cuh +1306 -0
  46. cuda/cccl/headers/include/cub/block/block_histogram.cuh +420 -0
  47. cuda/cccl/headers/include/cub/block/block_load.cuh +1259 -0
  48. cuda/cccl/headers/include/cub/block/block_merge_sort.cuh +787 -0
  49. cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +1218 -0
  50. cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +2193 -0
  51. cuda/cccl/headers/include/cub/block/block_raking_layout.cuh +150 -0
  52. cuda/cccl/headers/include/cub/block/block_reduce.cuh +629 -0
  53. cuda/cccl/headers/include/cub/block/block_run_length_decode.cuh +437 -0
  54. cuda/cccl/headers/include/cub/block/block_scan.cuh +2583 -0
  55. cuda/cccl/headers/include/cub/block/block_shuffle.cuh +346 -0
  56. cuda/cccl/headers/include/cub/block/block_store.cuh +1246 -0
  57. cuda/cccl/headers/include/cub/block/radix_rank_sort_operations.cuh +620 -0
  58. cuda/cccl/headers/include/cub/block/specializations/block_histogram_atomic.cuh +86 -0
  59. cuda/cccl/headers/include/cub/block/specializations/block_histogram_sort.cuh +240 -0
  60. cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking.cuh +252 -0
  61. cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking_commutative_only.cuh +238 -0
  62. cuda/cccl/headers/include/cub/block/specializations/block_reduce_warp_reductions.cuh +259 -0
  63. cuda/cccl/headers/include/cub/block/specializations/block_scan_raking.cuh +790 -0
  64. cuda/cccl/headers/include/cub/block/specializations/block_scan_warp_scans.cuh +538 -0
  65. cuda/cccl/headers/include/cub/config.cuh +60 -0
  66. cuda/cccl/headers/include/cub/cub.cuh +112 -0
  67. cuda/cccl/headers/include/cub/detail/array_utils.cuh +77 -0
  68. cuda/cccl/headers/include/cub/detail/choose_offset.cuh +155 -0
  69. cuda/cccl/headers/include/cub/detail/detect_cuda_runtime.cuh +93 -0
  70. cuda/cccl/headers/include/cub/detail/device_double_buffer.cuh +96 -0
  71. cuda/cccl/headers/include/cub/detail/fast_modulo_division.cuh +246 -0
  72. cuda/cccl/headers/include/cub/detail/launcher/cuda_driver.cuh +120 -0
  73. cuda/cccl/headers/include/cub/detail/launcher/cuda_runtime.cuh +74 -0
  74. cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +118 -0
  75. cuda/cccl/headers/include/cub/detail/ptx-json/README.md +71 -0
  76. cuda/cccl/headers/include/cub/detail/ptx-json/array.h +68 -0
  77. cuda/cccl/headers/include/cub/detail/ptx-json/json.h +61 -0
  78. cuda/cccl/headers/include/cub/detail/ptx-json/object.h +100 -0
  79. cuda/cccl/headers/include/cub/detail/ptx-json/string.h +71 -0
  80. cuda/cccl/headers/include/cub/detail/ptx-json/value.h +93 -0
  81. cuda/cccl/headers/include/cub/detail/ptx-json-parser.h +63 -0
  82. cuda/cccl/headers/include/cub/detail/rfa.cuh +724 -0
  83. cuda/cccl/headers/include/cub/detail/strong_load.cuh +189 -0
  84. cuda/cccl/headers/include/cub/detail/strong_store.cuh +220 -0
  85. cuda/cccl/headers/include/cub/detail/temporary_storage.cuh +384 -0
  86. cuda/cccl/headers/include/cub/detail/type_traits.cuh +206 -0
  87. cuda/cccl/headers/include/cub/detail/uninitialized_copy.cuh +72 -0
  88. cuda/cccl/headers/include/cub/detail/unsafe_bitcast.cuh +56 -0
  89. cuda/cccl/headers/include/cub/device/device_adjacent_difference.cuh +596 -0
  90. cuda/cccl/headers/include/cub/device/device_copy.cuh +187 -0
  91. cuda/cccl/headers/include/cub/device/device_for.cuh +990 -0
  92. cuda/cccl/headers/include/cub/device/device_histogram.cuh +1507 -0
  93. cuda/cccl/headers/include/cub/device/device_memcpy.cuh +195 -0
  94. cuda/cccl/headers/include/cub/device/device_merge.cuh +202 -0
  95. cuda/cccl/headers/include/cub/device/device_merge_sort.cuh +979 -0
  96. cuda/cccl/headers/include/cub/device/device_partition.cuh +664 -0
  97. cuda/cccl/headers/include/cub/device/device_radix_sort.cuh +3435 -0
  98. cuda/cccl/headers/include/cub/device/device_reduce.cuh +1815 -0
  99. cuda/cccl/headers/include/cub/device/device_run_length_encode.cuh +368 -0
  100. cuda/cccl/headers/include/cub/device/device_scan.cuh +1901 -0
  101. cuda/cccl/headers/include/cub/device/device_segmented_radix_sort.cuh +1496 -0
  102. cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +1512 -0
  103. cuda/cccl/headers/include/cub/device/device_segmented_sort.cuh +2811 -0
  104. cuda/cccl/headers/include/cub/device/device_select.cuh +1224 -0
  105. cuda/cccl/headers/include/cub/device/device_transform.cuh +313 -0
  106. cuda/cccl/headers/include/cub/device/dispatch/dispatch_adjacent_difference.cuh +314 -0
  107. cuda/cccl/headers/include/cub/device/dispatch/dispatch_advance_iterators.cuh +109 -0
  108. cuda/cccl/headers/include/cub/device/dispatch/dispatch_batch_memcpy.cuh +718 -0
  109. cuda/cccl/headers/include/cub/device/dispatch/dispatch_common.cuh +45 -0
  110. cuda/cccl/headers/include/cub/device/dispatch/dispatch_for.cuh +197 -0
  111. cuda/cccl/headers/include/cub/device/dispatch/dispatch_histogram.cuh +1051 -0
  112. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +305 -0
  113. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge_sort.cuh +473 -0
  114. cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +1748 -0
  115. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +1316 -0
  116. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_by_key.cuh +625 -0
  117. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +497 -0
  118. cuda/cccl/headers/include/cub/device/dispatch/dispatch_rle.cuh +548 -0
  119. cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan.cuh +497 -0
  120. cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan_by_key.cuh +598 -0
  121. cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +1374 -0
  122. cuda/cccl/headers/include/cub/device/dispatch/dispatch_select_if.cuh +838 -0
  123. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +341 -0
  124. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +439 -0
  125. cuda/cccl/headers/include/cub/device/dispatch/dispatch_three_way_partition.cuh +552 -0
  126. cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +467 -0
  127. cuda/cccl/headers/include/cub/device/dispatch/dispatch_unique_by_key.cuh +543 -0
  128. cuda/cccl/headers/include/cub/device/dispatch/kernels/for_each.cuh +218 -0
  129. cuda/cccl/headers/include/cub/device/dispatch/kernels/histogram.cuh +505 -0
  130. cuda/cccl/headers/include/cub/device/dispatch/kernels/merge_sort.cuh +338 -0
  131. cuda/cccl/headers/include/cub/device/dispatch/kernels/radix_sort.cuh +799 -0
  132. cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +525 -0
  133. cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +194 -0
  134. cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +330 -0
  135. cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +936 -0
  136. cuda/cccl/headers/include/cub/device/dispatch/kernels/unique_by_key.cuh +176 -0
  137. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +70 -0
  138. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +121 -0
  139. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +63 -0
  140. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +278 -0
  141. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +91 -0
  142. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +118 -0
  143. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +1068 -0
  144. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce.cuh +397 -0
  145. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +945 -0
  146. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +675 -0
  147. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +555 -0
  148. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +1013 -0
  149. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +249 -0
  150. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_select_if.cuh +1587 -0
  151. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +407 -0
  152. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +353 -0
  153. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +874 -0
  154. cuda/cccl/headers/include/cub/grid/grid_even_share.cuh +215 -0
  155. cuda/cccl/headers/include/cub/grid/grid_mapping.cuh +106 -0
  156. cuda/cccl/headers/include/cub/grid/grid_queue.cuh +202 -0
  157. cuda/cccl/headers/include/cub/iterator/arg_index_input_iterator.cuh +256 -0
  158. cuda/cccl/headers/include/cub/iterator/cache_modified_input_iterator.cuh +238 -0
  159. cuda/cccl/headers/include/cub/iterator/cache_modified_output_iterator.cuh +252 -0
  160. cuda/cccl/headers/include/cub/iterator/tex_obj_input_iterator.cuh +322 -0
  161. cuda/cccl/headers/include/cub/thread/thread_load.cuh +347 -0
  162. cuda/cccl/headers/include/cub/thread/thread_operators.cuh +629 -0
  163. cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +504 -0
  164. cuda/cccl/headers/include/cub/thread/thread_scan.cuh +340 -0
  165. cuda/cccl/headers/include/cub/thread/thread_search.cuh +198 -0
  166. cuda/cccl/headers/include/cub/thread/thread_simd.cuh +406 -0
  167. cuda/cccl/headers/include/cub/thread/thread_sort.cuh +101 -0
  168. cuda/cccl/headers/include/cub/thread/thread_store.cuh +364 -0
  169. cuda/cccl/headers/include/cub/util_allocator.cuh +921 -0
  170. cuda/cccl/headers/include/cub/util_arch.cuh +167 -0
  171. cuda/cccl/headers/include/cub/util_cpp_dialect.cuh +95 -0
  172. cuda/cccl/headers/include/cub/util_debug.cuh +207 -0
  173. cuda/cccl/headers/include/cub/util_device.cuh +779 -0
  174. cuda/cccl/headers/include/cub/util_macro.cuh +91 -0
  175. cuda/cccl/headers/include/cub/util_math.cuh +115 -0
  176. cuda/cccl/headers/include/cub/util_namespace.cuh +176 -0
  177. cuda/cccl/headers/include/cub/util_policy_wrapper_t.cuh +55 -0
  178. cuda/cccl/headers/include/cub/util_ptx.cuh +513 -0
  179. cuda/cccl/headers/include/cub/util_temporary_storage.cuh +122 -0
  180. cuda/cccl/headers/include/cub/util_type.cuh +1136 -0
  181. cuda/cccl/headers/include/cub/util_vsmem.cuh +251 -0
  182. cuda/cccl/headers/include/cub/version.cuh +89 -0
  183. cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_shfl.cuh +329 -0
  184. cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_smem.cuh +177 -0
  185. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +729 -0
  186. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +405 -0
  187. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +688 -0
  188. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_smem.cuh +437 -0
  189. cuda/cccl/headers/include/cub/warp/warp_exchange.cuh +405 -0
  190. cuda/cccl/headers/include/cub/warp/warp_load.cuh +614 -0
  191. cuda/cccl/headers/include/cub/warp/warp_merge_sort.cuh +169 -0
  192. cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +822 -0
  193. cuda/cccl/headers/include/cub/warp/warp_scan.cuh +1156 -0
  194. cuda/cccl/headers/include/cub/warp/warp_store.cuh +520 -0
  195. cuda/cccl/headers/include/cuda/__annotated_ptr/access_property.h +165 -0
  196. cuda/cccl/headers/include/cuda/__annotated_ptr/access_property_encoding.h +172 -0
  197. cuda/cccl/headers/include/cuda/__annotated_ptr/annotated_ptr.h +210 -0
  198. cuda/cccl/headers/include/cuda/__annotated_ptr/annotated_ptr_base.h +100 -0
  199. cuda/cccl/headers/include/cuda/__annotated_ptr/apply_access_property.h +84 -0
  200. cuda/cccl/headers/include/cuda/__annotated_ptr/associate_access_property.h +127 -0
  201. cuda/cccl/headers/include/cuda/__annotated_ptr/createpolicy.h +209 -0
  202. cuda/cccl/headers/include/cuda/__atomic/atomic.h +145 -0
  203. cuda/cccl/headers/include/cuda/__barrier/aligned_size.h +61 -0
  204. cuda/cccl/headers/include/cuda/__barrier/async_contract_fulfillment.h +39 -0
  205. cuda/cccl/headers/include/cuda/__barrier/barrier.h +65 -0
  206. cuda/cccl/headers/include/cuda/__barrier/barrier_arrive_tx.h +100 -0
  207. cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +454 -0
  208. cuda/cccl/headers/include/cuda/__barrier/barrier_expect_tx.h +72 -0
  209. cuda/cccl/headers/include/cuda/__barrier/barrier_native_handle.h +45 -0
  210. cuda/cccl/headers/include/cuda/__barrier/barrier_thread_scope.h +60 -0
  211. cuda/cccl/headers/include/cuda/__bit/bit_reverse.h +171 -0
  212. cuda/cccl/headers/include/cuda/__bit/bitfield.h +121 -0
  213. cuda/cccl/headers/include/cuda/__bit/bitmask.h +88 -0
  214. cuda/cccl/headers/include/cuda/__cccl_config +36 -0
  215. cuda/cccl/headers/include/cuda/__cmath/ceil_div.h +124 -0
  216. cuda/cccl/headers/include/cuda/__cmath/ilog.h +195 -0
  217. cuda/cccl/headers/include/cuda/__cmath/ipow.h +107 -0
  218. cuda/cccl/headers/include/cuda/__cmath/isqrt.h +80 -0
  219. cuda/cccl/headers/include/cuda/__cmath/neg.h +47 -0
  220. cuda/cccl/headers/include/cuda/__cmath/pow2.h +74 -0
  221. cuda/cccl/headers/include/cuda/__cmath/round_down.h +102 -0
  222. cuda/cccl/headers/include/cuda/__cmath/round_up.h +104 -0
  223. cuda/cccl/headers/include/cuda/__cmath/uabs.h +57 -0
  224. cuda/cccl/headers/include/cuda/__execution/determinism.h +90 -0
  225. cuda/cccl/headers/include/cuda/__execution/require.h +74 -0
  226. cuda/cccl/headers/include/cuda/__execution/tune.h +69 -0
  227. cuda/cccl/headers/include/cuda/__functional/address_stability.h +131 -0
  228. cuda/cccl/headers/include/cuda/__functional/for_each_canceled.h +276 -0
  229. cuda/cccl/headers/include/cuda/__functional/get_device_address.h +58 -0
  230. cuda/cccl/headers/include/cuda/__functional/maximum.h +58 -0
  231. cuda/cccl/headers/include/cuda/__functional/minimum.h +58 -0
  232. cuda/cccl/headers/include/cuda/__functional/proclaim_return_type.h +108 -0
  233. cuda/cccl/headers/include/cuda/__fwd/barrier.h +38 -0
  234. cuda/cccl/headers/include/cuda/__fwd/barrier_native_handle.h +42 -0
  235. cuda/cccl/headers/include/cuda/__fwd/get_stream.h +38 -0
  236. cuda/cccl/headers/include/cuda/__fwd/pipeline.h +37 -0
  237. cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +257 -0
  238. cuda/cccl/headers/include/cuda/__iterator/counting_iterator.h +460 -0
  239. cuda/cccl/headers/include/cuda/__iterator/discard_iterator.h +314 -0
  240. cuda/cccl/headers/include/cuda/__iterator/permutation_iterator.h +421 -0
  241. cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +321 -0
  242. cuda/cccl/headers/include/cuda/__iterator/tabulate_output_iterator.h +333 -0
  243. cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +465 -0
  244. cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +456 -0
  245. cuda/cccl/headers/include/cuda/__latch/latch.h +44 -0
  246. cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +462 -0
  247. cuda/cccl/headers/include/cuda/__mdspan/host_device_mdspan.h +63 -0
  248. cuda/cccl/headers/include/cuda/__mdspan/restrict_accessor.h +122 -0
  249. cuda/cccl/headers/include/cuda/__mdspan/restrict_mdspan.h +51 -0
  250. cuda/cccl/headers/include/cuda/__memcpy_async/check_preconditions.h +79 -0
  251. cuda/cccl/headers/include/cuda/__memcpy_async/completion_mechanism.h +47 -0
  252. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_bulk_shared_global.h +60 -0
  253. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_fallback.h +72 -0
  254. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_shared_global.h +98 -0
  255. cuda/cccl/headers/include/cuda/__memcpy_async/dispatch_memcpy_async.h +162 -0
  256. cuda/cccl/headers/include/cuda/__memcpy_async/is_local_smem_barrier.h +49 -0
  257. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async.h +179 -0
  258. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_barrier.h +99 -0
  259. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_tx.h +99 -0
  260. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_completion.h +170 -0
  261. cuda/cccl/headers/include/cuda/__memcpy_async/try_get_barrier_handle.h +59 -0
  262. cuda/cccl/headers/include/cuda/__memory/address_space.h +86 -0
  263. cuda/cccl/headers/include/cuda/__memory/align_down.h +56 -0
  264. cuda/cccl/headers/include/cuda/__memory/align_up.h +56 -0
  265. cuda/cccl/headers/include/cuda/__memory/is_aligned.h +47 -0
  266. cuda/cccl/headers/include/cuda/__memory/ptr_rebind.h +75 -0
  267. cuda/cccl/headers/include/cuda/__memory_resource/get_memory_resource.h +94 -0
  268. cuda/cccl/headers/include/cuda/__memory_resource/get_property.h +157 -0
  269. cuda/cccl/headers/include/cuda/__memory_resource/properties.h +73 -0
  270. cuda/cccl/headers/include/cuda/__memory_resource/resource.h +129 -0
  271. cuda/cccl/headers/include/cuda/__memory_resource/resource_ref.h +653 -0
  272. cuda/cccl/headers/include/cuda/__numeric/narrow.h +108 -0
  273. cuda/cccl/headers/include/cuda/__numeric/overflow_cast.h +57 -0
  274. cuda/cccl/headers/include/cuda/__numeric/overflow_result.h +43 -0
  275. cuda/cccl/headers/include/cuda/__nvtx/nvtx.h +101 -0
  276. cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +2982 -0
  277. cuda/cccl/headers/include/cuda/__ptx/instructions/barrier_cluster.h +43 -0
  278. cuda/cccl/headers/include/cuda/__ptx/instructions/bfind.h +41 -0
  279. cuda/cccl/headers/include/cuda/__ptx/instructions/bmsk.h +41 -0
  280. cuda/cccl/headers/include/cuda/__ptx/instructions/clusterlaunchcontrol.h +41 -0
  281. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk.h +44 -0
  282. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_commit_group.h +43 -0
  283. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h +45 -0
  284. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_wait_group.h +43 -0
  285. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_mbarrier_arrive.h +42 -0
  286. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_reduce_async_bulk.h +60 -0
  287. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_reduce_async_bulk_tensor.h +43 -0
  288. cuda/cccl/headers/include/cuda/__ptx/instructions/elect_sync.h +41 -0
  289. cuda/cccl/headers/include/cuda/__ptx/instructions/exit.h +41 -0
  290. cuda/cccl/headers/include/cuda/__ptx/instructions/fence.h +49 -0
  291. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/barrier_cluster.h +115 -0
  292. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/bfind.h +190 -0
  293. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/bmsk.h +54 -0
  294. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/clusterlaunchcontrol.h +240 -0
  295. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk.h +193 -0
  296. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.h +25 -0
  297. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.h +52 -0
  298. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.h +957 -0
  299. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_gather_scatter.h +288 -0
  300. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.h +596 -0
  301. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.h +46 -0
  302. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive.h +26 -0
  303. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive_noinc.h +26 -0
  304. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.h +1445 -0
  305. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.h +132 -0
  306. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.h +117 -0
  307. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.h +601 -0
  308. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/elect_sync.h +36 -0
  309. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/exit.h +25 -0
  310. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence.h +208 -0
  311. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.h +31 -0
  312. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_alias.h +25 -0
  313. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_async.h +58 -0
  314. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_async_generic_sync_restrict.h +62 -0
  315. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.h +101 -0
  316. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_sync_restrict.h +62 -0
  317. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/get_sreg.h +949 -0
  318. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/getctarank.h +32 -0
  319. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/ld.h +15074 -0
  320. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive.h +385 -0
  321. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.h +176 -0
  322. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.h +34 -0
  323. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_expect_tx.h +94 -0
  324. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_init.h +27 -0
  325. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.h +137 -0
  326. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.h +138 -0
  327. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.h +280 -0
  328. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.h +282 -0
  329. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_ld_reduce.h +2148 -0
  330. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_red.h +1272 -0
  331. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_st.h +228 -0
  332. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/prmt.h +230 -0
  333. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/red_async.h +430 -0
  334. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/shl.h +96 -0
  335. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/shr.h +168 -0
  336. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st.h +1830 -0
  337. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st_async.h +123 -0
  338. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st_bulk.h +31 -0
  339. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_alloc.h +105 -0
  340. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_commit.h +81 -0
  341. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_cp.h +612 -0
  342. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_fence.h +44 -0
  343. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_ld.h +4446 -0
  344. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_mma.h +4061 -0
  345. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_mma_ws.h +6438 -0
  346. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_shift.h +36 -0
  347. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_st.h +4582 -0
  348. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_wait.h +44 -0
  349. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.h +67 -0
  350. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tensormap_replace.h +750 -0
  351. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/trap.h +25 -0
  352. cuda/cccl/headers/include/cuda/__ptx/instructions/get_sreg.h +43 -0
  353. cuda/cccl/headers/include/cuda/__ptx/instructions/getctarank.h +43 -0
  354. cuda/cccl/headers/include/cuda/__ptx/instructions/ld.h +41 -0
  355. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_arrive.h +45 -0
  356. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_expect_tx.h +41 -0
  357. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_init.h +43 -0
  358. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_wait.h +46 -0
  359. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_ld_reduce.h +41 -0
  360. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_red.h +41 -0
  361. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_st.h +41 -0
  362. cuda/cccl/headers/include/cuda/__ptx/instructions/prmt.h +41 -0
  363. cuda/cccl/headers/include/cuda/__ptx/instructions/red_async.h +43 -0
  364. cuda/cccl/headers/include/cuda/__ptx/instructions/shfl_sync.h +244 -0
  365. cuda/cccl/headers/include/cuda/__ptx/instructions/shl.h +41 -0
  366. cuda/cccl/headers/include/cuda/__ptx/instructions/shr.h +41 -0
  367. cuda/cccl/headers/include/cuda/__ptx/instructions/st.h +41 -0
  368. cuda/cccl/headers/include/cuda/__ptx/instructions/st_async.h +43 -0
  369. cuda/cccl/headers/include/cuda/__ptx/instructions/st_bulk.h +41 -0
  370. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_alloc.h +41 -0
  371. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_commit.h +41 -0
  372. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_cp.h +41 -0
  373. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_fence.h +41 -0
  374. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_ld.h +41 -0
  375. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_mma.h +41 -0
  376. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_mma_ws.h +41 -0
  377. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_shift.h +41 -0
  378. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_st.h +41 -0
  379. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_wait.h +41 -0
  380. cuda/cccl/headers/include/cuda/__ptx/instructions/tensormap_cp_fenceproxy.h +43 -0
  381. cuda/cccl/headers/include/cuda/__ptx/instructions/tensormap_replace.h +43 -0
  382. cuda/cccl/headers/include/cuda/__ptx/instructions/trap.h +41 -0
  383. cuda/cccl/headers/include/cuda/__ptx/ptx_dot_variants.h +230 -0
  384. cuda/cccl/headers/include/cuda/__ptx/ptx_helper_functions.h +151 -0
  385. cuda/cccl/headers/include/cuda/__semaphore/counting_semaphore.h +53 -0
  386. cuda/cccl/headers/include/cuda/__stream/get_stream.h +110 -0
  387. cuda/cccl/headers/include/cuda/__stream/stream_ref.h +163 -0
  388. cuda/cccl/headers/include/cuda/__type_traits/is_floating_point.h +47 -0
  389. cuda/cccl/headers/include/cuda/__utility/static_for.h +74 -0
  390. cuda/cccl/headers/include/cuda/__warp/lane_mask.h +326 -0
  391. cuda/cccl/headers/include/cuda/__warp/warp_match_all.h +65 -0
  392. cuda/cccl/headers/include/cuda/__warp/warp_shuffle.h +249 -0
  393. cuda/cccl/headers/include/cuda/access_property +26 -0
  394. cuda/cccl/headers/include/cuda/annotated_ptr +29 -0
  395. cuda/cccl/headers/include/cuda/atomic +27 -0
  396. cuda/cccl/headers/include/cuda/barrier +262 -0
  397. cuda/cccl/headers/include/cuda/bit +29 -0
  398. cuda/cccl/headers/include/cuda/cmath +35 -0
  399. cuda/cccl/headers/include/cuda/discard_memory +60 -0
  400. cuda/cccl/headers/include/cuda/functional +31 -0
  401. cuda/cccl/headers/include/cuda/iterator +34 -0
  402. cuda/cccl/headers/include/cuda/latch +27 -0
  403. cuda/cccl/headers/include/cuda/mdspan +28 -0
  404. cuda/cccl/headers/include/cuda/memory +32 -0
  405. cuda/cccl/headers/include/cuda/memory_resource +41 -0
  406. cuda/cccl/headers/include/cuda/numeric +28 -0
  407. cuda/cccl/headers/include/cuda/pipeline +577 -0
  408. cuda/cccl/headers/include/cuda/ptx +124 -0
  409. cuda/cccl/headers/include/cuda/semaphore +31 -0
  410. cuda/cccl/headers/include/cuda/std/__algorithm/adjacent_find.h +59 -0
  411. cuda/cccl/headers/include/cuda/std/__algorithm/all_of.h +45 -0
  412. cuda/cccl/headers/include/cuda/std/__algorithm/any_of.h +45 -0
  413. cuda/cccl/headers/include/cuda/std/__algorithm/binary_search.h +52 -0
  414. cuda/cccl/headers/include/cuda/std/__algorithm/clamp.h +48 -0
  415. cuda/cccl/headers/include/cuda/std/__algorithm/comp.h +64 -0
  416. cuda/cccl/headers/include/cuda/std/__algorithm/comp_ref_type.h +85 -0
  417. cuda/cccl/headers/include/cuda/std/__algorithm/copy.h +142 -0
  418. cuda/cccl/headers/include/cuda/std/__algorithm/copy_backward.h +80 -0
  419. cuda/cccl/headers/include/cuda/std/__algorithm/copy_if.h +47 -0
  420. cuda/cccl/headers/include/cuda/std/__algorithm/copy_n.h +73 -0
  421. cuda/cccl/headers/include/cuda/std/__algorithm/count.h +49 -0
  422. cuda/cccl/headers/include/cuda/std/__algorithm/count_if.h +49 -0
  423. cuda/cccl/headers/include/cuda/std/__algorithm/equal.h +128 -0
  424. cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +101 -0
  425. cuda/cccl/headers/include/cuda/std/__algorithm/fill.h +58 -0
  426. cuda/cccl/headers/include/cuda/std/__algorithm/fill_n.h +51 -0
  427. cuda/cccl/headers/include/cuda/std/__algorithm/find.h +62 -0
  428. cuda/cccl/headers/include/cuda/std/__algorithm/find_end.h +225 -0
  429. cuda/cccl/headers/include/cuda/std/__algorithm/find_first_of.h +73 -0
  430. cuda/cccl/headers/include/cuda/std/__algorithm/find_if.h +46 -0
  431. cuda/cccl/headers/include/cuda/std/__algorithm/find_if_not.h +46 -0
  432. cuda/cccl/headers/include/cuda/std/__algorithm/for_each.h +42 -0
  433. cuda/cccl/headers/include/cuda/std/__algorithm/for_each_n.h +48 -0
  434. cuda/cccl/headers/include/cuda/std/__algorithm/generate.h +41 -0
  435. cuda/cccl/headers/include/cuda/std/__algorithm/generate_n.h +46 -0
  436. cuda/cccl/headers/include/cuda/std/__algorithm/half_positive.h +49 -0
  437. cuda/cccl/headers/include/cuda/std/__algorithm/includes.h +92 -0
  438. cuda/cccl/headers/include/cuda/std/__algorithm/is_heap.h +50 -0
  439. cuda/cccl/headers/include/cuda/std/__algorithm/is_heap_until.h +83 -0
  440. cuda/cccl/headers/include/cuda/std/__algorithm/is_partitioned.h +57 -0
  441. cuda/cccl/headers/include/cuda/std/__algorithm/is_permutation.h +252 -0
  442. cuda/cccl/headers/include/cuda/std/__algorithm/is_sorted.h +49 -0
  443. cuda/cccl/headers/include/cuda/std/__algorithm/is_sorted_until.h +68 -0
  444. cuda/cccl/headers/include/cuda/std/__algorithm/iter_swap.h +82 -0
  445. cuda/cccl/headers/include/cuda/std/__algorithm/iterator_operations.h +185 -0
  446. cuda/cccl/headers/include/cuda/std/__algorithm/lexicographical_compare.h +68 -0
  447. cuda/cccl/headers/include/cuda/std/__algorithm/lower_bound.h +82 -0
  448. cuda/cccl/headers/include/cuda/std/__algorithm/make_heap.h +70 -0
  449. cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +96 -0
  450. cuda/cccl/headers/include/cuda/std/__algorithm/max.h +62 -0
  451. cuda/cccl/headers/include/cuda/std/__algorithm/max_element.h +68 -0
  452. cuda/cccl/headers/include/cuda/std/__algorithm/merge.h +89 -0
  453. cuda/cccl/headers/include/cuda/std/__algorithm/min.h +62 -0
  454. cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +87 -0
  455. cuda/cccl/headers/include/cuda/std/__algorithm/minmax.h +66 -0
  456. cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +140 -0
  457. cuda/cccl/headers/include/cuda/std/__algorithm/mismatch.h +83 -0
  458. cuda/cccl/headers/include/cuda/std/__algorithm/move.h +87 -0
  459. cuda/cccl/headers/include/cuda/std/__algorithm/move_backward.h +84 -0
  460. cuda/cccl/headers/include/cuda/std/__algorithm/next_permutation.h +88 -0
  461. cuda/cccl/headers/include/cuda/std/__algorithm/none_of.h +45 -0
  462. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort.h +102 -0
  463. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort_copy.h +122 -0
  464. cuda/cccl/headers/include/cuda/std/__algorithm/partition.h +120 -0
  465. cuda/cccl/headers/include/cuda/std/__algorithm/partition_copy.h +59 -0
  466. cuda/cccl/headers/include/cuda/std/__algorithm/partition_point.h +61 -0
  467. cuda/cccl/headers/include/cuda/std/__algorithm/pop_heap.h +94 -0
  468. cuda/cccl/headers/include/cuda/std/__algorithm/prev_permutation.h +88 -0
  469. cuda/cccl/headers/include/cuda/std/__algorithm/push_heap.h +101 -0
  470. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_iterator_concept.h +65 -0
  471. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_min.h +98 -0
  472. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_min_element.h +68 -0
  473. cuda/cccl/headers/include/cuda/std/__algorithm/remove.h +55 -0
  474. cuda/cccl/headers/include/cuda/std/__algorithm/remove_copy.h +47 -0
  475. cuda/cccl/headers/include/cuda/std/__algorithm/remove_copy_if.h +47 -0
  476. cuda/cccl/headers/include/cuda/std/__algorithm/remove_if.h +56 -0
  477. cuda/cccl/headers/include/cuda/std/__algorithm/replace.h +45 -0
  478. cuda/cccl/headers/include/cuda/std/__algorithm/replace_copy.h +54 -0
  479. cuda/cccl/headers/include/cuda/std/__algorithm/replace_copy_if.h +50 -0
  480. cuda/cccl/headers/include/cuda/std/__algorithm/replace_if.h +45 -0
  481. cuda/cccl/headers/include/cuda/std/__algorithm/reverse.h +81 -0
  482. cuda/cccl/headers/include/cuda/std/__algorithm/reverse_copy.h +43 -0
  483. cuda/cccl/headers/include/cuda/std/__algorithm/rotate.h +261 -0
  484. cuda/cccl/headers/include/cuda/std/__algorithm/rotate_copy.h +40 -0
  485. cuda/cccl/headers/include/cuda/std/__algorithm/search.h +185 -0
  486. cuda/cccl/headers/include/cuda/std/__algorithm/search_n.h +163 -0
  487. cuda/cccl/headers/include/cuda/std/__algorithm/set_difference.h +95 -0
  488. cuda/cccl/headers/include/cuda/std/__algorithm/set_intersection.h +122 -0
  489. cuda/cccl/headers/include/cuda/std/__algorithm/set_symmetric_difference.h +134 -0
  490. cuda/cccl/headers/include/cuda/std/__algorithm/set_union.h +128 -0
  491. cuda/cccl/headers/include/cuda/std/__algorithm/shift_left.h +84 -0
  492. cuda/cccl/headers/include/cuda/std/__algorithm/shift_right.h +144 -0
  493. cuda/cccl/headers/include/cuda/std/__algorithm/sift_down.h +139 -0
  494. cuda/cccl/headers/include/cuda/std/__algorithm/sort_heap.h +70 -0
  495. cuda/cccl/headers/include/cuda/std/__algorithm/swap_ranges.h +78 -0
  496. cuda/cccl/headers/include/cuda/std/__algorithm/transform.h +59 -0
  497. cuda/cccl/headers/include/cuda/std/__algorithm/unique.h +76 -0
  498. cuda/cccl/headers/include/cuda/std/__algorithm/unique_copy.h +155 -0
  499. cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_iter.h +95 -0
  500. cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_range.h +126 -0
  501. cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +83 -0
  502. cuda/cccl/headers/include/cuda/std/__algorithm_ +26 -0
  503. cuda/cccl/headers/include/cuda/std/__atomic/api/common.h +192 -0
  504. cuda/cccl/headers/include/cuda/std/__atomic/api/owned.h +138 -0
  505. cuda/cccl/headers/include/cuda/std/__atomic/api/reference.h +118 -0
  506. cuda/cccl/headers/include/cuda/std/__atomic/functions/common.h +58 -0
  507. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_local.h +218 -0
  508. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_derived.h +401 -0
  509. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated.h +3971 -0
  510. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h +177 -0
  511. cuda/cccl/headers/include/cuda/std/__atomic/functions/host.h +211 -0
  512. cuda/cccl/headers/include/cuda/std/__atomic/functions.h +33 -0
  513. cuda/cccl/headers/include/cuda/std/__atomic/order.h +159 -0
  514. cuda/cccl/headers/include/cuda/std/__atomic/platform/msvc_to_builtins.h +654 -0
  515. cuda/cccl/headers/include/cuda/std/__atomic/platform.h +93 -0
  516. cuda/cccl/headers/include/cuda/std/__atomic/scopes.h +105 -0
  517. cuda/cccl/headers/include/cuda/std/__atomic/types/base.h +250 -0
  518. cuda/cccl/headers/include/cuda/std/__atomic/types/common.h +105 -0
  519. cuda/cccl/headers/include/cuda/std/__atomic/types/locked.h +225 -0
  520. cuda/cccl/headers/include/cuda/std/__atomic/types/reference.h +73 -0
  521. cuda/cccl/headers/include/cuda/std/__atomic/types/small.h +228 -0
  522. cuda/cccl/headers/include/cuda/std/__atomic/types.h +52 -0
  523. cuda/cccl/headers/include/cuda/std/__atomic/wait/notify_wait.h +95 -0
  524. cuda/cccl/headers/include/cuda/std/__atomic/wait/polling.h +65 -0
  525. cuda/cccl/headers/include/cuda/std/__barrier/barrier.h +227 -0
  526. cuda/cccl/headers/include/cuda/std/__barrier/empty_completion.h +37 -0
  527. cuda/cccl/headers/include/cuda/std/__barrier/poll_tester.h +82 -0
  528. cuda/cccl/headers/include/cuda/std/__bit/bit_cast.h +77 -0
  529. cuda/cccl/headers/include/cuda/std/__bit/byteswap.h +183 -0
  530. cuda/cccl/headers/include/cuda/std/__bit/countl.h +167 -0
  531. cuda/cccl/headers/include/cuda/std/__bit/countr.h +185 -0
  532. cuda/cccl/headers/include/cuda/std/__bit/endian.h +39 -0
  533. cuda/cccl/headers/include/cuda/std/__bit/has_single_bit.h +43 -0
  534. cuda/cccl/headers/include/cuda/std/__bit/integral.h +124 -0
  535. cuda/cccl/headers/include/cuda/std/__bit/popcount.h +154 -0
  536. cuda/cccl/headers/include/cuda/std/__bit/reference.h +1270 -0
  537. cuda/cccl/headers/include/cuda/std/__bit/rotate.h +94 -0
  538. cuda/cccl/headers/include/cuda/std/__cccl/architecture.h +78 -0
  539. cuda/cccl/headers/include/cuda/std/__cccl/assert.h +150 -0
  540. cuda/cccl/headers/include/cuda/std/__cccl/attributes.h +207 -0
  541. cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +787 -0
  542. cuda/cccl/headers/include/cuda/std/__cccl/compiler.h +217 -0
  543. cuda/cccl/headers/include/cuda/std/__cccl/cuda_capabilities.h +43 -0
  544. cuda/cccl/headers/include/cuda/std/__cccl/cuda_toolkit.h +53 -0
  545. cuda/cccl/headers/include/cuda/std/__cccl/deprecated.h +69 -0
  546. cuda/cccl/headers/include/cuda/std/__cccl/diagnostic.h +128 -0
  547. cuda/cccl/headers/include/cuda/std/__cccl/dialect.h +126 -0
  548. cuda/cccl/headers/include/cuda/std/__cccl/epilogue.h +326 -0
  549. cuda/cccl/headers/include/cuda/std/__cccl/exceptions.h +79 -0
  550. cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +68 -0
  551. cuda/cccl/headers/include/cuda/std/__cccl/extended_data_types.h +133 -0
  552. cuda/cccl/headers/include/cuda/std/__cccl/is_non_narrowing_convertible.h +73 -0
  553. cuda/cccl/headers/include/cuda/std/__cccl/os.h +48 -0
  554. cuda/cccl/headers/include/cuda/std/__cccl/preprocessor.h +1276 -0
  555. cuda/cccl/headers/include/cuda/std/__cccl/prologue.h +267 -0
  556. cuda/cccl/headers/include/cuda/std/__cccl/ptx_isa.h +176 -0
  557. cuda/cccl/headers/include/cuda/std/__cccl/rtti.h +72 -0
  558. cuda/cccl/headers/include/cuda/std/__cccl/sequence_access.h +87 -0
  559. cuda/cccl/headers/include/cuda/std/__cccl/system_header.h +38 -0
  560. cuda/cccl/headers/include/cuda/std/__cccl/unreachable.h +31 -0
  561. cuda/cccl/headers/include/cuda/std/__cccl/version.h +26 -0
  562. cuda/cccl/headers/include/cuda/std/__cccl/visibility.h +115 -0
  563. cuda/cccl/headers/include/cuda/std/__charconv/chars_format.h +81 -0
  564. cuda/cccl/headers/include/cuda/std/__charconv/from_chars_result.h +56 -0
  565. cuda/cccl/headers/include/cuda/std/__charconv/to_chars.h +148 -0
  566. cuda/cccl/headers/include/cuda/std/__charconv/to_chars_result.h +56 -0
  567. cuda/cccl/headers/include/cuda/std/__charconv_ +30 -0
  568. cuda/cccl/headers/include/cuda/std/__cmath/abs.h +246 -0
  569. cuda/cccl/headers/include/cuda/std/__cmath/copysign.h +193 -0
  570. cuda/cccl/headers/include/cuda/std/__cmath/exponential_functions.h +724 -0
  571. cuda/cccl/headers/include/cuda/std/__cmath/fma.h +125 -0
  572. cuda/cccl/headers/include/cuda/std/__cmath/fpclassify.h +216 -0
  573. cuda/cccl/headers/include/cuda/std/__cmath/gamma.h +205 -0
  574. cuda/cccl/headers/include/cuda/std/__cmath/hyperbolic_functions.h +286 -0
  575. cuda/cccl/headers/include/cuda/std/__cmath/hypot.h +224 -0
  576. cuda/cccl/headers/include/cuda/std/__cmath/inverse_hyperbolic_functions.h +286 -0
  577. cuda/cccl/headers/include/cuda/std/__cmath/inverse_trigonometric_functions.h +371 -0
  578. cuda/cccl/headers/include/cuda/std/__cmath/isfinite.h +167 -0
  579. cuda/cccl/headers/include/cuda/std/__cmath/isinf.h +205 -0
  580. cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +180 -0
  581. cuda/cccl/headers/include/cuda/std/__cmath/isnormal.h +138 -0
  582. cuda/cccl/headers/include/cuda/std/__cmath/lerp.h +104 -0
  583. cuda/cccl/headers/include/cuda/std/__cmath/logarithms.h +582 -0
  584. cuda/cccl/headers/include/cuda/std/__cmath/min_max.h +248 -0
  585. cuda/cccl/headers/include/cuda/std/__cmath/modulo.h +208 -0
  586. cuda/cccl/headers/include/cuda/std/__cmath/nvbf16.h +58 -0
  587. cuda/cccl/headers/include/cuda/std/__cmath/nvfp16.h +58 -0
  588. cuda/cccl/headers/include/cuda/std/__cmath/remainder.h +206 -0
  589. cuda/cccl/headers/include/cuda/std/__cmath/roots.h +199 -0
  590. cuda/cccl/headers/include/cuda/std/__cmath/rounding_functions.h +984 -0
  591. cuda/cccl/headers/include/cuda/std/__cmath/signbit.h +155 -0
  592. cuda/cccl/headers/include/cuda/std/__cmath/traits.h +170 -0
  593. cuda/cccl/headers/include/cuda/std/__cmath/trigonometric_functions.h +328 -0
  594. cuda/cccl/headers/include/cuda/std/__complex/arg.h +84 -0
  595. cuda/cccl/headers/include/cuda/std/__complex/complex.h +676 -0
  596. cuda/cccl/headers/include/cuda/std/__complex/exponential_functions.h +388 -0
  597. cuda/cccl/headers/include/cuda/std/__complex/hyperbolic_functions.h +117 -0
  598. cuda/cccl/headers/include/cuda/std/__complex/inverse_hyperbolic_functions.h +215 -0
  599. cuda/cccl/headers/include/cuda/std/__complex/inverse_trigonometric_functions.h +131 -0
  600. cuda/cccl/headers/include/cuda/std/__complex/literals.h +106 -0
  601. cuda/cccl/headers/include/cuda/std/__complex/logarithms.h +53 -0
  602. cuda/cccl/headers/include/cuda/std/__complex/math.h +159 -0
  603. cuda/cccl/headers/include/cuda/std/__complex/nvbf16.h +323 -0
  604. cuda/cccl/headers/include/cuda/std/__complex/nvfp16.h +322 -0
  605. cuda/cccl/headers/include/cuda/std/__complex/roots.h +64 -0
  606. cuda/cccl/headers/include/cuda/std/__complex/trigonometric_functions.h +61 -0
  607. cuda/cccl/headers/include/cuda/std/__complex/tuple.h +107 -0
  608. cuda/cccl/headers/include/cuda/std/__complex/vector_support.h +131 -0
  609. cuda/cccl/headers/include/cuda/std/__concepts/arithmetic.h +56 -0
  610. cuda/cccl/headers/include/cuda/std/__concepts/assignable.h +64 -0
  611. cuda/cccl/headers/include/cuda/std/__concepts/boolean_testable.h +63 -0
  612. cuda/cccl/headers/include/cuda/std/__concepts/class_or_enum.h +46 -0
  613. cuda/cccl/headers/include/cuda/std/__concepts/common_reference_with.h +69 -0
  614. cuda/cccl/headers/include/cuda/std/__concepts/common_with.h +82 -0
  615. cuda/cccl/headers/include/cuda/std/__concepts/concept_macros.h +273 -0
  616. cuda/cccl/headers/include/cuda/std/__concepts/constructible.h +107 -0
  617. cuda/cccl/headers/include/cuda/std/__concepts/convertible_to.h +71 -0
  618. cuda/cccl/headers/include/cuda/std/__concepts/copyable.h +60 -0
  619. cuda/cccl/headers/include/cuda/std/__concepts/derived_from.h +57 -0
  620. cuda/cccl/headers/include/cuda/std/__concepts/destructible.h +76 -0
  621. cuda/cccl/headers/include/cuda/std/__concepts/different_from.h +38 -0
  622. cuda/cccl/headers/include/cuda/std/__concepts/equality_comparable.h +100 -0
  623. cuda/cccl/headers/include/cuda/std/__concepts/invocable.h +80 -0
  624. cuda/cccl/headers/include/cuda/std/__concepts/movable.h +58 -0
  625. cuda/cccl/headers/include/cuda/std/__concepts/predicate.h +54 -0
  626. cuda/cccl/headers/include/cuda/std/__concepts/regular.h +54 -0
  627. cuda/cccl/headers/include/cuda/std/__concepts/relation.h +77 -0
  628. cuda/cccl/headers/include/cuda/std/__concepts/same_as.h +42 -0
  629. cuda/cccl/headers/include/cuda/std/__concepts/semiregular.h +54 -0
  630. cuda/cccl/headers/include/cuda/std/__concepts/swappable.h +206 -0
  631. cuda/cccl/headers/include/cuda/std/__concepts/totally_ordered.h +101 -0
  632. cuda/cccl/headers/include/cuda/std/__cstddef/byte.h +113 -0
  633. cuda/cccl/headers/include/cuda/std/__cstddef/types.h +52 -0
  634. cuda/cccl/headers/include/cuda/std/__cstdlib/abs.h +57 -0
  635. cuda/cccl/headers/include/cuda/std/__cstdlib/aligned_alloc.h +66 -0
  636. cuda/cccl/headers/include/cuda/std/__cstdlib/div.h +96 -0
  637. cuda/cccl/headers/include/cuda/std/__cstdlib/malloc.h +69 -0
  638. cuda/cccl/headers/include/cuda/std/__cuda/api_wrapper.h +62 -0
  639. cuda/cccl/headers/include/cuda/std/__cuda/ensure_current_device.h +72 -0
  640. cuda/cccl/headers/include/cuda/std/__exception/cuda_error.h +142 -0
  641. cuda/cccl/headers/include/cuda/std/__exception/terminate.h +73 -0
  642. cuda/cccl/headers/include/cuda/std/__execution/env.h +436 -0
  643. cuda/cccl/headers/include/cuda/std/__expected/bad_expected_access.h +127 -0
  644. cuda/cccl/headers/include/cuda/std/__expected/expected.h +2001 -0
  645. cuda/cccl/headers/include/cuda/std/__expected/expected_base.h +1080 -0
  646. cuda/cccl/headers/include/cuda/std/__expected/unexpect.h +37 -0
  647. cuda/cccl/headers/include/cuda/std/__expected/unexpected.h +175 -0
  648. cuda/cccl/headers/include/cuda/std/__floating_point/arithmetic.h +56 -0
  649. cuda/cccl/headers/include/cuda/std/__floating_point/cast.h +809 -0
  650. cuda/cccl/headers/include/cuda/std/__floating_point/cccl_fp.h +125 -0
  651. cuda/cccl/headers/include/cuda/std/__floating_point/common_type.h +48 -0
  652. cuda/cccl/headers/include/cuda/std/__floating_point/constants.h +172 -0
  653. cuda/cccl/headers/include/cuda/std/__floating_point/conversion_rank_order.h +103 -0
  654. cuda/cccl/headers/include/cuda/std/__floating_point/format.h +162 -0
  655. cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +39 -0
  656. cuda/cccl/headers/include/cuda/std/__floating_point/mask.h +64 -0
  657. cuda/cccl/headers/include/cuda/std/__floating_point/native_type.h +81 -0
  658. cuda/cccl/headers/include/cuda/std/__floating_point/nvfp_types.h +58 -0
  659. cuda/cccl/headers/include/cuda/std/__floating_point/overflow_handler.h +139 -0
  660. cuda/cccl/headers/include/cuda/std/__floating_point/properties.h +229 -0
  661. cuda/cccl/headers/include/cuda/std/__floating_point/storage.h +248 -0
  662. cuda/cccl/headers/include/cuda/std/__floating_point/traits.h +172 -0
  663. cuda/cccl/headers/include/cuda/std/__format/format_error.h +76 -0
  664. cuda/cccl/headers/include/cuda/std/__format/format_parse_context.h +128 -0
  665. cuda/cccl/headers/include/cuda/std/__format_ +28 -0
  666. cuda/cccl/headers/include/cuda/std/__functional/binary_function.h +63 -0
  667. cuda/cccl/headers/include/cuda/std/__functional/binary_negate.h +65 -0
  668. cuda/cccl/headers/include/cuda/std/__functional/bind.h +337 -0
  669. cuda/cccl/headers/include/cuda/std/__functional/bind_back.h +80 -0
  670. cuda/cccl/headers/include/cuda/std/__functional/bind_front.h +73 -0
  671. cuda/cccl/headers/include/cuda/std/__functional/binder1st.h +74 -0
  672. cuda/cccl/headers/include/cuda/std/__functional/binder2nd.h +74 -0
  673. cuda/cccl/headers/include/cuda/std/__functional/compose.h +68 -0
  674. cuda/cccl/headers/include/cuda/std/__functional/default_searcher.h +75 -0
  675. cuda/cccl/headers/include/cuda/std/__functional/function.h +1277 -0
  676. cuda/cccl/headers/include/cuda/std/__functional/hash.h +650 -0
  677. cuda/cccl/headers/include/cuda/std/__functional/identity.h +61 -0
  678. cuda/cccl/headers/include/cuda/std/__functional/invoke.h +558 -0
  679. cuda/cccl/headers/include/cuda/std/__functional/is_transparent.h +43 -0
  680. cuda/cccl/headers/include/cuda/std/__functional/mem_fn.h +67 -0
  681. cuda/cccl/headers/include/cuda/std/__functional/mem_fun_ref.h +213 -0
  682. cuda/cccl/headers/include/cuda/std/__functional/not_fn.h +120 -0
  683. cuda/cccl/headers/include/cuda/std/__functional/operations.h +534 -0
  684. cuda/cccl/headers/include/cuda/std/__functional/perfect_forward.h +127 -0
  685. cuda/cccl/headers/include/cuda/std/__functional/pointer_to_binary_function.h +65 -0
  686. cuda/cccl/headers/include/cuda/std/__functional/pointer_to_unary_function.h +64 -0
  687. cuda/cccl/headers/include/cuda/std/__functional/ranges_operations.h +113 -0
  688. cuda/cccl/headers/include/cuda/std/__functional/reference_wrapper.h +113 -0
  689. cuda/cccl/headers/include/cuda/std/__functional/unary_function.h +62 -0
  690. cuda/cccl/headers/include/cuda/std/__functional/unary_negate.h +65 -0
  691. cuda/cccl/headers/include/cuda/std/__functional/unwrap_ref.h +56 -0
  692. cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +277 -0
  693. cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +35 -0
  694. cuda/cccl/headers/include/cuda/std/__fwd/array.h +36 -0
  695. cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +49 -0
  696. cuda/cccl/headers/include/cuda/std/__fwd/complex.h +34 -0
  697. cuda/cccl/headers/include/cuda/std/__fwd/fp.h +37 -0
  698. cuda/cccl/headers/include/cuda/std/__fwd/get.h +123 -0
  699. cuda/cccl/headers/include/cuda/std/__fwd/hash.h +34 -0
  700. cuda/cccl/headers/include/cuda/std/__fwd/iterator_traits.h +40 -0
  701. cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +73 -0
  702. cuda/cccl/headers/include/cuda/std/__fwd/memory_resource.h +37 -0
  703. cuda/cccl/headers/include/cuda/std/__fwd/optional.h +39 -0
  704. cuda/cccl/headers/include/cuda/std/__fwd/pair.h +34 -0
  705. cuda/cccl/headers/include/cuda/std/__fwd/reference_wrapper.h +34 -0
  706. cuda/cccl/headers/include/cuda/std/__fwd/span.h +38 -0
  707. cuda/cccl/headers/include/cuda/std/__fwd/string.h +83 -0
  708. cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +59 -0
  709. cuda/cccl/headers/include/cuda/std/__fwd/subrange.h +55 -0
  710. cuda/cccl/headers/include/cuda/std/__fwd/tuple.h +34 -0
  711. cuda/cccl/headers/include/cuda/std/__internal/cpp_dialect.h +44 -0
  712. cuda/cccl/headers/include/cuda/std/__internal/features.h +71 -0
  713. cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +102 -0
  714. cuda/cccl/headers/include/cuda/std/__iterator/access.h +128 -0
  715. cuda/cccl/headers/include/cuda/std/__iterator/advance.h +228 -0
  716. cuda/cccl/headers/include/cuda/std/__iterator/back_insert_iterator.h +100 -0
  717. cuda/cccl/headers/include/cuda/std/__iterator/bounded_iter.h +254 -0
  718. cuda/cccl/headers/include/cuda/std/__iterator/concepts.h +645 -0
  719. cuda/cccl/headers/include/cuda/std/__iterator/counted_iterator.h +464 -0
  720. cuda/cccl/headers/include/cuda/std/__iterator/data.h +61 -0
  721. cuda/cccl/headers/include/cuda/std/__iterator/default_sentinel.h +36 -0
  722. cuda/cccl/headers/include/cuda/std/__iterator/distance.h +126 -0
  723. cuda/cccl/headers/include/cuda/std/__iterator/empty.h +53 -0
  724. cuda/cccl/headers/include/cuda/std/__iterator/erase_if_container.h +53 -0
  725. cuda/cccl/headers/include/cuda/std/__iterator/front_insert_iterator.h +95 -0
  726. cuda/cccl/headers/include/cuda/std/__iterator/incrementable_traits.h +152 -0
  727. cuda/cccl/headers/include/cuda/std/__iterator/indirectly_comparable.h +55 -0
  728. cuda/cccl/headers/include/cuda/std/__iterator/insert_iterator.h +102 -0
  729. cuda/cccl/headers/include/cuda/std/__iterator/istream_iterator.h +140 -0
  730. cuda/cccl/headers/include/cuda/std/__iterator/istreambuf_iterator.h +160 -0
  731. cuda/cccl/headers/include/cuda/std/__iterator/iter_move.h +161 -0
  732. cuda/cccl/headers/include/cuda/std/__iterator/iter_swap.h +163 -0
  733. cuda/cccl/headers/include/cuda/std/__iterator/iterator.h +44 -0
  734. cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +932 -0
  735. cuda/cccl/headers/include/cuda/std/__iterator/mergeable.h +72 -0
  736. cuda/cccl/headers/include/cuda/std/__iterator/move_iterator.h +400 -0
  737. cuda/cccl/headers/include/cuda/std/__iterator/move_sentinel.h +73 -0
  738. cuda/cccl/headers/include/cuda/std/__iterator/next.h +101 -0
  739. cuda/cccl/headers/include/cuda/std/__iterator/ostream_iterator.h +98 -0
  740. cuda/cccl/headers/include/cuda/std/__iterator/ostreambuf_iterator.h +100 -0
  741. cuda/cccl/headers/include/cuda/std/__iterator/permutable.h +54 -0
  742. cuda/cccl/headers/include/cuda/std/__iterator/prev.h +91 -0
  743. cuda/cccl/headers/include/cuda/std/__iterator/projected.h +61 -0
  744. cuda/cccl/headers/include/cuda/std/__iterator/readable_traits.h +185 -0
  745. cuda/cccl/headers/include/cuda/std/__iterator/reverse_access.h +142 -0
  746. cuda/cccl/headers/include/cuda/std/__iterator/reverse_iterator.h +605 -0
  747. cuda/cccl/headers/include/cuda/std/__iterator/size.h +69 -0
  748. cuda/cccl/headers/include/cuda/std/__iterator/sortable.h +55 -0
  749. cuda/cccl/headers/include/cuda/std/__iterator/unreachable_sentinel.h +84 -0
  750. cuda/cccl/headers/include/cuda/std/__iterator/wrap_iter.h +247 -0
  751. cuda/cccl/headers/include/cuda/std/__latch/latch.h +88 -0
  752. cuda/cccl/headers/include/cuda/std/__limits/numeric_limits.h +617 -0
  753. cuda/cccl/headers/include/cuda/std/__limits/numeric_limits_ext.h +781 -0
  754. cuda/cccl/headers/include/cuda/std/__linalg/conj_if_needed.h +78 -0
  755. cuda/cccl/headers/include/cuda/std/__linalg/conjugate_transposed.h +54 -0
  756. cuda/cccl/headers/include/cuda/std/__linalg/conjugated.h +139 -0
  757. cuda/cccl/headers/include/cuda/std/__linalg/scaled.h +132 -0
  758. cuda/cccl/headers/include/cuda/std/__linalg/transposed.h +322 -0
  759. cuda/cccl/headers/include/cuda/std/__mdspan/aligned_accessor.h +98 -0
  760. cuda/cccl/headers/include/cuda/std/__mdspan/concepts.h +139 -0
  761. cuda/cccl/headers/include/cuda/std/__mdspan/default_accessor.h +73 -0
  762. cuda/cccl/headers/include/cuda/std/__mdspan/empty_base.h +358 -0
  763. cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +757 -0
  764. cuda/cccl/headers/include/cuda/std/__mdspan/layout_left.h +315 -0
  765. cuda/cccl/headers/include/cuda/std/__mdspan/layout_right.h +308 -0
  766. cuda/cccl/headers/include/cuda/std/__mdspan/layout_stride.h +605 -0
  767. cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +507 -0
  768. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_extents.h +193 -0
  769. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_helper.h +189 -0
  770. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_mapping.h +344 -0
  771. cuda/cccl/headers/include/cuda/std/__memory/addressof.h +64 -0
  772. cuda/cccl/headers/include/cuda/std/__memory/align.h +67 -0
  773. cuda/cccl/headers/include/cuda/std/__memory/allocate_at_least.h +81 -0
  774. cuda/cccl/headers/include/cuda/std/__memory/allocation_guard.h +100 -0
  775. cuda/cccl/headers/include/cuda/std/__memory/allocator.h +320 -0
  776. cuda/cccl/headers/include/cuda/std/__memory/allocator_arg_t.h +84 -0
  777. cuda/cccl/headers/include/cuda/std/__memory/allocator_destructor.h +59 -0
  778. cuda/cccl/headers/include/cuda/std/__memory/allocator_traits.h +555 -0
  779. cuda/cccl/headers/include/cuda/std/__memory/assume_aligned.h +60 -0
  780. cuda/cccl/headers/include/cuda/std/__memory/builtin_new_allocator.h +87 -0
  781. cuda/cccl/headers/include/cuda/std/__memory/compressed_pair.h +230 -0
  782. cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +248 -0
  783. cuda/cccl/headers/include/cuda/std/__memory/destruct_n.h +91 -0
  784. cuda/cccl/headers/include/cuda/std/__memory/is_sufficiently_aligned.h +43 -0
  785. cuda/cccl/headers/include/cuda/std/__memory/pointer_traits.h +261 -0
  786. cuda/cccl/headers/include/cuda/std/__memory/runtime_assume_aligned.h +62 -0
  787. cuda/cccl/headers/include/cuda/std/__memory/temporary_buffer.h +92 -0
  788. cuda/cccl/headers/include/cuda/std/__memory/uninitialized_algorithms.h +683 -0
  789. cuda/cccl/headers/include/cuda/std/__memory/unique_ptr.h +768 -0
  790. cuda/cccl/headers/include/cuda/std/__memory/uses_allocator.h +55 -0
  791. cuda/cccl/headers/include/cuda/std/__memory/voidify.h +41 -0
  792. cuda/cccl/headers/include/cuda/std/__memory_ +34 -0
  793. cuda/cccl/headers/include/cuda/std/__new/allocate.h +126 -0
  794. cuda/cccl/headers/include/cuda/std/__new/bad_alloc.h +57 -0
  795. cuda/cccl/headers/include/cuda/std/__new/launder.h +49 -0
  796. cuda/cccl/headers/include/cuda/std/__new_ +29 -0
  797. cuda/cccl/headers/include/cuda/std/__numeric/accumulate.h +56 -0
  798. cuda/cccl/headers/include/cuda/std/__numeric/adjacent_difference.h +72 -0
  799. cuda/cccl/headers/include/cuda/std/__numeric/exclusive_scan.h +66 -0
  800. cuda/cccl/headers/include/cuda/std/__numeric/gcd_lcm.h +80 -0
  801. cuda/cccl/headers/include/cuda/std/__numeric/inclusive_scan.h +73 -0
  802. cuda/cccl/headers/include/cuda/std/__numeric/inner_product.h +62 -0
  803. cuda/cccl/headers/include/cuda/std/__numeric/iota.h +42 -0
  804. cuda/cccl/headers/include/cuda/std/__numeric/midpoint.h +100 -0
  805. cuda/cccl/headers/include/cuda/std/__numeric/partial_sum.h +69 -0
  806. cuda/cccl/headers/include/cuda/std/__numeric/reduce.h +60 -0
  807. cuda/cccl/headers/include/cuda/std/__numeric/transform_exclusive_scan.h +51 -0
  808. cuda/cccl/headers/include/cuda/std/__numeric/transform_inclusive_scan.h +65 -0
  809. cuda/cccl/headers/include/cuda/std/__numeric/transform_reduce.h +72 -0
  810. cuda/cccl/headers/include/cuda/std/__optional/bad_optional_access.h +75 -0
  811. cuda/cccl/headers/include/cuda/std/__optional/hash.h +53 -0
  812. cuda/cccl/headers/include/cuda/std/__optional/make_optional.h +61 -0
  813. cuda/cccl/headers/include/cuda/std/__optional/nullopt.h +43 -0
  814. cuda/cccl/headers/include/cuda/std/__optional/optional.h +900 -0
  815. cuda/cccl/headers/include/cuda/std/__optional/optional_base.h +430 -0
  816. cuda/cccl/headers/include/cuda/std/__optional/optional_ref.h +324 -0
  817. cuda/cccl/headers/include/cuda/std/__random/generate_canonical.h +56 -0
  818. cuda/cccl/headers/include/cuda/std/__random/is_seed_sequence.h +39 -0
  819. cuda/cccl/headers/include/cuda/std/__random/is_valid.h +106 -0
  820. cuda/cccl/headers/include/cuda/std/__random/linear_congruential_engine.h +397 -0
  821. cuda/cccl/headers/include/cuda/std/__random/uniform_int_distribution.h +335 -0
  822. cuda/cccl/headers/include/cuda/std/__random/uniform_real_distribution.h +183 -0
  823. cuda/cccl/headers/include/cuda/std/__random_ +29 -0
  824. cuda/cccl/headers/include/cuda/std/__ranges/access.h +303 -0
  825. cuda/cccl/headers/include/cuda/std/__ranges/all.h +97 -0
  826. cuda/cccl/headers/include/cuda/std/__ranges/concepts.h +313 -0
  827. cuda/cccl/headers/include/cuda/std/__ranges/counted.h +90 -0
  828. cuda/cccl/headers/include/cuda/std/__ranges/dangling.h +54 -0
  829. cuda/cccl/headers/include/cuda/std/__ranges/data.h +136 -0
  830. cuda/cccl/headers/include/cuda/std/__ranges/empty.h +109 -0
  831. cuda/cccl/headers/include/cuda/std/__ranges/empty_view.h +77 -0
  832. cuda/cccl/headers/include/cuda/std/__ranges/enable_borrowed_range.h +41 -0
  833. cuda/cccl/headers/include/cuda/std/__ranges/enable_view.h +77 -0
  834. cuda/cccl/headers/include/cuda/std/__ranges/from_range.h +36 -0
  835. cuda/cccl/headers/include/cuda/std/__ranges/iota_view.h +266 -0
  836. cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +410 -0
  837. cuda/cccl/headers/include/cuda/std/__ranges/owning_view.h +161 -0
  838. cuda/cccl/headers/include/cuda/std/__ranges/range_adaptor.h +113 -0
  839. cuda/cccl/headers/include/cuda/std/__ranges/rbegin.h +174 -0
  840. cuda/cccl/headers/include/cuda/std/__ranges/ref_view.h +121 -0
  841. cuda/cccl/headers/include/cuda/std/__ranges/rend.h +181 -0
  842. cuda/cccl/headers/include/cuda/std/__ranges/repeat_view.h +345 -0
  843. cuda/cccl/headers/include/cuda/std/__ranges/single_view.h +155 -0
  844. cuda/cccl/headers/include/cuda/std/__ranges/size.h +199 -0
  845. cuda/cccl/headers/include/cuda/std/__ranges/subrange.h +513 -0
  846. cuda/cccl/headers/include/cuda/std/__ranges/take_view.h +475 -0
  847. cuda/cccl/headers/include/cuda/std/__ranges/take_while_view.h +259 -0
  848. cuda/cccl/headers/include/cuda/std/__ranges/transform_view.h +522 -0
  849. cuda/cccl/headers/include/cuda/std/__ranges/unwrap_end.h +53 -0
  850. cuda/cccl/headers/include/cuda/std/__ranges/view_interface.h +181 -0
  851. cuda/cccl/headers/include/cuda/std/__ranges/views.h +38 -0
  852. cuda/cccl/headers/include/cuda/std/__semaphore/atomic_semaphore.h +233 -0
  853. cuda/cccl/headers/include/cuda/std/__semaphore/counting_semaphore.h +51 -0
  854. cuda/cccl/headers/include/cuda/std/__string/char_traits.h +191 -0
  855. cuda/cccl/headers/include/cuda/std/__string/constexpr_c_functions.h +581 -0
  856. cuda/cccl/headers/include/cuda/std/__string/helper_functions.h +296 -0
  857. cuda/cccl/headers/include/cuda/std/__string/string_view.h +244 -0
  858. cuda/cccl/headers/include/cuda/std/__string_ +29 -0
  859. cuda/cccl/headers/include/cuda/std/__system_error/errc.h +51 -0
  860. cuda/cccl/headers/include/cuda/std/__system_error_ +26 -0
  861. cuda/cccl/headers/include/cuda/std/__thread/threading_support.h +105 -0
  862. cuda/cccl/headers/include/cuda/std/__thread/threading_support_cuda.h +47 -0
  863. cuda/cccl/headers/include/cuda/std/__thread/threading_support_external.h +41 -0
  864. cuda/cccl/headers/include/cuda/std/__thread/threading_support_pthread.h +142 -0
  865. cuda/cccl/headers/include/cuda/std/__thread/threading_support_win32.h +87 -0
  866. cuda/cccl/headers/include/cuda/std/__tuple_dir/ignore.h +51 -0
  867. cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +98 -0
  868. cuda/cccl/headers/include/cuda/std/__tuple_dir/sfinae_helpers.h +269 -0
  869. cuda/cccl/headers/include/cuda/std/__tuple_dir/structured_bindings.h +216 -0
  870. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_element.h +70 -0
  871. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_indices.h +44 -0
  872. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +90 -0
  873. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +73 -0
  874. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_size.h +79 -0
  875. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_types.h +35 -0
  876. cuda/cccl/headers/include/cuda/std/__tuple_dir/vector_types.h +277 -0
  877. cuda/cccl/headers/include/cuda/std/__type_traits/add_const.h +40 -0
  878. cuda/cccl/headers/include/cuda/std/__type_traits/add_cv.h +40 -0
  879. cuda/cccl/headers/include/cuda/std/__type_traits/add_lvalue_reference.h +62 -0
  880. cuda/cccl/headers/include/cuda/std/__type_traits/add_pointer.h +65 -0
  881. cuda/cccl/headers/include/cuda/std/__type_traits/add_rvalue_reference.h +62 -0
  882. cuda/cccl/headers/include/cuda/std/__type_traits/add_volatile.h +40 -0
  883. cuda/cccl/headers/include/cuda/std/__type_traits/aligned_storage.h +149 -0
  884. cuda/cccl/headers/include/cuda/std/__type_traits/aligned_union.h +62 -0
  885. cuda/cccl/headers/include/cuda/std/__type_traits/alignment_of.h +41 -0
  886. cuda/cccl/headers/include/cuda/std/__type_traits/always_false.h +35 -0
  887. cuda/cccl/headers/include/cuda/std/__type_traits/can_extract_key.h +69 -0
  888. cuda/cccl/headers/include/cuda/std/__type_traits/common_reference.h +262 -0
  889. cuda/cccl/headers/include/cuda/std/__type_traits/common_type.h +174 -0
  890. cuda/cccl/headers/include/cuda/std/__type_traits/conditional.h +65 -0
  891. cuda/cccl/headers/include/cuda/std/__type_traits/conjunction.h +67 -0
  892. cuda/cccl/headers/include/cuda/std/__type_traits/copy_cv.h +50 -0
  893. cuda/cccl/headers/include/cuda/std/__type_traits/copy_cvref.h +148 -0
  894. cuda/cccl/headers/include/cuda/std/__type_traits/decay.h +83 -0
  895. cuda/cccl/headers/include/cuda/std/__type_traits/dependent_type.h +35 -0
  896. cuda/cccl/headers/include/cuda/std/__type_traits/disjunction.h +77 -0
  897. cuda/cccl/headers/include/cuda/std/__type_traits/enable_if.h +43 -0
  898. cuda/cccl/headers/include/cuda/std/__type_traits/extent.h +68 -0
  899. cuda/cccl/headers/include/cuda/std/__type_traits/fold.h +47 -0
  900. cuda/cccl/headers/include/cuda/std/__type_traits/has_unique_object_representation.h +47 -0
  901. cuda/cccl/headers/include/cuda/std/__type_traits/has_virtual_destructor.h +51 -0
  902. cuda/cccl/headers/include/cuda/std/__type_traits/integral_constant.h +62 -0
  903. cuda/cccl/headers/include/cuda/std/__type_traits/is_abstract.h +40 -0
  904. cuda/cccl/headers/include/cuda/std/__type_traits/is_aggregate.h +44 -0
  905. cuda/cccl/headers/include/cuda/std/__type_traits/is_allocator.h +46 -0
  906. cuda/cccl/headers/include/cuda/std/__type_traits/is_arithmetic.h +42 -0
  907. cuda/cccl/headers/include/cuda/std/__type_traits/is_array.h +62 -0
  908. cuda/cccl/headers/include/cuda/std/__type_traits/is_assignable.h +78 -0
  909. cuda/cccl/headers/include/cuda/std/__type_traits/is_base_of.h +83 -0
  910. cuda/cccl/headers/include/cuda/std/__type_traits/is_bounded_array.h +44 -0
  911. cuda/cccl/headers/include/cuda/std/__type_traits/is_callable.h +60 -0
  912. cuda/cccl/headers/include/cuda/std/__type_traits/is_char_like_type.h +38 -0
  913. cuda/cccl/headers/include/cuda/std/__type_traits/is_class.h +68 -0
  914. cuda/cccl/headers/include/cuda/std/__type_traits/is_compound.h +54 -0
  915. cuda/cccl/headers/include/cuda/std/__type_traits/is_const.h +56 -0
  916. cuda/cccl/headers/include/cuda/std/__type_traits/is_constant_evaluated.h +51 -0
  917. cuda/cccl/headers/include/cuda/std/__type_traits/is_constructible.h +174 -0
  918. cuda/cccl/headers/include/cuda/std/__type_traits/is_convertible.h +214 -0
  919. cuda/cccl/headers/include/cuda/std/__type_traits/is_copy_assignable.h +43 -0
  920. cuda/cccl/headers/include/cuda/std/__type_traits/is_copy_constructible.h +43 -0
  921. cuda/cccl/headers/include/cuda/std/__type_traits/is_core_convertible.h +47 -0
  922. cuda/cccl/headers/include/cuda/std/__type_traits/is_corresponding_member.h +42 -0
  923. cuda/cccl/headers/include/cuda/std/__type_traits/is_default_constructible.h +40 -0
  924. cuda/cccl/headers/include/cuda/std/__type_traits/is_destructible.h +115 -0
  925. cuda/cccl/headers/include/cuda/std/__type_traits/is_empty.h +73 -0
  926. cuda/cccl/headers/include/cuda/std/__type_traits/is_enum.h +68 -0
  927. cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_arithmetic.h +38 -0
  928. cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_floating_point.h +81 -0
  929. cuda/cccl/headers/include/cuda/std/__type_traits/is_final.h +56 -0
  930. cuda/cccl/headers/include/cuda/std/__type_traits/is_floating_point.h +53 -0
  931. cuda/cccl/headers/include/cuda/std/__type_traits/is_function.h +61 -0
  932. cuda/cccl/headers/include/cuda/std/__type_traits/is_fundamental.h +56 -0
  933. cuda/cccl/headers/include/cuda/std/__type_traits/is_implicitly_default_constructible.h +57 -0
  934. cuda/cccl/headers/include/cuda/std/__type_traits/is_integer.h +45 -0
  935. cuda/cccl/headers/include/cuda/std/__type_traits/is_integral.h +123 -0
  936. cuda/cccl/headers/include/cuda/std/__type_traits/is_layout_compatible.h +45 -0
  937. cuda/cccl/headers/include/cuda/std/__type_traits/is_literal_type.h +59 -0
  938. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_function_pointer.h +79 -0
  939. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_object_pointer.h +57 -0
  940. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_pointer.h +57 -0
  941. cuda/cccl/headers/include/cuda/std/__type_traits/is_move_assignable.h +43 -0
  942. cuda/cccl/headers/include/cuda/std/__type_traits/is_move_constructible.h +42 -0
  943. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_assignable.h +70 -0
  944. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_constructible.h +84 -0
  945. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_convertible.h +59 -0
  946. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_copy_assignable.h +60 -0
  947. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_copy_constructible.h +43 -0
  948. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_default_constructible.h +54 -0
  949. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_destructible.h +82 -0
  950. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_move_assignable.h +60 -0
  951. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_move_constructible.h +42 -0
  952. cuda/cccl/headers/include/cuda/std/__type_traits/is_null_pointer.h +43 -0
  953. cuda/cccl/headers/include/cuda/std/__type_traits/is_object.h +57 -0
  954. cuda/cccl/headers/include/cuda/std/__type_traits/is_one_of.h +37 -0
  955. cuda/cccl/headers/include/cuda/std/__type_traits/is_pod.h +62 -0
  956. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer.h +60 -0
  957. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer_interconvertible_base_of.h +87 -0
  958. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer_interconvertible_with_class.h +42 -0
  959. cuda/cccl/headers/include/cuda/std/__type_traits/is_polymorphic.h +63 -0
  960. cuda/cccl/headers/include/cuda/std/__type_traits/is_primary_template.h +119 -0
  961. cuda/cccl/headers/include/cuda/std/__type_traits/is_reference.h +95 -0
  962. cuda/cccl/headers/include/cuda/std/__type_traits/is_reference_wrapper.h +50 -0
  963. cuda/cccl/headers/include/cuda/std/__type_traits/is_referenceable.h +55 -0
  964. cuda/cccl/headers/include/cuda/std/__type_traits/is_same.h +84 -0
  965. cuda/cccl/headers/include/cuda/std/__type_traits/is_scalar.h +60 -0
  966. cuda/cccl/headers/include/cuda/std/__type_traits/is_scoped_enum.h +49 -0
  967. cuda/cccl/headers/include/cuda/std/__type_traits/is_signed.h +65 -0
  968. cuda/cccl/headers/include/cuda/std/__type_traits/is_signed_integer.h +59 -0
  969. cuda/cccl/headers/include/cuda/std/__type_traits/is_standard_layout.h +57 -0
  970. cuda/cccl/headers/include/cuda/std/__type_traits/is_swappable.h +201 -0
  971. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivial.h +56 -0
  972. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_assignable.h +70 -0
  973. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_constructible.h +82 -0
  974. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_assignable.h +60 -0
  975. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_constructible.h +61 -0
  976. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copyable.h +56 -0
  977. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_default_constructible.h +55 -0
  978. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_destructible.h +73 -0
  979. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_assignable.h +60 -0
  980. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_constructible.h +58 -0
  981. cuda/cccl/headers/include/cuda/std/__type_traits/is_unbounded_array.h +43 -0
  982. cuda/cccl/headers/include/cuda/std/__type_traits/is_union.h +57 -0
  983. cuda/cccl/headers/include/cuda/std/__type_traits/is_unsigned.h +66 -0
  984. cuda/cccl/headers/include/cuda/std/__type_traits/is_unsigned_integer.h +59 -0
  985. cuda/cccl/headers/include/cuda/std/__type_traits/is_valid_expansion.h +41 -0
  986. cuda/cccl/headers/include/cuda/std/__type_traits/is_void.h +55 -0
  987. cuda/cccl/headers/include/cuda/std/__type_traits/is_volatile.h +56 -0
  988. cuda/cccl/headers/include/cuda/std/__type_traits/lazy.h +35 -0
  989. cuda/cccl/headers/include/cuda/std/__type_traits/make_const_lvalue_ref.h +36 -0
  990. cuda/cccl/headers/include/cuda/std/__type_traits/make_nbit_int.h +107 -0
  991. cuda/cccl/headers/include/cuda/std/__type_traits/make_signed.h +140 -0
  992. cuda/cccl/headers/include/cuda/std/__type_traits/make_unsigned.h +151 -0
  993. cuda/cccl/headers/include/cuda/std/__type_traits/maybe_const.h +36 -0
  994. cuda/cccl/headers/include/cuda/std/__type_traits/nat.h +39 -0
  995. cuda/cccl/headers/include/cuda/std/__type_traits/negation.h +44 -0
  996. cuda/cccl/headers/include/cuda/std/__type_traits/num_bits.h +123 -0
  997. cuda/cccl/headers/include/cuda/std/__type_traits/promote.h +163 -0
  998. cuda/cccl/headers/include/cuda/std/__type_traits/rank.h +60 -0
  999. cuda/cccl/headers/include/cuda/std/__type_traits/reference_constructs_from_temporary.h +57 -0
  1000. cuda/cccl/headers/include/cuda/std/__type_traits/reference_converts_from_temporary.h +56 -0
  1001. cuda/cccl/headers/include/cuda/std/__type_traits/remove_all_extents.h +66 -0
  1002. cuda/cccl/headers/include/cuda/std/__type_traits/remove_const.h +59 -0
  1003. cuda/cccl/headers/include/cuda/std/__type_traits/remove_const_ref.h +37 -0
  1004. cuda/cccl/headers/include/cuda/std/__type_traits/remove_cv.h +57 -0
  1005. cuda/cccl/headers/include/cuda/std/__type_traits/remove_cvref.h +57 -0
  1006. cuda/cccl/headers/include/cuda/std/__type_traits/remove_extent.h +65 -0
  1007. cuda/cccl/headers/include/cuda/std/__type_traits/remove_pointer.h +73 -0
  1008. cuda/cccl/headers/include/cuda/std/__type_traits/remove_reference.h +72 -0
  1009. cuda/cccl/headers/include/cuda/std/__type_traits/remove_volatile.h +58 -0
  1010. cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +47 -0
  1011. cuda/cccl/headers/include/cuda/std/__type_traits/type_identity.h +40 -0
  1012. cuda/cccl/headers/include/cuda/std/__type_traits/type_list.h +1067 -0
  1013. cuda/cccl/headers/include/cuda/std/__type_traits/type_set.h +132 -0
  1014. cuda/cccl/headers/include/cuda/std/__type_traits/underlying_type.h +66 -0
  1015. cuda/cccl/headers/include/cuda/std/__type_traits/void_t.h +34 -0
  1016. cuda/cccl/headers/include/cuda/std/__utility/as_const.h +52 -0
  1017. cuda/cccl/headers/include/cuda/std/__utility/auto_cast.h +32 -0
  1018. cuda/cccl/headers/include/cuda/std/__utility/cmp.h +116 -0
  1019. cuda/cccl/headers/include/cuda/std/__utility/convert_to_integral.h +101 -0
  1020. cuda/cccl/headers/include/cuda/std/__utility/declval.h +63 -0
  1021. cuda/cccl/headers/include/cuda/std/__utility/exception_guard.h +161 -0
  1022. cuda/cccl/headers/include/cuda/std/__utility/exchange.h +46 -0
  1023. cuda/cccl/headers/include/cuda/std/__utility/forward.h +59 -0
  1024. cuda/cccl/headers/include/cuda/std/__utility/forward_like.h +55 -0
  1025. cuda/cccl/headers/include/cuda/std/__utility/in_place.h +77 -0
  1026. cuda/cccl/headers/include/cuda/std/__utility/integer_sequence.h +251 -0
  1027. cuda/cccl/headers/include/cuda/std/__utility/monostate.h +99 -0
  1028. cuda/cccl/headers/include/cuda/std/__utility/move.h +74 -0
  1029. cuda/cccl/headers/include/cuda/std/__utility/pair.h +802 -0
  1030. cuda/cccl/headers/include/cuda/std/__utility/piecewise_construct.h +37 -0
  1031. cuda/cccl/headers/include/cuda/std/__utility/pod_tuple.h +510 -0
  1032. cuda/cccl/headers/include/cuda/std/__utility/priority_tag.h +40 -0
  1033. cuda/cccl/headers/include/cuda/std/__utility/rel_ops.h +63 -0
  1034. cuda/cccl/headers/include/cuda/std/__utility/swap.h +64 -0
  1035. cuda/cccl/headers/include/cuda/std/__utility/to_underlying.h +40 -0
  1036. cuda/cccl/headers/include/cuda/std/__utility/typeid.h +421 -0
  1037. cuda/cccl/headers/include/cuda/std/__utility/undefined.h +34 -0
  1038. cuda/cccl/headers/include/cuda/std/__utility/unreachable.h +37 -0
  1039. cuda/cccl/headers/include/cuda/std/array +520 -0
  1040. cuda/cccl/headers/include/cuda/std/atomic +818 -0
  1041. cuda/cccl/headers/include/cuda/std/barrier +43 -0
  1042. cuda/cccl/headers/include/cuda/std/bit +35 -0
  1043. cuda/cccl/headers/include/cuda/std/bitset +994 -0
  1044. cuda/cccl/headers/include/cuda/std/cassert +28 -0
  1045. cuda/cccl/headers/include/cuda/std/ccomplex +15 -0
  1046. cuda/cccl/headers/include/cuda/std/cfloat +59 -0
  1047. cuda/cccl/headers/include/cuda/std/chrono +26 -0
  1048. cuda/cccl/headers/include/cuda/std/climits +61 -0
  1049. cuda/cccl/headers/include/cuda/std/cmath +25 -0
  1050. cuda/cccl/headers/include/cuda/std/complex +50 -0
  1051. cuda/cccl/headers/include/cuda/std/concepts +48 -0
  1052. cuda/cccl/headers/include/cuda/std/cstddef +28 -0
  1053. cuda/cccl/headers/include/cuda/std/cstdint +178 -0
  1054. cuda/cccl/headers/include/cuda/std/cstdlib +30 -0
  1055. cuda/cccl/headers/include/cuda/std/cstring +110 -0
  1056. cuda/cccl/headers/include/cuda/std/ctime +152 -0
  1057. cuda/cccl/headers/include/cuda/std/detail/__config +45 -0
  1058. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +235 -0
  1059. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/algorithm +1720 -0
  1060. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/chrono +3628 -0
  1061. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/cmath +667 -0
  1062. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/iosfwd +128 -0
  1063. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/stdexcept +120 -0
  1064. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/tuple +1367 -0
  1065. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/variant +2154 -0
  1066. cuda/cccl/headers/include/cuda/std/execution +27 -0
  1067. cuda/cccl/headers/include/cuda/std/expected +30 -0
  1068. cuda/cccl/headers/include/cuda/std/functional +56 -0
  1069. cuda/cccl/headers/include/cuda/std/initializer_list +36 -0
  1070. cuda/cccl/headers/include/cuda/std/inplace_vector +2163 -0
  1071. cuda/cccl/headers/include/cuda/std/iterator +70 -0
  1072. cuda/cccl/headers/include/cuda/std/latch +34 -0
  1073. cuda/cccl/headers/include/cuda/std/limits +28 -0
  1074. cuda/cccl/headers/include/cuda/std/linalg +30 -0
  1075. cuda/cccl/headers/include/cuda/std/mdspan +38 -0
  1076. cuda/cccl/headers/include/cuda/std/memory +39 -0
  1077. cuda/cccl/headers/include/cuda/std/numbers +335 -0
  1078. cuda/cccl/headers/include/cuda/std/numeric +41 -0
  1079. cuda/cccl/headers/include/cuda/std/optional +31 -0
  1080. cuda/cccl/headers/include/cuda/std/ranges +69 -0
  1081. cuda/cccl/headers/include/cuda/std/ratio +417 -0
  1082. cuda/cccl/headers/include/cuda/std/semaphore +31 -0
  1083. cuda/cccl/headers/include/cuda/std/source_location +83 -0
  1084. cuda/cccl/headers/include/cuda/std/span +640 -0
  1085. cuda/cccl/headers/include/cuda/std/string_view +788 -0
  1086. cuda/cccl/headers/include/cuda/std/tuple +26 -0
  1087. cuda/cccl/headers/include/cuda/std/type_traits +176 -0
  1088. cuda/cccl/headers/include/cuda/std/utility +70 -0
  1089. cuda/cccl/headers/include/cuda/std/variant +25 -0
  1090. cuda/cccl/headers/include/cuda/std/version +245 -0
  1091. cuda/cccl/headers/include/cuda/stream_ref +54 -0
  1092. cuda/cccl/headers/include/cuda/type_traits +27 -0
  1093. cuda/cccl/headers/include/cuda/utility +27 -0
  1094. cuda/cccl/headers/include/cuda/version +16 -0
  1095. cuda/cccl/headers/include/cuda/warp +28 -0
  1096. cuda/cccl/headers/include/cuda/work_stealing +26 -0
  1097. cuda/cccl/headers/include/nv/detail/__preprocessor +169 -0
  1098. cuda/cccl/headers/include/nv/detail/__target_macros +641 -0
  1099. cuda/cccl/headers/include/nv/target +240 -0
  1100. cuda/cccl/headers/include/thrust/addressof.h +22 -0
  1101. cuda/cccl/headers/include/thrust/adjacent_difference.h +254 -0
  1102. cuda/cccl/headers/include/thrust/advance.h +59 -0
  1103. cuda/cccl/headers/include/thrust/allocate_unique.h +299 -0
  1104. cuda/cccl/headers/include/thrust/binary_search.h +1910 -0
  1105. cuda/cccl/headers/include/thrust/complex.h +859 -0
  1106. cuda/cccl/headers/include/thrust/copy.h +506 -0
  1107. cuda/cccl/headers/include/thrust/count.h +245 -0
  1108. cuda/cccl/headers/include/thrust/detail/adjacent_difference.inl +95 -0
  1109. cuda/cccl/headers/include/thrust/detail/algorithm_wrapper.h +37 -0
  1110. cuda/cccl/headers/include/thrust/detail/alignment.h +81 -0
  1111. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +350 -0
  1112. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.inl +371 -0
  1113. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +45 -0
  1114. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.inl +242 -0
  1115. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +39 -0
  1116. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.inl +137 -0
  1117. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +39 -0
  1118. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.inl +99 -0
  1119. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +53 -0
  1120. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.inl +68 -0
  1121. cuda/cccl/headers/include/thrust/detail/allocator/no_throw_allocator.h +76 -0
  1122. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +102 -0
  1123. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.inl +86 -0
  1124. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +79 -0
  1125. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.inl +81 -0
  1126. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.h +39 -0
  1127. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.inl +98 -0
  1128. cuda/cccl/headers/include/thrust/detail/allocator_aware_execution_policy.h +99 -0
  1129. cuda/cccl/headers/include/thrust/detail/binary_search.inl +525 -0
  1130. cuda/cccl/headers/include/thrust/detail/caching_allocator.h +47 -0
  1131. cuda/cccl/headers/include/thrust/detail/complex/arithmetic.h +255 -0
  1132. cuda/cccl/headers/include/thrust/detail/complex/c99math.h +64 -0
  1133. cuda/cccl/headers/include/thrust/detail/complex/catrig.h +875 -0
  1134. cuda/cccl/headers/include/thrust/detail/complex/catrigf.h +589 -0
  1135. cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +233 -0
  1136. cuda/cccl/headers/include/thrust/detail/complex/ccoshf.h +161 -0
  1137. cuda/cccl/headers/include/thrust/detail/complex/cexp.h +195 -0
  1138. cuda/cccl/headers/include/thrust/detail/complex/cexpf.h +173 -0
  1139. cuda/cccl/headers/include/thrust/detail/complex/clog.h +223 -0
  1140. cuda/cccl/headers/include/thrust/detail/complex/clogf.h +210 -0
  1141. cuda/cccl/headers/include/thrust/detail/complex/complex.inl +263 -0
  1142. cuda/cccl/headers/include/thrust/detail/complex/cpow.h +50 -0
  1143. cuda/cccl/headers/include/thrust/detail/complex/cproj.h +81 -0
  1144. cuda/cccl/headers/include/thrust/detail/complex/csinh.h +228 -0
  1145. cuda/cccl/headers/include/thrust/detail/complex/csinhf.h +168 -0
  1146. cuda/cccl/headers/include/thrust/detail/complex/csqrt.h +178 -0
  1147. cuda/cccl/headers/include/thrust/detail/complex/csqrtf.h +174 -0
  1148. cuda/cccl/headers/include/thrust/detail/complex/ctanh.h +208 -0
  1149. cuda/cccl/headers/include/thrust/detail/complex/ctanhf.h +133 -0
  1150. cuda/cccl/headers/include/thrust/detail/complex/math_private.h +138 -0
  1151. cuda/cccl/headers/include/thrust/detail/complex/stream.h +73 -0
  1152. cuda/cccl/headers/include/thrust/detail/config/compiler.h +38 -0
  1153. cuda/cccl/headers/include/thrust/detail/config/config.h +43 -0
  1154. cuda/cccl/headers/include/thrust/detail/config/cpp_dialect.h +78 -0
  1155. cuda/cccl/headers/include/thrust/detail/config/device_system.h +55 -0
  1156. cuda/cccl/headers/include/thrust/detail/config/host_system.h +48 -0
  1157. cuda/cccl/headers/include/thrust/detail/config/memory_resource.h +41 -0
  1158. cuda/cccl/headers/include/thrust/detail/config/namespace.h +162 -0
  1159. cuda/cccl/headers/include/thrust/detail/config/simple_defines.h +48 -0
  1160. cuda/cccl/headers/include/thrust/detail/config.h +36 -0
  1161. cuda/cccl/headers/include/thrust/detail/contiguous_storage.h +228 -0
  1162. cuda/cccl/headers/include/thrust/detail/contiguous_storage.inl +273 -0
  1163. cuda/cccl/headers/include/thrust/detail/copy.h +72 -0
  1164. cuda/cccl/headers/include/thrust/detail/copy.inl +129 -0
  1165. cuda/cccl/headers/include/thrust/detail/copy_if.h +62 -0
  1166. cuda/cccl/headers/include/thrust/detail/copy_if.inl +102 -0
  1167. cuda/cccl/headers/include/thrust/detail/count.h +55 -0
  1168. cuda/cccl/headers/include/thrust/detail/count.inl +89 -0
  1169. cuda/cccl/headers/include/thrust/detail/device_delete.inl +52 -0
  1170. cuda/cccl/headers/include/thrust/detail/device_free.inl +47 -0
  1171. cuda/cccl/headers/include/thrust/detail/device_malloc.inl +60 -0
  1172. cuda/cccl/headers/include/thrust/detail/device_new.inl +61 -0
  1173. cuda/cccl/headers/include/thrust/detail/device_ptr.inl +48 -0
  1174. cuda/cccl/headers/include/thrust/detail/equal.inl +93 -0
  1175. cuda/cccl/headers/include/thrust/detail/event_error.h +160 -0
  1176. cuda/cccl/headers/include/thrust/detail/execute_with_allocator.h +80 -0
  1177. cuda/cccl/headers/include/thrust/detail/execute_with_allocator_fwd.h +61 -0
  1178. cuda/cccl/headers/include/thrust/detail/execution_policy.h +80 -0
  1179. cuda/cccl/headers/include/thrust/detail/extrema.inl +184 -0
  1180. cuda/cccl/headers/include/thrust/detail/fill.inl +86 -0
  1181. cuda/cccl/headers/include/thrust/detail/find.inl +113 -0
  1182. cuda/cccl/headers/include/thrust/detail/for_each.inl +84 -0
  1183. cuda/cccl/headers/include/thrust/detail/function.h +49 -0
  1184. cuda/cccl/headers/include/thrust/detail/functional/actor.h +214 -0
  1185. cuda/cccl/headers/include/thrust/detail/functional/operators.h +386 -0
  1186. cuda/cccl/headers/include/thrust/detail/gather.inl +173 -0
  1187. cuda/cccl/headers/include/thrust/detail/generate.inl +86 -0
  1188. cuda/cccl/headers/include/thrust/detail/get_iterator_value.h +62 -0
  1189. cuda/cccl/headers/include/thrust/detail/inner_product.inl +118 -0
  1190. cuda/cccl/headers/include/thrust/detail/integer_math.h +130 -0
  1191. cuda/cccl/headers/include/thrust/detail/internal_functional.h +289 -0
  1192. cuda/cccl/headers/include/thrust/detail/logical.inl +113 -0
  1193. cuda/cccl/headers/include/thrust/detail/malloc_and_free.h +77 -0
  1194. cuda/cccl/headers/include/thrust/detail/malloc_and_free_fwd.h +45 -0
  1195. cuda/cccl/headers/include/thrust/detail/memory_algorithms.h +209 -0
  1196. cuda/cccl/headers/include/thrust/detail/memory_wrapper.h +40 -0
  1197. cuda/cccl/headers/include/thrust/detail/merge.inl +276 -0
  1198. cuda/cccl/headers/include/thrust/detail/mismatch.inl +94 -0
  1199. cuda/cccl/headers/include/thrust/detail/mpl/math.h +164 -0
  1200. cuda/cccl/headers/include/thrust/detail/numeric_wrapper.h +37 -0
  1201. cuda/cccl/headers/include/thrust/detail/overlapped_copy.h +124 -0
  1202. cuda/cccl/headers/include/thrust/detail/partition.inl +378 -0
  1203. cuda/cccl/headers/include/thrust/detail/pointer.h +217 -0
  1204. cuda/cccl/headers/include/thrust/detail/pointer.inl +172 -0
  1205. cuda/cccl/headers/include/thrust/detail/preprocessor.h +652 -0
  1206. cuda/cccl/headers/include/thrust/detail/random_bijection.h +177 -0
  1207. cuda/cccl/headers/include/thrust/detail/range/head_flags.h +116 -0
  1208. cuda/cccl/headers/include/thrust/detail/range/tail_flags.h +130 -0
  1209. cuda/cccl/headers/include/thrust/detail/raw_pointer_cast.h +52 -0
  1210. cuda/cccl/headers/include/thrust/detail/raw_reference_cast.h +189 -0
  1211. cuda/cccl/headers/include/thrust/detail/reduce.inl +377 -0
  1212. cuda/cccl/headers/include/thrust/detail/reference.h +500 -0
  1213. cuda/cccl/headers/include/thrust/detail/reference_forward_declaration.h +35 -0
  1214. cuda/cccl/headers/include/thrust/detail/remove.inl +213 -0
  1215. cuda/cccl/headers/include/thrust/detail/replace.inl +231 -0
  1216. cuda/cccl/headers/include/thrust/detail/reverse.inl +88 -0
  1217. cuda/cccl/headers/include/thrust/detail/scan.inl +518 -0
  1218. cuda/cccl/headers/include/thrust/detail/scatter.inl +157 -0
  1219. cuda/cccl/headers/include/thrust/detail/seq.h +54 -0
  1220. cuda/cccl/headers/include/thrust/detail/sequence.inl +109 -0
  1221. cuda/cccl/headers/include/thrust/detail/set_operations.inl +981 -0
  1222. cuda/cccl/headers/include/thrust/detail/shuffle.inl +86 -0
  1223. cuda/cccl/headers/include/thrust/detail/sort.inl +373 -0
  1224. cuda/cccl/headers/include/thrust/detail/static_assert.h +58 -0
  1225. cuda/cccl/headers/include/thrust/detail/static_map.h +167 -0
  1226. cuda/cccl/headers/include/thrust/detail/swap_ranges.inl +65 -0
  1227. cuda/cccl/headers/include/thrust/detail/tabulate.inl +62 -0
  1228. cuda/cccl/headers/include/thrust/detail/temporary_array.h +153 -0
  1229. cuda/cccl/headers/include/thrust/detail/temporary_array.inl +120 -0
  1230. cuda/cccl/headers/include/thrust/detail/temporary_buffer.h +81 -0
  1231. cuda/cccl/headers/include/thrust/detail/transform_reduce.inl +69 -0
  1232. cuda/cccl/headers/include/thrust/detail/transform_scan.inl +161 -0
  1233. cuda/cccl/headers/include/thrust/detail/trivial_sequence.h +130 -0
  1234. cuda/cccl/headers/include/thrust/detail/tuple_meta_transform.h +61 -0
  1235. cuda/cccl/headers/include/thrust/detail/type_deduction.h +62 -0
  1236. cuda/cccl/headers/include/thrust/detail/type_traits/has_member_function.h +47 -0
  1237. cuda/cccl/headers/include/thrust/detail/type_traits/has_nested_type.h +43 -0
  1238. cuda/cccl/headers/include/thrust/detail/type_traits/is_call_possible.h +167 -0
  1239. cuda/cccl/headers/include/thrust/detail/type_traits/is_commutative.h +69 -0
  1240. cuda/cccl/headers/include/thrust/detail/type_traits/is_metafunction_defined.h +39 -0
  1241. cuda/cccl/headers/include/thrust/detail/type_traits/is_thrust_pointer.h +59 -0
  1242. cuda/cccl/headers/include/thrust/detail/type_traits/iterator/is_discard_iterator.h +44 -0
  1243. cuda/cccl/headers/include/thrust/detail/type_traits/iterator/is_output_iterator.h +46 -0
  1244. cuda/cccl/headers/include/thrust/detail/type_traits/minimum_type.h +89 -0
  1245. cuda/cccl/headers/include/thrust/detail/type_traits/pointer_traits.h +332 -0
  1246. cuda/cccl/headers/include/thrust/detail/type_traits.h +136 -0
  1247. cuda/cccl/headers/include/thrust/detail/uninitialized_copy.inl +90 -0
  1248. cuda/cccl/headers/include/thrust/detail/uninitialized_fill.inl +86 -0
  1249. cuda/cccl/headers/include/thrust/detail/unique.inl +373 -0
  1250. cuda/cccl/headers/include/thrust/detail/use_default.h +34 -0
  1251. cuda/cccl/headers/include/thrust/detail/util/align.h +59 -0
  1252. cuda/cccl/headers/include/thrust/detail/vector_base.h +615 -0
  1253. cuda/cccl/headers/include/thrust/detail/vector_base.inl +1212 -0
  1254. cuda/cccl/headers/include/thrust/device_allocator.h +134 -0
  1255. cuda/cccl/headers/include/thrust/device_delete.h +59 -0
  1256. cuda/cccl/headers/include/thrust/device_free.h +72 -0
  1257. cuda/cccl/headers/include/thrust/device_make_unique.h +56 -0
  1258. cuda/cccl/headers/include/thrust/device_malloc.h +108 -0
  1259. cuda/cccl/headers/include/thrust/device_malloc_allocator.h +190 -0
  1260. cuda/cccl/headers/include/thrust/device_new.h +91 -0
  1261. cuda/cccl/headers/include/thrust/device_new_allocator.h +179 -0
  1262. cuda/cccl/headers/include/thrust/device_ptr.h +202 -0
  1263. cuda/cccl/headers/include/thrust/device_reference.h +986 -0
  1264. cuda/cccl/headers/include/thrust/device_vector.h +574 -0
  1265. cuda/cccl/headers/include/thrust/distance.h +43 -0
  1266. cuda/cccl/headers/include/thrust/equal.h +247 -0
  1267. cuda/cccl/headers/include/thrust/execution_policy.h +384 -0
  1268. cuda/cccl/headers/include/thrust/extrema.h +657 -0
  1269. cuda/cccl/headers/include/thrust/fill.h +201 -0
  1270. cuda/cccl/headers/include/thrust/find.h +382 -0
  1271. cuda/cccl/headers/include/thrust/for_each.h +261 -0
  1272. cuda/cccl/headers/include/thrust/functional.h +396 -0
  1273. cuda/cccl/headers/include/thrust/gather.h +464 -0
  1274. cuda/cccl/headers/include/thrust/generate.h +193 -0
  1275. cuda/cccl/headers/include/thrust/host_vector.h +576 -0
  1276. cuda/cccl/headers/include/thrust/inner_product.h +264 -0
  1277. cuda/cccl/headers/include/thrust/iterator/constant_iterator.h +219 -0
  1278. cuda/cccl/headers/include/thrust/iterator/counting_iterator.h +335 -0
  1279. cuda/cccl/headers/include/thrust/iterator/detail/any_assign.h +48 -0
  1280. cuda/cccl/headers/include/thrust/iterator/detail/any_system_tag.h +43 -0
  1281. cuda/cccl/headers/include/thrust/iterator/detail/device_system_tag.h +38 -0
  1282. cuda/cccl/headers/include/thrust/iterator/detail/host_system_tag.h +38 -0
  1283. cuda/cccl/headers/include/thrust/iterator/detail/iterator_adaptor_base.h +81 -0
  1284. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_system.h +51 -0
  1285. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_traversal.h +62 -0
  1286. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h +57 -0
  1287. cuda/cccl/headers/include/thrust/iterator/detail/iterator_facade_category.h +199 -0
  1288. cuda/cccl/headers/include/thrust/iterator/detail/iterator_traversal_tags.h +50 -0
  1289. cuda/cccl/headers/include/thrust/iterator/detail/minimum_system.h +53 -0
  1290. cuda/cccl/headers/include/thrust/iterator/detail/normal_iterator.h +69 -0
  1291. cuda/cccl/headers/include/thrust/iterator/detail/retag.h +104 -0
  1292. cuda/cccl/headers/include/thrust/iterator/detail/tagged_iterator.h +81 -0
  1293. cuda/cccl/headers/include/thrust/iterator/detail/tuple_of_iterator_references.h +174 -0
  1294. cuda/cccl/headers/include/thrust/iterator/discard_iterator.h +163 -0
  1295. cuda/cccl/headers/include/thrust/iterator/iterator_adaptor.h +251 -0
  1296. cuda/cccl/headers/include/thrust/iterator/iterator_categories.h +215 -0
  1297. cuda/cccl/headers/include/thrust/iterator/iterator_facade.h +660 -0
  1298. cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +275 -0
  1299. cuda/cccl/headers/include/thrust/iterator/offset_iterator.h +192 -0
  1300. cuda/cccl/headers/include/thrust/iterator/permutation_iterator.h +204 -0
  1301. cuda/cccl/headers/include/thrust/iterator/retag.h +74 -0
  1302. cuda/cccl/headers/include/thrust/iterator/reverse_iterator.h +221 -0
  1303. cuda/cccl/headers/include/thrust/iterator/shuffle_iterator.h +184 -0
  1304. cuda/cccl/headers/include/thrust/iterator/strided_iterator.h +152 -0
  1305. cuda/cccl/headers/include/thrust/iterator/tabulate_output_iterator.h +149 -0
  1306. cuda/cccl/headers/include/thrust/iterator/transform_input_output_iterator.h +226 -0
  1307. cuda/cccl/headers/include/thrust/iterator/transform_iterator.h +351 -0
  1308. cuda/cccl/headers/include/thrust/iterator/transform_output_iterator.h +190 -0
  1309. cuda/cccl/headers/include/thrust/iterator/zip_iterator.h +357 -0
  1310. cuda/cccl/headers/include/thrust/logical.h +290 -0
  1311. cuda/cccl/headers/include/thrust/memory.h +395 -0
  1312. cuda/cccl/headers/include/thrust/merge.h +725 -0
  1313. cuda/cccl/headers/include/thrust/mismatch.h +261 -0
  1314. cuda/cccl/headers/include/thrust/mr/allocator.h +229 -0
  1315. cuda/cccl/headers/include/thrust/mr/device_memory_resource.h +41 -0
  1316. cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +526 -0
  1317. cuda/cccl/headers/include/thrust/mr/disjoint_sync_pool.h +118 -0
  1318. cuda/cccl/headers/include/thrust/mr/disjoint_tls_pool.h +68 -0
  1319. cuda/cccl/headers/include/thrust/mr/fancy_pointer_resource.h +67 -0
  1320. cuda/cccl/headers/include/thrust/mr/host_memory_resource.h +38 -0
  1321. cuda/cccl/headers/include/thrust/mr/memory_resource.h +217 -0
  1322. cuda/cccl/headers/include/thrust/mr/new.h +100 -0
  1323. cuda/cccl/headers/include/thrust/mr/polymorphic_adaptor.h +63 -0
  1324. cuda/cccl/headers/include/thrust/mr/pool.h +526 -0
  1325. cuda/cccl/headers/include/thrust/mr/pool_options.h +174 -0
  1326. cuda/cccl/headers/include/thrust/mr/sync_pool.h +114 -0
  1327. cuda/cccl/headers/include/thrust/mr/tls_pool.h +65 -0
  1328. cuda/cccl/headers/include/thrust/mr/universal_memory_resource.h +29 -0
  1329. cuda/cccl/headers/include/thrust/mr/validator.h +56 -0
  1330. cuda/cccl/headers/include/thrust/pair.h +102 -0
  1331. cuda/cccl/headers/include/thrust/partition.h +1383 -0
  1332. cuda/cccl/headers/include/thrust/per_device_resource.h +98 -0
  1333. cuda/cccl/headers/include/thrust/random/detail/discard_block_engine.inl +184 -0
  1334. cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine.inl +155 -0
  1335. cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine_discard.h +104 -0
  1336. cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine.inl +151 -0
  1337. cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h +53 -0
  1338. cuda/cccl/headers/include/thrust/random/detail/mod.h +101 -0
  1339. cuda/cccl/headers/include/thrust/random/detail/normal_distribution.inl +187 -0
  1340. cuda/cccl/headers/include/thrust/random/detail/normal_distribution_base.h +160 -0
  1341. cuda/cccl/headers/include/thrust/random/detail/random_core_access.h +63 -0
  1342. cuda/cccl/headers/include/thrust/random/detail/subtract_with_carry_engine.inl +201 -0
  1343. cuda/cccl/headers/include/thrust/random/detail/uniform_int_distribution.inl +198 -0
  1344. cuda/cccl/headers/include/thrust/random/detail/uniform_real_distribution.inl +198 -0
  1345. cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine.inl +183 -0
  1346. cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine_max.h +217 -0
  1347. cuda/cccl/headers/include/thrust/random/discard_block_engine.h +240 -0
  1348. cuda/cccl/headers/include/thrust/random/linear_congruential_engine.h +289 -0
  1349. cuda/cccl/headers/include/thrust/random/linear_feedback_shift_engine.h +217 -0
  1350. cuda/cccl/headers/include/thrust/random/normal_distribution.h +257 -0
  1351. cuda/cccl/headers/include/thrust/random/subtract_with_carry_engine.h +247 -0
  1352. cuda/cccl/headers/include/thrust/random/uniform_int_distribution.h +261 -0
  1353. cuda/cccl/headers/include/thrust/random/uniform_real_distribution.h +258 -0
  1354. cuda/cccl/headers/include/thrust/random/xor_combine_engine.h +255 -0
  1355. cuda/cccl/headers/include/thrust/random.h +120 -0
  1356. cuda/cccl/headers/include/thrust/reduce.h +1112 -0
  1357. cuda/cccl/headers/include/thrust/remove.h +768 -0
  1358. cuda/cccl/headers/include/thrust/replace.h +827 -0
  1359. cuda/cccl/headers/include/thrust/reverse.h +213 -0
  1360. cuda/cccl/headers/include/thrust/scan.h +1671 -0
  1361. cuda/cccl/headers/include/thrust/scatter.h +446 -0
  1362. cuda/cccl/headers/include/thrust/sequence.h +277 -0
  1363. cuda/cccl/headers/include/thrust/set_operations.h +3026 -0
  1364. cuda/cccl/headers/include/thrust/shuffle.h +182 -0
  1365. cuda/cccl/headers/include/thrust/sort.h +1320 -0
  1366. cuda/cccl/headers/include/thrust/swap.h +147 -0
  1367. cuda/cccl/headers/include/thrust/system/cpp/detail/adjacent_difference.h +30 -0
  1368. cuda/cccl/headers/include/thrust/system/cpp/detail/assign_value.h +30 -0
  1369. cuda/cccl/headers/include/thrust/system/cpp/detail/binary_search.h +32 -0
  1370. cuda/cccl/headers/include/thrust/system/cpp/detail/copy.h +30 -0
  1371. cuda/cccl/headers/include/thrust/system/cpp/detail/copy_if.h +30 -0
  1372. cuda/cccl/headers/include/thrust/system/cpp/detail/count.h +29 -0
  1373. cuda/cccl/headers/include/thrust/system/cpp/detail/equal.h +29 -0
  1374. cuda/cccl/headers/include/thrust/system/cpp/detail/execution_policy.h +90 -0
  1375. cuda/cccl/headers/include/thrust/system/cpp/detail/extrema.h +30 -0
  1376. cuda/cccl/headers/include/thrust/system/cpp/detail/fill.h +29 -0
  1377. cuda/cccl/headers/include/thrust/system/cpp/detail/find.h +30 -0
  1378. cuda/cccl/headers/include/thrust/system/cpp/detail/for_each.h +30 -0
  1379. cuda/cccl/headers/include/thrust/system/cpp/detail/gather.h +29 -0
  1380. cuda/cccl/headers/include/thrust/system/cpp/detail/generate.h +29 -0
  1381. cuda/cccl/headers/include/thrust/system/cpp/detail/get_value.h +30 -0
  1382. cuda/cccl/headers/include/thrust/system/cpp/detail/inner_product.h +29 -0
  1383. cuda/cccl/headers/include/thrust/system/cpp/detail/iter_swap.h +30 -0
  1384. cuda/cccl/headers/include/thrust/system/cpp/detail/logical.h +29 -0
  1385. cuda/cccl/headers/include/thrust/system/cpp/detail/malloc_and_free.h +30 -0
  1386. cuda/cccl/headers/include/thrust/system/cpp/detail/memory.inl +60 -0
  1387. cuda/cccl/headers/include/thrust/system/cpp/detail/merge.h +30 -0
  1388. cuda/cccl/headers/include/thrust/system/cpp/detail/mismatch.h +29 -0
  1389. cuda/cccl/headers/include/thrust/system/cpp/detail/par.h +62 -0
  1390. cuda/cccl/headers/include/thrust/system/cpp/detail/partition.h +30 -0
  1391. cuda/cccl/headers/include/thrust/system/cpp/detail/per_device_resource.h +29 -0
  1392. cuda/cccl/headers/include/thrust/system/cpp/detail/reduce.h +30 -0
  1393. cuda/cccl/headers/include/thrust/system/cpp/detail/reduce_by_key.h +30 -0
  1394. cuda/cccl/headers/include/thrust/system/cpp/detail/remove.h +30 -0
  1395. cuda/cccl/headers/include/thrust/system/cpp/detail/replace.h +29 -0
  1396. cuda/cccl/headers/include/thrust/system/cpp/detail/reverse.h +29 -0
  1397. cuda/cccl/headers/include/thrust/system/cpp/detail/scan.h +30 -0
  1398. cuda/cccl/headers/include/thrust/system/cpp/detail/scan_by_key.h +30 -0
  1399. cuda/cccl/headers/include/thrust/system/cpp/detail/scatter.h +29 -0
  1400. cuda/cccl/headers/include/thrust/system/cpp/detail/sequence.h +29 -0
  1401. cuda/cccl/headers/include/thrust/system/cpp/detail/set_operations.h +30 -0
  1402. cuda/cccl/headers/include/thrust/system/cpp/detail/sort.h +30 -0
  1403. cuda/cccl/headers/include/thrust/system/cpp/detail/swap_ranges.h +29 -0
  1404. cuda/cccl/headers/include/thrust/system/cpp/detail/tabulate.h +29 -0
  1405. cuda/cccl/headers/include/thrust/system/cpp/detail/temporary_buffer.h +29 -0
  1406. cuda/cccl/headers/include/thrust/system/cpp/detail/transform.h +29 -0
  1407. cuda/cccl/headers/include/thrust/system/cpp/detail/transform_reduce.h +29 -0
  1408. cuda/cccl/headers/include/thrust/system/cpp/detail/transform_scan.h +29 -0
  1409. cuda/cccl/headers/include/thrust/system/cpp/detail/uninitialized_copy.h +29 -0
  1410. cuda/cccl/headers/include/thrust/system/cpp/detail/uninitialized_fill.h +29 -0
  1411. cuda/cccl/headers/include/thrust/system/cpp/detail/unique.h +30 -0
  1412. cuda/cccl/headers/include/thrust/system/cpp/detail/unique_by_key.h +30 -0
  1413. cuda/cccl/headers/include/thrust/system/cpp/detail/vector.inl +130 -0
  1414. cuda/cccl/headers/include/thrust/system/cpp/execution_policy.h +161 -0
  1415. cuda/cccl/headers/include/thrust/system/cpp/memory.h +109 -0
  1416. cuda/cccl/headers/include/thrust/system/cpp/memory_resource.h +75 -0
  1417. cuda/cccl/headers/include/thrust/system/cpp/pointer.h +123 -0
  1418. cuda/cccl/headers/include/thrust/system/cpp/vector.h +99 -0
  1419. cuda/cccl/headers/include/thrust/system/cuda/config.h +123 -0
  1420. cuda/cccl/headers/include/thrust/system/cuda/detail/adjacent_difference.h +219 -0
  1421. cuda/cccl/headers/include/thrust/system/cuda/detail/assign_value.h +124 -0
  1422. cuda/cccl/headers/include/thrust/system/cuda/detail/binary_search.h +29 -0
  1423. cuda/cccl/headers/include/thrust/system/cuda/detail/cdp_dispatch.h +72 -0
  1424. cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +129 -0
  1425. cuda/cccl/headers/include/thrust/system/cuda/detail/copy_if.h +255 -0
  1426. cuda/cccl/headers/include/thrust/system/cuda/detail/core/agent_launcher.h +289 -0
  1427. cuda/cccl/headers/include/thrust/system/cuda/detail/core/load_iterator.h +58 -0
  1428. cuda/cccl/headers/include/thrust/system/cuda/detail/core/make_load_iterator.h +53 -0
  1429. cuda/cccl/headers/include/thrust/system/cuda/detail/core/triple_chevron_launch.h +191 -0
  1430. cuda/cccl/headers/include/thrust/system/cuda/detail/core/util.h +611 -0
  1431. cuda/cccl/headers/include/thrust/system/cuda/detail/count.h +75 -0
  1432. cuda/cccl/headers/include/thrust/system/cuda/detail/cross_system.h +243 -0
  1433. cuda/cccl/headers/include/thrust/system/cuda/detail/dispatch.h +210 -0
  1434. cuda/cccl/headers/include/thrust/system/cuda/detail/equal.h +64 -0
  1435. cuda/cccl/headers/include/thrust/system/cuda/detail/error.inl +96 -0
  1436. cuda/cccl/headers/include/thrust/system/cuda/detail/execution_policy.h +113 -0
  1437. cuda/cccl/headers/include/thrust/system/cuda/detail/extrema.h +476 -0
  1438. cuda/cccl/headers/include/thrust/system/cuda/detail/fill.h +82 -0
  1439. cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +272 -0
  1440. cuda/cccl/headers/include/thrust/system/cuda/detail/for_each.h +83 -0
  1441. cuda/cccl/headers/include/thrust/system/cuda/detail/gather.h +91 -0
  1442. cuda/cccl/headers/include/thrust/system/cuda/detail/generate.h +85 -0
  1443. cuda/cccl/headers/include/thrust/system/cuda/detail/get_value.h +65 -0
  1444. cuda/cccl/headers/include/thrust/system/cuda/detail/inner_product.h +75 -0
  1445. cuda/cccl/headers/include/thrust/system/cuda/detail/internal/copy_cross_system.h +204 -0
  1446. cuda/cccl/headers/include/thrust/system/cuda/detail/internal/copy_device_to_device.h +89 -0
  1447. cuda/cccl/headers/include/thrust/system/cuda/detail/iter_swap.h +69 -0
  1448. cuda/cccl/headers/include/thrust/system/cuda/detail/logical.h +29 -0
  1449. cuda/cccl/headers/include/thrust/system/cuda/detail/make_unsigned_special.h +61 -0
  1450. cuda/cccl/headers/include/thrust/system/cuda/detail/malloc_and_free.h +121 -0
  1451. cuda/cccl/headers/include/thrust/system/cuda/detail/memory.inl +57 -0
  1452. cuda/cccl/headers/include/thrust/system/cuda/detail/merge.h +228 -0
  1453. cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +217 -0
  1454. cuda/cccl/headers/include/thrust/system/cuda/detail/par.h +237 -0
  1455. cuda/cccl/headers/include/thrust/system/cuda/detail/par_to_seq.h +95 -0
  1456. cuda/cccl/headers/include/thrust/system/cuda/detail/parallel_for.h +81 -0
  1457. cuda/cccl/headers/include/thrust/system/cuda/detail/partition.h +405 -0
  1458. cuda/cccl/headers/include/thrust/system/cuda/detail/per_device_resource.h +72 -0
  1459. cuda/cccl/headers/include/thrust/system/cuda/detail/reduce.h +781 -0
  1460. cuda/cccl/headers/include/thrust/system/cuda/detail/reduce_by_key.h +1000 -0
  1461. cuda/cccl/headers/include/thrust/system/cuda/detail/remove.h +107 -0
  1462. cuda/cccl/headers/include/thrust/system/cuda/detail/replace.h +152 -0
  1463. cuda/cccl/headers/include/thrust/system/cuda/detail/reverse.h +88 -0
  1464. cuda/cccl/headers/include/thrust/system/cuda/detail/scan.h +342 -0
  1465. cuda/cccl/headers/include/thrust/system/cuda/detail/scan_by_key.h +415 -0
  1466. cuda/cccl/headers/include/thrust/system/cuda/detail/scatter.h +79 -0
  1467. cuda/cccl/headers/include/thrust/system/cuda/detail/sequence.h +29 -0
  1468. cuda/cccl/headers/include/thrust/system/cuda/detail/set_operations.h +1736 -0
  1469. cuda/cccl/headers/include/thrust/system/cuda/detail/sort.h +482 -0
  1470. cuda/cccl/headers/include/thrust/system/cuda/detail/swap_ranges.h +75 -0
  1471. cuda/cccl/headers/include/thrust/system/cuda/detail/tabulate.h +75 -0
  1472. cuda/cccl/headers/include/thrust/system/cuda/detail/temporary_buffer.h +132 -0
  1473. cuda/cccl/headers/include/thrust/system/cuda/detail/terminate.h +53 -0
  1474. cuda/cccl/headers/include/thrust/system/cuda/detail/transform.h +403 -0
  1475. cuda/cccl/headers/include/thrust/system/cuda/detail/transform_reduce.h +143 -0
  1476. cuda/cccl/headers/include/thrust/system/cuda/detail/transform_scan.h +119 -0
  1477. cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_copy.h +94 -0
  1478. cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_fill.h +91 -0
  1479. cuda/cccl/headers/include/thrust/system/cuda/detail/unique.h +646 -0
  1480. cuda/cccl/headers/include/thrust/system/cuda/detail/unique_by_key.h +311 -0
  1481. cuda/cccl/headers/include/thrust/system/cuda/detail/util.h +251 -0
  1482. cuda/cccl/headers/include/thrust/system/cuda/error.h +168 -0
  1483. cuda/cccl/headers/include/thrust/system/cuda/execution_policy.h +39 -0
  1484. cuda/cccl/headers/include/thrust/system/cuda/memory.h +122 -0
  1485. cuda/cccl/headers/include/thrust/system/cuda/memory_resource.h +122 -0
  1486. cuda/cccl/headers/include/thrust/system/cuda/pointer.h +160 -0
  1487. cuda/cccl/headers/include/thrust/system/cuda/vector.h +108 -0
  1488. cuda/cccl/headers/include/thrust/system/detail/adl/adjacent_difference.h +51 -0
  1489. cuda/cccl/headers/include/thrust/system/detail/adl/assign_value.h +51 -0
  1490. cuda/cccl/headers/include/thrust/system/detail/adl/binary_search.h +51 -0
  1491. cuda/cccl/headers/include/thrust/system/detail/adl/copy.h +51 -0
  1492. cuda/cccl/headers/include/thrust/system/detail/adl/copy_if.h +52 -0
  1493. cuda/cccl/headers/include/thrust/system/detail/adl/count.h +51 -0
  1494. cuda/cccl/headers/include/thrust/system/detail/adl/equal.h +51 -0
  1495. cuda/cccl/headers/include/thrust/system/detail/adl/extrema.h +51 -0
  1496. cuda/cccl/headers/include/thrust/system/detail/adl/fill.h +51 -0
  1497. cuda/cccl/headers/include/thrust/system/detail/adl/find.h +51 -0
  1498. cuda/cccl/headers/include/thrust/system/detail/adl/for_each.h +51 -0
  1499. cuda/cccl/headers/include/thrust/system/detail/adl/gather.h +51 -0
  1500. cuda/cccl/headers/include/thrust/system/detail/adl/generate.h +51 -0
  1501. cuda/cccl/headers/include/thrust/system/detail/adl/get_value.h +51 -0
  1502. cuda/cccl/headers/include/thrust/system/detail/adl/inner_product.h +51 -0
  1503. cuda/cccl/headers/include/thrust/system/detail/adl/iter_swap.h +51 -0
  1504. cuda/cccl/headers/include/thrust/system/detail/adl/logical.h +51 -0
  1505. cuda/cccl/headers/include/thrust/system/detail/adl/malloc_and_free.h +51 -0
  1506. cuda/cccl/headers/include/thrust/system/detail/adl/merge.h +51 -0
  1507. cuda/cccl/headers/include/thrust/system/detail/adl/mismatch.h +51 -0
  1508. cuda/cccl/headers/include/thrust/system/detail/adl/partition.h +51 -0
  1509. cuda/cccl/headers/include/thrust/system/detail/adl/per_device_resource.h +48 -0
  1510. cuda/cccl/headers/include/thrust/system/detail/adl/reduce.h +51 -0
  1511. cuda/cccl/headers/include/thrust/system/detail/adl/reduce_by_key.h +51 -0
  1512. cuda/cccl/headers/include/thrust/system/detail/adl/remove.h +51 -0
  1513. cuda/cccl/headers/include/thrust/system/detail/adl/replace.h +51 -0
  1514. cuda/cccl/headers/include/thrust/system/detail/adl/reverse.h +51 -0
  1515. cuda/cccl/headers/include/thrust/system/detail/adl/scan.h +51 -0
  1516. cuda/cccl/headers/include/thrust/system/detail/adl/scan_by_key.h +51 -0
  1517. cuda/cccl/headers/include/thrust/system/detail/adl/scatter.h +51 -0
  1518. cuda/cccl/headers/include/thrust/system/detail/adl/sequence.h +51 -0
  1519. cuda/cccl/headers/include/thrust/system/detail/adl/set_operations.h +51 -0
  1520. cuda/cccl/headers/include/thrust/system/detail/adl/sort.h +51 -0
  1521. cuda/cccl/headers/include/thrust/system/detail/adl/swap_ranges.h +51 -0
  1522. cuda/cccl/headers/include/thrust/system/detail/adl/tabulate.h +51 -0
  1523. cuda/cccl/headers/include/thrust/system/detail/adl/temporary_buffer.h +51 -0
  1524. cuda/cccl/headers/include/thrust/system/detail/adl/transform.h +51 -0
  1525. cuda/cccl/headers/include/thrust/system/detail/adl/transform_reduce.h +51 -0
  1526. cuda/cccl/headers/include/thrust/system/detail/adl/transform_scan.h +51 -0
  1527. cuda/cccl/headers/include/thrust/system/detail/adl/uninitialized_copy.h +51 -0
  1528. cuda/cccl/headers/include/thrust/system/detail/adl/uninitialized_fill.h +51 -0
  1529. cuda/cccl/headers/include/thrust/system/detail/adl/unique.h +51 -0
  1530. cuda/cccl/headers/include/thrust/system/detail/adl/unique_by_key.h +51 -0
  1531. cuda/cccl/headers/include/thrust/system/detail/bad_alloc.h +64 -0
  1532. cuda/cccl/headers/include/thrust/system/detail/errno.h +125 -0
  1533. cuda/cccl/headers/include/thrust/system/detail/error_category.inl +302 -0
  1534. cuda/cccl/headers/include/thrust/system/detail/error_code.inl +173 -0
  1535. cuda/cccl/headers/include/thrust/system/detail/error_condition.inl +121 -0
  1536. cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.h +59 -0
  1537. cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.inl +85 -0
  1538. cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.h +167 -0
  1539. cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.inl +391 -0
  1540. cuda/cccl/headers/include/thrust/system/detail/generic/copy.h +51 -0
  1541. cuda/cccl/headers/include/thrust/system/detail/generic/copy.inl +70 -0
  1542. cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.h +64 -0
  1543. cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.inl +152 -0
  1544. cuda/cccl/headers/include/thrust/system/detail/generic/count.h +54 -0
  1545. cuda/cccl/headers/include/thrust/system/detail/generic/count.inl +90 -0
  1546. cuda/cccl/headers/include/thrust/system/detail/generic/equal.h +55 -0
  1547. cuda/cccl/headers/include/thrust/system/detail/generic/equal.inl +66 -0
  1548. cuda/cccl/headers/include/thrust/system/detail/generic/extrema.h +72 -0
  1549. cuda/cccl/headers/include/thrust/system/detail/generic/extrema.inl +258 -0
  1550. cuda/cccl/headers/include/thrust/system/detail/generic/fill.h +60 -0
  1551. cuda/cccl/headers/include/thrust/system/detail/generic/find.h +55 -0
  1552. cuda/cccl/headers/include/thrust/system/detail/generic/find.inl +143 -0
  1553. cuda/cccl/headers/include/thrust/system/detail/generic/for_each.h +64 -0
  1554. cuda/cccl/headers/include/thrust/system/detail/generic/gather.h +79 -0
  1555. cuda/cccl/headers/include/thrust/system/detail/generic/gather.inl +102 -0
  1556. cuda/cccl/headers/include/thrust/system/detail/generic/generate.h +51 -0
  1557. cuda/cccl/headers/include/thrust/system/detail/generic/generate.inl +63 -0
  1558. cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.h +66 -0
  1559. cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.inl +78 -0
  1560. cuda/cccl/headers/include/thrust/system/detail/generic/logical.h +65 -0
  1561. cuda/cccl/headers/include/thrust/system/detail/generic/memory.h +70 -0
  1562. cuda/cccl/headers/include/thrust/system/detail/generic/memory.inl +83 -0
  1563. cuda/cccl/headers/include/thrust/system/detail/generic/merge.h +105 -0
  1564. cuda/cccl/headers/include/thrust/system/detail/generic/merge.inl +154 -0
  1565. cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.h +55 -0
  1566. cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.inl +74 -0
  1567. cuda/cccl/headers/include/thrust/system/detail/generic/partition.h +135 -0
  1568. cuda/cccl/headers/include/thrust/system/detail/generic/partition.inl +213 -0
  1569. cuda/cccl/headers/include/thrust/system/detail/generic/per_device_resource.h +49 -0
  1570. cuda/cccl/headers/include/thrust/system/detail/generic/reduce.h +77 -0
  1571. cuda/cccl/headers/include/thrust/system/detail/generic/reduce.inl +106 -0
  1572. cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.h +89 -0
  1573. cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.inl +192 -0
  1574. cuda/cccl/headers/include/thrust/system/detail/generic/remove.h +92 -0
  1575. cuda/cccl/headers/include/thrust/system/detail/generic/remove.inl +127 -0
  1576. cuda/cccl/headers/include/thrust/system/detail/generic/replace.h +101 -0
  1577. cuda/cccl/headers/include/thrust/system/detail/generic/replace.inl +181 -0
  1578. cuda/cccl/headers/include/thrust/system/detail/generic/reverse.h +54 -0
  1579. cuda/cccl/headers/include/thrust/system/detail/generic/reverse.inl +72 -0
  1580. cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.h +78 -0
  1581. cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.inl +141 -0
  1582. cuda/cccl/headers/include/thrust/system/detail/generic/scan.h +78 -0
  1583. cuda/cccl/headers/include/thrust/system/detail/generic/scan.inl +91 -0
  1584. cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.h +132 -0
  1585. cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.inl +238 -0
  1586. cuda/cccl/headers/include/thrust/system/detail/generic/scatter.h +79 -0
  1587. cuda/cccl/headers/include/thrust/system/detail/generic/scatter.inl +91 -0
  1588. cuda/cccl/headers/include/thrust/system/detail/generic/select_system.h +96 -0
  1589. cuda/cccl/headers/include/thrust/system/detail/generic/sequence.h +70 -0
  1590. cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.h +288 -0
  1591. cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.inl +482 -0
  1592. cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.h +60 -0
  1593. cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.inl +131 -0
  1594. cuda/cccl/headers/include/thrust/system/detail/generic/sort.h +119 -0
  1595. cuda/cccl/headers/include/thrust/system/detail/generic/sort.inl +181 -0
  1596. cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.h +50 -0
  1597. cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.inl +82 -0
  1598. cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.h +47 -0
  1599. cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.inl +60 -0
  1600. cuda/cccl/headers/include/thrust/system/detail/generic/tag.h +53 -0
  1601. cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.h +60 -0
  1602. cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.inl +88 -0
  1603. cuda/cccl/headers/include/thrust/system/detail/generic/transform.h +395 -0
  1604. cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.h +56 -0
  1605. cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.inl +62 -0
  1606. cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.h +86 -0
  1607. cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.inl +119 -0
  1608. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.h +51 -0
  1609. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.inl +172 -0
  1610. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.h +51 -0
  1611. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.inl +121 -0
  1612. cuda/cccl/headers/include/thrust/system/detail/generic/unique.h +77 -0
  1613. cuda/cccl/headers/include/thrust/system/detail/generic/unique.inl +119 -0
  1614. cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.h +87 -0
  1615. cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.inl +132 -0
  1616. cuda/cccl/headers/include/thrust/system/detail/internal/decompose.h +123 -0
  1617. cuda/cccl/headers/include/thrust/system/detail/sequential/adjacent_difference.h +76 -0
  1618. cuda/cccl/headers/include/thrust/system/detail/sequential/assign_value.h +48 -0
  1619. cuda/cccl/headers/include/thrust/system/detail/sequential/binary_search.h +142 -0
  1620. cuda/cccl/headers/include/thrust/system/detail/sequential/copy.h +55 -0
  1621. cuda/cccl/headers/include/thrust/system/detail/sequential/copy.inl +125 -0
  1622. cuda/cccl/headers/include/thrust/system/detail/sequential/copy_backward.h +55 -0
  1623. cuda/cccl/headers/include/thrust/system/detail/sequential/copy_if.h +77 -0
  1624. cuda/cccl/headers/include/thrust/system/detail/sequential/count.h +29 -0
  1625. cuda/cccl/headers/include/thrust/system/detail/sequential/equal.h +29 -0
  1626. cuda/cccl/headers/include/thrust/system/detail/sequential/execution_policy.h +78 -0
  1627. cuda/cccl/headers/include/thrust/system/detail/sequential/extrema.h +116 -0
  1628. cuda/cccl/headers/include/thrust/system/detail/sequential/fill.h +29 -0
  1629. cuda/cccl/headers/include/thrust/system/detail/sequential/find.h +68 -0
  1630. cuda/cccl/headers/include/thrust/system/detail/sequential/for_each.h +80 -0
  1631. cuda/cccl/headers/include/thrust/system/detail/sequential/gather.h +29 -0
  1632. cuda/cccl/headers/include/thrust/system/detail/sequential/general_copy.h +129 -0
  1633. cuda/cccl/headers/include/thrust/system/detail/sequential/generate.h +29 -0
  1634. cuda/cccl/headers/include/thrust/system/detail/sequential/get_value.h +49 -0
  1635. cuda/cccl/headers/include/thrust/system/detail/sequential/inner_product.h +29 -0
  1636. cuda/cccl/headers/include/thrust/system/detail/sequential/insertion_sort.h +147 -0
  1637. cuda/cccl/headers/include/thrust/system/detail/sequential/iter_swap.h +51 -0
  1638. cuda/cccl/headers/include/thrust/system/detail/sequential/logical.h +29 -0
  1639. cuda/cccl/headers/include/thrust/system/detail/sequential/malloc_and_free.h +56 -0
  1640. cuda/cccl/headers/include/thrust/system/detail/sequential/merge.h +81 -0
  1641. cuda/cccl/headers/include/thrust/system/detail/sequential/merge.inl +151 -0
  1642. cuda/cccl/headers/include/thrust/system/detail/sequential/mismatch.h +29 -0
  1643. cuda/cccl/headers/include/thrust/system/detail/sequential/partition.h +309 -0
  1644. cuda/cccl/headers/include/thrust/system/detail/sequential/per_device_resource.h +29 -0
  1645. cuda/cccl/headers/include/thrust/system/detail/sequential/reduce.h +70 -0
  1646. cuda/cccl/headers/include/thrust/system/detail/sequential/reduce_by_key.h +104 -0
  1647. cuda/cccl/headers/include/thrust/system/detail/sequential/remove.h +185 -0
  1648. cuda/cccl/headers/include/thrust/system/detail/sequential/replace.h +29 -0
  1649. cuda/cccl/headers/include/thrust/system/detail/sequential/reverse.h +29 -0
  1650. cuda/cccl/headers/include/thrust/system/detail/sequential/scan.h +160 -0
  1651. cuda/cccl/headers/include/thrust/system/detail/sequential/scan_by_key.h +151 -0
  1652. cuda/cccl/headers/include/thrust/system/detail/sequential/scatter.h +29 -0
  1653. cuda/cccl/headers/include/thrust/system/detail/sequential/sequence.h +29 -0
  1654. cuda/cccl/headers/include/thrust/system/detail/sequential/set_operations.h +212 -0
  1655. cuda/cccl/headers/include/thrust/system/detail/sequential/sort.h +65 -0
  1656. cuda/cccl/headers/include/thrust/system/detail/sequential/sort.inl +116 -0
  1657. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.h +61 -0
  1658. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.inl +362 -0
  1659. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.h +54 -0
  1660. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.inl +130 -0
  1661. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.h +54 -0
  1662. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.inl +592 -0
  1663. cuda/cccl/headers/include/thrust/system/detail/sequential/swap_ranges.h +29 -0
  1664. cuda/cccl/headers/include/thrust/system/detail/sequential/tabulate.h +29 -0
  1665. cuda/cccl/headers/include/thrust/system/detail/sequential/temporary_buffer.h +29 -0
  1666. cuda/cccl/headers/include/thrust/system/detail/sequential/transform.h +29 -0
  1667. cuda/cccl/headers/include/thrust/system/detail/sequential/transform_reduce.h +29 -0
  1668. cuda/cccl/headers/include/thrust/system/detail/sequential/transform_scan.h +29 -0
  1669. cuda/cccl/headers/include/thrust/system/detail/sequential/trivial_copy.h +64 -0
  1670. cuda/cccl/headers/include/thrust/system/detail/sequential/uninitialized_copy.h +29 -0
  1671. cuda/cccl/headers/include/thrust/system/detail/sequential/uninitialized_fill.h +29 -0
  1672. cuda/cccl/headers/include/thrust/system/detail/sequential/unique.h +121 -0
  1673. cuda/cccl/headers/include/thrust/system/detail/sequential/unique_by_key.h +112 -0
  1674. cuda/cccl/headers/include/thrust/system/detail/system_error.inl +108 -0
  1675. cuda/cccl/headers/include/thrust/system/error_code.h +512 -0
  1676. cuda/cccl/headers/include/thrust/system/omp/detail/adjacent_difference.h +54 -0
  1677. cuda/cccl/headers/include/thrust/system/omp/detail/assign_value.h +30 -0
  1678. cuda/cccl/headers/include/thrust/system/omp/detail/binary_search.h +77 -0
  1679. cuda/cccl/headers/include/thrust/system/omp/detail/copy.h +50 -0
  1680. cuda/cccl/headers/include/thrust/system/omp/detail/copy.inl +74 -0
  1681. cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.h +56 -0
  1682. cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.inl +59 -0
  1683. cuda/cccl/headers/include/thrust/system/omp/detail/count.h +30 -0
  1684. cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.h +50 -0
  1685. cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.inl +65 -0
  1686. cuda/cccl/headers/include/thrust/system/omp/detail/equal.h +30 -0
  1687. cuda/cccl/headers/include/thrust/system/omp/detail/execution_policy.h +113 -0
  1688. cuda/cccl/headers/include/thrust/system/omp/detail/extrema.h +66 -0
  1689. cuda/cccl/headers/include/thrust/system/omp/detail/fill.h +30 -0
  1690. cuda/cccl/headers/include/thrust/system/omp/detail/find.h +53 -0
  1691. cuda/cccl/headers/include/thrust/system/omp/detail/for_each.h +56 -0
  1692. cuda/cccl/headers/include/thrust/system/omp/detail/for_each.inl +87 -0
  1693. cuda/cccl/headers/include/thrust/system/omp/detail/gather.h +30 -0
  1694. cuda/cccl/headers/include/thrust/system/omp/detail/generate.h +30 -0
  1695. cuda/cccl/headers/include/thrust/system/omp/detail/get_value.h +30 -0
  1696. cuda/cccl/headers/include/thrust/system/omp/detail/inner_product.h +30 -0
  1697. cuda/cccl/headers/include/thrust/system/omp/detail/iter_swap.h +30 -0
  1698. cuda/cccl/headers/include/thrust/system/omp/detail/logical.h +30 -0
  1699. cuda/cccl/headers/include/thrust/system/omp/detail/malloc_and_free.h +30 -0
  1700. cuda/cccl/headers/include/thrust/system/omp/detail/memory.inl +93 -0
  1701. cuda/cccl/headers/include/thrust/system/omp/detail/merge.h +30 -0
  1702. cuda/cccl/headers/include/thrust/system/omp/detail/mismatch.h +30 -0
  1703. cuda/cccl/headers/include/thrust/system/omp/detail/par.h +62 -0
  1704. cuda/cccl/headers/include/thrust/system/omp/detail/partition.h +88 -0
  1705. cuda/cccl/headers/include/thrust/system/omp/detail/partition.inl +102 -0
  1706. cuda/cccl/headers/include/thrust/system/omp/detail/per_device_resource.h +29 -0
  1707. cuda/cccl/headers/include/thrust/system/omp/detail/pragma_omp.h +54 -0
  1708. cuda/cccl/headers/include/thrust/system/omp/detail/reduce.h +54 -0
  1709. cuda/cccl/headers/include/thrust/system/omp/detail/reduce.inl +78 -0
  1710. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.h +64 -0
  1711. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.inl +65 -0
  1712. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.h +59 -0
  1713. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.inl +103 -0
  1714. cuda/cccl/headers/include/thrust/system/omp/detail/remove.h +72 -0
  1715. cuda/cccl/headers/include/thrust/system/omp/detail/remove.inl +87 -0
  1716. cuda/cccl/headers/include/thrust/system/omp/detail/replace.h +30 -0
  1717. cuda/cccl/headers/include/thrust/system/omp/detail/reverse.h +30 -0
  1718. cuda/cccl/headers/include/thrust/system/omp/detail/scan.h +30 -0
  1719. cuda/cccl/headers/include/thrust/system/omp/detail/scan_by_key.h +30 -0
  1720. cuda/cccl/headers/include/thrust/system/omp/detail/scatter.h +30 -0
  1721. cuda/cccl/headers/include/thrust/system/omp/detail/sequence.h +30 -0
  1722. cuda/cccl/headers/include/thrust/system/omp/detail/set_operations.h +30 -0
  1723. cuda/cccl/headers/include/thrust/system/omp/detail/sort.h +60 -0
  1724. cuda/cccl/headers/include/thrust/system/omp/detail/sort.inl +265 -0
  1725. cuda/cccl/headers/include/thrust/system/omp/detail/swap_ranges.h +30 -0
  1726. cuda/cccl/headers/include/thrust/system/omp/detail/tabulate.h +30 -0
  1727. cuda/cccl/headers/include/thrust/system/omp/detail/temporary_buffer.h +29 -0
  1728. cuda/cccl/headers/include/thrust/system/omp/detail/transform.h +30 -0
  1729. cuda/cccl/headers/include/thrust/system/omp/detail/transform_reduce.h +30 -0
  1730. cuda/cccl/headers/include/thrust/system/omp/detail/transform_scan.h +30 -0
  1731. cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_copy.h +30 -0
  1732. cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_fill.h +30 -0
  1733. cuda/cccl/headers/include/thrust/system/omp/detail/unique.h +60 -0
  1734. cuda/cccl/headers/include/thrust/system/omp/detail/unique.inl +71 -0
  1735. cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.h +67 -0
  1736. cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.inl +75 -0
  1737. cuda/cccl/headers/include/thrust/system/omp/execution_policy.h +160 -0
  1738. cuda/cccl/headers/include/thrust/system/omp/memory.h +111 -0
  1739. cuda/cccl/headers/include/thrust/system/omp/memory_resource.h +75 -0
  1740. cuda/cccl/headers/include/thrust/system/omp/pointer.h +124 -0
  1741. cuda/cccl/headers/include/thrust/system/omp/vector.h +99 -0
  1742. cuda/cccl/headers/include/thrust/system/system_error.h +184 -0
  1743. cuda/cccl/headers/include/thrust/system/tbb/detail/adjacent_difference.h +54 -0
  1744. cuda/cccl/headers/include/thrust/system/tbb/detail/assign_value.h +30 -0
  1745. cuda/cccl/headers/include/thrust/system/tbb/detail/binary_search.h +30 -0
  1746. cuda/cccl/headers/include/thrust/system/tbb/detail/copy.h +50 -0
  1747. cuda/cccl/headers/include/thrust/system/tbb/detail/copy.inl +73 -0
  1748. cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.h +47 -0
  1749. cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.inl +136 -0
  1750. cuda/cccl/headers/include/thrust/system/tbb/detail/count.h +30 -0
  1751. cuda/cccl/headers/include/thrust/system/tbb/detail/equal.h +30 -0
  1752. cuda/cccl/headers/include/thrust/system/tbb/detail/execution_policy.h +92 -0
  1753. cuda/cccl/headers/include/thrust/system/tbb/detail/extrema.h +66 -0
  1754. cuda/cccl/headers/include/thrust/system/tbb/detail/fill.h +30 -0
  1755. cuda/cccl/headers/include/thrust/system/tbb/detail/find.h +49 -0
  1756. cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.h +51 -0
  1757. cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.inl +91 -0
  1758. cuda/cccl/headers/include/thrust/system/tbb/detail/gather.h +30 -0
  1759. cuda/cccl/headers/include/thrust/system/tbb/detail/generate.h +30 -0
  1760. cuda/cccl/headers/include/thrust/system/tbb/detail/get_value.h +30 -0
  1761. cuda/cccl/headers/include/thrust/system/tbb/detail/inner_product.h +30 -0
  1762. cuda/cccl/headers/include/thrust/system/tbb/detail/iter_swap.h +30 -0
  1763. cuda/cccl/headers/include/thrust/system/tbb/detail/logical.h +30 -0
  1764. cuda/cccl/headers/include/thrust/system/tbb/detail/malloc_and_free.h +30 -0
  1765. cuda/cccl/headers/include/thrust/system/tbb/detail/memory.inl +94 -0
  1766. cuda/cccl/headers/include/thrust/system/tbb/detail/merge.h +77 -0
  1767. cuda/cccl/headers/include/thrust/system/tbb/detail/merge.inl +327 -0
  1768. cuda/cccl/headers/include/thrust/system/tbb/detail/mismatch.h +30 -0
  1769. cuda/cccl/headers/include/thrust/system/tbb/detail/par.h +62 -0
  1770. cuda/cccl/headers/include/thrust/system/tbb/detail/partition.h +84 -0
  1771. cuda/cccl/headers/include/thrust/system/tbb/detail/partition.inl +98 -0
  1772. cuda/cccl/headers/include/thrust/system/tbb/detail/per_device_resource.h +29 -0
  1773. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.h +54 -0
  1774. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.inl +137 -0
  1775. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.h +61 -0
  1776. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.inl +400 -0
  1777. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_intervals.h +140 -0
  1778. cuda/cccl/headers/include/thrust/system/tbb/detail/remove.h +76 -0
  1779. cuda/cccl/headers/include/thrust/system/tbb/detail/remove.inl +87 -0
  1780. cuda/cccl/headers/include/thrust/system/tbb/detail/replace.h +30 -0
  1781. cuda/cccl/headers/include/thrust/system/tbb/detail/reverse.h +30 -0
  1782. cuda/cccl/headers/include/thrust/system/tbb/detail/scan.h +59 -0
  1783. cuda/cccl/headers/include/thrust/system/tbb/detail/scan.inl +312 -0
  1784. cuda/cccl/headers/include/thrust/system/tbb/detail/scan_by_key.h +30 -0
  1785. cuda/cccl/headers/include/thrust/system/tbb/detail/scatter.h +30 -0
  1786. cuda/cccl/headers/include/thrust/system/tbb/detail/sequence.h +30 -0
  1787. cuda/cccl/headers/include/thrust/system/tbb/detail/set_operations.h +30 -0
  1788. cuda/cccl/headers/include/thrust/system/tbb/detail/sort.h +60 -0
  1789. cuda/cccl/headers/include/thrust/system/tbb/detail/sort.inl +295 -0
  1790. cuda/cccl/headers/include/thrust/system/tbb/detail/swap_ranges.h +30 -0
  1791. cuda/cccl/headers/include/thrust/system/tbb/detail/tabulate.h +30 -0
  1792. cuda/cccl/headers/include/thrust/system/tbb/detail/temporary_buffer.h +29 -0
  1793. cuda/cccl/headers/include/thrust/system/tbb/detail/transform.h +30 -0
  1794. cuda/cccl/headers/include/thrust/system/tbb/detail/transform_reduce.h +30 -0
  1795. cuda/cccl/headers/include/thrust/system/tbb/detail/transform_scan.h +30 -0
  1796. cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_copy.h +30 -0
  1797. cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_fill.h +30 -0
  1798. cuda/cccl/headers/include/thrust/system/tbb/detail/unique.h +60 -0
  1799. cuda/cccl/headers/include/thrust/system/tbb/detail/unique.inl +71 -0
  1800. cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.h +67 -0
  1801. cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.inl +75 -0
  1802. cuda/cccl/headers/include/thrust/system/tbb/execution_policy.h +160 -0
  1803. cuda/cccl/headers/include/thrust/system/tbb/memory.h +111 -0
  1804. cuda/cccl/headers/include/thrust/system/tbb/memory_resource.h +75 -0
  1805. cuda/cccl/headers/include/thrust/system/tbb/pointer.h +124 -0
  1806. cuda/cccl/headers/include/thrust/system/tbb/vector.h +99 -0
  1807. cuda/cccl/headers/include/thrust/system_error.h +57 -0
  1808. cuda/cccl/headers/include/thrust/tabulate.h +125 -0
  1809. cuda/cccl/headers/include/thrust/transform.h +1045 -0
  1810. cuda/cccl/headers/include/thrust/transform_reduce.h +190 -0
  1811. cuda/cccl/headers/include/thrust/transform_scan.h +442 -0
  1812. cuda/cccl/headers/include/thrust/tuple.h +142 -0
  1813. cuda/cccl/headers/include/thrust/type_traits/integer_sequence.h +261 -0
  1814. cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +154 -0
  1815. cuda/cccl/headers/include/thrust/type_traits/is_execution_policy.h +65 -0
  1816. cuda/cccl/headers/include/thrust/type_traits/is_operator_less_or_greater_function_object.h +184 -0
  1817. cuda/cccl/headers/include/thrust/type_traits/is_operator_plus_function_object.h +116 -0
  1818. cuda/cccl/headers/include/thrust/type_traits/is_trivially_relocatable.h +336 -0
  1819. cuda/cccl/headers/include/thrust/type_traits/logical_metafunctions.h +42 -0
  1820. cuda/cccl/headers/include/thrust/type_traits/unwrap_contiguous_iterator.h +96 -0
  1821. cuda/cccl/headers/include/thrust/uninitialized_copy.h +300 -0
  1822. cuda/cccl/headers/include/thrust/uninitialized_fill.h +268 -0
  1823. cuda/cccl/headers/include/thrust/unique.h +1090 -0
  1824. cuda/cccl/headers/include/thrust/universal_allocator.h +93 -0
  1825. cuda/cccl/headers/include/thrust/universal_ptr.h +34 -0
  1826. cuda/cccl/headers/include/thrust/universal_vector.h +71 -0
  1827. cuda/cccl/headers/include/thrust/version.h +93 -0
  1828. cuda/cccl/headers/include/thrust/zip_function.h +176 -0
  1829. cuda/cccl/headers/include_paths.py +72 -0
  1830. cuda/cccl/parallel/__init__.py +9 -0
  1831. cuda/cccl/parallel/experimental/__init__.py +47 -0
  1832. cuda/cccl/parallel/experimental/_bindings.py +24 -0
  1833. cuda/cccl/parallel/experimental/_bindings.pyi +388 -0
  1834. cuda/cccl/parallel/experimental/_bindings_impl.cpython-313-x86_64-linux-gnu.so +0 -0
  1835. cuda/cccl/parallel/experimental/_bindings_impl.pyx +2158 -0
  1836. cuda/cccl/parallel/experimental/_caching.py +71 -0
  1837. cuda/cccl/parallel/experimental/_cccl_interop.py +382 -0
  1838. cuda/cccl/parallel/experimental/_utils/__init__.py +0 -0
  1839. cuda/cccl/parallel/experimental/_utils/protocols.py +132 -0
  1840. cuda/cccl/parallel/experimental/algorithms/__init__.py +28 -0
  1841. cuda/cccl/parallel/experimental/algorithms/_merge_sort.py +172 -0
  1842. cuda/cccl/parallel/experimental/algorithms/_radix_sort.py +244 -0
  1843. cuda/cccl/parallel/experimental/algorithms/_reduce.py +136 -0
  1844. cuda/cccl/parallel/experimental/algorithms/_scan.py +179 -0
  1845. cuda/cccl/parallel/experimental/algorithms/_segmented_reduce.py +183 -0
  1846. cuda/cccl/parallel/experimental/algorithms/_transform.py +213 -0
  1847. cuda/cccl/parallel/experimental/algorithms/_unique_by_key.py +179 -0
  1848. cuda/cccl/parallel/experimental/cccl/.gitkeep +0 -0
  1849. cuda/cccl/parallel/experimental/cccl/libcccl.c.parallel.so +0 -0
  1850. cuda/cccl/parallel/experimental/iterators/__init__.py +17 -0
  1851. cuda/cccl/parallel/experimental/iterators/_factories.py +157 -0
  1852. cuda/cccl/parallel/experimental/iterators/_iterators.py +650 -0
  1853. cuda/cccl/parallel/experimental/numba_utils.py +6 -0
  1854. cuda/cccl/parallel/experimental/struct.py +150 -0
  1855. cuda/cccl/parallel/experimental/typing.py +27 -0
  1856. cuda/cccl/py.typed +0 -0
  1857. cuda_cccl-0.1.3.1.0.dev1678.dist-info/METADATA +28 -0
  1858. cuda_cccl-0.1.3.1.0.dev1678.dist-info/RECORD +1860 -0
  1859. cuda_cccl-0.1.3.1.0.dev1678.dist-info/WHEEL +6 -0
  1860. cuda_cccl-0.1.3.1.0.dev1678.dist-info/licenses/LICENSE +1 -0
@@ -0,0 +1,1830 @@
1
+ // This file was automatically generated. Do not edit.
2
+
3
+ #ifndef _CUDA_PTX_GENERATED_ST_H_
4
+ #define _CUDA_PTX_GENERATED_ST_H_
5
+
6
+ /*
7
+ // st.space.b8 [addr], src; // PTX ISA 10, SM_50
8
+ // .space = { .global }
9
+ template <typename B8, enable_if_t<sizeof(B8) == 1, bool> = true>
10
+ __device__ static inline void st(
11
+ cuda::ptx::space_global_t,
12
+ B8* addr,
13
+ B8 src);
14
+ */
15
+ #if __cccl_ptx_isa >= 100
16
+ extern "C" _CCCL_DEVICE void __cuda_ptx_st_is_not_supported_before_SM_50__();
17
+ template <typename _B8, _CUDA_VSTD::enable_if_t<sizeof(_B8) == 1, bool> = true>
18
+ _CCCL_DEVICE static inline void st(space_global_t, _B8* __addr, _B8 __src)
19
+ {
20
+ // __space == space_global (due to parameter type constraint)
21
+ static_assert(sizeof(_B8) == 1, "");
22
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 500
23
+ asm("st.global.b8 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__b8_as_u32(__src)) : "memory");
24
+ # else
25
+ // Unsupported architectures will have a linker error with a semi-decent error message
26
+ __cuda_ptx_st_is_not_supported_before_SM_50__();
27
+ # endif
28
+ }
29
+ #endif // __cccl_ptx_isa >= 100
30
+
31
+ /*
32
+ // st.space.b16 [addr], src; // PTX ISA 10, SM_50
33
+ // .space = { .global }
34
+ template <typename B16, enable_if_t<sizeof(B16) == 2, bool> = true>
35
+ __device__ static inline void st(
36
+ cuda::ptx::space_global_t,
37
+ B16* addr,
38
+ B16 src);
39
+ */
40
+ #if __cccl_ptx_isa >= 100
41
+ extern "C" _CCCL_DEVICE void __cuda_ptx_st_is_not_supported_before_SM_50__();
42
+ template <typename _B16, _CUDA_VSTD::enable_if_t<sizeof(_B16) == 2, bool> = true>
43
+ _CCCL_DEVICE static inline void st(space_global_t, _B16* __addr, _B16 __src)
44
+ {
45
+ // __space == space_global (due to parameter type constraint)
46
+ static_assert(sizeof(_B16) == 2, "");
47
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 500
48
+ asm("st.global.b16 [%0], %1;"
49
+ :
50
+ : "l"(__as_ptr_gmem(__addr)), "h"(/*as_b16*/ *reinterpret_cast<const _CUDA_VSTD::int16_t*>(&__src))
51
+ : "memory");
52
+ # else
53
+ // Unsupported architectures will have a linker error with a semi-decent error message
54
+ __cuda_ptx_st_is_not_supported_before_SM_50__();
55
+ # endif
56
+ }
57
+ #endif // __cccl_ptx_isa >= 100
58
+
59
+ /*
60
+ // st.space.b32 [addr], src; // PTX ISA 10, SM_50
61
+ // .space = { .global }
62
+ template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
63
+ __device__ static inline void st(
64
+ cuda::ptx::space_global_t,
65
+ B32* addr,
66
+ B32 src);
67
+ */
68
+ #if __cccl_ptx_isa >= 100
69
+ extern "C" _CCCL_DEVICE void __cuda_ptx_st_is_not_supported_before_SM_50__();
70
+ template <typename _B32, _CUDA_VSTD::enable_if_t<sizeof(_B32) == 4, bool> = true>
71
+ _CCCL_DEVICE static inline void st(space_global_t, _B32* __addr, _B32 __src)
72
+ {
73
+ // __space == space_global (due to parameter type constraint)
74
+ static_assert(sizeof(_B32) == 4, "");
75
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 500
76
+ asm("st.global.b32 [%0], %1;"
77
+ :
78
+ : "l"(__as_ptr_gmem(__addr)), "r"(/*as_b32*/ *reinterpret_cast<const _CUDA_VSTD::int32_t*>(&__src))
79
+ : "memory");
80
+ # else
81
+ // Unsupported architectures will have a linker error with a semi-decent error message
82
+ __cuda_ptx_st_is_not_supported_before_SM_50__();
83
+ # endif
84
+ }
85
+ #endif // __cccl_ptx_isa >= 100
86
+
87
+ /*
88
+ // st.space.b64 [addr], src; // PTX ISA 10, SM_50
89
+ // .space = { .global }
90
+ template <typename B64, enable_if_t<sizeof(B64) == 8, bool> = true>
91
+ __device__ static inline void st(
92
+ cuda::ptx::space_global_t,
93
+ B64* addr,
94
+ B64 src);
95
+ */
96
+ #if __cccl_ptx_isa >= 100
97
+ extern "C" _CCCL_DEVICE void __cuda_ptx_st_is_not_supported_before_SM_50__();
98
+ template <typename _B64, _CUDA_VSTD::enable_if_t<sizeof(_B64) == 8, bool> = true>
99
+ _CCCL_DEVICE static inline void st(space_global_t, _B64* __addr, _B64 __src)
100
+ {
101
+ // __space == space_global (due to parameter type constraint)
102
+ static_assert(sizeof(_B64) == 8, "");
103
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 500
104
+ asm("st.global.b64 [%0], %1;"
105
+ :
106
+ : "l"(__as_ptr_gmem(__addr)), "l"(/*as_b64*/ *reinterpret_cast<const _CUDA_VSTD::int64_t*>(&__src))
107
+ : "memory");
108
+ # else
109
+ // Unsupported architectures will have a linker error with a semi-decent error message
110
+ __cuda_ptx_st_is_not_supported_before_SM_50__();
111
+ # endif
112
+ }
113
+ #endif // __cccl_ptx_isa >= 100
114
+
115
+ /*
116
+ // st.space.b128 [addr], src; // PTX ISA 83, SM_70
117
+ // .space = { .global }
118
+ template <typename B128, enable_if_t<sizeof(B128) == 16, bool> = true>
119
+ __device__ static inline void st(
120
+ cuda::ptx::space_global_t,
121
+ B128* addr,
122
+ B128 src);
123
+ */
124
+ #if __cccl_ptx_isa >= 830
125
+ extern "C" _CCCL_DEVICE void __cuda_ptx_st_is_not_supported_before_SM_70__();
126
+ template <typename _B128, _CUDA_VSTD::enable_if_t<sizeof(_B128) == 16, bool> = true>
127
+ _CCCL_DEVICE static inline void st(space_global_t, _B128* __addr, _B128 __src)
128
+ {
129
+ // __space == space_global (due to parameter type constraint)
130
+ static_assert(sizeof(_B128) == 16, "");
131
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 700
132
+ asm("{\n\t .reg .b128 B128_src; \n\t"
133
+ "mov.b128 B128_src, {%1, %2}; \n"
134
+ "st.global.b128 [%0], B128_src;\n\t"
135
+ "}"
136
+ :
137
+ : "l"(__as_ptr_gmem(__addr)),
138
+ "l"((*reinterpret_cast<longlong2*>(&__src)).x),
139
+ "l"((*reinterpret_cast<longlong2*>(&__src)).y)
140
+ : "memory");
141
+ # else
142
+ // Unsupported architectures will have a linker error with a semi-decent error message
143
+ __cuda_ptx_st_is_not_supported_before_SM_70__();
144
+ # endif
145
+ }
146
+ #endif // __cccl_ptx_isa >= 830
147
+
148
+ /*
149
+ // st.space.L2::cache_hint.b8 [addr], src, cache_policy; // PTX ISA 74, SM_80
150
+ // .space = { .global }
151
+ template <typename B8, enable_if_t<sizeof(B8) == 1, bool> = true>
152
+ __device__ static inline void st_L2_cache_hint(
153
+ cuda::ptx::space_global_t,
154
+ B8* addr,
155
+ B8 src,
156
+ uint64_t cache_policy);
157
+ */
158
+ #if __cccl_ptx_isa >= 740
159
+ extern "C" _CCCL_DEVICE void __cuda_ptx_st_L2_cache_hint_is_not_supported_before_SM_80__();
160
+ template <typename _B8, _CUDA_VSTD::enable_if_t<sizeof(_B8) == 1, bool> = true>
161
+ _CCCL_DEVICE static inline void
162
+ st_L2_cache_hint(space_global_t, _B8* __addr, _B8 __src, _CUDA_VSTD::uint64_t __cache_policy)
163
+ {
164
+ // __space == space_global (due to parameter type constraint)
165
+ static_assert(sizeof(_B8) == 1, "");
166
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 800
167
+ asm("st.global.L2::cache_hint.b8 [%0], %1, %2;"
168
+ :
169
+ : "l"(__as_ptr_gmem(__addr)), "r"(__b8_as_u32(__src)), "l"(__cache_policy)
170
+ : "memory");
171
+ # else
172
+ // Unsupported architectures will have a linker error with a semi-decent error message
173
+ __cuda_ptx_st_L2_cache_hint_is_not_supported_before_SM_80__();
174
+ # endif
175
+ }
176
+ #endif // __cccl_ptx_isa >= 740
177
+
178
+ /*
179
+ // st.space.L2::cache_hint.b16 [addr], src, cache_policy; // PTX ISA 74, SM_80
180
+ // .space = { .global }
181
+ template <typename B16, enable_if_t<sizeof(B16) == 2, bool> = true>
182
+ __device__ static inline void st_L2_cache_hint(
183
+ cuda::ptx::space_global_t,
184
+ B16* addr,
185
+ B16 src,
186
+ uint64_t cache_policy);
187
+ */
188
+ #if __cccl_ptx_isa >= 740
189
+ extern "C" _CCCL_DEVICE void __cuda_ptx_st_L2_cache_hint_is_not_supported_before_SM_80__();
190
+ template <typename _B16, _CUDA_VSTD::enable_if_t<sizeof(_B16) == 2, bool> = true>
191
+ _CCCL_DEVICE static inline void
192
+ st_L2_cache_hint(space_global_t, _B16* __addr, _B16 __src, _CUDA_VSTD::uint64_t __cache_policy)
193
+ {
194
+ // __space == space_global (due to parameter type constraint)
195
+ static_assert(sizeof(_B16) == 2, "");
196
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 800
197
+ asm("st.global.L2::cache_hint.b16 [%0], %1, %2;"
198
+ :
199
+ : "l"(__as_ptr_gmem(__addr)),
200
+ "h"(/*as_b16*/ *reinterpret_cast<const _CUDA_VSTD::int16_t*>(&__src)),
201
+ "l"(__cache_policy)
202
+ : "memory");
203
+ # else
204
+ // Unsupported architectures will have a linker error with a semi-decent error message
205
+ __cuda_ptx_st_L2_cache_hint_is_not_supported_before_SM_80__();
206
+ # endif
207
+ }
208
+ #endif // __cccl_ptx_isa >= 740
209
+
210
+ /*
211
+ // st.space.L2::cache_hint.b32 [addr], src, cache_policy; // PTX ISA 74, SM_80
212
+ // .space = { .global }
213
+ template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
214
+ __device__ static inline void st_L2_cache_hint(
215
+ cuda::ptx::space_global_t,
216
+ B32* addr,
217
+ B32 src,
218
+ uint64_t cache_policy);
219
+ */
220
+ #if __cccl_ptx_isa >= 740
221
+ extern "C" _CCCL_DEVICE void __cuda_ptx_st_L2_cache_hint_is_not_supported_before_SM_80__();
222
+ template <typename _B32, _CUDA_VSTD::enable_if_t<sizeof(_B32) == 4, bool> = true>
223
+ _CCCL_DEVICE static inline void
224
+ st_L2_cache_hint(space_global_t, _B32* __addr, _B32 __src, _CUDA_VSTD::uint64_t __cache_policy)
225
+ {
226
+ // __space == space_global (due to parameter type constraint)
227
+ static_assert(sizeof(_B32) == 4, "");
228
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 800
229
+ asm("st.global.L2::cache_hint.b32 [%0], %1, %2;"
230
+ :
231
+ : "l"(__as_ptr_gmem(__addr)),
232
+ "r"(/*as_b32*/ *reinterpret_cast<const _CUDA_VSTD::int32_t*>(&__src)),
233
+ "l"(__cache_policy)
234
+ : "memory");
235
+ # else
236
+ // Unsupported architectures will have a linker error with a semi-decent error message
237
+ __cuda_ptx_st_L2_cache_hint_is_not_supported_before_SM_80__();
238
+ # endif
239
+ }
240
+ #endif // __cccl_ptx_isa >= 740
241
+
242
+ /*
243
+ // st.space.L2::cache_hint.b64 [addr], src, cache_policy; // PTX ISA 74, SM_80
244
+ // .space = { .global }
245
+ template <typename B64, enable_if_t<sizeof(B64) == 8, bool> = true>
246
+ __device__ static inline void st_L2_cache_hint(
247
+ cuda::ptx::space_global_t,
248
+ B64* addr,
249
+ B64 src,
250
+ uint64_t cache_policy);
251
+ */
252
+ #if __cccl_ptx_isa >= 740
253
+ extern "C" _CCCL_DEVICE void __cuda_ptx_st_L2_cache_hint_is_not_supported_before_SM_80__();
254
+ template <typename _B64, _CUDA_VSTD::enable_if_t<sizeof(_B64) == 8, bool> = true>
255
+ _CCCL_DEVICE static inline void
256
+ st_L2_cache_hint(space_global_t, _B64* __addr, _B64 __src, _CUDA_VSTD::uint64_t __cache_policy)
257
+ {
258
+ // __space == space_global (due to parameter type constraint)
259
+ static_assert(sizeof(_B64) == 8, "");
260
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 800
261
+ asm("st.global.L2::cache_hint.b64 [%0], %1, %2;"
262
+ :
263
+ : "l"(__as_ptr_gmem(__addr)),
264
+ "l"(/*as_b64*/ *reinterpret_cast<const _CUDA_VSTD::int64_t*>(&__src)),
265
+ "l"(__cache_policy)
266
+ : "memory");
267
+ # else
268
+ // Unsupported architectures will have a linker error with a semi-decent error message
269
+ __cuda_ptx_st_L2_cache_hint_is_not_supported_before_SM_80__();
270
+ # endif
271
+ }
272
+ #endif // __cccl_ptx_isa >= 740
273
+
274
+ /*
275
+ // st.space.L2::cache_hint.b128 [addr], src, cache_policy; // PTX ISA 83, SM_80
276
+ // .space = { .global }
277
+ template <typename B128, enable_if_t<sizeof(B128) == 16, bool> = true>
278
+ __device__ static inline void st_L2_cache_hint(
279
+ cuda::ptx::space_global_t,
280
+ B128* addr,
281
+ B128 src,
282
+ uint64_t cache_policy);
283
+ */
284
+ #if __cccl_ptx_isa >= 830
285
+ extern "C" _CCCL_DEVICE void __cuda_ptx_st_L2_cache_hint_is_not_supported_before_SM_80__();
286
+ template <typename _B128, _CUDA_VSTD::enable_if_t<sizeof(_B128) == 16, bool> = true>
287
+ _CCCL_DEVICE static inline void
288
+ st_L2_cache_hint(space_global_t, _B128* __addr, _B128 __src, _CUDA_VSTD::uint64_t __cache_policy)
289
+ {
290
+ // __space == space_global (due to parameter type constraint)
291
+ static_assert(sizeof(_B128) == 16, "");
292
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 800
293
+ asm("{\n\t .reg .b128 B128_src; \n\t"
294
+ "mov.b128 B128_src, {%1, %2}; \n"
295
+ "st.global.L2::cache_hint.b128 [%0], B128_src, %3;\n\t"
296
+ "}"
297
+ :
298
+ : "l"(__as_ptr_gmem(__addr)),
299
+ "l"((*reinterpret_cast<longlong2*>(&__src)).x),
300
+ "l"((*reinterpret_cast<longlong2*>(&__src)).y),
301
+ "l"(__cache_policy)
302
+ : "memory");
303
+ # else
304
+ // Unsupported architectures will have a linker error with a semi-decent error message
305
+ __cuda_ptx_st_L2_cache_hint_is_not_supported_before_SM_80__();
306
+ # endif
307
+ }
308
+ #endif // __cccl_ptx_isa >= 830
309
+
310
+ /*
311
+ // st.space.L1::evict_normal.b8 [addr], src; // PTX ISA 74, SM_70
312
+ // .space = { .global }
313
+ template <typename B8, enable_if_t<sizeof(B8) == 1, bool> = true>
314
+ __device__ static inline void st_L1_evict_normal(
315
+ cuda::ptx::space_global_t,
316
+ B8* addr,
317
+ B8 src);
318
+ */
319
+ #if __cccl_ptx_isa >= 740
320
+ extern "C" _CCCL_DEVICE void __cuda_ptx_st_L1_evict_normal_is_not_supported_before_SM_70__();
321
+ template <typename _B8, _CUDA_VSTD::enable_if_t<sizeof(_B8) == 1, bool> = true>
322
+ _CCCL_DEVICE static inline void st_L1_evict_normal(space_global_t, _B8* __addr, _B8 __src)
323
+ {
324
+ // __space == space_global (due to parameter type constraint)
325
+ static_assert(sizeof(_B8) == 1, "");
326
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 700
327
+ asm("st.global.L1::evict_normal.b8 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__b8_as_u32(__src)) : "memory");
328
+ # else
329
+ // Unsupported architectures will have a linker error with a semi-decent error message
330
+ __cuda_ptx_st_L1_evict_normal_is_not_supported_before_SM_70__();
331
+ # endif
332
+ }
333
+ #endif // __cccl_ptx_isa >= 740
334
+
335
+ /*
336
+ // st.space.L1::evict_normal.b16 [addr], src; // PTX ISA 74, SM_70
337
+ // .space = { .global }
338
+ template <typename B16, enable_if_t<sizeof(B16) == 2, bool> = true>
339
+ __device__ static inline void st_L1_evict_normal(
340
+ cuda::ptx::space_global_t,
341
+ B16* addr,
342
+ B16 src);
343
+ */
344
+ #if __cccl_ptx_isa >= 740
345
+ extern "C" _CCCL_DEVICE void __cuda_ptx_st_L1_evict_normal_is_not_supported_before_SM_70__();
346
+ template <typename _B16, _CUDA_VSTD::enable_if_t<sizeof(_B16) == 2, bool> = true>
347
+ _CCCL_DEVICE static inline void st_L1_evict_normal(space_global_t, _B16* __addr, _B16 __src)
348
+ {
349
+ // __space == space_global (due to parameter type constraint)
350
+ static_assert(sizeof(_B16) == 2, "");
351
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 700
352
+ asm("st.global.L1::evict_normal.b16 [%0], %1;"
353
+ :
354
+ : "l"(__as_ptr_gmem(__addr)), "h"(/*as_b16*/ *reinterpret_cast<const _CUDA_VSTD::int16_t*>(&__src))
355
+ : "memory");
356
+ # else
357
+ // Unsupported architectures will have a linker error with a semi-decent error message
358
+ __cuda_ptx_st_L1_evict_normal_is_not_supported_before_SM_70__();
359
+ # endif
360
+ }
361
+ #endif // __cccl_ptx_isa >= 740
362
+
363
+ /*
364
+ // st.space.L1::evict_normal.b32 [addr], src; // PTX ISA 74, SM_70
365
+ // .space = { .global }
366
+ template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
367
+ __device__ static inline void st_L1_evict_normal(
368
+ cuda::ptx::space_global_t,
369
+ B32* addr,
370
+ B32 src);
371
+ */
372
+ #if __cccl_ptx_isa >= 740
373
+ extern "C" _CCCL_DEVICE void __cuda_ptx_st_L1_evict_normal_is_not_supported_before_SM_70__();
374
+ template <typename _B32, _CUDA_VSTD::enable_if_t<sizeof(_B32) == 4, bool> = true>
375
+ _CCCL_DEVICE static inline void st_L1_evict_normal(space_global_t, _B32* __addr, _B32 __src)
376
+ {
377
+ // __space == space_global (due to parameter type constraint)
378
+ static_assert(sizeof(_B32) == 4, "");
379
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 700
380
+ asm("st.global.L1::evict_normal.b32 [%0], %1;"
381
+ :
382
+ : "l"(__as_ptr_gmem(__addr)), "r"(/*as_b32*/ *reinterpret_cast<const _CUDA_VSTD::int32_t*>(&__src))
383
+ : "memory");
384
+ # else
385
+ // Unsupported architectures will have a linker error with a semi-decent error message
386
+ __cuda_ptx_st_L1_evict_normal_is_not_supported_before_SM_70__();
387
+ # endif
388
+ }
389
+ #endif // __cccl_ptx_isa >= 740
390
+
391
+ /*
392
+ // st.space.L1::evict_normal.b64 [addr], src; // PTX ISA 74, SM_70
393
+ // .space = { .global }
394
+ template <typename B64, enable_if_t<sizeof(B64) == 8, bool> = true>
395
+ __device__ static inline void st_L1_evict_normal(
396
+ cuda::ptx::space_global_t,
397
+ B64* addr,
398
+ B64 src);
399
+ */
400
+ #if __cccl_ptx_isa >= 740
401
+ extern "C" _CCCL_DEVICE void __cuda_ptx_st_L1_evict_normal_is_not_supported_before_SM_70__();
402
+ template <typename _B64, _CUDA_VSTD::enable_if_t<sizeof(_B64) == 8, bool> = true>
403
+ _CCCL_DEVICE static inline void st_L1_evict_normal(space_global_t, _B64* __addr, _B64 __src)
404
+ {
405
+ // __space == space_global (due to parameter type constraint)
406
+ static_assert(sizeof(_B64) == 8, "");
407
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 700
408
+ asm("st.global.L1::evict_normal.b64 [%0], %1;"
409
+ :
410
+ : "l"(__as_ptr_gmem(__addr)), "l"(/*as_b64*/ *reinterpret_cast<const _CUDA_VSTD::int64_t*>(&__src))
411
+ : "memory");
412
+ # else
413
+ // Unsupported architectures will have a linker error with a semi-decent error message
414
+ __cuda_ptx_st_L1_evict_normal_is_not_supported_before_SM_70__();
415
+ # endif
416
+ }
417
+ #endif // __cccl_ptx_isa >= 740
418
+
419
+ /*
+ // Stores the 16-byte value `src` to the global-memory address `addr` with the PTX
+ // instruction shown below. Compiled only when __cccl_ptx_isa >= 830. Under NVHPC or
+ // __CUDA_ARCH__ >= 700 the wrapper emits inline asm with a "memory" clobber:
+ // %0 = __as_ptr_gmem(addr); %1 and %2 = the .x and .y members of src read as
+ // longlong2, packed into a local .b128 register by mov.b128 before the store.
+ // Otherwise it calls an extern "C" stub whose name states the missing SM requirement.
420
+ // st.space.L1::evict_normal.b128 [addr], src; // PTX ISA 83, SM_70
421
+ // .space = { .global }
422
+ template <typename B128, enable_if_t<sizeof(B128) == 16, bool> = true>
423
+ __device__ static inline void st_L1_evict_normal(
424
+ cuda::ptx::space_global_t,
425
+ B128* addr,
426
+ B128 src);
427
+ */
428
+ #if __cccl_ptx_isa >= 830
429
+ extern "C" _CCCL_DEVICE void __cuda_ptx_st_L1_evict_normal_is_not_supported_before_SM_70__();
430
+ template <typename _B128, _CUDA_VSTD::enable_if_t<sizeof(_B128) == 16, bool> = true>
431
+ _CCCL_DEVICE static inline void st_L1_evict_normal(space_global_t, _B128* __addr, _B128 __src)
432
+ {
433
+ // __space == space_global (due to parameter type constraint); the unnamed tag argument is never read
434
+ static_assert(sizeof(_B128) == 16, "");
435
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 700
436
+ asm("{\n\t .reg .b128 B128_src; \n\t"
437
+ "mov.b128 B128_src, {%1, %2}; \n"
438
+ "st.global.L1::evict_normal.b128 [%0], B128_src;\n\t"
439
+ "}"
440
+ :
441
+ : "l"(__as_ptr_gmem(__addr)),
442
+ "l"((*reinterpret_cast<longlong2*>(&__src)).x),
443
+ "l"((*reinterpret_cast<longlong2*>(&__src)).y)
444
+ : "memory");
445
+ # else
446
+ // Unsupported architectures will have a linker error with a semi-decent error message (presumably the unresolved stub name below)
447
+ __cuda_ptx_st_L1_evict_normal_is_not_supported_before_SM_70__();
448
+ # endif
449
+ }
450
+ #endif // __cccl_ptx_isa >= 830
451
+
452
+ /*
+ // Stores the 1-byte value `src` to the global-memory address `addr` with the PTX
+ // instruction shown below. Compiled only when __cccl_ptx_isa >= 740. Under NVHPC or
+ // __CUDA_ARCH__ >= 800 the wrapper emits inline asm with a "memory" clobber:
+ // %0 = __as_ptr_gmem(addr), %1 = __b8_as_u32(src) ("r"), %2 = cache_policy ("l").
+ // Otherwise it calls an extern "C" stub whose name states the missing SM requirement.
453
+ // st.space.L1::evict_normal.L2::cache_hint.b8 [addr], src, cache_policy; // PTX ISA 74, SM_80
454
+ // .space = { .global }
455
+ template <typename B8, enable_if_t<sizeof(B8) == 1, bool> = true>
456
+ __device__ static inline void st_L1_evict_normal_L2_cache_hint(
457
+ cuda::ptx::space_global_t,
458
+ B8* addr,
459
+ B8 src,
460
+ uint64_t cache_policy);
461
+ */
462
+ #if __cccl_ptx_isa >= 740
463
+ extern "C" _CCCL_DEVICE void __cuda_ptx_st_L1_evict_normal_L2_cache_hint_is_not_supported_before_SM_80__();
464
+ template <typename _B8, _CUDA_VSTD::enable_if_t<sizeof(_B8) == 1, bool> = true>
465
+ _CCCL_DEVICE static inline void
466
+ st_L1_evict_normal_L2_cache_hint(space_global_t, _B8* __addr, _B8 __src, _CUDA_VSTD::uint64_t __cache_policy)
467
+ {
468
+ // __space == space_global (due to parameter type constraint); the unnamed tag argument is never read
469
+ static_assert(sizeof(_B8) == 1, "");
470
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 800
471
+ asm("st.global.L1::evict_normal.L2::cache_hint.b8 [%0], %1, %2;"
472
+ :
473
+ : "l"(__as_ptr_gmem(__addr)), "r"(__b8_as_u32(__src)), "l"(__cache_policy)
474
+ : "memory");
475
+ # else
476
+ // Unsupported architectures will have a linker error with a semi-decent error message (presumably the unresolved stub name below)
477
+ __cuda_ptx_st_L1_evict_normal_L2_cache_hint_is_not_supported_before_SM_80__();
478
+ # endif
479
+ }
480
+ #endif // __cccl_ptx_isa >= 740
481
+
482
+ /*
+ // Stores the 2-byte value `src` to the global-memory address `addr` with the PTX
+ // instruction shown below. Compiled only when __cccl_ptx_isa >= 740. Under NVHPC or
+ // __CUDA_ARCH__ >= 800 the wrapper emits inline asm with a "memory" clobber:
+ // %0 = __as_ptr_gmem(addr), %1 = src's bits read as int16_t ("h"),
+ // %2 = cache_policy ("l"). Otherwise it calls an extern "C" stub whose name states
+ // the missing SM requirement.
483
+ // st.space.L1::evict_normal.L2::cache_hint.b16 [addr], src, cache_policy; // PTX ISA 74, SM_80
484
+ // .space = { .global }
485
+ template <typename B16, enable_if_t<sizeof(B16) == 2, bool> = true>
486
+ __device__ static inline void st_L1_evict_normal_L2_cache_hint(
487
+ cuda::ptx::space_global_t,
488
+ B16* addr,
489
+ B16 src,
490
+ uint64_t cache_policy);
491
+ */
492
+ #if __cccl_ptx_isa >= 740
493
+ extern "C" _CCCL_DEVICE void __cuda_ptx_st_L1_evict_normal_L2_cache_hint_is_not_supported_before_SM_80__();
494
+ template <typename _B16, _CUDA_VSTD::enable_if_t<sizeof(_B16) == 2, bool> = true>
495
+ _CCCL_DEVICE static inline void
496
+ st_L1_evict_normal_L2_cache_hint(space_global_t, _B16* __addr, _B16 __src, _CUDA_VSTD::uint64_t __cache_policy)
497
+ {
498
+ // __space == space_global (due to parameter type constraint); the unnamed tag argument is never read
499
+ static_assert(sizeof(_B16) == 2, "");
500
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 800
501
+ asm("st.global.L1::evict_normal.L2::cache_hint.b16 [%0], %1, %2;"
502
+ :
503
+ : "l"(__as_ptr_gmem(__addr)),
504
+ "h"(/*as_b16*/ *reinterpret_cast<const _CUDA_VSTD::int16_t*>(&__src)),
505
+ "l"(__cache_policy)
506
+ : "memory");
507
+ # else
508
+ // Unsupported architectures will have a linker error with a semi-decent error message (presumably the unresolved stub name below)
509
+ __cuda_ptx_st_L1_evict_normal_L2_cache_hint_is_not_supported_before_SM_80__();
510
+ # endif
511
+ }
512
+ #endif // __cccl_ptx_isa >= 740
513
+
514
+ /*
+ // Stores the 4-byte value `src` to the global-memory address `addr` with the PTX
+ // instruction shown below. Compiled only when __cccl_ptx_isa >= 740. Under NVHPC or
+ // __CUDA_ARCH__ >= 800 the wrapper emits inline asm with a "memory" clobber:
+ // %0 = __as_ptr_gmem(addr), %1 = src's bits read as int32_t ("r"),
+ // %2 = cache_policy ("l"). Otherwise it calls an extern "C" stub whose name states
+ // the missing SM requirement.
515
+ // st.space.L1::evict_normal.L2::cache_hint.b32 [addr], src, cache_policy; // PTX ISA 74, SM_80
516
+ // .space = { .global }
517
+ template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
518
+ __device__ static inline void st_L1_evict_normal_L2_cache_hint(
519
+ cuda::ptx::space_global_t,
520
+ B32* addr,
521
+ B32 src,
522
+ uint64_t cache_policy);
523
+ */
524
+ #if __cccl_ptx_isa >= 740
525
+ extern "C" _CCCL_DEVICE void __cuda_ptx_st_L1_evict_normal_L2_cache_hint_is_not_supported_before_SM_80__();
526
+ template <typename _B32, _CUDA_VSTD::enable_if_t<sizeof(_B32) == 4, bool> = true>
527
+ _CCCL_DEVICE static inline void
528
+ st_L1_evict_normal_L2_cache_hint(space_global_t, _B32* __addr, _B32 __src, _CUDA_VSTD::uint64_t __cache_policy)
529
+ {
530
+ // __space == space_global (due to parameter type constraint); the unnamed tag argument is never read
531
+ static_assert(sizeof(_B32) == 4, "");
532
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 800
533
+ asm("st.global.L1::evict_normal.L2::cache_hint.b32 [%0], %1, %2;"
534
+ :
535
+ : "l"(__as_ptr_gmem(__addr)),
536
+ "r"(/*as_b32*/ *reinterpret_cast<const _CUDA_VSTD::int32_t*>(&__src)),
537
+ "l"(__cache_policy)
538
+ : "memory");
539
+ # else
540
+ // Unsupported architectures will have a linker error with a semi-decent error message (presumably the unresolved stub name below)
541
+ __cuda_ptx_st_L1_evict_normal_L2_cache_hint_is_not_supported_before_SM_80__();
542
+ # endif
543
+ }
544
+ #endif // __cccl_ptx_isa >= 740
545
+
546
+ /*
+ // Stores the 8-byte value `src` to the global-memory address `addr` with the PTX
+ // instruction shown below. Compiled only when __cccl_ptx_isa >= 740. Under NVHPC or
+ // __CUDA_ARCH__ >= 800 the wrapper emits inline asm with a "memory" clobber:
+ // %0 = __as_ptr_gmem(addr), %1 = src's bits read as int64_t ("l"),
+ // %2 = cache_policy ("l"). Otherwise it calls an extern "C" stub whose name states
+ // the missing SM requirement.
547
+ // st.space.L1::evict_normal.L2::cache_hint.b64 [addr], src, cache_policy; // PTX ISA 74, SM_80
548
+ // .space = { .global }
549
+ template <typename B64, enable_if_t<sizeof(B64) == 8, bool> = true>
550
+ __device__ static inline void st_L1_evict_normal_L2_cache_hint(
551
+ cuda::ptx::space_global_t,
552
+ B64* addr,
553
+ B64 src,
554
+ uint64_t cache_policy);
555
+ */
556
+ #if __cccl_ptx_isa >= 740
557
+ extern "C" _CCCL_DEVICE void __cuda_ptx_st_L1_evict_normal_L2_cache_hint_is_not_supported_before_SM_80__();
558
+ template <typename _B64, _CUDA_VSTD::enable_if_t<sizeof(_B64) == 8, bool> = true>
559
+ _CCCL_DEVICE static inline void
560
+ st_L1_evict_normal_L2_cache_hint(space_global_t, _B64* __addr, _B64 __src, _CUDA_VSTD::uint64_t __cache_policy)
561
+ {
562
+ // __space == space_global (due to parameter type constraint); the unnamed tag argument is never read
563
+ static_assert(sizeof(_B64) == 8, "");
564
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 800
565
+ asm("st.global.L1::evict_normal.L2::cache_hint.b64 [%0], %1, %2;"
566
+ :
567
+ : "l"(__as_ptr_gmem(__addr)),
568
+ "l"(/*as_b64*/ *reinterpret_cast<const _CUDA_VSTD::int64_t*>(&__src)),
569
+ "l"(__cache_policy)
570
+ : "memory");
571
+ # else
572
+ // Unsupported architectures will have a linker error with a semi-decent error message (presumably the unresolved stub name below)
573
+ __cuda_ptx_st_L1_evict_normal_L2_cache_hint_is_not_supported_before_SM_80__();
574
+ # endif
575
+ }
576
+ #endif // __cccl_ptx_isa >= 740
577
+
578
+ /*
+ // Stores the 16-byte value `src` to the global-memory address `addr` with the PTX
+ // instruction shown below. Compiled only when __cccl_ptx_isa >= 830. Under NVHPC or
+ // __CUDA_ARCH__ >= 800 the wrapper emits inline asm with a "memory" clobber:
+ // %0 = __as_ptr_gmem(addr); %1 and %2 = the .x and .y members of src read as
+ // longlong2, packed into a local .b128 register by mov.b128 before the store;
+ // %3 = cache_policy ("l"). Otherwise it calls an extern "C" stub whose name states
+ // the missing SM requirement.
579
+ // st.space.L1::evict_normal.L2::cache_hint.b128 [addr], src, cache_policy; // PTX ISA 83, SM_80
580
+ // .space = { .global }
581
+ template <typename B128, enable_if_t<sizeof(B128) == 16, bool> = true>
582
+ __device__ static inline void st_L1_evict_normal_L2_cache_hint(
583
+ cuda::ptx::space_global_t,
584
+ B128* addr,
585
+ B128 src,
586
+ uint64_t cache_policy);
587
+ */
588
+ #if __cccl_ptx_isa >= 830
589
+ extern "C" _CCCL_DEVICE void __cuda_ptx_st_L1_evict_normal_L2_cache_hint_is_not_supported_before_SM_80__();
590
+ template <typename _B128, _CUDA_VSTD::enable_if_t<sizeof(_B128) == 16, bool> = true>
591
+ _CCCL_DEVICE static inline void
592
+ st_L1_evict_normal_L2_cache_hint(space_global_t, _B128* __addr, _B128 __src, _CUDA_VSTD::uint64_t __cache_policy)
593
+ {
594
+ // __space == space_global (due to parameter type constraint); the unnamed tag argument is never read
595
+ static_assert(sizeof(_B128) == 16, "");
596
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 800
597
+ asm("{\n\t .reg .b128 B128_src; \n\t"
598
+ "mov.b128 B128_src, {%1, %2}; \n"
599
+ "st.global.L1::evict_normal.L2::cache_hint.b128 [%0], B128_src, %3;\n\t"
600
+ "}"
601
+ :
602
+ : "l"(__as_ptr_gmem(__addr)),
603
+ "l"((*reinterpret_cast<longlong2*>(&__src)).x),
604
+ "l"((*reinterpret_cast<longlong2*>(&__src)).y),
605
+ "l"(__cache_policy)
606
+ : "memory");
607
+ # else
608
+ // Unsupported architectures will have a linker error with a semi-decent error message (presumably the unresolved stub name below)
609
+ __cuda_ptx_st_L1_evict_normal_L2_cache_hint_is_not_supported_before_SM_80__();
610
+ # endif
611
+ }
612
+ #endif // __cccl_ptx_isa >= 830
613
+
614
+ /*
+ // Stores the 1-byte value `src` to the global-memory address `addr` with the PTX
+ // instruction shown below. Compiled only when __cccl_ptx_isa >= 740. Under NVHPC or
+ // __CUDA_ARCH__ >= 700 the wrapper emits inline asm with a "memory" clobber:
+ // %0 = __as_ptr_gmem(addr), %1 = __b8_as_u32(src) ("r"). Otherwise it calls an
+ // extern "C" stub whose name states the missing SM requirement.
615
+ // st.space.L1::evict_unchanged.b8 [addr], src; // PTX ISA 74, SM_70
616
+ // .space = { .global }
617
+ template <typename B8, enable_if_t<sizeof(B8) == 1, bool> = true>
618
+ __device__ static inline void st_L1_evict_unchanged(
619
+ cuda::ptx::space_global_t,
620
+ B8* addr,
621
+ B8 src);
622
+ */
623
+ #if __cccl_ptx_isa >= 740
624
+ extern "C" _CCCL_DEVICE void __cuda_ptx_st_L1_evict_unchanged_is_not_supported_before_SM_70__();
625
+ template <typename _B8, _CUDA_VSTD::enable_if_t<sizeof(_B8) == 1, bool> = true>
626
+ _CCCL_DEVICE static inline void st_L1_evict_unchanged(space_global_t, _B8* __addr, _B8 __src)
627
+ {
628
+ // __space == space_global (due to parameter type constraint); the unnamed tag argument is never read
629
+ static_assert(sizeof(_B8) == 1, "");
630
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 700
631
+ asm("st.global.L1::evict_unchanged.b8 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__b8_as_u32(__src)) : "memory");
632
+ # else
633
+ // Unsupported architectures will have a linker error with a semi-decent error message (presumably the unresolved stub name below)
634
+ __cuda_ptx_st_L1_evict_unchanged_is_not_supported_before_SM_70__();
635
+ # endif
636
+ }
637
+ #endif // __cccl_ptx_isa >= 740
638
+
639
+ /*
+ // Stores the 2-byte value `src` to the global-memory address `addr` with the PTX
+ // instruction shown below. Compiled only when __cccl_ptx_isa >= 740. Under NVHPC or
+ // __CUDA_ARCH__ >= 700 the wrapper emits inline asm with a "memory" clobber:
+ // %0 = __as_ptr_gmem(addr), %1 = src's bits read as int16_t ("h"). Otherwise it
+ // calls an extern "C" stub whose name states the missing SM requirement.
640
+ // st.space.L1::evict_unchanged.b16 [addr], src; // PTX ISA 74, SM_70
641
+ // .space = { .global }
642
+ template <typename B16, enable_if_t<sizeof(B16) == 2, bool> = true>
643
+ __device__ static inline void st_L1_evict_unchanged(
644
+ cuda::ptx::space_global_t,
645
+ B16* addr,
646
+ B16 src);
647
+ */
648
+ #if __cccl_ptx_isa >= 740
649
+ extern "C" _CCCL_DEVICE void __cuda_ptx_st_L1_evict_unchanged_is_not_supported_before_SM_70__();
650
+ template <typename _B16, _CUDA_VSTD::enable_if_t<sizeof(_B16) == 2, bool> = true>
651
+ _CCCL_DEVICE static inline void st_L1_evict_unchanged(space_global_t, _B16* __addr, _B16 __src)
652
+ {
653
+ // __space == space_global (due to parameter type constraint); the unnamed tag argument is never read
654
+ static_assert(sizeof(_B16) == 2, "");
655
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 700
656
+ asm("st.global.L1::evict_unchanged.b16 [%0], %1;"
657
+ :
658
+ : "l"(__as_ptr_gmem(__addr)), "h"(/*as_b16*/ *reinterpret_cast<const _CUDA_VSTD::int16_t*>(&__src))
659
+ : "memory");
660
+ # else
661
+ // Unsupported architectures will have a linker error with a semi-decent error message (presumably the unresolved stub name below)
662
+ __cuda_ptx_st_L1_evict_unchanged_is_not_supported_before_SM_70__();
663
+ # endif
664
+ }
665
+ #endif // __cccl_ptx_isa >= 740
666
+
667
+ /*
+ // Stores the 4-byte value `src` to the global-memory address `addr` with the PTX
+ // instruction shown below. Compiled only when __cccl_ptx_isa >= 740. Under NVHPC or
+ // __CUDA_ARCH__ >= 700 the wrapper emits inline asm with a "memory" clobber:
+ // %0 = __as_ptr_gmem(addr), %1 = src's bits read as int32_t ("r"). Otherwise it
+ // calls an extern "C" stub whose name states the missing SM requirement.
668
+ // st.space.L1::evict_unchanged.b32 [addr], src; // PTX ISA 74, SM_70
669
+ // .space = { .global }
670
+ template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
671
+ __device__ static inline void st_L1_evict_unchanged(
672
+ cuda::ptx::space_global_t,
673
+ B32* addr,
674
+ B32 src);
675
+ */
676
+ #if __cccl_ptx_isa >= 740
677
+ extern "C" _CCCL_DEVICE void __cuda_ptx_st_L1_evict_unchanged_is_not_supported_before_SM_70__();
678
+ template <typename _B32, _CUDA_VSTD::enable_if_t<sizeof(_B32) == 4, bool> = true>
679
+ _CCCL_DEVICE static inline void st_L1_evict_unchanged(space_global_t, _B32* __addr, _B32 __src)
680
+ {
681
+ // __space == space_global (due to parameter type constraint); the unnamed tag argument is never read
682
+ static_assert(sizeof(_B32) == 4, "");
683
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 700
684
+ asm("st.global.L1::evict_unchanged.b32 [%0], %1;"
685
+ :
686
+ : "l"(__as_ptr_gmem(__addr)), "r"(/*as_b32*/ *reinterpret_cast<const _CUDA_VSTD::int32_t*>(&__src))
687
+ : "memory");
688
+ # else
689
+ // Unsupported architectures will have a linker error with a semi-decent error message (presumably the unresolved stub name below)
690
+ __cuda_ptx_st_L1_evict_unchanged_is_not_supported_before_SM_70__();
691
+ # endif
692
+ }
693
+ #endif // __cccl_ptx_isa >= 740
694
+
695
+ /*
+ // Stores the 8-byte value `src` to the global-memory address `addr` with the PTX
+ // instruction shown below. Compiled only when __cccl_ptx_isa >= 740. Under NVHPC or
+ // __CUDA_ARCH__ >= 700 the wrapper emits inline asm with a "memory" clobber:
+ // %0 = __as_ptr_gmem(addr), %1 = src's bits read as int64_t ("l"). Otherwise it
+ // calls an extern "C" stub whose name states the missing SM requirement.
696
+ // st.space.L1::evict_unchanged.b64 [addr], src; // PTX ISA 74, SM_70
697
+ // .space = { .global }
698
+ template <typename B64, enable_if_t<sizeof(B64) == 8, bool> = true>
699
+ __device__ static inline void st_L1_evict_unchanged(
700
+ cuda::ptx::space_global_t,
701
+ B64* addr,
702
+ B64 src);
703
+ */
704
+ #if __cccl_ptx_isa >= 740
705
+ extern "C" _CCCL_DEVICE void __cuda_ptx_st_L1_evict_unchanged_is_not_supported_before_SM_70__();
706
+ template <typename _B64, _CUDA_VSTD::enable_if_t<sizeof(_B64) == 8, bool> = true>
707
+ _CCCL_DEVICE static inline void st_L1_evict_unchanged(space_global_t, _B64* __addr, _B64 __src)
708
+ {
709
+ // __space == space_global (due to parameter type constraint); the unnamed tag argument is never read
710
+ static_assert(sizeof(_B64) == 8, "");
711
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 700
712
+ asm("st.global.L1::evict_unchanged.b64 [%0], %1;"
713
+ :
714
+ : "l"(__as_ptr_gmem(__addr)), "l"(/*as_b64*/ *reinterpret_cast<const _CUDA_VSTD::int64_t*>(&__src))
715
+ : "memory");
716
+ # else
717
+ // Unsupported architectures will have a linker error with a semi-decent error message (presumably the unresolved stub name below)
718
+ __cuda_ptx_st_L1_evict_unchanged_is_not_supported_before_SM_70__();
719
+ # endif
720
+ }
721
+ #endif // __cccl_ptx_isa >= 740
722
+
723
+ /*
+ // Stores the 16-byte value `src` to the global-memory address `addr` with the PTX
+ // instruction shown below. Compiled only when __cccl_ptx_isa >= 830. Under NVHPC or
+ // __CUDA_ARCH__ >= 700 the wrapper emits inline asm with a "memory" clobber:
+ // %0 = __as_ptr_gmem(addr); %1 and %2 = the .x and .y members of src read as
+ // longlong2, packed into a local .b128 register by mov.b128 before the store.
+ // Otherwise it calls an extern "C" stub whose name states the missing SM requirement.
724
+ // st.space.L1::evict_unchanged.b128 [addr], src; // PTX ISA 83, SM_70
725
+ // .space = { .global }
726
+ template <typename B128, enable_if_t<sizeof(B128) == 16, bool> = true>
727
+ __device__ static inline void st_L1_evict_unchanged(
728
+ cuda::ptx::space_global_t,
729
+ B128* addr,
730
+ B128 src);
731
+ */
732
+ #if __cccl_ptx_isa >= 830
733
+ extern "C" _CCCL_DEVICE void __cuda_ptx_st_L1_evict_unchanged_is_not_supported_before_SM_70__();
734
+ template <typename _B128, _CUDA_VSTD::enable_if_t<sizeof(_B128) == 16, bool> = true>
735
+ _CCCL_DEVICE static inline void st_L1_evict_unchanged(space_global_t, _B128* __addr, _B128 __src)
736
+ {
737
+ // __space == space_global (due to parameter type constraint); the unnamed tag argument is never read
738
+ static_assert(sizeof(_B128) == 16, "");
739
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 700
740
+ asm("{\n\t .reg .b128 B128_src; \n\t"
741
+ "mov.b128 B128_src, {%1, %2}; \n"
742
+ "st.global.L1::evict_unchanged.b128 [%0], B128_src;\n\t"
743
+ "}"
744
+ :
745
+ : "l"(__as_ptr_gmem(__addr)),
746
+ "l"((*reinterpret_cast<longlong2*>(&__src)).x),
747
+ "l"((*reinterpret_cast<longlong2*>(&__src)).y)
748
+ : "memory");
749
+ # else
750
+ // Unsupported architectures will have a linker error with a semi-decent error message (presumably the unresolved stub name below)
751
+ __cuda_ptx_st_L1_evict_unchanged_is_not_supported_before_SM_70__();
752
+ # endif
753
+ }
754
+ #endif // __cccl_ptx_isa >= 830
755
+
756
+ /*
+ // Stores the 1-byte value `src` to the global-memory address `addr` with the PTX
+ // instruction shown below. Compiled only when __cccl_ptx_isa >= 740. Under NVHPC or
+ // __CUDA_ARCH__ >= 800 the wrapper emits inline asm with a "memory" clobber:
+ // %0 = __as_ptr_gmem(addr), %1 = __b8_as_u32(src) ("r"), %2 = cache_policy ("l").
+ // Otherwise it calls an extern "C" stub whose name states the missing SM requirement.
757
+ // st.space.L1::evict_unchanged.L2::cache_hint.b8 [addr], src, cache_policy; // PTX ISA 74, SM_80
758
+ // .space = { .global }
759
+ template <typename B8, enable_if_t<sizeof(B8) == 1, bool> = true>
760
+ __device__ static inline void st_L1_evict_unchanged_L2_cache_hint(
761
+ cuda::ptx::space_global_t,
762
+ B8* addr,
763
+ B8 src,
764
+ uint64_t cache_policy);
765
+ */
766
+ #if __cccl_ptx_isa >= 740
767
+ extern "C" _CCCL_DEVICE void __cuda_ptx_st_L1_evict_unchanged_L2_cache_hint_is_not_supported_before_SM_80__();
768
+ template <typename _B8, _CUDA_VSTD::enable_if_t<sizeof(_B8) == 1, bool> = true>
769
+ _CCCL_DEVICE static inline void
770
+ st_L1_evict_unchanged_L2_cache_hint(space_global_t, _B8* __addr, _B8 __src, _CUDA_VSTD::uint64_t __cache_policy)
771
+ {
772
+ // __space == space_global (due to parameter type constraint); the unnamed tag argument is never read
773
+ static_assert(sizeof(_B8) == 1, "");
774
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 800
775
+ asm("st.global.L1::evict_unchanged.L2::cache_hint.b8 [%0], %1, %2;"
776
+ :
777
+ : "l"(__as_ptr_gmem(__addr)), "r"(__b8_as_u32(__src)), "l"(__cache_policy)
778
+ : "memory");
779
+ # else
780
+ // Unsupported architectures will have a linker error with a semi-decent error message (presumably the unresolved stub name below)
781
+ __cuda_ptx_st_L1_evict_unchanged_L2_cache_hint_is_not_supported_before_SM_80__();
782
+ # endif
783
+ }
784
+ #endif // __cccl_ptx_isa >= 740
785
+
786
+ /*
+ // Stores the 2-byte value `src` to the global-memory address `addr` with the PTX
+ // instruction shown below. Compiled only when __cccl_ptx_isa >= 740. Under NVHPC or
+ // __CUDA_ARCH__ >= 800 the wrapper emits inline asm with a "memory" clobber:
+ // %0 = __as_ptr_gmem(addr), %1 = src's bits read as int16_t ("h"),
+ // %2 = cache_policy ("l"). Otherwise it calls an extern "C" stub whose name states
+ // the missing SM requirement.
787
+ // st.space.L1::evict_unchanged.L2::cache_hint.b16 [addr], src, cache_policy; // PTX ISA 74, SM_80
788
+ // .space = { .global }
789
+ template <typename B16, enable_if_t<sizeof(B16) == 2, bool> = true>
790
+ __device__ static inline void st_L1_evict_unchanged_L2_cache_hint(
791
+ cuda::ptx::space_global_t,
792
+ B16* addr,
793
+ B16 src,
794
+ uint64_t cache_policy);
795
+ */
796
+ #if __cccl_ptx_isa >= 740
797
+ extern "C" _CCCL_DEVICE void __cuda_ptx_st_L1_evict_unchanged_L2_cache_hint_is_not_supported_before_SM_80__();
798
+ template <typename _B16, _CUDA_VSTD::enable_if_t<sizeof(_B16) == 2, bool> = true>
799
+ _CCCL_DEVICE static inline void
800
+ st_L1_evict_unchanged_L2_cache_hint(space_global_t, _B16* __addr, _B16 __src, _CUDA_VSTD::uint64_t __cache_policy)
801
+ {
802
+ // __space == space_global (due to parameter type constraint); the unnamed tag argument is never read
803
+ static_assert(sizeof(_B16) == 2, "");
804
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 800
805
+ asm("st.global.L1::evict_unchanged.L2::cache_hint.b16 [%0], %1, %2;"
806
+ :
807
+ : "l"(__as_ptr_gmem(__addr)),
808
+ "h"(/*as_b16*/ *reinterpret_cast<const _CUDA_VSTD::int16_t*>(&__src)),
809
+ "l"(__cache_policy)
810
+ : "memory");
811
+ # else
812
+ // Unsupported architectures will have a linker error with a semi-decent error message (presumably the unresolved stub name below)
813
+ __cuda_ptx_st_L1_evict_unchanged_L2_cache_hint_is_not_supported_before_SM_80__();
814
+ # endif
815
+ }
816
+ #endif // __cccl_ptx_isa >= 740
817
+
818
+ /*
+ // Stores the 4-byte value `src` to the global-memory address `addr` with the PTX
+ // instruction shown below. Compiled only when __cccl_ptx_isa >= 740. Under NVHPC or
+ // __CUDA_ARCH__ >= 800 the wrapper emits inline asm with a "memory" clobber:
+ // %0 = __as_ptr_gmem(addr), %1 = src's bits read as int32_t ("r"),
+ // %2 = cache_policy ("l"). Otherwise it calls an extern "C" stub whose name states
+ // the missing SM requirement.
819
+ // st.space.L1::evict_unchanged.L2::cache_hint.b32 [addr], src, cache_policy; // PTX ISA 74, SM_80
820
+ // .space = { .global }
821
+ template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
822
+ __device__ static inline void st_L1_evict_unchanged_L2_cache_hint(
823
+ cuda::ptx::space_global_t,
824
+ B32* addr,
825
+ B32 src,
826
+ uint64_t cache_policy);
827
+ */
828
+ #if __cccl_ptx_isa >= 740
829
+ extern "C" _CCCL_DEVICE void __cuda_ptx_st_L1_evict_unchanged_L2_cache_hint_is_not_supported_before_SM_80__();
830
+ template <typename _B32, _CUDA_VSTD::enable_if_t<sizeof(_B32) == 4, bool> = true>
831
+ _CCCL_DEVICE static inline void
832
+ st_L1_evict_unchanged_L2_cache_hint(space_global_t, _B32* __addr, _B32 __src, _CUDA_VSTD::uint64_t __cache_policy)
833
+ {
834
+ // __space == space_global (due to parameter type constraint); the unnamed tag argument is never read
835
+ static_assert(sizeof(_B32) == 4, "");
836
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 800
837
+ asm("st.global.L1::evict_unchanged.L2::cache_hint.b32 [%0], %1, %2;"
838
+ :
839
+ : "l"(__as_ptr_gmem(__addr)),
840
+ "r"(/*as_b32*/ *reinterpret_cast<const _CUDA_VSTD::int32_t*>(&__src)),
841
+ "l"(__cache_policy)
842
+ : "memory");
843
+ # else
844
+ // Unsupported architectures will have a linker error with a semi-decent error message (presumably the unresolved stub name below)
845
+ __cuda_ptx_st_L1_evict_unchanged_L2_cache_hint_is_not_supported_before_SM_80__();
846
+ # endif
847
+ }
848
+ #endif // __cccl_ptx_isa >= 740
849
+
850
+ /*
+ // Stores the 8-byte value `src` to the global-memory address `addr` with the PTX
+ // instruction shown below. Compiled only when __cccl_ptx_isa >= 740. Under NVHPC or
+ // __CUDA_ARCH__ >= 800 the wrapper emits inline asm with a "memory" clobber:
+ // %0 = __as_ptr_gmem(addr), %1 = src's bits read as int64_t ("l"),
+ // %2 = cache_policy ("l"). Otherwise it calls an extern "C" stub whose name states
+ // the missing SM requirement.
851
+ // st.space.L1::evict_unchanged.L2::cache_hint.b64 [addr], src, cache_policy; // PTX ISA 74, SM_80
852
+ // .space = { .global }
853
+ template <typename B64, enable_if_t<sizeof(B64) == 8, bool> = true>
854
+ __device__ static inline void st_L1_evict_unchanged_L2_cache_hint(
855
+ cuda::ptx::space_global_t,
856
+ B64* addr,
857
+ B64 src,
858
+ uint64_t cache_policy);
859
+ */
860
+ #if __cccl_ptx_isa >= 740
861
+ extern "C" _CCCL_DEVICE void __cuda_ptx_st_L1_evict_unchanged_L2_cache_hint_is_not_supported_before_SM_80__();
862
+ template <typename _B64, _CUDA_VSTD::enable_if_t<sizeof(_B64) == 8, bool> = true>
863
+ _CCCL_DEVICE static inline void
864
+ st_L1_evict_unchanged_L2_cache_hint(space_global_t, _B64* __addr, _B64 __src, _CUDA_VSTD::uint64_t __cache_policy)
865
+ {
866
+ // __space == space_global (due to parameter type constraint); the unnamed tag argument is never read
867
+ static_assert(sizeof(_B64) == 8, "");
868
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 800
869
+ asm("st.global.L1::evict_unchanged.L2::cache_hint.b64 [%0], %1, %2;"
870
+ :
871
+ : "l"(__as_ptr_gmem(__addr)),
872
+ "l"(/*as_b64*/ *reinterpret_cast<const _CUDA_VSTD::int64_t*>(&__src)),
873
+ "l"(__cache_policy)
874
+ : "memory");
875
+ # else
876
+ // Unsupported architectures will have a linker error with a semi-decent error message (presumably the unresolved stub name below)
877
+ __cuda_ptx_st_L1_evict_unchanged_L2_cache_hint_is_not_supported_before_SM_80__();
878
+ # endif
879
+ }
880
+ #endif // __cccl_ptx_isa >= 740
881
+
882
+ /*
+ // Stores the 16-byte value `src` to the global-memory address `addr` with the PTX
+ // instruction shown below. Compiled only when __cccl_ptx_isa >= 830. Under NVHPC or
+ // __CUDA_ARCH__ >= 800 the wrapper emits inline asm with a "memory" clobber:
+ // %0 = __as_ptr_gmem(addr); %1 and %2 = the .x and .y members of src read as
+ // longlong2, packed into a local .b128 register by mov.b128 before the store;
+ // %3 = cache_policy ("l"). Otherwise it calls an extern "C" stub whose name states
+ // the missing SM requirement.
883
+ // st.space.L1::evict_unchanged.L2::cache_hint.b128 [addr], src, cache_policy; // PTX ISA 83, SM_80
884
+ // .space = { .global }
885
+ template <typename B128, enable_if_t<sizeof(B128) == 16, bool> = true>
886
+ __device__ static inline void st_L1_evict_unchanged_L2_cache_hint(
887
+ cuda::ptx::space_global_t,
888
+ B128* addr,
889
+ B128 src,
890
+ uint64_t cache_policy);
891
+ */
892
+ #if __cccl_ptx_isa >= 830
893
+ extern "C" _CCCL_DEVICE void __cuda_ptx_st_L1_evict_unchanged_L2_cache_hint_is_not_supported_before_SM_80__();
894
+ template <typename _B128, _CUDA_VSTD::enable_if_t<sizeof(_B128) == 16, bool> = true>
895
+ _CCCL_DEVICE static inline void
896
+ st_L1_evict_unchanged_L2_cache_hint(space_global_t, _B128* __addr, _B128 __src, _CUDA_VSTD::uint64_t __cache_policy)
897
+ {
898
+ // __space == space_global (due to parameter type constraint); the unnamed tag argument is never read
899
+ static_assert(sizeof(_B128) == 16, "");
900
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 800
901
+ asm("{\n\t .reg .b128 B128_src; \n\t"
902
+ "mov.b128 B128_src, {%1, %2}; \n"
903
+ "st.global.L1::evict_unchanged.L2::cache_hint.b128 [%0], B128_src, %3;\n\t"
904
+ "}"
905
+ :
906
+ : "l"(__as_ptr_gmem(__addr)),
907
+ "l"((*reinterpret_cast<longlong2*>(&__src)).x),
908
+ "l"((*reinterpret_cast<longlong2*>(&__src)).y),
909
+ "l"(__cache_policy)
910
+ : "memory");
911
+ # else
912
+ // Unsupported architectures will have a linker error with a semi-decent error message (presumably the unresolved stub name below)
913
+ __cuda_ptx_st_L1_evict_unchanged_L2_cache_hint_is_not_supported_before_SM_80__();
914
+ # endif
915
+ }
916
+ #endif // __cccl_ptx_isa >= 830
917
+
918
+ /*
+ // Stores the 1-byte value `src` to the global-memory address `addr` with the PTX
+ // instruction shown below. Compiled only when __cccl_ptx_isa >= 740. Under NVHPC or
+ // __CUDA_ARCH__ >= 700 the wrapper emits inline asm with a "memory" clobber:
+ // %0 = __as_ptr_gmem(addr), %1 = __b8_as_u32(src) ("r"). Otherwise it calls an
+ // extern "C" stub whose name states the missing SM requirement.
919
+ // st.space.L1::evict_first.b8 [addr], src; // PTX ISA 74, SM_70
920
+ // .space = { .global }
921
+ template <typename B8, enable_if_t<sizeof(B8) == 1, bool> = true>
922
+ __device__ static inline void st_L1_evict_first(
923
+ cuda::ptx::space_global_t,
924
+ B8* addr,
925
+ B8 src);
926
+ */
927
+ #if __cccl_ptx_isa >= 740
928
+ extern "C" _CCCL_DEVICE void __cuda_ptx_st_L1_evict_first_is_not_supported_before_SM_70__();
929
+ template <typename _B8, _CUDA_VSTD::enable_if_t<sizeof(_B8) == 1, bool> = true>
930
+ _CCCL_DEVICE static inline void st_L1_evict_first(space_global_t, _B8* __addr, _B8 __src)
931
+ {
932
+ // __space == space_global (due to parameter type constraint); the unnamed tag argument is never read
933
+ static_assert(sizeof(_B8) == 1, "");
934
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 700
935
+ asm("st.global.L1::evict_first.b8 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__b8_as_u32(__src)) : "memory");
936
+ # else
937
+ // Unsupported architectures will have a linker error with a semi-decent error message (presumably the unresolved stub name below)
938
+ __cuda_ptx_st_L1_evict_first_is_not_supported_before_SM_70__();
939
+ # endif
940
+ }
941
+ #endif // __cccl_ptx_isa >= 740
942
+
943
+ /*
+ // Stores the 2-byte value `src` to the global-memory address `addr` with the PTX
+ // instruction shown below. Compiled only when __cccl_ptx_isa >= 740. Under NVHPC or
+ // __CUDA_ARCH__ >= 700 the wrapper emits inline asm with a "memory" clobber:
+ // %0 = __as_ptr_gmem(addr), %1 = src's bits read as int16_t ("h"). Otherwise it
+ // calls an extern "C" stub whose name states the missing SM requirement.
944
+ // st.space.L1::evict_first.b16 [addr], src; // PTX ISA 74, SM_70
945
+ // .space = { .global }
946
+ template <typename B16, enable_if_t<sizeof(B16) == 2, bool> = true>
947
+ __device__ static inline void st_L1_evict_first(
948
+ cuda::ptx::space_global_t,
949
+ B16* addr,
950
+ B16 src);
951
+ */
952
+ #if __cccl_ptx_isa >= 740
953
+ extern "C" _CCCL_DEVICE void __cuda_ptx_st_L1_evict_first_is_not_supported_before_SM_70__();
954
+ template <typename _B16, _CUDA_VSTD::enable_if_t<sizeof(_B16) == 2, bool> = true>
955
+ _CCCL_DEVICE static inline void st_L1_evict_first(space_global_t, _B16* __addr, _B16 __src)
956
+ {
957
+ // __space == space_global (due to parameter type constraint); the unnamed tag argument is never read
958
+ static_assert(sizeof(_B16) == 2, "");
959
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 700
960
+ asm("st.global.L1::evict_first.b16 [%0], %1;"
961
+ :
962
+ : "l"(__as_ptr_gmem(__addr)), "h"(/*as_b16*/ *reinterpret_cast<const _CUDA_VSTD::int16_t*>(&__src))
963
+ : "memory");
964
+ # else
965
+ // Unsupported architectures will have a linker error with a semi-decent error message (presumably the unresolved stub name below)
966
+ __cuda_ptx_st_L1_evict_first_is_not_supported_before_SM_70__();
967
+ # endif
968
+ }
969
+ #endif // __cccl_ptx_isa >= 740
970
+
971
+ /*
+ // Stores the 4-byte value `src` to the global-memory address `addr` with the PTX
+ // instruction shown below. Compiled only when __cccl_ptx_isa >= 740. Under NVHPC or
+ // __CUDA_ARCH__ >= 700 the wrapper emits inline asm with a "memory" clobber:
+ // %0 = __as_ptr_gmem(addr), %1 = src's bits read as int32_t ("r"). Otherwise it
+ // calls an extern "C" stub whose name states the missing SM requirement.
972
+ // st.space.L1::evict_first.b32 [addr], src; // PTX ISA 74, SM_70
973
+ // .space = { .global }
974
+ template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
975
+ __device__ static inline void st_L1_evict_first(
976
+ cuda::ptx::space_global_t,
977
+ B32* addr,
978
+ B32 src);
979
+ */
980
+ #if __cccl_ptx_isa >= 740
981
+ extern "C" _CCCL_DEVICE void __cuda_ptx_st_L1_evict_first_is_not_supported_before_SM_70__();
982
+ template <typename _B32, _CUDA_VSTD::enable_if_t<sizeof(_B32) == 4, bool> = true>
983
+ _CCCL_DEVICE static inline void st_L1_evict_first(space_global_t, _B32* __addr, _B32 __src)
984
+ {
985
+ // __space == space_global (due to parameter type constraint); the unnamed tag argument is never read
986
+ static_assert(sizeof(_B32) == 4, "");
987
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 700
988
+ asm("st.global.L1::evict_first.b32 [%0], %1;"
989
+ :
990
+ : "l"(__as_ptr_gmem(__addr)), "r"(/*as_b32*/ *reinterpret_cast<const _CUDA_VSTD::int32_t*>(&__src))
991
+ : "memory");
992
+ # else
993
+ // Unsupported architectures will have a linker error with a semi-decent error message (presumably the unresolved stub name below)
994
+ __cuda_ptx_st_L1_evict_first_is_not_supported_before_SM_70__();
995
+ # endif
996
+ }
997
+ #endif // __cccl_ptx_isa >= 740
998
+
999
+ /*
+ // Stores the 8-byte value `src` to the global-memory address `addr` with the PTX
+ // instruction shown below. Compiled only when __cccl_ptx_isa >= 740. Under NVHPC or
+ // __CUDA_ARCH__ >= 700 the wrapper emits inline asm with a "memory" clobber:
+ // %0 = __as_ptr_gmem(addr), %1 = src's bits read as int64_t ("l"). Otherwise it
+ // calls an extern "C" stub whose name states the missing SM requirement.
1000
+ // st.space.L1::evict_first.b64 [addr], src; // PTX ISA 74, SM_70
1001
+ // .space = { .global }
1002
+ template <typename B64, enable_if_t<sizeof(B64) == 8, bool> = true>
1003
+ __device__ static inline void st_L1_evict_first(
1004
+ cuda::ptx::space_global_t,
1005
+ B64* addr,
1006
+ B64 src);
1007
+ */
1008
+ #if __cccl_ptx_isa >= 740
1009
+ extern "C" _CCCL_DEVICE void __cuda_ptx_st_L1_evict_first_is_not_supported_before_SM_70__();
1010
+ template <typename _B64, _CUDA_VSTD::enable_if_t<sizeof(_B64) == 8, bool> = true>
1011
+ _CCCL_DEVICE static inline void st_L1_evict_first(space_global_t, _B64* __addr, _B64 __src)
1012
+ {
1013
+ // __space == space_global (due to parameter type constraint); the unnamed tag argument is never read
1014
+ static_assert(sizeof(_B64) == 8, "");
1015
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 700
1016
+ asm("st.global.L1::evict_first.b64 [%0], %1;"
1017
+ :
1018
+ : "l"(__as_ptr_gmem(__addr)), "l"(/*as_b64*/ *reinterpret_cast<const _CUDA_VSTD::int64_t*>(&__src))
1019
+ : "memory");
1020
+ # else
1021
+ // Unsupported architectures will have a linker error with a semi-decent error message (presumably the unresolved stub name below)
1022
+ __cuda_ptx_st_L1_evict_first_is_not_supported_before_SM_70__();
1023
+ # endif
1024
+ }
1025
+ #endif // __cccl_ptx_isa >= 740
1026
+
1027
+ /*
+ // Stores the 16-byte value `src` to the global-memory address `addr` with the PTX
+ // instruction shown below. Compiled only when __cccl_ptx_isa >= 830. Under NVHPC or
+ // __CUDA_ARCH__ >= 700 the wrapper emits inline asm with a "memory" clobber:
+ // %0 = __as_ptr_gmem(addr); %1 and %2 = the .x and .y members of src read as
+ // longlong2, packed into a local .b128 register by mov.b128 before the store.
+ // Otherwise it calls an extern "C" stub whose name states the missing SM requirement.
1028
+ // st.space.L1::evict_first.b128 [addr], src; // PTX ISA 83, SM_70
1029
+ // .space = { .global }
1030
+ template <typename B128, enable_if_t<sizeof(B128) == 16, bool> = true>
1031
+ __device__ static inline void st_L1_evict_first(
1032
+ cuda::ptx::space_global_t,
1033
+ B128* addr,
1034
+ B128 src);
1035
+ */
1036
+ #if __cccl_ptx_isa >= 830
1037
+ extern "C" _CCCL_DEVICE void __cuda_ptx_st_L1_evict_first_is_not_supported_before_SM_70__();
1038
+ template <typename _B128, _CUDA_VSTD::enable_if_t<sizeof(_B128) == 16, bool> = true>
1039
+ _CCCL_DEVICE static inline void st_L1_evict_first(space_global_t, _B128* __addr, _B128 __src)
1040
+ {
1041
+ // __space == space_global (due to parameter type constraint); the unnamed tag argument is never read
1042
+ static_assert(sizeof(_B128) == 16, "");
1043
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 700
1044
+ asm("{\n\t .reg .b128 B128_src; \n\t"
1045
+ "mov.b128 B128_src, {%1, %2}; \n"
1046
+ "st.global.L1::evict_first.b128 [%0], B128_src;\n\t"
1047
+ "}"
1048
+ :
1049
+ : "l"(__as_ptr_gmem(__addr)),
1050
+ "l"((*reinterpret_cast<longlong2*>(&__src)).x),
1051
+ "l"((*reinterpret_cast<longlong2*>(&__src)).y)
1052
+ : "memory");
1053
+ # else
1054
+ // Unsupported architectures will have a linker error with a semi-decent error message (presumably the unresolved stub name below)
1055
+ __cuda_ptx_st_L1_evict_first_is_not_supported_before_SM_70__();
1056
+ # endif
1057
+ }
1058
+ #endif // __cccl_ptx_isa >= 830
1059
+
1060
+ /*
+ // Stores the 1-byte value `src` to the global-memory address `addr` with the PTX
+ // instruction shown below. Compiled only when __cccl_ptx_isa >= 740. Under NVHPC or
+ // __CUDA_ARCH__ >= 800 the wrapper emits inline asm with a "memory" clobber:
+ // %0 = __as_ptr_gmem(addr), %1 = __b8_as_u32(src) ("r"), %2 = cache_policy ("l").
+ // Otherwise it calls an extern "C" stub whose name states the missing SM requirement.
1061
+ // st.space.L1::evict_first.L2::cache_hint.b8 [addr], src, cache_policy; // PTX ISA 74, SM_80
1062
+ // .space = { .global }
1063
+ template <typename B8, enable_if_t<sizeof(B8) == 1, bool> = true>
1064
+ __device__ static inline void st_L1_evict_first_L2_cache_hint(
1065
+ cuda::ptx::space_global_t,
1066
+ B8* addr,
1067
+ B8 src,
1068
+ uint64_t cache_policy);
1069
+ */
1070
+ #if __cccl_ptx_isa >= 740
1071
+ extern "C" _CCCL_DEVICE void __cuda_ptx_st_L1_evict_first_L2_cache_hint_is_not_supported_before_SM_80__();
1072
+ template <typename _B8, _CUDA_VSTD::enable_if_t<sizeof(_B8) == 1, bool> = true>
1073
+ _CCCL_DEVICE static inline void
1074
+ st_L1_evict_first_L2_cache_hint(space_global_t, _B8* __addr, _B8 __src, _CUDA_VSTD::uint64_t __cache_policy)
1075
+ {
1076
+ // __space == space_global (due to parameter type constraint); the unnamed tag argument is never read
1077
+ static_assert(sizeof(_B8) == 1, "");
1078
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 800
1079
+ asm("st.global.L1::evict_first.L2::cache_hint.b8 [%0], %1, %2;"
1080
+ :
1081
+ : "l"(__as_ptr_gmem(__addr)), "r"(__b8_as_u32(__src)), "l"(__cache_policy)
1082
+ : "memory");
1083
+ # else
1084
+ // Unsupported architectures will have a linker error with a semi-decent error message (presumably the unresolved stub name below)
1085
+ __cuda_ptx_st_L1_evict_first_L2_cache_hint_is_not_supported_before_SM_80__();
1086
+ # endif
1087
+ }
1088
+ #endif // __cccl_ptx_isa >= 740
1089
+
1090
+ /*
+ // Stores the 2-byte value `src` to the global-memory address `addr` with the PTX
+ // instruction shown below. Compiled only when __cccl_ptx_isa >= 740. Under NVHPC or
+ // __CUDA_ARCH__ >= 800 the wrapper emits inline asm with a "memory" clobber:
+ // %0 = __as_ptr_gmem(addr), %1 = src's bits read as int16_t ("h"),
+ // %2 = cache_policy ("l"). Otherwise it calls an extern "C" stub whose name states
+ // the missing SM requirement.
1091
+ // st.space.L1::evict_first.L2::cache_hint.b16 [addr], src, cache_policy; // PTX ISA 74, SM_80
1092
+ // .space = { .global }
1093
+ template <typename B16, enable_if_t<sizeof(B16) == 2, bool> = true>
1094
+ __device__ static inline void st_L1_evict_first_L2_cache_hint(
1095
+ cuda::ptx::space_global_t,
1096
+ B16* addr,
1097
+ B16 src,
1098
+ uint64_t cache_policy);
1099
+ */
1100
+ #if __cccl_ptx_isa >= 740
1101
+ extern "C" _CCCL_DEVICE void __cuda_ptx_st_L1_evict_first_L2_cache_hint_is_not_supported_before_SM_80__();
1102
+ template <typename _B16, _CUDA_VSTD::enable_if_t<sizeof(_B16) == 2, bool> = true>
1103
+ _CCCL_DEVICE static inline void
1104
+ st_L1_evict_first_L2_cache_hint(space_global_t, _B16* __addr, _B16 __src, _CUDA_VSTD::uint64_t __cache_policy)
1105
+ {
1106
+ // __space == space_global (due to parameter type constraint); the unnamed tag argument is never read
1107
+ static_assert(sizeof(_B16) == 2, "");
1108
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 800
1109
+ asm("st.global.L1::evict_first.L2::cache_hint.b16 [%0], %1, %2;"
1110
+ :
1111
+ : "l"(__as_ptr_gmem(__addr)),
1112
+ "h"(/*as_b16*/ *reinterpret_cast<const _CUDA_VSTD::int16_t*>(&__src)),
1113
+ "l"(__cache_policy)
1114
+ : "memory");
1115
+ # else
1116
+ // Unsupported architectures will have a linker error with a semi-decent error message (presumably the unresolved stub name below)
1117
+ __cuda_ptx_st_L1_evict_first_L2_cache_hint_is_not_supported_before_SM_80__();
1118
+ # endif
1119
+ }
1120
+ #endif // __cccl_ptx_isa >= 740
1121
+
1122
+ /*
1123
+ // st.space.L1::evict_first.L2::cache_hint.b32 [addr], src, cache_policy; // PTX ISA 74, SM_80
1124
+ // .space = { .global }
1125
+ template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
1126
+ __device__ static inline void st_L1_evict_first_L2_cache_hint(
1127
+ cuda::ptx::space_global_t,
1128
+ B32* addr,
1129
+ B32 src,
1130
+ uint64_t cache_policy);
1131
+ */
1132
#if __cccl_ptx_isa >= 740
// Declaration only. The fallback branch below calls it so that, per the note
// there, unsupported architectures end in a linker error naming the requirement.
extern "C" _CCCL_DEVICE void __cuda_ptx_st_L1_evict_first_L2_cache_hint_is_not_supported_before_SM_80__();

// Issues `st.global.L1::evict_first.L2::cache_hint.b32` for any 4-byte type.
// The space_global_t tag only selects the overload; __cache_policy is passed
// through as the instruction's trailing 64-bit operand.
template <typename _B32, _CUDA_VSTD::enable_if_t<sizeof(_B32) == 4, bool> = true>
_CCCL_DEVICE static inline void
st_L1_evict_first_L2_cache_hint(space_global_t, _B32* __addr, _B32 __src, _CUDA_VSTD::uint64_t __cache_policy)
{
  static_assert(sizeof(_B32) == 4, "");
#  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 800
  // Name the asm inputs up front: address from __as_ptr_gmem, payload as raw 32 bits.
  const auto __gmem_addr               = __as_ptr_gmem(__addr);
  const _CUDA_VSTD::int32_t __src_bits = *reinterpret_cast<const _CUDA_VSTD::int32_t*>(&__src);
  asm("st.global.L1::evict_first.L2::cache_hint.b32 [%0], %1, %2;"
      :
      : "l"(__gmem_addr), "r"(__src_bits), "l"(__cache_policy)
      : "memory");
#  else
  // Not available on this target: reference the declaration-only symbol.
  __cuda_ptx_st_L1_evict_first_L2_cache_hint_is_not_supported_before_SM_80__();
#  endif
}
#endif // __cccl_ptx_isa >= 740
1153
+
1154
+ /*
1155
+ // st.space.L1::evict_first.L2::cache_hint.b64 [addr], src, cache_policy; // PTX ISA 74, SM_80
1156
+ // .space = { .global }
1157
+ template <typename B64, enable_if_t<sizeof(B64) == 8, bool> = true>
1158
+ __device__ static inline void st_L1_evict_first_L2_cache_hint(
1159
+ cuda::ptx::space_global_t,
1160
+ B64* addr,
1161
+ B64 src,
1162
+ uint64_t cache_policy);
1163
+ */
1164
#if __cccl_ptx_isa >= 740
// Declaration only. The fallback branch below calls it so that, per the note
// there, unsupported architectures end in a linker error naming the requirement.
extern "C" _CCCL_DEVICE void __cuda_ptx_st_L1_evict_first_L2_cache_hint_is_not_supported_before_SM_80__();

// Issues `st.global.L1::evict_first.L2::cache_hint.b64` for any 8-byte type.
// The space_global_t tag only selects the overload; __cache_policy is passed
// through as the instruction's trailing 64-bit operand.
template <typename _B64, _CUDA_VSTD::enable_if_t<sizeof(_B64) == 8, bool> = true>
_CCCL_DEVICE static inline void
st_L1_evict_first_L2_cache_hint(space_global_t, _B64* __addr, _B64 __src, _CUDA_VSTD::uint64_t __cache_policy)
{
  static_assert(sizeof(_B64) == 8, "");
#  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 800
  // Name the asm inputs up front: address from __as_ptr_gmem, payload as raw 64 bits.
  const auto __gmem_addr               = __as_ptr_gmem(__addr);
  const _CUDA_VSTD::int64_t __src_bits = *reinterpret_cast<const _CUDA_VSTD::int64_t*>(&__src);
  asm("st.global.L1::evict_first.L2::cache_hint.b64 [%0], %1, %2;"
      :
      : "l"(__gmem_addr), "l"(__src_bits), "l"(__cache_policy)
      : "memory");
#  else
  // Not available on this target: reference the declaration-only symbol.
  __cuda_ptx_st_L1_evict_first_L2_cache_hint_is_not_supported_before_SM_80__();
#  endif
}
#endif // __cccl_ptx_isa >= 740
1185
+
1186
+ /*
1187
+ // st.space.L1::evict_first.L2::cache_hint.b128 [addr], src, cache_policy; // PTX ISA 83, SM_80
1188
+ // .space = { .global }
1189
+ template <typename B128, enable_if_t<sizeof(B128) == 16, bool> = true>
1190
+ __device__ static inline void st_L1_evict_first_L2_cache_hint(
1191
+ cuda::ptx::space_global_t,
1192
+ B128* addr,
1193
+ B128 src,
1194
+ uint64_t cache_policy);
1195
+ */
1196
#if __cccl_ptx_isa >= 830
// Declaration only. The fallback branch below calls it so that, per the note
// there, unsupported architectures end in a linker error naming the requirement.
extern "C" _CCCL_DEVICE void __cuda_ptx_st_L1_evict_first_L2_cache_hint_is_not_supported_before_SM_80__();

// Issues `st.global.L1::evict_first.L2::cache_hint.b128` for any 16-byte type.
// The space_global_t tag only selects the overload; __cache_policy is passed
// through as the instruction's trailing 64-bit operand.
template <typename _B128, _CUDA_VSTD::enable_if_t<sizeof(_B128) == 16, bool> = true>
_CCCL_DEVICE static inline void
st_L1_evict_first_L2_cache_hint(space_global_t, _B128* __addr, _B128 __src, _CUDA_VSTD::uint64_t __cache_policy)
{
  static_assert(sizeof(_B128) == 16, "");
#  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 800
  // The payload is handed to the asm block as two 64-bit halves, which the
  // block packs into a scratch .b128 register before storing.
  longlong2* const __halves = reinterpret_cast<longlong2*>(&__src);
  const auto __gmem_addr    = __as_ptr_gmem(__addr);
  asm("{\n\t .reg .b128 B128_src; \n\t"
      "mov.b128 B128_src, {%1, %2}; \n"
      "st.global.L1::evict_first.L2::cache_hint.b128 [%0], B128_src, %3;\n\t"
      "}"
      :
      : "l"(__gmem_addr), "l"(__halves->x), "l"(__halves->y), "l"(__cache_policy)
      : "memory");
#  else
  // Not available on this target: reference the declaration-only symbol.
  __cuda_ptx_st_L1_evict_first_L2_cache_hint_is_not_supported_before_SM_80__();
#  endif
}
#endif // __cccl_ptx_isa >= 830
1221
+
1222
+ /*
1223
+ // st.space.L1::evict_last.b8 [addr], src; // PTX ISA 74, SM_70
1224
+ // .space = { .global }
1225
+ template <typename B8, enable_if_t<sizeof(B8) == 1, bool> = true>
1226
+ __device__ static inline void st_L1_evict_last(
1227
+ cuda::ptx::space_global_t,
1228
+ B8* addr,
1229
+ B8 src);
1230
+ */
1231
#if __cccl_ptx_isa >= 740
// Declaration only. The fallback branch below calls it so that, per the note
// there, unsupported architectures end in a linker error naming the requirement.
extern "C" _CCCL_DEVICE void __cuda_ptx_st_L1_evict_last_is_not_supported_before_SM_70__();

// Issues `st.global.L1::evict_last.b8` for any 1-byte type.
// The space_global_t tag only selects the overload.
template <typename _B8, _CUDA_VSTD::enable_if_t<sizeof(_B8) == 1, bool> = true>
_CCCL_DEVICE static inline void st_L1_evict_last(space_global_t, _B8* __addr, _B8 __src)
{
  static_assert(sizeof(_B8) == 1, "");
#  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 700
  // The byte goes through __b8_as_u32 and is bound with the 32-bit "r" constraint.
  const auto __gmem_addr = __as_ptr_gmem(__addr);
  const auto __src_bits  = __b8_as_u32(__src);
  asm("st.global.L1::evict_last.b8 [%0], %1;" : : "l"(__gmem_addr), "r"(__src_bits) : "memory");
#  else
  // Not available on this target: reference the declaration-only symbol.
  __cuda_ptx_st_L1_evict_last_is_not_supported_before_SM_70__();
#  endif
}
#endif // __cccl_ptx_isa >= 740
1246
+
1247
+ /*
1248
+ // st.space.L1::evict_last.b16 [addr], src; // PTX ISA 74, SM_70
1249
+ // .space = { .global }
1250
+ template <typename B16, enable_if_t<sizeof(B16) == 2, bool> = true>
1251
+ __device__ static inline void st_L1_evict_last(
1252
+ cuda::ptx::space_global_t,
1253
+ B16* addr,
1254
+ B16 src);
1255
+ */
1256
#if __cccl_ptx_isa >= 740
// Declaration only. The fallback branch below calls it so that, per the note
// there, unsupported architectures end in a linker error naming the requirement.
extern "C" _CCCL_DEVICE void __cuda_ptx_st_L1_evict_last_is_not_supported_before_SM_70__();

// Issues `st.global.L1::evict_last.b16` for any 2-byte type.
// The space_global_t tag only selects the overload.
template <typename _B16, _CUDA_VSTD::enable_if_t<sizeof(_B16) == 2, bool> = true>
_CCCL_DEVICE static inline void st_L1_evict_last(space_global_t, _B16* __addr, _B16 __src)
{
  static_assert(sizeof(_B16) == 2, "");
#  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 700
  // Name the asm inputs up front: address from __as_ptr_gmem, payload as raw 16 bits.
  const auto __gmem_addr               = __as_ptr_gmem(__addr);
  const _CUDA_VSTD::int16_t __src_bits = *reinterpret_cast<const _CUDA_VSTD::int16_t*>(&__src);
  asm("st.global.L1::evict_last.b16 [%0], %1;"
      :
      : "l"(__gmem_addr), "h"(__src_bits)
      : "memory");
#  else
  // Not available on this target: reference the declaration-only symbol.
  __cuda_ptx_st_L1_evict_last_is_not_supported_before_SM_70__();
#  endif
}
#endif // __cccl_ptx_isa >= 740
1274
+
1275
+ /*
1276
+ // st.space.L1::evict_last.b32 [addr], src; // PTX ISA 74, SM_70
1277
+ // .space = { .global }
1278
+ template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
1279
+ __device__ static inline void st_L1_evict_last(
1280
+ cuda::ptx::space_global_t,
1281
+ B32* addr,
1282
+ B32 src);
1283
+ */
1284
#if __cccl_ptx_isa >= 740
// Declaration only. The fallback branch below calls it so that, per the note
// there, unsupported architectures end in a linker error naming the requirement.
extern "C" _CCCL_DEVICE void __cuda_ptx_st_L1_evict_last_is_not_supported_before_SM_70__();

// Issues `st.global.L1::evict_last.b32` for any 4-byte type.
// The space_global_t tag only selects the overload.
template <typename _B32, _CUDA_VSTD::enable_if_t<sizeof(_B32) == 4, bool> = true>
_CCCL_DEVICE static inline void st_L1_evict_last(space_global_t, _B32* __addr, _B32 __src)
{
  static_assert(sizeof(_B32) == 4, "");
#  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 700
  // Name the asm inputs up front: address from __as_ptr_gmem, payload as raw 32 bits.
  const auto __gmem_addr               = __as_ptr_gmem(__addr);
  const _CUDA_VSTD::int32_t __src_bits = *reinterpret_cast<const _CUDA_VSTD::int32_t*>(&__src);
  asm("st.global.L1::evict_last.b32 [%0], %1;"
      :
      : "l"(__gmem_addr), "r"(__src_bits)
      : "memory");
#  else
  // Not available on this target: reference the declaration-only symbol.
  __cuda_ptx_st_L1_evict_last_is_not_supported_before_SM_70__();
#  endif
}
#endif // __cccl_ptx_isa >= 740
1302
+
1303
+ /*
1304
+ // st.space.L1::evict_last.b64 [addr], src; // PTX ISA 74, SM_70
1305
+ // .space = { .global }
1306
+ template <typename B64, enable_if_t<sizeof(B64) == 8, bool> = true>
1307
+ __device__ static inline void st_L1_evict_last(
1308
+ cuda::ptx::space_global_t,
1309
+ B64* addr,
1310
+ B64 src);
1311
+ */
1312
#if __cccl_ptx_isa >= 740
// Declaration only. The fallback branch below calls it so that, per the note
// there, unsupported architectures end in a linker error naming the requirement.
extern "C" _CCCL_DEVICE void __cuda_ptx_st_L1_evict_last_is_not_supported_before_SM_70__();

// Issues `st.global.L1::evict_last.b64` for any 8-byte type.
// The space_global_t tag only selects the overload.
template <typename _B64, _CUDA_VSTD::enable_if_t<sizeof(_B64) == 8, bool> = true>
_CCCL_DEVICE static inline void st_L1_evict_last(space_global_t, _B64* __addr, _B64 __src)
{
  static_assert(sizeof(_B64) == 8, "");
#  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 700
  // Name the asm inputs up front: address from __as_ptr_gmem, payload as raw 64 bits.
  const auto __gmem_addr               = __as_ptr_gmem(__addr);
  const _CUDA_VSTD::int64_t __src_bits = *reinterpret_cast<const _CUDA_VSTD::int64_t*>(&__src);
  asm("st.global.L1::evict_last.b64 [%0], %1;"
      :
      : "l"(__gmem_addr), "l"(__src_bits)
      : "memory");
#  else
  // Not available on this target: reference the declaration-only symbol.
  __cuda_ptx_st_L1_evict_last_is_not_supported_before_SM_70__();
#  endif
}
#endif // __cccl_ptx_isa >= 740
1330
+
1331
+ /*
1332
+ // st.space.L1::evict_last.b128 [addr], src; // PTX ISA 83, SM_70
1333
+ // .space = { .global }
1334
+ template <typename B128, enable_if_t<sizeof(B128) == 16, bool> = true>
1335
+ __device__ static inline void st_L1_evict_last(
1336
+ cuda::ptx::space_global_t,
1337
+ B128* addr,
1338
+ B128 src);
1339
+ */
1340
#if __cccl_ptx_isa >= 830
// Declaration only. The fallback branch below calls it so that, per the note
// there, unsupported architectures end in a linker error naming the requirement.
extern "C" _CCCL_DEVICE void __cuda_ptx_st_L1_evict_last_is_not_supported_before_SM_70__();

// Issues `st.global.L1::evict_last.b128` for any 16-byte type.
// The space_global_t tag only selects the overload.
template <typename _B128, _CUDA_VSTD::enable_if_t<sizeof(_B128) == 16, bool> = true>
_CCCL_DEVICE static inline void st_L1_evict_last(space_global_t, _B128* __addr, _B128 __src)
{
  static_assert(sizeof(_B128) == 16, "");
#  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 700
  // The payload is handed to the asm block as two 64-bit halves, which the
  // block packs into a scratch .b128 register before storing.
  longlong2* const __halves = reinterpret_cast<longlong2*>(&__src);
  const auto __gmem_addr    = __as_ptr_gmem(__addr);
  asm("{\n\t .reg .b128 B128_src; \n\t"
      "mov.b128 B128_src, {%1, %2}; \n"
      "st.global.L1::evict_last.b128 [%0], B128_src;\n\t"
      "}"
      :
      : "l"(__gmem_addr), "l"(__halves->x), "l"(__halves->y)
      : "memory");
#  else
  // Not available on this target: reference the declaration-only symbol.
  __cuda_ptx_st_L1_evict_last_is_not_supported_before_SM_70__();
#  endif
}
#endif // __cccl_ptx_isa >= 830
1363
+
1364
+ /*
1365
+ // st.space.L1::evict_last.L2::cache_hint.b8 [addr], src, cache_policy; // PTX ISA 74, SM_80
1366
+ // .space = { .global }
1367
+ template <typename B8, enable_if_t<sizeof(B8) == 1, bool> = true>
1368
+ __device__ static inline void st_L1_evict_last_L2_cache_hint(
1369
+ cuda::ptx::space_global_t,
1370
+ B8* addr,
1371
+ B8 src,
1372
+ uint64_t cache_policy);
1373
+ */
1374
#if __cccl_ptx_isa >= 740
// Declaration only. The fallback branch below calls it so that, per the note
// there, unsupported architectures end in a linker error naming the requirement.
extern "C" _CCCL_DEVICE void __cuda_ptx_st_L1_evict_last_L2_cache_hint_is_not_supported_before_SM_80__();

// Issues `st.global.L1::evict_last.L2::cache_hint.b8` for any 1-byte type.
// The space_global_t tag only selects the overload; __cache_policy is passed
// through as the instruction's trailing 64-bit operand.
template <typename _B8, _CUDA_VSTD::enable_if_t<sizeof(_B8) == 1, bool> = true>
_CCCL_DEVICE static inline void
st_L1_evict_last_L2_cache_hint(space_global_t, _B8* __addr, _B8 __src, _CUDA_VSTD::uint64_t __cache_policy)
{
  static_assert(sizeof(_B8) == 1, "");
#  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 800
  // The byte goes through __b8_as_u32 and is bound with the 32-bit "r" constraint.
  const auto __gmem_addr = __as_ptr_gmem(__addr);
  const auto __src_bits  = __b8_as_u32(__src);
  asm("st.global.L1::evict_last.L2::cache_hint.b8 [%0], %1, %2;"
      :
      : "l"(__gmem_addr), "r"(__src_bits), "l"(__cache_policy)
      : "memory");
#  else
  // Not available on this target: reference the declaration-only symbol.
  __cuda_ptx_st_L1_evict_last_L2_cache_hint_is_not_supported_before_SM_80__();
#  endif
}
#endif // __cccl_ptx_isa >= 740
1393
+
1394
+ /*
1395
+ // st.space.L1::evict_last.L2::cache_hint.b16 [addr], src, cache_policy; // PTX ISA 74, SM_80
1396
+ // .space = { .global }
1397
+ template <typename B16, enable_if_t<sizeof(B16) == 2, bool> = true>
1398
+ __device__ static inline void st_L1_evict_last_L2_cache_hint(
1399
+ cuda::ptx::space_global_t,
1400
+ B16* addr,
1401
+ B16 src,
1402
+ uint64_t cache_policy);
1403
+ */
1404
#if __cccl_ptx_isa >= 740
// Declaration only. The fallback branch below calls it so that, per the note
// there, unsupported architectures end in a linker error naming the requirement.
extern "C" _CCCL_DEVICE void __cuda_ptx_st_L1_evict_last_L2_cache_hint_is_not_supported_before_SM_80__();

// Issues `st.global.L1::evict_last.L2::cache_hint.b16` for any 2-byte type.
// The space_global_t tag only selects the overload; __cache_policy is passed
// through as the instruction's trailing 64-bit operand.
template <typename _B16, _CUDA_VSTD::enable_if_t<sizeof(_B16) == 2, bool> = true>
_CCCL_DEVICE static inline void
st_L1_evict_last_L2_cache_hint(space_global_t, _B16* __addr, _B16 __src, _CUDA_VSTD::uint64_t __cache_policy)
{
  static_assert(sizeof(_B16) == 2, "");
#  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 800
  // Name the asm inputs up front: address from __as_ptr_gmem, payload as raw 16 bits.
  const auto __gmem_addr               = __as_ptr_gmem(__addr);
  const _CUDA_VSTD::int16_t __src_bits = *reinterpret_cast<const _CUDA_VSTD::int16_t*>(&__src);
  asm("st.global.L1::evict_last.L2::cache_hint.b16 [%0], %1, %2;"
      :
      : "l"(__gmem_addr), "h"(__src_bits), "l"(__cache_policy)
      : "memory");
#  else
  // Not available on this target: reference the declaration-only symbol.
  __cuda_ptx_st_L1_evict_last_L2_cache_hint_is_not_supported_before_SM_80__();
#  endif
}
#endif // __cccl_ptx_isa >= 740
1425
+
1426
+ /*
1427
+ // st.space.L1::evict_last.L2::cache_hint.b32 [addr], src, cache_policy; // PTX ISA 74, SM_80
1428
+ // .space = { .global }
1429
+ template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
1430
+ __device__ static inline void st_L1_evict_last_L2_cache_hint(
1431
+ cuda::ptx::space_global_t,
1432
+ B32* addr,
1433
+ B32 src,
1434
+ uint64_t cache_policy);
1435
+ */
1436
#if __cccl_ptx_isa >= 740
// Declaration only. The fallback branch below calls it so that, per the note
// there, unsupported architectures end in a linker error naming the requirement.
extern "C" _CCCL_DEVICE void __cuda_ptx_st_L1_evict_last_L2_cache_hint_is_not_supported_before_SM_80__();

// Issues `st.global.L1::evict_last.L2::cache_hint.b32` for any 4-byte type.
// The space_global_t tag only selects the overload; __cache_policy is passed
// through as the instruction's trailing 64-bit operand.
template <typename _B32, _CUDA_VSTD::enable_if_t<sizeof(_B32) == 4, bool> = true>
_CCCL_DEVICE static inline void
st_L1_evict_last_L2_cache_hint(space_global_t, _B32* __addr, _B32 __src, _CUDA_VSTD::uint64_t __cache_policy)
{
  static_assert(sizeof(_B32) == 4, "");
#  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 800
  // Name the asm inputs up front: address from __as_ptr_gmem, payload as raw 32 bits.
  const auto __gmem_addr               = __as_ptr_gmem(__addr);
  const _CUDA_VSTD::int32_t __src_bits = *reinterpret_cast<const _CUDA_VSTD::int32_t*>(&__src);
  asm("st.global.L1::evict_last.L2::cache_hint.b32 [%0], %1, %2;"
      :
      : "l"(__gmem_addr), "r"(__src_bits), "l"(__cache_policy)
      : "memory");
#  else
  // Not available on this target: reference the declaration-only symbol.
  __cuda_ptx_st_L1_evict_last_L2_cache_hint_is_not_supported_before_SM_80__();
#  endif
}
#endif // __cccl_ptx_isa >= 740
1457
+
1458
+ /*
1459
+ // st.space.L1::evict_last.L2::cache_hint.b64 [addr], src, cache_policy; // PTX ISA 74, SM_80
1460
+ // .space = { .global }
1461
+ template <typename B64, enable_if_t<sizeof(B64) == 8, bool> = true>
1462
+ __device__ static inline void st_L1_evict_last_L2_cache_hint(
1463
+ cuda::ptx::space_global_t,
1464
+ B64* addr,
1465
+ B64 src,
1466
+ uint64_t cache_policy);
1467
+ */
1468
#if __cccl_ptx_isa >= 740
// Declaration only. The fallback branch below calls it so that, per the note
// there, unsupported architectures end in a linker error naming the requirement.
extern "C" _CCCL_DEVICE void __cuda_ptx_st_L1_evict_last_L2_cache_hint_is_not_supported_before_SM_80__();

// Issues `st.global.L1::evict_last.L2::cache_hint.b64` for any 8-byte type.
// The space_global_t tag only selects the overload; __cache_policy is passed
// through as the instruction's trailing 64-bit operand.
template <typename _B64, _CUDA_VSTD::enable_if_t<sizeof(_B64) == 8, bool> = true>
_CCCL_DEVICE static inline void
st_L1_evict_last_L2_cache_hint(space_global_t, _B64* __addr, _B64 __src, _CUDA_VSTD::uint64_t __cache_policy)
{
  static_assert(sizeof(_B64) == 8, "");
#  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 800
  // Name the asm inputs up front: address from __as_ptr_gmem, payload as raw 64 bits.
  const auto __gmem_addr               = __as_ptr_gmem(__addr);
  const _CUDA_VSTD::int64_t __src_bits = *reinterpret_cast<const _CUDA_VSTD::int64_t*>(&__src);
  asm("st.global.L1::evict_last.L2::cache_hint.b64 [%0], %1, %2;"
      :
      : "l"(__gmem_addr), "l"(__src_bits), "l"(__cache_policy)
      : "memory");
#  else
  // Not available on this target: reference the declaration-only symbol.
  __cuda_ptx_st_L1_evict_last_L2_cache_hint_is_not_supported_before_SM_80__();
#  endif
}
#endif // __cccl_ptx_isa >= 740
1489
+
1490
+ /*
1491
+ // st.space.L1::evict_last.L2::cache_hint.b128 [addr], src, cache_policy; // PTX ISA 83, SM_80
1492
+ // .space = { .global }
1493
+ template <typename B128, enable_if_t<sizeof(B128) == 16, bool> = true>
1494
+ __device__ static inline void st_L1_evict_last_L2_cache_hint(
1495
+ cuda::ptx::space_global_t,
1496
+ B128* addr,
1497
+ B128 src,
1498
+ uint64_t cache_policy);
1499
+ */
1500
#if __cccl_ptx_isa >= 830
// Declaration only. The fallback branch below calls it so that, per the note
// there, unsupported architectures end in a linker error naming the requirement.
extern "C" _CCCL_DEVICE void __cuda_ptx_st_L1_evict_last_L2_cache_hint_is_not_supported_before_SM_80__();

// Issues `st.global.L1::evict_last.L2::cache_hint.b128` for any 16-byte type.
// The space_global_t tag only selects the overload; __cache_policy is passed
// through as the instruction's trailing 64-bit operand.
template <typename _B128, _CUDA_VSTD::enable_if_t<sizeof(_B128) == 16, bool> = true>
_CCCL_DEVICE static inline void
st_L1_evict_last_L2_cache_hint(space_global_t, _B128* __addr, _B128 __src, _CUDA_VSTD::uint64_t __cache_policy)
{
  static_assert(sizeof(_B128) == 16, "");
#  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 800
  // The payload is handed to the asm block as two 64-bit halves, which the
  // block packs into a scratch .b128 register before storing.
  longlong2* const __halves = reinterpret_cast<longlong2*>(&__src);
  const auto __gmem_addr    = __as_ptr_gmem(__addr);
  asm("{\n\t .reg .b128 B128_src; \n\t"
      "mov.b128 B128_src, {%1, %2}; \n"
      "st.global.L1::evict_last.L2::cache_hint.b128 [%0], B128_src, %3;\n\t"
      "}"
      :
      : "l"(__gmem_addr), "l"(__halves->x), "l"(__halves->y), "l"(__cache_policy)
      : "memory");
#  else
  // Not available on this target: reference the declaration-only symbol.
  __cuda_ptx_st_L1_evict_last_L2_cache_hint_is_not_supported_before_SM_80__();
#  endif
}
#endif // __cccl_ptx_isa >= 830
1525
+
1526
+ /*
1527
+ // st.space.L1::no_allocate.b8 [addr], src; // PTX ISA 74, SM_70
1528
+ // .space = { .global }
1529
+ template <typename B8, enable_if_t<sizeof(B8) == 1, bool> = true>
1530
+ __device__ static inline void st_L1_no_allocate(
1531
+ cuda::ptx::space_global_t,
1532
+ B8* addr,
1533
+ B8 src);
1534
+ */
1535
#if __cccl_ptx_isa >= 740
// Declaration only. The fallback branch below calls it so that, per the note
// there, unsupported architectures end in a linker error naming the requirement.
extern "C" _CCCL_DEVICE void __cuda_ptx_st_L1_no_allocate_is_not_supported_before_SM_70__();

// Issues `st.global.L1::no_allocate.b8` for any 1-byte type.
// The space_global_t tag only selects the overload.
template <typename _B8, _CUDA_VSTD::enable_if_t<sizeof(_B8) == 1, bool> = true>
_CCCL_DEVICE static inline void st_L1_no_allocate(space_global_t, _B8* __addr, _B8 __src)
{
  static_assert(sizeof(_B8) == 1, "");
#  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 700
  // The byte goes through __b8_as_u32 and is bound with the 32-bit "r" constraint.
  const auto __gmem_addr = __as_ptr_gmem(__addr);
  const auto __src_bits  = __b8_as_u32(__src);
  asm("st.global.L1::no_allocate.b8 [%0], %1;" : : "l"(__gmem_addr), "r"(__src_bits) : "memory");
#  else
  // Not available on this target: reference the declaration-only symbol.
  __cuda_ptx_st_L1_no_allocate_is_not_supported_before_SM_70__();
#  endif
}
#endif // __cccl_ptx_isa >= 740
1550
+
1551
+ /*
1552
+ // st.space.L1::no_allocate.b16 [addr], src; // PTX ISA 74, SM_70
1553
+ // .space = { .global }
1554
+ template <typename B16, enable_if_t<sizeof(B16) == 2, bool> = true>
1555
+ __device__ static inline void st_L1_no_allocate(
1556
+ cuda::ptx::space_global_t,
1557
+ B16* addr,
1558
+ B16 src);
1559
+ */
1560
#if __cccl_ptx_isa >= 740
// Declaration only. The fallback branch below calls it so that, per the note
// there, unsupported architectures end in a linker error naming the requirement.
extern "C" _CCCL_DEVICE void __cuda_ptx_st_L1_no_allocate_is_not_supported_before_SM_70__();

// Issues `st.global.L1::no_allocate.b16` for any 2-byte type.
// The space_global_t tag only selects the overload.
template <typename _B16, _CUDA_VSTD::enable_if_t<sizeof(_B16) == 2, bool> = true>
_CCCL_DEVICE static inline void st_L1_no_allocate(space_global_t, _B16* __addr, _B16 __src)
{
  static_assert(sizeof(_B16) == 2, "");
#  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 700
  // Name the asm inputs up front: address from __as_ptr_gmem, payload as raw 16 bits.
  const auto __gmem_addr               = __as_ptr_gmem(__addr);
  const _CUDA_VSTD::int16_t __src_bits = *reinterpret_cast<const _CUDA_VSTD::int16_t*>(&__src);
  asm("st.global.L1::no_allocate.b16 [%0], %1;"
      :
      : "l"(__gmem_addr), "h"(__src_bits)
      : "memory");
#  else
  // Not available on this target: reference the declaration-only symbol.
  __cuda_ptx_st_L1_no_allocate_is_not_supported_before_SM_70__();
#  endif
}
#endif // __cccl_ptx_isa >= 740
1578
+
1579
+ /*
1580
+ // st.space.L1::no_allocate.b32 [addr], src; // PTX ISA 74, SM_70
1581
+ // .space = { .global }
1582
+ template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
1583
+ __device__ static inline void st_L1_no_allocate(
1584
+ cuda::ptx::space_global_t,
1585
+ B32* addr,
1586
+ B32 src);
1587
+ */
1588
#if __cccl_ptx_isa >= 740
// Declaration only. The fallback branch below calls it so that, per the note
// there, unsupported architectures end in a linker error naming the requirement.
extern "C" _CCCL_DEVICE void __cuda_ptx_st_L1_no_allocate_is_not_supported_before_SM_70__();

// Issues `st.global.L1::no_allocate.b32` for any 4-byte type.
// The space_global_t tag only selects the overload.
template <typename _B32, _CUDA_VSTD::enable_if_t<sizeof(_B32) == 4, bool> = true>
_CCCL_DEVICE static inline void st_L1_no_allocate(space_global_t, _B32* __addr, _B32 __src)
{
  static_assert(sizeof(_B32) == 4, "");
#  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 700
  // Name the asm inputs up front: address from __as_ptr_gmem, payload as raw 32 bits.
  const auto __gmem_addr               = __as_ptr_gmem(__addr);
  const _CUDA_VSTD::int32_t __src_bits = *reinterpret_cast<const _CUDA_VSTD::int32_t*>(&__src);
  asm("st.global.L1::no_allocate.b32 [%0], %1;"
      :
      : "l"(__gmem_addr), "r"(__src_bits)
      : "memory");
#  else
  // Not available on this target: reference the declaration-only symbol.
  __cuda_ptx_st_L1_no_allocate_is_not_supported_before_SM_70__();
#  endif
}
#endif // __cccl_ptx_isa >= 740
1606
+
1607
+ /*
1608
+ // st.space.L1::no_allocate.b64 [addr], src; // PTX ISA 74, SM_70
1609
+ // .space = { .global }
1610
+ template <typename B64, enable_if_t<sizeof(B64) == 8, bool> = true>
1611
+ __device__ static inline void st_L1_no_allocate(
1612
+ cuda::ptx::space_global_t,
1613
+ B64* addr,
1614
+ B64 src);
1615
+ */
1616
#if __cccl_ptx_isa >= 740
// Declaration only. The fallback branch below calls it so that, per the note
// there, unsupported architectures end in a linker error naming the requirement.
extern "C" _CCCL_DEVICE void __cuda_ptx_st_L1_no_allocate_is_not_supported_before_SM_70__();

// Issues `st.global.L1::no_allocate.b64` for any 8-byte type.
// The space_global_t tag only selects the overload.
template <typename _B64, _CUDA_VSTD::enable_if_t<sizeof(_B64) == 8, bool> = true>
_CCCL_DEVICE static inline void st_L1_no_allocate(space_global_t, _B64* __addr, _B64 __src)
{
  static_assert(sizeof(_B64) == 8, "");
#  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 700
  // Name the asm inputs up front: address from __as_ptr_gmem, payload as raw 64 bits.
  const auto __gmem_addr               = __as_ptr_gmem(__addr);
  const _CUDA_VSTD::int64_t __src_bits = *reinterpret_cast<const _CUDA_VSTD::int64_t*>(&__src);
  asm("st.global.L1::no_allocate.b64 [%0], %1;"
      :
      : "l"(__gmem_addr), "l"(__src_bits)
      : "memory");
#  else
  // Not available on this target: reference the declaration-only symbol.
  __cuda_ptx_st_L1_no_allocate_is_not_supported_before_SM_70__();
#  endif
}
#endif // __cccl_ptx_isa >= 740
1634
+
1635
+ /*
1636
+ // st.space.L1::no_allocate.b128 [addr], src; // PTX ISA 83, SM_70
1637
+ // .space = { .global }
1638
+ template <typename B128, enable_if_t<sizeof(B128) == 16, bool> = true>
1639
+ __device__ static inline void st_L1_no_allocate(
1640
+ cuda::ptx::space_global_t,
1641
+ B128* addr,
1642
+ B128 src);
1643
+ */
1644
#if __cccl_ptx_isa >= 830
// Declaration only. The fallback branch below calls it so that, per the note
// there, unsupported architectures end in a linker error naming the requirement.
extern "C" _CCCL_DEVICE void __cuda_ptx_st_L1_no_allocate_is_not_supported_before_SM_70__();

// Issues `st.global.L1::no_allocate.b128` for any 16-byte type.
// The space_global_t tag only selects the overload.
template <typename _B128, _CUDA_VSTD::enable_if_t<sizeof(_B128) == 16, bool> = true>
_CCCL_DEVICE static inline void st_L1_no_allocate(space_global_t, _B128* __addr, _B128 __src)
{
  static_assert(sizeof(_B128) == 16, "");
#  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 700
  // The payload is handed to the asm block as two 64-bit halves, which the
  // block packs into a scratch .b128 register before storing.
  longlong2* const __halves = reinterpret_cast<longlong2*>(&__src);
  const auto __gmem_addr    = __as_ptr_gmem(__addr);
  asm("{\n\t .reg .b128 B128_src; \n\t"
      "mov.b128 B128_src, {%1, %2}; \n"
      "st.global.L1::no_allocate.b128 [%0], B128_src;\n\t"
      "}"
      :
      : "l"(__gmem_addr), "l"(__halves->x), "l"(__halves->y)
      : "memory");
#  else
  // Not available on this target: reference the declaration-only symbol.
  __cuda_ptx_st_L1_no_allocate_is_not_supported_before_SM_70__();
#  endif
}
#endif // __cccl_ptx_isa >= 830
1667
+
1668
+ /*
1669
+ // st.space.L1::no_allocate.L2::cache_hint.b8 [addr], src, cache_policy; // PTX ISA 74, SM_80
1670
+ // .space = { .global }
1671
+ template <typename B8, enable_if_t<sizeof(B8) == 1, bool> = true>
1672
+ __device__ static inline void st_L1_no_allocate_L2_cache_hint(
1673
+ cuda::ptx::space_global_t,
1674
+ B8* addr,
1675
+ B8 src,
1676
+ uint64_t cache_policy);
1677
+ */
1678
#if __cccl_ptx_isa >= 740
// Declaration only. The fallback branch below calls it so that, per the note
// there, unsupported architectures end in a linker error naming the requirement.
extern "C" _CCCL_DEVICE void __cuda_ptx_st_L1_no_allocate_L2_cache_hint_is_not_supported_before_SM_80__();

// Issues `st.global.L1::no_allocate.L2::cache_hint.b8` for any 1-byte type.
// The space_global_t tag only selects the overload; __cache_policy is passed
// through as the instruction's trailing 64-bit operand.
template <typename _B8, _CUDA_VSTD::enable_if_t<sizeof(_B8) == 1, bool> = true>
_CCCL_DEVICE static inline void
st_L1_no_allocate_L2_cache_hint(space_global_t, _B8* __addr, _B8 __src, _CUDA_VSTD::uint64_t __cache_policy)
{
  static_assert(sizeof(_B8) == 1, "");
#  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 800
  // The byte goes through __b8_as_u32 and is bound with the 32-bit "r" constraint.
  const auto __gmem_addr = __as_ptr_gmem(__addr);
  const auto __src_bits  = __b8_as_u32(__src);
  asm("st.global.L1::no_allocate.L2::cache_hint.b8 [%0], %1, %2;"
      :
      : "l"(__gmem_addr), "r"(__src_bits), "l"(__cache_policy)
      : "memory");
#  else
  // Not available on this target: reference the declaration-only symbol.
  __cuda_ptx_st_L1_no_allocate_L2_cache_hint_is_not_supported_before_SM_80__();
#  endif
}
#endif // __cccl_ptx_isa >= 740
1697
+
1698
+ /*
1699
+ // st.space.L1::no_allocate.L2::cache_hint.b16 [addr], src, cache_policy; // PTX ISA 74, SM_80
1700
+ // .space = { .global }
1701
+ template <typename B16, enable_if_t<sizeof(B16) == 2, bool> = true>
1702
+ __device__ static inline void st_L1_no_allocate_L2_cache_hint(
1703
+ cuda::ptx::space_global_t,
1704
+ B16* addr,
1705
+ B16 src,
1706
+ uint64_t cache_policy);
1707
+ */
1708
#if __cccl_ptx_isa >= 740
// Declaration only. The fallback branch below calls it so that, per the note
// there, unsupported architectures end in a linker error naming the requirement.
extern "C" _CCCL_DEVICE void __cuda_ptx_st_L1_no_allocate_L2_cache_hint_is_not_supported_before_SM_80__();

// Issues `st.global.L1::no_allocate.L2::cache_hint.b16` for any 2-byte type.
// The space_global_t tag only selects the overload; __cache_policy is passed
// through as the instruction's trailing 64-bit operand.
template <typename _B16, _CUDA_VSTD::enable_if_t<sizeof(_B16) == 2, bool> = true>
_CCCL_DEVICE static inline void
st_L1_no_allocate_L2_cache_hint(space_global_t, _B16* __addr, _B16 __src, _CUDA_VSTD::uint64_t __cache_policy)
{
  static_assert(sizeof(_B16) == 2, "");
#  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 800
  // Name the asm inputs up front: address from __as_ptr_gmem, payload as raw 16 bits.
  const auto __gmem_addr               = __as_ptr_gmem(__addr);
  const _CUDA_VSTD::int16_t __src_bits = *reinterpret_cast<const _CUDA_VSTD::int16_t*>(&__src);
  asm("st.global.L1::no_allocate.L2::cache_hint.b16 [%0], %1, %2;"
      :
      : "l"(__gmem_addr), "h"(__src_bits), "l"(__cache_policy)
      : "memory");
#  else
  // Not available on this target: reference the declaration-only symbol.
  __cuda_ptx_st_L1_no_allocate_L2_cache_hint_is_not_supported_before_SM_80__();
#  endif
}
#endif // __cccl_ptx_isa >= 740
1729
+
1730
+ /*
1731
+ // st.space.L1::no_allocate.L2::cache_hint.b32 [addr], src, cache_policy; // PTX ISA 74, SM_80
1732
+ // .space = { .global }
1733
+ template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
1734
+ __device__ static inline void st_L1_no_allocate_L2_cache_hint(
1735
+ cuda::ptx::space_global_t,
1736
+ B32* addr,
1737
+ B32 src,
1738
+ uint64_t cache_policy);
1739
+ */
1740
#if __cccl_ptx_isa >= 740
// Declaration only. The fallback branch below calls it so that, per the note
// there, unsupported architectures end in a linker error naming the requirement.
extern "C" _CCCL_DEVICE void __cuda_ptx_st_L1_no_allocate_L2_cache_hint_is_not_supported_before_SM_80__();

// Issues `st.global.L1::no_allocate.L2::cache_hint.b32` for any 4-byte type.
// The space_global_t tag only selects the overload; __cache_policy is passed
// through as the instruction's trailing 64-bit operand.
template <typename _B32, _CUDA_VSTD::enable_if_t<sizeof(_B32) == 4, bool> = true>
_CCCL_DEVICE static inline void
st_L1_no_allocate_L2_cache_hint(space_global_t, _B32* __addr, _B32 __src, _CUDA_VSTD::uint64_t __cache_policy)
{
  static_assert(sizeof(_B32) == 4, "");
#  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 800
  // Name the asm inputs up front: address from __as_ptr_gmem, payload as raw 32 bits.
  const auto __gmem_addr               = __as_ptr_gmem(__addr);
  const _CUDA_VSTD::int32_t __src_bits = *reinterpret_cast<const _CUDA_VSTD::int32_t*>(&__src);
  asm("st.global.L1::no_allocate.L2::cache_hint.b32 [%0], %1, %2;"
      :
      : "l"(__gmem_addr), "r"(__src_bits), "l"(__cache_policy)
      : "memory");
#  else
  // Not available on this target: reference the declaration-only symbol.
  __cuda_ptx_st_L1_no_allocate_L2_cache_hint_is_not_supported_before_SM_80__();
#  endif
}
#endif // __cccl_ptx_isa >= 740
1761
+
1762
+ /*
1763
+ // st.space.L1::no_allocate.L2::cache_hint.b64 [addr], src, cache_policy; // PTX ISA 74, SM_80
1764
+ // .space = { .global }
1765
+ template <typename B64, enable_if_t<sizeof(B64) == 8, bool> = true>
1766
+ __device__ static inline void st_L1_no_allocate_L2_cache_hint(
1767
+ cuda::ptx::space_global_t,
1768
+ B64* addr,
1769
+ B64 src,
1770
+ uint64_t cache_policy);
1771
+ */
1772
#if __cccl_ptx_isa >= 740
// Declaration only. The fallback branch below calls it so that, per the note
// there, unsupported architectures end in a linker error naming the requirement.
extern "C" _CCCL_DEVICE void __cuda_ptx_st_L1_no_allocate_L2_cache_hint_is_not_supported_before_SM_80__();

// Issues `st.global.L1::no_allocate.L2::cache_hint.b64` for any 8-byte type.
// The space_global_t tag only selects the overload; __cache_policy is passed
// through as the instruction's trailing 64-bit operand.
template <typename _B64, _CUDA_VSTD::enable_if_t<sizeof(_B64) == 8, bool> = true>
_CCCL_DEVICE static inline void
st_L1_no_allocate_L2_cache_hint(space_global_t, _B64* __addr, _B64 __src, _CUDA_VSTD::uint64_t __cache_policy)
{
  static_assert(sizeof(_B64) == 8, "");
#  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 800
  // Name the asm inputs up front: address from __as_ptr_gmem, payload as raw 64 bits.
  const auto __gmem_addr               = __as_ptr_gmem(__addr);
  const _CUDA_VSTD::int64_t __src_bits = *reinterpret_cast<const _CUDA_VSTD::int64_t*>(&__src);
  asm("st.global.L1::no_allocate.L2::cache_hint.b64 [%0], %1, %2;"
      :
      : "l"(__gmem_addr), "l"(__src_bits), "l"(__cache_policy)
      : "memory");
#  else
  // Not available on this target: reference the declaration-only symbol.
  __cuda_ptx_st_L1_no_allocate_L2_cache_hint_is_not_supported_before_SM_80__();
#  endif
}
#endif // __cccl_ptx_isa >= 740
1793
+
1794
+ /*
1795
+ // st.space.L1::no_allocate.L2::cache_hint.b128 [addr], src, cache_policy; // PTX ISA 83, SM_80
1796
+ // .space = { .global }
1797
+ template <typename B128, enable_if_t<sizeof(B128) == 16, bool> = true>
1798
+ __device__ static inline void st_L1_no_allocate_L2_cache_hint(
1799
+ cuda::ptx::space_global_t,
1800
+ B128* addr,
1801
+ B128 src,
1802
+ uint64_t cache_policy);
1803
+ */
1804
#if __cccl_ptx_isa >= 830
// Declaration only. The fallback branch below calls it so that, per the note
// there, unsupported architectures end in a linker error naming the requirement.
extern "C" _CCCL_DEVICE void __cuda_ptx_st_L1_no_allocate_L2_cache_hint_is_not_supported_before_SM_80__();

// Issues `st.global.L1::no_allocate.L2::cache_hint.b128` for any 16-byte type.
// The space_global_t tag only selects the overload; __cache_policy is passed
// through as the instruction's trailing 64-bit operand.
template <typename _B128, _CUDA_VSTD::enable_if_t<sizeof(_B128) == 16, bool> = true>
_CCCL_DEVICE static inline void
st_L1_no_allocate_L2_cache_hint(space_global_t, _B128* __addr, _B128 __src, _CUDA_VSTD::uint64_t __cache_policy)
{
  static_assert(sizeof(_B128) == 16, "");
#  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 800
  // The payload is handed to the asm block as two 64-bit halves, which the
  // block packs into a scratch .b128 register before storing.
  longlong2* const __halves = reinterpret_cast<longlong2*>(&__src);
  const auto __gmem_addr    = __as_ptr_gmem(__addr);
  asm("{\n\t .reg .b128 B128_src; \n\t"
      "mov.b128 B128_src, {%1, %2}; \n"
      "st.global.L1::no_allocate.L2::cache_hint.b128 [%0], B128_src, %3;\n\t"
      "}"
      :
      : "l"(__gmem_addr), "l"(__halves->x), "l"(__halves->y), "l"(__cache_policy)
      : "memory");
#  else
  // Not available on this target: reference the declaration-only symbol.
  __cuda_ptx_st_L1_no_allocate_L2_cache_hint_is_not_supported_before_SM_80__();
#  endif
}
#endif // __cccl_ptx_isa >= 830
1829
+
1830
+ #endif // _CUDA_PTX_GENERATED_ST_H_