cuda-cccl 0.1.3.1.0.dev1678__cp310-cp310-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cuda-cccl might be problematic. Click here for more details.

Files changed (1860):
  1. cuda/cccl/__init__.py +14 -0
  2. cuda/cccl/cooperative/__init__.py +3 -0
  3. cuda/cccl/cooperative/experimental/__init__.py +8 -0
  4. cuda/cccl/cooperative/experimental/_caching.py +48 -0
  5. cuda/cccl/cooperative/experimental/_common.py +273 -0
  6. cuda/cccl/cooperative/experimental/_nvrtc.py +91 -0
  7. cuda/cccl/cooperative/experimental/_scan_op.py +181 -0
  8. cuda/cccl/cooperative/experimental/_types.py +935 -0
  9. cuda/cccl/cooperative/experimental/_typing.py +107 -0
  10. cuda/cccl/cooperative/experimental/block/__init__.py +33 -0
  11. cuda/cccl/cooperative/experimental/block/_block_load_store.py +215 -0
  12. cuda/cccl/cooperative/experimental/block/_block_merge_sort.py +125 -0
  13. cuda/cccl/cooperative/experimental/block/_block_radix_sort.py +214 -0
  14. cuda/cccl/cooperative/experimental/block/_block_reduce.py +294 -0
  15. cuda/cccl/cooperative/experimental/block/_block_scan.py +983 -0
  16. cuda/cccl/cooperative/experimental/warp/__init__.py +9 -0
  17. cuda/cccl/cooperative/experimental/warp/_warp_merge_sort.py +98 -0
  18. cuda/cccl/cooperative/experimental/warp/_warp_reduce.py +153 -0
  19. cuda/cccl/cooperative/experimental/warp/_warp_scan.py +78 -0
  20. cuda/cccl/headers/__init__.py +7 -0
  21. cuda/cccl/headers/include/__init__.py +1 -0
  22. cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +261 -0
  23. cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +1181 -0
  24. cuda/cccl/headers/include/cub/agent/agent_for.cuh +84 -0
  25. cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +925 -0
  26. cuda/cccl/headers/include/cub/agent/agent_merge.cuh +227 -0
  27. cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +753 -0
  28. cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +766 -0
  29. cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +286 -0
  30. cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +704 -0
  31. cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +557 -0
  32. cuda/cccl/headers/include/cub/agent/agent_reduce.cuh +678 -0
  33. cuda/cccl/headers/include/cub/agent/agent_reduce_by_key.cuh +804 -0
  34. cuda/cccl/headers/include/cub/agent/agent_rle.cuh +997 -0
  35. cuda/cccl/headers/include/cub/agent/agent_scan.cuh +561 -0
  36. cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +473 -0
  37. cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +292 -0
  38. cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +1032 -0
  39. cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +342 -0
  40. cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +592 -0
  41. cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +614 -0
  42. cuda/cccl/headers/include/cub/agent/single_pass_scan_operators.cuh +1346 -0
  43. cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +965 -0
  44. cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +1217 -0
  45. cuda/cccl/headers/include/cub/block/block_exchange.cuh +1306 -0
  46. cuda/cccl/headers/include/cub/block/block_histogram.cuh +420 -0
  47. cuda/cccl/headers/include/cub/block/block_load.cuh +1259 -0
  48. cuda/cccl/headers/include/cub/block/block_merge_sort.cuh +787 -0
  49. cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +1218 -0
  50. cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +2193 -0
  51. cuda/cccl/headers/include/cub/block/block_raking_layout.cuh +150 -0
  52. cuda/cccl/headers/include/cub/block/block_reduce.cuh +629 -0
  53. cuda/cccl/headers/include/cub/block/block_run_length_decode.cuh +437 -0
  54. cuda/cccl/headers/include/cub/block/block_scan.cuh +2583 -0
  55. cuda/cccl/headers/include/cub/block/block_shuffle.cuh +346 -0
  56. cuda/cccl/headers/include/cub/block/block_store.cuh +1246 -0
  57. cuda/cccl/headers/include/cub/block/radix_rank_sort_operations.cuh +620 -0
  58. cuda/cccl/headers/include/cub/block/specializations/block_histogram_atomic.cuh +86 -0
  59. cuda/cccl/headers/include/cub/block/specializations/block_histogram_sort.cuh +240 -0
  60. cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking.cuh +252 -0
  61. cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking_commutative_only.cuh +238 -0
  62. cuda/cccl/headers/include/cub/block/specializations/block_reduce_warp_reductions.cuh +259 -0
  63. cuda/cccl/headers/include/cub/block/specializations/block_scan_raking.cuh +790 -0
  64. cuda/cccl/headers/include/cub/block/specializations/block_scan_warp_scans.cuh +538 -0
  65. cuda/cccl/headers/include/cub/config.cuh +60 -0
  66. cuda/cccl/headers/include/cub/cub.cuh +112 -0
  67. cuda/cccl/headers/include/cub/detail/array_utils.cuh +77 -0
  68. cuda/cccl/headers/include/cub/detail/choose_offset.cuh +155 -0
  69. cuda/cccl/headers/include/cub/detail/detect_cuda_runtime.cuh +93 -0
  70. cuda/cccl/headers/include/cub/detail/device_double_buffer.cuh +96 -0
  71. cuda/cccl/headers/include/cub/detail/fast_modulo_division.cuh +246 -0
  72. cuda/cccl/headers/include/cub/detail/launcher/cuda_driver.cuh +120 -0
  73. cuda/cccl/headers/include/cub/detail/launcher/cuda_runtime.cuh +74 -0
  74. cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +118 -0
  75. cuda/cccl/headers/include/cub/detail/ptx-json/README.md +71 -0
  76. cuda/cccl/headers/include/cub/detail/ptx-json/array.h +68 -0
  77. cuda/cccl/headers/include/cub/detail/ptx-json/json.h +61 -0
  78. cuda/cccl/headers/include/cub/detail/ptx-json/object.h +100 -0
  79. cuda/cccl/headers/include/cub/detail/ptx-json/string.h +71 -0
  80. cuda/cccl/headers/include/cub/detail/ptx-json/value.h +93 -0
  81. cuda/cccl/headers/include/cub/detail/ptx-json-parser.h +63 -0
  82. cuda/cccl/headers/include/cub/detail/rfa.cuh +724 -0
  83. cuda/cccl/headers/include/cub/detail/strong_load.cuh +189 -0
  84. cuda/cccl/headers/include/cub/detail/strong_store.cuh +220 -0
  85. cuda/cccl/headers/include/cub/detail/temporary_storage.cuh +384 -0
  86. cuda/cccl/headers/include/cub/detail/type_traits.cuh +206 -0
  87. cuda/cccl/headers/include/cub/detail/uninitialized_copy.cuh +72 -0
  88. cuda/cccl/headers/include/cub/detail/unsafe_bitcast.cuh +56 -0
  89. cuda/cccl/headers/include/cub/device/device_adjacent_difference.cuh +596 -0
  90. cuda/cccl/headers/include/cub/device/device_copy.cuh +187 -0
  91. cuda/cccl/headers/include/cub/device/device_for.cuh +990 -0
  92. cuda/cccl/headers/include/cub/device/device_histogram.cuh +1507 -0
  93. cuda/cccl/headers/include/cub/device/device_memcpy.cuh +195 -0
  94. cuda/cccl/headers/include/cub/device/device_merge.cuh +202 -0
  95. cuda/cccl/headers/include/cub/device/device_merge_sort.cuh +979 -0
  96. cuda/cccl/headers/include/cub/device/device_partition.cuh +664 -0
  97. cuda/cccl/headers/include/cub/device/device_radix_sort.cuh +3435 -0
  98. cuda/cccl/headers/include/cub/device/device_reduce.cuh +1815 -0
  99. cuda/cccl/headers/include/cub/device/device_run_length_encode.cuh +368 -0
  100. cuda/cccl/headers/include/cub/device/device_scan.cuh +1901 -0
  101. cuda/cccl/headers/include/cub/device/device_segmented_radix_sort.cuh +1496 -0
  102. cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +1512 -0
  103. cuda/cccl/headers/include/cub/device/device_segmented_sort.cuh +2811 -0
  104. cuda/cccl/headers/include/cub/device/device_select.cuh +1224 -0
  105. cuda/cccl/headers/include/cub/device/device_transform.cuh +313 -0
  106. cuda/cccl/headers/include/cub/device/dispatch/dispatch_adjacent_difference.cuh +314 -0
  107. cuda/cccl/headers/include/cub/device/dispatch/dispatch_advance_iterators.cuh +109 -0
  108. cuda/cccl/headers/include/cub/device/dispatch/dispatch_batch_memcpy.cuh +718 -0
  109. cuda/cccl/headers/include/cub/device/dispatch/dispatch_common.cuh +45 -0
  110. cuda/cccl/headers/include/cub/device/dispatch/dispatch_for.cuh +197 -0
  111. cuda/cccl/headers/include/cub/device/dispatch/dispatch_histogram.cuh +1051 -0
  112. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +305 -0
  113. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge_sort.cuh +473 -0
  114. cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +1748 -0
  115. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +1316 -0
  116. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_by_key.cuh +625 -0
  117. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +497 -0
  118. cuda/cccl/headers/include/cub/device/dispatch/dispatch_rle.cuh +548 -0
  119. cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan.cuh +497 -0
  120. cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan_by_key.cuh +598 -0
  121. cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +1374 -0
  122. cuda/cccl/headers/include/cub/device/dispatch/dispatch_select_if.cuh +838 -0
  123. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +341 -0
  124. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +439 -0
  125. cuda/cccl/headers/include/cub/device/dispatch/dispatch_three_way_partition.cuh +552 -0
  126. cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +467 -0
  127. cuda/cccl/headers/include/cub/device/dispatch/dispatch_unique_by_key.cuh +543 -0
  128. cuda/cccl/headers/include/cub/device/dispatch/kernels/for_each.cuh +218 -0
  129. cuda/cccl/headers/include/cub/device/dispatch/kernels/histogram.cuh +505 -0
  130. cuda/cccl/headers/include/cub/device/dispatch/kernels/merge_sort.cuh +338 -0
  131. cuda/cccl/headers/include/cub/device/dispatch/kernels/radix_sort.cuh +799 -0
  132. cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +525 -0
  133. cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +194 -0
  134. cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +330 -0
  135. cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +936 -0
  136. cuda/cccl/headers/include/cub/device/dispatch/kernels/unique_by_key.cuh +176 -0
  137. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +70 -0
  138. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +121 -0
  139. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +63 -0
  140. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +278 -0
  141. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +91 -0
  142. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +118 -0
  143. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +1068 -0
  144. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce.cuh +397 -0
  145. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +945 -0
  146. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +675 -0
  147. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +555 -0
  148. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +1013 -0
  149. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +249 -0
  150. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_select_if.cuh +1587 -0
  151. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +407 -0
  152. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +353 -0
  153. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +874 -0
  154. cuda/cccl/headers/include/cub/grid/grid_even_share.cuh +215 -0
  155. cuda/cccl/headers/include/cub/grid/grid_mapping.cuh +106 -0
  156. cuda/cccl/headers/include/cub/grid/grid_queue.cuh +202 -0
  157. cuda/cccl/headers/include/cub/iterator/arg_index_input_iterator.cuh +256 -0
  158. cuda/cccl/headers/include/cub/iterator/cache_modified_input_iterator.cuh +238 -0
  159. cuda/cccl/headers/include/cub/iterator/cache_modified_output_iterator.cuh +252 -0
  160. cuda/cccl/headers/include/cub/iterator/tex_obj_input_iterator.cuh +322 -0
  161. cuda/cccl/headers/include/cub/thread/thread_load.cuh +347 -0
  162. cuda/cccl/headers/include/cub/thread/thread_operators.cuh +629 -0
  163. cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +504 -0
  164. cuda/cccl/headers/include/cub/thread/thread_scan.cuh +340 -0
  165. cuda/cccl/headers/include/cub/thread/thread_search.cuh +198 -0
  166. cuda/cccl/headers/include/cub/thread/thread_simd.cuh +406 -0
  167. cuda/cccl/headers/include/cub/thread/thread_sort.cuh +101 -0
  168. cuda/cccl/headers/include/cub/thread/thread_store.cuh +364 -0
  169. cuda/cccl/headers/include/cub/util_allocator.cuh +921 -0
  170. cuda/cccl/headers/include/cub/util_arch.cuh +167 -0
  171. cuda/cccl/headers/include/cub/util_cpp_dialect.cuh +95 -0
  172. cuda/cccl/headers/include/cub/util_debug.cuh +207 -0
  173. cuda/cccl/headers/include/cub/util_device.cuh +779 -0
  174. cuda/cccl/headers/include/cub/util_macro.cuh +91 -0
  175. cuda/cccl/headers/include/cub/util_math.cuh +115 -0
  176. cuda/cccl/headers/include/cub/util_namespace.cuh +176 -0
  177. cuda/cccl/headers/include/cub/util_policy_wrapper_t.cuh +55 -0
  178. cuda/cccl/headers/include/cub/util_ptx.cuh +513 -0
  179. cuda/cccl/headers/include/cub/util_temporary_storage.cuh +122 -0
  180. cuda/cccl/headers/include/cub/util_type.cuh +1136 -0
  181. cuda/cccl/headers/include/cub/util_vsmem.cuh +251 -0
  182. cuda/cccl/headers/include/cub/version.cuh +89 -0
  183. cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_shfl.cuh +329 -0
  184. cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_smem.cuh +177 -0
  185. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +729 -0
  186. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +405 -0
  187. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +688 -0
  188. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_smem.cuh +437 -0
  189. cuda/cccl/headers/include/cub/warp/warp_exchange.cuh +405 -0
  190. cuda/cccl/headers/include/cub/warp/warp_load.cuh +614 -0
  191. cuda/cccl/headers/include/cub/warp/warp_merge_sort.cuh +169 -0
  192. cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +822 -0
  193. cuda/cccl/headers/include/cub/warp/warp_scan.cuh +1156 -0
  194. cuda/cccl/headers/include/cub/warp/warp_store.cuh +520 -0
  195. cuda/cccl/headers/include/cuda/__annotated_ptr/access_property.h +165 -0
  196. cuda/cccl/headers/include/cuda/__annotated_ptr/access_property_encoding.h +172 -0
  197. cuda/cccl/headers/include/cuda/__annotated_ptr/annotated_ptr.h +210 -0
  198. cuda/cccl/headers/include/cuda/__annotated_ptr/annotated_ptr_base.h +100 -0
  199. cuda/cccl/headers/include/cuda/__annotated_ptr/apply_access_property.h +84 -0
  200. cuda/cccl/headers/include/cuda/__annotated_ptr/associate_access_property.h +127 -0
  201. cuda/cccl/headers/include/cuda/__annotated_ptr/createpolicy.h +209 -0
  202. cuda/cccl/headers/include/cuda/__atomic/atomic.h +145 -0
  203. cuda/cccl/headers/include/cuda/__barrier/aligned_size.h +61 -0
  204. cuda/cccl/headers/include/cuda/__barrier/async_contract_fulfillment.h +39 -0
  205. cuda/cccl/headers/include/cuda/__barrier/barrier.h +65 -0
  206. cuda/cccl/headers/include/cuda/__barrier/barrier_arrive_tx.h +100 -0
  207. cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +454 -0
  208. cuda/cccl/headers/include/cuda/__barrier/barrier_expect_tx.h +72 -0
  209. cuda/cccl/headers/include/cuda/__barrier/barrier_native_handle.h +45 -0
  210. cuda/cccl/headers/include/cuda/__barrier/barrier_thread_scope.h +60 -0
  211. cuda/cccl/headers/include/cuda/__bit/bit_reverse.h +171 -0
  212. cuda/cccl/headers/include/cuda/__bit/bitfield.h +121 -0
  213. cuda/cccl/headers/include/cuda/__bit/bitmask.h +88 -0
  214. cuda/cccl/headers/include/cuda/__cccl_config +36 -0
  215. cuda/cccl/headers/include/cuda/__cmath/ceil_div.h +124 -0
  216. cuda/cccl/headers/include/cuda/__cmath/ilog.h +195 -0
  217. cuda/cccl/headers/include/cuda/__cmath/ipow.h +107 -0
  218. cuda/cccl/headers/include/cuda/__cmath/isqrt.h +80 -0
  219. cuda/cccl/headers/include/cuda/__cmath/neg.h +47 -0
  220. cuda/cccl/headers/include/cuda/__cmath/pow2.h +74 -0
  221. cuda/cccl/headers/include/cuda/__cmath/round_down.h +102 -0
  222. cuda/cccl/headers/include/cuda/__cmath/round_up.h +104 -0
  223. cuda/cccl/headers/include/cuda/__cmath/uabs.h +57 -0
  224. cuda/cccl/headers/include/cuda/__execution/determinism.h +90 -0
  225. cuda/cccl/headers/include/cuda/__execution/require.h +74 -0
  226. cuda/cccl/headers/include/cuda/__execution/tune.h +69 -0
  227. cuda/cccl/headers/include/cuda/__functional/address_stability.h +131 -0
  228. cuda/cccl/headers/include/cuda/__functional/for_each_canceled.h +276 -0
  229. cuda/cccl/headers/include/cuda/__functional/get_device_address.h +58 -0
  230. cuda/cccl/headers/include/cuda/__functional/maximum.h +58 -0
  231. cuda/cccl/headers/include/cuda/__functional/minimum.h +58 -0
  232. cuda/cccl/headers/include/cuda/__functional/proclaim_return_type.h +108 -0
  233. cuda/cccl/headers/include/cuda/__fwd/barrier.h +38 -0
  234. cuda/cccl/headers/include/cuda/__fwd/barrier_native_handle.h +42 -0
  235. cuda/cccl/headers/include/cuda/__fwd/get_stream.h +38 -0
  236. cuda/cccl/headers/include/cuda/__fwd/pipeline.h +37 -0
  237. cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +257 -0
  238. cuda/cccl/headers/include/cuda/__iterator/counting_iterator.h +460 -0
  239. cuda/cccl/headers/include/cuda/__iterator/discard_iterator.h +314 -0
  240. cuda/cccl/headers/include/cuda/__iterator/permutation_iterator.h +421 -0
  241. cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +321 -0
  242. cuda/cccl/headers/include/cuda/__iterator/tabulate_output_iterator.h +333 -0
  243. cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +465 -0
  244. cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +456 -0
  245. cuda/cccl/headers/include/cuda/__latch/latch.h +44 -0
  246. cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +462 -0
  247. cuda/cccl/headers/include/cuda/__mdspan/host_device_mdspan.h +63 -0
  248. cuda/cccl/headers/include/cuda/__mdspan/restrict_accessor.h +122 -0
  249. cuda/cccl/headers/include/cuda/__mdspan/restrict_mdspan.h +51 -0
  250. cuda/cccl/headers/include/cuda/__memcpy_async/check_preconditions.h +79 -0
  251. cuda/cccl/headers/include/cuda/__memcpy_async/completion_mechanism.h +47 -0
  252. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_bulk_shared_global.h +60 -0
  253. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_fallback.h +72 -0
  254. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_shared_global.h +98 -0
  255. cuda/cccl/headers/include/cuda/__memcpy_async/dispatch_memcpy_async.h +162 -0
  256. cuda/cccl/headers/include/cuda/__memcpy_async/is_local_smem_barrier.h +49 -0
  257. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async.h +179 -0
  258. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_barrier.h +99 -0
  259. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_tx.h +99 -0
  260. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_completion.h +170 -0
  261. cuda/cccl/headers/include/cuda/__memcpy_async/try_get_barrier_handle.h +59 -0
  262. cuda/cccl/headers/include/cuda/__memory/address_space.h +86 -0
  263. cuda/cccl/headers/include/cuda/__memory/align_down.h +56 -0
  264. cuda/cccl/headers/include/cuda/__memory/align_up.h +56 -0
  265. cuda/cccl/headers/include/cuda/__memory/is_aligned.h +47 -0
  266. cuda/cccl/headers/include/cuda/__memory/ptr_rebind.h +75 -0
  267. cuda/cccl/headers/include/cuda/__memory_resource/get_memory_resource.h +94 -0
  268. cuda/cccl/headers/include/cuda/__memory_resource/get_property.h +157 -0
  269. cuda/cccl/headers/include/cuda/__memory_resource/properties.h +73 -0
  270. cuda/cccl/headers/include/cuda/__memory_resource/resource.h +129 -0
  271. cuda/cccl/headers/include/cuda/__memory_resource/resource_ref.h +653 -0
  272. cuda/cccl/headers/include/cuda/__numeric/narrow.h +108 -0
  273. cuda/cccl/headers/include/cuda/__numeric/overflow_cast.h +57 -0
  274. cuda/cccl/headers/include/cuda/__numeric/overflow_result.h +43 -0
  275. cuda/cccl/headers/include/cuda/__nvtx/nvtx.h +101 -0
  276. cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +2982 -0
  277. cuda/cccl/headers/include/cuda/__ptx/instructions/barrier_cluster.h +43 -0
  278. cuda/cccl/headers/include/cuda/__ptx/instructions/bfind.h +41 -0
  279. cuda/cccl/headers/include/cuda/__ptx/instructions/bmsk.h +41 -0
  280. cuda/cccl/headers/include/cuda/__ptx/instructions/clusterlaunchcontrol.h +41 -0
  281. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk.h +44 -0
  282. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_commit_group.h +43 -0
  283. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h +45 -0
  284. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_wait_group.h +43 -0
  285. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_mbarrier_arrive.h +42 -0
  286. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_reduce_async_bulk.h +60 -0
  287. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_reduce_async_bulk_tensor.h +43 -0
  288. cuda/cccl/headers/include/cuda/__ptx/instructions/elect_sync.h +41 -0
  289. cuda/cccl/headers/include/cuda/__ptx/instructions/exit.h +41 -0
  290. cuda/cccl/headers/include/cuda/__ptx/instructions/fence.h +49 -0
  291. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/barrier_cluster.h +115 -0
  292. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/bfind.h +190 -0
  293. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/bmsk.h +54 -0
  294. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/clusterlaunchcontrol.h +240 -0
  295. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk.h +193 -0
  296. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.h +25 -0
  297. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.h +52 -0
  298. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.h +957 -0
  299. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_gather_scatter.h +288 -0
  300. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.h +596 -0
  301. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.h +46 -0
  302. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive.h +26 -0
  303. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive_noinc.h +26 -0
  304. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.h +1445 -0
  305. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.h +132 -0
  306. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.h +117 -0
  307. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.h +601 -0
  308. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/elect_sync.h +36 -0
  309. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/exit.h +25 -0
  310. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence.h +208 -0
  311. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.h +31 -0
  312. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_alias.h +25 -0
  313. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_async.h +58 -0
  314. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_async_generic_sync_restrict.h +62 -0
  315. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.h +101 -0
  316. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_sync_restrict.h +62 -0
  317. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/get_sreg.h +949 -0
  318. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/getctarank.h +32 -0
  319. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/ld.h +15074 -0
  320. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive.h +385 -0
  321. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.h +176 -0
  322. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.h +34 -0
  323. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_expect_tx.h +94 -0
  324. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_init.h +27 -0
  325. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.h +137 -0
  326. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.h +138 -0
  327. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.h +280 -0
  328. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.h +282 -0
  329. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_ld_reduce.h +2148 -0
  330. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_red.h +1272 -0
  331. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_st.h +228 -0
  332. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/prmt.h +230 -0
  333. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/red_async.h +430 -0
  334. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/shl.h +96 -0
  335. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/shr.h +168 -0
  336. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st.h +1830 -0
  337. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st_async.h +123 -0
  338. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st_bulk.h +31 -0
  339. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_alloc.h +105 -0
  340. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_commit.h +81 -0
  341. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_cp.h +612 -0
  342. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_fence.h +44 -0
  343. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_ld.h +4446 -0
  344. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_mma.h +4061 -0
  345. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_mma_ws.h +6438 -0
  346. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_shift.h +36 -0
  347. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_st.h +4582 -0
  348. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_wait.h +44 -0
  349. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.h +67 -0
  350. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tensormap_replace.h +750 -0
  351. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/trap.h +25 -0
  352. cuda/cccl/headers/include/cuda/__ptx/instructions/get_sreg.h +43 -0
  353. cuda/cccl/headers/include/cuda/__ptx/instructions/getctarank.h +43 -0
  354. cuda/cccl/headers/include/cuda/__ptx/instructions/ld.h +41 -0
  355. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_arrive.h +45 -0
  356. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_expect_tx.h +41 -0
  357. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_init.h +43 -0
  358. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_wait.h +46 -0
  359. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_ld_reduce.h +41 -0
  360. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_red.h +41 -0
  361. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_st.h +41 -0
  362. cuda/cccl/headers/include/cuda/__ptx/instructions/prmt.h +41 -0
  363. cuda/cccl/headers/include/cuda/__ptx/instructions/red_async.h +43 -0
  364. cuda/cccl/headers/include/cuda/__ptx/instructions/shfl_sync.h +244 -0
  365. cuda/cccl/headers/include/cuda/__ptx/instructions/shl.h +41 -0
  366. cuda/cccl/headers/include/cuda/__ptx/instructions/shr.h +41 -0
  367. cuda/cccl/headers/include/cuda/__ptx/instructions/st.h +41 -0
  368. cuda/cccl/headers/include/cuda/__ptx/instructions/st_async.h +43 -0
  369. cuda/cccl/headers/include/cuda/__ptx/instructions/st_bulk.h +41 -0
  370. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_alloc.h +41 -0
  371. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_commit.h +41 -0
  372. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_cp.h +41 -0
  373. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_fence.h +41 -0
  374. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_ld.h +41 -0
  375. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_mma.h +41 -0
  376. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_mma_ws.h +41 -0
  377. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_shift.h +41 -0
  378. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_st.h +41 -0
  379. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_wait.h +41 -0
  380. cuda/cccl/headers/include/cuda/__ptx/instructions/tensormap_cp_fenceproxy.h +43 -0
  381. cuda/cccl/headers/include/cuda/__ptx/instructions/tensormap_replace.h +43 -0
  382. cuda/cccl/headers/include/cuda/__ptx/instructions/trap.h +41 -0
  383. cuda/cccl/headers/include/cuda/__ptx/ptx_dot_variants.h +230 -0
  384. cuda/cccl/headers/include/cuda/__ptx/ptx_helper_functions.h +151 -0
  385. cuda/cccl/headers/include/cuda/__semaphore/counting_semaphore.h +53 -0
  386. cuda/cccl/headers/include/cuda/__stream/get_stream.h +110 -0
  387. cuda/cccl/headers/include/cuda/__stream/stream_ref.h +163 -0
  388. cuda/cccl/headers/include/cuda/__type_traits/is_floating_point.h +47 -0
  389. cuda/cccl/headers/include/cuda/__utility/static_for.h +74 -0
  390. cuda/cccl/headers/include/cuda/__warp/lane_mask.h +326 -0
  391. cuda/cccl/headers/include/cuda/__warp/warp_match_all.h +65 -0
  392. cuda/cccl/headers/include/cuda/__warp/warp_shuffle.h +249 -0
  393. cuda/cccl/headers/include/cuda/access_property +26 -0
  394. cuda/cccl/headers/include/cuda/annotated_ptr +29 -0
  395. cuda/cccl/headers/include/cuda/atomic +27 -0
  396. cuda/cccl/headers/include/cuda/barrier +262 -0
  397. cuda/cccl/headers/include/cuda/bit +29 -0
  398. cuda/cccl/headers/include/cuda/cmath +35 -0
  399. cuda/cccl/headers/include/cuda/discard_memory +60 -0
  400. cuda/cccl/headers/include/cuda/functional +31 -0
  401. cuda/cccl/headers/include/cuda/iterator +34 -0
  402. cuda/cccl/headers/include/cuda/latch +27 -0
  403. cuda/cccl/headers/include/cuda/mdspan +28 -0
  404. cuda/cccl/headers/include/cuda/memory +32 -0
  405. cuda/cccl/headers/include/cuda/memory_resource +41 -0
  406. cuda/cccl/headers/include/cuda/numeric +28 -0
  407. cuda/cccl/headers/include/cuda/pipeline +577 -0
  408. cuda/cccl/headers/include/cuda/ptx +124 -0
  409. cuda/cccl/headers/include/cuda/semaphore +31 -0
  410. cuda/cccl/headers/include/cuda/std/__algorithm/adjacent_find.h +59 -0
  411. cuda/cccl/headers/include/cuda/std/__algorithm/all_of.h +45 -0
  412. cuda/cccl/headers/include/cuda/std/__algorithm/any_of.h +45 -0
  413. cuda/cccl/headers/include/cuda/std/__algorithm/binary_search.h +52 -0
  414. cuda/cccl/headers/include/cuda/std/__algorithm/clamp.h +48 -0
  415. cuda/cccl/headers/include/cuda/std/__algorithm/comp.h +64 -0
  416. cuda/cccl/headers/include/cuda/std/__algorithm/comp_ref_type.h +85 -0
  417. cuda/cccl/headers/include/cuda/std/__algorithm/copy.h +142 -0
  418. cuda/cccl/headers/include/cuda/std/__algorithm/copy_backward.h +80 -0
  419. cuda/cccl/headers/include/cuda/std/__algorithm/copy_if.h +47 -0
  420. cuda/cccl/headers/include/cuda/std/__algorithm/copy_n.h +73 -0
  421. cuda/cccl/headers/include/cuda/std/__algorithm/count.h +49 -0
  422. cuda/cccl/headers/include/cuda/std/__algorithm/count_if.h +49 -0
  423. cuda/cccl/headers/include/cuda/std/__algorithm/equal.h +128 -0
  424. cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +101 -0
  425. cuda/cccl/headers/include/cuda/std/__algorithm/fill.h +58 -0
  426. cuda/cccl/headers/include/cuda/std/__algorithm/fill_n.h +51 -0
  427. cuda/cccl/headers/include/cuda/std/__algorithm/find.h +62 -0
  428. cuda/cccl/headers/include/cuda/std/__algorithm/find_end.h +225 -0
  429. cuda/cccl/headers/include/cuda/std/__algorithm/find_first_of.h +73 -0
  430. cuda/cccl/headers/include/cuda/std/__algorithm/find_if.h +46 -0
  431. cuda/cccl/headers/include/cuda/std/__algorithm/find_if_not.h +46 -0
  432. cuda/cccl/headers/include/cuda/std/__algorithm/for_each.h +42 -0
  433. cuda/cccl/headers/include/cuda/std/__algorithm/for_each_n.h +48 -0
  434. cuda/cccl/headers/include/cuda/std/__algorithm/generate.h +41 -0
  435. cuda/cccl/headers/include/cuda/std/__algorithm/generate_n.h +46 -0
  436. cuda/cccl/headers/include/cuda/std/__algorithm/half_positive.h +49 -0
  437. cuda/cccl/headers/include/cuda/std/__algorithm/includes.h +92 -0
  438. cuda/cccl/headers/include/cuda/std/__algorithm/is_heap.h +50 -0
  439. cuda/cccl/headers/include/cuda/std/__algorithm/is_heap_until.h +83 -0
  440. cuda/cccl/headers/include/cuda/std/__algorithm/is_partitioned.h +57 -0
  441. cuda/cccl/headers/include/cuda/std/__algorithm/is_permutation.h +252 -0
  442. cuda/cccl/headers/include/cuda/std/__algorithm/is_sorted.h +49 -0
  443. cuda/cccl/headers/include/cuda/std/__algorithm/is_sorted_until.h +68 -0
  444. cuda/cccl/headers/include/cuda/std/__algorithm/iter_swap.h +82 -0
  445. cuda/cccl/headers/include/cuda/std/__algorithm/iterator_operations.h +185 -0
  446. cuda/cccl/headers/include/cuda/std/__algorithm/lexicographical_compare.h +68 -0
  447. cuda/cccl/headers/include/cuda/std/__algorithm/lower_bound.h +82 -0
  448. cuda/cccl/headers/include/cuda/std/__algorithm/make_heap.h +70 -0
  449. cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +96 -0
  450. cuda/cccl/headers/include/cuda/std/__algorithm/max.h +62 -0
  451. cuda/cccl/headers/include/cuda/std/__algorithm/max_element.h +68 -0
  452. cuda/cccl/headers/include/cuda/std/__algorithm/merge.h +89 -0
  453. cuda/cccl/headers/include/cuda/std/__algorithm/min.h +62 -0
  454. cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +87 -0
  455. cuda/cccl/headers/include/cuda/std/__algorithm/minmax.h +66 -0
  456. cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +140 -0
  457. cuda/cccl/headers/include/cuda/std/__algorithm/mismatch.h +83 -0
  458. cuda/cccl/headers/include/cuda/std/__algorithm/move.h +87 -0
  459. cuda/cccl/headers/include/cuda/std/__algorithm/move_backward.h +84 -0
  460. cuda/cccl/headers/include/cuda/std/__algorithm/next_permutation.h +88 -0
  461. cuda/cccl/headers/include/cuda/std/__algorithm/none_of.h +45 -0
  462. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort.h +102 -0
  463. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort_copy.h +122 -0
  464. cuda/cccl/headers/include/cuda/std/__algorithm/partition.h +120 -0
  465. cuda/cccl/headers/include/cuda/std/__algorithm/partition_copy.h +59 -0
  466. cuda/cccl/headers/include/cuda/std/__algorithm/partition_point.h +61 -0
  467. cuda/cccl/headers/include/cuda/std/__algorithm/pop_heap.h +94 -0
  468. cuda/cccl/headers/include/cuda/std/__algorithm/prev_permutation.h +88 -0
  469. cuda/cccl/headers/include/cuda/std/__algorithm/push_heap.h +101 -0
  470. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_iterator_concept.h +65 -0
  471. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_min.h +98 -0
  472. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_min_element.h +68 -0
  473. cuda/cccl/headers/include/cuda/std/__algorithm/remove.h +55 -0
  474. cuda/cccl/headers/include/cuda/std/__algorithm/remove_copy.h +47 -0
  475. cuda/cccl/headers/include/cuda/std/__algorithm/remove_copy_if.h +47 -0
  476. cuda/cccl/headers/include/cuda/std/__algorithm/remove_if.h +56 -0
  477. cuda/cccl/headers/include/cuda/std/__algorithm/replace.h +45 -0
  478. cuda/cccl/headers/include/cuda/std/__algorithm/replace_copy.h +54 -0
  479. cuda/cccl/headers/include/cuda/std/__algorithm/replace_copy_if.h +50 -0
  480. cuda/cccl/headers/include/cuda/std/__algorithm/replace_if.h +45 -0
  481. cuda/cccl/headers/include/cuda/std/__algorithm/reverse.h +81 -0
  482. cuda/cccl/headers/include/cuda/std/__algorithm/reverse_copy.h +43 -0
  483. cuda/cccl/headers/include/cuda/std/__algorithm/rotate.h +261 -0
  484. cuda/cccl/headers/include/cuda/std/__algorithm/rotate_copy.h +40 -0
  485. cuda/cccl/headers/include/cuda/std/__algorithm/search.h +185 -0
  486. cuda/cccl/headers/include/cuda/std/__algorithm/search_n.h +163 -0
  487. cuda/cccl/headers/include/cuda/std/__algorithm/set_difference.h +95 -0
  488. cuda/cccl/headers/include/cuda/std/__algorithm/set_intersection.h +122 -0
  489. cuda/cccl/headers/include/cuda/std/__algorithm/set_symmetric_difference.h +134 -0
  490. cuda/cccl/headers/include/cuda/std/__algorithm/set_union.h +128 -0
  491. cuda/cccl/headers/include/cuda/std/__algorithm/shift_left.h +84 -0
  492. cuda/cccl/headers/include/cuda/std/__algorithm/shift_right.h +144 -0
  493. cuda/cccl/headers/include/cuda/std/__algorithm/sift_down.h +139 -0
  494. cuda/cccl/headers/include/cuda/std/__algorithm/sort_heap.h +70 -0
  495. cuda/cccl/headers/include/cuda/std/__algorithm/swap_ranges.h +78 -0
  496. cuda/cccl/headers/include/cuda/std/__algorithm/transform.h +59 -0
  497. cuda/cccl/headers/include/cuda/std/__algorithm/unique.h +76 -0
  498. cuda/cccl/headers/include/cuda/std/__algorithm/unique_copy.h +155 -0
  499. cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_iter.h +95 -0
  500. cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_range.h +126 -0
  501. cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +83 -0
  502. cuda/cccl/headers/include/cuda/std/__algorithm_ +26 -0
  503. cuda/cccl/headers/include/cuda/std/__atomic/api/common.h +192 -0
  504. cuda/cccl/headers/include/cuda/std/__atomic/api/owned.h +138 -0
  505. cuda/cccl/headers/include/cuda/std/__atomic/api/reference.h +118 -0
  506. cuda/cccl/headers/include/cuda/std/__atomic/functions/common.h +58 -0
  507. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_local.h +218 -0
  508. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_derived.h +401 -0
  509. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated.h +3971 -0
  510. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h +177 -0
  511. cuda/cccl/headers/include/cuda/std/__atomic/functions/host.h +211 -0
  512. cuda/cccl/headers/include/cuda/std/__atomic/functions.h +33 -0
  513. cuda/cccl/headers/include/cuda/std/__atomic/order.h +159 -0
  514. cuda/cccl/headers/include/cuda/std/__atomic/platform/msvc_to_builtins.h +654 -0
  515. cuda/cccl/headers/include/cuda/std/__atomic/platform.h +93 -0
  516. cuda/cccl/headers/include/cuda/std/__atomic/scopes.h +105 -0
  517. cuda/cccl/headers/include/cuda/std/__atomic/types/base.h +250 -0
  518. cuda/cccl/headers/include/cuda/std/__atomic/types/common.h +105 -0
  519. cuda/cccl/headers/include/cuda/std/__atomic/types/locked.h +225 -0
  520. cuda/cccl/headers/include/cuda/std/__atomic/types/reference.h +73 -0
  521. cuda/cccl/headers/include/cuda/std/__atomic/types/small.h +228 -0
  522. cuda/cccl/headers/include/cuda/std/__atomic/types.h +52 -0
  523. cuda/cccl/headers/include/cuda/std/__atomic/wait/notify_wait.h +95 -0
  524. cuda/cccl/headers/include/cuda/std/__atomic/wait/polling.h +65 -0
  525. cuda/cccl/headers/include/cuda/std/__barrier/barrier.h +227 -0
  526. cuda/cccl/headers/include/cuda/std/__barrier/empty_completion.h +37 -0
  527. cuda/cccl/headers/include/cuda/std/__barrier/poll_tester.h +82 -0
  528. cuda/cccl/headers/include/cuda/std/__bit/bit_cast.h +77 -0
  529. cuda/cccl/headers/include/cuda/std/__bit/byteswap.h +183 -0
  530. cuda/cccl/headers/include/cuda/std/__bit/countl.h +167 -0
  531. cuda/cccl/headers/include/cuda/std/__bit/countr.h +185 -0
  532. cuda/cccl/headers/include/cuda/std/__bit/endian.h +39 -0
  533. cuda/cccl/headers/include/cuda/std/__bit/has_single_bit.h +43 -0
  534. cuda/cccl/headers/include/cuda/std/__bit/integral.h +124 -0
  535. cuda/cccl/headers/include/cuda/std/__bit/popcount.h +154 -0
  536. cuda/cccl/headers/include/cuda/std/__bit/reference.h +1270 -0
  537. cuda/cccl/headers/include/cuda/std/__bit/rotate.h +94 -0
  538. cuda/cccl/headers/include/cuda/std/__cccl/architecture.h +78 -0
  539. cuda/cccl/headers/include/cuda/std/__cccl/assert.h +150 -0
  540. cuda/cccl/headers/include/cuda/std/__cccl/attributes.h +207 -0
  541. cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +787 -0
  542. cuda/cccl/headers/include/cuda/std/__cccl/compiler.h +217 -0
  543. cuda/cccl/headers/include/cuda/std/__cccl/cuda_capabilities.h +43 -0
  544. cuda/cccl/headers/include/cuda/std/__cccl/cuda_toolkit.h +53 -0
  545. cuda/cccl/headers/include/cuda/std/__cccl/deprecated.h +69 -0
  546. cuda/cccl/headers/include/cuda/std/__cccl/diagnostic.h +128 -0
  547. cuda/cccl/headers/include/cuda/std/__cccl/dialect.h +126 -0
  548. cuda/cccl/headers/include/cuda/std/__cccl/epilogue.h +326 -0
  549. cuda/cccl/headers/include/cuda/std/__cccl/exceptions.h +79 -0
  550. cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +68 -0
  551. cuda/cccl/headers/include/cuda/std/__cccl/extended_data_types.h +133 -0
  552. cuda/cccl/headers/include/cuda/std/__cccl/is_non_narrowing_convertible.h +73 -0
  553. cuda/cccl/headers/include/cuda/std/__cccl/os.h +48 -0
  554. cuda/cccl/headers/include/cuda/std/__cccl/preprocessor.h +1276 -0
  555. cuda/cccl/headers/include/cuda/std/__cccl/prologue.h +267 -0
  556. cuda/cccl/headers/include/cuda/std/__cccl/ptx_isa.h +176 -0
  557. cuda/cccl/headers/include/cuda/std/__cccl/rtti.h +72 -0
  558. cuda/cccl/headers/include/cuda/std/__cccl/sequence_access.h +87 -0
  559. cuda/cccl/headers/include/cuda/std/__cccl/system_header.h +38 -0
  560. cuda/cccl/headers/include/cuda/std/__cccl/unreachable.h +31 -0
  561. cuda/cccl/headers/include/cuda/std/__cccl/version.h +26 -0
  562. cuda/cccl/headers/include/cuda/std/__cccl/visibility.h +115 -0
  563. cuda/cccl/headers/include/cuda/std/__charconv/chars_format.h +81 -0
  564. cuda/cccl/headers/include/cuda/std/__charconv/from_chars_result.h +56 -0
  565. cuda/cccl/headers/include/cuda/std/__charconv/to_chars.h +148 -0
  566. cuda/cccl/headers/include/cuda/std/__charconv/to_chars_result.h +56 -0
  567. cuda/cccl/headers/include/cuda/std/__charconv_ +30 -0
  568. cuda/cccl/headers/include/cuda/std/__cmath/abs.h +246 -0
  569. cuda/cccl/headers/include/cuda/std/__cmath/copysign.h +193 -0
  570. cuda/cccl/headers/include/cuda/std/__cmath/exponential_functions.h +724 -0
  571. cuda/cccl/headers/include/cuda/std/__cmath/fma.h +125 -0
  572. cuda/cccl/headers/include/cuda/std/__cmath/fpclassify.h +216 -0
  573. cuda/cccl/headers/include/cuda/std/__cmath/gamma.h +205 -0
  574. cuda/cccl/headers/include/cuda/std/__cmath/hyperbolic_functions.h +286 -0
  575. cuda/cccl/headers/include/cuda/std/__cmath/hypot.h +224 -0
  576. cuda/cccl/headers/include/cuda/std/__cmath/inverse_hyperbolic_functions.h +286 -0
  577. cuda/cccl/headers/include/cuda/std/__cmath/inverse_trigonometric_functions.h +371 -0
  578. cuda/cccl/headers/include/cuda/std/__cmath/isfinite.h +167 -0
  579. cuda/cccl/headers/include/cuda/std/__cmath/isinf.h +205 -0
  580. cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +180 -0
  581. cuda/cccl/headers/include/cuda/std/__cmath/isnormal.h +138 -0
  582. cuda/cccl/headers/include/cuda/std/__cmath/lerp.h +104 -0
  583. cuda/cccl/headers/include/cuda/std/__cmath/logarithms.h +582 -0
  584. cuda/cccl/headers/include/cuda/std/__cmath/min_max.h +248 -0
  585. cuda/cccl/headers/include/cuda/std/__cmath/modulo.h +208 -0
  586. cuda/cccl/headers/include/cuda/std/__cmath/nvbf16.h +58 -0
  587. cuda/cccl/headers/include/cuda/std/__cmath/nvfp16.h +58 -0
  588. cuda/cccl/headers/include/cuda/std/__cmath/remainder.h +206 -0
  589. cuda/cccl/headers/include/cuda/std/__cmath/roots.h +199 -0
  590. cuda/cccl/headers/include/cuda/std/__cmath/rounding_functions.h +984 -0
  591. cuda/cccl/headers/include/cuda/std/__cmath/signbit.h +155 -0
  592. cuda/cccl/headers/include/cuda/std/__cmath/traits.h +170 -0
  593. cuda/cccl/headers/include/cuda/std/__cmath/trigonometric_functions.h +328 -0
  594. cuda/cccl/headers/include/cuda/std/__complex/arg.h +84 -0
  595. cuda/cccl/headers/include/cuda/std/__complex/complex.h +676 -0
  596. cuda/cccl/headers/include/cuda/std/__complex/exponential_functions.h +388 -0
  597. cuda/cccl/headers/include/cuda/std/__complex/hyperbolic_functions.h +117 -0
  598. cuda/cccl/headers/include/cuda/std/__complex/inverse_hyperbolic_functions.h +215 -0
  599. cuda/cccl/headers/include/cuda/std/__complex/inverse_trigonometric_functions.h +131 -0
  600. cuda/cccl/headers/include/cuda/std/__complex/literals.h +106 -0
  601. cuda/cccl/headers/include/cuda/std/__complex/logarithms.h +53 -0
  602. cuda/cccl/headers/include/cuda/std/__complex/math.h +159 -0
  603. cuda/cccl/headers/include/cuda/std/__complex/nvbf16.h +323 -0
  604. cuda/cccl/headers/include/cuda/std/__complex/nvfp16.h +322 -0
  605. cuda/cccl/headers/include/cuda/std/__complex/roots.h +64 -0
  606. cuda/cccl/headers/include/cuda/std/__complex/trigonometric_functions.h +61 -0
  607. cuda/cccl/headers/include/cuda/std/__complex/tuple.h +107 -0
  608. cuda/cccl/headers/include/cuda/std/__complex/vector_support.h +131 -0
  609. cuda/cccl/headers/include/cuda/std/__concepts/arithmetic.h +56 -0
  610. cuda/cccl/headers/include/cuda/std/__concepts/assignable.h +64 -0
  611. cuda/cccl/headers/include/cuda/std/__concepts/boolean_testable.h +63 -0
  612. cuda/cccl/headers/include/cuda/std/__concepts/class_or_enum.h +46 -0
  613. cuda/cccl/headers/include/cuda/std/__concepts/common_reference_with.h +69 -0
  614. cuda/cccl/headers/include/cuda/std/__concepts/common_with.h +82 -0
  615. cuda/cccl/headers/include/cuda/std/__concepts/concept_macros.h +273 -0
  616. cuda/cccl/headers/include/cuda/std/__concepts/constructible.h +107 -0
  617. cuda/cccl/headers/include/cuda/std/__concepts/convertible_to.h +71 -0
  618. cuda/cccl/headers/include/cuda/std/__concepts/copyable.h +60 -0
  619. cuda/cccl/headers/include/cuda/std/__concepts/derived_from.h +57 -0
  620. cuda/cccl/headers/include/cuda/std/__concepts/destructible.h +76 -0
  621. cuda/cccl/headers/include/cuda/std/__concepts/different_from.h +38 -0
  622. cuda/cccl/headers/include/cuda/std/__concepts/equality_comparable.h +100 -0
  623. cuda/cccl/headers/include/cuda/std/__concepts/invocable.h +80 -0
  624. cuda/cccl/headers/include/cuda/std/__concepts/movable.h +58 -0
  625. cuda/cccl/headers/include/cuda/std/__concepts/predicate.h +54 -0
  626. cuda/cccl/headers/include/cuda/std/__concepts/regular.h +54 -0
  627. cuda/cccl/headers/include/cuda/std/__concepts/relation.h +77 -0
  628. cuda/cccl/headers/include/cuda/std/__concepts/same_as.h +42 -0
  629. cuda/cccl/headers/include/cuda/std/__concepts/semiregular.h +54 -0
  630. cuda/cccl/headers/include/cuda/std/__concepts/swappable.h +206 -0
  631. cuda/cccl/headers/include/cuda/std/__concepts/totally_ordered.h +101 -0
  632. cuda/cccl/headers/include/cuda/std/__cstddef/byte.h +113 -0
  633. cuda/cccl/headers/include/cuda/std/__cstddef/types.h +52 -0
  634. cuda/cccl/headers/include/cuda/std/__cstdlib/abs.h +57 -0
  635. cuda/cccl/headers/include/cuda/std/__cstdlib/aligned_alloc.h +66 -0
  636. cuda/cccl/headers/include/cuda/std/__cstdlib/div.h +96 -0
  637. cuda/cccl/headers/include/cuda/std/__cstdlib/malloc.h +69 -0
  638. cuda/cccl/headers/include/cuda/std/__cuda/api_wrapper.h +62 -0
  639. cuda/cccl/headers/include/cuda/std/__cuda/ensure_current_device.h +72 -0
  640. cuda/cccl/headers/include/cuda/std/__exception/cuda_error.h +142 -0
  641. cuda/cccl/headers/include/cuda/std/__exception/terminate.h +73 -0
  642. cuda/cccl/headers/include/cuda/std/__execution/env.h +436 -0
  643. cuda/cccl/headers/include/cuda/std/__expected/bad_expected_access.h +127 -0
  644. cuda/cccl/headers/include/cuda/std/__expected/expected.h +2001 -0
  645. cuda/cccl/headers/include/cuda/std/__expected/expected_base.h +1080 -0
  646. cuda/cccl/headers/include/cuda/std/__expected/unexpect.h +37 -0
  647. cuda/cccl/headers/include/cuda/std/__expected/unexpected.h +175 -0
  648. cuda/cccl/headers/include/cuda/std/__floating_point/arithmetic.h +56 -0
  649. cuda/cccl/headers/include/cuda/std/__floating_point/cast.h +809 -0
  650. cuda/cccl/headers/include/cuda/std/__floating_point/cccl_fp.h +125 -0
  651. cuda/cccl/headers/include/cuda/std/__floating_point/common_type.h +48 -0
  652. cuda/cccl/headers/include/cuda/std/__floating_point/constants.h +172 -0
  653. cuda/cccl/headers/include/cuda/std/__floating_point/conversion_rank_order.h +103 -0
  654. cuda/cccl/headers/include/cuda/std/__floating_point/format.h +162 -0
  655. cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +39 -0
  656. cuda/cccl/headers/include/cuda/std/__floating_point/mask.h +64 -0
  657. cuda/cccl/headers/include/cuda/std/__floating_point/native_type.h +81 -0
  658. cuda/cccl/headers/include/cuda/std/__floating_point/nvfp_types.h +58 -0
  659. cuda/cccl/headers/include/cuda/std/__floating_point/overflow_handler.h +139 -0
  660. cuda/cccl/headers/include/cuda/std/__floating_point/properties.h +229 -0
  661. cuda/cccl/headers/include/cuda/std/__floating_point/storage.h +248 -0
  662. cuda/cccl/headers/include/cuda/std/__floating_point/traits.h +172 -0
  663. cuda/cccl/headers/include/cuda/std/__format/format_error.h +76 -0
  664. cuda/cccl/headers/include/cuda/std/__format/format_parse_context.h +128 -0
  665. cuda/cccl/headers/include/cuda/std/__format_ +28 -0
  666. cuda/cccl/headers/include/cuda/std/__functional/binary_function.h +63 -0
  667. cuda/cccl/headers/include/cuda/std/__functional/binary_negate.h +65 -0
  668. cuda/cccl/headers/include/cuda/std/__functional/bind.h +337 -0
  669. cuda/cccl/headers/include/cuda/std/__functional/bind_back.h +80 -0
  670. cuda/cccl/headers/include/cuda/std/__functional/bind_front.h +73 -0
  671. cuda/cccl/headers/include/cuda/std/__functional/binder1st.h +74 -0
  672. cuda/cccl/headers/include/cuda/std/__functional/binder2nd.h +74 -0
  673. cuda/cccl/headers/include/cuda/std/__functional/compose.h +68 -0
  674. cuda/cccl/headers/include/cuda/std/__functional/default_searcher.h +75 -0
  675. cuda/cccl/headers/include/cuda/std/__functional/function.h +1277 -0
  676. cuda/cccl/headers/include/cuda/std/__functional/hash.h +650 -0
  677. cuda/cccl/headers/include/cuda/std/__functional/identity.h +61 -0
  678. cuda/cccl/headers/include/cuda/std/__functional/invoke.h +558 -0
  679. cuda/cccl/headers/include/cuda/std/__functional/is_transparent.h +43 -0
  680. cuda/cccl/headers/include/cuda/std/__functional/mem_fn.h +67 -0
  681. cuda/cccl/headers/include/cuda/std/__functional/mem_fun_ref.h +213 -0
  682. cuda/cccl/headers/include/cuda/std/__functional/not_fn.h +120 -0
  683. cuda/cccl/headers/include/cuda/std/__functional/operations.h +534 -0
  684. cuda/cccl/headers/include/cuda/std/__functional/perfect_forward.h +127 -0
  685. cuda/cccl/headers/include/cuda/std/__functional/pointer_to_binary_function.h +65 -0
  686. cuda/cccl/headers/include/cuda/std/__functional/pointer_to_unary_function.h +64 -0
  687. cuda/cccl/headers/include/cuda/std/__functional/ranges_operations.h +113 -0
  688. cuda/cccl/headers/include/cuda/std/__functional/reference_wrapper.h +113 -0
  689. cuda/cccl/headers/include/cuda/std/__functional/unary_function.h +62 -0
  690. cuda/cccl/headers/include/cuda/std/__functional/unary_negate.h +65 -0
  691. cuda/cccl/headers/include/cuda/std/__functional/unwrap_ref.h +56 -0
  692. cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +277 -0
  693. cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +35 -0
  694. cuda/cccl/headers/include/cuda/std/__fwd/array.h +36 -0
  695. cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +49 -0
  696. cuda/cccl/headers/include/cuda/std/__fwd/complex.h +34 -0
  697. cuda/cccl/headers/include/cuda/std/__fwd/fp.h +37 -0
  698. cuda/cccl/headers/include/cuda/std/__fwd/get.h +123 -0
  699. cuda/cccl/headers/include/cuda/std/__fwd/hash.h +34 -0
  700. cuda/cccl/headers/include/cuda/std/__fwd/iterator_traits.h +40 -0
  701. cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +73 -0
  702. cuda/cccl/headers/include/cuda/std/__fwd/memory_resource.h +37 -0
  703. cuda/cccl/headers/include/cuda/std/__fwd/optional.h +39 -0
  704. cuda/cccl/headers/include/cuda/std/__fwd/pair.h +34 -0
  705. cuda/cccl/headers/include/cuda/std/__fwd/reference_wrapper.h +34 -0
  706. cuda/cccl/headers/include/cuda/std/__fwd/span.h +38 -0
  707. cuda/cccl/headers/include/cuda/std/__fwd/string.h +83 -0
  708. cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +59 -0
  709. cuda/cccl/headers/include/cuda/std/__fwd/subrange.h +55 -0
  710. cuda/cccl/headers/include/cuda/std/__fwd/tuple.h +34 -0
  711. cuda/cccl/headers/include/cuda/std/__internal/cpp_dialect.h +44 -0
  712. cuda/cccl/headers/include/cuda/std/__internal/features.h +71 -0
  713. cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +102 -0
  714. cuda/cccl/headers/include/cuda/std/__iterator/access.h +128 -0
  715. cuda/cccl/headers/include/cuda/std/__iterator/advance.h +228 -0
  716. cuda/cccl/headers/include/cuda/std/__iterator/back_insert_iterator.h +100 -0
  717. cuda/cccl/headers/include/cuda/std/__iterator/bounded_iter.h +254 -0
  718. cuda/cccl/headers/include/cuda/std/__iterator/concepts.h +645 -0
  719. cuda/cccl/headers/include/cuda/std/__iterator/counted_iterator.h +464 -0
  720. cuda/cccl/headers/include/cuda/std/__iterator/data.h +61 -0
  721. cuda/cccl/headers/include/cuda/std/__iterator/default_sentinel.h +36 -0
  722. cuda/cccl/headers/include/cuda/std/__iterator/distance.h +126 -0
  723. cuda/cccl/headers/include/cuda/std/__iterator/empty.h +53 -0
  724. cuda/cccl/headers/include/cuda/std/__iterator/erase_if_container.h +53 -0
  725. cuda/cccl/headers/include/cuda/std/__iterator/front_insert_iterator.h +95 -0
  726. cuda/cccl/headers/include/cuda/std/__iterator/incrementable_traits.h +152 -0
  727. cuda/cccl/headers/include/cuda/std/__iterator/indirectly_comparable.h +55 -0
  728. cuda/cccl/headers/include/cuda/std/__iterator/insert_iterator.h +102 -0
  729. cuda/cccl/headers/include/cuda/std/__iterator/istream_iterator.h +140 -0
  730. cuda/cccl/headers/include/cuda/std/__iterator/istreambuf_iterator.h +160 -0
  731. cuda/cccl/headers/include/cuda/std/__iterator/iter_move.h +161 -0
  732. cuda/cccl/headers/include/cuda/std/__iterator/iter_swap.h +163 -0
  733. cuda/cccl/headers/include/cuda/std/__iterator/iterator.h +44 -0
  734. cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +932 -0
  735. cuda/cccl/headers/include/cuda/std/__iterator/mergeable.h +72 -0
  736. cuda/cccl/headers/include/cuda/std/__iterator/move_iterator.h +400 -0
  737. cuda/cccl/headers/include/cuda/std/__iterator/move_sentinel.h +73 -0
  738. cuda/cccl/headers/include/cuda/std/__iterator/next.h +101 -0
  739. cuda/cccl/headers/include/cuda/std/__iterator/ostream_iterator.h +98 -0
  740. cuda/cccl/headers/include/cuda/std/__iterator/ostreambuf_iterator.h +100 -0
  741. cuda/cccl/headers/include/cuda/std/__iterator/permutable.h +54 -0
  742. cuda/cccl/headers/include/cuda/std/__iterator/prev.h +91 -0
  743. cuda/cccl/headers/include/cuda/std/__iterator/projected.h +61 -0
  744. cuda/cccl/headers/include/cuda/std/__iterator/readable_traits.h +185 -0
  745. cuda/cccl/headers/include/cuda/std/__iterator/reverse_access.h +142 -0
  746. cuda/cccl/headers/include/cuda/std/__iterator/reverse_iterator.h +605 -0
  747. cuda/cccl/headers/include/cuda/std/__iterator/size.h +69 -0
  748. cuda/cccl/headers/include/cuda/std/__iterator/sortable.h +55 -0
  749. cuda/cccl/headers/include/cuda/std/__iterator/unreachable_sentinel.h +84 -0
  750. cuda/cccl/headers/include/cuda/std/__iterator/wrap_iter.h +247 -0
  751. cuda/cccl/headers/include/cuda/std/__latch/latch.h +88 -0
  752. cuda/cccl/headers/include/cuda/std/__limits/numeric_limits.h +617 -0
  753. cuda/cccl/headers/include/cuda/std/__limits/numeric_limits_ext.h +781 -0
  754. cuda/cccl/headers/include/cuda/std/__linalg/conj_if_needed.h +78 -0
  755. cuda/cccl/headers/include/cuda/std/__linalg/conjugate_transposed.h +54 -0
  756. cuda/cccl/headers/include/cuda/std/__linalg/conjugated.h +139 -0
  757. cuda/cccl/headers/include/cuda/std/__linalg/scaled.h +132 -0
  758. cuda/cccl/headers/include/cuda/std/__linalg/transposed.h +322 -0
  759. cuda/cccl/headers/include/cuda/std/__mdspan/aligned_accessor.h +98 -0
  760. cuda/cccl/headers/include/cuda/std/__mdspan/concepts.h +139 -0
  761. cuda/cccl/headers/include/cuda/std/__mdspan/default_accessor.h +73 -0
  762. cuda/cccl/headers/include/cuda/std/__mdspan/empty_base.h +358 -0
  763. cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +757 -0
  764. cuda/cccl/headers/include/cuda/std/__mdspan/layout_left.h +315 -0
  765. cuda/cccl/headers/include/cuda/std/__mdspan/layout_right.h +308 -0
  766. cuda/cccl/headers/include/cuda/std/__mdspan/layout_stride.h +605 -0
  767. cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +507 -0
  768. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_extents.h +193 -0
  769. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_helper.h +189 -0
  770. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_mapping.h +344 -0
  771. cuda/cccl/headers/include/cuda/std/__memory/addressof.h +64 -0
  772. cuda/cccl/headers/include/cuda/std/__memory/align.h +67 -0
  773. cuda/cccl/headers/include/cuda/std/__memory/allocate_at_least.h +81 -0
  774. cuda/cccl/headers/include/cuda/std/__memory/allocation_guard.h +100 -0
  775. cuda/cccl/headers/include/cuda/std/__memory/allocator.h +320 -0
  776. cuda/cccl/headers/include/cuda/std/__memory/allocator_arg_t.h +84 -0
  777. cuda/cccl/headers/include/cuda/std/__memory/allocator_destructor.h +59 -0
  778. cuda/cccl/headers/include/cuda/std/__memory/allocator_traits.h +555 -0
  779. cuda/cccl/headers/include/cuda/std/__memory/assume_aligned.h +60 -0
  780. cuda/cccl/headers/include/cuda/std/__memory/builtin_new_allocator.h +87 -0
  781. cuda/cccl/headers/include/cuda/std/__memory/compressed_pair.h +230 -0
  782. cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +248 -0
  783. cuda/cccl/headers/include/cuda/std/__memory/destruct_n.h +91 -0
  784. cuda/cccl/headers/include/cuda/std/__memory/is_sufficiently_aligned.h +43 -0
  785. cuda/cccl/headers/include/cuda/std/__memory/pointer_traits.h +261 -0
  786. cuda/cccl/headers/include/cuda/std/__memory/runtime_assume_aligned.h +62 -0
  787. cuda/cccl/headers/include/cuda/std/__memory/temporary_buffer.h +92 -0
  788. cuda/cccl/headers/include/cuda/std/__memory/uninitialized_algorithms.h +683 -0
  789. cuda/cccl/headers/include/cuda/std/__memory/unique_ptr.h +768 -0
  790. cuda/cccl/headers/include/cuda/std/__memory/uses_allocator.h +55 -0
  791. cuda/cccl/headers/include/cuda/std/__memory/voidify.h +41 -0
  792. cuda/cccl/headers/include/cuda/std/__memory_ +34 -0
  793. cuda/cccl/headers/include/cuda/std/__new/allocate.h +126 -0
  794. cuda/cccl/headers/include/cuda/std/__new/bad_alloc.h +57 -0
  795. cuda/cccl/headers/include/cuda/std/__new/launder.h +49 -0
  796. cuda/cccl/headers/include/cuda/std/__new_ +29 -0
  797. cuda/cccl/headers/include/cuda/std/__numeric/accumulate.h +56 -0
  798. cuda/cccl/headers/include/cuda/std/__numeric/adjacent_difference.h +72 -0
  799. cuda/cccl/headers/include/cuda/std/__numeric/exclusive_scan.h +66 -0
  800. cuda/cccl/headers/include/cuda/std/__numeric/gcd_lcm.h +80 -0
  801. cuda/cccl/headers/include/cuda/std/__numeric/inclusive_scan.h +73 -0
  802. cuda/cccl/headers/include/cuda/std/__numeric/inner_product.h +62 -0
  803. cuda/cccl/headers/include/cuda/std/__numeric/iota.h +42 -0
  804. cuda/cccl/headers/include/cuda/std/__numeric/midpoint.h +100 -0
  805. cuda/cccl/headers/include/cuda/std/__numeric/partial_sum.h +69 -0
  806. cuda/cccl/headers/include/cuda/std/__numeric/reduce.h +60 -0
  807. cuda/cccl/headers/include/cuda/std/__numeric/transform_exclusive_scan.h +51 -0
  808. cuda/cccl/headers/include/cuda/std/__numeric/transform_inclusive_scan.h +65 -0
  809. cuda/cccl/headers/include/cuda/std/__numeric/transform_reduce.h +72 -0
  810. cuda/cccl/headers/include/cuda/std/__optional/bad_optional_access.h +75 -0
  811. cuda/cccl/headers/include/cuda/std/__optional/hash.h +53 -0
  812. cuda/cccl/headers/include/cuda/std/__optional/make_optional.h +61 -0
  813. cuda/cccl/headers/include/cuda/std/__optional/nullopt.h +43 -0
  814. cuda/cccl/headers/include/cuda/std/__optional/optional.h +900 -0
  815. cuda/cccl/headers/include/cuda/std/__optional/optional_base.h +430 -0
  816. cuda/cccl/headers/include/cuda/std/__optional/optional_ref.h +324 -0
  817. cuda/cccl/headers/include/cuda/std/__random/generate_canonical.h +56 -0
  818. cuda/cccl/headers/include/cuda/std/__random/is_seed_sequence.h +39 -0
  819. cuda/cccl/headers/include/cuda/std/__random/is_valid.h +106 -0
  820. cuda/cccl/headers/include/cuda/std/__random/linear_congruential_engine.h +397 -0
  821. cuda/cccl/headers/include/cuda/std/__random/uniform_int_distribution.h +335 -0
  822. cuda/cccl/headers/include/cuda/std/__random/uniform_real_distribution.h +183 -0
  823. cuda/cccl/headers/include/cuda/std/__random_ +29 -0
  824. cuda/cccl/headers/include/cuda/std/__ranges/access.h +303 -0
  825. cuda/cccl/headers/include/cuda/std/__ranges/all.h +97 -0
  826. cuda/cccl/headers/include/cuda/std/__ranges/concepts.h +313 -0
  827. cuda/cccl/headers/include/cuda/std/__ranges/counted.h +90 -0
  828. cuda/cccl/headers/include/cuda/std/__ranges/dangling.h +54 -0
  829. cuda/cccl/headers/include/cuda/std/__ranges/data.h +136 -0
  830. cuda/cccl/headers/include/cuda/std/__ranges/empty.h +109 -0
  831. cuda/cccl/headers/include/cuda/std/__ranges/empty_view.h +77 -0
  832. cuda/cccl/headers/include/cuda/std/__ranges/enable_borrowed_range.h +41 -0
  833. cuda/cccl/headers/include/cuda/std/__ranges/enable_view.h +77 -0
  834. cuda/cccl/headers/include/cuda/std/__ranges/from_range.h +36 -0
  835. cuda/cccl/headers/include/cuda/std/__ranges/iota_view.h +266 -0
  836. cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +410 -0
  837. cuda/cccl/headers/include/cuda/std/__ranges/owning_view.h +161 -0
  838. cuda/cccl/headers/include/cuda/std/__ranges/range_adaptor.h +113 -0
  839. cuda/cccl/headers/include/cuda/std/__ranges/rbegin.h +174 -0
  840. cuda/cccl/headers/include/cuda/std/__ranges/ref_view.h +121 -0
  841. cuda/cccl/headers/include/cuda/std/__ranges/rend.h +181 -0
  842. cuda/cccl/headers/include/cuda/std/__ranges/repeat_view.h +345 -0
  843. cuda/cccl/headers/include/cuda/std/__ranges/single_view.h +155 -0
  844. cuda/cccl/headers/include/cuda/std/__ranges/size.h +199 -0
  845. cuda/cccl/headers/include/cuda/std/__ranges/subrange.h +513 -0
  846. cuda/cccl/headers/include/cuda/std/__ranges/take_view.h +475 -0
  847. cuda/cccl/headers/include/cuda/std/__ranges/take_while_view.h +259 -0
  848. cuda/cccl/headers/include/cuda/std/__ranges/transform_view.h +522 -0
  849. cuda/cccl/headers/include/cuda/std/__ranges/unwrap_end.h +53 -0
  850. cuda/cccl/headers/include/cuda/std/__ranges/view_interface.h +181 -0
  851. cuda/cccl/headers/include/cuda/std/__ranges/views.h +38 -0
  852. cuda/cccl/headers/include/cuda/std/__semaphore/atomic_semaphore.h +233 -0
  853. cuda/cccl/headers/include/cuda/std/__semaphore/counting_semaphore.h +51 -0
  854. cuda/cccl/headers/include/cuda/std/__string/char_traits.h +191 -0
  855. cuda/cccl/headers/include/cuda/std/__string/constexpr_c_functions.h +581 -0
  856. cuda/cccl/headers/include/cuda/std/__string/helper_functions.h +296 -0
  857. cuda/cccl/headers/include/cuda/std/__string/string_view.h +244 -0
  858. cuda/cccl/headers/include/cuda/std/__string_ +29 -0
  859. cuda/cccl/headers/include/cuda/std/__system_error/errc.h +51 -0
  860. cuda/cccl/headers/include/cuda/std/__system_error_ +26 -0
  861. cuda/cccl/headers/include/cuda/std/__thread/threading_support.h +105 -0
  862. cuda/cccl/headers/include/cuda/std/__thread/threading_support_cuda.h +47 -0
  863. cuda/cccl/headers/include/cuda/std/__thread/threading_support_external.h +41 -0
  864. cuda/cccl/headers/include/cuda/std/__thread/threading_support_pthread.h +142 -0
  865. cuda/cccl/headers/include/cuda/std/__thread/threading_support_win32.h +87 -0
  866. cuda/cccl/headers/include/cuda/std/__tuple_dir/ignore.h +51 -0
  867. cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +98 -0
  868. cuda/cccl/headers/include/cuda/std/__tuple_dir/sfinae_helpers.h +269 -0
  869. cuda/cccl/headers/include/cuda/std/__tuple_dir/structured_bindings.h +216 -0
  870. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_element.h +70 -0
  871. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_indices.h +44 -0
  872. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +90 -0
  873. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +73 -0
  874. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_size.h +79 -0
  875. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_types.h +35 -0
  876. cuda/cccl/headers/include/cuda/std/__tuple_dir/vector_types.h +277 -0
  877. cuda/cccl/headers/include/cuda/std/__type_traits/add_const.h +40 -0
  878. cuda/cccl/headers/include/cuda/std/__type_traits/add_cv.h +40 -0
  879. cuda/cccl/headers/include/cuda/std/__type_traits/add_lvalue_reference.h +62 -0
  880. cuda/cccl/headers/include/cuda/std/__type_traits/add_pointer.h +65 -0
  881. cuda/cccl/headers/include/cuda/std/__type_traits/add_rvalue_reference.h +62 -0
  882. cuda/cccl/headers/include/cuda/std/__type_traits/add_volatile.h +40 -0
  883. cuda/cccl/headers/include/cuda/std/__type_traits/aligned_storage.h +149 -0
  884. cuda/cccl/headers/include/cuda/std/__type_traits/aligned_union.h +62 -0
  885. cuda/cccl/headers/include/cuda/std/__type_traits/alignment_of.h +41 -0
  886. cuda/cccl/headers/include/cuda/std/__type_traits/always_false.h +35 -0
  887. cuda/cccl/headers/include/cuda/std/__type_traits/can_extract_key.h +69 -0
  888. cuda/cccl/headers/include/cuda/std/__type_traits/common_reference.h +262 -0
  889. cuda/cccl/headers/include/cuda/std/__type_traits/common_type.h +174 -0
  890. cuda/cccl/headers/include/cuda/std/__type_traits/conditional.h +65 -0
  891. cuda/cccl/headers/include/cuda/std/__type_traits/conjunction.h +67 -0
  892. cuda/cccl/headers/include/cuda/std/__type_traits/copy_cv.h +50 -0
  893. cuda/cccl/headers/include/cuda/std/__type_traits/copy_cvref.h +148 -0
  894. cuda/cccl/headers/include/cuda/std/__type_traits/decay.h +83 -0
  895. cuda/cccl/headers/include/cuda/std/__type_traits/dependent_type.h +35 -0
  896. cuda/cccl/headers/include/cuda/std/__type_traits/disjunction.h +77 -0
  897. cuda/cccl/headers/include/cuda/std/__type_traits/enable_if.h +43 -0
  898. cuda/cccl/headers/include/cuda/std/__type_traits/extent.h +68 -0
  899. cuda/cccl/headers/include/cuda/std/__type_traits/fold.h +47 -0
  900. cuda/cccl/headers/include/cuda/std/__type_traits/has_unique_object_representation.h +47 -0
  901. cuda/cccl/headers/include/cuda/std/__type_traits/has_virtual_destructor.h +51 -0
  902. cuda/cccl/headers/include/cuda/std/__type_traits/integral_constant.h +62 -0
  903. cuda/cccl/headers/include/cuda/std/__type_traits/is_abstract.h +40 -0
  904. cuda/cccl/headers/include/cuda/std/__type_traits/is_aggregate.h +44 -0
  905. cuda/cccl/headers/include/cuda/std/__type_traits/is_allocator.h +46 -0
  906. cuda/cccl/headers/include/cuda/std/__type_traits/is_arithmetic.h +42 -0
  907. cuda/cccl/headers/include/cuda/std/__type_traits/is_array.h +62 -0
  908. cuda/cccl/headers/include/cuda/std/__type_traits/is_assignable.h +78 -0
  909. cuda/cccl/headers/include/cuda/std/__type_traits/is_base_of.h +83 -0
  910. cuda/cccl/headers/include/cuda/std/__type_traits/is_bounded_array.h +44 -0
  911. cuda/cccl/headers/include/cuda/std/__type_traits/is_callable.h +60 -0
  912. cuda/cccl/headers/include/cuda/std/__type_traits/is_char_like_type.h +38 -0
  913. cuda/cccl/headers/include/cuda/std/__type_traits/is_class.h +68 -0
  914. cuda/cccl/headers/include/cuda/std/__type_traits/is_compound.h +54 -0
  915. cuda/cccl/headers/include/cuda/std/__type_traits/is_const.h +56 -0
  916. cuda/cccl/headers/include/cuda/std/__type_traits/is_constant_evaluated.h +51 -0
  917. cuda/cccl/headers/include/cuda/std/__type_traits/is_constructible.h +174 -0
  918. cuda/cccl/headers/include/cuda/std/__type_traits/is_convertible.h +214 -0
  919. cuda/cccl/headers/include/cuda/std/__type_traits/is_copy_assignable.h +43 -0
  920. cuda/cccl/headers/include/cuda/std/__type_traits/is_copy_constructible.h +43 -0
  921. cuda/cccl/headers/include/cuda/std/__type_traits/is_core_convertible.h +47 -0
  922. cuda/cccl/headers/include/cuda/std/__type_traits/is_corresponding_member.h +42 -0
  923. cuda/cccl/headers/include/cuda/std/__type_traits/is_default_constructible.h +40 -0
  924. cuda/cccl/headers/include/cuda/std/__type_traits/is_destructible.h +115 -0
  925. cuda/cccl/headers/include/cuda/std/__type_traits/is_empty.h +73 -0
  926. cuda/cccl/headers/include/cuda/std/__type_traits/is_enum.h +68 -0
  927. cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_arithmetic.h +38 -0
  928. cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_floating_point.h +81 -0
  929. cuda/cccl/headers/include/cuda/std/__type_traits/is_final.h +56 -0
  930. cuda/cccl/headers/include/cuda/std/__type_traits/is_floating_point.h +53 -0
  931. cuda/cccl/headers/include/cuda/std/__type_traits/is_function.h +61 -0
  932. cuda/cccl/headers/include/cuda/std/__type_traits/is_fundamental.h +56 -0
  933. cuda/cccl/headers/include/cuda/std/__type_traits/is_implicitly_default_constructible.h +57 -0
  934. cuda/cccl/headers/include/cuda/std/__type_traits/is_integer.h +45 -0
  935. cuda/cccl/headers/include/cuda/std/__type_traits/is_integral.h +123 -0
  936. cuda/cccl/headers/include/cuda/std/__type_traits/is_layout_compatible.h +45 -0
  937. cuda/cccl/headers/include/cuda/std/__type_traits/is_literal_type.h +59 -0
  938. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_function_pointer.h +79 -0
  939. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_object_pointer.h +57 -0
  940. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_pointer.h +57 -0
  941. cuda/cccl/headers/include/cuda/std/__type_traits/is_move_assignable.h +43 -0
  942. cuda/cccl/headers/include/cuda/std/__type_traits/is_move_constructible.h +42 -0
  943. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_assignable.h +70 -0
  944. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_constructible.h +84 -0
  945. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_convertible.h +59 -0
  946. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_copy_assignable.h +60 -0
  947. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_copy_constructible.h +43 -0
  948. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_default_constructible.h +54 -0
  949. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_destructible.h +82 -0
  950. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_move_assignable.h +60 -0
  951. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_move_constructible.h +42 -0
  952. cuda/cccl/headers/include/cuda/std/__type_traits/is_null_pointer.h +43 -0
  953. cuda/cccl/headers/include/cuda/std/__type_traits/is_object.h +57 -0
  954. cuda/cccl/headers/include/cuda/std/__type_traits/is_one_of.h +37 -0
  955. cuda/cccl/headers/include/cuda/std/__type_traits/is_pod.h +62 -0
  956. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer.h +60 -0
  957. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer_interconvertible_base_of.h +87 -0
  958. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer_interconvertible_with_class.h +42 -0
  959. cuda/cccl/headers/include/cuda/std/__type_traits/is_polymorphic.h +63 -0
  960. cuda/cccl/headers/include/cuda/std/__type_traits/is_primary_template.h +119 -0
  961. cuda/cccl/headers/include/cuda/std/__type_traits/is_reference.h +95 -0
  962. cuda/cccl/headers/include/cuda/std/__type_traits/is_reference_wrapper.h +50 -0
  963. cuda/cccl/headers/include/cuda/std/__type_traits/is_referenceable.h +55 -0
  964. cuda/cccl/headers/include/cuda/std/__type_traits/is_same.h +84 -0
  965. cuda/cccl/headers/include/cuda/std/__type_traits/is_scalar.h +60 -0
  966. cuda/cccl/headers/include/cuda/std/__type_traits/is_scoped_enum.h +49 -0
  967. cuda/cccl/headers/include/cuda/std/__type_traits/is_signed.h +65 -0
  968. cuda/cccl/headers/include/cuda/std/__type_traits/is_signed_integer.h +59 -0
  969. cuda/cccl/headers/include/cuda/std/__type_traits/is_standard_layout.h +57 -0
  970. cuda/cccl/headers/include/cuda/std/__type_traits/is_swappable.h +201 -0
  971. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivial.h +56 -0
  972. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_assignable.h +70 -0
  973. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_constructible.h +82 -0
  974. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_assignable.h +60 -0
  975. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_constructible.h +61 -0
  976. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copyable.h +56 -0
  977. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_default_constructible.h +55 -0
  978. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_destructible.h +73 -0
  979. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_assignable.h +60 -0
  980. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_constructible.h +58 -0
  981. cuda/cccl/headers/include/cuda/std/__type_traits/is_unbounded_array.h +43 -0
  982. cuda/cccl/headers/include/cuda/std/__type_traits/is_union.h +57 -0
  983. cuda/cccl/headers/include/cuda/std/__type_traits/is_unsigned.h +66 -0
  984. cuda/cccl/headers/include/cuda/std/__type_traits/is_unsigned_integer.h +59 -0
  985. cuda/cccl/headers/include/cuda/std/__type_traits/is_valid_expansion.h +41 -0
  986. cuda/cccl/headers/include/cuda/std/__type_traits/is_void.h +55 -0
  987. cuda/cccl/headers/include/cuda/std/__type_traits/is_volatile.h +56 -0
  988. cuda/cccl/headers/include/cuda/std/__type_traits/lazy.h +35 -0
  989. cuda/cccl/headers/include/cuda/std/__type_traits/make_const_lvalue_ref.h +36 -0
  990. cuda/cccl/headers/include/cuda/std/__type_traits/make_nbit_int.h +107 -0
  991. cuda/cccl/headers/include/cuda/std/__type_traits/make_signed.h +140 -0
  992. cuda/cccl/headers/include/cuda/std/__type_traits/make_unsigned.h +151 -0
  993. cuda/cccl/headers/include/cuda/std/__type_traits/maybe_const.h +36 -0
  994. cuda/cccl/headers/include/cuda/std/__type_traits/nat.h +39 -0
  995. cuda/cccl/headers/include/cuda/std/__type_traits/negation.h +44 -0
  996. cuda/cccl/headers/include/cuda/std/__type_traits/num_bits.h +123 -0
  997. cuda/cccl/headers/include/cuda/std/__type_traits/promote.h +163 -0
  998. cuda/cccl/headers/include/cuda/std/__type_traits/rank.h +60 -0
  999. cuda/cccl/headers/include/cuda/std/__type_traits/reference_constructs_from_temporary.h +57 -0
  1000. cuda/cccl/headers/include/cuda/std/__type_traits/reference_converts_from_temporary.h +56 -0
  1001. cuda/cccl/headers/include/cuda/std/__type_traits/remove_all_extents.h +66 -0
  1002. cuda/cccl/headers/include/cuda/std/__type_traits/remove_const.h +59 -0
  1003. cuda/cccl/headers/include/cuda/std/__type_traits/remove_const_ref.h +37 -0
  1004. cuda/cccl/headers/include/cuda/std/__type_traits/remove_cv.h +57 -0
  1005. cuda/cccl/headers/include/cuda/std/__type_traits/remove_cvref.h +57 -0
  1006. cuda/cccl/headers/include/cuda/std/__type_traits/remove_extent.h +65 -0
  1007. cuda/cccl/headers/include/cuda/std/__type_traits/remove_pointer.h +73 -0
  1008. cuda/cccl/headers/include/cuda/std/__type_traits/remove_reference.h +72 -0
  1009. cuda/cccl/headers/include/cuda/std/__type_traits/remove_volatile.h +58 -0
  1010. cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +47 -0
  1011. cuda/cccl/headers/include/cuda/std/__type_traits/type_identity.h +40 -0
  1012. cuda/cccl/headers/include/cuda/std/__type_traits/type_list.h +1067 -0
  1013. cuda/cccl/headers/include/cuda/std/__type_traits/type_set.h +132 -0
  1014. cuda/cccl/headers/include/cuda/std/__type_traits/underlying_type.h +66 -0
  1015. cuda/cccl/headers/include/cuda/std/__type_traits/void_t.h +34 -0
  1016. cuda/cccl/headers/include/cuda/std/__utility/as_const.h +52 -0
  1017. cuda/cccl/headers/include/cuda/std/__utility/auto_cast.h +32 -0
  1018. cuda/cccl/headers/include/cuda/std/__utility/cmp.h +116 -0
  1019. cuda/cccl/headers/include/cuda/std/__utility/convert_to_integral.h +101 -0
  1020. cuda/cccl/headers/include/cuda/std/__utility/declval.h +63 -0
  1021. cuda/cccl/headers/include/cuda/std/__utility/exception_guard.h +161 -0
  1022. cuda/cccl/headers/include/cuda/std/__utility/exchange.h +46 -0
  1023. cuda/cccl/headers/include/cuda/std/__utility/forward.h +59 -0
  1024. cuda/cccl/headers/include/cuda/std/__utility/forward_like.h +55 -0
  1025. cuda/cccl/headers/include/cuda/std/__utility/in_place.h +77 -0
  1026. cuda/cccl/headers/include/cuda/std/__utility/integer_sequence.h +251 -0
  1027. cuda/cccl/headers/include/cuda/std/__utility/monostate.h +99 -0
  1028. cuda/cccl/headers/include/cuda/std/__utility/move.h +74 -0
  1029. cuda/cccl/headers/include/cuda/std/__utility/pair.h +802 -0
  1030. cuda/cccl/headers/include/cuda/std/__utility/piecewise_construct.h +37 -0
  1031. cuda/cccl/headers/include/cuda/std/__utility/pod_tuple.h +510 -0
  1032. cuda/cccl/headers/include/cuda/std/__utility/priority_tag.h +40 -0
  1033. cuda/cccl/headers/include/cuda/std/__utility/rel_ops.h +63 -0
  1034. cuda/cccl/headers/include/cuda/std/__utility/swap.h +64 -0
  1035. cuda/cccl/headers/include/cuda/std/__utility/to_underlying.h +40 -0
  1036. cuda/cccl/headers/include/cuda/std/__utility/typeid.h +421 -0
  1037. cuda/cccl/headers/include/cuda/std/__utility/undefined.h +34 -0
  1038. cuda/cccl/headers/include/cuda/std/__utility/unreachable.h +37 -0
  1039. cuda/cccl/headers/include/cuda/std/array +520 -0
  1040. cuda/cccl/headers/include/cuda/std/atomic +818 -0
  1041. cuda/cccl/headers/include/cuda/std/barrier +43 -0
  1042. cuda/cccl/headers/include/cuda/std/bit +35 -0
  1043. cuda/cccl/headers/include/cuda/std/bitset +994 -0
  1044. cuda/cccl/headers/include/cuda/std/cassert +28 -0
  1045. cuda/cccl/headers/include/cuda/std/ccomplex +15 -0
  1046. cuda/cccl/headers/include/cuda/std/cfloat +59 -0
  1047. cuda/cccl/headers/include/cuda/std/chrono +26 -0
  1048. cuda/cccl/headers/include/cuda/std/climits +61 -0
  1049. cuda/cccl/headers/include/cuda/std/cmath +25 -0
  1050. cuda/cccl/headers/include/cuda/std/complex +50 -0
  1051. cuda/cccl/headers/include/cuda/std/concepts +48 -0
  1052. cuda/cccl/headers/include/cuda/std/cstddef +28 -0
  1053. cuda/cccl/headers/include/cuda/std/cstdint +178 -0
  1054. cuda/cccl/headers/include/cuda/std/cstdlib +30 -0
  1055. cuda/cccl/headers/include/cuda/std/cstring +110 -0
  1056. cuda/cccl/headers/include/cuda/std/ctime +152 -0
  1057. cuda/cccl/headers/include/cuda/std/detail/__config +45 -0
  1058. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +235 -0
  1059. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/algorithm +1720 -0
  1060. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/chrono +3628 -0
  1061. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/cmath +667 -0
  1062. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/iosfwd +128 -0
  1063. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/stdexcept +120 -0
  1064. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/tuple +1367 -0
  1065. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/variant +2154 -0
  1066. cuda/cccl/headers/include/cuda/std/execution +27 -0
  1067. cuda/cccl/headers/include/cuda/std/expected +30 -0
  1068. cuda/cccl/headers/include/cuda/std/functional +56 -0
  1069. cuda/cccl/headers/include/cuda/std/initializer_list +36 -0
  1070. cuda/cccl/headers/include/cuda/std/inplace_vector +2163 -0
  1071. cuda/cccl/headers/include/cuda/std/iterator +70 -0
  1072. cuda/cccl/headers/include/cuda/std/latch +34 -0
  1073. cuda/cccl/headers/include/cuda/std/limits +28 -0
  1074. cuda/cccl/headers/include/cuda/std/linalg +30 -0
  1075. cuda/cccl/headers/include/cuda/std/mdspan +38 -0
  1076. cuda/cccl/headers/include/cuda/std/memory +39 -0
  1077. cuda/cccl/headers/include/cuda/std/numbers +335 -0
  1078. cuda/cccl/headers/include/cuda/std/numeric +41 -0
  1079. cuda/cccl/headers/include/cuda/std/optional +31 -0
  1080. cuda/cccl/headers/include/cuda/std/ranges +69 -0
  1081. cuda/cccl/headers/include/cuda/std/ratio +417 -0
  1082. cuda/cccl/headers/include/cuda/std/semaphore +31 -0
  1083. cuda/cccl/headers/include/cuda/std/source_location +83 -0
  1084. cuda/cccl/headers/include/cuda/std/span +640 -0
  1085. cuda/cccl/headers/include/cuda/std/string_view +788 -0
  1086. cuda/cccl/headers/include/cuda/std/tuple +26 -0
  1087. cuda/cccl/headers/include/cuda/std/type_traits +176 -0
  1088. cuda/cccl/headers/include/cuda/std/utility +70 -0
  1089. cuda/cccl/headers/include/cuda/std/variant +25 -0
  1090. cuda/cccl/headers/include/cuda/std/version +245 -0
  1091. cuda/cccl/headers/include/cuda/stream_ref +54 -0
  1092. cuda/cccl/headers/include/cuda/type_traits +27 -0
  1093. cuda/cccl/headers/include/cuda/utility +27 -0
  1094. cuda/cccl/headers/include/cuda/version +16 -0
  1095. cuda/cccl/headers/include/cuda/warp +28 -0
  1096. cuda/cccl/headers/include/cuda/work_stealing +26 -0
  1097. cuda/cccl/headers/include/nv/detail/__preprocessor +169 -0
  1098. cuda/cccl/headers/include/nv/detail/__target_macros +641 -0
  1099. cuda/cccl/headers/include/nv/target +240 -0
  1100. cuda/cccl/headers/include/thrust/addressof.h +22 -0
  1101. cuda/cccl/headers/include/thrust/adjacent_difference.h +254 -0
  1102. cuda/cccl/headers/include/thrust/advance.h +59 -0
  1103. cuda/cccl/headers/include/thrust/allocate_unique.h +299 -0
  1104. cuda/cccl/headers/include/thrust/binary_search.h +1910 -0
  1105. cuda/cccl/headers/include/thrust/complex.h +859 -0
  1106. cuda/cccl/headers/include/thrust/copy.h +506 -0
  1107. cuda/cccl/headers/include/thrust/count.h +245 -0
  1108. cuda/cccl/headers/include/thrust/detail/adjacent_difference.inl +95 -0
  1109. cuda/cccl/headers/include/thrust/detail/algorithm_wrapper.h +37 -0
  1110. cuda/cccl/headers/include/thrust/detail/alignment.h +81 -0
  1111. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +350 -0
  1112. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.inl +371 -0
  1113. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +45 -0
  1114. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.inl +242 -0
  1115. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +39 -0
  1116. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.inl +137 -0
  1117. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +39 -0
  1118. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.inl +99 -0
  1119. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +53 -0
  1120. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.inl +68 -0
  1121. cuda/cccl/headers/include/thrust/detail/allocator/no_throw_allocator.h +76 -0
  1122. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +102 -0
  1123. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.inl +86 -0
  1124. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +79 -0
  1125. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.inl +81 -0
  1126. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.h +39 -0
  1127. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.inl +98 -0
  1128. cuda/cccl/headers/include/thrust/detail/allocator_aware_execution_policy.h +99 -0
  1129. cuda/cccl/headers/include/thrust/detail/binary_search.inl +525 -0
  1130. cuda/cccl/headers/include/thrust/detail/caching_allocator.h +47 -0
  1131. cuda/cccl/headers/include/thrust/detail/complex/arithmetic.h +255 -0
  1132. cuda/cccl/headers/include/thrust/detail/complex/c99math.h +64 -0
  1133. cuda/cccl/headers/include/thrust/detail/complex/catrig.h +875 -0
  1134. cuda/cccl/headers/include/thrust/detail/complex/catrigf.h +589 -0
  1135. cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +233 -0
  1136. cuda/cccl/headers/include/thrust/detail/complex/ccoshf.h +161 -0
  1137. cuda/cccl/headers/include/thrust/detail/complex/cexp.h +195 -0
  1138. cuda/cccl/headers/include/thrust/detail/complex/cexpf.h +173 -0
  1139. cuda/cccl/headers/include/thrust/detail/complex/clog.h +223 -0
  1140. cuda/cccl/headers/include/thrust/detail/complex/clogf.h +210 -0
  1141. cuda/cccl/headers/include/thrust/detail/complex/complex.inl +263 -0
  1142. cuda/cccl/headers/include/thrust/detail/complex/cpow.h +50 -0
  1143. cuda/cccl/headers/include/thrust/detail/complex/cproj.h +81 -0
  1144. cuda/cccl/headers/include/thrust/detail/complex/csinh.h +228 -0
  1145. cuda/cccl/headers/include/thrust/detail/complex/csinhf.h +168 -0
  1146. cuda/cccl/headers/include/thrust/detail/complex/csqrt.h +178 -0
  1147. cuda/cccl/headers/include/thrust/detail/complex/csqrtf.h +174 -0
  1148. cuda/cccl/headers/include/thrust/detail/complex/ctanh.h +208 -0
  1149. cuda/cccl/headers/include/thrust/detail/complex/ctanhf.h +133 -0
  1150. cuda/cccl/headers/include/thrust/detail/complex/math_private.h +138 -0
  1151. cuda/cccl/headers/include/thrust/detail/complex/stream.h +73 -0
  1152. cuda/cccl/headers/include/thrust/detail/config/compiler.h +38 -0
  1153. cuda/cccl/headers/include/thrust/detail/config/config.h +43 -0
  1154. cuda/cccl/headers/include/thrust/detail/config/cpp_dialect.h +78 -0
  1155. cuda/cccl/headers/include/thrust/detail/config/device_system.h +55 -0
  1156. cuda/cccl/headers/include/thrust/detail/config/host_system.h +48 -0
  1157. cuda/cccl/headers/include/thrust/detail/config/memory_resource.h +41 -0
  1158. cuda/cccl/headers/include/thrust/detail/config/namespace.h +162 -0
  1159. cuda/cccl/headers/include/thrust/detail/config/simple_defines.h +48 -0
  1160. cuda/cccl/headers/include/thrust/detail/config.h +36 -0
  1161. cuda/cccl/headers/include/thrust/detail/contiguous_storage.h +228 -0
  1162. cuda/cccl/headers/include/thrust/detail/contiguous_storage.inl +273 -0
  1163. cuda/cccl/headers/include/thrust/detail/copy.h +72 -0
  1164. cuda/cccl/headers/include/thrust/detail/copy.inl +129 -0
  1165. cuda/cccl/headers/include/thrust/detail/copy_if.h +62 -0
  1166. cuda/cccl/headers/include/thrust/detail/copy_if.inl +102 -0
  1167. cuda/cccl/headers/include/thrust/detail/count.h +55 -0
  1168. cuda/cccl/headers/include/thrust/detail/count.inl +89 -0
  1169. cuda/cccl/headers/include/thrust/detail/device_delete.inl +52 -0
  1170. cuda/cccl/headers/include/thrust/detail/device_free.inl +47 -0
  1171. cuda/cccl/headers/include/thrust/detail/device_malloc.inl +60 -0
  1172. cuda/cccl/headers/include/thrust/detail/device_new.inl +61 -0
  1173. cuda/cccl/headers/include/thrust/detail/device_ptr.inl +48 -0
  1174. cuda/cccl/headers/include/thrust/detail/equal.inl +93 -0
  1175. cuda/cccl/headers/include/thrust/detail/event_error.h +160 -0
  1176. cuda/cccl/headers/include/thrust/detail/execute_with_allocator.h +80 -0
  1177. cuda/cccl/headers/include/thrust/detail/execute_with_allocator_fwd.h +61 -0
  1178. cuda/cccl/headers/include/thrust/detail/execution_policy.h +80 -0
  1179. cuda/cccl/headers/include/thrust/detail/extrema.inl +184 -0
  1180. cuda/cccl/headers/include/thrust/detail/fill.inl +86 -0
  1181. cuda/cccl/headers/include/thrust/detail/find.inl +113 -0
  1182. cuda/cccl/headers/include/thrust/detail/for_each.inl +84 -0
  1183. cuda/cccl/headers/include/thrust/detail/function.h +49 -0
  1184. cuda/cccl/headers/include/thrust/detail/functional/actor.h +214 -0
  1185. cuda/cccl/headers/include/thrust/detail/functional/operators.h +386 -0
  1186. cuda/cccl/headers/include/thrust/detail/gather.inl +173 -0
  1187. cuda/cccl/headers/include/thrust/detail/generate.inl +86 -0
  1188. cuda/cccl/headers/include/thrust/detail/get_iterator_value.h +62 -0
  1189. cuda/cccl/headers/include/thrust/detail/inner_product.inl +118 -0
  1190. cuda/cccl/headers/include/thrust/detail/integer_math.h +130 -0
  1191. cuda/cccl/headers/include/thrust/detail/internal_functional.h +289 -0
  1192. cuda/cccl/headers/include/thrust/detail/logical.inl +113 -0
  1193. cuda/cccl/headers/include/thrust/detail/malloc_and_free.h +77 -0
  1194. cuda/cccl/headers/include/thrust/detail/malloc_and_free_fwd.h +45 -0
  1195. cuda/cccl/headers/include/thrust/detail/memory_algorithms.h +209 -0
  1196. cuda/cccl/headers/include/thrust/detail/memory_wrapper.h +40 -0
  1197. cuda/cccl/headers/include/thrust/detail/merge.inl +276 -0
  1198. cuda/cccl/headers/include/thrust/detail/mismatch.inl +94 -0
  1199. cuda/cccl/headers/include/thrust/detail/mpl/math.h +164 -0
  1200. cuda/cccl/headers/include/thrust/detail/numeric_wrapper.h +37 -0
  1201. cuda/cccl/headers/include/thrust/detail/overlapped_copy.h +124 -0
  1202. cuda/cccl/headers/include/thrust/detail/partition.inl +378 -0
  1203. cuda/cccl/headers/include/thrust/detail/pointer.h +217 -0
  1204. cuda/cccl/headers/include/thrust/detail/pointer.inl +172 -0
  1205. cuda/cccl/headers/include/thrust/detail/preprocessor.h +652 -0
  1206. cuda/cccl/headers/include/thrust/detail/random_bijection.h +177 -0
  1207. cuda/cccl/headers/include/thrust/detail/range/head_flags.h +116 -0
  1208. cuda/cccl/headers/include/thrust/detail/range/tail_flags.h +130 -0
  1209. cuda/cccl/headers/include/thrust/detail/raw_pointer_cast.h +52 -0
  1210. cuda/cccl/headers/include/thrust/detail/raw_reference_cast.h +189 -0
  1211. cuda/cccl/headers/include/thrust/detail/reduce.inl +377 -0
  1212. cuda/cccl/headers/include/thrust/detail/reference.h +500 -0
  1213. cuda/cccl/headers/include/thrust/detail/reference_forward_declaration.h +35 -0
  1214. cuda/cccl/headers/include/thrust/detail/remove.inl +213 -0
  1215. cuda/cccl/headers/include/thrust/detail/replace.inl +231 -0
  1216. cuda/cccl/headers/include/thrust/detail/reverse.inl +88 -0
  1217. cuda/cccl/headers/include/thrust/detail/scan.inl +518 -0
  1218. cuda/cccl/headers/include/thrust/detail/scatter.inl +157 -0
  1219. cuda/cccl/headers/include/thrust/detail/seq.h +54 -0
  1220. cuda/cccl/headers/include/thrust/detail/sequence.inl +109 -0
  1221. cuda/cccl/headers/include/thrust/detail/set_operations.inl +981 -0
  1222. cuda/cccl/headers/include/thrust/detail/shuffle.inl +86 -0
  1223. cuda/cccl/headers/include/thrust/detail/sort.inl +373 -0
  1224. cuda/cccl/headers/include/thrust/detail/static_assert.h +58 -0
  1225. cuda/cccl/headers/include/thrust/detail/static_map.h +167 -0
  1226. cuda/cccl/headers/include/thrust/detail/swap_ranges.inl +65 -0
  1227. cuda/cccl/headers/include/thrust/detail/tabulate.inl +62 -0
  1228. cuda/cccl/headers/include/thrust/detail/temporary_array.h +153 -0
  1229. cuda/cccl/headers/include/thrust/detail/temporary_array.inl +120 -0
  1230. cuda/cccl/headers/include/thrust/detail/temporary_buffer.h +81 -0
  1231. cuda/cccl/headers/include/thrust/detail/transform_reduce.inl +69 -0
  1232. cuda/cccl/headers/include/thrust/detail/transform_scan.inl +161 -0
  1233. cuda/cccl/headers/include/thrust/detail/trivial_sequence.h +130 -0
  1234. cuda/cccl/headers/include/thrust/detail/tuple_meta_transform.h +61 -0
  1235. cuda/cccl/headers/include/thrust/detail/type_deduction.h +62 -0
  1236. cuda/cccl/headers/include/thrust/detail/type_traits/has_member_function.h +47 -0
  1237. cuda/cccl/headers/include/thrust/detail/type_traits/has_nested_type.h +43 -0
  1238. cuda/cccl/headers/include/thrust/detail/type_traits/is_call_possible.h +167 -0
  1239. cuda/cccl/headers/include/thrust/detail/type_traits/is_commutative.h +69 -0
  1240. cuda/cccl/headers/include/thrust/detail/type_traits/is_metafunction_defined.h +39 -0
  1241. cuda/cccl/headers/include/thrust/detail/type_traits/is_thrust_pointer.h +59 -0
  1242. cuda/cccl/headers/include/thrust/detail/type_traits/iterator/is_discard_iterator.h +44 -0
  1243. cuda/cccl/headers/include/thrust/detail/type_traits/iterator/is_output_iterator.h +46 -0
  1244. cuda/cccl/headers/include/thrust/detail/type_traits/minimum_type.h +89 -0
  1245. cuda/cccl/headers/include/thrust/detail/type_traits/pointer_traits.h +332 -0
  1246. cuda/cccl/headers/include/thrust/detail/type_traits.h +136 -0
  1247. cuda/cccl/headers/include/thrust/detail/uninitialized_copy.inl +90 -0
  1248. cuda/cccl/headers/include/thrust/detail/uninitialized_fill.inl +86 -0
  1249. cuda/cccl/headers/include/thrust/detail/unique.inl +373 -0
  1250. cuda/cccl/headers/include/thrust/detail/use_default.h +34 -0
  1251. cuda/cccl/headers/include/thrust/detail/util/align.h +59 -0
  1252. cuda/cccl/headers/include/thrust/detail/vector_base.h +615 -0
  1253. cuda/cccl/headers/include/thrust/detail/vector_base.inl +1212 -0
  1254. cuda/cccl/headers/include/thrust/device_allocator.h +134 -0
  1255. cuda/cccl/headers/include/thrust/device_delete.h +59 -0
  1256. cuda/cccl/headers/include/thrust/device_free.h +72 -0
  1257. cuda/cccl/headers/include/thrust/device_make_unique.h +56 -0
  1258. cuda/cccl/headers/include/thrust/device_malloc.h +108 -0
  1259. cuda/cccl/headers/include/thrust/device_malloc_allocator.h +190 -0
  1260. cuda/cccl/headers/include/thrust/device_new.h +91 -0
  1261. cuda/cccl/headers/include/thrust/device_new_allocator.h +179 -0
  1262. cuda/cccl/headers/include/thrust/device_ptr.h +202 -0
  1263. cuda/cccl/headers/include/thrust/device_reference.h +986 -0
  1264. cuda/cccl/headers/include/thrust/device_vector.h +574 -0
  1265. cuda/cccl/headers/include/thrust/distance.h +43 -0
  1266. cuda/cccl/headers/include/thrust/equal.h +247 -0
  1267. cuda/cccl/headers/include/thrust/execution_policy.h +384 -0
  1268. cuda/cccl/headers/include/thrust/extrema.h +657 -0
  1269. cuda/cccl/headers/include/thrust/fill.h +201 -0
  1270. cuda/cccl/headers/include/thrust/find.h +382 -0
  1271. cuda/cccl/headers/include/thrust/for_each.h +261 -0
  1272. cuda/cccl/headers/include/thrust/functional.h +396 -0
  1273. cuda/cccl/headers/include/thrust/gather.h +464 -0
  1274. cuda/cccl/headers/include/thrust/generate.h +193 -0
  1275. cuda/cccl/headers/include/thrust/host_vector.h +576 -0
  1276. cuda/cccl/headers/include/thrust/inner_product.h +264 -0
  1277. cuda/cccl/headers/include/thrust/iterator/constant_iterator.h +219 -0
  1278. cuda/cccl/headers/include/thrust/iterator/counting_iterator.h +335 -0
  1279. cuda/cccl/headers/include/thrust/iterator/detail/any_assign.h +48 -0
  1280. cuda/cccl/headers/include/thrust/iterator/detail/any_system_tag.h +43 -0
  1281. cuda/cccl/headers/include/thrust/iterator/detail/device_system_tag.h +38 -0
  1282. cuda/cccl/headers/include/thrust/iterator/detail/host_system_tag.h +38 -0
  1283. cuda/cccl/headers/include/thrust/iterator/detail/iterator_adaptor_base.h +81 -0
  1284. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_system.h +51 -0
  1285. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_traversal.h +62 -0
  1286. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h +57 -0
  1287. cuda/cccl/headers/include/thrust/iterator/detail/iterator_facade_category.h +199 -0
  1288. cuda/cccl/headers/include/thrust/iterator/detail/iterator_traversal_tags.h +50 -0
  1289. cuda/cccl/headers/include/thrust/iterator/detail/minimum_system.h +53 -0
  1290. cuda/cccl/headers/include/thrust/iterator/detail/normal_iterator.h +69 -0
  1291. cuda/cccl/headers/include/thrust/iterator/detail/retag.h +104 -0
  1292. cuda/cccl/headers/include/thrust/iterator/detail/tagged_iterator.h +81 -0
  1293. cuda/cccl/headers/include/thrust/iterator/detail/tuple_of_iterator_references.h +174 -0
  1294. cuda/cccl/headers/include/thrust/iterator/discard_iterator.h +163 -0
  1295. cuda/cccl/headers/include/thrust/iterator/iterator_adaptor.h +251 -0
  1296. cuda/cccl/headers/include/thrust/iterator/iterator_categories.h +215 -0
  1297. cuda/cccl/headers/include/thrust/iterator/iterator_facade.h +660 -0
  1298. cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +275 -0
  1299. cuda/cccl/headers/include/thrust/iterator/offset_iterator.h +192 -0
  1300. cuda/cccl/headers/include/thrust/iterator/permutation_iterator.h +204 -0
  1301. cuda/cccl/headers/include/thrust/iterator/retag.h +74 -0
  1302. cuda/cccl/headers/include/thrust/iterator/reverse_iterator.h +221 -0
  1303. cuda/cccl/headers/include/thrust/iterator/shuffle_iterator.h +184 -0
  1304. cuda/cccl/headers/include/thrust/iterator/strided_iterator.h +152 -0
  1305. cuda/cccl/headers/include/thrust/iterator/tabulate_output_iterator.h +149 -0
  1306. cuda/cccl/headers/include/thrust/iterator/transform_input_output_iterator.h +226 -0
  1307. cuda/cccl/headers/include/thrust/iterator/transform_iterator.h +351 -0
  1308. cuda/cccl/headers/include/thrust/iterator/transform_output_iterator.h +190 -0
  1309. cuda/cccl/headers/include/thrust/iterator/zip_iterator.h +357 -0
  1310. cuda/cccl/headers/include/thrust/logical.h +290 -0
  1311. cuda/cccl/headers/include/thrust/memory.h +395 -0
  1312. cuda/cccl/headers/include/thrust/merge.h +725 -0
  1313. cuda/cccl/headers/include/thrust/mismatch.h +261 -0
  1314. cuda/cccl/headers/include/thrust/mr/allocator.h +229 -0
  1315. cuda/cccl/headers/include/thrust/mr/device_memory_resource.h +41 -0
  1316. cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +526 -0
  1317. cuda/cccl/headers/include/thrust/mr/disjoint_sync_pool.h +118 -0
  1318. cuda/cccl/headers/include/thrust/mr/disjoint_tls_pool.h +68 -0
  1319. cuda/cccl/headers/include/thrust/mr/fancy_pointer_resource.h +67 -0
  1320. cuda/cccl/headers/include/thrust/mr/host_memory_resource.h +38 -0
  1321. cuda/cccl/headers/include/thrust/mr/memory_resource.h +217 -0
  1322. cuda/cccl/headers/include/thrust/mr/new.h +100 -0
  1323. cuda/cccl/headers/include/thrust/mr/polymorphic_adaptor.h +63 -0
  1324. cuda/cccl/headers/include/thrust/mr/pool.h +526 -0
  1325. cuda/cccl/headers/include/thrust/mr/pool_options.h +174 -0
  1326. cuda/cccl/headers/include/thrust/mr/sync_pool.h +114 -0
  1327. cuda/cccl/headers/include/thrust/mr/tls_pool.h +65 -0
  1328. cuda/cccl/headers/include/thrust/mr/universal_memory_resource.h +29 -0
  1329. cuda/cccl/headers/include/thrust/mr/validator.h +56 -0
  1330. cuda/cccl/headers/include/thrust/pair.h +102 -0
  1331. cuda/cccl/headers/include/thrust/partition.h +1383 -0
  1332. cuda/cccl/headers/include/thrust/per_device_resource.h +98 -0
  1333. cuda/cccl/headers/include/thrust/random/detail/discard_block_engine.inl +184 -0
  1334. cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine.inl +155 -0
  1335. cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine_discard.h +104 -0
  1336. cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine.inl +151 -0
  1337. cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h +53 -0
  1338. cuda/cccl/headers/include/thrust/random/detail/mod.h +101 -0
  1339. cuda/cccl/headers/include/thrust/random/detail/normal_distribution.inl +187 -0
  1340. cuda/cccl/headers/include/thrust/random/detail/normal_distribution_base.h +160 -0
  1341. cuda/cccl/headers/include/thrust/random/detail/random_core_access.h +63 -0
  1342. cuda/cccl/headers/include/thrust/random/detail/subtract_with_carry_engine.inl +201 -0
  1343. cuda/cccl/headers/include/thrust/random/detail/uniform_int_distribution.inl +198 -0
  1344. cuda/cccl/headers/include/thrust/random/detail/uniform_real_distribution.inl +198 -0
  1345. cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine.inl +183 -0
  1346. cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine_max.h +217 -0
  1347. cuda/cccl/headers/include/thrust/random/discard_block_engine.h +240 -0
  1348. cuda/cccl/headers/include/thrust/random/linear_congruential_engine.h +289 -0
  1349. cuda/cccl/headers/include/thrust/random/linear_feedback_shift_engine.h +217 -0
  1350. cuda/cccl/headers/include/thrust/random/normal_distribution.h +257 -0
  1351. cuda/cccl/headers/include/thrust/random/subtract_with_carry_engine.h +247 -0
  1352. cuda/cccl/headers/include/thrust/random/uniform_int_distribution.h +261 -0
  1353. cuda/cccl/headers/include/thrust/random/uniform_real_distribution.h +258 -0
  1354. cuda/cccl/headers/include/thrust/random/xor_combine_engine.h +255 -0
  1355. cuda/cccl/headers/include/thrust/random.h +120 -0
  1356. cuda/cccl/headers/include/thrust/reduce.h +1112 -0
  1357. cuda/cccl/headers/include/thrust/remove.h +768 -0
  1358. cuda/cccl/headers/include/thrust/replace.h +827 -0
  1359. cuda/cccl/headers/include/thrust/reverse.h +213 -0
  1360. cuda/cccl/headers/include/thrust/scan.h +1671 -0
  1361. cuda/cccl/headers/include/thrust/scatter.h +446 -0
  1362. cuda/cccl/headers/include/thrust/sequence.h +277 -0
  1363. cuda/cccl/headers/include/thrust/set_operations.h +3026 -0
  1364. cuda/cccl/headers/include/thrust/shuffle.h +182 -0
  1365. cuda/cccl/headers/include/thrust/sort.h +1320 -0
  1366. cuda/cccl/headers/include/thrust/swap.h +147 -0
  1367. cuda/cccl/headers/include/thrust/system/cpp/detail/adjacent_difference.h +30 -0
  1368. cuda/cccl/headers/include/thrust/system/cpp/detail/assign_value.h +30 -0
  1369. cuda/cccl/headers/include/thrust/system/cpp/detail/binary_search.h +32 -0
  1370. cuda/cccl/headers/include/thrust/system/cpp/detail/copy.h +30 -0
  1371. cuda/cccl/headers/include/thrust/system/cpp/detail/copy_if.h +30 -0
  1372. cuda/cccl/headers/include/thrust/system/cpp/detail/count.h +29 -0
  1373. cuda/cccl/headers/include/thrust/system/cpp/detail/equal.h +29 -0
  1374. cuda/cccl/headers/include/thrust/system/cpp/detail/execution_policy.h +90 -0
  1375. cuda/cccl/headers/include/thrust/system/cpp/detail/extrema.h +30 -0
  1376. cuda/cccl/headers/include/thrust/system/cpp/detail/fill.h +29 -0
  1377. cuda/cccl/headers/include/thrust/system/cpp/detail/find.h +30 -0
  1378. cuda/cccl/headers/include/thrust/system/cpp/detail/for_each.h +30 -0
  1379. cuda/cccl/headers/include/thrust/system/cpp/detail/gather.h +29 -0
  1380. cuda/cccl/headers/include/thrust/system/cpp/detail/generate.h +29 -0
  1381. cuda/cccl/headers/include/thrust/system/cpp/detail/get_value.h +30 -0
  1382. cuda/cccl/headers/include/thrust/system/cpp/detail/inner_product.h +29 -0
  1383. cuda/cccl/headers/include/thrust/system/cpp/detail/iter_swap.h +30 -0
  1384. cuda/cccl/headers/include/thrust/system/cpp/detail/logical.h +29 -0
  1385. cuda/cccl/headers/include/thrust/system/cpp/detail/malloc_and_free.h +30 -0
  1386. cuda/cccl/headers/include/thrust/system/cpp/detail/memory.inl +60 -0
  1387. cuda/cccl/headers/include/thrust/system/cpp/detail/merge.h +30 -0
  1388. cuda/cccl/headers/include/thrust/system/cpp/detail/mismatch.h +29 -0
  1389. cuda/cccl/headers/include/thrust/system/cpp/detail/par.h +62 -0
  1390. cuda/cccl/headers/include/thrust/system/cpp/detail/partition.h +30 -0
  1391. cuda/cccl/headers/include/thrust/system/cpp/detail/per_device_resource.h +29 -0
  1392. cuda/cccl/headers/include/thrust/system/cpp/detail/reduce.h +30 -0
  1393. cuda/cccl/headers/include/thrust/system/cpp/detail/reduce_by_key.h +30 -0
  1394. cuda/cccl/headers/include/thrust/system/cpp/detail/remove.h +30 -0
  1395. cuda/cccl/headers/include/thrust/system/cpp/detail/replace.h +29 -0
  1396. cuda/cccl/headers/include/thrust/system/cpp/detail/reverse.h +29 -0
  1397. cuda/cccl/headers/include/thrust/system/cpp/detail/scan.h +30 -0
  1398. cuda/cccl/headers/include/thrust/system/cpp/detail/scan_by_key.h +30 -0
  1399. cuda/cccl/headers/include/thrust/system/cpp/detail/scatter.h +29 -0
  1400. cuda/cccl/headers/include/thrust/system/cpp/detail/sequence.h +29 -0
  1401. cuda/cccl/headers/include/thrust/system/cpp/detail/set_operations.h +30 -0
  1402. cuda/cccl/headers/include/thrust/system/cpp/detail/sort.h +30 -0
  1403. cuda/cccl/headers/include/thrust/system/cpp/detail/swap_ranges.h +29 -0
  1404. cuda/cccl/headers/include/thrust/system/cpp/detail/tabulate.h +29 -0
  1405. cuda/cccl/headers/include/thrust/system/cpp/detail/temporary_buffer.h +29 -0
  1406. cuda/cccl/headers/include/thrust/system/cpp/detail/transform.h +29 -0
  1407. cuda/cccl/headers/include/thrust/system/cpp/detail/transform_reduce.h +29 -0
  1408. cuda/cccl/headers/include/thrust/system/cpp/detail/transform_scan.h +29 -0
  1409. cuda/cccl/headers/include/thrust/system/cpp/detail/uninitialized_copy.h +29 -0
  1410. cuda/cccl/headers/include/thrust/system/cpp/detail/uninitialized_fill.h +29 -0
  1411. cuda/cccl/headers/include/thrust/system/cpp/detail/unique.h +30 -0
  1412. cuda/cccl/headers/include/thrust/system/cpp/detail/unique_by_key.h +30 -0
  1413. cuda/cccl/headers/include/thrust/system/cpp/detail/vector.inl +130 -0
  1414. cuda/cccl/headers/include/thrust/system/cpp/execution_policy.h +161 -0
  1415. cuda/cccl/headers/include/thrust/system/cpp/memory.h +109 -0
  1416. cuda/cccl/headers/include/thrust/system/cpp/memory_resource.h +75 -0
  1417. cuda/cccl/headers/include/thrust/system/cpp/pointer.h +123 -0
  1418. cuda/cccl/headers/include/thrust/system/cpp/vector.h +99 -0
  1419. cuda/cccl/headers/include/thrust/system/cuda/config.h +123 -0
  1420. cuda/cccl/headers/include/thrust/system/cuda/detail/adjacent_difference.h +219 -0
  1421. cuda/cccl/headers/include/thrust/system/cuda/detail/assign_value.h +124 -0
  1422. cuda/cccl/headers/include/thrust/system/cuda/detail/binary_search.h +29 -0
  1423. cuda/cccl/headers/include/thrust/system/cuda/detail/cdp_dispatch.h +72 -0
  1424. cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +129 -0
  1425. cuda/cccl/headers/include/thrust/system/cuda/detail/copy_if.h +255 -0
  1426. cuda/cccl/headers/include/thrust/system/cuda/detail/core/agent_launcher.h +289 -0
  1427. cuda/cccl/headers/include/thrust/system/cuda/detail/core/load_iterator.h +58 -0
  1428. cuda/cccl/headers/include/thrust/system/cuda/detail/core/make_load_iterator.h +53 -0
  1429. cuda/cccl/headers/include/thrust/system/cuda/detail/core/triple_chevron_launch.h +191 -0
  1430. cuda/cccl/headers/include/thrust/system/cuda/detail/core/util.h +611 -0
  1431. cuda/cccl/headers/include/thrust/system/cuda/detail/count.h +75 -0
  1432. cuda/cccl/headers/include/thrust/system/cuda/detail/cross_system.h +243 -0
  1433. cuda/cccl/headers/include/thrust/system/cuda/detail/dispatch.h +210 -0
  1434. cuda/cccl/headers/include/thrust/system/cuda/detail/equal.h +64 -0
  1435. cuda/cccl/headers/include/thrust/system/cuda/detail/error.inl +96 -0
  1436. cuda/cccl/headers/include/thrust/system/cuda/detail/execution_policy.h +113 -0
  1437. cuda/cccl/headers/include/thrust/system/cuda/detail/extrema.h +476 -0
  1438. cuda/cccl/headers/include/thrust/system/cuda/detail/fill.h +82 -0
  1439. cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +272 -0
  1440. cuda/cccl/headers/include/thrust/system/cuda/detail/for_each.h +83 -0
  1441. cuda/cccl/headers/include/thrust/system/cuda/detail/gather.h +91 -0
  1442. cuda/cccl/headers/include/thrust/system/cuda/detail/generate.h +85 -0
  1443. cuda/cccl/headers/include/thrust/system/cuda/detail/get_value.h +65 -0
  1444. cuda/cccl/headers/include/thrust/system/cuda/detail/inner_product.h +75 -0
  1445. cuda/cccl/headers/include/thrust/system/cuda/detail/internal/copy_cross_system.h +204 -0
  1446. cuda/cccl/headers/include/thrust/system/cuda/detail/internal/copy_device_to_device.h +89 -0
  1447. cuda/cccl/headers/include/thrust/system/cuda/detail/iter_swap.h +69 -0
  1448. cuda/cccl/headers/include/thrust/system/cuda/detail/logical.h +29 -0
  1449. cuda/cccl/headers/include/thrust/system/cuda/detail/make_unsigned_special.h +61 -0
  1450. cuda/cccl/headers/include/thrust/system/cuda/detail/malloc_and_free.h +121 -0
  1451. cuda/cccl/headers/include/thrust/system/cuda/detail/memory.inl +57 -0
  1452. cuda/cccl/headers/include/thrust/system/cuda/detail/merge.h +228 -0
  1453. cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +217 -0
  1454. cuda/cccl/headers/include/thrust/system/cuda/detail/par.h +237 -0
  1455. cuda/cccl/headers/include/thrust/system/cuda/detail/par_to_seq.h +95 -0
  1456. cuda/cccl/headers/include/thrust/system/cuda/detail/parallel_for.h +81 -0
  1457. cuda/cccl/headers/include/thrust/system/cuda/detail/partition.h +405 -0
  1458. cuda/cccl/headers/include/thrust/system/cuda/detail/per_device_resource.h +72 -0
  1459. cuda/cccl/headers/include/thrust/system/cuda/detail/reduce.h +781 -0
  1460. cuda/cccl/headers/include/thrust/system/cuda/detail/reduce_by_key.h +1000 -0
  1461. cuda/cccl/headers/include/thrust/system/cuda/detail/remove.h +107 -0
  1462. cuda/cccl/headers/include/thrust/system/cuda/detail/replace.h +152 -0
  1463. cuda/cccl/headers/include/thrust/system/cuda/detail/reverse.h +88 -0
  1464. cuda/cccl/headers/include/thrust/system/cuda/detail/scan.h +342 -0
  1465. cuda/cccl/headers/include/thrust/system/cuda/detail/scan_by_key.h +415 -0
  1466. cuda/cccl/headers/include/thrust/system/cuda/detail/scatter.h +79 -0
  1467. cuda/cccl/headers/include/thrust/system/cuda/detail/sequence.h +29 -0
  1468. cuda/cccl/headers/include/thrust/system/cuda/detail/set_operations.h +1736 -0
  1469. cuda/cccl/headers/include/thrust/system/cuda/detail/sort.h +482 -0
  1470. cuda/cccl/headers/include/thrust/system/cuda/detail/swap_ranges.h +75 -0
  1471. cuda/cccl/headers/include/thrust/system/cuda/detail/tabulate.h +75 -0
  1472. cuda/cccl/headers/include/thrust/system/cuda/detail/temporary_buffer.h +132 -0
  1473. cuda/cccl/headers/include/thrust/system/cuda/detail/terminate.h +53 -0
  1474. cuda/cccl/headers/include/thrust/system/cuda/detail/transform.h +403 -0
  1475. cuda/cccl/headers/include/thrust/system/cuda/detail/transform_reduce.h +143 -0
  1476. cuda/cccl/headers/include/thrust/system/cuda/detail/transform_scan.h +119 -0
  1477. cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_copy.h +94 -0
  1478. cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_fill.h +91 -0
  1479. cuda/cccl/headers/include/thrust/system/cuda/detail/unique.h +646 -0
  1480. cuda/cccl/headers/include/thrust/system/cuda/detail/unique_by_key.h +311 -0
  1481. cuda/cccl/headers/include/thrust/system/cuda/detail/util.h +251 -0
  1482. cuda/cccl/headers/include/thrust/system/cuda/error.h +168 -0
  1483. cuda/cccl/headers/include/thrust/system/cuda/execution_policy.h +39 -0
  1484. cuda/cccl/headers/include/thrust/system/cuda/memory.h +122 -0
  1485. cuda/cccl/headers/include/thrust/system/cuda/memory_resource.h +122 -0
  1486. cuda/cccl/headers/include/thrust/system/cuda/pointer.h +160 -0
  1487. cuda/cccl/headers/include/thrust/system/cuda/vector.h +108 -0
  1488. cuda/cccl/headers/include/thrust/system/detail/adl/adjacent_difference.h +51 -0
  1489. cuda/cccl/headers/include/thrust/system/detail/adl/assign_value.h +51 -0
  1490. cuda/cccl/headers/include/thrust/system/detail/adl/binary_search.h +51 -0
  1491. cuda/cccl/headers/include/thrust/system/detail/adl/copy.h +51 -0
  1492. cuda/cccl/headers/include/thrust/system/detail/adl/copy_if.h +52 -0
  1493. cuda/cccl/headers/include/thrust/system/detail/adl/count.h +51 -0
  1494. cuda/cccl/headers/include/thrust/system/detail/adl/equal.h +51 -0
  1495. cuda/cccl/headers/include/thrust/system/detail/adl/extrema.h +51 -0
  1496. cuda/cccl/headers/include/thrust/system/detail/adl/fill.h +51 -0
  1497. cuda/cccl/headers/include/thrust/system/detail/adl/find.h +51 -0
  1498. cuda/cccl/headers/include/thrust/system/detail/adl/for_each.h +51 -0
  1499. cuda/cccl/headers/include/thrust/system/detail/adl/gather.h +51 -0
  1500. cuda/cccl/headers/include/thrust/system/detail/adl/generate.h +51 -0
  1501. cuda/cccl/headers/include/thrust/system/detail/adl/get_value.h +51 -0
  1502. cuda/cccl/headers/include/thrust/system/detail/adl/inner_product.h +51 -0
  1503. cuda/cccl/headers/include/thrust/system/detail/adl/iter_swap.h +51 -0
  1504. cuda/cccl/headers/include/thrust/system/detail/adl/logical.h +51 -0
  1505. cuda/cccl/headers/include/thrust/system/detail/adl/malloc_and_free.h +51 -0
  1506. cuda/cccl/headers/include/thrust/system/detail/adl/merge.h +51 -0
  1507. cuda/cccl/headers/include/thrust/system/detail/adl/mismatch.h +51 -0
  1508. cuda/cccl/headers/include/thrust/system/detail/adl/partition.h +51 -0
  1509. cuda/cccl/headers/include/thrust/system/detail/adl/per_device_resource.h +48 -0
  1510. cuda/cccl/headers/include/thrust/system/detail/adl/reduce.h +51 -0
  1511. cuda/cccl/headers/include/thrust/system/detail/adl/reduce_by_key.h +51 -0
  1512. cuda/cccl/headers/include/thrust/system/detail/adl/remove.h +51 -0
  1513. cuda/cccl/headers/include/thrust/system/detail/adl/replace.h +51 -0
  1514. cuda/cccl/headers/include/thrust/system/detail/adl/reverse.h +51 -0
  1515. cuda/cccl/headers/include/thrust/system/detail/adl/scan.h +51 -0
  1516. cuda/cccl/headers/include/thrust/system/detail/adl/scan_by_key.h +51 -0
  1517. cuda/cccl/headers/include/thrust/system/detail/adl/scatter.h +51 -0
  1518. cuda/cccl/headers/include/thrust/system/detail/adl/sequence.h +51 -0
  1519. cuda/cccl/headers/include/thrust/system/detail/adl/set_operations.h +51 -0
  1520. cuda/cccl/headers/include/thrust/system/detail/adl/sort.h +51 -0
  1521. cuda/cccl/headers/include/thrust/system/detail/adl/swap_ranges.h +51 -0
  1522. cuda/cccl/headers/include/thrust/system/detail/adl/tabulate.h +51 -0
  1523. cuda/cccl/headers/include/thrust/system/detail/adl/temporary_buffer.h +51 -0
  1524. cuda/cccl/headers/include/thrust/system/detail/adl/transform.h +51 -0
  1525. cuda/cccl/headers/include/thrust/system/detail/adl/transform_reduce.h +51 -0
  1526. cuda/cccl/headers/include/thrust/system/detail/adl/transform_scan.h +51 -0
  1527. cuda/cccl/headers/include/thrust/system/detail/adl/uninitialized_copy.h +51 -0
  1528. cuda/cccl/headers/include/thrust/system/detail/adl/uninitialized_fill.h +51 -0
  1529. cuda/cccl/headers/include/thrust/system/detail/adl/unique.h +51 -0
  1530. cuda/cccl/headers/include/thrust/system/detail/adl/unique_by_key.h +51 -0
  1531. cuda/cccl/headers/include/thrust/system/detail/bad_alloc.h +64 -0
  1532. cuda/cccl/headers/include/thrust/system/detail/errno.h +125 -0
  1533. cuda/cccl/headers/include/thrust/system/detail/error_category.inl +302 -0
  1534. cuda/cccl/headers/include/thrust/system/detail/error_code.inl +173 -0
  1535. cuda/cccl/headers/include/thrust/system/detail/error_condition.inl +121 -0
  1536. cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.h +59 -0
  1537. cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.inl +85 -0
  1538. cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.h +167 -0
  1539. cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.inl +391 -0
  1540. cuda/cccl/headers/include/thrust/system/detail/generic/copy.h +51 -0
  1541. cuda/cccl/headers/include/thrust/system/detail/generic/copy.inl +70 -0
  1542. cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.h +64 -0
  1543. cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.inl +152 -0
  1544. cuda/cccl/headers/include/thrust/system/detail/generic/count.h +54 -0
  1545. cuda/cccl/headers/include/thrust/system/detail/generic/count.inl +90 -0
  1546. cuda/cccl/headers/include/thrust/system/detail/generic/equal.h +55 -0
  1547. cuda/cccl/headers/include/thrust/system/detail/generic/equal.inl +66 -0
  1548. cuda/cccl/headers/include/thrust/system/detail/generic/extrema.h +72 -0
  1549. cuda/cccl/headers/include/thrust/system/detail/generic/extrema.inl +258 -0
  1550. cuda/cccl/headers/include/thrust/system/detail/generic/fill.h +60 -0
  1551. cuda/cccl/headers/include/thrust/system/detail/generic/find.h +55 -0
  1552. cuda/cccl/headers/include/thrust/system/detail/generic/find.inl +143 -0
  1553. cuda/cccl/headers/include/thrust/system/detail/generic/for_each.h +64 -0
  1554. cuda/cccl/headers/include/thrust/system/detail/generic/gather.h +79 -0
  1555. cuda/cccl/headers/include/thrust/system/detail/generic/gather.inl +102 -0
  1556. cuda/cccl/headers/include/thrust/system/detail/generic/generate.h +51 -0
  1557. cuda/cccl/headers/include/thrust/system/detail/generic/generate.inl +63 -0
  1558. cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.h +66 -0
  1559. cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.inl +78 -0
  1560. cuda/cccl/headers/include/thrust/system/detail/generic/logical.h +65 -0
  1561. cuda/cccl/headers/include/thrust/system/detail/generic/memory.h +70 -0
  1562. cuda/cccl/headers/include/thrust/system/detail/generic/memory.inl +83 -0
  1563. cuda/cccl/headers/include/thrust/system/detail/generic/merge.h +105 -0
  1564. cuda/cccl/headers/include/thrust/system/detail/generic/merge.inl +154 -0
  1565. cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.h +55 -0
  1566. cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.inl +74 -0
  1567. cuda/cccl/headers/include/thrust/system/detail/generic/partition.h +135 -0
  1568. cuda/cccl/headers/include/thrust/system/detail/generic/partition.inl +213 -0
  1569. cuda/cccl/headers/include/thrust/system/detail/generic/per_device_resource.h +49 -0
  1570. cuda/cccl/headers/include/thrust/system/detail/generic/reduce.h +77 -0
  1571. cuda/cccl/headers/include/thrust/system/detail/generic/reduce.inl +106 -0
  1572. cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.h +89 -0
  1573. cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.inl +192 -0
  1574. cuda/cccl/headers/include/thrust/system/detail/generic/remove.h +92 -0
  1575. cuda/cccl/headers/include/thrust/system/detail/generic/remove.inl +127 -0
  1576. cuda/cccl/headers/include/thrust/system/detail/generic/replace.h +101 -0
  1577. cuda/cccl/headers/include/thrust/system/detail/generic/replace.inl +181 -0
  1578. cuda/cccl/headers/include/thrust/system/detail/generic/reverse.h +54 -0
  1579. cuda/cccl/headers/include/thrust/system/detail/generic/reverse.inl +72 -0
  1580. cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.h +78 -0
  1581. cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.inl +141 -0
  1582. cuda/cccl/headers/include/thrust/system/detail/generic/scan.h +78 -0
  1583. cuda/cccl/headers/include/thrust/system/detail/generic/scan.inl +91 -0
  1584. cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.h +132 -0
  1585. cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.inl +238 -0
  1586. cuda/cccl/headers/include/thrust/system/detail/generic/scatter.h +79 -0
  1587. cuda/cccl/headers/include/thrust/system/detail/generic/scatter.inl +91 -0
  1588. cuda/cccl/headers/include/thrust/system/detail/generic/select_system.h +96 -0
  1589. cuda/cccl/headers/include/thrust/system/detail/generic/sequence.h +70 -0
  1590. cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.h +288 -0
  1591. cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.inl +482 -0
  1592. cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.h +60 -0
  1593. cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.inl +131 -0
  1594. cuda/cccl/headers/include/thrust/system/detail/generic/sort.h +119 -0
  1595. cuda/cccl/headers/include/thrust/system/detail/generic/sort.inl +181 -0
  1596. cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.h +50 -0
  1597. cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.inl +82 -0
  1598. cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.h +47 -0
  1599. cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.inl +60 -0
  1600. cuda/cccl/headers/include/thrust/system/detail/generic/tag.h +53 -0
  1601. cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.h +60 -0
  1602. cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.inl +88 -0
  1603. cuda/cccl/headers/include/thrust/system/detail/generic/transform.h +395 -0
  1604. cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.h +56 -0
  1605. cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.inl +62 -0
  1606. cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.h +86 -0
  1607. cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.inl +119 -0
  1608. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.h +51 -0
  1609. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.inl +172 -0
  1610. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.h +51 -0
  1611. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.inl +121 -0
  1612. cuda/cccl/headers/include/thrust/system/detail/generic/unique.h +77 -0
  1613. cuda/cccl/headers/include/thrust/system/detail/generic/unique.inl +119 -0
  1614. cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.h +87 -0
  1615. cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.inl +132 -0
  1616. cuda/cccl/headers/include/thrust/system/detail/internal/decompose.h +123 -0
  1617. cuda/cccl/headers/include/thrust/system/detail/sequential/adjacent_difference.h +76 -0
  1618. cuda/cccl/headers/include/thrust/system/detail/sequential/assign_value.h +48 -0
  1619. cuda/cccl/headers/include/thrust/system/detail/sequential/binary_search.h +142 -0
  1620. cuda/cccl/headers/include/thrust/system/detail/sequential/copy.h +55 -0
  1621. cuda/cccl/headers/include/thrust/system/detail/sequential/copy.inl +125 -0
  1622. cuda/cccl/headers/include/thrust/system/detail/sequential/copy_backward.h +55 -0
  1623. cuda/cccl/headers/include/thrust/system/detail/sequential/copy_if.h +77 -0
  1624. cuda/cccl/headers/include/thrust/system/detail/sequential/count.h +29 -0
  1625. cuda/cccl/headers/include/thrust/system/detail/sequential/equal.h +29 -0
  1626. cuda/cccl/headers/include/thrust/system/detail/sequential/execution_policy.h +78 -0
  1627. cuda/cccl/headers/include/thrust/system/detail/sequential/extrema.h +116 -0
  1628. cuda/cccl/headers/include/thrust/system/detail/sequential/fill.h +29 -0
  1629. cuda/cccl/headers/include/thrust/system/detail/sequential/find.h +68 -0
  1630. cuda/cccl/headers/include/thrust/system/detail/sequential/for_each.h +80 -0
  1631. cuda/cccl/headers/include/thrust/system/detail/sequential/gather.h +29 -0
  1632. cuda/cccl/headers/include/thrust/system/detail/sequential/general_copy.h +129 -0
  1633. cuda/cccl/headers/include/thrust/system/detail/sequential/generate.h +29 -0
  1634. cuda/cccl/headers/include/thrust/system/detail/sequential/get_value.h +49 -0
  1635. cuda/cccl/headers/include/thrust/system/detail/sequential/inner_product.h +29 -0
  1636. cuda/cccl/headers/include/thrust/system/detail/sequential/insertion_sort.h +147 -0
  1637. cuda/cccl/headers/include/thrust/system/detail/sequential/iter_swap.h +51 -0
  1638. cuda/cccl/headers/include/thrust/system/detail/sequential/logical.h +29 -0
  1639. cuda/cccl/headers/include/thrust/system/detail/sequential/malloc_and_free.h +56 -0
  1640. cuda/cccl/headers/include/thrust/system/detail/sequential/merge.h +81 -0
  1641. cuda/cccl/headers/include/thrust/system/detail/sequential/merge.inl +151 -0
  1642. cuda/cccl/headers/include/thrust/system/detail/sequential/mismatch.h +29 -0
  1643. cuda/cccl/headers/include/thrust/system/detail/sequential/partition.h +309 -0
  1644. cuda/cccl/headers/include/thrust/system/detail/sequential/per_device_resource.h +29 -0
  1645. cuda/cccl/headers/include/thrust/system/detail/sequential/reduce.h +70 -0
  1646. cuda/cccl/headers/include/thrust/system/detail/sequential/reduce_by_key.h +104 -0
  1647. cuda/cccl/headers/include/thrust/system/detail/sequential/remove.h +185 -0
  1648. cuda/cccl/headers/include/thrust/system/detail/sequential/replace.h +29 -0
  1649. cuda/cccl/headers/include/thrust/system/detail/sequential/reverse.h +29 -0
  1650. cuda/cccl/headers/include/thrust/system/detail/sequential/scan.h +160 -0
  1651. cuda/cccl/headers/include/thrust/system/detail/sequential/scan_by_key.h +151 -0
  1652. cuda/cccl/headers/include/thrust/system/detail/sequential/scatter.h +29 -0
  1653. cuda/cccl/headers/include/thrust/system/detail/sequential/sequence.h +29 -0
  1654. cuda/cccl/headers/include/thrust/system/detail/sequential/set_operations.h +212 -0
  1655. cuda/cccl/headers/include/thrust/system/detail/sequential/sort.h +65 -0
  1656. cuda/cccl/headers/include/thrust/system/detail/sequential/sort.inl +116 -0
  1657. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.h +61 -0
  1658. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.inl +362 -0
  1659. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.h +54 -0
  1660. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.inl +130 -0
  1661. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.h +54 -0
  1662. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.inl +592 -0
  1663. cuda/cccl/headers/include/thrust/system/detail/sequential/swap_ranges.h +29 -0
  1664. cuda/cccl/headers/include/thrust/system/detail/sequential/tabulate.h +29 -0
  1665. cuda/cccl/headers/include/thrust/system/detail/sequential/temporary_buffer.h +29 -0
  1666. cuda/cccl/headers/include/thrust/system/detail/sequential/transform.h +29 -0
  1667. cuda/cccl/headers/include/thrust/system/detail/sequential/transform_reduce.h +29 -0
  1668. cuda/cccl/headers/include/thrust/system/detail/sequential/transform_scan.h +29 -0
  1669. cuda/cccl/headers/include/thrust/system/detail/sequential/trivial_copy.h +64 -0
  1670. cuda/cccl/headers/include/thrust/system/detail/sequential/uninitialized_copy.h +29 -0
  1671. cuda/cccl/headers/include/thrust/system/detail/sequential/uninitialized_fill.h +29 -0
  1672. cuda/cccl/headers/include/thrust/system/detail/sequential/unique.h +121 -0
  1673. cuda/cccl/headers/include/thrust/system/detail/sequential/unique_by_key.h +112 -0
  1674. cuda/cccl/headers/include/thrust/system/detail/system_error.inl +108 -0
  1675. cuda/cccl/headers/include/thrust/system/error_code.h +512 -0
  1676. cuda/cccl/headers/include/thrust/system/omp/detail/adjacent_difference.h +54 -0
  1677. cuda/cccl/headers/include/thrust/system/omp/detail/assign_value.h +30 -0
  1678. cuda/cccl/headers/include/thrust/system/omp/detail/binary_search.h +77 -0
  1679. cuda/cccl/headers/include/thrust/system/omp/detail/copy.h +50 -0
  1680. cuda/cccl/headers/include/thrust/system/omp/detail/copy.inl +74 -0
  1681. cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.h +56 -0
  1682. cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.inl +59 -0
  1683. cuda/cccl/headers/include/thrust/system/omp/detail/count.h +30 -0
  1684. cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.h +50 -0
  1685. cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.inl +65 -0
  1686. cuda/cccl/headers/include/thrust/system/omp/detail/equal.h +30 -0
  1687. cuda/cccl/headers/include/thrust/system/omp/detail/execution_policy.h +113 -0
  1688. cuda/cccl/headers/include/thrust/system/omp/detail/extrema.h +66 -0
  1689. cuda/cccl/headers/include/thrust/system/omp/detail/fill.h +30 -0
  1690. cuda/cccl/headers/include/thrust/system/omp/detail/find.h +53 -0
  1691. cuda/cccl/headers/include/thrust/system/omp/detail/for_each.h +56 -0
  1692. cuda/cccl/headers/include/thrust/system/omp/detail/for_each.inl +87 -0
  1693. cuda/cccl/headers/include/thrust/system/omp/detail/gather.h +30 -0
  1694. cuda/cccl/headers/include/thrust/system/omp/detail/generate.h +30 -0
  1695. cuda/cccl/headers/include/thrust/system/omp/detail/get_value.h +30 -0
  1696. cuda/cccl/headers/include/thrust/system/omp/detail/inner_product.h +30 -0
  1697. cuda/cccl/headers/include/thrust/system/omp/detail/iter_swap.h +30 -0
  1698. cuda/cccl/headers/include/thrust/system/omp/detail/logical.h +30 -0
  1699. cuda/cccl/headers/include/thrust/system/omp/detail/malloc_and_free.h +30 -0
  1700. cuda/cccl/headers/include/thrust/system/omp/detail/memory.inl +93 -0
  1701. cuda/cccl/headers/include/thrust/system/omp/detail/merge.h +30 -0
  1702. cuda/cccl/headers/include/thrust/system/omp/detail/mismatch.h +30 -0
  1703. cuda/cccl/headers/include/thrust/system/omp/detail/par.h +62 -0
  1704. cuda/cccl/headers/include/thrust/system/omp/detail/partition.h +88 -0
  1705. cuda/cccl/headers/include/thrust/system/omp/detail/partition.inl +102 -0
  1706. cuda/cccl/headers/include/thrust/system/omp/detail/per_device_resource.h +29 -0
  1707. cuda/cccl/headers/include/thrust/system/omp/detail/pragma_omp.h +54 -0
  1708. cuda/cccl/headers/include/thrust/system/omp/detail/reduce.h +54 -0
  1709. cuda/cccl/headers/include/thrust/system/omp/detail/reduce.inl +78 -0
  1710. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.h +64 -0
  1711. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.inl +65 -0
  1712. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.h +59 -0
  1713. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.inl +103 -0
  1714. cuda/cccl/headers/include/thrust/system/omp/detail/remove.h +72 -0
  1715. cuda/cccl/headers/include/thrust/system/omp/detail/remove.inl +87 -0
  1716. cuda/cccl/headers/include/thrust/system/omp/detail/replace.h +30 -0
  1717. cuda/cccl/headers/include/thrust/system/omp/detail/reverse.h +30 -0
  1718. cuda/cccl/headers/include/thrust/system/omp/detail/scan.h +30 -0
  1719. cuda/cccl/headers/include/thrust/system/omp/detail/scan_by_key.h +30 -0
  1720. cuda/cccl/headers/include/thrust/system/omp/detail/scatter.h +30 -0
  1721. cuda/cccl/headers/include/thrust/system/omp/detail/sequence.h +30 -0
  1722. cuda/cccl/headers/include/thrust/system/omp/detail/set_operations.h +30 -0
  1723. cuda/cccl/headers/include/thrust/system/omp/detail/sort.h +60 -0
  1724. cuda/cccl/headers/include/thrust/system/omp/detail/sort.inl +265 -0
  1725. cuda/cccl/headers/include/thrust/system/omp/detail/swap_ranges.h +30 -0
  1726. cuda/cccl/headers/include/thrust/system/omp/detail/tabulate.h +30 -0
  1727. cuda/cccl/headers/include/thrust/system/omp/detail/temporary_buffer.h +29 -0
  1728. cuda/cccl/headers/include/thrust/system/omp/detail/transform.h +30 -0
  1729. cuda/cccl/headers/include/thrust/system/omp/detail/transform_reduce.h +30 -0
  1730. cuda/cccl/headers/include/thrust/system/omp/detail/transform_scan.h +30 -0
  1731. cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_copy.h +30 -0
  1732. cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_fill.h +30 -0
  1733. cuda/cccl/headers/include/thrust/system/omp/detail/unique.h +60 -0
  1734. cuda/cccl/headers/include/thrust/system/omp/detail/unique.inl +71 -0
  1735. cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.h +67 -0
  1736. cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.inl +75 -0
  1737. cuda/cccl/headers/include/thrust/system/omp/execution_policy.h +160 -0
  1738. cuda/cccl/headers/include/thrust/system/omp/memory.h +111 -0
  1739. cuda/cccl/headers/include/thrust/system/omp/memory_resource.h +75 -0
  1740. cuda/cccl/headers/include/thrust/system/omp/pointer.h +124 -0
  1741. cuda/cccl/headers/include/thrust/system/omp/vector.h +99 -0
  1742. cuda/cccl/headers/include/thrust/system/system_error.h +184 -0
  1743. cuda/cccl/headers/include/thrust/system/tbb/detail/adjacent_difference.h +54 -0
  1744. cuda/cccl/headers/include/thrust/system/tbb/detail/assign_value.h +30 -0
  1745. cuda/cccl/headers/include/thrust/system/tbb/detail/binary_search.h +30 -0
  1746. cuda/cccl/headers/include/thrust/system/tbb/detail/copy.h +50 -0
  1747. cuda/cccl/headers/include/thrust/system/tbb/detail/copy.inl +73 -0
  1748. cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.h +47 -0
  1749. cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.inl +136 -0
  1750. cuda/cccl/headers/include/thrust/system/tbb/detail/count.h +30 -0
  1751. cuda/cccl/headers/include/thrust/system/tbb/detail/equal.h +30 -0
  1752. cuda/cccl/headers/include/thrust/system/tbb/detail/execution_policy.h +92 -0
  1753. cuda/cccl/headers/include/thrust/system/tbb/detail/extrema.h +66 -0
  1754. cuda/cccl/headers/include/thrust/system/tbb/detail/fill.h +30 -0
  1755. cuda/cccl/headers/include/thrust/system/tbb/detail/find.h +49 -0
  1756. cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.h +51 -0
  1757. cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.inl +91 -0
  1758. cuda/cccl/headers/include/thrust/system/tbb/detail/gather.h +30 -0
  1759. cuda/cccl/headers/include/thrust/system/tbb/detail/generate.h +30 -0
  1760. cuda/cccl/headers/include/thrust/system/tbb/detail/get_value.h +30 -0
  1761. cuda/cccl/headers/include/thrust/system/tbb/detail/inner_product.h +30 -0
  1762. cuda/cccl/headers/include/thrust/system/tbb/detail/iter_swap.h +30 -0
  1763. cuda/cccl/headers/include/thrust/system/tbb/detail/logical.h +30 -0
  1764. cuda/cccl/headers/include/thrust/system/tbb/detail/malloc_and_free.h +30 -0
  1765. cuda/cccl/headers/include/thrust/system/tbb/detail/memory.inl +94 -0
  1766. cuda/cccl/headers/include/thrust/system/tbb/detail/merge.h +77 -0
  1767. cuda/cccl/headers/include/thrust/system/tbb/detail/merge.inl +327 -0
  1768. cuda/cccl/headers/include/thrust/system/tbb/detail/mismatch.h +30 -0
  1769. cuda/cccl/headers/include/thrust/system/tbb/detail/par.h +62 -0
  1770. cuda/cccl/headers/include/thrust/system/tbb/detail/partition.h +84 -0
  1771. cuda/cccl/headers/include/thrust/system/tbb/detail/partition.inl +98 -0
  1772. cuda/cccl/headers/include/thrust/system/tbb/detail/per_device_resource.h +29 -0
  1773. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.h +54 -0
  1774. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.inl +137 -0
  1775. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.h +61 -0
  1776. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.inl +400 -0
  1777. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_intervals.h +140 -0
  1778. cuda/cccl/headers/include/thrust/system/tbb/detail/remove.h +76 -0
  1779. cuda/cccl/headers/include/thrust/system/tbb/detail/remove.inl +87 -0
  1780. cuda/cccl/headers/include/thrust/system/tbb/detail/replace.h +30 -0
  1781. cuda/cccl/headers/include/thrust/system/tbb/detail/reverse.h +30 -0
  1782. cuda/cccl/headers/include/thrust/system/tbb/detail/scan.h +59 -0
  1783. cuda/cccl/headers/include/thrust/system/tbb/detail/scan.inl +312 -0
  1784. cuda/cccl/headers/include/thrust/system/tbb/detail/scan_by_key.h +30 -0
  1785. cuda/cccl/headers/include/thrust/system/tbb/detail/scatter.h +30 -0
  1786. cuda/cccl/headers/include/thrust/system/tbb/detail/sequence.h +30 -0
  1787. cuda/cccl/headers/include/thrust/system/tbb/detail/set_operations.h +30 -0
  1788. cuda/cccl/headers/include/thrust/system/tbb/detail/sort.h +60 -0
  1789. cuda/cccl/headers/include/thrust/system/tbb/detail/sort.inl +295 -0
  1790. cuda/cccl/headers/include/thrust/system/tbb/detail/swap_ranges.h +30 -0
  1791. cuda/cccl/headers/include/thrust/system/tbb/detail/tabulate.h +30 -0
  1792. cuda/cccl/headers/include/thrust/system/tbb/detail/temporary_buffer.h +29 -0
  1793. cuda/cccl/headers/include/thrust/system/tbb/detail/transform.h +30 -0
  1794. cuda/cccl/headers/include/thrust/system/tbb/detail/transform_reduce.h +30 -0
  1795. cuda/cccl/headers/include/thrust/system/tbb/detail/transform_scan.h +30 -0
  1796. cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_copy.h +30 -0
  1797. cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_fill.h +30 -0
  1798. cuda/cccl/headers/include/thrust/system/tbb/detail/unique.h +60 -0
  1799. cuda/cccl/headers/include/thrust/system/tbb/detail/unique.inl +71 -0
  1800. cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.h +67 -0
  1801. cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.inl +75 -0
  1802. cuda/cccl/headers/include/thrust/system/tbb/execution_policy.h +160 -0
  1803. cuda/cccl/headers/include/thrust/system/tbb/memory.h +111 -0
  1804. cuda/cccl/headers/include/thrust/system/tbb/memory_resource.h +75 -0
  1805. cuda/cccl/headers/include/thrust/system/tbb/pointer.h +124 -0
  1806. cuda/cccl/headers/include/thrust/system/tbb/vector.h +99 -0
  1807. cuda/cccl/headers/include/thrust/system_error.h +57 -0
  1808. cuda/cccl/headers/include/thrust/tabulate.h +125 -0
  1809. cuda/cccl/headers/include/thrust/transform.h +1045 -0
  1810. cuda/cccl/headers/include/thrust/transform_reduce.h +190 -0
  1811. cuda/cccl/headers/include/thrust/transform_scan.h +442 -0
  1812. cuda/cccl/headers/include/thrust/tuple.h +142 -0
  1813. cuda/cccl/headers/include/thrust/type_traits/integer_sequence.h +261 -0
  1814. cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +154 -0
  1815. cuda/cccl/headers/include/thrust/type_traits/is_execution_policy.h +65 -0
  1816. cuda/cccl/headers/include/thrust/type_traits/is_operator_less_or_greater_function_object.h +184 -0
  1817. cuda/cccl/headers/include/thrust/type_traits/is_operator_plus_function_object.h +116 -0
  1818. cuda/cccl/headers/include/thrust/type_traits/is_trivially_relocatable.h +336 -0
  1819. cuda/cccl/headers/include/thrust/type_traits/logical_metafunctions.h +42 -0
  1820. cuda/cccl/headers/include/thrust/type_traits/unwrap_contiguous_iterator.h +96 -0
  1821. cuda/cccl/headers/include/thrust/uninitialized_copy.h +300 -0
  1822. cuda/cccl/headers/include/thrust/uninitialized_fill.h +268 -0
  1823. cuda/cccl/headers/include/thrust/unique.h +1090 -0
  1824. cuda/cccl/headers/include/thrust/universal_allocator.h +93 -0
  1825. cuda/cccl/headers/include/thrust/universal_ptr.h +34 -0
  1826. cuda/cccl/headers/include/thrust/universal_vector.h +71 -0
  1827. cuda/cccl/headers/include/thrust/version.h +93 -0
  1828. cuda/cccl/headers/include/thrust/zip_function.h +176 -0
  1829. cuda/cccl/headers/include_paths.py +72 -0
  1830. cuda/cccl/parallel/__init__.py +9 -0
  1831. cuda/cccl/parallel/experimental/__init__.py +47 -0
  1832. cuda/cccl/parallel/experimental/_bindings.py +24 -0
  1833. cuda/cccl/parallel/experimental/_bindings.pyi +388 -0
  1834. cuda/cccl/parallel/experimental/_bindings_impl.cpython-310-x86_64-linux-gnu.so +0 -0
  1835. cuda/cccl/parallel/experimental/_bindings_impl.pyx +2158 -0
  1836. cuda/cccl/parallel/experimental/_caching.py +71 -0
  1837. cuda/cccl/parallel/experimental/_cccl_interop.py +382 -0
  1838. cuda/cccl/parallel/experimental/_utils/__init__.py +0 -0
  1839. cuda/cccl/parallel/experimental/_utils/protocols.py +132 -0
  1840. cuda/cccl/parallel/experimental/algorithms/__init__.py +28 -0
  1841. cuda/cccl/parallel/experimental/algorithms/_merge_sort.py +172 -0
  1842. cuda/cccl/parallel/experimental/algorithms/_radix_sort.py +244 -0
  1843. cuda/cccl/parallel/experimental/algorithms/_reduce.py +136 -0
  1844. cuda/cccl/parallel/experimental/algorithms/_scan.py +179 -0
  1845. cuda/cccl/parallel/experimental/algorithms/_segmented_reduce.py +183 -0
  1846. cuda/cccl/parallel/experimental/algorithms/_transform.py +213 -0
  1847. cuda/cccl/parallel/experimental/algorithms/_unique_by_key.py +179 -0
  1848. cuda/cccl/parallel/experimental/cccl/.gitkeep +0 -0
  1849. cuda/cccl/parallel/experimental/cccl/libcccl.c.parallel.so +0 -0
  1850. cuda/cccl/parallel/experimental/iterators/__init__.py +17 -0
  1851. cuda/cccl/parallel/experimental/iterators/_factories.py +157 -0
  1852. cuda/cccl/parallel/experimental/iterators/_iterators.py +650 -0
  1853. cuda/cccl/parallel/experimental/numba_utils.py +6 -0
  1854. cuda/cccl/parallel/experimental/struct.py +150 -0
  1855. cuda/cccl/parallel/experimental/typing.py +27 -0
  1856. cuda/cccl/py.typed +0 -0
  1857. cuda_cccl-0.1.3.1.0.dev1678.dist-info/METADATA +28 -0
  1858. cuda_cccl-0.1.3.1.0.dev1678.dist-info/RECORD +1860 -0
  1859. cuda_cccl-0.1.3.1.0.dev1678.dist-info/WHEEL +6 -0
  1860. cuda_cccl-0.1.3.1.0.dev1678.dist-info/licenses/LICENSE +1 -0
@@ -0,0 +1,1246 @@
1
+ /******************************************************************************
2
+ * Copyright (c) 2011, Duane Merrill. All rights reserved.
3
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ * * Redistributions of source code must retain the above copyright
8
+ * notice, this list of conditions and the following disclaimer.
9
+ * * Redistributions in binary form must reproduce the above copyright
10
+ * notice, this list of conditions and the following disclaimer in the
11
+ * documentation and/or other materials provided with the distribution.
12
+ * * Neither the name of the NVIDIA CORPORATION nor the
13
+ * names of its contributors may be used to endorse or promote products
14
+ * derived from this software without specific prior written permission.
15
+ *
16
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
+ *
27
+ ******************************************************************************/
28
+
29
+ //! @file
30
+ //! Operations for writing linear segments of data from the CUDA thread block
31
+
32
+ #pragma once
33
+
34
+ #include <cub/config.cuh>
35
+
36
+ #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
37
+ # pragma GCC system_header
38
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
39
+ # pragma clang system_header
40
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
41
+ # pragma system_header
42
+ #endif // no system header
43
+
44
+ #include <cub/block/block_exchange.cuh>
45
+ #include <cub/util_ptx.cuh>
46
+ #include <cub/util_type.cuh>
47
+
48
+ CUB_NAMESPACE_BEGIN
49
+
50
+ //! @name Blocked arrangement I/O (direct)
51
+ //! @{
52
+
53
+ //! @rst
54
+ //! Store a blocked arrangement of items across a thread block into a linear segment of items
55
+ //!
56
+ //! @blocked
57
+ //!
58
+ //! @endrst
59
+ //!
60
+ //! @tparam T
61
+ //! **[inferred]** The data type to store.
62
+ //!
63
+ //! @tparam ITEMS_PER_THREAD
64
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
65
+ //!
66
+ //! @tparam OutputIteratorT
67
+ //! **[inferred]** The random-access iterator type for output @iterator.
68
+ //!
69
+ //! @param[in] linear_tid
70
+ //! A suitable 1D thread-identifier for the calling thread
71
+ //! (e.g., ``(threadIdx.y * blockDim.x) + threadIdx.x`` for 2D thread blocks)
72
+ //!
73
+ //! @param[in] block_itr
74
+ //! The thread block's base output iterator for storing to
75
+ //!
76
+ //! @param[in] items
77
+ //! Data to store
78
template <typename T, int ITEMS_PER_THREAD, typename OutputIteratorT>
_CCCL_DEVICE _CCCL_FORCEINLINE void
StoreDirectBlocked(int linear_tid, OutputIteratorT block_itr, T (&items)[ITEMS_PER_THREAD])
{
  // Offset of the calling thread's first item within the block's output segment
  const int first_item = linear_tid * ITEMS_PER_THREAD;
  OutputIteratorT dst  = block_itr + first_item;

  // Each thread writes its ITEMS_PER_THREAD items to consecutive output positions
  _CCCL_PRAGMA_UNROLL_FULL()
  for (int i = 0; i < ITEMS_PER_THREAD; ++i)
  {
    dst[i] = items[i];
  }
}
91
+
92
+ //! @rst
93
+ //! Store a blocked arrangement of items across a
94
+ //! thread block into a linear segment of items, guarded by range
95
+ //!
96
+ //! @blocked
97
+ //!
98
+ //! @endrst
99
+ //!
100
+ //! @tparam T
101
+ //! **[inferred]** The data type to store.
102
+ //!
103
+ //! @tparam ITEMS_PER_THREAD
104
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
105
+ //!
106
+ //! @tparam OutputIteratorT
107
+ //! **[inferred]** The random-access iterator type for output @iterator.
108
+ //!
109
+ //! @param[in] linear_tid
110
+ //! A suitable 1D thread-identifier for the calling thread
111
+ //! (e.g., `(threadIdx.y * blockDim.x) + threadIdx.x` for 2D thread blocks)
112
+ //!
113
+ //! @param[in] block_itr
114
+ //! The thread block's base output iterator for storing to
115
+ //!
116
+ //! @param[in] items
117
+ //! Data to store
118
+ //!
119
+ //! @param[in] valid_items
120
+ //! Number of valid items to write
121
template <typename T, int ITEMS_PER_THREAD, typename OutputIteratorT>
_CCCL_DEVICE _CCCL_FORCEINLINE void
StoreDirectBlocked(int linear_tid, OutputIteratorT block_itr, T (&items)[ITEMS_PER_THREAD], int valid_items)
{
  // Offset of the calling thread's first item within the block's output segment
  const int first_item = linear_tid * ITEMS_PER_THREAD;
  OutputIteratorT dst  = block_itr + first_item;

  // Write consecutive items, skipping any whose block-wide index is not below valid_items
  _CCCL_PRAGMA_UNROLL_FULL()
  for (int i = 0; i < ITEMS_PER_THREAD; ++i)
  {
    const bool in_range = (i + first_item) < valid_items;
    if (in_range)
    {
      dst[i] = items[i];
    }
  }
}
137
+
138
+ //! @rst
139
+ //! Store a blocked arrangement of items across a
140
+ //! thread block into a linear segment of items.
141
+ //!
142
+ //! @blocked
143
+ //!
144
+ //! The output offset (``block_ptr + block_offset``) must be quad-item aligned,
145
+ //! which is the default starting offset returned by ``cudaMalloc()``
146
+ //!
147
+ //! The following conditions will prevent vectorization and storing will
148
+ //! fall back to cub::BLOCK_STORE_DIRECT:
149
+ //!
150
+ //! - ``ITEMS_PER_THREAD`` is odd
151
+ //! - The data type ``T`` is not a built-in primitive or CUDA vector type
152
+ //! (e.g., ``short``, ``int2``, ``double``, ``float2``, etc.)
153
+ //!
154
+ //! @endrst
155
+ //!
156
+ //! @tparam T
157
+ //! **[inferred]** The data type to store.
158
+ //!
159
+ //! @tparam ITEMS_PER_THREAD
160
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
161
+ //!
162
+ //! @param[in] linear_tid
163
+ //! A suitable 1D thread-identifier for the calling thread
164
+ //! (e.g., ``(threadIdx.y * blockDim.x) + threadIdx.x`` for 2D thread blocks)
165
+ //!
166
+ //! @param[in] block_ptr
167
+ //! Output pointer for storing to
168
+ //!
169
+ //! @param[in] items
170
+ //! Data to store
171
template <typename T, int ITEMS_PER_THREAD>
_CCCL_DEVICE _CCCL_FORCEINLINE void
StoreDirectBlockedVectorized(int linear_tid, T* block_ptr, T (&items)[ITEMS_PER_THREAD])
{
  // Maximum CUDA vector size is 4 elements
  constexpr int max_vec_size = _CUDA_VSTD::min(4, ITEMS_PER_THREAD);

  // The vector width is only usable when it is a power of two that evenly divides
  // the items per thread; otherwise degrade to single-element "vectors"
  constexpr bool is_power_of_two  = ((max_vec_size - 1) & max_vec_size) == 0;
  constexpr bool divides_evenly   = (ITEMS_PER_THREAD % max_vec_size) == 0;
  constexpr int vec_size          = (is_power_of_two && divides_evenly) ? max_vec_size : 1;
  constexpr int vectors_per_thread = ITEMS_PER_THREAD / vec_size;

  // Vector type
  using Vector = typename CubVector<T, vec_size>::Type;

  // Vectorized stores require the destination to satisfy the vector type's alignment
  const bool is_aligned = (reinterpret_cast<uintptr_t>(block_ptr) % alignof(Vector)) == 0;
  if (!is_aligned)
  {
    // Misaligned destination: store item-by-item using the original type
    StoreDirectBlocked(linear_tid, block_ptr, items);
    return;
  }

  // Stage the items in a local array of vectors, viewed through a T* alias
  // (the original author expects this "raw" array to be optimized away, avoiding
  // conservative PTXAS lmem spilling)
  Vector staged_vectors[vectors_per_thread];
  T* staged_items = reinterpret_cast<T*>(staged_vectors);

  _CCCL_PRAGMA_UNROLL_FULL()
  for (int i = 0; i < ITEMS_PER_THREAD; ++i)
  {
    staged_items[i] = items[i];
  }

  // Reinterpret the destination as vectors and direct-store using the vector type
  Vector* block_ptr_vectors = reinterpret_cast<Vector*>(block_ptr);
  StoreDirectBlocked(linear_tid, block_ptr_vectors, staged_vectors);
}
216
+
217
+ //! @} end member group
218
+ //! @name Striped arrangement I/O (direct)
219
+ //! @{
220
+
221
+ //! @rst
222
+ //! Store a striped arrangement of data across the thread block into a
223
+ //! linear segment of items.
224
+ //!
225
+ //! @striped
226
+ //!
227
+ //! @endrst
228
+ //!
229
+ //! @tparam BLOCK_THREADS
230
+ //! The thread block size in threads
231
+ //!
232
+ //! @tparam T
233
+ //! **[inferred]** The data type to store.
234
+ //!
235
+ //! @tparam ITEMS_PER_THREAD
236
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
237
+ //!
238
+ //! @tparam OutputIteratorT
239
+ //! **[inferred]** The random-access iterator type for output @iterator.
240
+ //!
241
+ //! @param[in] linear_tid
242
+ //! A suitable 1D thread-identifier for the calling thread
243
+ //! (e.g., `(threadIdx.y * blockDim.x) + threadIdx.x` for 2D thread blocks)
244
+ //!
245
+ //! @param[in] block_itr
246
+ //! The thread block's base output iterator for storing to
247
+ //!
248
+ //! @param[in] items
249
+ //! Data to store
250
template <int BLOCK_THREADS, typename T, int ITEMS_PER_THREAD, typename OutputIteratorT>
_CCCL_DEVICE _CCCL_FORCEINLINE void
StoreDirectStriped(int linear_tid, OutputIteratorT block_itr, T (&items)[ITEMS_PER_THREAD])
{
  // The calling thread's first output position is its own linear thread id
  OutputIteratorT dst = block_itr + linear_tid;

  // Successive items of one thread land BLOCK_THREADS positions apart
  _CCCL_PRAGMA_UNROLL_FULL()
  for (int i = 0; i < ITEMS_PER_THREAD; ++i)
  {
    const int stripe_offset = i * BLOCK_THREADS;
    dst[stripe_offset]      = items[i];
  }
}
263
+
264
+ //! @rst
265
+ //! Store a striped arrangement of data across the thread block into
266
+ //! a linear segment of items, guarded by range
267
+ //!
268
+ //! @striped
269
+ //!
270
+ //! @endrst
271
+ //!
272
+ //! @tparam BLOCK_THREADS
273
+ //! The thread block size in threads
274
+ //!
275
+ //! @tparam T
276
+ //! **[inferred]** The data type to store.
277
+ //!
278
+ //! @tparam ITEMS_PER_THREAD
279
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
280
+ //!
281
+ //! @tparam OutputIteratorT
282
+ //! **[inferred]** The random-access iterator type for output @iterator.
283
+ //!
284
+ //! @param[in] linear_tid
285
+ //! A suitable 1D thread-identifier for the calling thread
286
+ //! (e.g., `(threadIdx.y * blockDim.x) + threadIdx.x` for 2D thread blocks)
287
+ //!
288
+ //! @param[in] block_itr
289
+ //! The thread block's base output iterator for storing to
290
+ //!
291
+ //! @param[in] items
292
+ //! Data to store
293
+ //!
294
+ //! @param[in] valid_items
295
+ //! Number of valid items to write
296
template <int BLOCK_THREADS, typename T, int ITEMS_PER_THREAD, typename OutputIteratorT>
_CCCL_DEVICE _CCCL_FORCEINLINE void
StoreDirectStriped(int linear_tid, OutputIteratorT block_itr, T (&items)[ITEMS_PER_THREAD], int valid_items)
{
  // The calling thread's first output position is its own linear thread id
  OutputIteratorT dst = block_itr + linear_tid;

  // Successive items of one thread land BLOCK_THREADS positions apart; an item is
  // written only when its block-wide index is below valid_items
  _CCCL_PRAGMA_UNROLL_FULL()
  for (int i = 0; i < ITEMS_PER_THREAD; ++i)
  {
    const int stripe_offset = i * BLOCK_THREADS;
    const bool in_range     = (stripe_offset + linear_tid) < valid_items;
    if (in_range)
    {
      dst[stripe_offset] = items[i];
    }
  }
}
312
+
313
+ //! @} end member group
314
+ //! @name Warp-striped arrangement I/O (direct)
315
+ //! @{
316
+
317
+ //! @rst
318
+ //! Store a warp-striped arrangement of data across the
319
+ //! thread block into a linear segment of items.
320
+ //!
321
+ //! @warpstriped
322
+ //!
323
+ //! Usage Considerations
324
+ //! ++++++++++++++++++++
325
+ //!
326
+ //! The number of threads in the thread block must be a multiple of the architecture's warp size.
327
+ //!
328
+ //! @endrst
329
+ //!
330
+ //! @tparam T
331
+ //! **[inferred]** The data type to store.
332
+ //!
333
+ //! @tparam ITEMS_PER_THREAD
334
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
335
+ //!
336
+ //! @tparam OutputIteratorT
337
+ //! **[inferred]** The random-access iterator type for output @iterator.
338
+ //!
339
+ //! @param[in] linear_tid
340
+ //! A suitable 1D thread-identifier for the calling thread
341
+ //! (e.g., `(threadIdx.y * blockDim.x) + threadIdx.x` for 2D thread blocks)
342
+ //!
343
+ //! @param[in] block_itr
344
+ //! The thread block's base output iterator for storing to
345
+ //!
346
+ //! @param[in] items
347
+ //! Data to store
348
template <typename T, int ITEMS_PER_THREAD, typename OutputIteratorT>
_CCCL_DEVICE _CCCL_FORCEINLINE void
StoreDirectWarpStriped(int linear_tid, OutputIteratorT block_itr, T (&items)[ITEMS_PER_THREAD])
{
  // Split the linear thread id into a lane within the warp and a warp index
  const int lane    = linear_tid & (detail::warp_threads - 1);
  const int warp_id = linear_tid >> detail::log2_warp_threads;

  // Each warp owns a contiguous run of warp_threads * ITEMS_PER_THREAD outputs
  const int warp_base = warp_id * detail::warp_threads * ITEMS_PER_THREAD;

  OutputIteratorT dst = block_itr + warp_base + lane;

  // Successive items of one thread land warp_threads positions apart
  _CCCL_PRAGMA_UNROLL_FULL()
  for (int i = 0; i < ITEMS_PER_THREAD; ++i)
  {
    dst[(i * detail::warp_threads)] = items[i];
  }
}
365
+
366
+ //! @rst
367
+ //! Store a warp-striped arrangement of data across the thread block into a
368
+ //! linear segment of items, guarded by range
369
+ //!
370
+ //! @warpstriped
371
+ //!
372
+ //! Usage Considerations
373
+ //! ++++++++++++++++++++
374
+ //!
375
+ //! The number of threads in the thread block must be a multiple of the architecture's warp size.
376
+ //!
377
+ //! @endrst
378
+ //!
379
+ //! @tparam T
380
+ //! **[inferred]** The data type to store.
381
+ //!
382
+ //! @tparam ITEMS_PER_THREAD
383
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
384
+ //!
385
+ //! @tparam OutputIteratorT
386
+ //! **[inferred]** The random-access iterator type for output @iterator.
387
+ //!
388
+ //! @param[in] linear_tid
389
+ //! A suitable 1D thread-identifier for the calling thread
390
+ //! (e.g., `(threadIdx.y * blockDim.x) + threadIdx.x` for 2D thread blocks)
391
+ //!
392
+ //! @param[in] block_itr
393
+ //! The thread block's base output iterator for storing to
394
+ //!
395
+ //! @param[in] items
396
+ //! Data to store
397
+ //!
398
+ //! @param[in] valid_items
399
+ //! Number of valid items to write
400
template <typename T, int ITEMS_PER_THREAD, typename OutputIteratorT>
_CCCL_DEVICE _CCCL_FORCEINLINE void
StoreDirectWarpStriped(int linear_tid, OutputIteratorT block_itr, T (&items)[ITEMS_PER_THREAD], int valid_items)
{
  // Split the linear thread id into a lane within the warp and a warp index
  const int lane    = linear_tid & (detail::warp_threads - 1);
  const int warp_id = linear_tid >> detail::log2_warp_threads;

  // Each warp owns a contiguous run of warp_threads * ITEMS_PER_THREAD outputs
  const int warp_base = warp_id * detail::warp_threads * ITEMS_PER_THREAD;

  OutputIteratorT dst = block_itr + warp_base + lane;

  // Successive items of one thread land warp_threads positions apart; an item is
  // written only when its block-wide index is below valid_items
  _CCCL_PRAGMA_UNROLL_FULL()
  for (int i = 0; i < ITEMS_PER_THREAD; ++i)
  {
    const bool in_range = (warp_base + lane + (i * detail::warp_threads)) < valid_items;
    if (in_range)
    {
      dst[(i * detail::warp_threads)] = items[i];
    }
  }
}
420
+
421
+ //! @} end member group
422
+
423
+ //-----------------------------------------------------------------------------
424
+ // Generic BlockStore abstraction
425
+ //-----------------------------------------------------------------------------
426
+
427
+ //! cub::BlockStoreAlgorithm enumerates alternative algorithms for cub::BlockStore to write a
428
+ //! blocked arrangement of items across a CUDA thread block to a linear segment of memory.
429
+ enum BlockStoreAlgorithm
430
+ {
431
+ //! @rst
432
+ //! Overview
433
+ //! ++++++++++++++++++++++++++
434
+ //!
435
+ //! A :ref:`blocked arrangement <flexible-data-arrangement>` of data is written directly to memory.
436
+ //!
437
+ //! Performance Considerations
438
+ //! ++++++++++++++++++++++++++
439
+ //!
440
+ //! - The utilization of memory transactions (coalescing) decreases as the
441
+ //! access stride between threads increases (i.e., the number of items per thread).
442
+ //!
443
+ //! @endrst
444
+ BLOCK_STORE_DIRECT,
445
+
446
+ //! @rst
447
+ //! Overview
448
+ //! ++++++++++++++++++++++++++
449
+ //!
450
+ //! A :ref:`striped arrangement <flexible-data-arrangement>` of data is written directly to memory.
451
+ //!
452
+ //! Performance Considerations
453
+ //! ++++++++++++++++++++++++++
454
+ //!
455
+ //! The utilization of memory transactions (coalescing) remains high regardless
456
+ //! of items written per thread.
457
+ //!
458
+ //! @endrst
459
+ BLOCK_STORE_STRIPED,
460
+
461
+ //! @rst
462
+ //! Overview
463
+ //! ++++++++++++++++++++++++++
464
+ //!
465
+ //! A :ref:`blocked arrangement <flexible-data-arrangement>` of data is written directly
466
+ //! to memory using CUDA's built-in vectorized stores as a coalescing optimization.
467
+ //! For example, ``st.global.v4.s32`` instructions will be generated
468
+ //! when ``T = int`` and ``ITEMS_PER_THREAD % 4 == 0``.
469
+ //!
470
+ //! Performance Considerations
471
+ //! ++++++++++++++++++++++++++
472
+ //!
473
+ //! - The utilization of memory transactions (coalescing) remains high until the
474
+ //! access stride between threads (i.e., the number of items per thread) exceeds the
475
+ //! maximum vector store width (typically 4 items or 64B, whichever is lower).
476
+ //! - The following conditions will prevent vectorization and writing will fall back to cub::BLOCK_STORE_DIRECT:
477
+ //!
478
+ //! - ``ITEMS_PER_THREAD`` is odd
479
+ //! - The ``OutputIteratorT`` is not a simple pointer type
480
+ //! - The block output offset is not quadword-aligned
481
+ //! - The data type ``T`` is not a built-in primitive or CUDA vector type
482
+ //! (e.g., ``short``, ``int2``, ``double``, ``float2``, etc.)
483
+ //!
484
+ //! @endrst
485
+ BLOCK_STORE_VECTORIZE,
486
+
487
+ //! @rst
488
+ //! Overview
489
+ //! ++++++++++++++++++++++++++
490
+ //!
491
+ //! A :ref:`blocked arrangement <flexible-data-arrangement>` is locally
492
+ //! transposed and then efficiently written to memory as a :ref:`striped arrangement <flexible-data-arrangement>`.
493
+ //!
494
+ //! Performance Considerations
495
+ //! ++++++++++++++++++++++++++
496
+ //!
497
+ //! - The utilization of memory transactions (coalescing) remains high regardless
498
+ //! of items written per thread.
499
+ //! - The local reordering incurs slightly longer latencies and lower throughput than the
500
+ //! direct cub::BLOCK_STORE_DIRECT and cub::BLOCK_STORE_VECTORIZE alternatives.
501
+ //!
502
+ //! @endrst
503
+ BLOCK_STORE_TRANSPOSE,
504
+
505
+ //! @rst
506
+ //! Overview
507
+ //! ++++++++++++++++++++++++++
508
+ //!
509
+ //! A :ref:`blocked arrangement <flexible-data-arrangement>` is locally
510
+ //! transposed and then efficiently written to memory as a
511
+ //! :ref:`warp-striped arrangement <flexible-data-arrangement>`.
512
+ //!
513
+ //! Usage Considerations
514
+ //! ++++++++++++++++++++++++++
515
+ //!
516
+ //! - BLOCK_THREADS must be a multiple of WARP_THREADS
517
+ //!
518
+ //! Performance Considerations
519
+ //! ++++++++++++++++++++++++++
520
+ //!
521
+ //! - The utilization of memory transactions (coalescing) remains high regardless
522
+ //! of items written per thread.
523
+ //! - The local reordering incurs slightly longer latencies and lower throughput than the
524
+ //! direct cub::BLOCK_STORE_DIRECT and cub::BLOCK_STORE_VECTORIZE alternatives.
525
+ //!
526
+ //! @endrst
527
+ BLOCK_STORE_WARP_TRANSPOSE,
528
+
529
+ //! @rst
530
+ //! Overview
531
+ //! ++++++++++++++++++++++++++
532
+ //!
533
+ //! A :ref:`blocked arrangement <flexible-data-arrangement>` is locally
534
+ //! transposed and then efficiently written to memory as a
535
+ //! :ref:`warp-striped arrangement <flexible-data-arrangement>`.
536
+ //! To reduce the shared memory requirement, only one warp's worth of shared
537
+ //! memory is provisioned and is subsequently time-sliced among warps.
538
+ //!
539
+ //! Usage Considerations
540
+ //! ++++++++++++++++++++++++++
541
+ //!
542
+ //! - BLOCK_THREADS must be a multiple of WARP_THREADS
543
+ //!
544
+ //! Performance Considerations
545
+ //! ++++++++++++++++++++++++++
546
+ //!
547
+ //! - The utilization of memory transactions (coalescing) remains high regardless
548
+ //! of items written per thread.
549
+ //! - Provisions less shared memory temporary storage, but incurs larger
550
+ //! latencies than the BLOCK_STORE_WARP_TRANSPOSE alternative.
551
+ //!
552
+ //! @endrst
553
+ BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED,
554
+ };
555
+
556
+ //! @rst
557
+ //! The BlockStore class provides :ref:`collective <collective-primitives>` data movement
558
+ //! methods for writing a :ref:`blocked arrangement <flexible-data-arrangement>` of items
559
+ //! partitioned across a CUDA thread block to a linear segment of memory.
560
+ //!
561
+ //! Overview
562
+ //! +++++++++++++++++++++++++++++++++++++++++++++
563
+ //!
564
+ //! - The BlockStore class provides a single data movement abstraction that can be specialized
565
+ //! to implement different cub::BlockStoreAlgorithm strategies. This facilitates different
566
+ //! performance policies for different architectures, data types, granularity sizes, etc.
567
+ //! - BlockStore can be optionally specialized by different data movement strategies:
568
+ //!
569
+ //! #. :cpp:enumerator:`cub::BLOCK_STORE_DIRECT`:
570
+ //! A :ref:`blocked arrangement <flexible-data-arrangement>` of data is written directly to memory.
571
+ //! #. :cpp:enumerator:`cub::BLOCK_STORE_STRIPED`:
572
+ //! A :ref:`striped arrangement <flexible-data-arrangement>` of data is written directly to memory.
573
+ //! #. :cpp:enumerator:`cub::BLOCK_STORE_VECTORIZE`:
574
+ //! A :ref:`blocked arrangement <flexible-data-arrangement>` of data is written directly to memory
575
+ //! using CUDA's built-in vectorized stores as a coalescing optimization.
576
+ //! #. :cpp:enumerator:`cub::BLOCK_STORE_TRANSPOSE`:
577
+ //! A :ref:`blocked arrangement <flexible-data-arrangement>` is locally transposed into
578
+ //! a :ref:`striped arrangement <flexible-data-arrangement>` which is then written to memory.
579
+ //! #. :cpp:enumerator:`cub::BLOCK_STORE_WARP_TRANSPOSE`:
580
+ //! A :ref:`blocked arrangement <flexible-data-arrangement>` is locally transposed into
581
+ //! a :ref:`warp-striped arrangement <flexible-data-arrangement>` which is then written to memory.
582
+ //! #. :cpp:enumerator:`cub::BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED`:
583
+ //! A :ref:`blocked arrangement <flexible-data-arrangement>` is locally transposed into
584
+ //! a :ref:`warp-striped arrangement <flexible-data-arrangement>` which is then written to memory.
585
+ //! To reduce the shared memory requirement, only one warp's worth of shared memory is provisioned and is
586
+ //! subsequently time-sliced among warps.
587
+ //!
588
+ //! - @rowmajor
589
+ //!
590
+ //! A Simple Example
591
+ //! +++++++++++++++++++++++++++++++++++++++++++++
592
+ //!
593
+ //! @blockcollective{BlockStore}
594
+ //!
595
+ //! The code snippet below illustrates the storing of a "blocked" arrangement
596
+ //! of 512 integers across 128 threads (where each thread owns 4 consecutive items)
597
+ //! into a linear segment of memory. The store is specialized for ``BLOCK_STORE_WARP_TRANSPOSE``,
598
+ //! meaning items are locally reordered among threads so that memory references will be
599
+ //! efficiently coalesced using a warp-striped access pattern.
600
+ //!
601
+ //! .. code-block:: c++
602
+ //!
603
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_store.cuh>
604
+ //!
605
+ //! __global__ void ExampleKernel(int *d_data, ...)
606
+ //! {
607
+ //! // Specialize BlockStore for a 1D block of 128 threads owning 4 integer items each
608
+ //! using BlockStore = cub::BlockStore<int, 128, 4, BLOCK_STORE_WARP_TRANSPOSE>;
609
+ //!
610
+ //! // Allocate shared memory for BlockStore
611
+ //! __shared__ typename BlockStore::TempStorage temp_storage;
612
+ //!
613
+ //! // Obtain a segment of consecutive items that are blocked across threads
614
+ //! int thread_data[4];
615
+ //! ...
616
+ //!
617
+ //! // Store items to linear memory
618
+ //! BlockStore(temp_storage).Store(d_data, thread_data);
619
+ //!
620
+ //! Suppose the set of ``thread_data`` across the block of threads is
621
+ //! ``{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }``.
622
+ //! The output ``d_data`` will be ``0, 1, 2, 3, 4, 5, ...``.
623
+ //!
624
+ //! Re-using dynamically allocated shared memory
625
+ //! +++++++++++++++++++++++++++++++++++++++++++++
626
+ //!
627
+ //! The ``block/example_block_reduce_dyn_smem.cu`` example illustrates usage of
628
+ //! dynamic shared memory with BlockReduce and how to re-purpose the same memory region.
629
+ //! This example can be easily adapted to the storage required by BlockStore.
630
+ //!
631
+ //! @endrst
632
+ //!
633
+ //! @tparam T
634
+ //! The type of data to be written.
635
+ //!
636
+ //! @tparam BLOCK_DIM_X
637
+ //! The thread block length in threads along the X dimension
638
+ //!
639
+ //! @tparam ITEMS_PER_THREAD
640
+ //! The number of consecutive items partitioned onto each thread.
641
+ //!
642
+ //! @tparam ALGORITHM
643
+ //! **[optional]** cub::BlockStoreAlgorithm tuning policy enumeration (default: cub::BLOCK_STORE_DIRECT)
644
+ //!
645
+ //! @tparam BLOCK_DIM_Y
646
+ //! **[optional]** The thread block length in threads along the Y dimension (default: 1)
647
+ //!
648
+ //! @tparam BLOCK_DIM_Z
649
+ //! **[optional]** The thread block length in threads along the Z dimension (default: 1)
650
+ //!
651
+ template <typename T,
652
+ int BLOCK_DIM_X,
653
+ int ITEMS_PER_THREAD,
654
+ BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT,
655
+ int BLOCK_DIM_Y = 1,
656
+ int BLOCK_DIM_Z = 1>
657
+ class BlockStore
658
+ {
659
+ private:
660
+ enum
661
+ {
662
+ /// The thread block size in threads
663
+ BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
664
+ };
665
+
666
+ /// Store helper
667
+ template <BlockStoreAlgorithm _POLICY, int DUMMY>
668
+ struct StoreInternal;
669
+
670
+ template <int DUMMY>
671
+ struct StoreInternal<BLOCK_STORE_DIRECT, DUMMY>
672
+ {
673
+ /// Shared memory storage layout type
674
+ using TempStorage = NullType;
675
+
676
+ /// Linear thread-id
677
+ int linear_tid;
678
+
679
+ /// Constructor
680
+ _CCCL_DEVICE _CCCL_FORCEINLINE StoreInternal(TempStorage& /*temp_storage*/, int linear_tid)
681
+ : linear_tid(linear_tid)
682
+ {}
683
+
684
+ /**
685
+ * @brief Store items into a linear segment of memory
686
+ *
687
+ * @param[in] block_itr
688
+ * The thread block's base output iterator for storing to
689
+ *
690
+ * @param[in] items
691
+ * Data to store
692
+ */
693
+ template <typename OutputIteratorT>
694
+ _CCCL_DEVICE _CCCL_FORCEINLINE void Store(OutputIteratorT block_itr, T (&items)[ITEMS_PER_THREAD])
695
+ {
696
+ StoreDirectBlocked(linear_tid, block_itr, items);
697
+ }
698
+
699
+ /**
700
+ * @brief Store items into a linear segment of memory, guarded by range
701
+ *
702
+ * @param[in] block_itr
703
+ * The thread block's base output iterator for storing to
704
+ *
705
+ * @param[in] items
706
+ * Data to store
707
+ *
708
+ * @param[in] valid_items
709
+ * Number of valid items to write
710
+ */
711
+ template <typename OutputIteratorT>
712
+ _CCCL_DEVICE _CCCL_FORCEINLINE void Store(OutputIteratorT block_itr, T (&items)[ITEMS_PER_THREAD], int valid_items)
713
+ {
714
+ StoreDirectBlocked(linear_tid, block_itr, items, valid_items);
715
+ }
716
+ };
717
+
718
+ /**
719
+ * BLOCK_STORE_STRIPED specialization of store helper
720
+ */
721
+ template <int DUMMY>
722
+ struct StoreInternal<BLOCK_STORE_STRIPED, DUMMY>
723
+ {
724
+ /// Shared memory storage layout type
725
+ using TempStorage = NullType;
726
+
727
+ /// Linear thread-id
728
+ int linear_tid;
729
+
730
+ /// Constructor
731
+ _CCCL_DEVICE _CCCL_FORCEINLINE StoreInternal(TempStorage& /*temp_storage*/, int linear_tid)
732
+ : linear_tid(linear_tid)
733
+ {}
734
+
735
+ /**
736
+ * @brief Store items into a linear segment of memory
737
+ *
738
+ * @param[in] block_itr
739
+ * The thread block's base output iterator for storing to
740
+ *
741
+ * @param[in] items
742
+ * Data to store
743
+ */
744
+ template <typename OutputIteratorT>
745
+ _CCCL_DEVICE _CCCL_FORCEINLINE void Store(OutputIteratorT block_itr, T (&items)[ITEMS_PER_THREAD])
746
+ {
747
+ StoreDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items);
748
+ }
749
+
750
+ /**
751
+ * @brief Store items into a linear segment of memory, guarded by range
752
+ *
753
+ * @param[in] block_itr
754
+ * The thread block's base output iterator for storing to
755
+ *
756
+ * @param[in] items
757
+ * Data to store
758
+ *
759
+ * @param[in] valid_items
760
+ * Number of valid items to write
761
+ */
762
+ template <typename OutputIteratorT>
763
+ _CCCL_DEVICE _CCCL_FORCEINLINE void Store(OutputIteratorT block_itr, T (&items)[ITEMS_PER_THREAD], int valid_items)
764
+ {
765
+ StoreDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items, valid_items);
766
+ }
767
+ };
768
+
769
+ /**
770
+ * BLOCK_STORE_VECTORIZE specialization of store helper
771
+ */
772
+ template <int DUMMY>
773
+ struct StoreInternal<BLOCK_STORE_VECTORIZE, DUMMY>
774
+ {
775
+ /// Shared memory storage layout type
776
+ using TempStorage = NullType;
777
+
778
+ /// Linear thread-id
779
+ int linear_tid;
780
+
781
+ /// Constructor
782
+ _CCCL_DEVICE _CCCL_FORCEINLINE StoreInternal(TempStorage& /*temp_storage*/, int linear_tid)
783
+ : linear_tid(linear_tid)
784
+ {}
785
+
786
+ /**
787
+ * @brief Store items into a linear segment of memory,
788
+ * specialized for native pointer types (attempts vectorization)
789
+ *
790
+ * @param[in] block_ptr
791
+ * The thread block's base output iterator for storing to
792
+ *
793
+ * @param[in] items
794
+ * Data to store
795
+ */
796
+ _CCCL_DEVICE _CCCL_FORCEINLINE void Store(T* block_ptr, T (&items)[ITEMS_PER_THREAD])
797
+ {
798
+ StoreDirectBlockedVectorized(linear_tid, block_ptr, items);
799
+ }
800
+
801
+ /**
801
+ * @brief Store items into a linear segment of memory,
802
+ * specialized for opaque output iterators (skips vectorization)
803
+ *
804
+ * @param[in] block_itr
805
+ * The thread block's base output iterator for storing to
806
+ *
807
+ * @param[in] items
808
+ * Data to store
809
+ */
810
+ template <typename OutputIteratorT>
811
+ _CCCL_DEVICE _CCCL_FORCEINLINE void Store(OutputIteratorT block_itr, T (&items)[ITEMS_PER_THREAD])
812
+ {
813
+ StoreDirectBlocked(linear_tid, block_itr, items);
814
+ }
816
+
817
+ /**
818
+ * @brief Store items into a linear segment of memory, guarded by range
819
+ *
820
+ * @param[in] block_itr
821
+ * The thread block's base output iterator for storing to
822
+ *
823
+ * @param[in] items
824
+ * Data to store
825
+ *
826
+ * @param[in] valid_items
827
+ * Number of valid items to write
828
+ */
829
+ template <typename OutputIteratorT>
830
+ _CCCL_DEVICE _CCCL_FORCEINLINE void Store(OutputIteratorT block_itr, T (&items)[ITEMS_PER_THREAD], int valid_items)
831
+ {
832
+ StoreDirectBlocked(linear_tid, block_itr, items, valid_items);
833
+ }
834
+ };
835
+
836
+ /**
837
+ * BLOCK_STORE_TRANSPOSE specialization of store helper
838
+ */
839
+ template <int DUMMY>
840
+ struct StoreInternal<BLOCK_STORE_TRANSPOSE, DUMMY>
841
+ {
842
+ // BlockExchange utility type for the items being stored
843
+ using BlockExchange = BlockExchange<T, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z>;
844
+
845
+ /// Shared memory storage layout type
846
+ struct _TempStorage : BlockExchange::TempStorage
847
+ {
848
+ /// Temporary storage for partially-full block guard
849
+ volatile int valid_items;
850
+ };
851
+
852
+ /// Alias wrapper allowing storage to be unioned
853
+ struct TempStorage : Uninitialized<_TempStorage>
854
+ {};
855
+
856
+ /// Thread reference to shared storage
857
+ _TempStorage& temp_storage;
858
+
859
+ /// Linear thread-id
860
+ int linear_tid;
861
+
862
+ /// Constructor
863
+ _CCCL_DEVICE _CCCL_FORCEINLINE StoreInternal(TempStorage& temp_storage, int linear_tid)
864
+ : temp_storage(temp_storage.Alias())
865
+ , linear_tid(linear_tid)
866
+ {}
867
+
868
+ /**
869
+ * @brief Store items into a linear segment of memory
870
+ *
871
+ * @param[in] block_itr
872
+ * The thread block's base output iterator for storing to
873
+ *
874
+ * @param[in] items
875
+ * Data to store
876
+ */
877
+ template <typename OutputIteratorT>
878
+ _CCCL_DEVICE _CCCL_FORCEINLINE void Store(OutputIteratorT block_itr, T (&items)[ITEMS_PER_THREAD])
879
+ {
880
+ BlockExchange(temp_storage).BlockedToStriped(items);
881
+ StoreDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items);
882
+ }
883
+
884
+ /**
885
+ * @brief Store items into a linear segment of memory, guarded by range
886
+ *
887
+ * @param[in] block_itr
888
+ * The thread block's base output iterator for storing to
889
+ *
890
+ * @param[in] items
891
+ * Data to store
892
+ *
893
+ * @param[in] valid_items
894
+ * Number of valid items to write
895
+ */
896
+ template <typename OutputIteratorT>
897
+ _CCCL_DEVICE _CCCL_FORCEINLINE void Store(OutputIteratorT block_itr, T (&items)[ITEMS_PER_THREAD], int valid_items)
898
+ {
899
+ BlockExchange(temp_storage).BlockedToStriped(items);
900
+ if (linear_tid == 0)
901
+ {
902
+ // Move through volatile smem as a workaround to prevent RF spilling on
903
+ // subsequent loads
904
+ temp_storage.valid_items = valid_items;
905
+ }
906
+ __syncthreads();
907
+ StoreDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items, temp_storage.valid_items);
908
+ }
909
+ };
910
+
911
+ /**
912
+ * BLOCK_STORE_WARP_TRANSPOSE specialization of store helper
913
+ */
914
+ template <int DUMMY>
915
+ struct StoreInternal<BLOCK_STORE_WARP_TRANSPOSE, DUMMY>
916
+ {
917
+ enum
918
+ {
919
+ WARP_THREADS = detail::warp_threads
920
+ };
921
+
922
+ // Assert BLOCK_THREADS must be a multiple of WARP_THREADS
923
+ static_assert(int(BLOCK_THREADS) % int(WARP_THREADS) == 0, "BLOCK_THREADS must be a multiple of WARP_THREADS");
924
+
925
+ // BlockExchange utility type for the items being stored
926
+ using BlockExchange = BlockExchange<T, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z>;
927
+
928
+ /// Shared memory storage layout type
929
+ struct _TempStorage : BlockExchange::TempStorage
930
+ {
931
+ /// Temporary storage for partially-full block guard
932
+ volatile int valid_items;
933
+ };
934
+
935
+ /// Alias wrapper allowing storage to be unioned
936
+ struct TempStorage : Uninitialized<_TempStorage>
937
+ {};
938
+
939
+ /// Thread reference to shared storage
940
+ _TempStorage& temp_storage;
941
+
942
+ /// Linear thread-id
943
+ int linear_tid;
944
+
945
+ /// Constructor
946
+ _CCCL_DEVICE _CCCL_FORCEINLINE StoreInternal(TempStorage& temp_storage, int linear_tid)
947
+ : temp_storage(temp_storage.Alias())
948
+ , linear_tid(linear_tid)
949
+ {}
950
+
951
+ /**
952
+ * @brief Store items into a linear segment of memory
953
+ *
954
+ * @param[in] block_itr
955
+ * The thread block's base output iterator for storing to
956
+ *
957
+ * @param[in] items
958
+ * Data to store
959
+ */
960
+ template <typename OutputIteratorT>
961
+ _CCCL_DEVICE _CCCL_FORCEINLINE void Store(OutputIteratorT block_itr, T (&items)[ITEMS_PER_THREAD])
962
+ {
963
+ BlockExchange(temp_storage).BlockedToWarpStriped(items);
964
+ StoreDirectWarpStriped(linear_tid, block_itr, items);
965
+ }
966
+
967
+ /**
968
+ * @brief Store items into a linear segment of memory, guarded by range
969
+ *
970
+ * @param[in] block_itr
971
+ * The thread block's base output iterator for storing to
972
+ *
973
+ * @param[in] items
974
+ * Data to store
975
+ *
976
+ * @param[in] valid_items
977
+ * Number of valid items to write
978
+ */
979
+ template <typename OutputIteratorT>
980
+ _CCCL_DEVICE _CCCL_FORCEINLINE void Store(OutputIteratorT block_itr, T (&items)[ITEMS_PER_THREAD], int valid_items)
981
+ {
982
+ BlockExchange(temp_storage).BlockedToWarpStriped(items);
983
+ if (linear_tid == 0)
984
+ {
985
+ // Move through volatile smem as a workaround to prevent RF spilling on
986
+ // subsequent loads
987
+ temp_storage.valid_items = valid_items;
988
+ }
989
+ __syncthreads();
990
+ StoreDirectWarpStriped(linear_tid, block_itr, items, temp_storage.valid_items);
991
+ }
992
+ };
993
+
994
+ /**
995
+ * BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED specialization of store helper
996
+ */
997
+ template <int DUMMY>
998
+ struct StoreInternal<BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED, DUMMY>
999
+ {
1000
+ enum
1001
+ {
1002
+ WARP_THREADS = detail::warp_threads
1003
+ };
1004
+
1005
+ // Assert BLOCK_THREADS must be a multiple of WARP_THREADS
1006
+ static_assert(int(BLOCK_THREADS) % int(WARP_THREADS) == 0, "BLOCK_THREADS must be a multiple of WARP_THREADS");
1007
+
1008
+ // BlockExchange utility type for the items being stored
1009
+ using BlockExchange = BlockExchange<T, BLOCK_DIM_X, ITEMS_PER_THREAD, true, BLOCK_DIM_Y, BLOCK_DIM_Z>;
1010
+
1011
+ /// Shared memory storage layout type
1012
+ struct _TempStorage : BlockExchange::TempStorage
1013
+ {
1014
+ /// Temporary storage for partially-full block guard
1015
+ volatile int valid_items;
1016
+ };
1017
+
1018
+ /// Alias wrapper allowing storage to be unioned
1019
+ struct TempStorage : Uninitialized<_TempStorage>
1020
+ {};
1021
+
1022
+ /// Thread reference to shared storage
1023
+ _TempStorage& temp_storage;
1024
+
1025
+ /// Linear thread-id
1026
+ int linear_tid;
1027
+
1028
+ /// Constructor
1029
+ _CCCL_DEVICE _CCCL_FORCEINLINE StoreInternal(TempStorage& temp_storage, int linear_tid)
1030
+ : temp_storage(temp_storage.Alias())
1031
+ , linear_tid(linear_tid)
1032
+ {}
1033
+
1034
+ /**
1035
+ * @brief Store items into a linear segment of memory
1036
+ *
1037
+ * @param[in] block_itr
1038
+ * The thread block's base output iterator for storing to
1039
+ *
1040
+ * @param[in] items
1041
+ * Data to store
1042
+ */
1043
+ template <typename OutputIteratorT>
1044
+ _CCCL_DEVICE _CCCL_FORCEINLINE void Store(OutputIteratorT block_itr, T (&items)[ITEMS_PER_THREAD])
1045
+ {
1046
+ BlockExchange(temp_storage).BlockedToWarpStriped(items);
1047
+ StoreDirectWarpStriped(linear_tid, block_itr, items);
1048
+ }
1049
+
1050
+ /**
1051
+ * @brief Store items into a linear segment of memory, guarded by range
1052
+ *
1053
+ * @param[in] block_itr
1054
+ * The thread block's base output iterator for storing to
1055
+ *
1056
+ * @param[in] items
1057
+ * Data to store
1058
+ *
1059
+ * @param[in] valid_items
1060
+ * Number of valid items to write
1061
+ */
1062
+ template <typename OutputIteratorT>
1063
+ _CCCL_DEVICE _CCCL_FORCEINLINE void Store(OutputIteratorT block_itr, T (&items)[ITEMS_PER_THREAD], int valid_items)
1064
+ {
1065
+ BlockExchange(temp_storage).BlockedToWarpStriped(items);
1066
+ if (linear_tid == 0)
1067
+ {
1068
+ // Move through volatile smem as a workaround to prevent RF spilling on
1069
+ // subsequent loads
1070
+ temp_storage.valid_items = valid_items;
1071
+ }
1072
+ __syncthreads();
1073
+ StoreDirectWarpStriped(linear_tid, block_itr, items, temp_storage.valid_items);
1074
+ }
1075
+ };
1076
+
1077
+ /// Internal store implementation to use (the StoreInternal specialization selected by ALGORITHM)
1077
+ using InternalStore = StoreInternal<ALGORITHM, 0>;
1079
+
1080
+ /// Shared memory storage layout type
1081
+ using _TempStorage = typename InternalStore::TempStorage;
1082
+
1083
+ /// Internal storage allocator: returns a reference to a function-local ``__shared__`` _TempStorage instance
1083
+ _CCCL_DEVICE _CCCL_FORCEINLINE _TempStorage& PrivateStorage()
1084
+ {
1085
+ __shared__ _TempStorage private_storage;
1086
+ return private_storage;
1087
+ }
1089
+
1090
+ /// Thread reference to shared storage
1091
+ _TempStorage& temp_storage;
1092
+
1093
+ /// Linear thread-id
1094
+ int linear_tid;
1095
+
1096
+ public:
1097
+ //! @smemstorage{BlockStore}
1098
+ struct TempStorage : Uninitialized<_TempStorage>
1099
+ {};
1100
+
1101
+ //! @name Collective constructors
1102
+ //! @{
1103
+
1104
+ /**
1105
+ * @brief Collective constructor using a private static allocation of shared memory as temporary storage.
1106
+ */
1107
+ _CCCL_DEVICE _CCCL_FORCEINLINE BlockStore()
1108
+ : temp_storage(PrivateStorage())
1109
+ , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
1110
+ {}
1111
+
1112
+ /**
1113
+ * @brief Collective constructor using the specified memory allocation as temporary storage.
1114
+ *
1115
+ * @param[in] temp_storage
1116
+ * Reference to memory allocation having layout type TempStorage
1117
+ */
1118
+ _CCCL_DEVICE _CCCL_FORCEINLINE BlockStore(TempStorage& temp_storage)
1119
+ : temp_storage(temp_storage.Alias())
1120
+ , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
1121
+ {}
1122
+
1123
+ //! @} end member group
1124
+ //! @name Data movement
1125
+ //! @{
1126
+
1127
+ //! @rst
1128
+ //! Store items into a linear segment of memory
1129
+ //!
1130
+ //! - @blocked
1131
+ //! - @smemreuse
1132
+ //!
1133
+ //! Snippet
1134
+ //! +++++++
1135
+ //!
1136
+ //! The code snippet below illustrates the storing of a "blocked" arrangement
1137
+ //! of 512 integers across 128 threads (where each thread owns 4 consecutive items)
1138
+ //! into a linear segment of memory. The store is specialized for ``BLOCK_STORE_WARP_TRANSPOSE``,
1139
+ //! meaning items are locally reordered among threads so that memory references will be
1140
+ //! efficiently coalesced using a warp-striped access pattern.
1141
+ //!
1142
+ //! .. code-block:: c++
1143
+ //!
1144
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_store.cuh>
1145
+ //!
1146
+ //! __global__ void ExampleKernel(int *d_data, ...)
1147
+ //! {
1148
+ //! // Specialize BlockStore for a 1D block of 128 threads owning 4 integer items each
1149
+ //! using BlockStore = cub::BlockStore<int, 128, 4, BLOCK_STORE_WARP_TRANSPOSE>;
1150
+ //!
1151
+ //! // Allocate shared memory for BlockStore
1152
+ //! __shared__ typename BlockStore::TempStorage temp_storage;
1153
+ //!
1154
+ //! // Obtain a segment of consecutive items that are blocked across threads
1155
+ //! int thread_data[4];
1156
+ //! ...
1157
+ //!
1158
+ //! // Store items to linear memory
1159
+ //! // (thread_data was declared and populated above)
1160
+ //! BlockStore(temp_storage).Store(d_data, thread_data);
1161
+ //!
1162
+ //! Suppose the set of ``thread_data`` across the block of threads is
1163
+ //! ``{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }``.
1164
+ //! The output ``d_data`` will be ``0, 1, 2, 3, 4, 5, ...``.
1165
+ //!
1166
+ //! @endrst
1167
+ //!
1168
+ //! @param[out] block_itr
1169
+ //! The thread block's base output iterator for storing to
1170
+ //!
1171
+ //! @param[in] items
1172
+ //! Data to store
1173
+ template <typename OutputIteratorT>
1174
+ _CCCL_DEVICE _CCCL_FORCEINLINE void Store(OutputIteratorT block_itr, T (&items)[ITEMS_PER_THREAD])
1175
+ {
1176
+ InternalStore(temp_storage, linear_tid).Store(block_itr, items);
1177
+ }
1178
+
1179
+ //! @rst
1180
+ //! Store items into a linear segment of memory, guarded by range.
1181
+ //!
1182
+ //! - @blocked
1183
+ //! - @smemreuse
1184
+ //!
1185
+ //! Snippet
1186
+ //! +++++++
1187
+ //!
1188
+ //! The code snippet below illustrates the guarded storing of a "blocked" arrangement
1189
+ //! of 512 integers across 128 threads (where each thread owns 4 consecutive items)
1190
+ //! into a linear segment of memory. The store is specialized for ``BLOCK_STORE_WARP_TRANSPOSE``,
1191
+ //! meaning items are locally reordered among threads so that memory references will be
1192
+ //! efficiently coalesced using a warp-striped access pattern.
1193
+ //!
1194
+ //! .. code-block:: c++
1195
+ //!
1196
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_store.cuh>
1197
+ //!
1198
+ //! __global__ void ExampleKernel(int *d_data, int valid_items, ...)
1199
+ //! {
1200
+ //! // Specialize BlockStore for a 1D block of 128 threads owning 4 integer items each
1201
+ //! using BlockStore = cub::BlockStore<int, 128, 4, BLOCK_STORE_WARP_TRANSPOSE>;
1202
+ //!
1203
+ //! // Allocate shared memory for BlockStore
1204
+ //! __shared__ typename BlockStore::TempStorage temp_storage;
1205
+ //!
1206
+ //! // Obtain a segment of consecutive items that are blocked across threads
1207
+ //! int thread_data[4];
1208
+ //! ...
1209
+ //!
1210
+ //! // Store items to linear memory
1211
+ //! // (thread_data was declared and populated above)
1212
+ //! BlockStore(temp_storage).Store(d_data, thread_data, valid_items);
1213
+ //!
1214
+ //! Suppose the set of ``thread_data`` across the block of threads is
1215
+ //! ``{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }`` and ``valid_items`` is ``5``.
1216
+ //! The output ``d_data`` will be ``0, 1, 2, 3, 4, ?, ?, ?, ...``, with
1217
+ //! only the first two threads being unmasked to store portions of valid data.
1218
+ //!
1219
+ //! @endrst
1220
+ //!
1221
+ //! @param[out] block_itr
1222
+ //! The thread block's base output iterator for storing to
1223
+ //!
1224
+ //! @param[in] items
1225
+ //! Data to store
1226
+ //!
1227
+ //! @param[in] valid_items
1228
+ //! Number of valid items to write
1229
+ template <typename OutputIteratorT>
1230
+ _CCCL_DEVICE _CCCL_FORCEINLINE void Store(OutputIteratorT block_itr, T (&items)[ITEMS_PER_THREAD], int valid_items)
1231
+ {
1232
+ InternalStore(temp_storage, linear_tid).Store(block_itr, items, valid_items);
1233
+ }
1234
+
1235
+ //! @} end member group
1236
+ };
1237
+
1238
+ #ifndef _CCCL_DOXYGEN_INVOKED // Do not document
1239
+ template <class Policy, class It, class T = cub::detail::it_value_t<It>> // It is only consulted for the default of T
1239
+ struct BlockStoreType // Helper trait: names the cub::BlockStore instantiation configured by Policy
1240
+ {
1241
+ using type = cub::BlockStore<T, Policy::BLOCK_THREADS, Policy::ITEMS_PER_THREAD, Policy::STORE_ALGORITHM>; // BLOCK_DIM_Y and BLOCK_DIM_Z keep their defaults
1242
+ };
1244
+ #endif // _CCCL_DOXYGEN_INVOKED
1245
+
1246
+ CUB_NAMESPACE_END