cuda-cccl 0.1.3.1.0.dev1486__cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cuda-cccl might be problematic. Click here for more details.

Files changed (1819) hide show
  1. cuda/cccl/__init__.py +14 -0
  2. cuda/cccl/cooperative/__init__.py +3 -0
  3. cuda/cccl/cooperative/experimental/__init__.py +8 -0
  4. cuda/cccl/cooperative/experimental/_caching.py +48 -0
  5. cuda/cccl/cooperative/experimental/_common.py +276 -0
  6. cuda/cccl/cooperative/experimental/_nvrtc.py +91 -0
  7. cuda/cccl/cooperative/experimental/_scan_op.py +181 -0
  8. cuda/cccl/cooperative/experimental/_types.py +953 -0
  9. cuda/cccl/cooperative/experimental/_typing.py +107 -0
  10. cuda/cccl/cooperative/experimental/block/__init__.py +33 -0
  11. cuda/cccl/cooperative/experimental/block/_block_load_store.py +215 -0
  12. cuda/cccl/cooperative/experimental/block/_block_merge_sort.py +125 -0
  13. cuda/cccl/cooperative/experimental/block/_block_radix_sort.py +214 -0
  14. cuda/cccl/cooperative/experimental/block/_block_reduce.py +294 -0
  15. cuda/cccl/cooperative/experimental/block/_block_scan.py +983 -0
  16. cuda/cccl/cooperative/experimental/warp/__init__.py +9 -0
  17. cuda/cccl/cooperative/experimental/warp/_warp_merge_sort.py +98 -0
  18. cuda/cccl/cooperative/experimental/warp/_warp_reduce.py +153 -0
  19. cuda/cccl/cooperative/experimental/warp/_warp_scan.py +78 -0
  20. cuda/cccl/headers/__init__.py +7 -0
  21. cuda/cccl/headers/include/__init__.py +1 -0
  22. cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +261 -0
  23. cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +1181 -0
  24. cuda/cccl/headers/include/cub/agent/agent_for.cuh +84 -0
  25. cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +919 -0
  26. cuda/cccl/headers/include/cub/agent/agent_merge.cuh +227 -0
  27. cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +752 -0
  28. cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +766 -0
  29. cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +286 -0
  30. cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +704 -0
  31. cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +557 -0
  32. cuda/cccl/headers/include/cub/agent/agent_reduce.cuh +678 -0
  33. cuda/cccl/headers/include/cub/agent/agent_reduce_by_key.cuh +804 -0
  34. cuda/cccl/headers/include/cub/agent/agent_rle.cuh +997 -0
  35. cuda/cccl/headers/include/cub/agent/agent_scan.cuh +561 -0
  36. cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +473 -0
  37. cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +292 -0
  38. cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +1032 -0
  39. cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +342 -0
  40. cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +592 -0
  41. cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +614 -0
  42. cuda/cccl/headers/include/cub/agent/single_pass_scan_operators.cuh +1346 -0
  43. cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +965 -0
  44. cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +1217 -0
  45. cuda/cccl/headers/include/cub/block/block_exchange.cuh +1306 -0
  46. cuda/cccl/headers/include/cub/block/block_histogram.cuh +420 -0
  47. cuda/cccl/headers/include/cub/block/block_load.cuh +1259 -0
  48. cuda/cccl/headers/include/cub/block/block_merge_sort.cuh +787 -0
  49. cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +1218 -0
  50. cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +2193 -0
  51. cuda/cccl/headers/include/cub/block/block_raking_layout.cuh +150 -0
  52. cuda/cccl/headers/include/cub/block/block_reduce.cuh +629 -0
  53. cuda/cccl/headers/include/cub/block/block_run_length_decode.cuh +437 -0
  54. cuda/cccl/headers/include/cub/block/block_scan.cuh +2600 -0
  55. cuda/cccl/headers/include/cub/block/block_shuffle.cuh +346 -0
  56. cuda/cccl/headers/include/cub/block/block_store.cuh +1246 -0
  57. cuda/cccl/headers/include/cub/block/radix_rank_sort_operations.cuh +620 -0
  58. cuda/cccl/headers/include/cub/block/specializations/block_histogram_atomic.cuh +86 -0
  59. cuda/cccl/headers/include/cub/block/specializations/block_histogram_sort.cuh +240 -0
  60. cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking.cuh +252 -0
  61. cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking_commutative_only.cuh +238 -0
  62. cuda/cccl/headers/include/cub/block/specializations/block_reduce_warp_reductions.cuh +259 -0
  63. cuda/cccl/headers/include/cub/block/specializations/block_scan_raking.cuh +790 -0
  64. cuda/cccl/headers/include/cub/block/specializations/block_scan_warp_scans.cuh +538 -0
  65. cuda/cccl/headers/include/cub/config.cuh +60 -0
  66. cuda/cccl/headers/include/cub/cub.cuh +112 -0
  67. cuda/cccl/headers/include/cub/detail/array_utils.cuh +77 -0
  68. cuda/cccl/headers/include/cub/detail/choose_offset.cuh +155 -0
  69. cuda/cccl/headers/include/cub/detail/detect_cuda_runtime.cuh +93 -0
  70. cuda/cccl/headers/include/cub/detail/device_double_buffer.cuh +96 -0
  71. cuda/cccl/headers/include/cub/detail/fast_modulo_division.cuh +246 -0
  72. cuda/cccl/headers/include/cub/detail/launcher/cuda_driver.cuh +120 -0
  73. cuda/cccl/headers/include/cub/detail/launcher/cuda_runtime.cuh +74 -0
  74. cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +118 -0
  75. cuda/cccl/headers/include/cub/detail/ptx-json/README.md +71 -0
  76. cuda/cccl/headers/include/cub/detail/ptx-json/array.h +68 -0
  77. cuda/cccl/headers/include/cub/detail/ptx-json/json.h +61 -0
  78. cuda/cccl/headers/include/cub/detail/ptx-json/object.h +100 -0
  79. cuda/cccl/headers/include/cub/detail/ptx-json/string.h +71 -0
  80. cuda/cccl/headers/include/cub/detail/ptx-json/value.h +93 -0
  81. cuda/cccl/headers/include/cub/detail/ptx-json-parser.h +63 -0
  82. cuda/cccl/headers/include/cub/detail/rfa.cuh +724 -0
  83. cuda/cccl/headers/include/cub/detail/strong_load.cuh +189 -0
  84. cuda/cccl/headers/include/cub/detail/strong_store.cuh +220 -0
  85. cuda/cccl/headers/include/cub/detail/temporary_storage.cuh +355 -0
  86. cuda/cccl/headers/include/cub/detail/type_traits.cuh +206 -0
  87. cuda/cccl/headers/include/cub/detail/uninitialized_copy.cuh +72 -0
  88. cuda/cccl/headers/include/cub/detail/unsafe_bitcast.cuh +56 -0
  89. cuda/cccl/headers/include/cub/device/device_adjacent_difference.cuh +596 -0
  90. cuda/cccl/headers/include/cub/device/device_copy.cuh +187 -0
  91. cuda/cccl/headers/include/cub/device/device_for.cuh +994 -0
  92. cuda/cccl/headers/include/cub/device/device_histogram.cuh +1507 -0
  93. cuda/cccl/headers/include/cub/device/device_memcpy.cuh +195 -0
  94. cuda/cccl/headers/include/cub/device/device_merge.cuh +202 -0
  95. cuda/cccl/headers/include/cub/device/device_merge_sort.cuh +979 -0
  96. cuda/cccl/headers/include/cub/device/device_partition.cuh +664 -0
  97. cuda/cccl/headers/include/cub/device/device_radix_sort.cuh +3431 -0
  98. cuda/cccl/headers/include/cub/device/device_reduce.cuh +1387 -0
  99. cuda/cccl/headers/include/cub/device/device_run_length_encode.cuh +368 -0
  100. cuda/cccl/headers/include/cub/device/device_scan.cuh +1901 -0
  101. cuda/cccl/headers/include/cub/device/device_segmented_radix_sort.cuh +1496 -0
  102. cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +1512 -0
  103. cuda/cccl/headers/include/cub/device/device_segmented_sort.cuh +2811 -0
  104. cuda/cccl/headers/include/cub/device/device_select.cuh +1224 -0
  105. cuda/cccl/headers/include/cub/device/device_transform.cuh +313 -0
  106. cuda/cccl/headers/include/cub/device/dispatch/dispatch_adjacent_difference.cuh +314 -0
  107. cuda/cccl/headers/include/cub/device/dispatch/dispatch_advance_iterators.cuh +109 -0
  108. cuda/cccl/headers/include/cub/device/dispatch/dispatch_batch_memcpy.cuh +718 -0
  109. cuda/cccl/headers/include/cub/device/dispatch/dispatch_common.cuh +45 -0
  110. cuda/cccl/headers/include/cub/device/dispatch/dispatch_for.cuh +197 -0
  111. cuda/cccl/headers/include/cub/device/dispatch/dispatch_histogram.cuh +1051 -0
  112. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +305 -0
  113. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge_sort.cuh +473 -0
  114. cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +1748 -0
  115. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +1316 -0
  116. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_by_key.cuh +625 -0
  117. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +502 -0
  118. cuda/cccl/headers/include/cub/device/dispatch/dispatch_rle.cuh +548 -0
  119. cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan.cuh +497 -0
  120. cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan_by_key.cuh +598 -0
  121. cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +1374 -0
  122. cuda/cccl/headers/include/cub/device/dispatch/dispatch_select_if.cuh +838 -0
  123. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +341 -0
  124. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +439 -0
  125. cuda/cccl/headers/include/cub/device/dispatch/dispatch_three_way_partition.cuh +552 -0
  126. cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +397 -0
  127. cuda/cccl/headers/include/cub/device/dispatch/dispatch_unique_by_key.cuh +543 -0
  128. cuda/cccl/headers/include/cub/device/dispatch/kernels/for_each.cuh +218 -0
  129. cuda/cccl/headers/include/cub/device/dispatch/kernels/histogram.cuh +505 -0
  130. cuda/cccl/headers/include/cub/device/dispatch/kernels/merge_sort.cuh +338 -0
  131. cuda/cccl/headers/include/cub/device/dispatch/kernels/radix_sort.cuh +799 -0
  132. cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +523 -0
  133. cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +194 -0
  134. cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +330 -0
  135. cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +437 -0
  136. cuda/cccl/headers/include/cub/device/dispatch/kernels/unique_by_key.cuh +176 -0
  137. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +70 -0
  138. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +121 -0
  139. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +63 -0
  140. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +278 -0
  141. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +91 -0
  142. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +118 -0
  143. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +1068 -0
  144. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce.cuh +397 -0
  145. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +945 -0
  146. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +675 -0
  147. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +555 -0
  148. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +1013 -0
  149. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +249 -0
  150. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_select_if.cuh +1587 -0
  151. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +407 -0
  152. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +283 -0
  153. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +874 -0
  154. cuda/cccl/headers/include/cub/grid/grid_even_share.cuh +215 -0
  155. cuda/cccl/headers/include/cub/grid/grid_mapping.cuh +106 -0
  156. cuda/cccl/headers/include/cub/grid/grid_queue.cuh +202 -0
  157. cuda/cccl/headers/include/cub/iterator/arg_index_input_iterator.cuh +256 -0
  158. cuda/cccl/headers/include/cub/iterator/cache_modified_input_iterator.cuh +238 -0
  159. cuda/cccl/headers/include/cub/iterator/cache_modified_output_iterator.cuh +252 -0
  160. cuda/cccl/headers/include/cub/iterator/tex_obj_input_iterator.cuh +322 -0
  161. cuda/cccl/headers/include/cub/thread/thread_load.cuh +347 -0
  162. cuda/cccl/headers/include/cub/thread/thread_operators.cuh +629 -0
  163. cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +504 -0
  164. cuda/cccl/headers/include/cub/thread/thread_scan.cuh +340 -0
  165. cuda/cccl/headers/include/cub/thread/thread_search.cuh +198 -0
  166. cuda/cccl/headers/include/cub/thread/thread_simd.cuh +406 -0
  167. cuda/cccl/headers/include/cub/thread/thread_sort.cuh +101 -0
  168. cuda/cccl/headers/include/cub/thread/thread_store.cuh +364 -0
  169. cuda/cccl/headers/include/cub/util_allocator.cuh +921 -0
  170. cuda/cccl/headers/include/cub/util_arch.cuh +163 -0
  171. cuda/cccl/headers/include/cub/util_cpp_dialect.cuh +95 -0
  172. cuda/cccl/headers/include/cub/util_debug.cuh +207 -0
  173. cuda/cccl/headers/include/cub/util_device.cuh +779 -0
  174. cuda/cccl/headers/include/cub/util_macro.cuh +91 -0
  175. cuda/cccl/headers/include/cub/util_math.cuh +115 -0
  176. cuda/cccl/headers/include/cub/util_namespace.cuh +176 -0
  177. cuda/cccl/headers/include/cub/util_policy_wrapper_t.cuh +55 -0
  178. cuda/cccl/headers/include/cub/util_ptx.cuh +513 -0
  179. cuda/cccl/headers/include/cub/util_temporary_storage.cuh +122 -0
  180. cuda/cccl/headers/include/cub/util_type.cuh +1111 -0
  181. cuda/cccl/headers/include/cub/util_vsmem.cuh +251 -0
  182. cuda/cccl/headers/include/cub/version.cuh +89 -0
  183. cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_shfl.cuh +329 -0
  184. cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_smem.cuh +177 -0
  185. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +729 -0
  186. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +405 -0
  187. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +688 -0
  188. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_smem.cuh +437 -0
  189. cuda/cccl/headers/include/cub/warp/warp_exchange.cuh +405 -0
  190. cuda/cccl/headers/include/cub/warp/warp_load.cuh +614 -0
  191. cuda/cccl/headers/include/cub/warp/warp_merge_sort.cuh +169 -0
  192. cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +822 -0
  193. cuda/cccl/headers/include/cub/warp/warp_scan.cuh +1156 -0
  194. cuda/cccl/headers/include/cub/warp/warp_store.cuh +520 -0
  195. cuda/cccl/headers/include/cuda/__annotated_ptr/access_property.h +169 -0
  196. cuda/cccl/headers/include/cuda/__annotated_ptr/access_property_encoding.h +172 -0
  197. cuda/cccl/headers/include/cuda/__annotated_ptr/annotated_ptr.h +210 -0
  198. cuda/cccl/headers/include/cuda/__annotated_ptr/annotated_ptr_base.h +100 -0
  199. cuda/cccl/headers/include/cuda/__annotated_ptr/apply_access_property.h +84 -0
  200. cuda/cccl/headers/include/cuda/__annotated_ptr/associate_access_property.h +127 -0
  201. cuda/cccl/headers/include/cuda/__annotated_ptr/createpolicy.h +209 -0
  202. cuda/cccl/headers/include/cuda/__atomic/atomic.h +145 -0
  203. cuda/cccl/headers/include/cuda/__barrier/aligned_size.h +61 -0
  204. cuda/cccl/headers/include/cuda/__barrier/async_contract_fulfillment.h +39 -0
  205. cuda/cccl/headers/include/cuda/__barrier/barrier.h +66 -0
  206. cuda/cccl/headers/include/cuda/__barrier/barrier_arrive_tx.h +100 -0
  207. cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +454 -0
  208. cuda/cccl/headers/include/cuda/__barrier/barrier_expect_tx.h +72 -0
  209. cuda/cccl/headers/include/cuda/__barrier/barrier_native_handle.h +45 -0
  210. cuda/cccl/headers/include/cuda/__barrier/barrier_thread_scope.h +61 -0
  211. cuda/cccl/headers/include/cuda/__bit/bit_reverse.h +171 -0
  212. cuda/cccl/headers/include/cuda/__bit/bitfield.h +122 -0
  213. cuda/cccl/headers/include/cuda/__bit/bitmask.h +88 -0
  214. cuda/cccl/headers/include/cuda/__cccl_config +36 -0
  215. cuda/cccl/headers/include/cuda/__cmath/ceil_div.h +126 -0
  216. cuda/cccl/headers/include/cuda/__cmath/ilog.h +195 -0
  217. cuda/cccl/headers/include/cuda/__cmath/ipow.h +107 -0
  218. cuda/cccl/headers/include/cuda/__cmath/isqrt.h +80 -0
  219. cuda/cccl/headers/include/cuda/__cmath/neg.h +47 -0
  220. cuda/cccl/headers/include/cuda/__cmath/pow2.h +74 -0
  221. cuda/cccl/headers/include/cuda/__cmath/round_down.h +104 -0
  222. cuda/cccl/headers/include/cuda/__cmath/round_up.h +106 -0
  223. cuda/cccl/headers/include/cuda/__cmath/uabs.h +57 -0
  224. cuda/cccl/headers/include/cuda/__execution/determinism.h +90 -0
  225. cuda/cccl/headers/include/cuda/__execution/require.h +67 -0
  226. cuda/cccl/headers/include/cuda/__execution/tune.h +62 -0
  227. cuda/cccl/headers/include/cuda/__functional/address_stability.h +131 -0
  228. cuda/cccl/headers/include/cuda/__functional/for_each_canceled.h +279 -0
  229. cuda/cccl/headers/include/cuda/__functional/get_device_address.h +58 -0
  230. cuda/cccl/headers/include/cuda/__functional/maximum.h +58 -0
  231. cuda/cccl/headers/include/cuda/__functional/minimum.h +58 -0
  232. cuda/cccl/headers/include/cuda/__functional/proclaim_return_type.h +108 -0
  233. cuda/cccl/headers/include/cuda/__fwd/barrier.h +38 -0
  234. cuda/cccl/headers/include/cuda/__fwd/barrier_native_handle.h +42 -0
  235. cuda/cccl/headers/include/cuda/__fwd/get_stream.h +38 -0
  236. cuda/cccl/headers/include/cuda/__fwd/pipeline.h +37 -0
  237. cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +261 -0
  238. cuda/cccl/headers/include/cuda/__iterator/counting_iterator.h +407 -0
  239. cuda/cccl/headers/include/cuda/__iterator/discard_iterator.h +314 -0
  240. cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +323 -0
  241. cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +481 -0
  242. cuda/cccl/headers/include/cuda/__latch/latch.h +44 -0
  243. cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +457 -0
  244. cuda/cccl/headers/include/cuda/__mdspan/host_device_mdspan.h +63 -0
  245. cuda/cccl/headers/include/cuda/__mdspan/restrict_accessor.h +123 -0
  246. cuda/cccl/headers/include/cuda/__mdspan/restrict_mdspan.h +51 -0
  247. cuda/cccl/headers/include/cuda/__memcpy_async/check_preconditions.h +79 -0
  248. cuda/cccl/headers/include/cuda/__memcpy_async/completion_mechanism.h +47 -0
  249. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_bulk_shared_global.h +60 -0
  250. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_fallback.h +72 -0
  251. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_shared_global.h +98 -0
  252. cuda/cccl/headers/include/cuda/__memcpy_async/dispatch_memcpy_async.h +162 -0
  253. cuda/cccl/headers/include/cuda/__memcpy_async/is_local_smem_barrier.h +49 -0
  254. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async.h +179 -0
  255. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_barrier.h +99 -0
  256. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_tx.h +99 -0
  257. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_completion.h +170 -0
  258. cuda/cccl/headers/include/cuda/__memcpy_async/try_get_barrier_handle.h +59 -0
  259. cuda/cccl/headers/include/cuda/__memory/address_space.h +86 -0
  260. cuda/cccl/headers/include/cuda/__memory_resource/get_memory_resource.h +94 -0
  261. cuda/cccl/headers/include/cuda/__memory_resource/get_property.h +158 -0
  262. cuda/cccl/headers/include/cuda/__memory_resource/properties.h +73 -0
  263. cuda/cccl/headers/include/cuda/__memory_resource/resource.h +129 -0
  264. cuda/cccl/headers/include/cuda/__memory_resource/resource_ref.h +653 -0
  265. cuda/cccl/headers/include/cuda/__numeric/narrow.h +108 -0
  266. cuda/cccl/headers/include/cuda/__numeric/overflow_cast.h +57 -0
  267. cuda/cccl/headers/include/cuda/__numeric/overflow_result.h +43 -0
  268. cuda/cccl/headers/include/cuda/__nvtx/nvtx.h +101 -0
  269. cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +2982 -0
  270. cuda/cccl/headers/include/cuda/__ptx/instructions/barrier_cluster.h +43 -0
  271. cuda/cccl/headers/include/cuda/__ptx/instructions/bfind.h +41 -0
  272. cuda/cccl/headers/include/cuda/__ptx/instructions/bmsk.h +41 -0
  273. cuda/cccl/headers/include/cuda/__ptx/instructions/clusterlaunchcontrol.h +41 -0
  274. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk.h +44 -0
  275. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_commit_group.h +43 -0
  276. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h +45 -0
  277. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_wait_group.h +43 -0
  278. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_mbarrier_arrive.h +42 -0
  279. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_reduce_async_bulk.h +60 -0
  280. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_reduce_async_bulk_tensor.h +43 -0
  281. cuda/cccl/headers/include/cuda/__ptx/instructions/elect_sync.h +41 -0
  282. cuda/cccl/headers/include/cuda/__ptx/instructions/exit.h +41 -0
  283. cuda/cccl/headers/include/cuda/__ptx/instructions/fence.h +49 -0
  284. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/barrier_cluster.h +115 -0
  285. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/bfind.h +190 -0
  286. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/bmsk.h +54 -0
  287. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/clusterlaunchcontrol.h +240 -0
  288. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk.h +193 -0
  289. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.h +25 -0
  290. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.h +52 -0
  291. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.h +957 -0
  292. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_gather_scatter.h +288 -0
  293. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.h +596 -0
  294. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.h +46 -0
  295. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive.h +26 -0
  296. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive_noinc.h +26 -0
  297. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.h +1445 -0
  298. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.h +132 -0
  299. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.h +117 -0
  300. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.h +601 -0
  301. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/elect_sync.h +36 -0
  302. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/exit.h +25 -0
  303. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence.h +208 -0
  304. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.h +31 -0
  305. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_alias.h +25 -0
  306. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_async.h +58 -0
  307. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_async_generic_sync_restrict.h +62 -0
  308. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.h +101 -0
  309. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_sync_restrict.h +62 -0
  310. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/get_sreg.h +949 -0
  311. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/getctarank.h +32 -0
  312. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/ld.h +15074 -0
  313. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive.h +385 -0
  314. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.h +176 -0
  315. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.h +34 -0
  316. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_expect_tx.h +94 -0
  317. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_init.h +27 -0
  318. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.h +137 -0
  319. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.h +138 -0
  320. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.h +280 -0
  321. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.h +282 -0
  322. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_ld_reduce.h +2148 -0
  323. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_red.h +1272 -0
  324. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_st.h +228 -0
  325. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/prmt.h +230 -0
  326. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/red_async.h +430 -0
  327. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/shl.h +96 -0
  328. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/shr.h +168 -0
  329. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st.h +1830 -0
  330. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st_async.h +123 -0
  331. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st_bulk.h +31 -0
  332. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_alloc.h +105 -0
  333. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_commit.h +81 -0
  334. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_cp.h +612 -0
  335. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_fence.h +44 -0
  336. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_ld.h +4446 -0
  337. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_mma.h +4061 -0
  338. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_mma_ws.h +6438 -0
  339. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_shift.h +36 -0
  340. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_st.h +4582 -0
  341. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_wait.h +44 -0
  342. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.h +67 -0
  343. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tensormap_replace.h +750 -0
  344. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/trap.h +25 -0
  345. cuda/cccl/headers/include/cuda/__ptx/instructions/get_sreg.h +43 -0
  346. cuda/cccl/headers/include/cuda/__ptx/instructions/getctarank.h +43 -0
  347. cuda/cccl/headers/include/cuda/__ptx/instructions/ld.h +41 -0
  348. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_arrive.h +45 -0
  349. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_expect_tx.h +41 -0
  350. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_init.h +43 -0
  351. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_wait.h +46 -0
  352. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_ld_reduce.h +41 -0
  353. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_red.h +41 -0
  354. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_st.h +41 -0
  355. cuda/cccl/headers/include/cuda/__ptx/instructions/prmt.h +41 -0
  356. cuda/cccl/headers/include/cuda/__ptx/instructions/red_async.h +43 -0
  357. cuda/cccl/headers/include/cuda/__ptx/instructions/shfl_sync.h +275 -0
  358. cuda/cccl/headers/include/cuda/__ptx/instructions/shl.h +41 -0
  359. cuda/cccl/headers/include/cuda/__ptx/instructions/shr.h +41 -0
  360. cuda/cccl/headers/include/cuda/__ptx/instructions/st.h +41 -0
  361. cuda/cccl/headers/include/cuda/__ptx/instructions/st_async.h +43 -0
  362. cuda/cccl/headers/include/cuda/__ptx/instructions/st_bulk.h +41 -0
  363. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_alloc.h +41 -0
  364. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_commit.h +41 -0
  365. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_cp.h +41 -0
  366. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_fence.h +41 -0
  367. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_ld.h +41 -0
  368. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_mma.h +41 -0
  369. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_mma_ws.h +41 -0
  370. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_shift.h +41 -0
  371. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_st.h +41 -0
  372. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_wait.h +41 -0
  373. cuda/cccl/headers/include/cuda/__ptx/instructions/tensormap_cp_fenceproxy.h +43 -0
  374. cuda/cccl/headers/include/cuda/__ptx/instructions/tensormap_replace.h +43 -0
  375. cuda/cccl/headers/include/cuda/__ptx/instructions/trap.h +41 -0
  376. cuda/cccl/headers/include/cuda/__ptx/ptx_dot_variants.h +230 -0
  377. cuda/cccl/headers/include/cuda/__ptx/ptx_helper_functions.h +151 -0
  378. cuda/cccl/headers/include/cuda/__semaphore/counting_semaphore.h +53 -0
  379. cuda/cccl/headers/include/cuda/__stream/get_stream.h +97 -0
  380. cuda/cccl/headers/include/cuda/__stream/stream_ref.h +165 -0
  381. cuda/cccl/headers/include/cuda/__type_traits/is_floating_point.h +47 -0
  382. cuda/cccl/headers/include/cuda/__warp/lane_mask.h +326 -0
  383. cuda/cccl/headers/include/cuda/__warp/warp_match_all.h +66 -0
  384. cuda/cccl/headers/include/cuda/__warp/warp_shuffle.h +249 -0
  385. cuda/cccl/headers/include/cuda/access_property +26 -0
  386. cuda/cccl/headers/include/cuda/annotated_ptr +29 -0
  387. cuda/cccl/headers/include/cuda/atomic +27 -0
  388. cuda/cccl/headers/include/cuda/barrier +262 -0
  389. cuda/cccl/headers/include/cuda/bit +29 -0
  390. cuda/cccl/headers/include/cuda/cmath +35 -0
  391. cuda/cccl/headers/include/cuda/discard_memory +61 -0
  392. cuda/cccl/headers/include/cuda/functional +31 -0
  393. cuda/cccl/headers/include/cuda/iterator +31 -0
  394. cuda/cccl/headers/include/cuda/latch +27 -0
  395. cuda/cccl/headers/include/cuda/mdspan +28 -0
  396. cuda/cccl/headers/include/cuda/memory +28 -0
  397. cuda/cccl/headers/include/cuda/memory_resource +41 -0
  398. cuda/cccl/headers/include/cuda/numeric +28 -0
  399. cuda/cccl/headers/include/cuda/pipeline +579 -0
  400. cuda/cccl/headers/include/cuda/ptx +118 -0
  401. cuda/cccl/headers/include/cuda/semaphore +31 -0
  402. cuda/cccl/headers/include/cuda/std/__algorithm/adjacent_find.h +60 -0
  403. cuda/cccl/headers/include/cuda/std/__algorithm/all_of.h +46 -0
  404. cuda/cccl/headers/include/cuda/std/__algorithm/any_of.h +46 -0
  405. cuda/cccl/headers/include/cuda/std/__algorithm/binary_search.h +52 -0
  406. cuda/cccl/headers/include/cuda/std/__algorithm/clamp.h +48 -0
  407. cuda/cccl/headers/include/cuda/std/__algorithm/comp.h +64 -0
  408. cuda/cccl/headers/include/cuda/std/__algorithm/comp_ref_type.h +85 -0
  409. cuda/cccl/headers/include/cuda/std/__algorithm/copy.h +143 -0
  410. cuda/cccl/headers/include/cuda/std/__algorithm/copy_backward.h +79 -0
  411. cuda/cccl/headers/include/cuda/std/__algorithm/copy_if.h +47 -0
  412. cuda/cccl/headers/include/cuda/std/__algorithm/copy_n.h +74 -0
  413. cuda/cccl/headers/include/cuda/std/__algorithm/count.h +49 -0
  414. cuda/cccl/headers/include/cuda/std/__algorithm/count_if.h +49 -0
  415. cuda/cccl/headers/include/cuda/std/__algorithm/equal.h +129 -0
  416. cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +101 -0
  417. cuda/cccl/headers/include/cuda/std/__algorithm/fill.h +58 -0
  418. cuda/cccl/headers/include/cuda/std/__algorithm/fill_n.h +51 -0
  419. cuda/cccl/headers/include/cuda/std/__algorithm/find.h +64 -0
  420. cuda/cccl/headers/include/cuda/std/__algorithm/find_end.h +225 -0
  421. cuda/cccl/headers/include/cuda/std/__algorithm/find_first_of.h +73 -0
  422. cuda/cccl/headers/include/cuda/std/__algorithm/find_if.h +46 -0
  423. cuda/cccl/headers/include/cuda/std/__algorithm/find_if_not.h +46 -0
  424. cuda/cccl/headers/include/cuda/std/__algorithm/for_each.h +42 -0
  425. cuda/cccl/headers/include/cuda/std/__algorithm/for_each_n.h +48 -0
  426. cuda/cccl/headers/include/cuda/std/__algorithm/generate.h +41 -0
  427. cuda/cccl/headers/include/cuda/std/__algorithm/generate_n.h +46 -0
  428. cuda/cccl/headers/include/cuda/std/__algorithm/half_positive.h +49 -0
  429. cuda/cccl/headers/include/cuda/std/__algorithm/includes.h +92 -0
  430. cuda/cccl/headers/include/cuda/std/__algorithm/is_heap.h +51 -0
  431. cuda/cccl/headers/include/cuda/std/__algorithm/is_heap_until.h +83 -0
  432. cuda/cccl/headers/include/cuda/std/__algorithm/is_partitioned.h +58 -0
  433. cuda/cccl/headers/include/cuda/std/__algorithm/is_permutation.h +252 -0
  434. cuda/cccl/headers/include/cuda/std/__algorithm/is_sorted.h +50 -0
  435. cuda/cccl/headers/include/cuda/std/__algorithm/is_sorted_until.h +69 -0
  436. cuda/cccl/headers/include/cuda/std/__algorithm/iter_swap.h +82 -0
  437. cuda/cccl/headers/include/cuda/std/__algorithm/iterator_operations.h +188 -0
  438. cuda/cccl/headers/include/cuda/std/__algorithm/lexicographical_compare.h +68 -0
  439. cuda/cccl/headers/include/cuda/std/__algorithm/lower_bound.h +83 -0
  440. cuda/cccl/headers/include/cuda/std/__algorithm/make_heap.h +72 -0
  441. cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +96 -0
  442. cuda/cccl/headers/include/cuda/std/__algorithm/max.h +62 -0
  443. cuda/cccl/headers/include/cuda/std/__algorithm/max_element.h +70 -0
  444. cuda/cccl/headers/include/cuda/std/__algorithm/merge.h +89 -0
  445. cuda/cccl/headers/include/cuda/std/__algorithm/min.h +62 -0
  446. cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +88 -0
  447. cuda/cccl/headers/include/cuda/std/__algorithm/minmax.h +71 -0
  448. cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +141 -0
  449. cuda/cccl/headers/include/cuda/std/__algorithm/mismatch.h +83 -0
  450. cuda/cccl/headers/include/cuda/std/__algorithm/move.h +88 -0
  451. cuda/cccl/headers/include/cuda/std/__algorithm/move_backward.h +84 -0
  452. cuda/cccl/headers/include/cuda/std/__algorithm/next_permutation.h +89 -0
  453. cuda/cccl/headers/include/cuda/std/__algorithm/none_of.h +46 -0
  454. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort.h +102 -0
  455. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort_copy.h +122 -0
  456. cuda/cccl/headers/include/cuda/std/__algorithm/partition.h +121 -0
  457. cuda/cccl/headers/include/cuda/std/__algorithm/partition_copy.h +59 -0
  458. cuda/cccl/headers/include/cuda/std/__algorithm/partition_point.h +61 -0
  459. cuda/cccl/headers/include/cuda/std/__algorithm/pop_heap.h +95 -0
  460. cuda/cccl/headers/include/cuda/std/__algorithm/prev_permutation.h +89 -0
  461. cuda/cccl/headers/include/cuda/std/__algorithm/push_heap.h +103 -0
  462. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_iterator_concept.h +65 -0
  463. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_min.h +99 -0
  464. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_min_element.h +69 -0
  465. cuda/cccl/headers/include/cuda/std/__algorithm/remove.h +55 -0
  466. cuda/cccl/headers/include/cuda/std/__algorithm/remove_copy.h +47 -0
  467. cuda/cccl/headers/include/cuda/std/__algorithm/remove_copy_if.h +47 -0
  468. cuda/cccl/headers/include/cuda/std/__algorithm/remove_if.h +56 -0
  469. cuda/cccl/headers/include/cuda/std/__algorithm/replace.h +45 -0
  470. cuda/cccl/headers/include/cuda/std/__algorithm/replace_copy.h +54 -0
  471. cuda/cccl/headers/include/cuda/std/__algorithm/replace_copy_if.h +50 -0
  472. cuda/cccl/headers/include/cuda/std/__algorithm/replace_if.h +45 -0
  473. cuda/cccl/headers/include/cuda/std/__algorithm/reverse.h +81 -0
  474. cuda/cccl/headers/include/cuda/std/__algorithm/reverse_copy.h +43 -0
  475. cuda/cccl/headers/include/cuda/std/__algorithm/rotate.h +264 -0
  476. cuda/cccl/headers/include/cuda/std/__algorithm/rotate_copy.h +40 -0
  477. cuda/cccl/headers/include/cuda/std/__algorithm/search.h +185 -0
  478. cuda/cccl/headers/include/cuda/std/__algorithm/search_n.h +163 -0
  479. cuda/cccl/headers/include/cuda/std/__algorithm/set_difference.h +95 -0
  480. cuda/cccl/headers/include/cuda/std/__algorithm/set_intersection.h +123 -0
  481. cuda/cccl/headers/include/cuda/std/__algorithm/set_symmetric_difference.h +135 -0
  482. cuda/cccl/headers/include/cuda/std/__algorithm/set_union.h +129 -0
  483. cuda/cccl/headers/include/cuda/std/__algorithm/shift_left.h +84 -0
  484. cuda/cccl/headers/include/cuda/std/__algorithm/shift_right.h +144 -0
  485. cuda/cccl/headers/include/cuda/std/__algorithm/sift_down.h +139 -0
  486. cuda/cccl/headers/include/cuda/std/__algorithm/sort_heap.h +72 -0
  487. cuda/cccl/headers/include/cuda/std/__algorithm/swap_ranges.h +78 -0
  488. cuda/cccl/headers/include/cuda/std/__algorithm/transform.h +59 -0
  489. cuda/cccl/headers/include/cuda/std/__algorithm/unique.h +77 -0
  490. cuda/cccl/headers/include/cuda/std/__algorithm/unique_copy.h +156 -0
  491. cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_iter.h +96 -0
  492. cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_range.h +127 -0
  493. cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +83 -0
  494. cuda/cccl/headers/include/cuda/std/__algorithm_ +26 -0
  495. cuda/cccl/headers/include/cuda/std/__atomic/api/common.h +192 -0
  496. cuda/cccl/headers/include/cuda/std/__atomic/api/owned.h +138 -0
  497. cuda/cccl/headers/include/cuda/std/__atomic/api/reference.h +118 -0
  498. cuda/cccl/headers/include/cuda/std/__atomic/functions/common.h +58 -0
  499. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_local.h +218 -0
  500. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_derived.h +401 -0
  501. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated.h +3971 -0
  502. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h +177 -0
  503. cuda/cccl/headers/include/cuda/std/__atomic/functions/host.h +211 -0
  504. cuda/cccl/headers/include/cuda/std/__atomic/functions.h +33 -0
  505. cuda/cccl/headers/include/cuda/std/__atomic/order.h +159 -0
  506. cuda/cccl/headers/include/cuda/std/__atomic/platform/msvc_to_builtins.h +654 -0
  507. cuda/cccl/headers/include/cuda/std/__atomic/platform.h +93 -0
  508. cuda/cccl/headers/include/cuda/std/__atomic/scopes.h +105 -0
  509. cuda/cccl/headers/include/cuda/std/__atomic/types/base.h +250 -0
  510. cuda/cccl/headers/include/cuda/std/__atomic/types/common.h +105 -0
  511. cuda/cccl/headers/include/cuda/std/__atomic/types/locked.h +225 -0
  512. cuda/cccl/headers/include/cuda/std/__atomic/types/reference.h +73 -0
  513. cuda/cccl/headers/include/cuda/std/__atomic/types/small.h +228 -0
  514. cuda/cccl/headers/include/cuda/std/__atomic/types.h +52 -0
  515. cuda/cccl/headers/include/cuda/std/__atomic/wait/notify_wait.h +95 -0
  516. cuda/cccl/headers/include/cuda/std/__atomic/wait/polling.h +65 -0
  517. cuda/cccl/headers/include/cuda/std/__barrier/barrier.h +227 -0
  518. cuda/cccl/headers/include/cuda/std/__barrier/empty_completion.h +37 -0
  519. cuda/cccl/headers/include/cuda/std/__barrier/poll_tester.h +84 -0
  520. cuda/cccl/headers/include/cuda/std/__bit/bit_cast.h +77 -0
  521. cuda/cccl/headers/include/cuda/std/__bit/byteswap.h +183 -0
  522. cuda/cccl/headers/include/cuda/std/__bit/countl.h +167 -0
  523. cuda/cccl/headers/include/cuda/std/__bit/countr.h +185 -0
  524. cuda/cccl/headers/include/cuda/std/__bit/endian.h +39 -0
  525. cuda/cccl/headers/include/cuda/std/__bit/has_single_bit.h +43 -0
  526. cuda/cccl/headers/include/cuda/std/__bit/integral.h +124 -0
  527. cuda/cccl/headers/include/cuda/std/__bit/popcount.h +154 -0
  528. cuda/cccl/headers/include/cuda/std/__bit/reference.h +1274 -0
  529. cuda/cccl/headers/include/cuda/std/__bit/rotate.h +94 -0
  530. cuda/cccl/headers/include/cuda/std/__cccl/architecture.h +78 -0
  531. cuda/cccl/headers/include/cuda/std/__cccl/assert.h +146 -0
  532. cuda/cccl/headers/include/cuda/std/__cccl/attributes.h +207 -0
  533. cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +1343 -0
  534. cuda/cccl/headers/include/cuda/std/__cccl/compiler.h +216 -0
  535. cuda/cccl/headers/include/cuda/std/__cccl/cuda_capabilities.h +43 -0
  536. cuda/cccl/headers/include/cuda/std/__cccl/cuda_toolkit.h +53 -0
  537. cuda/cccl/headers/include/cuda/std/__cccl/deprecated.h +69 -0
  538. cuda/cccl/headers/include/cuda/std/__cccl/diagnostic.h +129 -0
  539. cuda/cccl/headers/include/cuda/std/__cccl/dialect.h +124 -0
  540. cuda/cccl/headers/include/cuda/std/__cccl/epilogue.h +326 -0
  541. cuda/cccl/headers/include/cuda/std/__cccl/exceptions.h +35 -0
  542. cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +68 -0
  543. cuda/cccl/headers/include/cuda/std/__cccl/extended_data_types.h +129 -0
  544. cuda/cccl/headers/include/cuda/std/__cccl/is_non_narrowing_convertible.h +73 -0
  545. cuda/cccl/headers/include/cuda/std/__cccl/os.h +48 -0
  546. cuda/cccl/headers/include/cuda/std/__cccl/preprocessor.h +1234 -0
  547. cuda/cccl/headers/include/cuda/std/__cccl/prologue.h +267 -0
  548. cuda/cccl/headers/include/cuda/std/__cccl/ptx_isa.h +176 -0
  549. cuda/cccl/headers/include/cuda/std/__cccl/rtti.h +72 -0
  550. cuda/cccl/headers/include/cuda/std/__cccl/sequence_access.h +87 -0
  551. cuda/cccl/headers/include/cuda/std/__cccl/system_header.h +38 -0
  552. cuda/cccl/headers/include/cuda/std/__cccl/unreachable.h +31 -0
  553. cuda/cccl/headers/include/cuda/std/__cccl/version.h +26 -0
  554. cuda/cccl/headers/include/cuda/std/__cccl/visibility.h +112 -0
  555. cuda/cccl/headers/include/cuda/std/__charconv/chars_format.h +81 -0
  556. cuda/cccl/headers/include/cuda/std/__charconv/from_chars_result.h +56 -0
  557. cuda/cccl/headers/include/cuda/std/__charconv/to_chars.h +148 -0
  558. cuda/cccl/headers/include/cuda/std/__charconv/to_chars_result.h +56 -0
  559. cuda/cccl/headers/include/cuda/std/__charconv_ +30 -0
  560. cuda/cccl/headers/include/cuda/std/__cmath/abs.h +240 -0
  561. cuda/cccl/headers/include/cuda/std/__cmath/copysign.h +187 -0
  562. cuda/cccl/headers/include/cuda/std/__cmath/exponential_functions.h +620 -0
  563. cuda/cccl/headers/include/cuda/std/__cmath/fpclassify.h +207 -0
  564. cuda/cccl/headers/include/cuda/std/__cmath/gamma.h +181 -0
  565. cuda/cccl/headers/include/cuda/std/__cmath/hyperbolic_functions.h +250 -0
  566. cuda/cccl/headers/include/cuda/std/__cmath/hypot.h +213 -0
  567. cuda/cccl/headers/include/cuda/std/__cmath/inverse_hyperbolic_functions.h +250 -0
  568. cuda/cccl/headers/include/cuda/std/__cmath/inverse_trigonometric_functions.h +323 -0
  569. cuda/cccl/headers/include/cuda/std/__cmath/isfinite.h +163 -0
  570. cuda/cccl/headers/include/cuda/std/__cmath/isinf.h +201 -0
  571. cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +176 -0
  572. cuda/cccl/headers/include/cuda/std/__cmath/isnormal.h +129 -0
  573. cuda/cccl/headers/include/cuda/std/__cmath/lerp.h +106 -0
  574. cuda/cccl/headers/include/cuda/std/__cmath/logarithms.h +503 -0
  575. cuda/cccl/headers/include/cuda/std/__cmath/min_max.h +236 -0
  576. cuda/cccl/headers/include/cuda/std/__cmath/nvbf16.h +58 -0
  577. cuda/cccl/headers/include/cuda/std/__cmath/nvfp16.h +58 -0
  578. cuda/cccl/headers/include/cuda/std/__cmath/roots.h +180 -0
  579. cuda/cccl/headers/include/cuda/std/__cmath/rounding_functions.h +877 -0
  580. cuda/cccl/headers/include/cuda/std/__cmath/signbit.h +155 -0
  581. cuda/cccl/headers/include/cuda/std/__cmath/traits.h +170 -0
  582. cuda/cccl/headers/include/cuda/std/__cmath/trigonometric_functions.h +292 -0
  583. cuda/cccl/headers/include/cuda/std/__complex/nvbf16.h +351 -0
  584. cuda/cccl/headers/include/cuda/std/__complex/nvfp16.h +350 -0
  585. cuda/cccl/headers/include/cuda/std/__complex/vector_support.h +135 -0
  586. cuda/cccl/headers/include/cuda/std/__concepts/arithmetic.h +56 -0
  587. cuda/cccl/headers/include/cuda/std/__concepts/assignable.h +64 -0
  588. cuda/cccl/headers/include/cuda/std/__concepts/boolean_testable.h +63 -0
  589. cuda/cccl/headers/include/cuda/std/__concepts/class_or_enum.h +46 -0
  590. cuda/cccl/headers/include/cuda/std/__concepts/common_reference_with.h +69 -0
  591. cuda/cccl/headers/include/cuda/std/__concepts/common_with.h +82 -0
  592. cuda/cccl/headers/include/cuda/std/__concepts/concept_macros.h +274 -0
  593. cuda/cccl/headers/include/cuda/std/__concepts/constructible.h +107 -0
  594. cuda/cccl/headers/include/cuda/std/__concepts/convertible_to.h +71 -0
  595. cuda/cccl/headers/include/cuda/std/__concepts/copyable.h +60 -0
  596. cuda/cccl/headers/include/cuda/std/__concepts/derived_from.h +57 -0
  597. cuda/cccl/headers/include/cuda/std/__concepts/destructible.h +76 -0
  598. cuda/cccl/headers/include/cuda/std/__concepts/different_from.h +38 -0
  599. cuda/cccl/headers/include/cuda/std/__concepts/equality_comparable.h +100 -0
  600. cuda/cccl/headers/include/cuda/std/__concepts/invocable.h +80 -0
  601. cuda/cccl/headers/include/cuda/std/__concepts/movable.h +58 -0
  602. cuda/cccl/headers/include/cuda/std/__concepts/predicate.h +54 -0
  603. cuda/cccl/headers/include/cuda/std/__concepts/regular.h +54 -0
  604. cuda/cccl/headers/include/cuda/std/__concepts/relation.h +77 -0
  605. cuda/cccl/headers/include/cuda/std/__concepts/same_as.h +42 -0
  606. cuda/cccl/headers/include/cuda/std/__concepts/semiregular.h +54 -0
  607. cuda/cccl/headers/include/cuda/std/__concepts/swappable.h +206 -0
  608. cuda/cccl/headers/include/cuda/std/__concepts/totally_ordered.h +101 -0
  609. cuda/cccl/headers/include/cuda/std/__cstddef/byte.h +113 -0
  610. cuda/cccl/headers/include/cuda/std/__cstddef/types.h +52 -0
  611. cuda/cccl/headers/include/cuda/std/__cstdlib/abs.h +57 -0
  612. cuda/cccl/headers/include/cuda/std/__cstdlib/aligned_alloc.h +66 -0
  613. cuda/cccl/headers/include/cuda/std/__cstdlib/div.h +96 -0
  614. cuda/cccl/headers/include/cuda/std/__cstdlib/malloc.h +69 -0
  615. cuda/cccl/headers/include/cuda/std/__cuda/api_wrapper.h +62 -0
  616. cuda/cccl/headers/include/cuda/std/__cuda/ensure_current_device.h +72 -0
  617. cuda/cccl/headers/include/cuda/std/__exception/cuda_error.h +143 -0
  618. cuda/cccl/headers/include/cuda/std/__exception/terminate.h +73 -0
  619. cuda/cccl/headers/include/cuda/std/__execution/env.h +436 -0
  620. cuda/cccl/headers/include/cuda/std/__expected/bad_expected_access.h +127 -0
  621. cuda/cccl/headers/include/cuda/std/__expected/expected.h +2002 -0
  622. cuda/cccl/headers/include/cuda/std/__expected/expected_base.h +1078 -0
  623. cuda/cccl/headers/include/cuda/std/__expected/unexpect.h +37 -0
  624. cuda/cccl/headers/include/cuda/std/__expected/unexpected.h +178 -0
  625. cuda/cccl/headers/include/cuda/std/__floating_point/arithmetic.h +56 -0
  626. cuda/cccl/headers/include/cuda/std/__floating_point/cast.h +809 -0
  627. cuda/cccl/headers/include/cuda/std/__floating_point/cccl_fp.h +125 -0
  628. cuda/cccl/headers/include/cuda/std/__floating_point/common_type.h +48 -0
  629. cuda/cccl/headers/include/cuda/std/__floating_point/constants.h +172 -0
  630. cuda/cccl/headers/include/cuda/std/__floating_point/conversion_rank_order.h +103 -0
  631. cuda/cccl/headers/include/cuda/std/__floating_point/format.h +162 -0
  632. cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +39 -0
  633. cuda/cccl/headers/include/cuda/std/__floating_point/mask.h +64 -0
  634. cuda/cccl/headers/include/cuda/std/__floating_point/native_type.h +81 -0
  635. cuda/cccl/headers/include/cuda/std/__floating_point/nvfp_types.h +58 -0
  636. cuda/cccl/headers/include/cuda/std/__floating_point/overflow_handler.h +139 -0
  637. cuda/cccl/headers/include/cuda/std/__floating_point/properties.h +229 -0
  638. cuda/cccl/headers/include/cuda/std/__floating_point/storage.h +248 -0
  639. cuda/cccl/headers/include/cuda/std/__floating_point/traits.h +172 -0
  640. cuda/cccl/headers/include/cuda/std/__functional/binary_function.h +63 -0
  641. cuda/cccl/headers/include/cuda/std/__functional/binary_negate.h +65 -0
  642. cuda/cccl/headers/include/cuda/std/__functional/bind.h +352 -0
  643. cuda/cccl/headers/include/cuda/std/__functional/bind_back.h +88 -0
  644. cuda/cccl/headers/include/cuda/std/__functional/bind_front.h +73 -0
  645. cuda/cccl/headers/include/cuda/std/__functional/binder1st.h +75 -0
  646. cuda/cccl/headers/include/cuda/std/__functional/binder2nd.h +75 -0
  647. cuda/cccl/headers/include/cuda/std/__functional/compose.h +69 -0
  648. cuda/cccl/headers/include/cuda/std/__functional/default_searcher.h +75 -0
  649. cuda/cccl/headers/include/cuda/std/__functional/function.h +1277 -0
  650. cuda/cccl/headers/include/cuda/std/__functional/hash.h +650 -0
  651. cuda/cccl/headers/include/cuda/std/__functional/identity.h +61 -0
  652. cuda/cccl/headers/include/cuda/std/__functional/invoke.h +560 -0
  653. cuda/cccl/headers/include/cuda/std/__functional/is_transparent.h +43 -0
  654. cuda/cccl/headers/include/cuda/std/__functional/mem_fn.h +67 -0
  655. cuda/cccl/headers/include/cuda/std/__functional/mem_fun_ref.h +214 -0
  656. cuda/cccl/headers/include/cuda/std/__functional/not_fn.h +121 -0
  657. cuda/cccl/headers/include/cuda/std/__functional/operations.h +534 -0
  658. cuda/cccl/headers/include/cuda/std/__functional/perfect_forward.h +127 -0
  659. cuda/cccl/headers/include/cuda/std/__functional/pointer_to_binary_function.h +65 -0
  660. cuda/cccl/headers/include/cuda/std/__functional/pointer_to_unary_function.h +64 -0
  661. cuda/cccl/headers/include/cuda/std/__functional/ranges_operations.h +113 -0
  662. cuda/cccl/headers/include/cuda/std/__functional/reference_wrapper.h +113 -0
  663. cuda/cccl/headers/include/cuda/std/__functional/unary_function.h +62 -0
  664. cuda/cccl/headers/include/cuda/std/__functional/unary_negate.h +67 -0
  665. cuda/cccl/headers/include/cuda/std/__functional/unwrap_ref.h +56 -0
  666. cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +278 -0
  667. cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +35 -0
  668. cuda/cccl/headers/include/cuda/std/__fwd/array.h +36 -0
  669. cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +49 -0
  670. cuda/cccl/headers/include/cuda/std/__fwd/complex.h +34 -0
  671. cuda/cccl/headers/include/cuda/std/__fwd/fp.h +37 -0
  672. cuda/cccl/headers/include/cuda/std/__fwd/get.h +123 -0
  673. cuda/cccl/headers/include/cuda/std/__fwd/hash.h +34 -0
  674. cuda/cccl/headers/include/cuda/std/__fwd/iterator_traits.h +40 -0
  675. cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +73 -0
  676. cuda/cccl/headers/include/cuda/std/__fwd/memory_resource.h +37 -0
  677. cuda/cccl/headers/include/cuda/std/__fwd/pair.h +34 -0
  678. cuda/cccl/headers/include/cuda/std/__fwd/reference_wrapper.h +34 -0
  679. cuda/cccl/headers/include/cuda/std/__fwd/span.h +38 -0
  680. cuda/cccl/headers/include/cuda/std/__fwd/string.h +83 -0
  681. cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +59 -0
  682. cuda/cccl/headers/include/cuda/std/__fwd/subrange.h +55 -0
  683. cuda/cccl/headers/include/cuda/std/__fwd/tuple.h +34 -0
  684. cuda/cccl/headers/include/cuda/std/__internal/cpp_dialect.h +44 -0
  685. cuda/cccl/headers/include/cuda/std/__internal/features.h +71 -0
  686. cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +102 -0
  687. cuda/cccl/headers/include/cuda/std/__iterator/access.h +132 -0
  688. cuda/cccl/headers/include/cuda/std/__iterator/advance.h +230 -0
  689. cuda/cccl/headers/include/cuda/std/__iterator/back_insert_iterator.h +103 -0
  690. cuda/cccl/headers/include/cuda/std/__iterator/bounded_iter.h +264 -0
  691. cuda/cccl/headers/include/cuda/std/__iterator/concepts.h +608 -0
  692. cuda/cccl/headers/include/cuda/std/__iterator/counted_iterator.h +469 -0
  693. cuda/cccl/headers/include/cuda/std/__iterator/data.h +63 -0
  694. cuda/cccl/headers/include/cuda/std/__iterator/default_sentinel.h +36 -0
  695. cuda/cccl/headers/include/cuda/std/__iterator/distance.h +126 -0
  696. cuda/cccl/headers/include/cuda/std/__iterator/empty.h +54 -0
  697. cuda/cccl/headers/include/cuda/std/__iterator/erase_if_container.h +53 -0
  698. cuda/cccl/headers/include/cuda/std/__iterator/front_insert_iterator.h +98 -0
  699. cuda/cccl/headers/include/cuda/std/__iterator/incrementable_traits.h +152 -0
  700. cuda/cccl/headers/include/cuda/std/__iterator/indirectly_comparable.h +55 -0
  701. cuda/cccl/headers/include/cuda/std/__iterator/insert_iterator.h +105 -0
  702. cuda/cccl/headers/include/cuda/std/__iterator/istream_iterator.h +141 -0
  703. cuda/cccl/headers/include/cuda/std/__iterator/istreambuf_iterator.h +161 -0
  704. cuda/cccl/headers/include/cuda/std/__iterator/iter_move.h +161 -0
  705. cuda/cccl/headers/include/cuda/std/__iterator/iter_swap.h +163 -0
  706. cuda/cccl/headers/include/cuda/std/__iterator/iterator.h +44 -0
  707. cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +935 -0
  708. cuda/cccl/headers/include/cuda/std/__iterator/mergeable.h +72 -0
  709. cuda/cccl/headers/include/cuda/std/__iterator/move_iterator.h +401 -0
  710. cuda/cccl/headers/include/cuda/std/__iterator/move_sentinel.h +73 -0
  711. cuda/cccl/headers/include/cuda/std/__iterator/next.h +102 -0
  712. cuda/cccl/headers/include/cuda/std/__iterator/ostream_iterator.h +99 -0
  713. cuda/cccl/headers/include/cuda/std/__iterator/ostreambuf_iterator.h +101 -0
  714. cuda/cccl/headers/include/cuda/std/__iterator/permutable.h +54 -0
  715. cuda/cccl/headers/include/cuda/std/__iterator/prev.h +92 -0
  716. cuda/cccl/headers/include/cuda/std/__iterator/projected.h +61 -0
  717. cuda/cccl/headers/include/cuda/std/__iterator/readable_traits.h +185 -0
  718. cuda/cccl/headers/include/cuda/std/__iterator/reverse_access.h +146 -0
  719. cuda/cccl/headers/include/cuda/std/__iterator/reverse_iterator.h +615 -0
  720. cuda/cccl/headers/include/cuda/std/__iterator/size.h +69 -0
  721. cuda/cccl/headers/include/cuda/std/__iterator/sortable.h +55 -0
  722. cuda/cccl/headers/include/cuda/std/__iterator/unreachable_sentinel.h +88 -0
  723. cuda/cccl/headers/include/cuda/std/__iterator/wrap_iter.h +259 -0
  724. cuda/cccl/headers/include/cuda/std/__latch/latch.h +88 -0
  725. cuda/cccl/headers/include/cuda/std/__limits/numeric_limits.h +617 -0
  726. cuda/cccl/headers/include/cuda/std/__limits/numeric_limits_ext.h +781 -0
  727. cuda/cccl/headers/include/cuda/std/__linalg/conj_if_needed.h +78 -0
  728. cuda/cccl/headers/include/cuda/std/__linalg/conjugate_transposed.h +55 -0
  729. cuda/cccl/headers/include/cuda/std/__linalg/conjugated.h +140 -0
  730. cuda/cccl/headers/include/cuda/std/__linalg/scaled.h +134 -0
  731. cuda/cccl/headers/include/cuda/std/__linalg/transposed.h +328 -0
  732. cuda/cccl/headers/include/cuda/std/__mdspan/aligned_accessor.h +100 -0
  733. cuda/cccl/headers/include/cuda/std/__mdspan/concepts.h +139 -0
  734. cuda/cccl/headers/include/cuda/std/__mdspan/default_accessor.h +74 -0
  735. cuda/cccl/headers/include/cuda/std/__mdspan/empty_base.h +363 -0
  736. cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +765 -0
  737. cuda/cccl/headers/include/cuda/std/__mdspan/layout_left.h +317 -0
  738. cuda/cccl/headers/include/cuda/std/__mdspan/layout_right.h +310 -0
  739. cuda/cccl/headers/include/cuda/std/__mdspan/layout_stride.h +615 -0
  740. cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +512 -0
  741. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_extents.h +193 -0
  742. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_helper.h +190 -0
  743. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_mapping.h +347 -0
  744. cuda/cccl/headers/include/cuda/std/__memory/addressof.h +64 -0
  745. cuda/cccl/headers/include/cuda/std/__memory/align.h +87 -0
  746. cuda/cccl/headers/include/cuda/std/__memory/allocate_at_least.h +81 -0
  747. cuda/cccl/headers/include/cuda/std/__memory/allocation_guard.h +100 -0
  748. cuda/cccl/headers/include/cuda/std/__memory/allocator.h +320 -0
  749. cuda/cccl/headers/include/cuda/std/__memory/allocator_arg_t.h +84 -0
  750. cuda/cccl/headers/include/cuda/std/__memory/allocator_destructor.h +59 -0
  751. cuda/cccl/headers/include/cuda/std/__memory/allocator_traits.h +569 -0
  752. cuda/cccl/headers/include/cuda/std/__memory/assume_aligned.h +60 -0
  753. cuda/cccl/headers/include/cuda/std/__memory/builtin_new_allocator.h +87 -0
  754. cuda/cccl/headers/include/cuda/std/__memory/compressed_pair.h +231 -0
  755. cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +248 -0
  756. cuda/cccl/headers/include/cuda/std/__memory/destruct_n.h +91 -0
  757. cuda/cccl/headers/include/cuda/std/__memory/is_sufficiently_aligned.h +43 -0
  758. cuda/cccl/headers/include/cuda/std/__memory/pointer_traits.h +260 -0
  759. cuda/cccl/headers/include/cuda/std/__memory/temporary_buffer.h +92 -0
  760. cuda/cccl/headers/include/cuda/std/__memory/uninitialized_algorithms.h +686 -0
  761. cuda/cccl/headers/include/cuda/std/__memory/unique_ptr.h +771 -0
  762. cuda/cccl/headers/include/cuda/std/__memory/uses_allocator.h +55 -0
  763. cuda/cccl/headers/include/cuda/std/__memory/voidify.h +41 -0
  764. cuda/cccl/headers/include/cuda/std/__memory_ +34 -0
  765. cuda/cccl/headers/include/cuda/std/__new/allocate.h +126 -0
  766. cuda/cccl/headers/include/cuda/std/__new/bad_alloc.h +57 -0
  767. cuda/cccl/headers/include/cuda/std/__new/launder.h +49 -0
  768. cuda/cccl/headers/include/cuda/std/__new_ +29 -0
  769. cuda/cccl/headers/include/cuda/std/__numeric/accumulate.h +57 -0
  770. cuda/cccl/headers/include/cuda/std/__numeric/adjacent_difference.h +72 -0
  771. cuda/cccl/headers/include/cuda/std/__numeric/exclusive_scan.h +66 -0
  772. cuda/cccl/headers/include/cuda/std/__numeric/gcd_lcm.h +80 -0
  773. cuda/cccl/headers/include/cuda/std/__numeric/inclusive_scan.h +73 -0
  774. cuda/cccl/headers/include/cuda/std/__numeric/inner_product.h +62 -0
  775. cuda/cccl/headers/include/cuda/std/__numeric/iota.h +42 -0
  776. cuda/cccl/headers/include/cuda/std/__numeric/midpoint.h +100 -0
  777. cuda/cccl/headers/include/cuda/std/__numeric/partial_sum.h +70 -0
  778. cuda/cccl/headers/include/cuda/std/__numeric/reduce.h +61 -0
  779. cuda/cccl/headers/include/cuda/std/__numeric/transform_exclusive_scan.h +51 -0
  780. cuda/cccl/headers/include/cuda/std/__numeric/transform_inclusive_scan.h +65 -0
  781. cuda/cccl/headers/include/cuda/std/__numeric/transform_reduce.h +72 -0
  782. cuda/cccl/headers/include/cuda/std/__ranges/access.h +304 -0
  783. cuda/cccl/headers/include/cuda/std/__ranges/all.h +97 -0
  784. cuda/cccl/headers/include/cuda/std/__ranges/concepts.h +313 -0
  785. cuda/cccl/headers/include/cuda/std/__ranges/counted.h +90 -0
  786. cuda/cccl/headers/include/cuda/std/__ranges/dangling.h +54 -0
  787. cuda/cccl/headers/include/cuda/std/__ranges/data.h +136 -0
  788. cuda/cccl/headers/include/cuda/std/__ranges/empty.h +111 -0
  789. cuda/cccl/headers/include/cuda/std/__ranges/empty_view.h +77 -0
  790. cuda/cccl/headers/include/cuda/std/__ranges/enable_borrowed_range.h +41 -0
  791. cuda/cccl/headers/include/cuda/std/__ranges/enable_view.h +77 -0
  792. cuda/cccl/headers/include/cuda/std/__ranges/from_range.h +36 -0
  793. cuda/cccl/headers/include/cuda/std/__ranges/iota_view.h +271 -0
  794. cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +410 -0
  795. cuda/cccl/headers/include/cuda/std/__ranges/owning_view.h +161 -0
  796. cuda/cccl/headers/include/cuda/std/__ranges/range_adaptor.h +114 -0
  797. cuda/cccl/headers/include/cuda/std/__ranges/rbegin.h +175 -0
  798. cuda/cccl/headers/include/cuda/std/__ranges/ref_view.h +121 -0
  799. cuda/cccl/headers/include/cuda/std/__ranges/rend.h +182 -0
  800. cuda/cccl/headers/include/cuda/std/__ranges/repeat_view.h +343 -0
  801. cuda/cccl/headers/include/cuda/std/__ranges/single_view.h +156 -0
  802. cuda/cccl/headers/include/cuda/std/__ranges/size.h +200 -0
  803. cuda/cccl/headers/include/cuda/std/__ranges/subrange.h +513 -0
  804. cuda/cccl/headers/include/cuda/std/__ranges/take_while_view.h +263 -0
  805. cuda/cccl/headers/include/cuda/std/__ranges/transform_view.h +531 -0
  806. cuda/cccl/headers/include/cuda/std/__ranges/unwrap_end.h +53 -0
  807. cuda/cccl/headers/include/cuda/std/__ranges/view_interface.h +181 -0
  808. cuda/cccl/headers/include/cuda/std/__ranges/views.h +38 -0
  809. cuda/cccl/headers/include/cuda/std/__semaphore/atomic_semaphore.h +233 -0
  810. cuda/cccl/headers/include/cuda/std/__semaphore/counting_semaphore.h +51 -0
  811. cuda/cccl/headers/include/cuda/std/__string/char_traits.h +191 -0
  812. cuda/cccl/headers/include/cuda/std/__string/constexpr_c_functions.h +591 -0
  813. cuda/cccl/headers/include/cuda/std/__string/helper_functions.h +299 -0
  814. cuda/cccl/headers/include/cuda/std/__string/string_view.h +244 -0
  815. cuda/cccl/headers/include/cuda/std/__string_ +29 -0
  816. cuda/cccl/headers/include/cuda/std/__system_error/errc.h +51 -0
  817. cuda/cccl/headers/include/cuda/std/__system_error_ +26 -0
  818. cuda/cccl/headers/include/cuda/std/__thread/threading_support.h +105 -0
  819. cuda/cccl/headers/include/cuda/std/__thread/threading_support_cuda.h +47 -0
  820. cuda/cccl/headers/include/cuda/std/__thread/threading_support_external.h +41 -0
  821. cuda/cccl/headers/include/cuda/std/__thread/threading_support_pthread.h +144 -0
  822. cuda/cccl/headers/include/cuda/std/__thread/threading_support_win32.h +87 -0
  823. cuda/cccl/headers/include/cuda/std/__tuple_dir/ignore.h +51 -0
  824. cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +98 -0
  825. cuda/cccl/headers/include/cuda/std/__tuple_dir/sfinae_helpers.h +236 -0
  826. cuda/cccl/headers/include/cuda/std/__tuple_dir/structured_bindings.h +216 -0
  827. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_element.h +70 -0
  828. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_indices.h +44 -0
  829. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +90 -0
  830. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +73 -0
  831. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_size.h +79 -0
  832. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_types.h +35 -0
  833. cuda/cccl/headers/include/cuda/std/__tuple_dir/vector_types.h +242 -0
  834. cuda/cccl/headers/include/cuda/std/__type_traits/add_const.h +40 -0
  835. cuda/cccl/headers/include/cuda/std/__type_traits/add_cv.h +40 -0
  836. cuda/cccl/headers/include/cuda/std/__type_traits/add_lvalue_reference.h +62 -0
  837. cuda/cccl/headers/include/cuda/std/__type_traits/add_pointer.h +65 -0
  838. cuda/cccl/headers/include/cuda/std/__type_traits/add_rvalue_reference.h +62 -0
  839. cuda/cccl/headers/include/cuda/std/__type_traits/add_volatile.h +40 -0
  840. cuda/cccl/headers/include/cuda/std/__type_traits/aligned_storage.h +149 -0
  841. cuda/cccl/headers/include/cuda/std/__type_traits/aligned_union.h +62 -0
  842. cuda/cccl/headers/include/cuda/std/__type_traits/alignment_of.h +41 -0
  843. cuda/cccl/headers/include/cuda/std/__type_traits/always_false.h +35 -0
  844. cuda/cccl/headers/include/cuda/std/__type_traits/can_extract_key.h +69 -0
  845. cuda/cccl/headers/include/cuda/std/__type_traits/common_reference.h +262 -0
  846. cuda/cccl/headers/include/cuda/std/__type_traits/common_type.h +174 -0
  847. cuda/cccl/headers/include/cuda/std/__type_traits/conditional.h +65 -0
  848. cuda/cccl/headers/include/cuda/std/__type_traits/conjunction.h +67 -0
  849. cuda/cccl/headers/include/cuda/std/__type_traits/copy_cv.h +50 -0
  850. cuda/cccl/headers/include/cuda/std/__type_traits/copy_cvref.h +148 -0
  851. cuda/cccl/headers/include/cuda/std/__type_traits/decay.h +83 -0
  852. cuda/cccl/headers/include/cuda/std/__type_traits/dependent_type.h +35 -0
  853. cuda/cccl/headers/include/cuda/std/__type_traits/disjunction.h +77 -0
  854. cuda/cccl/headers/include/cuda/std/__type_traits/enable_if.h +43 -0
  855. cuda/cccl/headers/include/cuda/std/__type_traits/extent.h +68 -0
  856. cuda/cccl/headers/include/cuda/std/__type_traits/fold.h +47 -0
  857. cuda/cccl/headers/include/cuda/std/__type_traits/has_unique_object_representation.h +47 -0
  858. cuda/cccl/headers/include/cuda/std/__type_traits/has_virtual_destructor.h +51 -0
  859. cuda/cccl/headers/include/cuda/std/__type_traits/integral_constant.h +62 -0
  860. cuda/cccl/headers/include/cuda/std/__type_traits/is_abstract.h +40 -0
  861. cuda/cccl/headers/include/cuda/std/__type_traits/is_aggregate.h +44 -0
  862. cuda/cccl/headers/include/cuda/std/__type_traits/is_allocator.h +46 -0
  863. cuda/cccl/headers/include/cuda/std/__type_traits/is_arithmetic.h +42 -0
  864. cuda/cccl/headers/include/cuda/std/__type_traits/is_array.h +62 -0
  865. cuda/cccl/headers/include/cuda/std/__type_traits/is_assignable.h +78 -0
  866. cuda/cccl/headers/include/cuda/std/__type_traits/is_base_of.h +83 -0
  867. cuda/cccl/headers/include/cuda/std/__type_traits/is_bounded_array.h +44 -0
  868. cuda/cccl/headers/include/cuda/std/__type_traits/is_callable.h +60 -0
  869. cuda/cccl/headers/include/cuda/std/__type_traits/is_char_like_type.h +38 -0
  870. cuda/cccl/headers/include/cuda/std/__type_traits/is_class.h +68 -0
  871. cuda/cccl/headers/include/cuda/std/__type_traits/is_compound.h +54 -0
  872. cuda/cccl/headers/include/cuda/std/__type_traits/is_const.h +56 -0
  873. cuda/cccl/headers/include/cuda/std/__type_traits/is_constant_evaluated.h +51 -0
  874. cuda/cccl/headers/include/cuda/std/__type_traits/is_constructible.h +174 -0
  875. cuda/cccl/headers/include/cuda/std/__type_traits/is_convertible.h +214 -0
  876. cuda/cccl/headers/include/cuda/std/__type_traits/is_copy_assignable.h +43 -0
  877. cuda/cccl/headers/include/cuda/std/__type_traits/is_copy_constructible.h +43 -0
  878. cuda/cccl/headers/include/cuda/std/__type_traits/is_core_convertible.h +47 -0
  879. cuda/cccl/headers/include/cuda/std/__type_traits/is_corresponding_member.h +43 -0
  880. cuda/cccl/headers/include/cuda/std/__type_traits/is_default_constructible.h +40 -0
  881. cuda/cccl/headers/include/cuda/std/__type_traits/is_destructible.h +115 -0
  882. cuda/cccl/headers/include/cuda/std/__type_traits/is_empty.h +73 -0
  883. cuda/cccl/headers/include/cuda/std/__type_traits/is_enum.h +68 -0
  884. cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_arithmetic.h +38 -0
  885. cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_floating_point.h +81 -0
  886. cuda/cccl/headers/include/cuda/std/__type_traits/is_final.h +56 -0
  887. cuda/cccl/headers/include/cuda/std/__type_traits/is_floating_point.h +53 -0
  888. cuda/cccl/headers/include/cuda/std/__type_traits/is_function.h +61 -0
  889. cuda/cccl/headers/include/cuda/std/__type_traits/is_fundamental.h +56 -0
  890. cuda/cccl/headers/include/cuda/std/__type_traits/is_implicitly_default_constructible.h +57 -0
  891. cuda/cccl/headers/include/cuda/std/__type_traits/is_integer.h +45 -0
  892. cuda/cccl/headers/include/cuda/std/__type_traits/is_integral.h +123 -0
  893. cuda/cccl/headers/include/cuda/std/__type_traits/is_layout_compatible.h +45 -0
  894. cuda/cccl/headers/include/cuda/std/__type_traits/is_literal_type.h +59 -0
  895. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_function_pointer.h +79 -0
  896. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_object_pointer.h +57 -0
  897. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_pointer.h +57 -0
  898. cuda/cccl/headers/include/cuda/std/__type_traits/is_move_assignable.h +43 -0
  899. cuda/cccl/headers/include/cuda/std/__type_traits/is_move_constructible.h +42 -0
  900. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_assignable.h +70 -0
  901. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_constructible.h +84 -0
  902. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_convertible.h +59 -0
  903. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_copy_assignable.h +60 -0
  904. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_copy_constructible.h +43 -0
  905. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_default_constructible.h +54 -0
  906. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_destructible.h +79 -0
  907. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_move_assignable.h +60 -0
  908. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_move_constructible.h +42 -0
  909. cuda/cccl/headers/include/cuda/std/__type_traits/is_null_pointer.h +43 -0
  910. cuda/cccl/headers/include/cuda/std/__type_traits/is_object.h +57 -0
  911. cuda/cccl/headers/include/cuda/std/__type_traits/is_one_of.h +37 -0
  912. cuda/cccl/headers/include/cuda/std/__type_traits/is_pod.h +62 -0
  913. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer.h +60 -0
  914. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer_interconvertible_base_of.h +87 -0
  915. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer_interconvertible_with_class.h +43 -0
  916. cuda/cccl/headers/include/cuda/std/__type_traits/is_polymorphic.h +63 -0
  917. cuda/cccl/headers/include/cuda/std/__type_traits/is_primary_template.h +119 -0
  918. cuda/cccl/headers/include/cuda/std/__type_traits/is_reference.h +95 -0
  919. cuda/cccl/headers/include/cuda/std/__type_traits/is_reference_wrapper.h +50 -0
  920. cuda/cccl/headers/include/cuda/std/__type_traits/is_referenceable.h +55 -0
  921. cuda/cccl/headers/include/cuda/std/__type_traits/is_same.h +84 -0
  922. cuda/cccl/headers/include/cuda/std/__type_traits/is_scalar.h +60 -0
  923. cuda/cccl/headers/include/cuda/std/__type_traits/is_scoped_enum.h +49 -0
  924. cuda/cccl/headers/include/cuda/std/__type_traits/is_signed.h +65 -0
  925. cuda/cccl/headers/include/cuda/std/__type_traits/is_signed_integer.h +59 -0
  926. cuda/cccl/headers/include/cuda/std/__type_traits/is_standard_layout.h +57 -0
  927. cuda/cccl/headers/include/cuda/std/__type_traits/is_swappable.h +203 -0
  928. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivial.h +56 -0
  929. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_assignable.h +70 -0
  930. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_constructible.h +82 -0
  931. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_assignable.h +60 -0
  932. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_constructible.h +61 -0
  933. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copyable.h +56 -0
  934. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_default_constructible.h +55 -0
  935. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_destructible.h +73 -0
  936. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_assignable.h +60 -0
  937. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_constructible.h +58 -0
  938. cuda/cccl/headers/include/cuda/std/__type_traits/is_unbounded_array.h +43 -0
  939. cuda/cccl/headers/include/cuda/std/__type_traits/is_union.h +57 -0
  940. cuda/cccl/headers/include/cuda/std/__type_traits/is_unsigned.h +66 -0
  941. cuda/cccl/headers/include/cuda/std/__type_traits/is_unsigned_integer.h +59 -0
  942. cuda/cccl/headers/include/cuda/std/__type_traits/is_valid_expansion.h +41 -0
  943. cuda/cccl/headers/include/cuda/std/__type_traits/is_void.h +55 -0
  944. cuda/cccl/headers/include/cuda/std/__type_traits/is_volatile.h +56 -0
  945. cuda/cccl/headers/include/cuda/std/__type_traits/lazy.h +35 -0
  946. cuda/cccl/headers/include/cuda/std/__type_traits/make_const_lvalue_ref.h +36 -0
  947. cuda/cccl/headers/include/cuda/std/__type_traits/make_nbit_int.h +107 -0
  948. cuda/cccl/headers/include/cuda/std/__type_traits/make_signed.h +140 -0
  949. cuda/cccl/headers/include/cuda/std/__type_traits/make_unsigned.h +151 -0
  950. cuda/cccl/headers/include/cuda/std/__type_traits/maybe_const.h +36 -0
  951. cuda/cccl/headers/include/cuda/std/__type_traits/nat.h +39 -0
  952. cuda/cccl/headers/include/cuda/std/__type_traits/negation.h +44 -0
  953. cuda/cccl/headers/include/cuda/std/__type_traits/num_bits.h +123 -0
  954. cuda/cccl/headers/include/cuda/std/__type_traits/promote.h +163 -0
  955. cuda/cccl/headers/include/cuda/std/__type_traits/rank.h +60 -0
  956. cuda/cccl/headers/include/cuda/std/__type_traits/reference_constructs_from_temporary.h +57 -0
  957. cuda/cccl/headers/include/cuda/std/__type_traits/reference_converts_from_temporary.h +56 -0
  958. cuda/cccl/headers/include/cuda/std/__type_traits/remove_all_extents.h +66 -0
  959. cuda/cccl/headers/include/cuda/std/__type_traits/remove_const.h +59 -0
  960. cuda/cccl/headers/include/cuda/std/__type_traits/remove_const_ref.h +37 -0
  961. cuda/cccl/headers/include/cuda/std/__type_traits/remove_cv.h +57 -0
  962. cuda/cccl/headers/include/cuda/std/__type_traits/remove_cvref.h +57 -0
  963. cuda/cccl/headers/include/cuda/std/__type_traits/remove_extent.h +65 -0
  964. cuda/cccl/headers/include/cuda/std/__type_traits/remove_pointer.h +73 -0
  965. cuda/cccl/headers/include/cuda/std/__type_traits/remove_reference.h +72 -0
  966. cuda/cccl/headers/include/cuda/std/__type_traits/remove_volatile.h +58 -0
  967. cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +47 -0
  968. cuda/cccl/headers/include/cuda/std/__type_traits/type_identity.h +40 -0
  969. cuda/cccl/headers/include/cuda/std/__type_traits/type_list.h +1069 -0
  970. cuda/cccl/headers/include/cuda/std/__type_traits/type_set.h +132 -0
  971. cuda/cccl/headers/include/cuda/std/__type_traits/underlying_type.h +66 -0
  972. cuda/cccl/headers/include/cuda/std/__type_traits/void_t.h +34 -0
  973. cuda/cccl/headers/include/cuda/std/__utility/as_const.h +52 -0
  974. cuda/cccl/headers/include/cuda/std/__utility/auto_cast.h +32 -0
  975. cuda/cccl/headers/include/cuda/std/__utility/cmp.h +116 -0
  976. cuda/cccl/headers/include/cuda/std/__utility/convert_to_integral.h +103 -0
  977. cuda/cccl/headers/include/cuda/std/__utility/declval.h +63 -0
  978. cuda/cccl/headers/include/cuda/std/__utility/exception_guard.h +162 -0
  979. cuda/cccl/headers/include/cuda/std/__utility/exchange.h +46 -0
  980. cuda/cccl/headers/include/cuda/std/__utility/forward.h +59 -0
  981. cuda/cccl/headers/include/cuda/std/__utility/forward_like.h +56 -0
  982. cuda/cccl/headers/include/cuda/std/__utility/in_place.h +77 -0
  983. cuda/cccl/headers/include/cuda/std/__utility/integer_sequence.h +251 -0
  984. cuda/cccl/headers/include/cuda/std/__utility/monostate.h +99 -0
  985. cuda/cccl/headers/include/cuda/std/__utility/move.h +75 -0
  986. cuda/cccl/headers/include/cuda/std/__utility/pair.h +808 -0
  987. cuda/cccl/headers/include/cuda/std/__utility/piecewise_construct.h +37 -0
  988. cuda/cccl/headers/include/cuda/std/__utility/pod_tuple.h +763 -0
  989. cuda/cccl/headers/include/cuda/std/__utility/priority_tag.h +40 -0
  990. cuda/cccl/headers/include/cuda/std/__utility/rel_ops.h +63 -0
  991. cuda/cccl/headers/include/cuda/std/__utility/swap.h +65 -0
  992. cuda/cccl/headers/include/cuda/std/__utility/to_underlying.h +40 -0
  993. cuda/cccl/headers/include/cuda/std/__utility/typeid.h +425 -0
  994. cuda/cccl/headers/include/cuda/std/__utility/unreachable.h +37 -0
  995. cuda/cccl/headers/include/cuda/std/array +527 -0
  996. cuda/cccl/headers/include/cuda/std/atomic +823 -0
  997. cuda/cccl/headers/include/cuda/std/barrier +43 -0
  998. cuda/cccl/headers/include/cuda/std/bit +35 -0
  999. cuda/cccl/headers/include/cuda/std/bitset +1026 -0
  1000. cuda/cccl/headers/include/cuda/std/cassert +28 -0
  1001. cuda/cccl/headers/include/cuda/std/ccomplex +15 -0
  1002. cuda/cccl/headers/include/cuda/std/cfloat +59 -0
  1003. cuda/cccl/headers/include/cuda/std/chrono +26 -0
  1004. cuda/cccl/headers/include/cuda/std/climits +61 -0
  1005. cuda/cccl/headers/include/cuda/std/cmath +25 -0
  1006. cuda/cccl/headers/include/cuda/std/complex +25 -0
  1007. cuda/cccl/headers/include/cuda/std/concepts +48 -0
  1008. cuda/cccl/headers/include/cuda/std/cstddef +28 -0
  1009. cuda/cccl/headers/include/cuda/std/cstdint +178 -0
  1010. cuda/cccl/headers/include/cuda/std/cstdlib +30 -0
  1011. cuda/cccl/headers/include/cuda/std/cstring +111 -0
  1012. cuda/cccl/headers/include/cuda/std/ctime +147 -0
  1013. cuda/cccl/headers/include/cuda/std/detail/__config +45 -0
  1014. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +258 -0
  1015. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/algorithm +2692 -0
  1016. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/chrono +3689 -0
  1017. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/cmath +685 -0
  1018. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/complex +1610 -0
  1019. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/iosfwd +128 -0
  1020. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/optional +1786 -0
  1021. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/stdexcept +120 -0
  1022. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/tuple +1378 -0
  1023. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/variant +2160 -0
  1024. cuda/cccl/headers/include/cuda/std/execution +27 -0
  1025. cuda/cccl/headers/include/cuda/std/expected +30 -0
  1026. cuda/cccl/headers/include/cuda/std/functional +56 -0
  1027. cuda/cccl/headers/include/cuda/std/initializer_list +36 -0
  1028. cuda/cccl/headers/include/cuda/std/inplace_vector +2171 -0
  1029. cuda/cccl/headers/include/cuda/std/iterator +70 -0
  1030. cuda/cccl/headers/include/cuda/std/latch +34 -0
  1031. cuda/cccl/headers/include/cuda/std/limits +28 -0
  1032. cuda/cccl/headers/include/cuda/std/linalg +30 -0
  1033. cuda/cccl/headers/include/cuda/std/mdspan +38 -0
  1034. cuda/cccl/headers/include/cuda/std/memory +39 -0
  1035. cuda/cccl/headers/include/cuda/std/numbers +335 -0
  1036. cuda/cccl/headers/include/cuda/std/numeric +41 -0
  1037. cuda/cccl/headers/include/cuda/std/optional +25 -0
  1038. cuda/cccl/headers/include/cuda/std/ranges +68 -0
  1039. cuda/cccl/headers/include/cuda/std/ratio +417 -0
  1040. cuda/cccl/headers/include/cuda/std/semaphore +31 -0
  1041. cuda/cccl/headers/include/cuda/std/source_location +83 -0
  1042. cuda/cccl/headers/include/cuda/std/span +640 -0
  1043. cuda/cccl/headers/include/cuda/std/string_view +814 -0
  1044. cuda/cccl/headers/include/cuda/std/tuple +26 -0
  1045. cuda/cccl/headers/include/cuda/std/type_traits +176 -0
  1046. cuda/cccl/headers/include/cuda/std/utility +70 -0
  1047. cuda/cccl/headers/include/cuda/std/variant +25 -0
  1048. cuda/cccl/headers/include/cuda/std/version +245 -0
  1049. cuda/cccl/headers/include/cuda/stream_ref +54 -0
  1050. cuda/cccl/headers/include/cuda/type_traits +27 -0
  1051. cuda/cccl/headers/include/cuda/version +16 -0
  1052. cuda/cccl/headers/include/cuda/warp +28 -0
  1053. cuda/cccl/headers/include/cuda/work_stealing +26 -0
  1054. cuda/cccl/headers/include/nv/detail/__preprocessor +169 -0
  1055. cuda/cccl/headers/include/nv/detail/__target_macros +599 -0
  1056. cuda/cccl/headers/include/nv/target +229 -0
  1057. cuda/cccl/headers/include/thrust/addressof.h +22 -0
  1058. cuda/cccl/headers/include/thrust/adjacent_difference.h +254 -0
  1059. cuda/cccl/headers/include/thrust/advance.h +59 -0
  1060. cuda/cccl/headers/include/thrust/allocate_unique.h +299 -0
  1061. cuda/cccl/headers/include/thrust/binary_search.h +1910 -0
  1062. cuda/cccl/headers/include/thrust/complex.h +859 -0
  1063. cuda/cccl/headers/include/thrust/copy.h +506 -0
  1064. cuda/cccl/headers/include/thrust/count.h +245 -0
  1065. cuda/cccl/headers/include/thrust/detail/adjacent_difference.inl +95 -0
  1066. cuda/cccl/headers/include/thrust/detail/algorithm_wrapper.h +37 -0
  1067. cuda/cccl/headers/include/thrust/detail/alignment.h +81 -0
  1068. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +350 -0
  1069. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.inl +371 -0
  1070. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +45 -0
  1071. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.inl +242 -0
  1072. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +39 -0
  1073. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.inl +137 -0
  1074. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +39 -0
  1075. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.inl +99 -0
  1076. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +53 -0
  1077. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.inl +68 -0
  1078. cuda/cccl/headers/include/thrust/detail/allocator/no_throw_allocator.h +76 -0
  1079. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +102 -0
  1080. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.inl +86 -0
  1081. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +79 -0
  1082. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.inl +81 -0
  1083. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.h +39 -0
  1084. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.inl +98 -0
  1085. cuda/cccl/headers/include/thrust/detail/allocator_aware_execution_policy.h +99 -0
  1086. cuda/cccl/headers/include/thrust/detail/binary_search.inl +525 -0
  1087. cuda/cccl/headers/include/thrust/detail/caching_allocator.h +47 -0
  1088. cuda/cccl/headers/include/thrust/detail/complex/arithmetic.h +255 -0
  1089. cuda/cccl/headers/include/thrust/detail/complex/c99math.h +64 -0
  1090. cuda/cccl/headers/include/thrust/detail/complex/catrig.h +875 -0
  1091. cuda/cccl/headers/include/thrust/detail/complex/catrigf.h +589 -0
  1092. cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +233 -0
  1093. cuda/cccl/headers/include/thrust/detail/complex/ccoshf.h +161 -0
  1094. cuda/cccl/headers/include/thrust/detail/complex/cexp.h +195 -0
  1095. cuda/cccl/headers/include/thrust/detail/complex/cexpf.h +173 -0
  1096. cuda/cccl/headers/include/thrust/detail/complex/clog.h +223 -0
  1097. cuda/cccl/headers/include/thrust/detail/complex/clogf.h +210 -0
  1098. cuda/cccl/headers/include/thrust/detail/complex/complex.inl +263 -0
  1099. cuda/cccl/headers/include/thrust/detail/complex/cpow.h +50 -0
  1100. cuda/cccl/headers/include/thrust/detail/complex/cproj.h +81 -0
  1101. cuda/cccl/headers/include/thrust/detail/complex/csinh.h +228 -0
  1102. cuda/cccl/headers/include/thrust/detail/complex/csinhf.h +168 -0
  1103. cuda/cccl/headers/include/thrust/detail/complex/csqrt.h +178 -0
  1104. cuda/cccl/headers/include/thrust/detail/complex/csqrtf.h +174 -0
  1105. cuda/cccl/headers/include/thrust/detail/complex/ctanh.h +208 -0
  1106. cuda/cccl/headers/include/thrust/detail/complex/ctanhf.h +133 -0
  1107. cuda/cccl/headers/include/thrust/detail/complex/math_private.h +138 -0
  1108. cuda/cccl/headers/include/thrust/detail/complex/stream.h +73 -0
  1109. cuda/cccl/headers/include/thrust/detail/config/compiler.h +38 -0
  1110. cuda/cccl/headers/include/thrust/detail/config/config.h +43 -0
  1111. cuda/cccl/headers/include/thrust/detail/config/cpp_dialect.h +78 -0
  1112. cuda/cccl/headers/include/thrust/detail/config/device_system.h +55 -0
  1113. cuda/cccl/headers/include/thrust/detail/config/host_system.h +48 -0
  1114. cuda/cccl/headers/include/thrust/detail/config/memory_resource.h +41 -0
  1115. cuda/cccl/headers/include/thrust/detail/config/namespace.h +162 -0
  1116. cuda/cccl/headers/include/thrust/detail/config/simple_defines.h +48 -0
  1117. cuda/cccl/headers/include/thrust/detail/config.h +36 -0
  1118. cuda/cccl/headers/include/thrust/detail/contiguous_storage.h +228 -0
  1119. cuda/cccl/headers/include/thrust/detail/contiguous_storage.inl +273 -0
  1120. cuda/cccl/headers/include/thrust/detail/copy.h +72 -0
  1121. cuda/cccl/headers/include/thrust/detail/copy.inl +129 -0
  1122. cuda/cccl/headers/include/thrust/detail/copy_if.h +62 -0
  1123. cuda/cccl/headers/include/thrust/detail/copy_if.inl +102 -0
  1124. cuda/cccl/headers/include/thrust/detail/count.h +55 -0
  1125. cuda/cccl/headers/include/thrust/detail/count.inl +89 -0
  1126. cuda/cccl/headers/include/thrust/detail/device_delete.inl +52 -0
  1127. cuda/cccl/headers/include/thrust/detail/device_free.inl +47 -0
  1128. cuda/cccl/headers/include/thrust/detail/device_malloc.inl +60 -0
  1129. cuda/cccl/headers/include/thrust/detail/device_new.inl +61 -0
  1130. cuda/cccl/headers/include/thrust/detail/device_ptr.inl +48 -0
  1131. cuda/cccl/headers/include/thrust/detail/equal.inl +93 -0
  1132. cuda/cccl/headers/include/thrust/detail/event_error.h +160 -0
  1133. cuda/cccl/headers/include/thrust/detail/execute_with_allocator.h +80 -0
  1134. cuda/cccl/headers/include/thrust/detail/execute_with_allocator_fwd.h +61 -0
  1135. cuda/cccl/headers/include/thrust/detail/execution_policy.h +80 -0
  1136. cuda/cccl/headers/include/thrust/detail/extrema.inl +184 -0
  1137. cuda/cccl/headers/include/thrust/detail/fill.inl +86 -0
  1138. cuda/cccl/headers/include/thrust/detail/find.inl +113 -0
  1139. cuda/cccl/headers/include/thrust/detail/for_each.inl +84 -0
  1140. cuda/cccl/headers/include/thrust/detail/function.h +49 -0
  1141. cuda/cccl/headers/include/thrust/detail/functional/actor.h +214 -0
  1142. cuda/cccl/headers/include/thrust/detail/functional/operators.h +386 -0
  1143. cuda/cccl/headers/include/thrust/detail/gather.inl +173 -0
  1144. cuda/cccl/headers/include/thrust/detail/generate.inl +86 -0
  1145. cuda/cccl/headers/include/thrust/detail/get_iterator_value.h +62 -0
  1146. cuda/cccl/headers/include/thrust/detail/inner_product.inl +118 -0
  1147. cuda/cccl/headers/include/thrust/detail/integer_math.h +130 -0
  1148. cuda/cccl/headers/include/thrust/detail/internal_functional.h +285 -0
  1149. cuda/cccl/headers/include/thrust/detail/logical.inl +113 -0
  1150. cuda/cccl/headers/include/thrust/detail/malloc_and_free.h +92 -0
  1151. cuda/cccl/headers/include/thrust/detail/malloc_and_free_fwd.h +45 -0
  1152. cuda/cccl/headers/include/thrust/detail/memory_algorithms.h +209 -0
  1153. cuda/cccl/headers/include/thrust/detail/memory_wrapper.h +40 -0
  1154. cuda/cccl/headers/include/thrust/detail/merge.inl +276 -0
  1155. cuda/cccl/headers/include/thrust/detail/mismatch.inl +94 -0
  1156. cuda/cccl/headers/include/thrust/detail/mpl/math.h +164 -0
  1157. cuda/cccl/headers/include/thrust/detail/numeric_wrapper.h +37 -0
  1158. cuda/cccl/headers/include/thrust/detail/overlapped_copy.h +124 -0
  1159. cuda/cccl/headers/include/thrust/detail/partition.inl +378 -0
  1160. cuda/cccl/headers/include/thrust/detail/pointer.h +217 -0
  1161. cuda/cccl/headers/include/thrust/detail/pointer.inl +172 -0
  1162. cuda/cccl/headers/include/thrust/detail/preprocessor.h +652 -0
  1163. cuda/cccl/headers/include/thrust/detail/random_bijection.h +177 -0
  1164. cuda/cccl/headers/include/thrust/detail/range/head_flags.h +116 -0
  1165. cuda/cccl/headers/include/thrust/detail/range/tail_flags.h +130 -0
  1166. cuda/cccl/headers/include/thrust/detail/raw_pointer_cast.h +52 -0
  1167. cuda/cccl/headers/include/thrust/detail/raw_reference_cast.h +189 -0
  1168. cuda/cccl/headers/include/thrust/detail/reduce.inl +377 -0
  1169. cuda/cccl/headers/include/thrust/detail/reference.h +500 -0
  1170. cuda/cccl/headers/include/thrust/detail/reference_forward_declaration.h +35 -0
  1171. cuda/cccl/headers/include/thrust/detail/remove.inl +213 -0
  1172. cuda/cccl/headers/include/thrust/detail/replace.inl +231 -0
  1173. cuda/cccl/headers/include/thrust/detail/reverse.inl +88 -0
  1174. cuda/cccl/headers/include/thrust/detail/scan.inl +518 -0
  1175. cuda/cccl/headers/include/thrust/detail/scatter.inl +157 -0
  1176. cuda/cccl/headers/include/thrust/detail/seq.h +54 -0
  1177. cuda/cccl/headers/include/thrust/detail/sequence.inl +109 -0
  1178. cuda/cccl/headers/include/thrust/detail/set_operations.inl +981 -0
  1179. cuda/cccl/headers/include/thrust/detail/shuffle.inl +86 -0
  1180. cuda/cccl/headers/include/thrust/detail/sort.inl +373 -0
  1181. cuda/cccl/headers/include/thrust/detail/static_assert.h +58 -0
  1182. cuda/cccl/headers/include/thrust/detail/static_map.h +167 -0
  1183. cuda/cccl/headers/include/thrust/detail/swap_ranges.inl +65 -0
  1184. cuda/cccl/headers/include/thrust/detail/tabulate.inl +62 -0
  1185. cuda/cccl/headers/include/thrust/detail/temporary_array.h +153 -0
  1186. cuda/cccl/headers/include/thrust/detail/temporary_array.inl +138 -0
  1187. cuda/cccl/headers/include/thrust/detail/temporary_buffer.h +81 -0
  1188. cuda/cccl/headers/include/thrust/detail/transform.inl +250 -0
  1189. cuda/cccl/headers/include/thrust/detail/transform_reduce.inl +69 -0
  1190. cuda/cccl/headers/include/thrust/detail/transform_scan.inl +161 -0
  1191. cuda/cccl/headers/include/thrust/detail/trivial_sequence.h +131 -0
  1192. cuda/cccl/headers/include/thrust/detail/tuple_meta_transform.h +61 -0
  1193. cuda/cccl/headers/include/thrust/detail/type_deduction.h +62 -0
  1194. cuda/cccl/headers/include/thrust/detail/type_traits/has_member_function.h +47 -0
  1195. cuda/cccl/headers/include/thrust/detail/type_traits/has_nested_type.h +43 -0
  1196. cuda/cccl/headers/include/thrust/detail/type_traits/is_call_possible.h +167 -0
  1197. cuda/cccl/headers/include/thrust/detail/type_traits/is_commutative.h +69 -0
  1198. cuda/cccl/headers/include/thrust/detail/type_traits/is_metafunction_defined.h +39 -0
  1199. cuda/cccl/headers/include/thrust/detail/type_traits/is_thrust_pointer.h +60 -0
  1200. cuda/cccl/headers/include/thrust/detail/type_traits/iterator/is_discard_iterator.h +44 -0
  1201. cuda/cccl/headers/include/thrust/detail/type_traits/iterator/is_output_iterator.h +46 -0
  1202. cuda/cccl/headers/include/thrust/detail/type_traits/minimum_type.h +89 -0
  1203. cuda/cccl/headers/include/thrust/detail/type_traits/pointer_traits.h +332 -0
  1204. cuda/cccl/headers/include/thrust/detail/type_traits.h +136 -0
  1205. cuda/cccl/headers/include/thrust/detail/uninitialized_copy.inl +90 -0
  1206. cuda/cccl/headers/include/thrust/detail/uninitialized_fill.inl +86 -0
  1207. cuda/cccl/headers/include/thrust/detail/unique.inl +373 -0
  1208. cuda/cccl/headers/include/thrust/detail/use_default.h +34 -0
  1209. cuda/cccl/headers/include/thrust/detail/util/align.h +59 -0
  1210. cuda/cccl/headers/include/thrust/detail/vector_base.h +630 -0
  1211. cuda/cccl/headers/include/thrust/detail/vector_base.inl +1242 -0
  1212. cuda/cccl/headers/include/thrust/device_allocator.h +134 -0
  1213. cuda/cccl/headers/include/thrust/device_delete.h +59 -0
  1214. cuda/cccl/headers/include/thrust/device_free.h +72 -0
  1215. cuda/cccl/headers/include/thrust/device_make_unique.h +56 -0
  1216. cuda/cccl/headers/include/thrust/device_malloc.h +108 -0
  1217. cuda/cccl/headers/include/thrust/device_malloc_allocator.h +190 -0
  1218. cuda/cccl/headers/include/thrust/device_new.h +91 -0
  1219. cuda/cccl/headers/include/thrust/device_new_allocator.h +179 -0
  1220. cuda/cccl/headers/include/thrust/device_ptr.h +202 -0
  1221. cuda/cccl/headers/include/thrust/device_reference.h +986 -0
  1222. cuda/cccl/headers/include/thrust/device_vector.h +574 -0
  1223. cuda/cccl/headers/include/thrust/distance.h +43 -0
  1224. cuda/cccl/headers/include/thrust/equal.h +247 -0
  1225. cuda/cccl/headers/include/thrust/execution_policy.h +384 -0
  1226. cuda/cccl/headers/include/thrust/extrema.h +657 -0
  1227. cuda/cccl/headers/include/thrust/fill.h +201 -0
  1228. cuda/cccl/headers/include/thrust/find.h +382 -0
  1229. cuda/cccl/headers/include/thrust/for_each.h +261 -0
  1230. cuda/cccl/headers/include/thrust/functional.h +396 -0
  1231. cuda/cccl/headers/include/thrust/gather.h +464 -0
  1232. cuda/cccl/headers/include/thrust/generate.h +193 -0
  1233. cuda/cccl/headers/include/thrust/host_vector.h +576 -0
  1234. cuda/cccl/headers/include/thrust/inner_product.h +264 -0
  1235. cuda/cccl/headers/include/thrust/iterator/constant_iterator.h +219 -0
  1236. cuda/cccl/headers/include/thrust/iterator/counting_iterator.h +335 -0
  1237. cuda/cccl/headers/include/thrust/iterator/detail/any_assign.h +48 -0
  1238. cuda/cccl/headers/include/thrust/iterator/detail/any_system_tag.h +43 -0
  1239. cuda/cccl/headers/include/thrust/iterator/detail/device_system_tag.h +38 -0
  1240. cuda/cccl/headers/include/thrust/iterator/detail/host_system_tag.h +38 -0
  1241. cuda/cccl/headers/include/thrust/iterator/detail/iterator_adaptor_base.h +81 -0
  1242. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_system.h +51 -0
  1243. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_traversal.h +62 -0
  1244. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h +57 -0
  1245. cuda/cccl/headers/include/thrust/iterator/detail/iterator_facade_category.h +199 -0
  1246. cuda/cccl/headers/include/thrust/iterator/detail/iterator_traversal_tags.h +50 -0
  1247. cuda/cccl/headers/include/thrust/iterator/detail/minimum_system.h +53 -0
  1248. cuda/cccl/headers/include/thrust/iterator/detail/normal_iterator.h +69 -0
  1249. cuda/cccl/headers/include/thrust/iterator/detail/retag.h +104 -0
  1250. cuda/cccl/headers/include/thrust/iterator/detail/tagged_iterator.h +81 -0
  1251. cuda/cccl/headers/include/thrust/iterator/detail/tuple_of_iterator_references.h +174 -0
  1252. cuda/cccl/headers/include/thrust/iterator/discard_iterator.h +164 -0
  1253. cuda/cccl/headers/include/thrust/iterator/iterator_adaptor.h +251 -0
  1254. cuda/cccl/headers/include/thrust/iterator/iterator_categories.h +215 -0
  1255. cuda/cccl/headers/include/thrust/iterator/iterator_facade.h +660 -0
  1256. cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +245 -0
  1257. cuda/cccl/headers/include/thrust/iterator/offset_iterator.h +192 -0
  1258. cuda/cccl/headers/include/thrust/iterator/permutation_iterator.h +204 -0
  1259. cuda/cccl/headers/include/thrust/iterator/retag.h +74 -0
  1260. cuda/cccl/headers/include/thrust/iterator/reverse_iterator.h +221 -0
  1261. cuda/cccl/headers/include/thrust/iterator/shuffle_iterator.h +184 -0
  1262. cuda/cccl/headers/include/thrust/iterator/strided_iterator.h +152 -0
  1263. cuda/cccl/headers/include/thrust/iterator/tabulate_output_iterator.h +149 -0
  1264. cuda/cccl/headers/include/thrust/iterator/transform_input_output_iterator.h +226 -0
  1265. cuda/cccl/headers/include/thrust/iterator/transform_iterator.h +351 -0
  1266. cuda/cccl/headers/include/thrust/iterator/transform_output_iterator.h +190 -0
  1267. cuda/cccl/headers/include/thrust/iterator/zip_iterator.h +357 -0
  1268. cuda/cccl/headers/include/thrust/logical.h +290 -0
  1269. cuda/cccl/headers/include/thrust/memory.h +395 -0
  1270. cuda/cccl/headers/include/thrust/merge.h +725 -0
  1271. cuda/cccl/headers/include/thrust/mismatch.h +261 -0
  1272. cuda/cccl/headers/include/thrust/mr/allocator.h +229 -0
  1273. cuda/cccl/headers/include/thrust/mr/device_memory_resource.h +41 -0
  1274. cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +526 -0
  1275. cuda/cccl/headers/include/thrust/mr/disjoint_sync_pool.h +118 -0
  1276. cuda/cccl/headers/include/thrust/mr/disjoint_tls_pool.h +68 -0
  1277. cuda/cccl/headers/include/thrust/mr/fancy_pointer_resource.h +67 -0
  1278. cuda/cccl/headers/include/thrust/mr/host_memory_resource.h +38 -0
  1279. cuda/cccl/headers/include/thrust/mr/memory_resource.h +217 -0
  1280. cuda/cccl/headers/include/thrust/mr/new.h +100 -0
  1281. cuda/cccl/headers/include/thrust/mr/polymorphic_adaptor.h +63 -0
  1282. cuda/cccl/headers/include/thrust/mr/pool.h +526 -0
  1283. cuda/cccl/headers/include/thrust/mr/pool_options.h +174 -0
  1284. cuda/cccl/headers/include/thrust/mr/sync_pool.h +114 -0
  1285. cuda/cccl/headers/include/thrust/mr/tls_pool.h +65 -0
  1286. cuda/cccl/headers/include/thrust/mr/universal_memory_resource.h +29 -0
  1287. cuda/cccl/headers/include/thrust/mr/validator.h +56 -0
  1288. cuda/cccl/headers/include/thrust/pair.h +102 -0
  1289. cuda/cccl/headers/include/thrust/partition.h +1383 -0
  1290. cuda/cccl/headers/include/thrust/per_device_resource.h +98 -0
  1291. cuda/cccl/headers/include/thrust/random/detail/discard_block_engine.inl +184 -0
  1292. cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine.inl +155 -0
  1293. cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine_discard.h +104 -0
  1294. cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine.inl +151 -0
  1295. cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h +53 -0
  1296. cuda/cccl/headers/include/thrust/random/detail/mod.h +101 -0
  1297. cuda/cccl/headers/include/thrust/random/detail/normal_distribution.inl +187 -0
  1298. cuda/cccl/headers/include/thrust/random/detail/normal_distribution_base.h +160 -0
  1299. cuda/cccl/headers/include/thrust/random/detail/random_core_access.h +63 -0
  1300. cuda/cccl/headers/include/thrust/random/detail/subtract_with_carry_engine.inl +201 -0
  1301. cuda/cccl/headers/include/thrust/random/detail/uniform_int_distribution.inl +198 -0
  1302. cuda/cccl/headers/include/thrust/random/detail/uniform_real_distribution.inl +198 -0
  1303. cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine.inl +183 -0
  1304. cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine_max.h +217 -0
  1305. cuda/cccl/headers/include/thrust/random/discard_block_engine.h +240 -0
  1306. cuda/cccl/headers/include/thrust/random/linear_congruential_engine.h +289 -0
  1307. cuda/cccl/headers/include/thrust/random/linear_feedback_shift_engine.h +217 -0
  1308. cuda/cccl/headers/include/thrust/random/normal_distribution.h +257 -0
  1309. cuda/cccl/headers/include/thrust/random/subtract_with_carry_engine.h +247 -0
  1310. cuda/cccl/headers/include/thrust/random/uniform_int_distribution.h +261 -0
  1311. cuda/cccl/headers/include/thrust/random/uniform_real_distribution.h +258 -0
  1312. cuda/cccl/headers/include/thrust/random/xor_combine_engine.h +255 -0
  1313. cuda/cccl/headers/include/thrust/random.h +120 -0
  1314. cuda/cccl/headers/include/thrust/reduce.h +1112 -0
  1315. cuda/cccl/headers/include/thrust/remove.h +768 -0
  1316. cuda/cccl/headers/include/thrust/replace.h +827 -0
  1317. cuda/cccl/headers/include/thrust/reverse.h +213 -0
  1318. cuda/cccl/headers/include/thrust/scan.h +1671 -0
  1319. cuda/cccl/headers/include/thrust/scatter.h +446 -0
  1320. cuda/cccl/headers/include/thrust/sequence.h +277 -0
  1321. cuda/cccl/headers/include/thrust/set_operations.h +3026 -0
  1322. cuda/cccl/headers/include/thrust/shuffle.h +182 -0
  1323. cuda/cccl/headers/include/thrust/sort.h +1320 -0
  1324. cuda/cccl/headers/include/thrust/swap.h +147 -0
  1325. cuda/cccl/headers/include/thrust/system/cpp/detail/adjacent_difference.h +30 -0
  1326. cuda/cccl/headers/include/thrust/system/cpp/detail/assign_value.h +30 -0
  1327. cuda/cccl/headers/include/thrust/system/cpp/detail/binary_search.h +32 -0
  1328. cuda/cccl/headers/include/thrust/system/cpp/detail/copy.h +30 -0
  1329. cuda/cccl/headers/include/thrust/system/cpp/detail/copy_if.h +30 -0
  1330. cuda/cccl/headers/include/thrust/system/cpp/detail/count.h +29 -0
  1331. cuda/cccl/headers/include/thrust/system/cpp/detail/equal.h +29 -0
  1332. cuda/cccl/headers/include/thrust/system/cpp/detail/execution_policy.h +90 -0
  1333. cuda/cccl/headers/include/thrust/system/cpp/detail/extrema.h +30 -0
  1334. cuda/cccl/headers/include/thrust/system/cpp/detail/fill.h +29 -0
  1335. cuda/cccl/headers/include/thrust/system/cpp/detail/find.h +30 -0
  1336. cuda/cccl/headers/include/thrust/system/cpp/detail/for_each.h +30 -0
  1337. cuda/cccl/headers/include/thrust/system/cpp/detail/gather.h +29 -0
  1338. cuda/cccl/headers/include/thrust/system/cpp/detail/generate.h +29 -0
  1339. cuda/cccl/headers/include/thrust/system/cpp/detail/get_value.h +30 -0
  1340. cuda/cccl/headers/include/thrust/system/cpp/detail/inner_product.h +29 -0
  1341. cuda/cccl/headers/include/thrust/system/cpp/detail/iter_swap.h +30 -0
  1342. cuda/cccl/headers/include/thrust/system/cpp/detail/logical.h +29 -0
  1343. cuda/cccl/headers/include/thrust/system/cpp/detail/malloc_and_free.h +30 -0
  1344. cuda/cccl/headers/include/thrust/system/cpp/detail/memory.inl +60 -0
  1345. cuda/cccl/headers/include/thrust/system/cpp/detail/merge.h +30 -0
  1346. cuda/cccl/headers/include/thrust/system/cpp/detail/mismatch.h +29 -0
  1347. cuda/cccl/headers/include/thrust/system/cpp/detail/par.h +62 -0
  1348. cuda/cccl/headers/include/thrust/system/cpp/detail/partition.h +30 -0
  1349. cuda/cccl/headers/include/thrust/system/cpp/detail/per_device_resource.h +29 -0
  1350. cuda/cccl/headers/include/thrust/system/cpp/detail/reduce.h +30 -0
  1351. cuda/cccl/headers/include/thrust/system/cpp/detail/reduce_by_key.h +30 -0
  1352. cuda/cccl/headers/include/thrust/system/cpp/detail/remove.h +30 -0
  1353. cuda/cccl/headers/include/thrust/system/cpp/detail/replace.h +29 -0
  1354. cuda/cccl/headers/include/thrust/system/cpp/detail/reverse.h +29 -0
  1355. cuda/cccl/headers/include/thrust/system/cpp/detail/scan.h +30 -0
  1356. cuda/cccl/headers/include/thrust/system/cpp/detail/scan_by_key.h +30 -0
  1357. cuda/cccl/headers/include/thrust/system/cpp/detail/scatter.h +29 -0
  1358. cuda/cccl/headers/include/thrust/system/cpp/detail/sequence.h +29 -0
  1359. cuda/cccl/headers/include/thrust/system/cpp/detail/set_operations.h +30 -0
  1360. cuda/cccl/headers/include/thrust/system/cpp/detail/sort.h +30 -0
  1361. cuda/cccl/headers/include/thrust/system/cpp/detail/swap_ranges.h +29 -0
  1362. cuda/cccl/headers/include/thrust/system/cpp/detail/tabulate.h +29 -0
  1363. cuda/cccl/headers/include/thrust/system/cpp/detail/temporary_buffer.h +29 -0
  1364. cuda/cccl/headers/include/thrust/system/cpp/detail/transform.h +29 -0
  1365. cuda/cccl/headers/include/thrust/system/cpp/detail/transform_reduce.h +29 -0
  1366. cuda/cccl/headers/include/thrust/system/cpp/detail/transform_scan.h +29 -0
  1367. cuda/cccl/headers/include/thrust/system/cpp/detail/uninitialized_copy.h +29 -0
  1368. cuda/cccl/headers/include/thrust/system/cpp/detail/uninitialized_fill.h +29 -0
  1369. cuda/cccl/headers/include/thrust/system/cpp/detail/unique.h +30 -0
  1370. cuda/cccl/headers/include/thrust/system/cpp/detail/unique_by_key.h +30 -0
  1371. cuda/cccl/headers/include/thrust/system/cpp/detail/vector.inl +130 -0
  1372. cuda/cccl/headers/include/thrust/system/cpp/execution_policy.h +161 -0
  1373. cuda/cccl/headers/include/thrust/system/cpp/memory.h +109 -0
  1374. cuda/cccl/headers/include/thrust/system/cpp/memory_resource.h +75 -0
  1375. cuda/cccl/headers/include/thrust/system/cpp/pointer.h +119 -0
  1376. cuda/cccl/headers/include/thrust/system/cpp/vector.h +99 -0
  1377. cuda/cccl/headers/include/thrust/system/cuda/config.h +123 -0
  1378. cuda/cccl/headers/include/thrust/system/cuda/detail/adjacent_difference.h +219 -0
  1379. cuda/cccl/headers/include/thrust/system/cuda/detail/assign_value.h +124 -0
  1380. cuda/cccl/headers/include/thrust/system/cuda/detail/binary_search.h +29 -0
  1381. cuda/cccl/headers/include/thrust/system/cuda/detail/cdp_dispatch.h +72 -0
  1382. cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +129 -0
  1383. cuda/cccl/headers/include/thrust/system/cuda/detail/copy_if.h +255 -0
  1384. cuda/cccl/headers/include/thrust/system/cuda/detail/core/agent_launcher.h +289 -0
  1385. cuda/cccl/headers/include/thrust/system/cuda/detail/core/load_iterator.h +58 -0
  1386. cuda/cccl/headers/include/thrust/system/cuda/detail/core/make_load_iterator.h +60 -0
  1387. cuda/cccl/headers/include/thrust/system/cuda/detail/core/triple_chevron_launch.h +191 -0
  1388. cuda/cccl/headers/include/thrust/system/cuda/detail/core/util.h +630 -0
  1389. cuda/cccl/headers/include/thrust/system/cuda/detail/count.h +75 -0
  1390. cuda/cccl/headers/include/thrust/system/cuda/detail/cross_system.h +243 -0
  1391. cuda/cccl/headers/include/thrust/system/cuda/detail/dispatch.h +210 -0
  1392. cuda/cccl/headers/include/thrust/system/cuda/detail/equal.h +64 -0
  1393. cuda/cccl/headers/include/thrust/system/cuda/detail/error.inl +96 -0
  1394. cuda/cccl/headers/include/thrust/system/cuda/detail/execution_policy.h +113 -0
  1395. cuda/cccl/headers/include/thrust/system/cuda/detail/extrema.h +476 -0
  1396. cuda/cccl/headers/include/thrust/system/cuda/detail/fill.h +82 -0
  1397. cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +272 -0
  1398. cuda/cccl/headers/include/thrust/system/cuda/detail/for_each.h +83 -0
  1399. cuda/cccl/headers/include/thrust/system/cuda/detail/gather.h +91 -0
  1400. cuda/cccl/headers/include/thrust/system/cuda/detail/generate.h +85 -0
  1401. cuda/cccl/headers/include/thrust/system/cuda/detail/get_value.h +65 -0
  1402. cuda/cccl/headers/include/thrust/system/cuda/detail/inner_product.h +75 -0
  1403. cuda/cccl/headers/include/thrust/system/cuda/detail/internal/copy_cross_system.h +204 -0
  1404. cuda/cccl/headers/include/thrust/system/cuda/detail/internal/copy_device_to_device.h +98 -0
  1405. cuda/cccl/headers/include/thrust/system/cuda/detail/iter_swap.h +69 -0
  1406. cuda/cccl/headers/include/thrust/system/cuda/detail/logical.h +29 -0
  1407. cuda/cccl/headers/include/thrust/system/cuda/detail/make_unsigned_special.h +61 -0
  1408. cuda/cccl/headers/include/thrust/system/cuda/detail/malloc_and_free.h +121 -0
  1409. cuda/cccl/headers/include/thrust/system/cuda/detail/memory.inl +57 -0
  1410. cuda/cccl/headers/include/thrust/system/cuda/detail/merge.h +228 -0
  1411. cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +217 -0
  1412. cuda/cccl/headers/include/thrust/system/cuda/detail/par.h +237 -0
  1413. cuda/cccl/headers/include/thrust/system/cuda/detail/par_to_seq.h +95 -0
  1414. cuda/cccl/headers/include/thrust/system/cuda/detail/parallel_for.h +81 -0
  1415. cuda/cccl/headers/include/thrust/system/cuda/detail/partition.h +405 -0
  1416. cuda/cccl/headers/include/thrust/system/cuda/detail/per_device_resource.h +72 -0
  1417. cuda/cccl/headers/include/thrust/system/cuda/detail/reduce.h +961 -0
  1418. cuda/cccl/headers/include/thrust/system/cuda/detail/reduce_by_key.h +1000 -0
  1419. cuda/cccl/headers/include/thrust/system/cuda/detail/remove.h +107 -0
  1420. cuda/cccl/headers/include/thrust/system/cuda/detail/replace.h +164 -0
  1421. cuda/cccl/headers/include/thrust/system/cuda/detail/reverse.h +88 -0
  1422. cuda/cccl/headers/include/thrust/system/cuda/detail/scan.h +342 -0
  1423. cuda/cccl/headers/include/thrust/system/cuda/detail/scan_by_key.h +415 -0
  1424. cuda/cccl/headers/include/thrust/system/cuda/detail/scatter.h +79 -0
  1425. cuda/cccl/headers/include/thrust/system/cuda/detail/sequence.h +29 -0
  1426. cuda/cccl/headers/include/thrust/system/cuda/detail/set_operations.h +1736 -0
  1427. cuda/cccl/headers/include/thrust/system/cuda/detail/sort.h +482 -0
  1428. cuda/cccl/headers/include/thrust/system/cuda/detail/swap_ranges.h +75 -0
  1429. cuda/cccl/headers/include/thrust/system/cuda/detail/tabulate.h +75 -0
  1430. cuda/cccl/headers/include/thrust/system/cuda/detail/temporary_buffer.h +132 -0
  1431. cuda/cccl/headers/include/thrust/system/cuda/detail/terminate.h +53 -0
  1432. cuda/cccl/headers/include/thrust/system/cuda/detail/transform.h +403 -0
  1433. cuda/cccl/headers/include/thrust/system/cuda/detail/transform_reduce.h +143 -0
  1434. cuda/cccl/headers/include/thrust/system/cuda/detail/transform_scan.h +119 -0
  1435. cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_copy.h +94 -0
  1436. cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_fill.h +91 -0
  1437. cuda/cccl/headers/include/thrust/system/cuda/detail/unique.h +648 -0
  1438. cuda/cccl/headers/include/thrust/system/cuda/detail/unique_by_key.h +311 -0
  1439. cuda/cccl/headers/include/thrust/system/cuda/detail/util.h +251 -0
  1440. cuda/cccl/headers/include/thrust/system/cuda/error.h +175 -0
  1441. cuda/cccl/headers/include/thrust/system/cuda/execution_policy.h +39 -0
  1442. cuda/cccl/headers/include/thrust/system/cuda/memory.h +122 -0
  1443. cuda/cccl/headers/include/thrust/system/cuda/memory_resource.h +122 -0
  1444. cuda/cccl/headers/include/thrust/system/cuda/pointer.h +140 -0
  1445. cuda/cccl/headers/include/thrust/system/cuda/vector.h +108 -0
  1446. cuda/cccl/headers/include/thrust/system/detail/adl/adjacent_difference.h +51 -0
  1447. cuda/cccl/headers/include/thrust/system/detail/adl/assign_value.h +51 -0
  1448. cuda/cccl/headers/include/thrust/system/detail/adl/binary_search.h +51 -0
  1449. cuda/cccl/headers/include/thrust/system/detail/adl/copy.h +51 -0
  1450. cuda/cccl/headers/include/thrust/system/detail/adl/copy_if.h +52 -0
  1451. cuda/cccl/headers/include/thrust/system/detail/adl/count.h +51 -0
  1452. cuda/cccl/headers/include/thrust/system/detail/adl/equal.h +51 -0
  1453. cuda/cccl/headers/include/thrust/system/detail/adl/extrema.h +51 -0
  1454. cuda/cccl/headers/include/thrust/system/detail/adl/fill.h +51 -0
  1455. cuda/cccl/headers/include/thrust/system/detail/adl/find.h +51 -0
  1456. cuda/cccl/headers/include/thrust/system/detail/adl/for_each.h +51 -0
  1457. cuda/cccl/headers/include/thrust/system/detail/adl/gather.h +51 -0
  1458. cuda/cccl/headers/include/thrust/system/detail/adl/generate.h +51 -0
  1459. cuda/cccl/headers/include/thrust/system/detail/adl/get_value.h +51 -0
  1460. cuda/cccl/headers/include/thrust/system/detail/adl/inner_product.h +51 -0
  1461. cuda/cccl/headers/include/thrust/system/detail/adl/iter_swap.h +51 -0
  1462. cuda/cccl/headers/include/thrust/system/detail/adl/logical.h +51 -0
  1463. cuda/cccl/headers/include/thrust/system/detail/adl/malloc_and_free.h +51 -0
  1464. cuda/cccl/headers/include/thrust/system/detail/adl/merge.h +51 -0
  1465. cuda/cccl/headers/include/thrust/system/detail/adl/mismatch.h +51 -0
  1466. cuda/cccl/headers/include/thrust/system/detail/adl/partition.h +51 -0
  1467. cuda/cccl/headers/include/thrust/system/detail/adl/per_device_resource.h +48 -0
  1468. cuda/cccl/headers/include/thrust/system/detail/adl/reduce.h +51 -0
  1469. cuda/cccl/headers/include/thrust/system/detail/adl/reduce_by_key.h +51 -0
  1470. cuda/cccl/headers/include/thrust/system/detail/adl/remove.h +51 -0
  1471. cuda/cccl/headers/include/thrust/system/detail/adl/replace.h +51 -0
  1472. cuda/cccl/headers/include/thrust/system/detail/adl/reverse.h +51 -0
  1473. cuda/cccl/headers/include/thrust/system/detail/adl/scan.h +51 -0
  1474. cuda/cccl/headers/include/thrust/system/detail/adl/scan_by_key.h +51 -0
  1475. cuda/cccl/headers/include/thrust/system/detail/adl/scatter.h +51 -0
  1476. cuda/cccl/headers/include/thrust/system/detail/adl/sequence.h +51 -0
  1477. cuda/cccl/headers/include/thrust/system/detail/adl/set_operations.h +51 -0
  1478. cuda/cccl/headers/include/thrust/system/detail/adl/sort.h +51 -0
  1479. cuda/cccl/headers/include/thrust/system/detail/adl/swap_ranges.h +51 -0
  1480. cuda/cccl/headers/include/thrust/system/detail/adl/tabulate.h +51 -0
  1481. cuda/cccl/headers/include/thrust/system/detail/adl/temporary_buffer.h +51 -0
  1482. cuda/cccl/headers/include/thrust/system/detail/adl/transform.h +51 -0
  1483. cuda/cccl/headers/include/thrust/system/detail/adl/transform_reduce.h +51 -0
  1484. cuda/cccl/headers/include/thrust/system/detail/adl/transform_scan.h +51 -0
  1485. cuda/cccl/headers/include/thrust/system/detail/adl/uninitialized_copy.h +51 -0
  1486. cuda/cccl/headers/include/thrust/system/detail/adl/uninitialized_fill.h +51 -0
  1487. cuda/cccl/headers/include/thrust/system/detail/adl/unique.h +51 -0
  1488. cuda/cccl/headers/include/thrust/system/detail/adl/unique_by_key.h +51 -0
  1489. cuda/cccl/headers/include/thrust/system/detail/bad_alloc.h +64 -0
  1490. cuda/cccl/headers/include/thrust/system/detail/errno.h +125 -0
  1491. cuda/cccl/headers/include/thrust/system/detail/error_category.inl +302 -0
  1492. cuda/cccl/headers/include/thrust/system/detail/error_code.inl +173 -0
  1493. cuda/cccl/headers/include/thrust/system/detail/error_condition.inl +121 -0
  1494. cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.h +59 -0
  1495. cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.inl +85 -0
  1496. cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.h +167 -0
  1497. cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.inl +391 -0
  1498. cuda/cccl/headers/include/thrust/system/detail/generic/copy.h +51 -0
  1499. cuda/cccl/headers/include/thrust/system/detail/generic/copy.inl +70 -0
  1500. cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.h +64 -0
  1501. cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.inl +152 -0
  1502. cuda/cccl/headers/include/thrust/system/detail/generic/count.h +54 -0
  1503. cuda/cccl/headers/include/thrust/system/detail/generic/count.inl +90 -0
  1504. cuda/cccl/headers/include/thrust/system/detail/generic/equal.h +55 -0
  1505. cuda/cccl/headers/include/thrust/system/detail/generic/equal.inl +66 -0
  1506. cuda/cccl/headers/include/thrust/system/detail/generic/extrema.h +72 -0
  1507. cuda/cccl/headers/include/thrust/system/detail/generic/extrema.inl +258 -0
  1508. cuda/cccl/headers/include/thrust/system/detail/generic/fill.h +60 -0
  1509. cuda/cccl/headers/include/thrust/system/detail/generic/find.h +55 -0
  1510. cuda/cccl/headers/include/thrust/system/detail/generic/find.inl +143 -0
  1511. cuda/cccl/headers/include/thrust/system/detail/generic/for_each.h +64 -0
  1512. cuda/cccl/headers/include/thrust/system/detail/generic/gather.h +79 -0
  1513. cuda/cccl/headers/include/thrust/system/detail/generic/gather.inl +102 -0
  1514. cuda/cccl/headers/include/thrust/system/detail/generic/generate.h +51 -0
  1515. cuda/cccl/headers/include/thrust/system/detail/generic/generate.inl +63 -0
  1516. cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.h +66 -0
  1517. cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.inl +78 -0
  1518. cuda/cccl/headers/include/thrust/system/detail/generic/logical.h +65 -0
  1519. cuda/cccl/headers/include/thrust/system/detail/generic/memory.h +70 -0
  1520. cuda/cccl/headers/include/thrust/system/detail/generic/memory.inl +83 -0
  1521. cuda/cccl/headers/include/thrust/system/detail/generic/merge.h +105 -0
  1522. cuda/cccl/headers/include/thrust/system/detail/generic/merge.inl +154 -0
  1523. cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.h +55 -0
  1524. cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.inl +74 -0
  1525. cuda/cccl/headers/include/thrust/system/detail/generic/partition.h +135 -0
  1526. cuda/cccl/headers/include/thrust/system/detail/generic/partition.inl +213 -0
  1527. cuda/cccl/headers/include/thrust/system/detail/generic/per_device_resource.h +49 -0
  1528. cuda/cccl/headers/include/thrust/system/detail/generic/reduce.h +77 -0
  1529. cuda/cccl/headers/include/thrust/system/detail/generic/reduce.inl +106 -0
  1530. cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.h +89 -0
  1531. cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.inl +192 -0
  1532. cuda/cccl/headers/include/thrust/system/detail/generic/remove.h +92 -0
  1533. cuda/cccl/headers/include/thrust/system/detail/generic/remove.inl +127 -0
  1534. cuda/cccl/headers/include/thrust/system/detail/generic/replace.h +101 -0
  1535. cuda/cccl/headers/include/thrust/system/detail/generic/replace.inl +181 -0
  1536. cuda/cccl/headers/include/thrust/system/detail/generic/reverse.h +54 -0
  1537. cuda/cccl/headers/include/thrust/system/detail/generic/reverse.inl +72 -0
  1538. cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.h +78 -0
  1539. cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.inl +141 -0
  1540. cuda/cccl/headers/include/thrust/system/detail/generic/scan.h +78 -0
  1541. cuda/cccl/headers/include/thrust/system/detail/generic/scan.inl +91 -0
  1542. cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.h +132 -0
  1543. cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.inl +238 -0
  1544. cuda/cccl/headers/include/thrust/system/detail/generic/scatter.h +79 -0
  1545. cuda/cccl/headers/include/thrust/system/detail/generic/scatter.inl +91 -0
  1546. cuda/cccl/headers/include/thrust/system/detail/generic/select_system.h +96 -0
  1547. cuda/cccl/headers/include/thrust/system/detail/generic/sequence.h +55 -0
  1548. cuda/cccl/headers/include/thrust/system/detail/generic/sequence.inl +95 -0
  1549. cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.h +288 -0
  1550. cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.inl +482 -0
  1551. cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.h +60 -0
  1552. cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.inl +131 -0
  1553. cuda/cccl/headers/include/thrust/system/detail/generic/sort.h +119 -0
  1554. cuda/cccl/headers/include/thrust/system/detail/generic/sort.inl +181 -0
  1555. cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.h +50 -0
  1556. cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.inl +82 -0
  1557. cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.h +47 -0
  1558. cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.inl +60 -0
  1559. cuda/cccl/headers/include/thrust/system/detail/generic/tag.h +53 -0
  1560. cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.h +60 -0
  1561. cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.inl +88 -0
  1562. cuda/cccl/headers/include/thrust/system/detail/generic/transform.h +109 -0
  1563. cuda/cccl/headers/include/thrust/system/detail/generic/transform.inl +185 -0
  1564. cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.h +56 -0
  1565. cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.inl +62 -0
  1566. cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.h +86 -0
  1567. cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.inl +119 -0
  1568. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.h +51 -0
  1569. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.inl +172 -0
  1570. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.h +51 -0
  1571. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.inl +121 -0
  1572. cuda/cccl/headers/include/thrust/system/detail/generic/unique.h +77 -0
  1573. cuda/cccl/headers/include/thrust/system/detail/generic/unique.inl +119 -0
  1574. cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.h +87 -0
  1575. cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.inl +132 -0
  1576. cuda/cccl/headers/include/thrust/system/detail/internal/decompose.h +123 -0
  1577. cuda/cccl/headers/include/thrust/system/detail/sequential/adjacent_difference.h +76 -0
  1578. cuda/cccl/headers/include/thrust/system/detail/sequential/assign_value.h +48 -0
  1579. cuda/cccl/headers/include/thrust/system/detail/sequential/binary_search.h +142 -0
  1580. cuda/cccl/headers/include/thrust/system/detail/sequential/copy.h +55 -0
  1581. cuda/cccl/headers/include/thrust/system/detail/sequential/copy.inl +125 -0
  1582. cuda/cccl/headers/include/thrust/system/detail/sequential/copy_backward.h +55 -0
  1583. cuda/cccl/headers/include/thrust/system/detail/sequential/copy_if.h +77 -0
  1584. cuda/cccl/headers/include/thrust/system/detail/sequential/count.h +29 -0
  1585. cuda/cccl/headers/include/thrust/system/detail/sequential/equal.h +29 -0
  1586. cuda/cccl/headers/include/thrust/system/detail/sequential/execution_policy.h +78 -0
  1587. cuda/cccl/headers/include/thrust/system/detail/sequential/extrema.h +116 -0
  1588. cuda/cccl/headers/include/thrust/system/detail/sequential/fill.h +29 -0
  1589. cuda/cccl/headers/include/thrust/system/detail/sequential/find.h +68 -0
  1590. cuda/cccl/headers/include/thrust/system/detail/sequential/for_each.h +80 -0
  1591. cuda/cccl/headers/include/thrust/system/detail/sequential/gather.h +29 -0
  1592. cuda/cccl/headers/include/thrust/system/detail/sequential/general_copy.h +129 -0
  1593. cuda/cccl/headers/include/thrust/system/detail/sequential/generate.h +29 -0
  1594. cuda/cccl/headers/include/thrust/system/detail/sequential/get_value.h +49 -0
  1595. cuda/cccl/headers/include/thrust/system/detail/sequential/inner_product.h +29 -0
  1596. cuda/cccl/headers/include/thrust/system/detail/sequential/insertion_sort.h +147 -0
  1597. cuda/cccl/headers/include/thrust/system/detail/sequential/iter_swap.h +51 -0
  1598. cuda/cccl/headers/include/thrust/system/detail/sequential/logical.h +29 -0
  1599. cuda/cccl/headers/include/thrust/system/detail/sequential/malloc_and_free.h +56 -0
  1600. cuda/cccl/headers/include/thrust/system/detail/sequential/merge.h +81 -0
  1601. cuda/cccl/headers/include/thrust/system/detail/sequential/merge.inl +151 -0
  1602. cuda/cccl/headers/include/thrust/system/detail/sequential/mismatch.h +29 -0
  1603. cuda/cccl/headers/include/thrust/system/detail/sequential/partition.h +309 -0
  1604. cuda/cccl/headers/include/thrust/system/detail/sequential/per_device_resource.h +29 -0
  1605. cuda/cccl/headers/include/thrust/system/detail/sequential/reduce.h +70 -0
  1606. cuda/cccl/headers/include/thrust/system/detail/sequential/reduce_by_key.h +104 -0
  1607. cuda/cccl/headers/include/thrust/system/detail/sequential/remove.h +185 -0
  1608. cuda/cccl/headers/include/thrust/system/detail/sequential/replace.h +29 -0
  1609. cuda/cccl/headers/include/thrust/system/detail/sequential/reverse.h +29 -0
  1610. cuda/cccl/headers/include/thrust/system/detail/sequential/scan.h +160 -0
  1611. cuda/cccl/headers/include/thrust/system/detail/sequential/scan_by_key.h +151 -0
  1612. cuda/cccl/headers/include/thrust/system/detail/sequential/scatter.h +29 -0
  1613. cuda/cccl/headers/include/thrust/system/detail/sequential/sequence.h +29 -0
  1614. cuda/cccl/headers/include/thrust/system/detail/sequential/set_operations.h +212 -0
  1615. cuda/cccl/headers/include/thrust/system/detail/sequential/sort.h +65 -0
  1616. cuda/cccl/headers/include/thrust/system/detail/sequential/sort.inl +187 -0
  1617. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.h +61 -0
  1618. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.inl +362 -0
  1619. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.h +54 -0
  1620. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.inl +130 -0
  1621. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.h +54 -0
  1622. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.inl +592 -0
  1623. cuda/cccl/headers/include/thrust/system/detail/sequential/swap_ranges.h +29 -0
  1624. cuda/cccl/headers/include/thrust/system/detail/sequential/tabulate.h +29 -0
  1625. cuda/cccl/headers/include/thrust/system/detail/sequential/temporary_buffer.h +29 -0
  1626. cuda/cccl/headers/include/thrust/system/detail/sequential/transform.h +29 -0
  1627. cuda/cccl/headers/include/thrust/system/detail/sequential/transform_reduce.h +29 -0
  1628. cuda/cccl/headers/include/thrust/system/detail/sequential/transform_scan.h +29 -0
  1629. cuda/cccl/headers/include/thrust/system/detail/sequential/trivial_copy.h +64 -0
  1630. cuda/cccl/headers/include/thrust/system/detail/sequential/uninitialized_copy.h +29 -0
  1631. cuda/cccl/headers/include/thrust/system/detail/sequential/uninitialized_fill.h +29 -0
  1632. cuda/cccl/headers/include/thrust/system/detail/sequential/unique.h +121 -0
  1633. cuda/cccl/headers/include/thrust/system/detail/sequential/unique_by_key.h +112 -0
  1634. cuda/cccl/headers/include/thrust/system/detail/system_error.inl +108 -0
  1635. cuda/cccl/headers/include/thrust/system/error_code.h +512 -0
  1636. cuda/cccl/headers/include/thrust/system/omp/detail/adjacent_difference.h +54 -0
  1637. cuda/cccl/headers/include/thrust/system/omp/detail/assign_value.h +30 -0
  1638. cuda/cccl/headers/include/thrust/system/omp/detail/binary_search.h +77 -0
  1639. cuda/cccl/headers/include/thrust/system/omp/detail/copy.h +50 -0
  1640. cuda/cccl/headers/include/thrust/system/omp/detail/copy.inl +74 -0
  1641. cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.h +56 -0
  1642. cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.inl +59 -0
  1643. cuda/cccl/headers/include/thrust/system/omp/detail/count.h +30 -0
  1644. cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.h +50 -0
  1645. cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.inl +65 -0
  1646. cuda/cccl/headers/include/thrust/system/omp/detail/equal.h +30 -0
  1647. cuda/cccl/headers/include/thrust/system/omp/detail/execution_policy.h +113 -0
  1648. cuda/cccl/headers/include/thrust/system/omp/detail/extrema.h +66 -0
  1649. cuda/cccl/headers/include/thrust/system/omp/detail/fill.h +30 -0
  1650. cuda/cccl/headers/include/thrust/system/omp/detail/find.h +53 -0
  1651. cuda/cccl/headers/include/thrust/system/omp/detail/for_each.h +56 -0
  1652. cuda/cccl/headers/include/thrust/system/omp/detail/for_each.inl +87 -0
  1653. cuda/cccl/headers/include/thrust/system/omp/detail/gather.h +30 -0
  1654. cuda/cccl/headers/include/thrust/system/omp/detail/generate.h +30 -0
  1655. cuda/cccl/headers/include/thrust/system/omp/detail/get_value.h +30 -0
  1656. cuda/cccl/headers/include/thrust/system/omp/detail/inner_product.h +30 -0
  1657. cuda/cccl/headers/include/thrust/system/omp/detail/iter_swap.h +30 -0
  1658. cuda/cccl/headers/include/thrust/system/omp/detail/logical.h +30 -0
  1659. cuda/cccl/headers/include/thrust/system/omp/detail/malloc_and_free.h +30 -0
  1660. cuda/cccl/headers/include/thrust/system/omp/detail/memory.inl +93 -0
  1661. cuda/cccl/headers/include/thrust/system/omp/detail/merge.h +30 -0
  1662. cuda/cccl/headers/include/thrust/system/omp/detail/mismatch.h +30 -0
  1663. cuda/cccl/headers/include/thrust/system/omp/detail/par.h +62 -0
  1664. cuda/cccl/headers/include/thrust/system/omp/detail/partition.h +88 -0
  1665. cuda/cccl/headers/include/thrust/system/omp/detail/partition.inl +102 -0
  1666. cuda/cccl/headers/include/thrust/system/omp/detail/per_device_resource.h +29 -0
  1667. cuda/cccl/headers/include/thrust/system/omp/detail/pragma_omp.h +54 -0
  1668. cuda/cccl/headers/include/thrust/system/omp/detail/reduce.h +54 -0
  1669. cuda/cccl/headers/include/thrust/system/omp/detail/reduce.inl +78 -0
  1670. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.h +64 -0
  1671. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.inl +65 -0
  1672. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.h +59 -0
  1673. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.inl +103 -0
  1674. cuda/cccl/headers/include/thrust/system/omp/detail/remove.h +72 -0
  1675. cuda/cccl/headers/include/thrust/system/omp/detail/remove.inl +87 -0
  1676. cuda/cccl/headers/include/thrust/system/omp/detail/replace.h +30 -0
  1677. cuda/cccl/headers/include/thrust/system/omp/detail/reverse.h +30 -0
  1678. cuda/cccl/headers/include/thrust/system/omp/detail/scan.h +30 -0
  1679. cuda/cccl/headers/include/thrust/system/omp/detail/scan_by_key.h +30 -0
  1680. cuda/cccl/headers/include/thrust/system/omp/detail/scatter.h +30 -0
  1681. cuda/cccl/headers/include/thrust/system/omp/detail/sequence.h +30 -0
  1682. cuda/cccl/headers/include/thrust/system/omp/detail/set_operations.h +30 -0
  1683. cuda/cccl/headers/include/thrust/system/omp/detail/sort.h +60 -0
  1684. cuda/cccl/headers/include/thrust/system/omp/detail/sort.inl +259 -0
  1685. cuda/cccl/headers/include/thrust/system/omp/detail/swap_ranges.h +30 -0
  1686. cuda/cccl/headers/include/thrust/system/omp/detail/tabulate.h +30 -0
  1687. cuda/cccl/headers/include/thrust/system/omp/detail/temporary_buffer.h +29 -0
  1688. cuda/cccl/headers/include/thrust/system/omp/detail/transform.h +30 -0
  1689. cuda/cccl/headers/include/thrust/system/omp/detail/transform_reduce.h +30 -0
  1690. cuda/cccl/headers/include/thrust/system/omp/detail/transform_scan.h +30 -0
  1691. cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_copy.h +30 -0
  1692. cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_fill.h +30 -0
  1693. cuda/cccl/headers/include/thrust/system/omp/detail/unique.h +60 -0
  1694. cuda/cccl/headers/include/thrust/system/omp/detail/unique.inl +71 -0
  1695. cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.h +67 -0
  1696. cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.inl +75 -0
  1697. cuda/cccl/headers/include/thrust/system/omp/execution_policy.h +160 -0
  1698. cuda/cccl/headers/include/thrust/system/omp/memory.h +111 -0
  1699. cuda/cccl/headers/include/thrust/system/omp/memory_resource.h +75 -0
  1700. cuda/cccl/headers/include/thrust/system/omp/pointer.h +120 -0
  1701. cuda/cccl/headers/include/thrust/system/omp/vector.h +99 -0
  1702. cuda/cccl/headers/include/thrust/system/system_error.h +184 -0
  1703. cuda/cccl/headers/include/thrust/system/tbb/detail/adjacent_difference.h +54 -0
  1704. cuda/cccl/headers/include/thrust/system/tbb/detail/assign_value.h +30 -0
  1705. cuda/cccl/headers/include/thrust/system/tbb/detail/binary_search.h +30 -0
  1706. cuda/cccl/headers/include/thrust/system/tbb/detail/copy.h +50 -0
  1707. cuda/cccl/headers/include/thrust/system/tbb/detail/copy.inl +73 -0
  1708. cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.h +47 -0
  1709. cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.inl +136 -0
  1710. cuda/cccl/headers/include/thrust/system/tbb/detail/count.h +30 -0
  1711. cuda/cccl/headers/include/thrust/system/tbb/detail/equal.h +30 -0
  1712. cuda/cccl/headers/include/thrust/system/tbb/detail/execution_policy.h +92 -0
  1713. cuda/cccl/headers/include/thrust/system/tbb/detail/extrema.h +66 -0
  1714. cuda/cccl/headers/include/thrust/system/tbb/detail/fill.h +30 -0
  1715. cuda/cccl/headers/include/thrust/system/tbb/detail/find.h +49 -0
  1716. cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.h +51 -0
  1717. cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.inl +91 -0
  1718. cuda/cccl/headers/include/thrust/system/tbb/detail/gather.h +30 -0
  1719. cuda/cccl/headers/include/thrust/system/tbb/detail/generate.h +30 -0
  1720. cuda/cccl/headers/include/thrust/system/tbb/detail/get_value.h +30 -0
  1721. cuda/cccl/headers/include/thrust/system/tbb/detail/inner_product.h +30 -0
  1722. cuda/cccl/headers/include/thrust/system/tbb/detail/iter_swap.h +30 -0
  1723. cuda/cccl/headers/include/thrust/system/tbb/detail/logical.h +30 -0
  1724. cuda/cccl/headers/include/thrust/system/tbb/detail/malloc_and_free.h +30 -0
  1725. cuda/cccl/headers/include/thrust/system/tbb/detail/memory.inl +94 -0
  1726. cuda/cccl/headers/include/thrust/system/tbb/detail/merge.h +77 -0
  1727. cuda/cccl/headers/include/thrust/system/tbb/detail/merge.inl +327 -0
  1728. cuda/cccl/headers/include/thrust/system/tbb/detail/mismatch.h +30 -0
  1729. cuda/cccl/headers/include/thrust/system/tbb/detail/par.h +62 -0
  1730. cuda/cccl/headers/include/thrust/system/tbb/detail/partition.h +84 -0
  1731. cuda/cccl/headers/include/thrust/system/tbb/detail/partition.inl +98 -0
  1732. cuda/cccl/headers/include/thrust/system/tbb/detail/per_device_resource.h +29 -0
  1733. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.h +54 -0
  1734. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.inl +137 -0
  1735. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.h +61 -0
  1736. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.inl +400 -0
  1737. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_intervals.h +140 -0
  1738. cuda/cccl/headers/include/thrust/system/tbb/detail/remove.h +76 -0
  1739. cuda/cccl/headers/include/thrust/system/tbb/detail/remove.inl +87 -0
  1740. cuda/cccl/headers/include/thrust/system/tbb/detail/replace.h +30 -0
  1741. cuda/cccl/headers/include/thrust/system/tbb/detail/reverse.h +30 -0
  1742. cuda/cccl/headers/include/thrust/system/tbb/detail/scan.h +59 -0
  1743. cuda/cccl/headers/include/thrust/system/tbb/detail/scan.inl +312 -0
  1744. cuda/cccl/headers/include/thrust/system/tbb/detail/scan_by_key.h +30 -0
  1745. cuda/cccl/headers/include/thrust/system/tbb/detail/scatter.h +30 -0
  1746. cuda/cccl/headers/include/thrust/system/tbb/detail/sequence.h +30 -0
  1747. cuda/cccl/headers/include/thrust/system/tbb/detail/set_operations.h +30 -0
  1748. cuda/cccl/headers/include/thrust/system/tbb/detail/sort.h +60 -0
  1749. cuda/cccl/headers/include/thrust/system/tbb/detail/sort.inl +295 -0
  1750. cuda/cccl/headers/include/thrust/system/tbb/detail/swap_ranges.h +30 -0
  1751. cuda/cccl/headers/include/thrust/system/tbb/detail/tabulate.h +30 -0
  1752. cuda/cccl/headers/include/thrust/system/tbb/detail/temporary_buffer.h +29 -0
  1753. cuda/cccl/headers/include/thrust/system/tbb/detail/transform.h +30 -0
  1754. cuda/cccl/headers/include/thrust/system/tbb/detail/transform_reduce.h +30 -0
  1755. cuda/cccl/headers/include/thrust/system/tbb/detail/transform_scan.h +30 -0
  1756. cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_copy.h +30 -0
  1757. cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_fill.h +30 -0
  1758. cuda/cccl/headers/include/thrust/system/tbb/detail/unique.h +60 -0
  1759. cuda/cccl/headers/include/thrust/system/tbb/detail/unique.inl +71 -0
  1760. cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.h +67 -0
  1761. cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.inl +75 -0
  1762. cuda/cccl/headers/include/thrust/system/tbb/execution_policy.h +160 -0
  1763. cuda/cccl/headers/include/thrust/system/tbb/memory.h +111 -0
  1764. cuda/cccl/headers/include/thrust/system/tbb/memory_resource.h +75 -0
  1765. cuda/cccl/headers/include/thrust/system/tbb/pointer.h +120 -0
  1766. cuda/cccl/headers/include/thrust/system/tbb/vector.h +99 -0
  1767. cuda/cccl/headers/include/thrust/system_error.h +57 -0
  1768. cuda/cccl/headers/include/thrust/tabulate.h +125 -0
  1769. cuda/cccl/headers/include/thrust/transform.h +903 -0
  1770. cuda/cccl/headers/include/thrust/transform_reduce.h +190 -0
  1771. cuda/cccl/headers/include/thrust/transform_scan.h +442 -0
  1772. cuda/cccl/headers/include/thrust/tuple.h +142 -0
  1773. cuda/cccl/headers/include/thrust/type_traits/integer_sequence.h +261 -0
  1774. cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +182 -0
  1775. cuda/cccl/headers/include/thrust/type_traits/is_execution_policy.h +65 -0
  1776. cuda/cccl/headers/include/thrust/type_traits/is_operator_less_or_greater_function_object.h +184 -0
  1777. cuda/cccl/headers/include/thrust/type_traits/is_operator_plus_function_object.h +116 -0
  1778. cuda/cccl/headers/include/thrust/type_traits/is_trivially_relocatable.h +306 -0
  1779. cuda/cccl/headers/include/thrust/type_traits/logical_metafunctions.h +42 -0
  1780. cuda/cccl/headers/include/thrust/type_traits/unwrap_contiguous_iterator.h +93 -0
  1781. cuda/cccl/headers/include/thrust/uninitialized_copy.h +300 -0
  1782. cuda/cccl/headers/include/thrust/uninitialized_fill.h +268 -0
  1783. cuda/cccl/headers/include/thrust/unique.h +1090 -0
  1784. cuda/cccl/headers/include/thrust/universal_allocator.h +90 -0
  1785. cuda/cccl/headers/include/thrust/universal_ptr.h +34 -0
  1786. cuda/cccl/headers/include/thrust/universal_vector.h +71 -0
  1787. cuda/cccl/headers/include/thrust/version.h +93 -0
  1788. cuda/cccl/headers/include/thrust/zip_function.h +176 -0
  1789. cuda/cccl/headers/include_paths.py +72 -0
  1790. cuda/cccl/parallel/__init__.py +3 -0
  1791. cuda/cccl/parallel/experimental/__init__.py +3 -0
  1792. cuda/cccl/parallel/experimental/_bindings.py +24 -0
  1793. cuda/cccl/parallel/experimental/_bindings.pyi +388 -0
  1794. cuda/cccl/parallel/experimental/_bindings_impl.cpython-311-x86_64-linux-gnu.so +0 -0
  1795. cuda/cccl/parallel/experimental/_bindings_impl.pyx +2158 -0
  1796. cuda/cccl/parallel/experimental/_caching.py +71 -0
  1797. cuda/cccl/parallel/experimental/_cccl_interop.py +371 -0
  1798. cuda/cccl/parallel/experimental/_utils/__init__.py +0 -0
  1799. cuda/cccl/parallel/experimental/_utils/protocols.py +132 -0
  1800. cuda/cccl/parallel/experimental/algorithms/__init__.py +28 -0
  1801. cuda/cccl/parallel/experimental/algorithms/_merge_sort.py +172 -0
  1802. cuda/cccl/parallel/experimental/algorithms/_radix_sort.py +244 -0
  1803. cuda/cccl/parallel/experimental/algorithms/_reduce.py +136 -0
  1804. cuda/cccl/parallel/experimental/algorithms/_scan.py +179 -0
  1805. cuda/cccl/parallel/experimental/algorithms/_segmented_reduce.py +183 -0
  1806. cuda/cccl/parallel/experimental/algorithms/_transform.py +213 -0
  1807. cuda/cccl/parallel/experimental/algorithms/_unique_by_key.py +179 -0
  1808. cuda/cccl/parallel/experimental/cccl/.gitkeep +0 -0
  1809. cuda/cccl/parallel/experimental/cccl/libcccl.c.parallel.so +0 -0
  1810. cuda/cccl/parallel/experimental/iterators/__init__.py +157 -0
  1811. cuda/cccl/parallel/experimental/iterators/_iterators.py +650 -0
  1812. cuda/cccl/parallel/experimental/numba_utils.py +6 -0
  1813. cuda/cccl/parallel/experimental/struct.py +150 -0
  1814. cuda/cccl/parallel/experimental/typing.py +27 -0
  1815. cuda/cccl/py.typed +0 -0
  1816. cuda_cccl-0.1.3.1.0.dev1486.dist-info/METADATA +29 -0
  1817. cuda_cccl-0.1.3.1.0.dev1486.dist-info/RECORD +1819 -0
  1818. cuda_cccl-0.1.3.1.0.dev1486.dist-info/WHEEL +6 -0
  1819. cuda_cccl-0.1.3.1.0.dev1486.dist-info/licenses/LICENSE +1 -0
@@ -0,0 +1,1259 @@
1
+ /******************************************************************************
2
+ * Copyright (c) 2011, Duane Merrill. All rights reserved.
3
+ * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved.
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ * * Redistributions of source code must retain the above copyright
8
+ * notice, this list of conditions and the following disclaimer.
9
+ * * Redistributions in binary form must reproduce the above copyright
10
+ * notice, this list of conditions and the following disclaimer in the
11
+ * documentation and/or other materials provided with the distribution.
12
+ * * Neither the name of the NVIDIA CORPORATION nor the
13
+ * names of its contributors may be used to endorse or promote products
14
+ * derived from this software without specific prior written permission.
15
+ *
16
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
+ *
27
+ ******************************************************************************/
28
+
29
+ //! @file
30
+ //! block_load.cuh Operations for reading linear tiles of data into the CUDA thread block.
31
+
32
+ #pragma once
33
+
34
+ #include <cub/config.cuh>
35
+
36
+ #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
37
+ # pragma GCC system_header
38
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
39
+ # pragma clang system_header
40
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
41
+ # pragma system_header
42
+ #endif // no system header
43
+
44
+ #include <cub/block/block_exchange.cuh>
45
+ #include <cub/iterator/cache_modified_input_iterator.cuh>
46
+ #include <cub/util_ptx.cuh>
47
+ #include <cub/util_type.cuh>
48
+
49
+ CUB_NAMESPACE_BEGIN
50
+
51
+ //! @name Blocked arrangement I/O (direct)
52
+ //! @{
53
+
54
+ //! @rst
55
+ //! Load a linear segment of items into a blocked arrangement across the thread block.
56
+ //!
57
+ //! @blocked
58
+ //! @endrst
59
+ //!
60
+ //! @tparam T
61
+ //! **[inferred]** The data type to load.
62
+ //!
63
+ //! @tparam ITEMS_PER_THREAD
64
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
65
+ //!
66
+ //! @tparam RandomAccessIterator
67
+ //! **[inferred]** The random-access iterator type for input iterator.
68
+ //!
69
+ //! @param[in] linear_tid
70
+ //! A suitable 1D thread-identifier for the calling thread (e.g., `(threadIdx.y * blockDim.x) + linear_tid` for 2D
71
+ //! thread blocks)
72
+ //!
73
+ //! @param[in] block_src_it
74
+ //! The thread block's base input iterator for loading from
75
+ //!
76
+ //! @param[out] dst_items
77
+ //! Destination to load data into
78
+ template <typename T, int ITEMS_PER_THREAD, typename RandomAccessIterator>
79
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
80
+ LoadDirectBlocked(int linear_tid, RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD])
81
+ {
82
+ // Load directly in thread-blocked order
83
+ _CCCL_PRAGMA_UNROLL_FULL()
84
+ for (int i = 0; i < ITEMS_PER_THREAD; i++)
85
+ {
86
+ dst_items[i] = block_src_it[linear_tid * ITEMS_PER_THREAD + i];
87
+ }
88
+ }
89
+
90
+ //! @rst
91
+ //! Load a linear segment of items into a blocked arrangement across the thread block, guarded by range.
92
+ //!
93
+ //! @blocked
94
+ //!
95
+ //! @endrst
96
+ //!
97
+ //! @tparam T
98
+ //! **[inferred]** The data type to load.
99
+ //!
100
+ //! @tparam ITEMS_PER_THREAD
101
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
102
+ //!
103
+ //! @tparam RandomAccessIterator
104
+ //! **[inferred]** The random-access iterator type for input iterator.
105
+ //!
106
+ //! @param[in] linear_tid
107
+ //! A suitable 1D thread-identifier for the calling thread (e.g., `(threadIdx.y * blockDim.x) + linear_tid` for 2D
108
+ //! thread blocks)
109
+ //!
110
+ //! @param[in] block_src_it
111
+ //! The thread block's base iterator for loading from
112
+ //!
113
+ //! @param[out] dst_items
114
+ //! Destination to load data into
115
+ //!
116
+ //! @param[in] block_items_end
117
+ //! First out-of-bounds index when loading from block_src_it
118
+ template <typename T, int ITEMS_PER_THREAD, typename RandomAccessIterator>
119
+ _CCCL_DEVICE _CCCL_FORCEINLINE void LoadDirectBlocked(
120
+ int linear_tid, RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end)
121
+ {
122
+ _CCCL_PRAGMA_UNROLL_FULL()
123
+ for (int i = 0; i < ITEMS_PER_THREAD; i++)
124
+ {
125
+ const auto src_pos = linear_tid * ITEMS_PER_THREAD + i;
126
+ if (src_pos < block_items_end)
127
+ {
128
+ dst_items[i] = block_src_it[src_pos];
129
+ }
130
+ }
131
+ }
132
+
133
+ //! @rst
134
+ //! Load a linear segment of items into a blocked arrangement across the thread block, guarded
135
+ //! by range, with a fall-back assignment of out-of-bound elements.
136
+ //!
137
+ //! @blocked
138
+ //!
139
+ //! @endrst
140
+ //!
141
+ //! @tparam T
142
+ //! **[inferred]** The data type to load.
143
+ //!
144
+ //! @tparam ITEMS_PER_THREAD
145
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
146
+ //!
147
+ //! @tparam RandomAccessIterator
148
+ //! **[inferred]** The random-access iterator type for input \iterator.
149
+ //!
150
+ //! @param[in] linear_tid
151
+ //! A suitable 1D thread-identifier for the calling thread (e.g., `(threadIdx.y * blockDim.x) + linear_tid` for 2D
152
+ //! thread blocks)
153
+ //!
154
+ //! @param[in] block_src_it
155
+ //! The thread block's base input iterator for loading from
156
+ //!
157
+ //! @param[out] dst_items
158
+ //! Destination to load data into
159
+ //!
160
+ //! @param[in] block_items_end
161
+ //! First out-of-bounds index when loading from block_src_it
162
+ //!
163
+ //! @param[in] oob_default
164
+ //! Default value to assign out-of-bound items
165
+ template <typename T, typename DefaultT, int ITEMS_PER_THREAD, typename RandomAccessIterator>
166
+ _CCCL_DEVICE _CCCL_FORCEINLINE void LoadDirectBlocked(
167
+ int linear_tid,
168
+ RandomAccessIterator block_src_it,
169
+ T (&dst_items)[ITEMS_PER_THREAD],
170
+ int block_items_end,
171
+ DefaultT oob_default)
172
+ {
173
+ _CCCL_PRAGMA_UNROLL_FULL()
174
+ for (int i = 0; i < ITEMS_PER_THREAD; i++)
175
+ {
176
+ dst_items[i] = oob_default;
177
+ }
178
+
179
+ LoadDirectBlocked(linear_tid, block_src_it, dst_items, block_items_end);
180
+ }
181
+
182
+ #ifndef _CCCL_DOXYGEN_INVOKED // Do not document
183
+
184
+ //! @brief Internal implementation for load vectorization
185
+ //!
186
+ //! @param[in] linear_tid
187
+ //! A suitable 1D thread-identifier for the calling thread (e.g., `(threadIdx.y * blockDim.x) + linear_tid` for 2D
188
+ //! thread blocks)
189
+ //!
190
+ //! @param[in] block_src_ptr
191
+ //! Input pointer for loading from
192
+ //!
193
+ //! @param[out] dst_items
194
+ //! Destination to load data into
195
+ template <CacheLoadModifier MODIFIER, typename T, int ITEMS_PER_THREAD>
196
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
197
+ InternalLoadDirectBlockedVectorized(int linear_tid, const T* block_src_ptr, T (&dst_items)[ITEMS_PER_THREAD])
198
+ {
199
+ // Find biggest memory access word that T is a whole multiple of
200
+ using device_word_t = typename UnitWord<T>::DeviceWord;
201
+ _CCCL_DIAG_PUSH
202
+ # if _CCCL_COMPILER(CLANG, >=, 10)
203
+ _CCCL_DIAG_SUPPRESS_CLANG("-Wsizeof-array-div")
204
+ # endif // _CCCL_COMPILER(CLANG, >=, 10)
205
+ constexpr int total_words = static_cast<int>(sizeof(dst_items) / sizeof(device_word_t));
206
+ _CCCL_DIAG_POP
207
+ constexpr int vector_size = (total_words % 4 == 0) ? 4 : (total_words % 2 == 0) ? 2 : 1;
208
+ constexpr int vectors_per_thread = total_words / vector_size;
209
+
210
+ // Load into an array of vectors in thread-blocked order
211
+ using vector_t = typename CubVector<device_word_t, vector_size>::Type;
212
+
213
+ // Add the alignment check to ensure the vectorized loading can proceed.
214
+ if (reinterpret_cast<uintptr_t>(block_src_ptr) % (alignof(vector_t)) == 0)
215
+ {
216
+ vector_t vec_items[vectors_per_thread];
217
+ // Load into an array of vectors in thread-blocked order
218
+ const vector_t* vec_ptr = reinterpret_cast<const vector_t*>(block_src_ptr) + linear_tid * vectors_per_thread;
219
+
220
+ _CCCL_PRAGMA_UNROLL_FULL()
221
+ for (int i = 0; i < vectors_per_thread; i++)
222
+ {
223
+ vec_items[i] = ThreadLoad<MODIFIER>(vec_ptr + i);
224
+ }
225
+
226
+ // Copy to destination
227
+ _CCCL_PRAGMA_UNROLL_FULL()
228
+ for (int i = 0; i < ITEMS_PER_THREAD; i++)
229
+ {
230
+ dst_items[i] = *(reinterpret_cast<T*>(vec_items) + i);
231
+ }
232
+ }
233
+ else
234
+ {
235
+ LoadDirectBlocked(linear_tid, block_src_ptr, dst_items);
236
+ }
237
+ }
238
+
239
+ #endif // _CCCL_DOXYGEN_INVOKED
240
+
241
+ //! @rst
242
+ //! Load a linear segment of items into a blocked arrangement across the thread block.
243
+ //!
244
+ //! @blocked
245
+ //!
246
+ //! The input offset (``block_ptr + block_offset``) must be quad-item aligned
247
+ //!
248
+ //! The following conditions will prevent vectorization and loading will fall back to cub::BLOCK_LOAD_DIRECT:
249
+ //!
250
+ //! - ``ITEMS_PER_THREAD`` is odd
251
+ //! - The data type ``T`` is not a built-in primitive or CUDA vector type
252
+ //! (e.g., ``short``, ``int2``, ``double``, ``float2``, etc.)
253
+ //!
254
+ //! @endrst
255
+ //!
256
+ //! @tparam T
257
+ //! **[inferred]** The data type to load.
258
+ //!
259
+ //! @tparam ITEMS_PER_THREAD
260
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
261
+ //!
262
+ //! @param[in] linear_tid
263
+ //! A suitable 1D thread-identifier for the calling thread (e.g., `(threadIdx.y * blockDim.x) +
264
+ //! linear_tid` for 2D thread blocks)
265
+ //!
266
+ //! @param[in] block_src_ptr
267
+ //! The thread block's base pointer for loading from
268
+ //!
269
+ //! @param[out] dst_items
270
+ //! destination to load data into
271
+ template <typename T, int ITEMS_PER_THREAD>
272
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
273
+ LoadDirectBlockedVectorized(int linear_tid, T* block_src_ptr, T (&dst_items)[ITEMS_PER_THREAD])
274
+ {
275
+ InternalLoadDirectBlockedVectorized<LOAD_DEFAULT>(linear_tid, block_src_ptr, dst_items);
276
+ }
277
+
278
+ //! @} end member group
279
+ //! @name Striped arrangement I/O (direct)
280
+ //! @{
281
+
282
+ //! @rst
283
+ //! Load a linear segment of items into a striped arrangement across the thread block.
284
+ //!
285
+ //! @striped
286
+ //!
287
+ //! @endrst
288
+ //!
289
+ //! @tparam BLOCK_THREADS
290
+ //! The thread block size in threads
291
+ //!
292
+ //! @tparam T
293
+ //! **[inferred]** The data type to load.
294
+ //!
295
+ //! @tparam ITEMS_PER_THREAD
296
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
297
+ //!
298
+ //! @tparam RandomAccessIterator
299
+ //! **[inferred]** The random-access iterator type for input iterator.
300
+ //!
301
+ //! @param[in] linear_tid
302
+ //! A suitable 1D thread-identifier for the calling thread (e.g., `(threadIdx.y * blockDim.x) + linear_tid` for 2D
303
+ //! thread blocks)
304
+ //!
305
+ //! @param[in] block_src_it
306
+ //! The thread block's base iterator for loading from
307
+ //!
308
+ //! @param[out] dst_items
309
+ //! Destination to load data into
310
+ template <int BLOCK_THREADS, typename T, int ITEMS_PER_THREAD, typename RandomAccessIterator>
311
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
312
+ LoadDirectStriped(int linear_tid, RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD])
313
+ {
314
+ _CCCL_PRAGMA_UNROLL_FULL()
315
+ for (int i = 0; i < ITEMS_PER_THREAD; i++)
316
+ {
317
+ dst_items[i] = block_src_it[linear_tid + i * BLOCK_THREADS];
318
+ }
319
+ }
320
+
321
+ namespace detail
322
+ {
323
+ template <int BLOCK_THREADS, typename T, int ITEMS_PER_THREAD, typename RandomAccessIterator, typename TransformOpT>
324
+ _CCCL_DEVICE _CCCL_FORCEINLINE void load_transform_direct_striped(
325
+ int linear_tid, RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], TransformOpT transform_op)
326
+ {
327
+ _CCCL_PRAGMA_UNROLL_FULL()
328
+ for (int i = 0; i < ITEMS_PER_THREAD; i++)
329
+ {
330
+ dst_items[i] = transform_op(block_src_it[linear_tid + i * BLOCK_THREADS]);
331
+ }
332
+ }
333
+ } // namespace detail
334
+
335
+ //! @rst
336
+ //! Load a linear segment of items into a striped arrangement across the thread block, guarded by range
337
+ //!
338
+ //! @striped
339
+ //!
340
+ //! @endrst
341
+ //!
342
+ //! @tparam BLOCK_THREADS
343
+ //! The thread block size in threads
344
+ //!
345
+ //! @tparam T
346
+ //! **inferred** The data type to load.
347
+ //!
348
+ //! @tparam ITEMS_PER_THREAD
349
+ //! **inferred** The number of consecutive items partitioned onto each thread.
350
+ //!
351
+ //! @tparam RandomAccessIterator
352
+ //! **inferred** The random-access iterator type for input iterator.
353
+ //!
354
+ //! @param[in] linear_tid
355
+ //! A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) +
356
+ //! linear_tid</tt> for 2D thread blocks)
357
+ //!
358
+ //! @param[in] block_src_it
359
+ //! The thread block's base iterator for loading from
360
+ //!
361
+ //! @param[out] dst_items
362
+ //! Destination to load data into
363
+ //!
364
+ //! @param[in] block_items_end
365
+ //! Number of valid items to load
366
+ template <int BLOCK_THREADS, typename T, int ITEMS_PER_THREAD, typename RandomAccessIterator>
367
+ _CCCL_DEVICE _CCCL_FORCEINLINE void LoadDirectStriped(
368
+ int linear_tid, RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end)
369
+ {
370
+ _CCCL_PRAGMA_UNROLL_FULL()
371
+ for (int i = 0; i < ITEMS_PER_THREAD; i++)
372
+ {
373
+ const auto src_pos = linear_tid + i * BLOCK_THREADS;
374
+ if (src_pos < block_items_end)
375
+ {
376
+ dst_items[i] = block_src_it[src_pos];
377
+ }
378
+ }
379
+ }
380
+
381
+ //! @rst
382
+ //! Load a linear segment of items into a striped arrangement across the thread block, guarded
383
+ //! by range, with a fall-back assignment of out-of-bound elements.
384
+ //!
385
+ //! @striped
386
+ //!
387
+ //! @endrst
388
+ //!
389
+ //! @tparam BLOCK_THREADS
390
+ //! The thread block size in threads
391
+ //!
392
+ //! @tparam T
393
+ //! **inferred** The data type to load.
394
+ //!
395
+ //! @tparam ITEMS_PER_THREAD
396
+ //! **inferred** The number of consecutive items partitioned onto each thread.
397
+ //!
398
+ //! @tparam RandomAccessIterator
399
+ //! **inferred** The random-access iterator type for input \iterator.
400
+ //!
401
+ //! @param[in] linear_tid
402
+ //! A suitable 1D thread-identifier for the calling thread (e.g., `(threadIdx.y * blockDim.x) +
403
+ //! linear_tid` for 2D thread blocks)
404
+ //!
405
+ //! @param[in] block_src_it
406
+ //! The thread block's base iterator for loading from
407
+ //!
408
+ //! @param[out] dst_items
409
+ //! Destination to load data into
410
+ //!
411
+ //! @param[in] block_items_end
412
+ //! Number of valid items to load
413
+ //!
414
+ //! @param[in] oob_default
415
+ //! Default value to assign out-of-bound items
416
+ template <int BLOCK_THREADS, typename T, typename DefaultT, int ITEMS_PER_THREAD, typename RandomAccessIterator>
417
+ _CCCL_DEVICE _CCCL_FORCEINLINE void LoadDirectStriped(
418
+ int linear_tid,
419
+ RandomAccessIterator block_src_it,
420
+ T (&dst_items)[ITEMS_PER_THREAD],
421
+ int block_items_end,
422
+ DefaultT oob_default)
423
+ {
424
+ _CCCL_PRAGMA_UNROLL_FULL()
425
+ for (int i = 0; i < ITEMS_PER_THREAD; i++)
426
+ {
427
+ dst_items[i] = oob_default;
428
+ }
429
+
430
+ LoadDirectStriped<BLOCK_THREADS>(linear_tid, block_src_it, dst_items, block_items_end);
431
+ }
432
+
433
+ //! @} end member group
434
+ //! @name Warp-striped arrangement I/O (direct)
435
+ //! @{
436
+
437
+ //! @rst
438
+ //! Load a linear segment of items into a warp-striped arrangement across the thread block.
439
+ //!
440
+ //! @warpstriped
441
+ //!
442
+ //! Usage Considerations
443
+ //! ++++++++++++++++++++
444
+ //!
445
+ //! The number of threads in the thread block must be a multiple of the architecture's warp size.
446
+ //!
447
+ //! @endrst
448
+ //!
449
+ //! @tparam T
450
+ //! **inferred** The data type to load.
451
+ //!
452
+ //! @tparam ITEMS_PER_THREAD
453
+ //! **inferred** The number of consecutive items partitioned onto each thread.
454
+ //!
455
+ //! @tparam RandomAccessIterator
456
+ //! **inferred** The random-access iterator type for input iterator.
457
+ //!
458
+ //! @param[in] linear_tid
459
+ //! A suitable 1D thread-identifier for the calling thread (e.g., `(threadIdx.y * blockDim.x) +
460
+ //! linear_tid` for 2D thread blocks)
461
+ //!
462
+ //! @param[in] block_src_it
463
+ //! The thread block's base iterator for loading from
464
+ //!
465
+ //! @param[out] dst_items
466
+ //! Destination to load data into
467
+ template <typename T, int ITEMS_PER_THREAD, typename RandomAccessIterator>
468
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
469
+ LoadDirectWarpStriped(int linear_tid, RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD])
470
+ {
471
+ const int tid = linear_tid & (detail::warp_threads - 1);
472
+ const int wid = linear_tid >> detail::log2_warp_threads;
473
+ const int warp_offset = wid * detail::warp_threads * ITEMS_PER_THREAD;
474
+
475
+ // Load directly in warp-striped order
476
+ _CCCL_PRAGMA_UNROLL_FULL()
477
+ for (int i = 0; i < ITEMS_PER_THREAD; i++)
478
+ {
479
+ new (&dst_items[i]) T(block_src_it[warp_offset + tid + (i * detail::warp_threads)]);
480
+ }
481
+ }
482
+
483
+ //! @rst
484
+ //! Load a linear segment of items into a warp-striped arrangement across the thread block, guarded by range
485
+ //!
486
+ //! @warpstriped
487
+ //!
488
+ //! Usage Considerations
489
+ //! ++++++++++++++++++++
490
+ //!
491
+ //! The number of threads in the thread block must be a multiple of the architecture's warp size.
492
+ //!
493
+ //! @endrst
494
+ //!
495
+ //! @tparam T
496
+ //! **inferred** The data type to load.
497
+ //!
498
+ //! @tparam ITEMS_PER_THREAD
499
+ //! **inferred** The number of consecutive items partitioned onto each thread.
500
+ //!
501
+ //! @tparam RandomAccessIterator
502
+ //! **inferred** The random-access iterator type for input \iterator.
503
+ //!
504
+ //! @param[in] linear_tid
505
+ //! A suitable 1D thread-identifier for the calling thread (e.g., `(threadIdx.y * blockDim.x) +
506
+ //! linear_tid` for 2D thread blocks)
507
+ //!
508
+ //! @param[in] block_src_it
509
+ //! The thread block's base iterator for loading from
510
+ //!
511
+ //! @param[out] dst_items
512
+ //! Destination to load data into
513
+ //!
514
+ //! @param[in] block_items_end
515
+ //! Number of valid items to load
516
+ template <typename T, int ITEMS_PER_THREAD, typename RandomAccessIterator>
517
+ _CCCL_DEVICE _CCCL_FORCEINLINE void LoadDirectWarpStriped(
518
+ int linear_tid, RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end)
519
+ {
520
+ const int tid = linear_tid & (detail::warp_threads - 1);
521
+ const int wid = linear_tid >> detail::log2_warp_threads;
522
+ const int warp_offset = wid * detail::warp_threads * ITEMS_PER_THREAD;
523
+
524
+ // Load directly in warp-striped order
525
+ _CCCL_PRAGMA_UNROLL_FULL()
526
+ for (int i = 0; i < ITEMS_PER_THREAD; i++)
527
+ {
528
+ const auto src_pos = warp_offset + tid + (i * detail::warp_threads);
529
+ if (src_pos < block_items_end)
530
+ {
531
+ new (&dst_items[i]) T(block_src_it[src_pos]);
532
+ }
533
+ }
534
+ }
535
+
536
+ //! @rst
537
+ //! Load a linear segment of items into a warp-striped arrangement across the thread block,
538
+ //! guarded by range, with a fall-back assignment of out-of-bound elements.
539
+ //!
540
+ //! @warpstriped
541
+ //!
542
+ //! @endrst
543
+ //!
544
+ //! Usage Considerations
545
+ //! ++++++++++++++++++++
546
+ //!
547
+ //! The number of threads in the thread block must be a multiple of the architecture's warp size.
548
+ //!
549
+ //! @tparam T
550
+ //! **inferred** The data type to load.
551
+ //!
552
+ //! @tparam ITEMS_PER_THREAD
553
+ //! **inferred** The number of consecutive items partitioned onto each thread.
554
+ //!
555
+ //! @tparam RandomAccessIterator
556
+ //! **inferred** The random-access iterator type for input \iterator.
557
+ //!
558
+ //! @param[in] linear_tid
559
+ //! A suitable 1D thread-identifier for the calling thread (e.g., `(threadIdx.y * blockDim.x) +
560
+ //! linear_tid` for 2D thread blocks)
561
+ //!
562
+ //! @param[in] block_src_it
563
+ //! The thread block's base iterator for loading from
564
+ //!
565
+ //! @param[out] dst_items
566
+ //! Destination to load data into
567
+ //!
568
+ //! @param[in] block_items_end
569
+ //! Number of valid items to load
570
+ //!
571
+ //! @param[in] oob_default
572
+ //! Default value to assign out-of-bound items
573
+ template <typename T, typename DefaultT, int ITEMS_PER_THREAD, typename RandomAccessIterator>
574
+ _CCCL_DEVICE _CCCL_FORCEINLINE void LoadDirectWarpStriped(
575
+ int linear_tid,
576
+ RandomAccessIterator block_src_it,
577
+ T (&dst_items)[ITEMS_PER_THREAD],
578
+ int block_items_end,
579
+ DefaultT oob_default)
580
+ {
581
+ // Load directly in warp-striped order
582
+ _CCCL_PRAGMA_UNROLL_FULL()
583
+ for (int i = 0; i < ITEMS_PER_THREAD; i++)
584
+ {
585
+ dst_items[i] = oob_default;
586
+ }
587
+
588
+ LoadDirectWarpStriped(linear_tid, block_src_it, dst_items, block_items_end);
589
+ }
590
+
591
+ //! @} end member group
592
+
593
+ //! @brief cub::BlockLoadAlgorithm enumerates alternative algorithms for cub::BlockLoad to read a linear segment of data
594
+ //! from memory into a blocked arrangement across a CUDA thread block.
595
+ enum BlockLoadAlgorithm
596
+ {
597
+ //! @rst
598
+ //! Overview
599
+ //! ++++++++++++++++++++++++++
600
+ //!
601
+ //! A :ref:`blocked arrangement <flexible-data-arrangement>` of data is read directly from memory.
602
+ //!
603
+ //! Performance Considerations
604
+ //! ++++++++++++++++++++++++++
605
+ //!
606
+ //! The utilization of memory transactions (coalescing) decreases as the access stride between threads increases
607
+ //! (i.e., the number items per thread).
608
+ //! @endrst
609
+ BLOCK_LOAD_DIRECT,
610
+
611
+ //! @rst
612
+ //! Overview
613
+ //! ++++++++++++++++++++++++++
614
+ //!
615
+ //! A :ref:`striped arrangement <flexible-data-arrangement>` of data is read directly from memory.
616
+ //!
617
+ //! Performance Considerations
618
+ //! ++++++++++++++++++++++++++
619
+ //!
620
+ //! The utilization of memory transactions (coalescing) doesn't depend on the number of items per thread.
621
+ //!
622
+ //! @endrst
623
+ BLOCK_LOAD_STRIPED,
624
+
625
+ //! @rst
626
+ //! Overview
627
+ //! ++++++++++++++++++++++++++
628
+ //!
629
+ //! A :ref:`blocked arrangement <flexible-data-arrangement>` of data is read from memory using CUDA's built-in
630
+ //! vectorized loads as a coalescing optimization. For example, ``ld.global.v4.s32`` instructions will be generated
631
+ //! when ``T = int`` and ``ITEMS_PER_THREAD % 4 == 0``.
632
+ //!
633
+ //! Performance Considerations
634
+ //! ++++++++++++++++++++++++++
635
+ //!
636
+ //! - The utilization of memory transactions (coalescing) remains high until the the access stride between threads
637
+ //! (i.e., the number items per thread) exceeds the maximum vector load width (typically 4 items or 64B, whichever
638
+ //! is lower).
639
+ //! - The following conditions will prevent vectorization and loading will fall back to cub::BLOCK_LOAD_DIRECT:
640
+ //!
641
+ //! - ``ITEMS_PER_THREAD`` is odd
642
+ //! - The ``RandomAccessIterator`` is not a simple pointer type
643
+ //! - The block input offset is not quadword-aligned
644
+ //! - The data type ``T`` is not a built-in primitive or CUDA vector type
645
+ //! (e.g., ``short``, ``int2``, ``double``, ``float2``, etc.)
646
+ //!
647
+ //! @endrst
648
+ BLOCK_LOAD_VECTORIZE,
649
+
650
+ //! @rst
651
+ //! Overview
652
+ //! ++++++++++++++++++++++++++
653
+ //!
654
+ //! A :ref:`striped arrangement <flexible-data-arrangement>` of data is read efficiently from memory and then locally
655
+ //! transposed into a :ref:`blocked arrangement <flexible-data-arrangement>`.
656
+ //!
657
+ //! Performance Considerations
658
+ //! ++++++++++++++++++++++++++
659
+ //!
660
+ //! - The utilization of memory transactions (coalescing) remains high regardless of items loaded per thread.
661
+ //! - The local reordering incurs slightly longer latencies and throughput than the direct cub::BLOCK_LOAD_DIRECT and
662
+ //! cub::BLOCK_LOAD_VECTORIZE alternatives.
663
+ //!
664
+ //! @endrst
665
+ BLOCK_LOAD_TRANSPOSE,
666
+
667
+ //! @rst
668
+ //! Overview
669
+ //! ++++++++++++++++++++++++++
670
+ //!
671
+ //! A :ref:`warp-striped arrangement <flexible-data-arrangement>` of data is read efficiently from memory and then
672
+ //! locally transposed into a :ref:`blocked arrangement <flexible-data-arrangement>`.
673
+ //!
674
+ //! Usage Considerations
675
+ //! ++++++++++++++++++++++++++
676
+ //!
677
+ //! - BLOCK_THREADS must be a multiple of WARP_THREADS
678
+ //!
679
+ //! Performance Considerations
680
+ //! ++++++++++++++++++++++++++
681
+ //!
682
+ //! - The utilization of memory transactions (coalescing) remains high regardless of items loaded per thread.
683
+ //! - The local reordering incurs slightly larger latencies than the direct cub::BLOCK_LOAD_DIRECT and
684
+ //! cub::BLOCK_LOAD_VECTORIZE alternatives.
685
+ //! - Provisions more shared storage, but incurs smaller latencies than the
686
+ //! BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED alternative.
687
+ //!
688
+ //! @endrst
689
+ BLOCK_LOAD_WARP_TRANSPOSE,
690
+
691
+ //! @rst
692
+ //! Overview
693
+ //! ++++++++++++++++++++++++++
694
+ //!
695
+ //! Like ``BLOCK_LOAD_WARP_TRANSPOSE``, a :ref:`warp-striped arrangement <flexible-data-arrangement>` of data is read
696
+ //! directly from memory and then is locally transposed into a :ref:`blocked arrangement <flexible-data-arrangement>`.
697
+ //! To reduce the shared memory requirement, only one warp's worth of shared memory is provisioned and is subsequently
698
+ //! time-sliced among warps.
699
+ //!
700
+ //! Usage Considerations
701
+ //! ++++++++++++++++++++++++++
702
+ //!
703
+ //! - BLOCK_THREADS must be a multiple of WARP_THREADS
704
+ //!
705
+ //! Performance Considerations
706
+ //! ++++++++++++++++++++++++++
707
+ //!
708
+ //! - The utilization of memory transactions (coalescing) remains high regardless of items loaded per thread.
709
+ //! - Provisions less shared memory temporary storage, but incurs larger latencies than the BLOCK_LOAD_WARP_TRANSPOSE
710
+ //! alternative.
711
+ //!
712
+ //! @endrst
713
+ BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED,
714
+ };
715
+
716
+ //! @rst
717
+ //! The BlockLoad class provides :ref:`collective <collective-primitives>` data movement methods for loading a linear
718
+ //! segment of items from memory into a :ref:`blocked arrangement <flexible-data-arrangement>` across a CUDA thread
719
+ //! block.
720
+ //!
721
+ //! Overview
722
+ //! +++++++++++++++++++++++++++++++++++++++++++++
723
+ //!
724
+ //! - The BlockLoad class provides a single data movement abstraction that can be specialized to implement different
725
+ //! cub::BlockLoadAlgorithm strategies. This facilitates different performance policies for different architectures,
726
+ //! data types, granularity sizes, etc.
727
+ //! - BlockLoad can be optionally specialized by different data movement strategies:
728
+ //!
729
+ //! #. :cpp:enumerator:`cub::BLOCK_LOAD_DIRECT`:
730
+ //! A :ref:`blocked arrangement <flexible-data-arrangement>` of data is read directly from memory.
731
+ //! #. :cpp:enumerator:`cub::BLOCK_LOAD_STRIPED`:
732
+ //! A :ref:`striped arrangement <flexible-data-arrangement>` of data is read directly from memory.
733
+ //! #. :cpp:enumerator:`cub::BLOCK_LOAD_VECTORIZE`:
734
+ //! A :ref:`blocked arrangement <flexible-data-arrangement>` of data is read directly from memory
735
+ //! using CUDA's built-in vectorized loads as a coalescing optimization.
736
+ //! #. :cpp:enumerator:`cub::BLOCK_LOAD_TRANSPOSE`:
737
+ //! A :ref:`striped arrangement <flexible-data-arrangement>` of data is read directly from memory and is then
738
+ //! locally transposed into a :ref:`blocked arrangement <flexible-data-arrangement>`.
739
+ //! #. :cpp:enumerator:`cub::BLOCK_LOAD_WARP_TRANSPOSE`:
740
+ //! A :ref:`warp-striped arrangement <flexible-data-arrangement>` of data is read directly from memory and is then
741
+ //! locally transposed into a :ref:`blocked arrangement <flexible-data-arrangement>`.
742
+ //! #. :cpp:enumerator:`cub::BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED`:
743
+ //! A :ref:`warp-striped arrangement <flexible-data-arrangement>` of data is read directly from memory and is then
744
+ //! locally transposed into a :ref:`blocked arrangement <flexible-data-arrangement>` one warp at a time.
745
+ //!
746
+ //! - @rowmajor
747
+ //!
748
+ //! A Simple Example
749
+ //! +++++++++++++++++++++++++++++++++++++++++++++
750
+ //!
751
+ //! @blockcollective{BlockLoad}
752
+ //!
753
+ //! The code snippet below illustrates the loading of a linear segment of 512 integers into a "blocked" arrangement
754
+ //! across 128 threads where each thread owns 4 consecutive items. The load is specialized for
755
+ //! ``BLOCK_LOAD_WARP_TRANSPOSE``, meaning memory references are efficiently coalesced using a warp-striped access
756
+ //! pattern (after which items are locally reordered among threads).
757
+ //!
758
+ //! .. code-block:: c++
759
+ //!
760
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_load.cuh>
761
+ //!
762
+ //! __global__ void ExampleKernel(int *d_data, ...)
763
+ //! {
764
+ //! // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each
765
+ //! using BlockLoad = cub::BlockLoad<int, 128, 4, BLOCK_LOAD_WARP_TRANSPOSE>;
766
+ //!
767
+ //! // Allocate shared memory for BlockLoad
768
+ //! __shared__ typename BlockLoad::TempStorage temp_storage;
769
+ //!
770
+ //! // Load a segment of consecutive items that are blocked across threads
771
+ //! int thread_data[4];
772
+ //! BlockLoad(temp_storage).Load(d_data, thread_data);
773
+ //!
774
+ //! Suppose the input ``d_data`` is ``0, 1, 2, 3, 4, 5, ...``. The set of ``thread_data`` across the block of threads in
775
+ //! those threads will be ``{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }``.
776
+ //!
777
+ //! Re-using dynamically allocating shared memory
778
+ //! +++++++++++++++++++++++++++++++++++++++++++++
779
+ //!
780
+ //! The ``block/example_block_reduce_dyn_smem.cu`` example illustrates usage of dynamically shared memory with
781
+ //! BlockReduce and how to re-purpose the same memory region. This example can be easily adapted to the storage required
782
+ //! by BlockLoad.
783
+ //!
784
+ //! @endrst
785
+ //!
786
+ //! @tparam T
787
+ // The data type to read into (which must be convertible from the input iterator's value type).
788
+ //!
789
+ //! @tparam BLOCK_DIM_X
790
+ //! The thread block length in threads along the X dimension
791
+ //!
792
+ //! @tparam ITEMS_PER_THREAD
793
+ //! The number of consecutive items partitioned onto each thread.
794
+ //!
795
+ //! @tparam ALGORITHM
796
+ //! **[optional]** cub::BlockLoadAlgorithm tuning policy. default: ``cub::BLOCK_LOAD_DIRECT``.
797
+ //!
798
+ //! @tparam BLOCK_DIM_Y
799
+ //! **[optional]** The thread block length in threads along the Y dimension (default: 1)
800
+ //!
801
+ //! @tparam BLOCK_DIM_Z
802
+ //! **[optional]** The thread block length in threads along the Z dimension (default: 1)
803
+ //!
804
+ template <typename T,
805
+ int BLOCK_DIM_X,
806
+ int ITEMS_PER_THREAD,
807
+ BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT,
808
+ int BLOCK_DIM_Y = 1,
809
+ int BLOCK_DIM_Z = 1>
810
+ class BlockLoad
811
+ {
812
+ static constexpr int BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z; // total threads in the block
813
+
814
+ template <BlockLoadAlgorithm _POLICY, int DUMMY>
815
+ struct LoadInternal; // helper to dispatch the load algorithm
816
+
817
+ template <int DUMMY>
818
+ struct LoadInternal<BLOCK_LOAD_DIRECT, DUMMY>
819
+ {
820
+ using TempStorage = NullType;
821
+ int linear_tid;
822
+
823
+ _CCCL_DEVICE _CCCL_FORCEINLINE LoadInternal(TempStorage& /*temp_storage*/, int linear_tid)
824
+ : linear_tid(linear_tid)
825
+ {}
826
+
827
+ template <typename RandomAccessIterator>
828
+ _CCCL_DEVICE _CCCL_FORCEINLINE void Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD])
829
+ {
830
+ LoadDirectBlocked(linear_tid, block_src_it, dst_items);
831
+ }
832
+
833
+ template <typename RandomAccessIterator>
834
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
835
+ Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end)
836
+ {
837
+ LoadDirectBlocked(linear_tid, block_src_it, dst_items, block_items_end);
838
+ }
839
+
840
+ template <typename RandomAccessIterator, typename DefaultT>
841
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
842
+ Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end, DefaultT oob_default)
843
+ {
844
+ LoadDirectBlocked(linear_tid, block_src_it, dst_items, block_items_end, oob_default);
845
+ }
846
+ };
847
+
848
+ template <int DUMMY>
849
+ struct LoadInternal<BLOCK_LOAD_STRIPED, DUMMY>
850
+ {
851
+ using TempStorage = NullType;
852
+ int linear_tid;
853
+
854
+ _CCCL_DEVICE _CCCL_FORCEINLINE LoadInternal(TempStorage& /*temp_storage*/, int linear_tid)
855
+ : linear_tid(linear_tid)
856
+ {}
857
+
858
+ template <typename RandomAccessIterator>
859
+ _CCCL_DEVICE _CCCL_FORCEINLINE void Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD])
860
+ {
861
+ LoadDirectStriped<BLOCK_THREADS>(linear_tid, block_src_it, dst_items);
862
+ }
863
+
864
+ template <typename RandomAccessIterator>
865
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
866
+ Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end)
867
+ {
868
+ LoadDirectStriped<BLOCK_THREADS>(linear_tid, block_src_it, dst_items, block_items_end);
869
+ }
870
+
871
+ template <typename RandomAccessIterator, typename DefaultT>
872
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
873
+ Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end, DefaultT oob_default)
874
+ {
875
+ LoadDirectStriped<BLOCK_THREADS>(linear_tid, block_src_it, dst_items, block_items_end, oob_default);
876
+ }
877
+ };
878
+
879
+ template <int DUMMY>
880
+ struct LoadInternal<BLOCK_LOAD_VECTORIZE, DUMMY>
881
+ {
882
+ using TempStorage = NullType;
883
+ int linear_tid;
884
+
885
+ _CCCL_DEVICE _CCCL_FORCEINLINE LoadInternal(TempStorage& /*temp_storage*/, int linear_tid)
886
+ : linear_tid(linear_tid)
887
+ {}
888
+
889
+ // attempts vectorization (pointer)
890
+ _CCCL_DEVICE _CCCL_FORCEINLINE void Load(const T* block_ptr, T (&dst_items)[ITEMS_PER_THREAD])
891
+ {
892
+ InternalLoadDirectBlockedVectorized<LOAD_DEFAULT>(linear_tid, block_ptr, dst_items);
893
+ }
894
+ // NOTE: This function is necessary for pointers to non-const types.
895
+ // The core reason is that the compiler will not deduce 'T*' to 'const T*' automatically.
896
+ // Otherwise, when the pointer type is 'T*', the compiler will prefer the overloaded version
897
+ // Load(RandomAccessIterator...) over Load(const T*...), which means it will never perform vectorized loading for
898
+ // pointers to non-const types.
899
+ _CCCL_DEVICE _CCCL_FORCEINLINE void Load(T* block_ptr, T (&dst_items)[ITEMS_PER_THREAD])
900
+ {
901
+ InternalLoadDirectBlockedVectorized<LOAD_DEFAULT>(linear_tid, block_ptr, dst_items);
902
+ }
903
+
904
+ // any other iterator, no vectorization
905
+ template <typename RandomAccessIterator>
906
+ _CCCL_DEVICE _CCCL_FORCEINLINE void Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD])
907
+ {
908
+ LoadDirectBlocked(linear_tid, block_src_it, dst_items);
909
+ }
910
+
911
+ // attempts vectorization (cache modified iterator)
912
+ template <CacheLoadModifier MODIFIER, typename ValueType, typename OffsetT>
913
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
914
+ Load(CacheModifiedInputIterator<MODIFIER, ValueType, OffsetT> block_src_it, T (&dst_items)[ITEMS_PER_THREAD])
915
+ {
916
+ InternalLoadDirectBlockedVectorized<MODIFIER>(linear_tid, block_src_it.ptr, dst_items);
917
+ }
918
+
919
+ // skips vectorization
920
+ template <typename RandomAccessIterator>
921
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
922
+ Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end)
923
+ {
924
+ LoadDirectBlocked(linear_tid, block_src_it, dst_items, block_items_end);
925
+ }
926
+
927
+ // skips vectorization
928
+ template <typename RandomAccessIterator, typename DefaultT>
929
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
930
+ Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end, DefaultT oob_default)
931
+ {
932
+ LoadDirectBlocked(linear_tid, block_src_it, dst_items, block_items_end, oob_default);
933
+ }
934
+ };
935
+
936
+ template <int DUMMY>
937
+ struct LoadInternal<BLOCK_LOAD_TRANSPOSE, DUMMY>
938
+ {
939
+ using BlockExchange = BlockExchange<T, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z>;
940
+ using _TempStorage = typename BlockExchange::TempStorage;
941
+ using TempStorage = Uninitialized<_TempStorage>;
942
+
943
+ _TempStorage& temp_storage;
944
+ int linear_tid;
945
+
946
+ _CCCL_DEVICE _CCCL_FORCEINLINE LoadInternal(TempStorage& temp_storage, int linear_tid)
947
+ : temp_storage(temp_storage.Alias())
948
+ , linear_tid(linear_tid)
949
+ {}
950
+
951
+ template <typename RandomAccessIterator>
952
+ _CCCL_DEVICE _CCCL_FORCEINLINE void Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD])
953
+ {
954
+ LoadDirectStriped<BLOCK_THREADS>(linear_tid, block_src_it, dst_items);
955
+ BlockExchange(temp_storage).StripedToBlocked(dst_items, dst_items);
956
+ }
957
+
958
+ template <typename RandomAccessIterator>
959
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
960
+ Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end)
961
+ {
962
+ LoadDirectStriped<BLOCK_THREADS>(linear_tid, block_src_it, dst_items, block_items_end);
963
+ BlockExchange(temp_storage).StripedToBlocked(dst_items, dst_items);
964
+ }
965
+
966
+ template <typename RandomAccessIterator, typename DefaultT>
967
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
968
+ Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end, DefaultT oob_default)
969
+ {
970
+ LoadDirectStriped<BLOCK_THREADS>(linear_tid, block_src_it, dst_items, block_items_end, oob_default);
971
+ BlockExchange(temp_storage).StripedToBlocked(dst_items, dst_items);
972
+ }
973
+ };
974
+
975
+ template <int DUMMY>
976
+ struct LoadInternal<BLOCK_LOAD_WARP_TRANSPOSE, DUMMY>
977
+ {
978
+ static constexpr int WARP_THREADS = detail::warp_threads;
979
+ static_assert(BLOCK_THREADS % WARP_THREADS == 0, "BLOCK_THREADS must be a multiple of WARP_THREADS");
980
+
981
+ using BlockExchange = BlockExchange<T, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z>;
982
+ using _TempStorage = typename BlockExchange::TempStorage;
983
+ using TempStorage = Uninitialized<_TempStorage>;
984
+
985
+ _TempStorage& temp_storage;
986
+ int linear_tid;
987
+
988
+ _CCCL_DEVICE _CCCL_FORCEINLINE LoadInternal(TempStorage& temp_storage, int linear_tid)
989
+ : temp_storage(temp_storage.Alias())
990
+ , linear_tid(linear_tid)
991
+ {}
992
+
993
+ template <typename RandomAccessIterator>
994
+ _CCCL_DEVICE _CCCL_FORCEINLINE void Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD])
995
+ {
996
+ LoadDirectWarpStriped(linear_tid, block_src_it, dst_items);
997
+ BlockExchange(temp_storage).WarpStripedToBlocked(dst_items, dst_items);
998
+ }
999
+
1000
+ template <typename RandomAccessIterator>
1001
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
1002
+ Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end)
1003
+ {
1004
+ LoadDirectWarpStriped(linear_tid, block_src_it, dst_items, block_items_end);
1005
+ BlockExchange(temp_storage).WarpStripedToBlocked(dst_items, dst_items);
1006
+ }
1007
+
1008
+ template <typename RandomAccessIterator, typename DefaultT>
1009
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
1010
+ Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end, DefaultT oob_default)
1011
+ {
1012
+ LoadDirectWarpStriped(linear_tid, block_src_it, dst_items, block_items_end, oob_default);
1013
+ BlockExchange(temp_storage).WarpStripedToBlocked(dst_items, dst_items);
1014
+ }
1015
+ };
1016
+
1017
+ template <int DUMMY>
1018
+ struct LoadInternal<BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED, DUMMY>
1019
+ {
1020
+ static constexpr int WARP_THREADS = detail::warp_threads;
1021
+ static_assert(BLOCK_THREADS % WARP_THREADS == 0, "BLOCK_THREADS must be a multiple of WARP_THREADS");
1022
+
1023
+ using BlockExchange = BlockExchange<T, BLOCK_DIM_X, ITEMS_PER_THREAD, true, BLOCK_DIM_Y, BLOCK_DIM_Z>;
1024
+ using _TempStorage = typename BlockExchange::TempStorage;
1025
+ using TempStorage = Uninitialized<_TempStorage>;
1026
+
1027
+ _TempStorage& temp_storage;
1028
+ int linear_tid;
1029
+
1030
+ _CCCL_DEVICE _CCCL_FORCEINLINE LoadInternal(TempStorage& temp_storage, int linear_tid)
1031
+ : temp_storage(temp_storage.Alias())
1032
+ , linear_tid(linear_tid)
1033
+ {}
1034
+
1035
+ template <typename RandomAccessIterator>
1036
+ _CCCL_DEVICE _CCCL_FORCEINLINE void Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD])
1037
+ {
1038
+ LoadDirectWarpStriped(linear_tid, block_src_it, dst_items);
1039
+ BlockExchange(temp_storage).WarpStripedToBlocked(dst_items, dst_items);
1040
+ }
1041
+
1042
+ template <typename RandomAccessIterator>
1043
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
1044
+ Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end)
1045
+ {
1046
+ LoadDirectWarpStriped(linear_tid, block_src_it, dst_items, block_items_end);
1047
+ BlockExchange(temp_storage).WarpStripedToBlocked(dst_items, dst_items);
1048
+ }
1049
+
1050
+ template <typename RandomAccessIterator, typename DefaultT>
1051
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
1052
+ Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end, DefaultT oob_default)
1053
+ {
1054
+ LoadDirectWarpStriped(linear_tid, block_src_it, dst_items, block_items_end, oob_default);
1055
+ BlockExchange(temp_storage).WarpStripedToBlocked(dst_items, dst_items);
1056
+ }
1057
+ };
1058
+
1059
+ using InternalLoad = LoadInternal<ALGORITHM, 0>; // load implementation to use
1060
+ using _TempStorage = typename InternalLoad::TempStorage;
1061
+
1062
+ // Internal storage allocator
1063
+ _CCCL_DEVICE _CCCL_FORCEINLINE _TempStorage& PrivateStorage()
1064
+ {
1065
+ __shared__ _TempStorage private_storage;
1066
+ return private_storage;
1067
+ }
1068
+
1069
+ _TempStorage& temp_storage;
1070
+ int linear_tid;
1071
+
1072
+ public:
1073
+ /// @smemstorage{BlockLoad}
1074
+ using TempStorage = Uninitialized<_TempStorage>;
1075
+
1076
+ //! @name Collective constructors
1077
+ //! @{
1078
+
1079
+ /// @brief Collective constructor using a private static allocation of shared memory as temporary storage.
1080
+ _CCCL_DEVICE _CCCL_FORCEINLINE BlockLoad()
1081
+ : temp_storage(PrivateStorage())
1082
+ , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
1083
+ {}
1084
+
1085
+ /// @brief Collective constructor using the specified memory allocation as temporary storage.
1086
+ /// @param[in] temp_storage Reference to memory allocation having layout type TempStorage
1087
+ _CCCL_DEVICE _CCCL_FORCEINLINE BlockLoad(TempStorage& temp_storage)
1088
+ : temp_storage(temp_storage.Alias())
1089
+ , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
1090
+ {}
1091
+
1092
+ //! @} end member group
1093
+ //! @name Data movement
1094
+ //! @{
1095
+
1096
+ //! @rst
1097
+ //! Load a linear segment of items from memory.
1098
+ //!
1099
+ //! - @blocked
1100
+ //! - @smemreuse
1101
+ //!
1102
+ //! Snippet
1103
+ //! +++++++
1104
+ //!
1105
+ //! The code snippet below illustrates the loading of a linear segment of 512 integers into a "blocked" arrangement
1106
+ //! across 128 threads where each thread owns 4 consecutive items. The load is specialized for
1107
+ //! ``BLOCK_LOAD_WARP_TRANSPOSE``, meaning memory references are efficiently coalesced using a warp-striped access
1108
+ //! pattern (after which items are locally reordered among threads).
1109
+ //!
1110
+ //! .. code-block:: c++
1111
+ //!
1112
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_load.cuh>
1113
+ //!
1114
+ //! __global__ void ExampleKernel(int *d_data, ...)
1115
+ //! {
1116
+ //! // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each
1117
+ //! using BlockLoad = cub::BlockLoad<int, 128, 4, BLOCK_LOAD_WARP_TRANSPOSE>;
1118
+ //!
1119
+ //! // Allocate shared memory for BlockLoad
1120
+ //! __shared__ typename BlockLoad::TempStorage temp_storage;
1121
+ //!
1122
+ //! // Load a segment of consecutive items that are blocked across threads
1123
+ //! int thread_data[4];
1124
+ //! BlockLoad(temp_storage).Load(d_data, thread_data);
1125
+ //!
1126
+ //! Suppose the input ``d_data`` is ``0, 1, 2, 3, 4, 5, ...``. The set of ``thread_data`` across the block of threads
1127
+ //! in those threads will be ``{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }``.
1128
+ //!
1129
+ //! @endrst
1130
+ //!
1131
+ //! @param[in] block_src_it
1132
+ //! The thread block's base iterator for loading from
1133
+ //!
1134
+ //! @param[out] dst_items
1135
+ //! Destination to load data into
1136
+ template <typename RandomAccessIterator>
1137
+ _CCCL_DEVICE _CCCL_FORCEINLINE void Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD])
1138
+ {
1139
+ InternalLoad(temp_storage, linear_tid).Load(block_src_it, dst_items);
1140
+ }
1141
+
1142
+ //! @rst
1143
+ //!
1144
+ //! Load a linear segment of items from memory, guarded by range.
1145
+ //!
1146
+ //! - @blocked
1147
+ //! - @smemreuse
1148
+ //!
1149
+ //! Snippet
1150
+ //! +++++++
1151
+ //!
1152
+ //! The code snippet below illustrates the guarded loading of a linear segment of 512 integers into a "blocked"
1153
+ //! arrangement across 128 threads where each thread owns 4 consecutive items. The load is specialized for
1154
+ //! ``BLOCK_LOAD_WARP_TRANSPOSE``, meaning memory references are efficiently coalesced using a warp-striped access
1155
+ //! pattern (after which items are locally reordered among threads).
1156
+ //!
1157
+ //! .. code-block:: c++
1158
+ //!
1159
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_load.cuh>
1160
+ //!
1161
+ //! __global__ void ExampleKernel(int *d_data, int block_items_end, ...)
1162
+ //! {
1163
+ //! // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each
1164
+ //! using BlockLoad = cub::BlockLoad<int, 128, 4, BLOCK_LOAD_WARP_TRANSPOSE>;
1165
+ //!
1166
+ //! // Allocate shared memory for BlockLoad
1167
+ //! __shared__ typename BlockLoad::TempStorage temp_storage;
1168
+ //!
1169
+ //! // Load a segment of consecutive items that are blocked across threads
1170
+ //! int thread_data[4];
1171
+ //! BlockLoad(temp_storage).Load(d_data, thread_data, block_items_end);
1172
+ //!
1173
+ //! Suppose the input ``d_data`` is ``0, 1, 2, 3, 4, 5, 6...`` and ``block_items_end`` is ``5``. The set of
1174
+ //! ``thread_data`` across the block of threads in those threads will be ``{ [0,1,2,3], [4,?,?,?], ..., [?,?,?,?] }``,
1175
+ //! with only the first two threads being unmasked to load portions of valid data (and other items remaining
1176
+ //! unassigned).
1177
+ //!
1178
+ //! @endrst
1179
+ //!
1180
+ //! @param[in] block_src_it
1181
+ //! The thread block's base iterator for loading from
1182
+ //!
1183
+ //! @param[out] dst_items
1184
+ //! Destination to load data into
1185
+ //!
1186
+ //! @param[in] block_items_end
1187
+ //! Number of valid items to load
1188
+ template <typename RandomAccessIterator>
1189
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
1190
+ Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end)
1191
+ {
1192
+ InternalLoad(temp_storage, linear_tid).Load(block_src_it, dst_items, block_items_end);
1193
+ }
1194
+
1195
+ //! @rst
1196
+ //! Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements
1197
+ //!
1198
+ //! - @blocked
1199
+ //! - @smemreuse
1200
+ //!
1201
+ //! Snippet
1202
+ //! +++++++
1203
+ //!
1204
+ //! The code snippet below illustrates the guarded loading of a linear segment of 512 integers into a "blocked"
1205
+ //! arrangement across 128 threads where each thread owns 4 consecutive items. The load is specialized for
1206
+ //! ``BLOCK_LOAD_WARP_TRANSPOSE``, meaning memory references are efficiently coalesced using a warp-striped access
1207
+ //! pattern (after which items are locally reordered among threads).
1208
+ //!
1209
+ //! .. code-block:: c++
1210
+ //!
1211
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_load.cuh>
1212
+ //!
1213
+ //! __global__ void ExampleKernel(int *d_data, int block_items_end, ...)
1214
+ //! {
1215
+ //! // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each
1216
+ //! using BlockLoad = cub::BlockLoad<int, 128, 4, BLOCK_LOAD_WARP_TRANSPOSE>;
1217
+ //!
1218
+ //! // Allocate shared memory for BlockLoad
1219
+ //! __shared__ typename BlockLoad::TempStorage temp_storage;
1220
+ //!
1221
+ //! // Load a segment of consecutive items that are blocked across threads
1222
+ //! int thread_data[4];
1223
+ //! BlockLoad(temp_storage).Load(d_data, thread_data, block_items_end, -1);
1224
+ //!
1225
+ //! Suppose the input ``d_data`` is ``0, 1, 2, 3, 4, 5, 6...``, ``block_items_end`` is ``5``, and the out-of-bounds
1226
+ //! default is ``-1``. The set of ``thread_data`` across the block of threads in those threads will be
1227
+ //! ``{ [0,1,2,3], [4,-1,-1,-1], ..., [-1,-1,-1,-1] }``, with only the first two threads being unmasked to load
1228
+ //! portions of valid data (and other items are assigned ``-1``)
1229
+ //!
1230
+ //! @endrst
1231
+ //!
1232
+ //! @param[in] block_src_it
1233
+ //! The thread block's base iterator for loading from
1234
+ //!
1235
+ //! @param[out] dst_items
1236
+ //! Destination to load data into
1237
+ //!
1238
+ //! @param[in] block_items_end
1239
+ //! Number of valid items to load
1240
+ //!
1241
+ //! @param[in] oob_default
1242
+ //! Default value to assign out-of-bound items
1243
+ template <typename RandomAccessIterator, typename DefaultT>
1244
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
1245
+ Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end, DefaultT oob_default)
1246
+ {
1247
+ InternalLoad(temp_storage, linear_tid).Load(block_src_it, dst_items, block_items_end, oob_default);
1248
+ }
1249
+
1250
+ //! @} end member group
1251
+ };
1252
+
1253
+ template <class Policy, class It, class T = cub::detail::it_value_t<It>>
1254
+ struct BlockLoadType
1255
+ {
1256
+ using type = cub::BlockLoad<T, Policy::BLOCK_THREADS, Policy::ITEMS_PER_THREAD, Policy::LOAD_ALGORITHM>;
1257
+ };
1258
+
1259
+ CUB_NAMESPACE_END