cuda-cccl 0.1.3.1.0.dev1678__cp313-cp313-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cuda-cccl might be problematic. Click here for more details.

Files changed (1860)
  1. cuda/cccl/__init__.py +14 -0
  2. cuda/cccl/cooperative/__init__.py +3 -0
  3. cuda/cccl/cooperative/experimental/__init__.py +8 -0
  4. cuda/cccl/cooperative/experimental/_caching.py +48 -0
  5. cuda/cccl/cooperative/experimental/_common.py +273 -0
  6. cuda/cccl/cooperative/experimental/_nvrtc.py +91 -0
  7. cuda/cccl/cooperative/experimental/_scan_op.py +181 -0
  8. cuda/cccl/cooperative/experimental/_types.py +935 -0
  9. cuda/cccl/cooperative/experimental/_typing.py +107 -0
  10. cuda/cccl/cooperative/experimental/block/__init__.py +33 -0
  11. cuda/cccl/cooperative/experimental/block/_block_load_store.py +215 -0
  12. cuda/cccl/cooperative/experimental/block/_block_merge_sort.py +125 -0
  13. cuda/cccl/cooperative/experimental/block/_block_radix_sort.py +214 -0
  14. cuda/cccl/cooperative/experimental/block/_block_reduce.py +294 -0
  15. cuda/cccl/cooperative/experimental/block/_block_scan.py +983 -0
  16. cuda/cccl/cooperative/experimental/warp/__init__.py +9 -0
  17. cuda/cccl/cooperative/experimental/warp/_warp_merge_sort.py +98 -0
  18. cuda/cccl/cooperative/experimental/warp/_warp_reduce.py +153 -0
  19. cuda/cccl/cooperative/experimental/warp/_warp_scan.py +78 -0
  20. cuda/cccl/headers/__init__.py +7 -0
  21. cuda/cccl/headers/include/__init__.py +1 -0
  22. cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +261 -0
  23. cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +1181 -0
  24. cuda/cccl/headers/include/cub/agent/agent_for.cuh +84 -0
  25. cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +925 -0
  26. cuda/cccl/headers/include/cub/agent/agent_merge.cuh +227 -0
  27. cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +753 -0
  28. cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +766 -0
  29. cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +286 -0
  30. cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +704 -0
  31. cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +557 -0
  32. cuda/cccl/headers/include/cub/agent/agent_reduce.cuh +678 -0
  33. cuda/cccl/headers/include/cub/agent/agent_reduce_by_key.cuh +804 -0
  34. cuda/cccl/headers/include/cub/agent/agent_rle.cuh +997 -0
  35. cuda/cccl/headers/include/cub/agent/agent_scan.cuh +561 -0
  36. cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +473 -0
  37. cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +292 -0
  38. cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +1032 -0
  39. cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +342 -0
  40. cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +592 -0
  41. cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +614 -0
  42. cuda/cccl/headers/include/cub/agent/single_pass_scan_operators.cuh +1346 -0
  43. cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +965 -0
  44. cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +1217 -0
  45. cuda/cccl/headers/include/cub/block/block_exchange.cuh +1306 -0
  46. cuda/cccl/headers/include/cub/block/block_histogram.cuh +420 -0
  47. cuda/cccl/headers/include/cub/block/block_load.cuh +1259 -0
  48. cuda/cccl/headers/include/cub/block/block_merge_sort.cuh +787 -0
  49. cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +1218 -0
  50. cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +2193 -0
  51. cuda/cccl/headers/include/cub/block/block_raking_layout.cuh +150 -0
  52. cuda/cccl/headers/include/cub/block/block_reduce.cuh +629 -0
  53. cuda/cccl/headers/include/cub/block/block_run_length_decode.cuh +437 -0
  54. cuda/cccl/headers/include/cub/block/block_scan.cuh +2583 -0
  55. cuda/cccl/headers/include/cub/block/block_shuffle.cuh +346 -0
  56. cuda/cccl/headers/include/cub/block/block_store.cuh +1246 -0
  57. cuda/cccl/headers/include/cub/block/radix_rank_sort_operations.cuh +620 -0
  58. cuda/cccl/headers/include/cub/block/specializations/block_histogram_atomic.cuh +86 -0
  59. cuda/cccl/headers/include/cub/block/specializations/block_histogram_sort.cuh +240 -0
  60. cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking.cuh +252 -0
  61. cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking_commutative_only.cuh +238 -0
  62. cuda/cccl/headers/include/cub/block/specializations/block_reduce_warp_reductions.cuh +259 -0
  63. cuda/cccl/headers/include/cub/block/specializations/block_scan_raking.cuh +790 -0
  64. cuda/cccl/headers/include/cub/block/specializations/block_scan_warp_scans.cuh +538 -0
  65. cuda/cccl/headers/include/cub/config.cuh +60 -0
  66. cuda/cccl/headers/include/cub/cub.cuh +112 -0
  67. cuda/cccl/headers/include/cub/detail/array_utils.cuh +77 -0
  68. cuda/cccl/headers/include/cub/detail/choose_offset.cuh +155 -0
  69. cuda/cccl/headers/include/cub/detail/detect_cuda_runtime.cuh +93 -0
  70. cuda/cccl/headers/include/cub/detail/device_double_buffer.cuh +96 -0
  71. cuda/cccl/headers/include/cub/detail/fast_modulo_division.cuh +246 -0
  72. cuda/cccl/headers/include/cub/detail/launcher/cuda_driver.cuh +120 -0
  73. cuda/cccl/headers/include/cub/detail/launcher/cuda_runtime.cuh +74 -0
  74. cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +118 -0
  75. cuda/cccl/headers/include/cub/detail/ptx-json/README.md +71 -0
  76. cuda/cccl/headers/include/cub/detail/ptx-json/array.h +68 -0
  77. cuda/cccl/headers/include/cub/detail/ptx-json/json.h +61 -0
  78. cuda/cccl/headers/include/cub/detail/ptx-json/object.h +100 -0
  79. cuda/cccl/headers/include/cub/detail/ptx-json/string.h +71 -0
  80. cuda/cccl/headers/include/cub/detail/ptx-json/value.h +93 -0
  81. cuda/cccl/headers/include/cub/detail/ptx-json-parser.h +63 -0
  82. cuda/cccl/headers/include/cub/detail/rfa.cuh +724 -0
  83. cuda/cccl/headers/include/cub/detail/strong_load.cuh +189 -0
  84. cuda/cccl/headers/include/cub/detail/strong_store.cuh +220 -0
  85. cuda/cccl/headers/include/cub/detail/temporary_storage.cuh +384 -0
  86. cuda/cccl/headers/include/cub/detail/type_traits.cuh +206 -0
  87. cuda/cccl/headers/include/cub/detail/uninitialized_copy.cuh +72 -0
  88. cuda/cccl/headers/include/cub/detail/unsafe_bitcast.cuh +56 -0
  89. cuda/cccl/headers/include/cub/device/device_adjacent_difference.cuh +596 -0
  90. cuda/cccl/headers/include/cub/device/device_copy.cuh +187 -0
  91. cuda/cccl/headers/include/cub/device/device_for.cuh +990 -0
  92. cuda/cccl/headers/include/cub/device/device_histogram.cuh +1507 -0
  93. cuda/cccl/headers/include/cub/device/device_memcpy.cuh +195 -0
  94. cuda/cccl/headers/include/cub/device/device_merge.cuh +202 -0
  95. cuda/cccl/headers/include/cub/device/device_merge_sort.cuh +979 -0
  96. cuda/cccl/headers/include/cub/device/device_partition.cuh +664 -0
  97. cuda/cccl/headers/include/cub/device/device_radix_sort.cuh +3435 -0
  98. cuda/cccl/headers/include/cub/device/device_reduce.cuh +1815 -0
  99. cuda/cccl/headers/include/cub/device/device_run_length_encode.cuh +368 -0
  100. cuda/cccl/headers/include/cub/device/device_scan.cuh +1901 -0
  101. cuda/cccl/headers/include/cub/device/device_segmented_radix_sort.cuh +1496 -0
  102. cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +1512 -0
  103. cuda/cccl/headers/include/cub/device/device_segmented_sort.cuh +2811 -0
  104. cuda/cccl/headers/include/cub/device/device_select.cuh +1224 -0
  105. cuda/cccl/headers/include/cub/device/device_transform.cuh +313 -0
  106. cuda/cccl/headers/include/cub/device/dispatch/dispatch_adjacent_difference.cuh +314 -0
  107. cuda/cccl/headers/include/cub/device/dispatch/dispatch_advance_iterators.cuh +109 -0
  108. cuda/cccl/headers/include/cub/device/dispatch/dispatch_batch_memcpy.cuh +718 -0
  109. cuda/cccl/headers/include/cub/device/dispatch/dispatch_common.cuh +45 -0
  110. cuda/cccl/headers/include/cub/device/dispatch/dispatch_for.cuh +197 -0
  111. cuda/cccl/headers/include/cub/device/dispatch/dispatch_histogram.cuh +1051 -0
  112. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +305 -0
  113. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge_sort.cuh +473 -0
  114. cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +1748 -0
  115. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +1316 -0
  116. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_by_key.cuh +625 -0
  117. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +497 -0
  118. cuda/cccl/headers/include/cub/device/dispatch/dispatch_rle.cuh +548 -0
  119. cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan.cuh +497 -0
  120. cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan_by_key.cuh +598 -0
  121. cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +1374 -0
  122. cuda/cccl/headers/include/cub/device/dispatch/dispatch_select_if.cuh +838 -0
  123. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +341 -0
  124. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +439 -0
  125. cuda/cccl/headers/include/cub/device/dispatch/dispatch_three_way_partition.cuh +552 -0
  126. cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +467 -0
  127. cuda/cccl/headers/include/cub/device/dispatch/dispatch_unique_by_key.cuh +543 -0
  128. cuda/cccl/headers/include/cub/device/dispatch/kernels/for_each.cuh +218 -0
  129. cuda/cccl/headers/include/cub/device/dispatch/kernels/histogram.cuh +505 -0
  130. cuda/cccl/headers/include/cub/device/dispatch/kernels/merge_sort.cuh +338 -0
  131. cuda/cccl/headers/include/cub/device/dispatch/kernels/radix_sort.cuh +799 -0
  132. cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +525 -0
  133. cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +194 -0
  134. cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +330 -0
  135. cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +936 -0
  136. cuda/cccl/headers/include/cub/device/dispatch/kernels/unique_by_key.cuh +176 -0
  137. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +70 -0
  138. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +121 -0
  139. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +63 -0
  140. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +278 -0
  141. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +91 -0
  142. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +118 -0
  143. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +1068 -0
  144. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce.cuh +397 -0
  145. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +945 -0
  146. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +675 -0
  147. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +555 -0
  148. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +1013 -0
  149. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +249 -0
  150. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_select_if.cuh +1587 -0
  151. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +407 -0
  152. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +353 -0
  153. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +874 -0
  154. cuda/cccl/headers/include/cub/grid/grid_even_share.cuh +215 -0
  155. cuda/cccl/headers/include/cub/grid/grid_mapping.cuh +106 -0
  156. cuda/cccl/headers/include/cub/grid/grid_queue.cuh +202 -0
  157. cuda/cccl/headers/include/cub/iterator/arg_index_input_iterator.cuh +256 -0
  158. cuda/cccl/headers/include/cub/iterator/cache_modified_input_iterator.cuh +238 -0
  159. cuda/cccl/headers/include/cub/iterator/cache_modified_output_iterator.cuh +252 -0
  160. cuda/cccl/headers/include/cub/iterator/tex_obj_input_iterator.cuh +322 -0
  161. cuda/cccl/headers/include/cub/thread/thread_load.cuh +347 -0
  162. cuda/cccl/headers/include/cub/thread/thread_operators.cuh +629 -0
  163. cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +504 -0
  164. cuda/cccl/headers/include/cub/thread/thread_scan.cuh +340 -0
  165. cuda/cccl/headers/include/cub/thread/thread_search.cuh +198 -0
  166. cuda/cccl/headers/include/cub/thread/thread_simd.cuh +406 -0
  167. cuda/cccl/headers/include/cub/thread/thread_sort.cuh +101 -0
  168. cuda/cccl/headers/include/cub/thread/thread_store.cuh +364 -0
  169. cuda/cccl/headers/include/cub/util_allocator.cuh +921 -0
  170. cuda/cccl/headers/include/cub/util_arch.cuh +167 -0
  171. cuda/cccl/headers/include/cub/util_cpp_dialect.cuh +95 -0
  172. cuda/cccl/headers/include/cub/util_debug.cuh +207 -0
  173. cuda/cccl/headers/include/cub/util_device.cuh +779 -0
  174. cuda/cccl/headers/include/cub/util_macro.cuh +91 -0
  175. cuda/cccl/headers/include/cub/util_math.cuh +115 -0
  176. cuda/cccl/headers/include/cub/util_namespace.cuh +176 -0
  177. cuda/cccl/headers/include/cub/util_policy_wrapper_t.cuh +55 -0
  178. cuda/cccl/headers/include/cub/util_ptx.cuh +513 -0
  179. cuda/cccl/headers/include/cub/util_temporary_storage.cuh +122 -0
  180. cuda/cccl/headers/include/cub/util_type.cuh +1136 -0
  181. cuda/cccl/headers/include/cub/util_vsmem.cuh +251 -0
  182. cuda/cccl/headers/include/cub/version.cuh +89 -0
  183. cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_shfl.cuh +329 -0
  184. cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_smem.cuh +177 -0
  185. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +729 -0
  186. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +405 -0
  187. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +688 -0
  188. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_smem.cuh +437 -0
  189. cuda/cccl/headers/include/cub/warp/warp_exchange.cuh +405 -0
  190. cuda/cccl/headers/include/cub/warp/warp_load.cuh +614 -0
  191. cuda/cccl/headers/include/cub/warp/warp_merge_sort.cuh +169 -0
  192. cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +822 -0
  193. cuda/cccl/headers/include/cub/warp/warp_scan.cuh +1156 -0
  194. cuda/cccl/headers/include/cub/warp/warp_store.cuh +520 -0
  195. cuda/cccl/headers/include/cuda/__annotated_ptr/access_property.h +165 -0
  196. cuda/cccl/headers/include/cuda/__annotated_ptr/access_property_encoding.h +172 -0
  197. cuda/cccl/headers/include/cuda/__annotated_ptr/annotated_ptr.h +210 -0
  198. cuda/cccl/headers/include/cuda/__annotated_ptr/annotated_ptr_base.h +100 -0
  199. cuda/cccl/headers/include/cuda/__annotated_ptr/apply_access_property.h +84 -0
  200. cuda/cccl/headers/include/cuda/__annotated_ptr/associate_access_property.h +127 -0
  201. cuda/cccl/headers/include/cuda/__annotated_ptr/createpolicy.h +209 -0
  202. cuda/cccl/headers/include/cuda/__atomic/atomic.h +145 -0
  203. cuda/cccl/headers/include/cuda/__barrier/aligned_size.h +61 -0
  204. cuda/cccl/headers/include/cuda/__barrier/async_contract_fulfillment.h +39 -0
  205. cuda/cccl/headers/include/cuda/__barrier/barrier.h +65 -0
  206. cuda/cccl/headers/include/cuda/__barrier/barrier_arrive_tx.h +100 -0
  207. cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +454 -0
  208. cuda/cccl/headers/include/cuda/__barrier/barrier_expect_tx.h +72 -0
  209. cuda/cccl/headers/include/cuda/__barrier/barrier_native_handle.h +45 -0
  210. cuda/cccl/headers/include/cuda/__barrier/barrier_thread_scope.h +60 -0
  211. cuda/cccl/headers/include/cuda/__bit/bit_reverse.h +171 -0
  212. cuda/cccl/headers/include/cuda/__bit/bitfield.h +121 -0
  213. cuda/cccl/headers/include/cuda/__bit/bitmask.h +88 -0
  214. cuda/cccl/headers/include/cuda/__cccl_config +36 -0
  215. cuda/cccl/headers/include/cuda/__cmath/ceil_div.h +124 -0
  216. cuda/cccl/headers/include/cuda/__cmath/ilog.h +195 -0
  217. cuda/cccl/headers/include/cuda/__cmath/ipow.h +107 -0
  218. cuda/cccl/headers/include/cuda/__cmath/isqrt.h +80 -0
  219. cuda/cccl/headers/include/cuda/__cmath/neg.h +47 -0
  220. cuda/cccl/headers/include/cuda/__cmath/pow2.h +74 -0
  221. cuda/cccl/headers/include/cuda/__cmath/round_down.h +102 -0
  222. cuda/cccl/headers/include/cuda/__cmath/round_up.h +104 -0
  223. cuda/cccl/headers/include/cuda/__cmath/uabs.h +57 -0
  224. cuda/cccl/headers/include/cuda/__execution/determinism.h +90 -0
  225. cuda/cccl/headers/include/cuda/__execution/require.h +74 -0
  226. cuda/cccl/headers/include/cuda/__execution/tune.h +69 -0
  227. cuda/cccl/headers/include/cuda/__functional/address_stability.h +131 -0
  228. cuda/cccl/headers/include/cuda/__functional/for_each_canceled.h +276 -0
  229. cuda/cccl/headers/include/cuda/__functional/get_device_address.h +58 -0
  230. cuda/cccl/headers/include/cuda/__functional/maximum.h +58 -0
  231. cuda/cccl/headers/include/cuda/__functional/minimum.h +58 -0
  232. cuda/cccl/headers/include/cuda/__functional/proclaim_return_type.h +108 -0
  233. cuda/cccl/headers/include/cuda/__fwd/barrier.h +38 -0
  234. cuda/cccl/headers/include/cuda/__fwd/barrier_native_handle.h +42 -0
  235. cuda/cccl/headers/include/cuda/__fwd/get_stream.h +38 -0
  236. cuda/cccl/headers/include/cuda/__fwd/pipeline.h +37 -0
  237. cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +257 -0
  238. cuda/cccl/headers/include/cuda/__iterator/counting_iterator.h +460 -0
  239. cuda/cccl/headers/include/cuda/__iterator/discard_iterator.h +314 -0
  240. cuda/cccl/headers/include/cuda/__iterator/permutation_iterator.h +421 -0
  241. cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +321 -0
  242. cuda/cccl/headers/include/cuda/__iterator/tabulate_output_iterator.h +333 -0
  243. cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +465 -0
  244. cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +456 -0
  245. cuda/cccl/headers/include/cuda/__latch/latch.h +44 -0
  246. cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +462 -0
  247. cuda/cccl/headers/include/cuda/__mdspan/host_device_mdspan.h +63 -0
  248. cuda/cccl/headers/include/cuda/__mdspan/restrict_accessor.h +122 -0
  249. cuda/cccl/headers/include/cuda/__mdspan/restrict_mdspan.h +51 -0
  250. cuda/cccl/headers/include/cuda/__memcpy_async/check_preconditions.h +79 -0
  251. cuda/cccl/headers/include/cuda/__memcpy_async/completion_mechanism.h +47 -0
  252. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_bulk_shared_global.h +60 -0
  253. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_fallback.h +72 -0
  254. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_shared_global.h +98 -0
  255. cuda/cccl/headers/include/cuda/__memcpy_async/dispatch_memcpy_async.h +162 -0
  256. cuda/cccl/headers/include/cuda/__memcpy_async/is_local_smem_barrier.h +49 -0
  257. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async.h +179 -0
  258. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_barrier.h +99 -0
  259. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_tx.h +99 -0
  260. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_completion.h +170 -0
  261. cuda/cccl/headers/include/cuda/__memcpy_async/try_get_barrier_handle.h +59 -0
  262. cuda/cccl/headers/include/cuda/__memory/address_space.h +86 -0
  263. cuda/cccl/headers/include/cuda/__memory/align_down.h +56 -0
  264. cuda/cccl/headers/include/cuda/__memory/align_up.h +56 -0
  265. cuda/cccl/headers/include/cuda/__memory/is_aligned.h +47 -0
  266. cuda/cccl/headers/include/cuda/__memory/ptr_rebind.h +75 -0
  267. cuda/cccl/headers/include/cuda/__memory_resource/get_memory_resource.h +94 -0
  268. cuda/cccl/headers/include/cuda/__memory_resource/get_property.h +157 -0
  269. cuda/cccl/headers/include/cuda/__memory_resource/properties.h +73 -0
  270. cuda/cccl/headers/include/cuda/__memory_resource/resource.h +129 -0
  271. cuda/cccl/headers/include/cuda/__memory_resource/resource_ref.h +653 -0
  272. cuda/cccl/headers/include/cuda/__numeric/narrow.h +108 -0
  273. cuda/cccl/headers/include/cuda/__numeric/overflow_cast.h +57 -0
  274. cuda/cccl/headers/include/cuda/__numeric/overflow_result.h +43 -0
  275. cuda/cccl/headers/include/cuda/__nvtx/nvtx.h +101 -0
  276. cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +2982 -0
  277. cuda/cccl/headers/include/cuda/__ptx/instructions/barrier_cluster.h +43 -0
  278. cuda/cccl/headers/include/cuda/__ptx/instructions/bfind.h +41 -0
  279. cuda/cccl/headers/include/cuda/__ptx/instructions/bmsk.h +41 -0
  280. cuda/cccl/headers/include/cuda/__ptx/instructions/clusterlaunchcontrol.h +41 -0
  281. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk.h +44 -0
  282. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_commit_group.h +43 -0
  283. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h +45 -0
  284. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_wait_group.h +43 -0
  285. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_mbarrier_arrive.h +42 -0
  286. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_reduce_async_bulk.h +60 -0
  287. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_reduce_async_bulk_tensor.h +43 -0
  288. cuda/cccl/headers/include/cuda/__ptx/instructions/elect_sync.h +41 -0
  289. cuda/cccl/headers/include/cuda/__ptx/instructions/exit.h +41 -0
  290. cuda/cccl/headers/include/cuda/__ptx/instructions/fence.h +49 -0
  291. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/barrier_cluster.h +115 -0
  292. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/bfind.h +190 -0
  293. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/bmsk.h +54 -0
  294. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/clusterlaunchcontrol.h +240 -0
  295. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk.h +193 -0
  296. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.h +25 -0
  297. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.h +52 -0
  298. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.h +957 -0
  299. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_gather_scatter.h +288 -0
  300. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.h +596 -0
  301. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.h +46 -0
  302. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive.h +26 -0
  303. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive_noinc.h +26 -0
  304. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.h +1445 -0
  305. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.h +132 -0
  306. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.h +117 -0
  307. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.h +601 -0
  308. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/elect_sync.h +36 -0
  309. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/exit.h +25 -0
  310. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence.h +208 -0
  311. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.h +31 -0
  312. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_alias.h +25 -0
  313. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_async.h +58 -0
  314. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_async_generic_sync_restrict.h +62 -0
  315. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.h +101 -0
  316. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_sync_restrict.h +62 -0
  317. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/get_sreg.h +949 -0
  318. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/getctarank.h +32 -0
  319. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/ld.h +15074 -0
  320. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive.h +385 -0
  321. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.h +176 -0
  322. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.h +34 -0
  323. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_expect_tx.h +94 -0
  324. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_init.h +27 -0
  325. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.h +137 -0
  326. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.h +138 -0
  327. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.h +280 -0
  328. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.h +282 -0
  329. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_ld_reduce.h +2148 -0
  330. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_red.h +1272 -0
  331. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_st.h +228 -0
  332. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/prmt.h +230 -0
  333. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/red_async.h +430 -0
  334. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/shl.h +96 -0
  335. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/shr.h +168 -0
  336. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st.h +1830 -0
  337. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st_async.h +123 -0
  338. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st_bulk.h +31 -0
  339. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_alloc.h +105 -0
  340. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_commit.h +81 -0
  341. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_cp.h +612 -0
  342. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_fence.h +44 -0
  343. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_ld.h +4446 -0
  344. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_mma.h +4061 -0
  345. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_mma_ws.h +6438 -0
  346. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_shift.h +36 -0
  347. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_st.h +4582 -0
  348. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_wait.h +44 -0
  349. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.h +67 -0
  350. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tensormap_replace.h +750 -0
  351. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/trap.h +25 -0
  352. cuda/cccl/headers/include/cuda/__ptx/instructions/get_sreg.h +43 -0
  353. cuda/cccl/headers/include/cuda/__ptx/instructions/getctarank.h +43 -0
  354. cuda/cccl/headers/include/cuda/__ptx/instructions/ld.h +41 -0
  355. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_arrive.h +45 -0
  356. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_expect_tx.h +41 -0
  357. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_init.h +43 -0
  358. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_wait.h +46 -0
  359. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_ld_reduce.h +41 -0
  360. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_red.h +41 -0
  361. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_st.h +41 -0
  362. cuda/cccl/headers/include/cuda/__ptx/instructions/prmt.h +41 -0
  363. cuda/cccl/headers/include/cuda/__ptx/instructions/red_async.h +43 -0
  364. cuda/cccl/headers/include/cuda/__ptx/instructions/shfl_sync.h +244 -0
  365. cuda/cccl/headers/include/cuda/__ptx/instructions/shl.h +41 -0
  366. cuda/cccl/headers/include/cuda/__ptx/instructions/shr.h +41 -0
  367. cuda/cccl/headers/include/cuda/__ptx/instructions/st.h +41 -0
  368. cuda/cccl/headers/include/cuda/__ptx/instructions/st_async.h +43 -0
  369. cuda/cccl/headers/include/cuda/__ptx/instructions/st_bulk.h +41 -0
  370. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_alloc.h +41 -0
  371. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_commit.h +41 -0
  372. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_cp.h +41 -0
  373. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_fence.h +41 -0
  374. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_ld.h +41 -0
  375. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_mma.h +41 -0
  376. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_mma_ws.h +41 -0
  377. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_shift.h +41 -0
  378. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_st.h +41 -0
  379. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_wait.h +41 -0
  380. cuda/cccl/headers/include/cuda/__ptx/instructions/tensormap_cp_fenceproxy.h +43 -0
  381. cuda/cccl/headers/include/cuda/__ptx/instructions/tensormap_replace.h +43 -0
  382. cuda/cccl/headers/include/cuda/__ptx/instructions/trap.h +41 -0
  383. cuda/cccl/headers/include/cuda/__ptx/ptx_dot_variants.h +230 -0
  384. cuda/cccl/headers/include/cuda/__ptx/ptx_helper_functions.h +151 -0
  385. cuda/cccl/headers/include/cuda/__semaphore/counting_semaphore.h +53 -0
  386. cuda/cccl/headers/include/cuda/__stream/get_stream.h +110 -0
  387. cuda/cccl/headers/include/cuda/__stream/stream_ref.h +163 -0
  388. cuda/cccl/headers/include/cuda/__type_traits/is_floating_point.h +47 -0
  389. cuda/cccl/headers/include/cuda/__utility/static_for.h +74 -0
  390. cuda/cccl/headers/include/cuda/__warp/lane_mask.h +326 -0
  391. cuda/cccl/headers/include/cuda/__warp/warp_match_all.h +65 -0
  392. cuda/cccl/headers/include/cuda/__warp/warp_shuffle.h +249 -0
  393. cuda/cccl/headers/include/cuda/access_property +26 -0
  394. cuda/cccl/headers/include/cuda/annotated_ptr +29 -0
  395. cuda/cccl/headers/include/cuda/atomic +27 -0
  396. cuda/cccl/headers/include/cuda/barrier +262 -0
  397. cuda/cccl/headers/include/cuda/bit +29 -0
  398. cuda/cccl/headers/include/cuda/cmath +35 -0
  399. cuda/cccl/headers/include/cuda/discard_memory +60 -0
  400. cuda/cccl/headers/include/cuda/functional +31 -0
  401. cuda/cccl/headers/include/cuda/iterator +34 -0
  402. cuda/cccl/headers/include/cuda/latch +27 -0
  403. cuda/cccl/headers/include/cuda/mdspan +28 -0
  404. cuda/cccl/headers/include/cuda/memory +32 -0
  405. cuda/cccl/headers/include/cuda/memory_resource +41 -0
  406. cuda/cccl/headers/include/cuda/numeric +28 -0
  407. cuda/cccl/headers/include/cuda/pipeline +577 -0
  408. cuda/cccl/headers/include/cuda/ptx +124 -0
  409. cuda/cccl/headers/include/cuda/semaphore +31 -0
  410. cuda/cccl/headers/include/cuda/std/__algorithm/adjacent_find.h +59 -0
  411. cuda/cccl/headers/include/cuda/std/__algorithm/all_of.h +45 -0
  412. cuda/cccl/headers/include/cuda/std/__algorithm/any_of.h +45 -0
  413. cuda/cccl/headers/include/cuda/std/__algorithm/binary_search.h +52 -0
  414. cuda/cccl/headers/include/cuda/std/__algorithm/clamp.h +48 -0
  415. cuda/cccl/headers/include/cuda/std/__algorithm/comp.h +64 -0
  416. cuda/cccl/headers/include/cuda/std/__algorithm/comp_ref_type.h +85 -0
  417. cuda/cccl/headers/include/cuda/std/__algorithm/copy.h +142 -0
  418. cuda/cccl/headers/include/cuda/std/__algorithm/copy_backward.h +80 -0
  419. cuda/cccl/headers/include/cuda/std/__algorithm/copy_if.h +47 -0
  420. cuda/cccl/headers/include/cuda/std/__algorithm/copy_n.h +73 -0
  421. cuda/cccl/headers/include/cuda/std/__algorithm/count.h +49 -0
  422. cuda/cccl/headers/include/cuda/std/__algorithm/count_if.h +49 -0
  423. cuda/cccl/headers/include/cuda/std/__algorithm/equal.h +128 -0
  424. cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +101 -0
  425. cuda/cccl/headers/include/cuda/std/__algorithm/fill.h +58 -0
  426. cuda/cccl/headers/include/cuda/std/__algorithm/fill_n.h +51 -0
  427. cuda/cccl/headers/include/cuda/std/__algorithm/find.h +62 -0
  428. cuda/cccl/headers/include/cuda/std/__algorithm/find_end.h +225 -0
  429. cuda/cccl/headers/include/cuda/std/__algorithm/find_first_of.h +73 -0
  430. cuda/cccl/headers/include/cuda/std/__algorithm/find_if.h +46 -0
  431. cuda/cccl/headers/include/cuda/std/__algorithm/find_if_not.h +46 -0
  432. cuda/cccl/headers/include/cuda/std/__algorithm/for_each.h +42 -0
  433. cuda/cccl/headers/include/cuda/std/__algorithm/for_each_n.h +48 -0
  434. cuda/cccl/headers/include/cuda/std/__algorithm/generate.h +41 -0
  435. cuda/cccl/headers/include/cuda/std/__algorithm/generate_n.h +46 -0
  436. cuda/cccl/headers/include/cuda/std/__algorithm/half_positive.h +49 -0
  437. cuda/cccl/headers/include/cuda/std/__algorithm/includes.h +92 -0
  438. cuda/cccl/headers/include/cuda/std/__algorithm/is_heap.h +50 -0
  439. cuda/cccl/headers/include/cuda/std/__algorithm/is_heap_until.h +83 -0
  440. cuda/cccl/headers/include/cuda/std/__algorithm/is_partitioned.h +57 -0
  441. cuda/cccl/headers/include/cuda/std/__algorithm/is_permutation.h +252 -0
  442. cuda/cccl/headers/include/cuda/std/__algorithm/is_sorted.h +49 -0
  443. cuda/cccl/headers/include/cuda/std/__algorithm/is_sorted_until.h +68 -0
  444. cuda/cccl/headers/include/cuda/std/__algorithm/iter_swap.h +82 -0
  445. cuda/cccl/headers/include/cuda/std/__algorithm/iterator_operations.h +185 -0
  446. cuda/cccl/headers/include/cuda/std/__algorithm/lexicographical_compare.h +68 -0
  447. cuda/cccl/headers/include/cuda/std/__algorithm/lower_bound.h +82 -0
  448. cuda/cccl/headers/include/cuda/std/__algorithm/make_heap.h +70 -0
  449. cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +96 -0
  450. cuda/cccl/headers/include/cuda/std/__algorithm/max.h +62 -0
  451. cuda/cccl/headers/include/cuda/std/__algorithm/max_element.h +68 -0
  452. cuda/cccl/headers/include/cuda/std/__algorithm/merge.h +89 -0
  453. cuda/cccl/headers/include/cuda/std/__algorithm/min.h +62 -0
  454. cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +87 -0
  455. cuda/cccl/headers/include/cuda/std/__algorithm/minmax.h +66 -0
  456. cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +140 -0
  457. cuda/cccl/headers/include/cuda/std/__algorithm/mismatch.h +83 -0
  458. cuda/cccl/headers/include/cuda/std/__algorithm/move.h +87 -0
  459. cuda/cccl/headers/include/cuda/std/__algorithm/move_backward.h +84 -0
  460. cuda/cccl/headers/include/cuda/std/__algorithm/next_permutation.h +88 -0
  461. cuda/cccl/headers/include/cuda/std/__algorithm/none_of.h +45 -0
  462. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort.h +102 -0
  463. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort_copy.h +122 -0
  464. cuda/cccl/headers/include/cuda/std/__algorithm/partition.h +120 -0
  465. cuda/cccl/headers/include/cuda/std/__algorithm/partition_copy.h +59 -0
  466. cuda/cccl/headers/include/cuda/std/__algorithm/partition_point.h +61 -0
  467. cuda/cccl/headers/include/cuda/std/__algorithm/pop_heap.h +94 -0
  468. cuda/cccl/headers/include/cuda/std/__algorithm/prev_permutation.h +88 -0
  469. cuda/cccl/headers/include/cuda/std/__algorithm/push_heap.h +101 -0
  470. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_iterator_concept.h +65 -0
  471. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_min.h +98 -0
  472. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_min_element.h +68 -0
  473. cuda/cccl/headers/include/cuda/std/__algorithm/remove.h +55 -0
  474. cuda/cccl/headers/include/cuda/std/__algorithm/remove_copy.h +47 -0
  475. cuda/cccl/headers/include/cuda/std/__algorithm/remove_copy_if.h +47 -0
  476. cuda/cccl/headers/include/cuda/std/__algorithm/remove_if.h +56 -0
  477. cuda/cccl/headers/include/cuda/std/__algorithm/replace.h +45 -0
  478. cuda/cccl/headers/include/cuda/std/__algorithm/replace_copy.h +54 -0
  479. cuda/cccl/headers/include/cuda/std/__algorithm/replace_copy_if.h +50 -0
  480. cuda/cccl/headers/include/cuda/std/__algorithm/replace_if.h +45 -0
  481. cuda/cccl/headers/include/cuda/std/__algorithm/reverse.h +81 -0
  482. cuda/cccl/headers/include/cuda/std/__algorithm/reverse_copy.h +43 -0
  483. cuda/cccl/headers/include/cuda/std/__algorithm/rotate.h +261 -0
  484. cuda/cccl/headers/include/cuda/std/__algorithm/rotate_copy.h +40 -0
  485. cuda/cccl/headers/include/cuda/std/__algorithm/search.h +185 -0
  486. cuda/cccl/headers/include/cuda/std/__algorithm/search_n.h +163 -0
  487. cuda/cccl/headers/include/cuda/std/__algorithm/set_difference.h +95 -0
  488. cuda/cccl/headers/include/cuda/std/__algorithm/set_intersection.h +122 -0
  489. cuda/cccl/headers/include/cuda/std/__algorithm/set_symmetric_difference.h +134 -0
  490. cuda/cccl/headers/include/cuda/std/__algorithm/set_union.h +128 -0
  491. cuda/cccl/headers/include/cuda/std/__algorithm/shift_left.h +84 -0
  492. cuda/cccl/headers/include/cuda/std/__algorithm/shift_right.h +144 -0
  493. cuda/cccl/headers/include/cuda/std/__algorithm/sift_down.h +139 -0
  494. cuda/cccl/headers/include/cuda/std/__algorithm/sort_heap.h +70 -0
  495. cuda/cccl/headers/include/cuda/std/__algorithm/swap_ranges.h +78 -0
  496. cuda/cccl/headers/include/cuda/std/__algorithm/transform.h +59 -0
  497. cuda/cccl/headers/include/cuda/std/__algorithm/unique.h +76 -0
  498. cuda/cccl/headers/include/cuda/std/__algorithm/unique_copy.h +155 -0
  499. cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_iter.h +95 -0
  500. cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_range.h +126 -0
  501. cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +83 -0
  502. cuda/cccl/headers/include/cuda/std/__algorithm_ +26 -0
  503. cuda/cccl/headers/include/cuda/std/__atomic/api/common.h +192 -0
  504. cuda/cccl/headers/include/cuda/std/__atomic/api/owned.h +138 -0
  505. cuda/cccl/headers/include/cuda/std/__atomic/api/reference.h +118 -0
  506. cuda/cccl/headers/include/cuda/std/__atomic/functions/common.h +58 -0
  507. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_local.h +218 -0
  508. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_derived.h +401 -0
  509. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated.h +3971 -0
  510. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h +177 -0
  511. cuda/cccl/headers/include/cuda/std/__atomic/functions/host.h +211 -0
  512. cuda/cccl/headers/include/cuda/std/__atomic/functions.h +33 -0
  513. cuda/cccl/headers/include/cuda/std/__atomic/order.h +159 -0
  514. cuda/cccl/headers/include/cuda/std/__atomic/platform/msvc_to_builtins.h +654 -0
  515. cuda/cccl/headers/include/cuda/std/__atomic/platform.h +93 -0
  516. cuda/cccl/headers/include/cuda/std/__atomic/scopes.h +105 -0
  517. cuda/cccl/headers/include/cuda/std/__atomic/types/base.h +250 -0
  518. cuda/cccl/headers/include/cuda/std/__atomic/types/common.h +105 -0
  519. cuda/cccl/headers/include/cuda/std/__atomic/types/locked.h +225 -0
  520. cuda/cccl/headers/include/cuda/std/__atomic/types/reference.h +73 -0
  521. cuda/cccl/headers/include/cuda/std/__atomic/types/small.h +228 -0
  522. cuda/cccl/headers/include/cuda/std/__atomic/types.h +52 -0
  523. cuda/cccl/headers/include/cuda/std/__atomic/wait/notify_wait.h +95 -0
  524. cuda/cccl/headers/include/cuda/std/__atomic/wait/polling.h +65 -0
  525. cuda/cccl/headers/include/cuda/std/__barrier/barrier.h +227 -0
  526. cuda/cccl/headers/include/cuda/std/__barrier/empty_completion.h +37 -0
  527. cuda/cccl/headers/include/cuda/std/__barrier/poll_tester.h +82 -0
  528. cuda/cccl/headers/include/cuda/std/__bit/bit_cast.h +77 -0
  529. cuda/cccl/headers/include/cuda/std/__bit/byteswap.h +183 -0
  530. cuda/cccl/headers/include/cuda/std/__bit/countl.h +167 -0
  531. cuda/cccl/headers/include/cuda/std/__bit/countr.h +185 -0
  532. cuda/cccl/headers/include/cuda/std/__bit/endian.h +39 -0
  533. cuda/cccl/headers/include/cuda/std/__bit/has_single_bit.h +43 -0
  534. cuda/cccl/headers/include/cuda/std/__bit/integral.h +124 -0
  535. cuda/cccl/headers/include/cuda/std/__bit/popcount.h +154 -0
  536. cuda/cccl/headers/include/cuda/std/__bit/reference.h +1270 -0
  537. cuda/cccl/headers/include/cuda/std/__bit/rotate.h +94 -0
  538. cuda/cccl/headers/include/cuda/std/__cccl/architecture.h +78 -0
  539. cuda/cccl/headers/include/cuda/std/__cccl/assert.h +150 -0
  540. cuda/cccl/headers/include/cuda/std/__cccl/attributes.h +207 -0
  541. cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +787 -0
  542. cuda/cccl/headers/include/cuda/std/__cccl/compiler.h +217 -0
  543. cuda/cccl/headers/include/cuda/std/__cccl/cuda_capabilities.h +43 -0
  544. cuda/cccl/headers/include/cuda/std/__cccl/cuda_toolkit.h +53 -0
  545. cuda/cccl/headers/include/cuda/std/__cccl/deprecated.h +69 -0
  546. cuda/cccl/headers/include/cuda/std/__cccl/diagnostic.h +128 -0
  547. cuda/cccl/headers/include/cuda/std/__cccl/dialect.h +126 -0
  548. cuda/cccl/headers/include/cuda/std/__cccl/epilogue.h +326 -0
  549. cuda/cccl/headers/include/cuda/std/__cccl/exceptions.h +79 -0
  550. cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +68 -0
  551. cuda/cccl/headers/include/cuda/std/__cccl/extended_data_types.h +133 -0
  552. cuda/cccl/headers/include/cuda/std/__cccl/is_non_narrowing_convertible.h +73 -0
  553. cuda/cccl/headers/include/cuda/std/__cccl/os.h +48 -0
  554. cuda/cccl/headers/include/cuda/std/__cccl/preprocessor.h +1276 -0
  555. cuda/cccl/headers/include/cuda/std/__cccl/prologue.h +267 -0
  556. cuda/cccl/headers/include/cuda/std/__cccl/ptx_isa.h +176 -0
  557. cuda/cccl/headers/include/cuda/std/__cccl/rtti.h +72 -0
  558. cuda/cccl/headers/include/cuda/std/__cccl/sequence_access.h +87 -0
  559. cuda/cccl/headers/include/cuda/std/__cccl/system_header.h +38 -0
  560. cuda/cccl/headers/include/cuda/std/__cccl/unreachable.h +31 -0
  561. cuda/cccl/headers/include/cuda/std/__cccl/version.h +26 -0
  562. cuda/cccl/headers/include/cuda/std/__cccl/visibility.h +115 -0
  563. cuda/cccl/headers/include/cuda/std/__charconv/chars_format.h +81 -0
  564. cuda/cccl/headers/include/cuda/std/__charconv/from_chars_result.h +56 -0
  565. cuda/cccl/headers/include/cuda/std/__charconv/to_chars.h +148 -0
  566. cuda/cccl/headers/include/cuda/std/__charconv/to_chars_result.h +56 -0
  567. cuda/cccl/headers/include/cuda/std/__charconv_ +30 -0
  568. cuda/cccl/headers/include/cuda/std/__cmath/abs.h +246 -0
  569. cuda/cccl/headers/include/cuda/std/__cmath/copysign.h +193 -0
  570. cuda/cccl/headers/include/cuda/std/__cmath/exponential_functions.h +724 -0
  571. cuda/cccl/headers/include/cuda/std/__cmath/fma.h +125 -0
  572. cuda/cccl/headers/include/cuda/std/__cmath/fpclassify.h +216 -0
  573. cuda/cccl/headers/include/cuda/std/__cmath/gamma.h +205 -0
  574. cuda/cccl/headers/include/cuda/std/__cmath/hyperbolic_functions.h +286 -0
  575. cuda/cccl/headers/include/cuda/std/__cmath/hypot.h +224 -0
  576. cuda/cccl/headers/include/cuda/std/__cmath/inverse_hyperbolic_functions.h +286 -0
  577. cuda/cccl/headers/include/cuda/std/__cmath/inverse_trigonometric_functions.h +371 -0
  578. cuda/cccl/headers/include/cuda/std/__cmath/isfinite.h +167 -0
  579. cuda/cccl/headers/include/cuda/std/__cmath/isinf.h +205 -0
  580. cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +180 -0
  581. cuda/cccl/headers/include/cuda/std/__cmath/isnormal.h +138 -0
  582. cuda/cccl/headers/include/cuda/std/__cmath/lerp.h +104 -0
  583. cuda/cccl/headers/include/cuda/std/__cmath/logarithms.h +582 -0
  584. cuda/cccl/headers/include/cuda/std/__cmath/min_max.h +248 -0
  585. cuda/cccl/headers/include/cuda/std/__cmath/modulo.h +208 -0
  586. cuda/cccl/headers/include/cuda/std/__cmath/nvbf16.h +58 -0
  587. cuda/cccl/headers/include/cuda/std/__cmath/nvfp16.h +58 -0
  588. cuda/cccl/headers/include/cuda/std/__cmath/remainder.h +206 -0
  589. cuda/cccl/headers/include/cuda/std/__cmath/roots.h +199 -0
  590. cuda/cccl/headers/include/cuda/std/__cmath/rounding_functions.h +984 -0
  591. cuda/cccl/headers/include/cuda/std/__cmath/signbit.h +155 -0
  592. cuda/cccl/headers/include/cuda/std/__cmath/traits.h +170 -0
  593. cuda/cccl/headers/include/cuda/std/__cmath/trigonometric_functions.h +328 -0
  594. cuda/cccl/headers/include/cuda/std/__complex/arg.h +84 -0
  595. cuda/cccl/headers/include/cuda/std/__complex/complex.h +676 -0
  596. cuda/cccl/headers/include/cuda/std/__complex/exponential_functions.h +388 -0
  597. cuda/cccl/headers/include/cuda/std/__complex/hyperbolic_functions.h +117 -0
  598. cuda/cccl/headers/include/cuda/std/__complex/inverse_hyperbolic_functions.h +215 -0
  599. cuda/cccl/headers/include/cuda/std/__complex/inverse_trigonometric_functions.h +131 -0
  600. cuda/cccl/headers/include/cuda/std/__complex/literals.h +106 -0
  601. cuda/cccl/headers/include/cuda/std/__complex/logarithms.h +53 -0
  602. cuda/cccl/headers/include/cuda/std/__complex/math.h +159 -0
  603. cuda/cccl/headers/include/cuda/std/__complex/nvbf16.h +323 -0
  604. cuda/cccl/headers/include/cuda/std/__complex/nvfp16.h +322 -0
  605. cuda/cccl/headers/include/cuda/std/__complex/roots.h +64 -0
  606. cuda/cccl/headers/include/cuda/std/__complex/trigonometric_functions.h +61 -0
  607. cuda/cccl/headers/include/cuda/std/__complex/tuple.h +107 -0
  608. cuda/cccl/headers/include/cuda/std/__complex/vector_support.h +131 -0
  609. cuda/cccl/headers/include/cuda/std/__concepts/arithmetic.h +56 -0
  610. cuda/cccl/headers/include/cuda/std/__concepts/assignable.h +64 -0
  611. cuda/cccl/headers/include/cuda/std/__concepts/boolean_testable.h +63 -0
  612. cuda/cccl/headers/include/cuda/std/__concepts/class_or_enum.h +46 -0
  613. cuda/cccl/headers/include/cuda/std/__concepts/common_reference_with.h +69 -0
  614. cuda/cccl/headers/include/cuda/std/__concepts/common_with.h +82 -0
  615. cuda/cccl/headers/include/cuda/std/__concepts/concept_macros.h +273 -0
  616. cuda/cccl/headers/include/cuda/std/__concepts/constructible.h +107 -0
  617. cuda/cccl/headers/include/cuda/std/__concepts/convertible_to.h +71 -0
  618. cuda/cccl/headers/include/cuda/std/__concepts/copyable.h +60 -0
  619. cuda/cccl/headers/include/cuda/std/__concepts/derived_from.h +57 -0
  620. cuda/cccl/headers/include/cuda/std/__concepts/destructible.h +76 -0
  621. cuda/cccl/headers/include/cuda/std/__concepts/different_from.h +38 -0
  622. cuda/cccl/headers/include/cuda/std/__concepts/equality_comparable.h +100 -0
  623. cuda/cccl/headers/include/cuda/std/__concepts/invocable.h +80 -0
  624. cuda/cccl/headers/include/cuda/std/__concepts/movable.h +58 -0
  625. cuda/cccl/headers/include/cuda/std/__concepts/predicate.h +54 -0
  626. cuda/cccl/headers/include/cuda/std/__concepts/regular.h +54 -0
  627. cuda/cccl/headers/include/cuda/std/__concepts/relation.h +77 -0
  628. cuda/cccl/headers/include/cuda/std/__concepts/same_as.h +42 -0
  629. cuda/cccl/headers/include/cuda/std/__concepts/semiregular.h +54 -0
  630. cuda/cccl/headers/include/cuda/std/__concepts/swappable.h +206 -0
  631. cuda/cccl/headers/include/cuda/std/__concepts/totally_ordered.h +101 -0
  632. cuda/cccl/headers/include/cuda/std/__cstddef/byte.h +113 -0
  633. cuda/cccl/headers/include/cuda/std/__cstddef/types.h +52 -0
  634. cuda/cccl/headers/include/cuda/std/__cstdlib/abs.h +57 -0
  635. cuda/cccl/headers/include/cuda/std/__cstdlib/aligned_alloc.h +66 -0
  636. cuda/cccl/headers/include/cuda/std/__cstdlib/div.h +96 -0
  637. cuda/cccl/headers/include/cuda/std/__cstdlib/malloc.h +69 -0
  638. cuda/cccl/headers/include/cuda/std/__cuda/api_wrapper.h +62 -0
  639. cuda/cccl/headers/include/cuda/std/__cuda/ensure_current_device.h +72 -0
  640. cuda/cccl/headers/include/cuda/std/__exception/cuda_error.h +142 -0
  641. cuda/cccl/headers/include/cuda/std/__exception/terminate.h +73 -0
  642. cuda/cccl/headers/include/cuda/std/__execution/env.h +436 -0
  643. cuda/cccl/headers/include/cuda/std/__expected/bad_expected_access.h +127 -0
  644. cuda/cccl/headers/include/cuda/std/__expected/expected.h +2001 -0
  645. cuda/cccl/headers/include/cuda/std/__expected/expected_base.h +1080 -0
  646. cuda/cccl/headers/include/cuda/std/__expected/unexpect.h +37 -0
  647. cuda/cccl/headers/include/cuda/std/__expected/unexpected.h +175 -0
  648. cuda/cccl/headers/include/cuda/std/__floating_point/arithmetic.h +56 -0
  649. cuda/cccl/headers/include/cuda/std/__floating_point/cast.h +809 -0
  650. cuda/cccl/headers/include/cuda/std/__floating_point/cccl_fp.h +125 -0
  651. cuda/cccl/headers/include/cuda/std/__floating_point/common_type.h +48 -0
  652. cuda/cccl/headers/include/cuda/std/__floating_point/constants.h +172 -0
  653. cuda/cccl/headers/include/cuda/std/__floating_point/conversion_rank_order.h +103 -0
  654. cuda/cccl/headers/include/cuda/std/__floating_point/format.h +162 -0
  655. cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +39 -0
  656. cuda/cccl/headers/include/cuda/std/__floating_point/mask.h +64 -0
  657. cuda/cccl/headers/include/cuda/std/__floating_point/native_type.h +81 -0
  658. cuda/cccl/headers/include/cuda/std/__floating_point/nvfp_types.h +58 -0
  659. cuda/cccl/headers/include/cuda/std/__floating_point/overflow_handler.h +139 -0
  660. cuda/cccl/headers/include/cuda/std/__floating_point/properties.h +229 -0
  661. cuda/cccl/headers/include/cuda/std/__floating_point/storage.h +248 -0
  662. cuda/cccl/headers/include/cuda/std/__floating_point/traits.h +172 -0
  663. cuda/cccl/headers/include/cuda/std/__format/format_error.h +76 -0
  664. cuda/cccl/headers/include/cuda/std/__format/format_parse_context.h +128 -0
  665. cuda/cccl/headers/include/cuda/std/__format_ +28 -0
  666. cuda/cccl/headers/include/cuda/std/__functional/binary_function.h +63 -0
  667. cuda/cccl/headers/include/cuda/std/__functional/binary_negate.h +65 -0
  668. cuda/cccl/headers/include/cuda/std/__functional/bind.h +337 -0
  669. cuda/cccl/headers/include/cuda/std/__functional/bind_back.h +80 -0
  670. cuda/cccl/headers/include/cuda/std/__functional/bind_front.h +73 -0
  671. cuda/cccl/headers/include/cuda/std/__functional/binder1st.h +74 -0
  672. cuda/cccl/headers/include/cuda/std/__functional/binder2nd.h +74 -0
  673. cuda/cccl/headers/include/cuda/std/__functional/compose.h +68 -0
  674. cuda/cccl/headers/include/cuda/std/__functional/default_searcher.h +75 -0
  675. cuda/cccl/headers/include/cuda/std/__functional/function.h +1277 -0
  676. cuda/cccl/headers/include/cuda/std/__functional/hash.h +650 -0
  677. cuda/cccl/headers/include/cuda/std/__functional/identity.h +61 -0
  678. cuda/cccl/headers/include/cuda/std/__functional/invoke.h +558 -0
  679. cuda/cccl/headers/include/cuda/std/__functional/is_transparent.h +43 -0
  680. cuda/cccl/headers/include/cuda/std/__functional/mem_fn.h +67 -0
  681. cuda/cccl/headers/include/cuda/std/__functional/mem_fun_ref.h +213 -0
  682. cuda/cccl/headers/include/cuda/std/__functional/not_fn.h +120 -0
  683. cuda/cccl/headers/include/cuda/std/__functional/operations.h +534 -0
  684. cuda/cccl/headers/include/cuda/std/__functional/perfect_forward.h +127 -0
  685. cuda/cccl/headers/include/cuda/std/__functional/pointer_to_binary_function.h +65 -0
  686. cuda/cccl/headers/include/cuda/std/__functional/pointer_to_unary_function.h +64 -0
  687. cuda/cccl/headers/include/cuda/std/__functional/ranges_operations.h +113 -0
  688. cuda/cccl/headers/include/cuda/std/__functional/reference_wrapper.h +113 -0
  689. cuda/cccl/headers/include/cuda/std/__functional/unary_function.h +62 -0
  690. cuda/cccl/headers/include/cuda/std/__functional/unary_negate.h +65 -0
  691. cuda/cccl/headers/include/cuda/std/__functional/unwrap_ref.h +56 -0
  692. cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +277 -0
  693. cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +35 -0
  694. cuda/cccl/headers/include/cuda/std/__fwd/array.h +36 -0
  695. cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +49 -0
  696. cuda/cccl/headers/include/cuda/std/__fwd/complex.h +34 -0
  697. cuda/cccl/headers/include/cuda/std/__fwd/fp.h +37 -0
  698. cuda/cccl/headers/include/cuda/std/__fwd/get.h +123 -0
  699. cuda/cccl/headers/include/cuda/std/__fwd/hash.h +34 -0
  700. cuda/cccl/headers/include/cuda/std/__fwd/iterator_traits.h +40 -0
  701. cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +73 -0
  702. cuda/cccl/headers/include/cuda/std/__fwd/memory_resource.h +37 -0
  703. cuda/cccl/headers/include/cuda/std/__fwd/optional.h +39 -0
  704. cuda/cccl/headers/include/cuda/std/__fwd/pair.h +34 -0
  705. cuda/cccl/headers/include/cuda/std/__fwd/reference_wrapper.h +34 -0
  706. cuda/cccl/headers/include/cuda/std/__fwd/span.h +38 -0
  707. cuda/cccl/headers/include/cuda/std/__fwd/string.h +83 -0
  708. cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +59 -0
  709. cuda/cccl/headers/include/cuda/std/__fwd/subrange.h +55 -0
  710. cuda/cccl/headers/include/cuda/std/__fwd/tuple.h +34 -0
  711. cuda/cccl/headers/include/cuda/std/__internal/cpp_dialect.h +44 -0
  712. cuda/cccl/headers/include/cuda/std/__internal/features.h +71 -0
  713. cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +102 -0
  714. cuda/cccl/headers/include/cuda/std/__iterator/access.h +128 -0
  715. cuda/cccl/headers/include/cuda/std/__iterator/advance.h +228 -0
  716. cuda/cccl/headers/include/cuda/std/__iterator/back_insert_iterator.h +100 -0
  717. cuda/cccl/headers/include/cuda/std/__iterator/bounded_iter.h +254 -0
  718. cuda/cccl/headers/include/cuda/std/__iterator/concepts.h +645 -0
  719. cuda/cccl/headers/include/cuda/std/__iterator/counted_iterator.h +464 -0
  720. cuda/cccl/headers/include/cuda/std/__iterator/data.h +61 -0
  721. cuda/cccl/headers/include/cuda/std/__iterator/default_sentinel.h +36 -0
  722. cuda/cccl/headers/include/cuda/std/__iterator/distance.h +126 -0
  723. cuda/cccl/headers/include/cuda/std/__iterator/empty.h +53 -0
  724. cuda/cccl/headers/include/cuda/std/__iterator/erase_if_container.h +53 -0
  725. cuda/cccl/headers/include/cuda/std/__iterator/front_insert_iterator.h +95 -0
  726. cuda/cccl/headers/include/cuda/std/__iterator/incrementable_traits.h +152 -0
  727. cuda/cccl/headers/include/cuda/std/__iterator/indirectly_comparable.h +55 -0
  728. cuda/cccl/headers/include/cuda/std/__iterator/insert_iterator.h +102 -0
  729. cuda/cccl/headers/include/cuda/std/__iterator/istream_iterator.h +140 -0
  730. cuda/cccl/headers/include/cuda/std/__iterator/istreambuf_iterator.h +160 -0
  731. cuda/cccl/headers/include/cuda/std/__iterator/iter_move.h +161 -0
  732. cuda/cccl/headers/include/cuda/std/__iterator/iter_swap.h +163 -0
  733. cuda/cccl/headers/include/cuda/std/__iterator/iterator.h +44 -0
  734. cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +932 -0
  735. cuda/cccl/headers/include/cuda/std/__iterator/mergeable.h +72 -0
  736. cuda/cccl/headers/include/cuda/std/__iterator/move_iterator.h +400 -0
  737. cuda/cccl/headers/include/cuda/std/__iterator/move_sentinel.h +73 -0
  738. cuda/cccl/headers/include/cuda/std/__iterator/next.h +101 -0
  739. cuda/cccl/headers/include/cuda/std/__iterator/ostream_iterator.h +98 -0
  740. cuda/cccl/headers/include/cuda/std/__iterator/ostreambuf_iterator.h +100 -0
  741. cuda/cccl/headers/include/cuda/std/__iterator/permutable.h +54 -0
  742. cuda/cccl/headers/include/cuda/std/__iterator/prev.h +91 -0
  743. cuda/cccl/headers/include/cuda/std/__iterator/projected.h +61 -0
  744. cuda/cccl/headers/include/cuda/std/__iterator/readable_traits.h +185 -0
  745. cuda/cccl/headers/include/cuda/std/__iterator/reverse_access.h +142 -0
  746. cuda/cccl/headers/include/cuda/std/__iterator/reverse_iterator.h +605 -0
  747. cuda/cccl/headers/include/cuda/std/__iterator/size.h +69 -0
  748. cuda/cccl/headers/include/cuda/std/__iterator/sortable.h +55 -0
  749. cuda/cccl/headers/include/cuda/std/__iterator/unreachable_sentinel.h +84 -0
  750. cuda/cccl/headers/include/cuda/std/__iterator/wrap_iter.h +247 -0
  751. cuda/cccl/headers/include/cuda/std/__latch/latch.h +88 -0
  752. cuda/cccl/headers/include/cuda/std/__limits/numeric_limits.h +617 -0
  753. cuda/cccl/headers/include/cuda/std/__limits/numeric_limits_ext.h +781 -0
  754. cuda/cccl/headers/include/cuda/std/__linalg/conj_if_needed.h +78 -0
  755. cuda/cccl/headers/include/cuda/std/__linalg/conjugate_transposed.h +54 -0
  756. cuda/cccl/headers/include/cuda/std/__linalg/conjugated.h +139 -0
  757. cuda/cccl/headers/include/cuda/std/__linalg/scaled.h +132 -0
  758. cuda/cccl/headers/include/cuda/std/__linalg/transposed.h +322 -0
  759. cuda/cccl/headers/include/cuda/std/__mdspan/aligned_accessor.h +98 -0
  760. cuda/cccl/headers/include/cuda/std/__mdspan/concepts.h +139 -0
  761. cuda/cccl/headers/include/cuda/std/__mdspan/default_accessor.h +73 -0
  762. cuda/cccl/headers/include/cuda/std/__mdspan/empty_base.h +358 -0
  763. cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +757 -0
  764. cuda/cccl/headers/include/cuda/std/__mdspan/layout_left.h +315 -0
  765. cuda/cccl/headers/include/cuda/std/__mdspan/layout_right.h +308 -0
  766. cuda/cccl/headers/include/cuda/std/__mdspan/layout_stride.h +605 -0
  767. cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +507 -0
  768. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_extents.h +193 -0
  769. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_helper.h +189 -0
  770. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_mapping.h +344 -0
  771. cuda/cccl/headers/include/cuda/std/__memory/addressof.h +64 -0
  772. cuda/cccl/headers/include/cuda/std/__memory/align.h +67 -0
  773. cuda/cccl/headers/include/cuda/std/__memory/allocate_at_least.h +81 -0
  774. cuda/cccl/headers/include/cuda/std/__memory/allocation_guard.h +100 -0
  775. cuda/cccl/headers/include/cuda/std/__memory/allocator.h +320 -0
  776. cuda/cccl/headers/include/cuda/std/__memory/allocator_arg_t.h +84 -0
  777. cuda/cccl/headers/include/cuda/std/__memory/allocator_destructor.h +59 -0
  778. cuda/cccl/headers/include/cuda/std/__memory/allocator_traits.h +555 -0
  779. cuda/cccl/headers/include/cuda/std/__memory/assume_aligned.h +60 -0
  780. cuda/cccl/headers/include/cuda/std/__memory/builtin_new_allocator.h +87 -0
  781. cuda/cccl/headers/include/cuda/std/__memory/compressed_pair.h +230 -0
  782. cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +248 -0
  783. cuda/cccl/headers/include/cuda/std/__memory/destruct_n.h +91 -0
  784. cuda/cccl/headers/include/cuda/std/__memory/is_sufficiently_aligned.h +43 -0
  785. cuda/cccl/headers/include/cuda/std/__memory/pointer_traits.h +261 -0
  786. cuda/cccl/headers/include/cuda/std/__memory/runtime_assume_aligned.h +62 -0
  787. cuda/cccl/headers/include/cuda/std/__memory/temporary_buffer.h +92 -0
  788. cuda/cccl/headers/include/cuda/std/__memory/uninitialized_algorithms.h +683 -0
  789. cuda/cccl/headers/include/cuda/std/__memory/unique_ptr.h +768 -0
  790. cuda/cccl/headers/include/cuda/std/__memory/uses_allocator.h +55 -0
  791. cuda/cccl/headers/include/cuda/std/__memory/voidify.h +41 -0
  792. cuda/cccl/headers/include/cuda/std/__memory_ +34 -0
  793. cuda/cccl/headers/include/cuda/std/__new/allocate.h +126 -0
  794. cuda/cccl/headers/include/cuda/std/__new/bad_alloc.h +57 -0
  795. cuda/cccl/headers/include/cuda/std/__new/launder.h +49 -0
  796. cuda/cccl/headers/include/cuda/std/__new_ +29 -0
  797. cuda/cccl/headers/include/cuda/std/__numeric/accumulate.h +56 -0
  798. cuda/cccl/headers/include/cuda/std/__numeric/adjacent_difference.h +72 -0
  799. cuda/cccl/headers/include/cuda/std/__numeric/exclusive_scan.h +66 -0
  800. cuda/cccl/headers/include/cuda/std/__numeric/gcd_lcm.h +80 -0
  801. cuda/cccl/headers/include/cuda/std/__numeric/inclusive_scan.h +73 -0
  802. cuda/cccl/headers/include/cuda/std/__numeric/inner_product.h +62 -0
  803. cuda/cccl/headers/include/cuda/std/__numeric/iota.h +42 -0
  804. cuda/cccl/headers/include/cuda/std/__numeric/midpoint.h +100 -0
  805. cuda/cccl/headers/include/cuda/std/__numeric/partial_sum.h +69 -0
  806. cuda/cccl/headers/include/cuda/std/__numeric/reduce.h +60 -0
  807. cuda/cccl/headers/include/cuda/std/__numeric/transform_exclusive_scan.h +51 -0
  808. cuda/cccl/headers/include/cuda/std/__numeric/transform_inclusive_scan.h +65 -0
  809. cuda/cccl/headers/include/cuda/std/__numeric/transform_reduce.h +72 -0
  810. cuda/cccl/headers/include/cuda/std/__optional/bad_optional_access.h +75 -0
  811. cuda/cccl/headers/include/cuda/std/__optional/hash.h +53 -0
  812. cuda/cccl/headers/include/cuda/std/__optional/make_optional.h +61 -0
  813. cuda/cccl/headers/include/cuda/std/__optional/nullopt.h +43 -0
  814. cuda/cccl/headers/include/cuda/std/__optional/optional.h +900 -0
  815. cuda/cccl/headers/include/cuda/std/__optional/optional_base.h +430 -0
  816. cuda/cccl/headers/include/cuda/std/__optional/optional_ref.h +324 -0
  817. cuda/cccl/headers/include/cuda/std/__random/generate_canonical.h +56 -0
  818. cuda/cccl/headers/include/cuda/std/__random/is_seed_sequence.h +39 -0
  819. cuda/cccl/headers/include/cuda/std/__random/is_valid.h +106 -0
  820. cuda/cccl/headers/include/cuda/std/__random/linear_congruential_engine.h +397 -0
  821. cuda/cccl/headers/include/cuda/std/__random/uniform_int_distribution.h +335 -0
  822. cuda/cccl/headers/include/cuda/std/__random/uniform_real_distribution.h +183 -0
  823. cuda/cccl/headers/include/cuda/std/__random_ +29 -0
  824. cuda/cccl/headers/include/cuda/std/__ranges/access.h +303 -0
  825. cuda/cccl/headers/include/cuda/std/__ranges/all.h +97 -0
  826. cuda/cccl/headers/include/cuda/std/__ranges/concepts.h +313 -0
  827. cuda/cccl/headers/include/cuda/std/__ranges/counted.h +90 -0
  828. cuda/cccl/headers/include/cuda/std/__ranges/dangling.h +54 -0
  829. cuda/cccl/headers/include/cuda/std/__ranges/data.h +136 -0
  830. cuda/cccl/headers/include/cuda/std/__ranges/empty.h +109 -0
  831. cuda/cccl/headers/include/cuda/std/__ranges/empty_view.h +77 -0
  832. cuda/cccl/headers/include/cuda/std/__ranges/enable_borrowed_range.h +41 -0
  833. cuda/cccl/headers/include/cuda/std/__ranges/enable_view.h +77 -0
  834. cuda/cccl/headers/include/cuda/std/__ranges/from_range.h +36 -0
  835. cuda/cccl/headers/include/cuda/std/__ranges/iota_view.h +266 -0
  836. cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +410 -0
  837. cuda/cccl/headers/include/cuda/std/__ranges/owning_view.h +161 -0
  838. cuda/cccl/headers/include/cuda/std/__ranges/range_adaptor.h +113 -0
  839. cuda/cccl/headers/include/cuda/std/__ranges/rbegin.h +174 -0
  840. cuda/cccl/headers/include/cuda/std/__ranges/ref_view.h +121 -0
  841. cuda/cccl/headers/include/cuda/std/__ranges/rend.h +181 -0
  842. cuda/cccl/headers/include/cuda/std/__ranges/repeat_view.h +345 -0
  843. cuda/cccl/headers/include/cuda/std/__ranges/single_view.h +155 -0
  844. cuda/cccl/headers/include/cuda/std/__ranges/size.h +199 -0
  845. cuda/cccl/headers/include/cuda/std/__ranges/subrange.h +513 -0
  846. cuda/cccl/headers/include/cuda/std/__ranges/take_view.h +475 -0
  847. cuda/cccl/headers/include/cuda/std/__ranges/take_while_view.h +259 -0
  848. cuda/cccl/headers/include/cuda/std/__ranges/transform_view.h +522 -0
  849. cuda/cccl/headers/include/cuda/std/__ranges/unwrap_end.h +53 -0
  850. cuda/cccl/headers/include/cuda/std/__ranges/view_interface.h +181 -0
  851. cuda/cccl/headers/include/cuda/std/__ranges/views.h +38 -0
  852. cuda/cccl/headers/include/cuda/std/__semaphore/atomic_semaphore.h +233 -0
  853. cuda/cccl/headers/include/cuda/std/__semaphore/counting_semaphore.h +51 -0
  854. cuda/cccl/headers/include/cuda/std/__string/char_traits.h +191 -0
  855. cuda/cccl/headers/include/cuda/std/__string/constexpr_c_functions.h +581 -0
  856. cuda/cccl/headers/include/cuda/std/__string/helper_functions.h +296 -0
  857. cuda/cccl/headers/include/cuda/std/__string/string_view.h +244 -0
  858. cuda/cccl/headers/include/cuda/std/__string_ +29 -0
  859. cuda/cccl/headers/include/cuda/std/__system_error/errc.h +51 -0
  860. cuda/cccl/headers/include/cuda/std/__system_error_ +26 -0
  861. cuda/cccl/headers/include/cuda/std/__thread/threading_support.h +105 -0
  862. cuda/cccl/headers/include/cuda/std/__thread/threading_support_cuda.h +47 -0
  863. cuda/cccl/headers/include/cuda/std/__thread/threading_support_external.h +41 -0
  864. cuda/cccl/headers/include/cuda/std/__thread/threading_support_pthread.h +142 -0
  865. cuda/cccl/headers/include/cuda/std/__thread/threading_support_win32.h +87 -0
  866. cuda/cccl/headers/include/cuda/std/__tuple_dir/ignore.h +51 -0
  867. cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +98 -0
  868. cuda/cccl/headers/include/cuda/std/__tuple_dir/sfinae_helpers.h +269 -0
  869. cuda/cccl/headers/include/cuda/std/__tuple_dir/structured_bindings.h +216 -0
  870. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_element.h +70 -0
  871. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_indices.h +44 -0
  872. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +90 -0
  873. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +73 -0
  874. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_size.h +79 -0
  875. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_types.h +35 -0
  876. cuda/cccl/headers/include/cuda/std/__tuple_dir/vector_types.h +277 -0
  877. cuda/cccl/headers/include/cuda/std/__type_traits/add_const.h +40 -0
  878. cuda/cccl/headers/include/cuda/std/__type_traits/add_cv.h +40 -0
  879. cuda/cccl/headers/include/cuda/std/__type_traits/add_lvalue_reference.h +62 -0
  880. cuda/cccl/headers/include/cuda/std/__type_traits/add_pointer.h +65 -0
  881. cuda/cccl/headers/include/cuda/std/__type_traits/add_rvalue_reference.h +62 -0
  882. cuda/cccl/headers/include/cuda/std/__type_traits/add_volatile.h +40 -0
  883. cuda/cccl/headers/include/cuda/std/__type_traits/aligned_storage.h +149 -0
  884. cuda/cccl/headers/include/cuda/std/__type_traits/aligned_union.h +62 -0
  885. cuda/cccl/headers/include/cuda/std/__type_traits/alignment_of.h +41 -0
  886. cuda/cccl/headers/include/cuda/std/__type_traits/always_false.h +35 -0
  887. cuda/cccl/headers/include/cuda/std/__type_traits/can_extract_key.h +69 -0
  888. cuda/cccl/headers/include/cuda/std/__type_traits/common_reference.h +262 -0
  889. cuda/cccl/headers/include/cuda/std/__type_traits/common_type.h +174 -0
  890. cuda/cccl/headers/include/cuda/std/__type_traits/conditional.h +65 -0
  891. cuda/cccl/headers/include/cuda/std/__type_traits/conjunction.h +67 -0
  892. cuda/cccl/headers/include/cuda/std/__type_traits/copy_cv.h +50 -0
  893. cuda/cccl/headers/include/cuda/std/__type_traits/copy_cvref.h +148 -0
  894. cuda/cccl/headers/include/cuda/std/__type_traits/decay.h +83 -0
  895. cuda/cccl/headers/include/cuda/std/__type_traits/dependent_type.h +35 -0
  896. cuda/cccl/headers/include/cuda/std/__type_traits/disjunction.h +77 -0
  897. cuda/cccl/headers/include/cuda/std/__type_traits/enable_if.h +43 -0
  898. cuda/cccl/headers/include/cuda/std/__type_traits/extent.h +68 -0
  899. cuda/cccl/headers/include/cuda/std/__type_traits/fold.h +47 -0
  900. cuda/cccl/headers/include/cuda/std/__type_traits/has_unique_object_representation.h +47 -0
  901. cuda/cccl/headers/include/cuda/std/__type_traits/has_virtual_destructor.h +51 -0
  902. cuda/cccl/headers/include/cuda/std/__type_traits/integral_constant.h +62 -0
  903. cuda/cccl/headers/include/cuda/std/__type_traits/is_abstract.h +40 -0
  904. cuda/cccl/headers/include/cuda/std/__type_traits/is_aggregate.h +44 -0
  905. cuda/cccl/headers/include/cuda/std/__type_traits/is_allocator.h +46 -0
  906. cuda/cccl/headers/include/cuda/std/__type_traits/is_arithmetic.h +42 -0
  907. cuda/cccl/headers/include/cuda/std/__type_traits/is_array.h +62 -0
  908. cuda/cccl/headers/include/cuda/std/__type_traits/is_assignable.h +78 -0
  909. cuda/cccl/headers/include/cuda/std/__type_traits/is_base_of.h +83 -0
  910. cuda/cccl/headers/include/cuda/std/__type_traits/is_bounded_array.h +44 -0
  911. cuda/cccl/headers/include/cuda/std/__type_traits/is_callable.h +60 -0
  912. cuda/cccl/headers/include/cuda/std/__type_traits/is_char_like_type.h +38 -0
  913. cuda/cccl/headers/include/cuda/std/__type_traits/is_class.h +68 -0
  914. cuda/cccl/headers/include/cuda/std/__type_traits/is_compound.h +54 -0
  915. cuda/cccl/headers/include/cuda/std/__type_traits/is_const.h +56 -0
  916. cuda/cccl/headers/include/cuda/std/__type_traits/is_constant_evaluated.h +51 -0
  917. cuda/cccl/headers/include/cuda/std/__type_traits/is_constructible.h +174 -0
  918. cuda/cccl/headers/include/cuda/std/__type_traits/is_convertible.h +214 -0
  919. cuda/cccl/headers/include/cuda/std/__type_traits/is_copy_assignable.h +43 -0
  920. cuda/cccl/headers/include/cuda/std/__type_traits/is_copy_constructible.h +43 -0
  921. cuda/cccl/headers/include/cuda/std/__type_traits/is_core_convertible.h +47 -0
  922. cuda/cccl/headers/include/cuda/std/__type_traits/is_corresponding_member.h +42 -0
  923. cuda/cccl/headers/include/cuda/std/__type_traits/is_default_constructible.h +40 -0
  924. cuda/cccl/headers/include/cuda/std/__type_traits/is_destructible.h +115 -0
  925. cuda/cccl/headers/include/cuda/std/__type_traits/is_empty.h +73 -0
  926. cuda/cccl/headers/include/cuda/std/__type_traits/is_enum.h +68 -0
  927. cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_arithmetic.h +38 -0
  928. cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_floating_point.h +81 -0
  929. cuda/cccl/headers/include/cuda/std/__type_traits/is_final.h +56 -0
  930. cuda/cccl/headers/include/cuda/std/__type_traits/is_floating_point.h +53 -0
  931. cuda/cccl/headers/include/cuda/std/__type_traits/is_function.h +61 -0
  932. cuda/cccl/headers/include/cuda/std/__type_traits/is_fundamental.h +56 -0
  933. cuda/cccl/headers/include/cuda/std/__type_traits/is_implicitly_default_constructible.h +57 -0
  934. cuda/cccl/headers/include/cuda/std/__type_traits/is_integer.h +45 -0
  935. cuda/cccl/headers/include/cuda/std/__type_traits/is_integral.h +123 -0
  936. cuda/cccl/headers/include/cuda/std/__type_traits/is_layout_compatible.h +45 -0
  937. cuda/cccl/headers/include/cuda/std/__type_traits/is_literal_type.h +59 -0
  938. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_function_pointer.h +79 -0
  939. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_object_pointer.h +57 -0
  940. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_pointer.h +57 -0
  941. cuda/cccl/headers/include/cuda/std/__type_traits/is_move_assignable.h +43 -0
  942. cuda/cccl/headers/include/cuda/std/__type_traits/is_move_constructible.h +42 -0
  943. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_assignable.h +70 -0
  944. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_constructible.h +84 -0
  945. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_convertible.h +59 -0
  946. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_copy_assignable.h +60 -0
  947. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_copy_constructible.h +43 -0
  948. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_default_constructible.h +54 -0
  949. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_destructible.h +82 -0
  950. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_move_assignable.h +60 -0
  951. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_move_constructible.h +42 -0
  952. cuda/cccl/headers/include/cuda/std/__type_traits/is_null_pointer.h +43 -0
  953. cuda/cccl/headers/include/cuda/std/__type_traits/is_object.h +57 -0
  954. cuda/cccl/headers/include/cuda/std/__type_traits/is_one_of.h +37 -0
  955. cuda/cccl/headers/include/cuda/std/__type_traits/is_pod.h +62 -0
  956. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer.h +60 -0
  957. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer_interconvertible_base_of.h +87 -0
  958. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer_interconvertible_with_class.h +42 -0
  959. cuda/cccl/headers/include/cuda/std/__type_traits/is_polymorphic.h +63 -0
  960. cuda/cccl/headers/include/cuda/std/__type_traits/is_primary_template.h +119 -0
  961. cuda/cccl/headers/include/cuda/std/__type_traits/is_reference.h +95 -0
  962. cuda/cccl/headers/include/cuda/std/__type_traits/is_reference_wrapper.h +50 -0
  963. cuda/cccl/headers/include/cuda/std/__type_traits/is_referenceable.h +55 -0
  964. cuda/cccl/headers/include/cuda/std/__type_traits/is_same.h +84 -0
  965. cuda/cccl/headers/include/cuda/std/__type_traits/is_scalar.h +60 -0
  966. cuda/cccl/headers/include/cuda/std/__type_traits/is_scoped_enum.h +49 -0
  967. cuda/cccl/headers/include/cuda/std/__type_traits/is_signed.h +65 -0
  968. cuda/cccl/headers/include/cuda/std/__type_traits/is_signed_integer.h +59 -0
  969. cuda/cccl/headers/include/cuda/std/__type_traits/is_standard_layout.h +57 -0
  970. cuda/cccl/headers/include/cuda/std/__type_traits/is_swappable.h +201 -0
  971. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivial.h +56 -0
  972. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_assignable.h +70 -0
  973. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_constructible.h +82 -0
  974. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_assignable.h +60 -0
  975. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_constructible.h +61 -0
  976. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copyable.h +56 -0
  977. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_default_constructible.h +55 -0
  978. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_destructible.h +73 -0
  979. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_assignable.h +60 -0
  980. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_constructible.h +58 -0
  981. cuda/cccl/headers/include/cuda/std/__type_traits/is_unbounded_array.h +43 -0
  982. cuda/cccl/headers/include/cuda/std/__type_traits/is_union.h +57 -0
  983. cuda/cccl/headers/include/cuda/std/__type_traits/is_unsigned.h +66 -0
  984. cuda/cccl/headers/include/cuda/std/__type_traits/is_unsigned_integer.h +59 -0
  985. cuda/cccl/headers/include/cuda/std/__type_traits/is_valid_expansion.h +41 -0
  986. cuda/cccl/headers/include/cuda/std/__type_traits/is_void.h +55 -0
  987. cuda/cccl/headers/include/cuda/std/__type_traits/is_volatile.h +56 -0
  988. cuda/cccl/headers/include/cuda/std/__type_traits/lazy.h +35 -0
  989. cuda/cccl/headers/include/cuda/std/__type_traits/make_const_lvalue_ref.h +36 -0
  990. cuda/cccl/headers/include/cuda/std/__type_traits/make_nbit_int.h +107 -0
  991. cuda/cccl/headers/include/cuda/std/__type_traits/make_signed.h +140 -0
  992. cuda/cccl/headers/include/cuda/std/__type_traits/make_unsigned.h +151 -0
  993. cuda/cccl/headers/include/cuda/std/__type_traits/maybe_const.h +36 -0
  994. cuda/cccl/headers/include/cuda/std/__type_traits/nat.h +39 -0
  995. cuda/cccl/headers/include/cuda/std/__type_traits/negation.h +44 -0
  996. cuda/cccl/headers/include/cuda/std/__type_traits/num_bits.h +123 -0
  997. cuda/cccl/headers/include/cuda/std/__type_traits/promote.h +163 -0
  998. cuda/cccl/headers/include/cuda/std/__type_traits/rank.h +60 -0
  999. cuda/cccl/headers/include/cuda/std/__type_traits/reference_constructs_from_temporary.h +57 -0
  1000. cuda/cccl/headers/include/cuda/std/__type_traits/reference_converts_from_temporary.h +56 -0
  1001. cuda/cccl/headers/include/cuda/std/__type_traits/remove_all_extents.h +66 -0
  1002. cuda/cccl/headers/include/cuda/std/__type_traits/remove_const.h +59 -0
  1003. cuda/cccl/headers/include/cuda/std/__type_traits/remove_const_ref.h +37 -0
  1004. cuda/cccl/headers/include/cuda/std/__type_traits/remove_cv.h +57 -0
  1005. cuda/cccl/headers/include/cuda/std/__type_traits/remove_cvref.h +57 -0
  1006. cuda/cccl/headers/include/cuda/std/__type_traits/remove_extent.h +65 -0
  1007. cuda/cccl/headers/include/cuda/std/__type_traits/remove_pointer.h +73 -0
  1008. cuda/cccl/headers/include/cuda/std/__type_traits/remove_reference.h +72 -0
  1009. cuda/cccl/headers/include/cuda/std/__type_traits/remove_volatile.h +58 -0
  1010. cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +47 -0
  1011. cuda/cccl/headers/include/cuda/std/__type_traits/type_identity.h +40 -0
  1012. cuda/cccl/headers/include/cuda/std/__type_traits/type_list.h +1067 -0
  1013. cuda/cccl/headers/include/cuda/std/__type_traits/type_set.h +132 -0
  1014. cuda/cccl/headers/include/cuda/std/__type_traits/underlying_type.h +66 -0
  1015. cuda/cccl/headers/include/cuda/std/__type_traits/void_t.h +34 -0
  1016. cuda/cccl/headers/include/cuda/std/__utility/as_const.h +52 -0
  1017. cuda/cccl/headers/include/cuda/std/__utility/auto_cast.h +32 -0
  1018. cuda/cccl/headers/include/cuda/std/__utility/cmp.h +116 -0
  1019. cuda/cccl/headers/include/cuda/std/__utility/convert_to_integral.h +101 -0
  1020. cuda/cccl/headers/include/cuda/std/__utility/declval.h +63 -0
  1021. cuda/cccl/headers/include/cuda/std/__utility/exception_guard.h +161 -0
  1022. cuda/cccl/headers/include/cuda/std/__utility/exchange.h +46 -0
  1023. cuda/cccl/headers/include/cuda/std/__utility/forward.h +59 -0
  1024. cuda/cccl/headers/include/cuda/std/__utility/forward_like.h +55 -0
  1025. cuda/cccl/headers/include/cuda/std/__utility/in_place.h +77 -0
  1026. cuda/cccl/headers/include/cuda/std/__utility/integer_sequence.h +251 -0
  1027. cuda/cccl/headers/include/cuda/std/__utility/monostate.h +99 -0
  1028. cuda/cccl/headers/include/cuda/std/__utility/move.h +74 -0
  1029. cuda/cccl/headers/include/cuda/std/__utility/pair.h +802 -0
  1030. cuda/cccl/headers/include/cuda/std/__utility/piecewise_construct.h +37 -0
  1031. cuda/cccl/headers/include/cuda/std/__utility/pod_tuple.h +510 -0
  1032. cuda/cccl/headers/include/cuda/std/__utility/priority_tag.h +40 -0
  1033. cuda/cccl/headers/include/cuda/std/__utility/rel_ops.h +63 -0
  1034. cuda/cccl/headers/include/cuda/std/__utility/swap.h +64 -0
  1035. cuda/cccl/headers/include/cuda/std/__utility/to_underlying.h +40 -0
  1036. cuda/cccl/headers/include/cuda/std/__utility/typeid.h +421 -0
  1037. cuda/cccl/headers/include/cuda/std/__utility/undefined.h +34 -0
  1038. cuda/cccl/headers/include/cuda/std/__utility/unreachable.h +37 -0
  1039. cuda/cccl/headers/include/cuda/std/array +520 -0
  1040. cuda/cccl/headers/include/cuda/std/atomic +818 -0
  1041. cuda/cccl/headers/include/cuda/std/barrier +43 -0
  1042. cuda/cccl/headers/include/cuda/std/bit +35 -0
  1043. cuda/cccl/headers/include/cuda/std/bitset +994 -0
  1044. cuda/cccl/headers/include/cuda/std/cassert +28 -0
  1045. cuda/cccl/headers/include/cuda/std/ccomplex +15 -0
  1046. cuda/cccl/headers/include/cuda/std/cfloat +59 -0
  1047. cuda/cccl/headers/include/cuda/std/chrono +26 -0
  1048. cuda/cccl/headers/include/cuda/std/climits +61 -0
  1049. cuda/cccl/headers/include/cuda/std/cmath +25 -0
  1050. cuda/cccl/headers/include/cuda/std/complex +50 -0
  1051. cuda/cccl/headers/include/cuda/std/concepts +48 -0
  1052. cuda/cccl/headers/include/cuda/std/cstddef +28 -0
  1053. cuda/cccl/headers/include/cuda/std/cstdint +178 -0
  1054. cuda/cccl/headers/include/cuda/std/cstdlib +30 -0
  1055. cuda/cccl/headers/include/cuda/std/cstring +110 -0
  1056. cuda/cccl/headers/include/cuda/std/ctime +152 -0
  1057. cuda/cccl/headers/include/cuda/std/detail/__config +45 -0
  1058. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +235 -0
  1059. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/algorithm +1720 -0
  1060. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/chrono +3628 -0
  1061. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/cmath +667 -0
  1062. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/iosfwd +128 -0
  1063. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/stdexcept +120 -0
  1064. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/tuple +1367 -0
  1065. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/variant +2154 -0
  1066. cuda/cccl/headers/include/cuda/std/execution +27 -0
  1067. cuda/cccl/headers/include/cuda/std/expected +30 -0
  1068. cuda/cccl/headers/include/cuda/std/functional +56 -0
  1069. cuda/cccl/headers/include/cuda/std/initializer_list +36 -0
  1070. cuda/cccl/headers/include/cuda/std/inplace_vector +2163 -0
  1071. cuda/cccl/headers/include/cuda/std/iterator +70 -0
  1072. cuda/cccl/headers/include/cuda/std/latch +34 -0
  1073. cuda/cccl/headers/include/cuda/std/limits +28 -0
  1074. cuda/cccl/headers/include/cuda/std/linalg +30 -0
  1075. cuda/cccl/headers/include/cuda/std/mdspan +38 -0
  1076. cuda/cccl/headers/include/cuda/std/memory +39 -0
  1077. cuda/cccl/headers/include/cuda/std/numbers +335 -0
  1078. cuda/cccl/headers/include/cuda/std/numeric +41 -0
  1079. cuda/cccl/headers/include/cuda/std/optional +31 -0
  1080. cuda/cccl/headers/include/cuda/std/ranges +69 -0
  1081. cuda/cccl/headers/include/cuda/std/ratio +417 -0
  1082. cuda/cccl/headers/include/cuda/std/semaphore +31 -0
  1083. cuda/cccl/headers/include/cuda/std/source_location +83 -0
  1084. cuda/cccl/headers/include/cuda/std/span +640 -0
  1085. cuda/cccl/headers/include/cuda/std/string_view +788 -0
  1086. cuda/cccl/headers/include/cuda/std/tuple +26 -0
  1087. cuda/cccl/headers/include/cuda/std/type_traits +176 -0
  1088. cuda/cccl/headers/include/cuda/std/utility +70 -0
  1089. cuda/cccl/headers/include/cuda/std/variant +25 -0
  1090. cuda/cccl/headers/include/cuda/std/version +245 -0
  1091. cuda/cccl/headers/include/cuda/stream_ref +54 -0
  1092. cuda/cccl/headers/include/cuda/type_traits +27 -0
  1093. cuda/cccl/headers/include/cuda/utility +27 -0
  1094. cuda/cccl/headers/include/cuda/version +16 -0
  1095. cuda/cccl/headers/include/cuda/warp +28 -0
  1096. cuda/cccl/headers/include/cuda/work_stealing +26 -0
  1097. cuda/cccl/headers/include/nv/detail/__preprocessor +169 -0
  1098. cuda/cccl/headers/include/nv/detail/__target_macros +641 -0
  1099. cuda/cccl/headers/include/nv/target +240 -0
  1100. cuda/cccl/headers/include/thrust/addressof.h +22 -0
  1101. cuda/cccl/headers/include/thrust/adjacent_difference.h +254 -0
  1102. cuda/cccl/headers/include/thrust/advance.h +59 -0
  1103. cuda/cccl/headers/include/thrust/allocate_unique.h +299 -0
  1104. cuda/cccl/headers/include/thrust/binary_search.h +1910 -0
  1105. cuda/cccl/headers/include/thrust/complex.h +859 -0
  1106. cuda/cccl/headers/include/thrust/copy.h +506 -0
  1107. cuda/cccl/headers/include/thrust/count.h +245 -0
  1108. cuda/cccl/headers/include/thrust/detail/adjacent_difference.inl +95 -0
  1109. cuda/cccl/headers/include/thrust/detail/algorithm_wrapper.h +37 -0
  1110. cuda/cccl/headers/include/thrust/detail/alignment.h +81 -0
  1111. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +350 -0
  1112. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.inl +371 -0
  1113. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +45 -0
  1114. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.inl +242 -0
  1115. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +39 -0
  1116. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.inl +137 -0
  1117. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +39 -0
  1118. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.inl +99 -0
  1119. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +53 -0
  1120. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.inl +68 -0
  1121. cuda/cccl/headers/include/thrust/detail/allocator/no_throw_allocator.h +76 -0
  1122. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +102 -0
  1123. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.inl +86 -0
  1124. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +79 -0
  1125. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.inl +81 -0
  1126. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.h +39 -0
  1127. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.inl +98 -0
  1128. cuda/cccl/headers/include/thrust/detail/allocator_aware_execution_policy.h +99 -0
  1129. cuda/cccl/headers/include/thrust/detail/binary_search.inl +525 -0
  1130. cuda/cccl/headers/include/thrust/detail/caching_allocator.h +47 -0
  1131. cuda/cccl/headers/include/thrust/detail/complex/arithmetic.h +255 -0
  1132. cuda/cccl/headers/include/thrust/detail/complex/c99math.h +64 -0
  1133. cuda/cccl/headers/include/thrust/detail/complex/catrig.h +875 -0
  1134. cuda/cccl/headers/include/thrust/detail/complex/catrigf.h +589 -0
  1135. cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +233 -0
  1136. cuda/cccl/headers/include/thrust/detail/complex/ccoshf.h +161 -0
  1137. cuda/cccl/headers/include/thrust/detail/complex/cexp.h +195 -0
  1138. cuda/cccl/headers/include/thrust/detail/complex/cexpf.h +173 -0
  1139. cuda/cccl/headers/include/thrust/detail/complex/clog.h +223 -0
  1140. cuda/cccl/headers/include/thrust/detail/complex/clogf.h +210 -0
  1141. cuda/cccl/headers/include/thrust/detail/complex/complex.inl +263 -0
  1142. cuda/cccl/headers/include/thrust/detail/complex/cpow.h +50 -0
  1143. cuda/cccl/headers/include/thrust/detail/complex/cproj.h +81 -0
  1144. cuda/cccl/headers/include/thrust/detail/complex/csinh.h +228 -0
  1145. cuda/cccl/headers/include/thrust/detail/complex/csinhf.h +168 -0
  1146. cuda/cccl/headers/include/thrust/detail/complex/csqrt.h +178 -0
  1147. cuda/cccl/headers/include/thrust/detail/complex/csqrtf.h +174 -0
  1148. cuda/cccl/headers/include/thrust/detail/complex/ctanh.h +208 -0
  1149. cuda/cccl/headers/include/thrust/detail/complex/ctanhf.h +133 -0
  1150. cuda/cccl/headers/include/thrust/detail/complex/math_private.h +138 -0
  1151. cuda/cccl/headers/include/thrust/detail/complex/stream.h +73 -0
  1152. cuda/cccl/headers/include/thrust/detail/config/compiler.h +38 -0
  1153. cuda/cccl/headers/include/thrust/detail/config/config.h +43 -0
  1154. cuda/cccl/headers/include/thrust/detail/config/cpp_dialect.h +78 -0
  1155. cuda/cccl/headers/include/thrust/detail/config/device_system.h +55 -0
  1156. cuda/cccl/headers/include/thrust/detail/config/host_system.h +48 -0
  1157. cuda/cccl/headers/include/thrust/detail/config/memory_resource.h +41 -0
  1158. cuda/cccl/headers/include/thrust/detail/config/namespace.h +162 -0
  1159. cuda/cccl/headers/include/thrust/detail/config/simple_defines.h +48 -0
  1160. cuda/cccl/headers/include/thrust/detail/config.h +36 -0
  1161. cuda/cccl/headers/include/thrust/detail/contiguous_storage.h +228 -0
  1162. cuda/cccl/headers/include/thrust/detail/contiguous_storage.inl +273 -0
  1163. cuda/cccl/headers/include/thrust/detail/copy.h +72 -0
  1164. cuda/cccl/headers/include/thrust/detail/copy.inl +129 -0
  1165. cuda/cccl/headers/include/thrust/detail/copy_if.h +62 -0
  1166. cuda/cccl/headers/include/thrust/detail/copy_if.inl +102 -0
  1167. cuda/cccl/headers/include/thrust/detail/count.h +55 -0
  1168. cuda/cccl/headers/include/thrust/detail/count.inl +89 -0
  1169. cuda/cccl/headers/include/thrust/detail/device_delete.inl +52 -0
  1170. cuda/cccl/headers/include/thrust/detail/device_free.inl +47 -0
  1171. cuda/cccl/headers/include/thrust/detail/device_malloc.inl +60 -0
  1172. cuda/cccl/headers/include/thrust/detail/device_new.inl +61 -0
  1173. cuda/cccl/headers/include/thrust/detail/device_ptr.inl +48 -0
  1174. cuda/cccl/headers/include/thrust/detail/equal.inl +93 -0
  1175. cuda/cccl/headers/include/thrust/detail/event_error.h +160 -0
  1176. cuda/cccl/headers/include/thrust/detail/execute_with_allocator.h +80 -0
  1177. cuda/cccl/headers/include/thrust/detail/execute_with_allocator_fwd.h +61 -0
  1178. cuda/cccl/headers/include/thrust/detail/execution_policy.h +80 -0
  1179. cuda/cccl/headers/include/thrust/detail/extrema.inl +184 -0
  1180. cuda/cccl/headers/include/thrust/detail/fill.inl +86 -0
  1181. cuda/cccl/headers/include/thrust/detail/find.inl +113 -0
  1182. cuda/cccl/headers/include/thrust/detail/for_each.inl +84 -0
  1183. cuda/cccl/headers/include/thrust/detail/function.h +49 -0
  1184. cuda/cccl/headers/include/thrust/detail/functional/actor.h +214 -0
  1185. cuda/cccl/headers/include/thrust/detail/functional/operators.h +386 -0
  1186. cuda/cccl/headers/include/thrust/detail/gather.inl +173 -0
  1187. cuda/cccl/headers/include/thrust/detail/generate.inl +86 -0
  1188. cuda/cccl/headers/include/thrust/detail/get_iterator_value.h +62 -0
  1189. cuda/cccl/headers/include/thrust/detail/inner_product.inl +118 -0
  1190. cuda/cccl/headers/include/thrust/detail/integer_math.h +130 -0
  1191. cuda/cccl/headers/include/thrust/detail/internal_functional.h +289 -0
  1192. cuda/cccl/headers/include/thrust/detail/logical.inl +113 -0
  1193. cuda/cccl/headers/include/thrust/detail/malloc_and_free.h +77 -0
  1194. cuda/cccl/headers/include/thrust/detail/malloc_and_free_fwd.h +45 -0
  1195. cuda/cccl/headers/include/thrust/detail/memory_algorithms.h +209 -0
  1196. cuda/cccl/headers/include/thrust/detail/memory_wrapper.h +40 -0
  1197. cuda/cccl/headers/include/thrust/detail/merge.inl +276 -0
  1198. cuda/cccl/headers/include/thrust/detail/mismatch.inl +94 -0
  1199. cuda/cccl/headers/include/thrust/detail/mpl/math.h +164 -0
  1200. cuda/cccl/headers/include/thrust/detail/numeric_wrapper.h +37 -0
  1201. cuda/cccl/headers/include/thrust/detail/overlapped_copy.h +124 -0
  1202. cuda/cccl/headers/include/thrust/detail/partition.inl +378 -0
  1203. cuda/cccl/headers/include/thrust/detail/pointer.h +217 -0
  1204. cuda/cccl/headers/include/thrust/detail/pointer.inl +172 -0
  1205. cuda/cccl/headers/include/thrust/detail/preprocessor.h +652 -0
  1206. cuda/cccl/headers/include/thrust/detail/random_bijection.h +177 -0
  1207. cuda/cccl/headers/include/thrust/detail/range/head_flags.h +116 -0
  1208. cuda/cccl/headers/include/thrust/detail/range/tail_flags.h +130 -0
  1209. cuda/cccl/headers/include/thrust/detail/raw_pointer_cast.h +52 -0
  1210. cuda/cccl/headers/include/thrust/detail/raw_reference_cast.h +189 -0
  1211. cuda/cccl/headers/include/thrust/detail/reduce.inl +377 -0
  1212. cuda/cccl/headers/include/thrust/detail/reference.h +500 -0
  1213. cuda/cccl/headers/include/thrust/detail/reference_forward_declaration.h +35 -0
  1214. cuda/cccl/headers/include/thrust/detail/remove.inl +213 -0
  1215. cuda/cccl/headers/include/thrust/detail/replace.inl +231 -0
  1216. cuda/cccl/headers/include/thrust/detail/reverse.inl +88 -0
  1217. cuda/cccl/headers/include/thrust/detail/scan.inl +518 -0
  1218. cuda/cccl/headers/include/thrust/detail/scatter.inl +157 -0
  1219. cuda/cccl/headers/include/thrust/detail/seq.h +54 -0
  1220. cuda/cccl/headers/include/thrust/detail/sequence.inl +109 -0
  1221. cuda/cccl/headers/include/thrust/detail/set_operations.inl +981 -0
  1222. cuda/cccl/headers/include/thrust/detail/shuffle.inl +86 -0
  1223. cuda/cccl/headers/include/thrust/detail/sort.inl +373 -0
  1224. cuda/cccl/headers/include/thrust/detail/static_assert.h +58 -0
  1225. cuda/cccl/headers/include/thrust/detail/static_map.h +167 -0
  1226. cuda/cccl/headers/include/thrust/detail/swap_ranges.inl +65 -0
  1227. cuda/cccl/headers/include/thrust/detail/tabulate.inl +62 -0
  1228. cuda/cccl/headers/include/thrust/detail/temporary_array.h +153 -0
  1229. cuda/cccl/headers/include/thrust/detail/temporary_array.inl +120 -0
  1230. cuda/cccl/headers/include/thrust/detail/temporary_buffer.h +81 -0
  1231. cuda/cccl/headers/include/thrust/detail/transform_reduce.inl +69 -0
  1232. cuda/cccl/headers/include/thrust/detail/transform_scan.inl +161 -0
  1233. cuda/cccl/headers/include/thrust/detail/trivial_sequence.h +130 -0
  1234. cuda/cccl/headers/include/thrust/detail/tuple_meta_transform.h +61 -0
  1235. cuda/cccl/headers/include/thrust/detail/type_deduction.h +62 -0
  1236. cuda/cccl/headers/include/thrust/detail/type_traits/has_member_function.h +47 -0
  1237. cuda/cccl/headers/include/thrust/detail/type_traits/has_nested_type.h +43 -0
  1238. cuda/cccl/headers/include/thrust/detail/type_traits/is_call_possible.h +167 -0
  1239. cuda/cccl/headers/include/thrust/detail/type_traits/is_commutative.h +69 -0
  1240. cuda/cccl/headers/include/thrust/detail/type_traits/is_metafunction_defined.h +39 -0
  1241. cuda/cccl/headers/include/thrust/detail/type_traits/is_thrust_pointer.h +59 -0
  1242. cuda/cccl/headers/include/thrust/detail/type_traits/iterator/is_discard_iterator.h +44 -0
  1243. cuda/cccl/headers/include/thrust/detail/type_traits/iterator/is_output_iterator.h +46 -0
  1244. cuda/cccl/headers/include/thrust/detail/type_traits/minimum_type.h +89 -0
  1245. cuda/cccl/headers/include/thrust/detail/type_traits/pointer_traits.h +332 -0
  1246. cuda/cccl/headers/include/thrust/detail/type_traits.h +136 -0
  1247. cuda/cccl/headers/include/thrust/detail/uninitialized_copy.inl +90 -0
  1248. cuda/cccl/headers/include/thrust/detail/uninitialized_fill.inl +86 -0
  1249. cuda/cccl/headers/include/thrust/detail/unique.inl +373 -0
  1250. cuda/cccl/headers/include/thrust/detail/use_default.h +34 -0
  1251. cuda/cccl/headers/include/thrust/detail/util/align.h +59 -0
  1252. cuda/cccl/headers/include/thrust/detail/vector_base.h +615 -0
  1253. cuda/cccl/headers/include/thrust/detail/vector_base.inl +1212 -0
  1254. cuda/cccl/headers/include/thrust/device_allocator.h +134 -0
  1255. cuda/cccl/headers/include/thrust/device_delete.h +59 -0
  1256. cuda/cccl/headers/include/thrust/device_free.h +72 -0
  1257. cuda/cccl/headers/include/thrust/device_make_unique.h +56 -0
  1258. cuda/cccl/headers/include/thrust/device_malloc.h +108 -0
  1259. cuda/cccl/headers/include/thrust/device_malloc_allocator.h +190 -0
  1260. cuda/cccl/headers/include/thrust/device_new.h +91 -0
  1261. cuda/cccl/headers/include/thrust/device_new_allocator.h +179 -0
  1262. cuda/cccl/headers/include/thrust/device_ptr.h +202 -0
  1263. cuda/cccl/headers/include/thrust/device_reference.h +986 -0
  1264. cuda/cccl/headers/include/thrust/device_vector.h +574 -0
  1265. cuda/cccl/headers/include/thrust/distance.h +43 -0
  1266. cuda/cccl/headers/include/thrust/equal.h +247 -0
  1267. cuda/cccl/headers/include/thrust/execution_policy.h +384 -0
  1268. cuda/cccl/headers/include/thrust/extrema.h +657 -0
  1269. cuda/cccl/headers/include/thrust/fill.h +201 -0
  1270. cuda/cccl/headers/include/thrust/find.h +382 -0
  1271. cuda/cccl/headers/include/thrust/for_each.h +261 -0
  1272. cuda/cccl/headers/include/thrust/functional.h +396 -0
  1273. cuda/cccl/headers/include/thrust/gather.h +464 -0
  1274. cuda/cccl/headers/include/thrust/generate.h +193 -0
  1275. cuda/cccl/headers/include/thrust/host_vector.h +576 -0
  1276. cuda/cccl/headers/include/thrust/inner_product.h +264 -0
  1277. cuda/cccl/headers/include/thrust/iterator/constant_iterator.h +219 -0
  1278. cuda/cccl/headers/include/thrust/iterator/counting_iterator.h +335 -0
  1279. cuda/cccl/headers/include/thrust/iterator/detail/any_assign.h +48 -0
  1280. cuda/cccl/headers/include/thrust/iterator/detail/any_system_tag.h +43 -0
  1281. cuda/cccl/headers/include/thrust/iterator/detail/device_system_tag.h +38 -0
  1282. cuda/cccl/headers/include/thrust/iterator/detail/host_system_tag.h +38 -0
  1283. cuda/cccl/headers/include/thrust/iterator/detail/iterator_adaptor_base.h +81 -0
  1284. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_system.h +51 -0
  1285. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_traversal.h +62 -0
  1286. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h +57 -0
  1287. cuda/cccl/headers/include/thrust/iterator/detail/iterator_facade_category.h +199 -0
  1288. cuda/cccl/headers/include/thrust/iterator/detail/iterator_traversal_tags.h +50 -0
  1289. cuda/cccl/headers/include/thrust/iterator/detail/minimum_system.h +53 -0
  1290. cuda/cccl/headers/include/thrust/iterator/detail/normal_iterator.h +69 -0
  1291. cuda/cccl/headers/include/thrust/iterator/detail/retag.h +104 -0
  1292. cuda/cccl/headers/include/thrust/iterator/detail/tagged_iterator.h +81 -0
  1293. cuda/cccl/headers/include/thrust/iterator/detail/tuple_of_iterator_references.h +174 -0
  1294. cuda/cccl/headers/include/thrust/iterator/discard_iterator.h +163 -0
  1295. cuda/cccl/headers/include/thrust/iterator/iterator_adaptor.h +251 -0
  1296. cuda/cccl/headers/include/thrust/iterator/iterator_categories.h +215 -0
  1297. cuda/cccl/headers/include/thrust/iterator/iterator_facade.h +660 -0
  1298. cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +275 -0
  1299. cuda/cccl/headers/include/thrust/iterator/offset_iterator.h +192 -0
  1300. cuda/cccl/headers/include/thrust/iterator/permutation_iterator.h +204 -0
  1301. cuda/cccl/headers/include/thrust/iterator/retag.h +74 -0
  1302. cuda/cccl/headers/include/thrust/iterator/reverse_iterator.h +221 -0
  1303. cuda/cccl/headers/include/thrust/iterator/shuffle_iterator.h +184 -0
  1304. cuda/cccl/headers/include/thrust/iterator/strided_iterator.h +152 -0
  1305. cuda/cccl/headers/include/thrust/iterator/tabulate_output_iterator.h +149 -0
  1306. cuda/cccl/headers/include/thrust/iterator/transform_input_output_iterator.h +226 -0
  1307. cuda/cccl/headers/include/thrust/iterator/transform_iterator.h +351 -0
  1308. cuda/cccl/headers/include/thrust/iterator/transform_output_iterator.h +190 -0
  1309. cuda/cccl/headers/include/thrust/iterator/zip_iterator.h +357 -0
  1310. cuda/cccl/headers/include/thrust/logical.h +290 -0
  1311. cuda/cccl/headers/include/thrust/memory.h +395 -0
  1312. cuda/cccl/headers/include/thrust/merge.h +725 -0
  1313. cuda/cccl/headers/include/thrust/mismatch.h +261 -0
  1314. cuda/cccl/headers/include/thrust/mr/allocator.h +229 -0
  1315. cuda/cccl/headers/include/thrust/mr/device_memory_resource.h +41 -0
  1316. cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +526 -0
  1317. cuda/cccl/headers/include/thrust/mr/disjoint_sync_pool.h +118 -0
  1318. cuda/cccl/headers/include/thrust/mr/disjoint_tls_pool.h +68 -0
  1319. cuda/cccl/headers/include/thrust/mr/fancy_pointer_resource.h +67 -0
  1320. cuda/cccl/headers/include/thrust/mr/host_memory_resource.h +38 -0
  1321. cuda/cccl/headers/include/thrust/mr/memory_resource.h +217 -0
  1322. cuda/cccl/headers/include/thrust/mr/new.h +100 -0
  1323. cuda/cccl/headers/include/thrust/mr/polymorphic_adaptor.h +63 -0
  1324. cuda/cccl/headers/include/thrust/mr/pool.h +526 -0
  1325. cuda/cccl/headers/include/thrust/mr/pool_options.h +174 -0
  1326. cuda/cccl/headers/include/thrust/mr/sync_pool.h +114 -0
  1327. cuda/cccl/headers/include/thrust/mr/tls_pool.h +65 -0
  1328. cuda/cccl/headers/include/thrust/mr/universal_memory_resource.h +29 -0
  1329. cuda/cccl/headers/include/thrust/mr/validator.h +56 -0
  1330. cuda/cccl/headers/include/thrust/pair.h +102 -0
  1331. cuda/cccl/headers/include/thrust/partition.h +1383 -0
  1332. cuda/cccl/headers/include/thrust/per_device_resource.h +98 -0
  1333. cuda/cccl/headers/include/thrust/random/detail/discard_block_engine.inl +184 -0
  1334. cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine.inl +155 -0
  1335. cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine_discard.h +104 -0
  1336. cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine.inl +151 -0
  1337. cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h +53 -0
  1338. cuda/cccl/headers/include/thrust/random/detail/mod.h +101 -0
  1339. cuda/cccl/headers/include/thrust/random/detail/normal_distribution.inl +187 -0
  1340. cuda/cccl/headers/include/thrust/random/detail/normal_distribution_base.h +160 -0
  1341. cuda/cccl/headers/include/thrust/random/detail/random_core_access.h +63 -0
  1342. cuda/cccl/headers/include/thrust/random/detail/subtract_with_carry_engine.inl +201 -0
  1343. cuda/cccl/headers/include/thrust/random/detail/uniform_int_distribution.inl +198 -0
  1344. cuda/cccl/headers/include/thrust/random/detail/uniform_real_distribution.inl +198 -0
  1345. cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine.inl +183 -0
  1346. cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine_max.h +217 -0
  1347. cuda/cccl/headers/include/thrust/random/discard_block_engine.h +240 -0
  1348. cuda/cccl/headers/include/thrust/random/linear_congruential_engine.h +289 -0
  1349. cuda/cccl/headers/include/thrust/random/linear_feedback_shift_engine.h +217 -0
  1350. cuda/cccl/headers/include/thrust/random/normal_distribution.h +257 -0
  1351. cuda/cccl/headers/include/thrust/random/subtract_with_carry_engine.h +247 -0
  1352. cuda/cccl/headers/include/thrust/random/uniform_int_distribution.h +261 -0
  1353. cuda/cccl/headers/include/thrust/random/uniform_real_distribution.h +258 -0
  1354. cuda/cccl/headers/include/thrust/random/xor_combine_engine.h +255 -0
  1355. cuda/cccl/headers/include/thrust/random.h +120 -0
  1356. cuda/cccl/headers/include/thrust/reduce.h +1112 -0
  1357. cuda/cccl/headers/include/thrust/remove.h +768 -0
  1358. cuda/cccl/headers/include/thrust/replace.h +827 -0
  1359. cuda/cccl/headers/include/thrust/reverse.h +213 -0
  1360. cuda/cccl/headers/include/thrust/scan.h +1671 -0
  1361. cuda/cccl/headers/include/thrust/scatter.h +446 -0
  1362. cuda/cccl/headers/include/thrust/sequence.h +277 -0
  1363. cuda/cccl/headers/include/thrust/set_operations.h +3026 -0
  1364. cuda/cccl/headers/include/thrust/shuffle.h +182 -0
  1365. cuda/cccl/headers/include/thrust/sort.h +1320 -0
  1366. cuda/cccl/headers/include/thrust/swap.h +147 -0
  1367. cuda/cccl/headers/include/thrust/system/cpp/detail/adjacent_difference.h +30 -0
  1368. cuda/cccl/headers/include/thrust/system/cpp/detail/assign_value.h +30 -0
  1369. cuda/cccl/headers/include/thrust/system/cpp/detail/binary_search.h +32 -0
  1370. cuda/cccl/headers/include/thrust/system/cpp/detail/copy.h +30 -0
  1371. cuda/cccl/headers/include/thrust/system/cpp/detail/copy_if.h +30 -0
  1372. cuda/cccl/headers/include/thrust/system/cpp/detail/count.h +29 -0
  1373. cuda/cccl/headers/include/thrust/system/cpp/detail/equal.h +29 -0
  1374. cuda/cccl/headers/include/thrust/system/cpp/detail/execution_policy.h +90 -0
  1375. cuda/cccl/headers/include/thrust/system/cpp/detail/extrema.h +30 -0
  1376. cuda/cccl/headers/include/thrust/system/cpp/detail/fill.h +29 -0
  1377. cuda/cccl/headers/include/thrust/system/cpp/detail/find.h +30 -0
  1378. cuda/cccl/headers/include/thrust/system/cpp/detail/for_each.h +30 -0
  1379. cuda/cccl/headers/include/thrust/system/cpp/detail/gather.h +29 -0
  1380. cuda/cccl/headers/include/thrust/system/cpp/detail/generate.h +29 -0
  1381. cuda/cccl/headers/include/thrust/system/cpp/detail/get_value.h +30 -0
  1382. cuda/cccl/headers/include/thrust/system/cpp/detail/inner_product.h +29 -0
  1383. cuda/cccl/headers/include/thrust/system/cpp/detail/iter_swap.h +30 -0
  1384. cuda/cccl/headers/include/thrust/system/cpp/detail/logical.h +29 -0
  1385. cuda/cccl/headers/include/thrust/system/cpp/detail/malloc_and_free.h +30 -0
  1386. cuda/cccl/headers/include/thrust/system/cpp/detail/memory.inl +60 -0
  1387. cuda/cccl/headers/include/thrust/system/cpp/detail/merge.h +30 -0
  1388. cuda/cccl/headers/include/thrust/system/cpp/detail/mismatch.h +29 -0
  1389. cuda/cccl/headers/include/thrust/system/cpp/detail/par.h +62 -0
  1390. cuda/cccl/headers/include/thrust/system/cpp/detail/partition.h +30 -0
  1391. cuda/cccl/headers/include/thrust/system/cpp/detail/per_device_resource.h +29 -0
  1392. cuda/cccl/headers/include/thrust/system/cpp/detail/reduce.h +30 -0
  1393. cuda/cccl/headers/include/thrust/system/cpp/detail/reduce_by_key.h +30 -0
  1394. cuda/cccl/headers/include/thrust/system/cpp/detail/remove.h +30 -0
  1395. cuda/cccl/headers/include/thrust/system/cpp/detail/replace.h +29 -0
  1396. cuda/cccl/headers/include/thrust/system/cpp/detail/reverse.h +29 -0
  1397. cuda/cccl/headers/include/thrust/system/cpp/detail/scan.h +30 -0
  1398. cuda/cccl/headers/include/thrust/system/cpp/detail/scan_by_key.h +30 -0
  1399. cuda/cccl/headers/include/thrust/system/cpp/detail/scatter.h +29 -0
  1400. cuda/cccl/headers/include/thrust/system/cpp/detail/sequence.h +29 -0
  1401. cuda/cccl/headers/include/thrust/system/cpp/detail/set_operations.h +30 -0
  1402. cuda/cccl/headers/include/thrust/system/cpp/detail/sort.h +30 -0
  1403. cuda/cccl/headers/include/thrust/system/cpp/detail/swap_ranges.h +29 -0
  1404. cuda/cccl/headers/include/thrust/system/cpp/detail/tabulate.h +29 -0
  1405. cuda/cccl/headers/include/thrust/system/cpp/detail/temporary_buffer.h +29 -0
  1406. cuda/cccl/headers/include/thrust/system/cpp/detail/transform.h +29 -0
  1407. cuda/cccl/headers/include/thrust/system/cpp/detail/transform_reduce.h +29 -0
  1408. cuda/cccl/headers/include/thrust/system/cpp/detail/transform_scan.h +29 -0
  1409. cuda/cccl/headers/include/thrust/system/cpp/detail/uninitialized_copy.h +29 -0
  1410. cuda/cccl/headers/include/thrust/system/cpp/detail/uninitialized_fill.h +29 -0
  1411. cuda/cccl/headers/include/thrust/system/cpp/detail/unique.h +30 -0
  1412. cuda/cccl/headers/include/thrust/system/cpp/detail/unique_by_key.h +30 -0
  1413. cuda/cccl/headers/include/thrust/system/cpp/detail/vector.inl +130 -0
  1414. cuda/cccl/headers/include/thrust/system/cpp/execution_policy.h +161 -0
  1415. cuda/cccl/headers/include/thrust/system/cpp/memory.h +109 -0
  1416. cuda/cccl/headers/include/thrust/system/cpp/memory_resource.h +75 -0
  1417. cuda/cccl/headers/include/thrust/system/cpp/pointer.h +123 -0
  1418. cuda/cccl/headers/include/thrust/system/cpp/vector.h +99 -0
  1419. cuda/cccl/headers/include/thrust/system/cuda/config.h +123 -0
  1420. cuda/cccl/headers/include/thrust/system/cuda/detail/adjacent_difference.h +219 -0
  1421. cuda/cccl/headers/include/thrust/system/cuda/detail/assign_value.h +124 -0
  1422. cuda/cccl/headers/include/thrust/system/cuda/detail/binary_search.h +29 -0
  1423. cuda/cccl/headers/include/thrust/system/cuda/detail/cdp_dispatch.h +72 -0
  1424. cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +129 -0
  1425. cuda/cccl/headers/include/thrust/system/cuda/detail/copy_if.h +255 -0
  1426. cuda/cccl/headers/include/thrust/system/cuda/detail/core/agent_launcher.h +289 -0
  1427. cuda/cccl/headers/include/thrust/system/cuda/detail/core/load_iterator.h +58 -0
  1428. cuda/cccl/headers/include/thrust/system/cuda/detail/core/make_load_iterator.h +53 -0
  1429. cuda/cccl/headers/include/thrust/system/cuda/detail/core/triple_chevron_launch.h +191 -0
  1430. cuda/cccl/headers/include/thrust/system/cuda/detail/core/util.h +611 -0
  1431. cuda/cccl/headers/include/thrust/system/cuda/detail/count.h +75 -0
  1432. cuda/cccl/headers/include/thrust/system/cuda/detail/cross_system.h +243 -0
  1433. cuda/cccl/headers/include/thrust/system/cuda/detail/dispatch.h +210 -0
  1434. cuda/cccl/headers/include/thrust/system/cuda/detail/equal.h +64 -0
  1435. cuda/cccl/headers/include/thrust/system/cuda/detail/error.inl +96 -0
  1436. cuda/cccl/headers/include/thrust/system/cuda/detail/execution_policy.h +113 -0
  1437. cuda/cccl/headers/include/thrust/system/cuda/detail/extrema.h +476 -0
  1438. cuda/cccl/headers/include/thrust/system/cuda/detail/fill.h +82 -0
  1439. cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +272 -0
  1440. cuda/cccl/headers/include/thrust/system/cuda/detail/for_each.h +83 -0
  1441. cuda/cccl/headers/include/thrust/system/cuda/detail/gather.h +91 -0
  1442. cuda/cccl/headers/include/thrust/system/cuda/detail/generate.h +85 -0
  1443. cuda/cccl/headers/include/thrust/system/cuda/detail/get_value.h +65 -0
  1444. cuda/cccl/headers/include/thrust/system/cuda/detail/inner_product.h +75 -0
  1445. cuda/cccl/headers/include/thrust/system/cuda/detail/internal/copy_cross_system.h +204 -0
  1446. cuda/cccl/headers/include/thrust/system/cuda/detail/internal/copy_device_to_device.h +89 -0
  1447. cuda/cccl/headers/include/thrust/system/cuda/detail/iter_swap.h +69 -0
  1448. cuda/cccl/headers/include/thrust/system/cuda/detail/logical.h +29 -0
  1449. cuda/cccl/headers/include/thrust/system/cuda/detail/make_unsigned_special.h +61 -0
  1450. cuda/cccl/headers/include/thrust/system/cuda/detail/malloc_and_free.h +121 -0
  1451. cuda/cccl/headers/include/thrust/system/cuda/detail/memory.inl +57 -0
  1452. cuda/cccl/headers/include/thrust/system/cuda/detail/merge.h +228 -0
  1453. cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +217 -0
  1454. cuda/cccl/headers/include/thrust/system/cuda/detail/par.h +237 -0
  1455. cuda/cccl/headers/include/thrust/system/cuda/detail/par_to_seq.h +95 -0
  1456. cuda/cccl/headers/include/thrust/system/cuda/detail/parallel_for.h +81 -0
  1457. cuda/cccl/headers/include/thrust/system/cuda/detail/partition.h +405 -0
  1458. cuda/cccl/headers/include/thrust/system/cuda/detail/per_device_resource.h +72 -0
  1459. cuda/cccl/headers/include/thrust/system/cuda/detail/reduce.h +781 -0
  1460. cuda/cccl/headers/include/thrust/system/cuda/detail/reduce_by_key.h +1000 -0
  1461. cuda/cccl/headers/include/thrust/system/cuda/detail/remove.h +107 -0
  1462. cuda/cccl/headers/include/thrust/system/cuda/detail/replace.h +152 -0
  1463. cuda/cccl/headers/include/thrust/system/cuda/detail/reverse.h +88 -0
  1464. cuda/cccl/headers/include/thrust/system/cuda/detail/scan.h +342 -0
  1465. cuda/cccl/headers/include/thrust/system/cuda/detail/scan_by_key.h +415 -0
  1466. cuda/cccl/headers/include/thrust/system/cuda/detail/scatter.h +79 -0
  1467. cuda/cccl/headers/include/thrust/system/cuda/detail/sequence.h +29 -0
  1468. cuda/cccl/headers/include/thrust/system/cuda/detail/set_operations.h +1736 -0
  1469. cuda/cccl/headers/include/thrust/system/cuda/detail/sort.h +482 -0
  1470. cuda/cccl/headers/include/thrust/system/cuda/detail/swap_ranges.h +75 -0
  1471. cuda/cccl/headers/include/thrust/system/cuda/detail/tabulate.h +75 -0
  1472. cuda/cccl/headers/include/thrust/system/cuda/detail/temporary_buffer.h +132 -0
  1473. cuda/cccl/headers/include/thrust/system/cuda/detail/terminate.h +53 -0
  1474. cuda/cccl/headers/include/thrust/system/cuda/detail/transform.h +403 -0
  1475. cuda/cccl/headers/include/thrust/system/cuda/detail/transform_reduce.h +143 -0
  1476. cuda/cccl/headers/include/thrust/system/cuda/detail/transform_scan.h +119 -0
  1477. cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_copy.h +94 -0
  1478. cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_fill.h +91 -0
  1479. cuda/cccl/headers/include/thrust/system/cuda/detail/unique.h +646 -0
  1480. cuda/cccl/headers/include/thrust/system/cuda/detail/unique_by_key.h +311 -0
  1481. cuda/cccl/headers/include/thrust/system/cuda/detail/util.h +251 -0
  1482. cuda/cccl/headers/include/thrust/system/cuda/error.h +168 -0
  1483. cuda/cccl/headers/include/thrust/system/cuda/execution_policy.h +39 -0
  1484. cuda/cccl/headers/include/thrust/system/cuda/memory.h +122 -0
  1485. cuda/cccl/headers/include/thrust/system/cuda/memory_resource.h +122 -0
  1486. cuda/cccl/headers/include/thrust/system/cuda/pointer.h +160 -0
  1487. cuda/cccl/headers/include/thrust/system/cuda/vector.h +108 -0
  1488. cuda/cccl/headers/include/thrust/system/detail/adl/adjacent_difference.h +51 -0
  1489. cuda/cccl/headers/include/thrust/system/detail/adl/assign_value.h +51 -0
  1490. cuda/cccl/headers/include/thrust/system/detail/adl/binary_search.h +51 -0
  1491. cuda/cccl/headers/include/thrust/system/detail/adl/copy.h +51 -0
  1492. cuda/cccl/headers/include/thrust/system/detail/adl/copy_if.h +52 -0
  1493. cuda/cccl/headers/include/thrust/system/detail/adl/count.h +51 -0
  1494. cuda/cccl/headers/include/thrust/system/detail/adl/equal.h +51 -0
  1495. cuda/cccl/headers/include/thrust/system/detail/adl/extrema.h +51 -0
  1496. cuda/cccl/headers/include/thrust/system/detail/adl/fill.h +51 -0
  1497. cuda/cccl/headers/include/thrust/system/detail/adl/find.h +51 -0
  1498. cuda/cccl/headers/include/thrust/system/detail/adl/for_each.h +51 -0
  1499. cuda/cccl/headers/include/thrust/system/detail/adl/gather.h +51 -0
  1500. cuda/cccl/headers/include/thrust/system/detail/adl/generate.h +51 -0
  1501. cuda/cccl/headers/include/thrust/system/detail/adl/get_value.h +51 -0
  1502. cuda/cccl/headers/include/thrust/system/detail/adl/inner_product.h +51 -0
  1503. cuda/cccl/headers/include/thrust/system/detail/adl/iter_swap.h +51 -0
  1504. cuda/cccl/headers/include/thrust/system/detail/adl/logical.h +51 -0
  1505. cuda/cccl/headers/include/thrust/system/detail/adl/malloc_and_free.h +51 -0
  1506. cuda/cccl/headers/include/thrust/system/detail/adl/merge.h +51 -0
  1507. cuda/cccl/headers/include/thrust/system/detail/adl/mismatch.h +51 -0
  1508. cuda/cccl/headers/include/thrust/system/detail/adl/partition.h +51 -0
  1509. cuda/cccl/headers/include/thrust/system/detail/adl/per_device_resource.h +48 -0
  1510. cuda/cccl/headers/include/thrust/system/detail/adl/reduce.h +51 -0
  1511. cuda/cccl/headers/include/thrust/system/detail/adl/reduce_by_key.h +51 -0
  1512. cuda/cccl/headers/include/thrust/system/detail/adl/remove.h +51 -0
  1513. cuda/cccl/headers/include/thrust/system/detail/adl/replace.h +51 -0
  1514. cuda/cccl/headers/include/thrust/system/detail/adl/reverse.h +51 -0
  1515. cuda/cccl/headers/include/thrust/system/detail/adl/scan.h +51 -0
  1516. cuda/cccl/headers/include/thrust/system/detail/adl/scan_by_key.h +51 -0
  1517. cuda/cccl/headers/include/thrust/system/detail/adl/scatter.h +51 -0
  1518. cuda/cccl/headers/include/thrust/system/detail/adl/sequence.h +51 -0
  1519. cuda/cccl/headers/include/thrust/system/detail/adl/set_operations.h +51 -0
  1520. cuda/cccl/headers/include/thrust/system/detail/adl/sort.h +51 -0
  1521. cuda/cccl/headers/include/thrust/system/detail/adl/swap_ranges.h +51 -0
  1522. cuda/cccl/headers/include/thrust/system/detail/adl/tabulate.h +51 -0
  1523. cuda/cccl/headers/include/thrust/system/detail/adl/temporary_buffer.h +51 -0
  1524. cuda/cccl/headers/include/thrust/system/detail/adl/transform.h +51 -0
  1525. cuda/cccl/headers/include/thrust/system/detail/adl/transform_reduce.h +51 -0
  1526. cuda/cccl/headers/include/thrust/system/detail/adl/transform_scan.h +51 -0
  1527. cuda/cccl/headers/include/thrust/system/detail/adl/uninitialized_copy.h +51 -0
  1528. cuda/cccl/headers/include/thrust/system/detail/adl/uninitialized_fill.h +51 -0
  1529. cuda/cccl/headers/include/thrust/system/detail/adl/unique.h +51 -0
  1530. cuda/cccl/headers/include/thrust/system/detail/adl/unique_by_key.h +51 -0
  1531. cuda/cccl/headers/include/thrust/system/detail/bad_alloc.h +64 -0
  1532. cuda/cccl/headers/include/thrust/system/detail/errno.h +125 -0
  1533. cuda/cccl/headers/include/thrust/system/detail/error_category.inl +302 -0
  1534. cuda/cccl/headers/include/thrust/system/detail/error_code.inl +173 -0
  1535. cuda/cccl/headers/include/thrust/system/detail/error_condition.inl +121 -0
  1536. cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.h +59 -0
  1537. cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.inl +85 -0
  1538. cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.h +167 -0
  1539. cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.inl +391 -0
  1540. cuda/cccl/headers/include/thrust/system/detail/generic/copy.h +51 -0
  1541. cuda/cccl/headers/include/thrust/system/detail/generic/copy.inl +70 -0
  1542. cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.h +64 -0
  1543. cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.inl +152 -0
  1544. cuda/cccl/headers/include/thrust/system/detail/generic/count.h +54 -0
  1545. cuda/cccl/headers/include/thrust/system/detail/generic/count.inl +90 -0
  1546. cuda/cccl/headers/include/thrust/system/detail/generic/equal.h +55 -0
  1547. cuda/cccl/headers/include/thrust/system/detail/generic/equal.inl +66 -0
  1548. cuda/cccl/headers/include/thrust/system/detail/generic/extrema.h +72 -0
  1549. cuda/cccl/headers/include/thrust/system/detail/generic/extrema.inl +258 -0
  1550. cuda/cccl/headers/include/thrust/system/detail/generic/fill.h +60 -0
  1551. cuda/cccl/headers/include/thrust/system/detail/generic/find.h +55 -0
  1552. cuda/cccl/headers/include/thrust/system/detail/generic/find.inl +143 -0
  1553. cuda/cccl/headers/include/thrust/system/detail/generic/for_each.h +64 -0
  1554. cuda/cccl/headers/include/thrust/system/detail/generic/gather.h +79 -0
  1555. cuda/cccl/headers/include/thrust/system/detail/generic/gather.inl +102 -0
  1556. cuda/cccl/headers/include/thrust/system/detail/generic/generate.h +51 -0
  1557. cuda/cccl/headers/include/thrust/system/detail/generic/generate.inl +63 -0
  1558. cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.h +66 -0
  1559. cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.inl +78 -0
  1560. cuda/cccl/headers/include/thrust/system/detail/generic/logical.h +65 -0
  1561. cuda/cccl/headers/include/thrust/system/detail/generic/memory.h +70 -0
  1562. cuda/cccl/headers/include/thrust/system/detail/generic/memory.inl +83 -0
  1563. cuda/cccl/headers/include/thrust/system/detail/generic/merge.h +105 -0
  1564. cuda/cccl/headers/include/thrust/system/detail/generic/merge.inl +154 -0
  1565. cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.h +55 -0
  1566. cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.inl +74 -0
  1567. cuda/cccl/headers/include/thrust/system/detail/generic/partition.h +135 -0
  1568. cuda/cccl/headers/include/thrust/system/detail/generic/partition.inl +213 -0
  1569. cuda/cccl/headers/include/thrust/system/detail/generic/per_device_resource.h +49 -0
  1570. cuda/cccl/headers/include/thrust/system/detail/generic/reduce.h +77 -0
  1571. cuda/cccl/headers/include/thrust/system/detail/generic/reduce.inl +106 -0
  1572. cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.h +89 -0
  1573. cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.inl +192 -0
  1574. cuda/cccl/headers/include/thrust/system/detail/generic/remove.h +92 -0
  1575. cuda/cccl/headers/include/thrust/system/detail/generic/remove.inl +127 -0
  1576. cuda/cccl/headers/include/thrust/system/detail/generic/replace.h +101 -0
  1577. cuda/cccl/headers/include/thrust/system/detail/generic/replace.inl +181 -0
  1578. cuda/cccl/headers/include/thrust/system/detail/generic/reverse.h +54 -0
  1579. cuda/cccl/headers/include/thrust/system/detail/generic/reverse.inl +72 -0
  1580. cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.h +78 -0
  1581. cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.inl +141 -0
  1582. cuda/cccl/headers/include/thrust/system/detail/generic/scan.h +78 -0
  1583. cuda/cccl/headers/include/thrust/system/detail/generic/scan.inl +91 -0
  1584. cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.h +132 -0
  1585. cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.inl +238 -0
  1586. cuda/cccl/headers/include/thrust/system/detail/generic/scatter.h +79 -0
  1587. cuda/cccl/headers/include/thrust/system/detail/generic/scatter.inl +91 -0
  1588. cuda/cccl/headers/include/thrust/system/detail/generic/select_system.h +96 -0
  1589. cuda/cccl/headers/include/thrust/system/detail/generic/sequence.h +70 -0
  1590. cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.h +288 -0
  1591. cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.inl +482 -0
  1592. cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.h +60 -0
  1593. cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.inl +131 -0
  1594. cuda/cccl/headers/include/thrust/system/detail/generic/sort.h +119 -0
  1595. cuda/cccl/headers/include/thrust/system/detail/generic/sort.inl +181 -0
  1596. cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.h +50 -0
  1597. cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.inl +82 -0
  1598. cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.h +47 -0
  1599. cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.inl +60 -0
  1600. cuda/cccl/headers/include/thrust/system/detail/generic/tag.h +53 -0
  1601. cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.h +60 -0
  1602. cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.inl +88 -0
  1603. cuda/cccl/headers/include/thrust/system/detail/generic/transform.h +395 -0
  1604. cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.h +56 -0
  1605. cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.inl +62 -0
  1606. cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.h +86 -0
  1607. cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.inl +119 -0
  1608. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.h +51 -0
  1609. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.inl +172 -0
  1610. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.h +51 -0
  1611. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.inl +121 -0
  1612. cuda/cccl/headers/include/thrust/system/detail/generic/unique.h +77 -0
  1613. cuda/cccl/headers/include/thrust/system/detail/generic/unique.inl +119 -0
  1614. cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.h +87 -0
  1615. cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.inl +132 -0
  1616. cuda/cccl/headers/include/thrust/system/detail/internal/decompose.h +123 -0
  1617. cuda/cccl/headers/include/thrust/system/detail/sequential/adjacent_difference.h +76 -0
  1618. cuda/cccl/headers/include/thrust/system/detail/sequential/assign_value.h +48 -0
  1619. cuda/cccl/headers/include/thrust/system/detail/sequential/binary_search.h +142 -0
  1620. cuda/cccl/headers/include/thrust/system/detail/sequential/copy.h +55 -0
  1621. cuda/cccl/headers/include/thrust/system/detail/sequential/copy.inl +125 -0
  1622. cuda/cccl/headers/include/thrust/system/detail/sequential/copy_backward.h +55 -0
  1623. cuda/cccl/headers/include/thrust/system/detail/sequential/copy_if.h +77 -0
  1624. cuda/cccl/headers/include/thrust/system/detail/sequential/count.h +29 -0
  1625. cuda/cccl/headers/include/thrust/system/detail/sequential/equal.h +29 -0
  1626. cuda/cccl/headers/include/thrust/system/detail/sequential/execution_policy.h +78 -0
  1627. cuda/cccl/headers/include/thrust/system/detail/sequential/extrema.h +116 -0
  1628. cuda/cccl/headers/include/thrust/system/detail/sequential/fill.h +29 -0
  1629. cuda/cccl/headers/include/thrust/system/detail/sequential/find.h +68 -0
  1630. cuda/cccl/headers/include/thrust/system/detail/sequential/for_each.h +80 -0
  1631. cuda/cccl/headers/include/thrust/system/detail/sequential/gather.h +29 -0
  1632. cuda/cccl/headers/include/thrust/system/detail/sequential/general_copy.h +129 -0
  1633. cuda/cccl/headers/include/thrust/system/detail/sequential/generate.h +29 -0
  1634. cuda/cccl/headers/include/thrust/system/detail/sequential/get_value.h +49 -0
  1635. cuda/cccl/headers/include/thrust/system/detail/sequential/inner_product.h +29 -0
  1636. cuda/cccl/headers/include/thrust/system/detail/sequential/insertion_sort.h +147 -0
  1637. cuda/cccl/headers/include/thrust/system/detail/sequential/iter_swap.h +51 -0
  1638. cuda/cccl/headers/include/thrust/system/detail/sequential/logical.h +29 -0
  1639. cuda/cccl/headers/include/thrust/system/detail/sequential/malloc_and_free.h +56 -0
  1640. cuda/cccl/headers/include/thrust/system/detail/sequential/merge.h +81 -0
  1641. cuda/cccl/headers/include/thrust/system/detail/sequential/merge.inl +151 -0
  1642. cuda/cccl/headers/include/thrust/system/detail/sequential/mismatch.h +29 -0
  1643. cuda/cccl/headers/include/thrust/system/detail/sequential/partition.h +309 -0
  1644. cuda/cccl/headers/include/thrust/system/detail/sequential/per_device_resource.h +29 -0
  1645. cuda/cccl/headers/include/thrust/system/detail/sequential/reduce.h +70 -0
  1646. cuda/cccl/headers/include/thrust/system/detail/sequential/reduce_by_key.h +104 -0
  1647. cuda/cccl/headers/include/thrust/system/detail/sequential/remove.h +185 -0
  1648. cuda/cccl/headers/include/thrust/system/detail/sequential/replace.h +29 -0
  1649. cuda/cccl/headers/include/thrust/system/detail/sequential/reverse.h +29 -0
  1650. cuda/cccl/headers/include/thrust/system/detail/sequential/scan.h +160 -0
  1651. cuda/cccl/headers/include/thrust/system/detail/sequential/scan_by_key.h +151 -0
  1652. cuda/cccl/headers/include/thrust/system/detail/sequential/scatter.h +29 -0
  1653. cuda/cccl/headers/include/thrust/system/detail/sequential/sequence.h +29 -0
  1654. cuda/cccl/headers/include/thrust/system/detail/sequential/set_operations.h +212 -0
  1655. cuda/cccl/headers/include/thrust/system/detail/sequential/sort.h +65 -0
  1656. cuda/cccl/headers/include/thrust/system/detail/sequential/sort.inl +116 -0
  1657. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.h +61 -0
  1658. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.inl +362 -0
  1659. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.h +54 -0
  1660. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.inl +130 -0
  1661. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.h +54 -0
  1662. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.inl +592 -0
  1663. cuda/cccl/headers/include/thrust/system/detail/sequential/swap_ranges.h +29 -0
  1664. cuda/cccl/headers/include/thrust/system/detail/sequential/tabulate.h +29 -0
  1665. cuda/cccl/headers/include/thrust/system/detail/sequential/temporary_buffer.h +29 -0
  1666. cuda/cccl/headers/include/thrust/system/detail/sequential/transform.h +29 -0
  1667. cuda/cccl/headers/include/thrust/system/detail/sequential/transform_reduce.h +29 -0
  1668. cuda/cccl/headers/include/thrust/system/detail/sequential/transform_scan.h +29 -0
  1669. cuda/cccl/headers/include/thrust/system/detail/sequential/trivial_copy.h +64 -0
  1670. cuda/cccl/headers/include/thrust/system/detail/sequential/uninitialized_copy.h +29 -0
  1671. cuda/cccl/headers/include/thrust/system/detail/sequential/uninitialized_fill.h +29 -0
  1672. cuda/cccl/headers/include/thrust/system/detail/sequential/unique.h +121 -0
  1673. cuda/cccl/headers/include/thrust/system/detail/sequential/unique_by_key.h +112 -0
  1674. cuda/cccl/headers/include/thrust/system/detail/system_error.inl +108 -0
  1675. cuda/cccl/headers/include/thrust/system/error_code.h +512 -0
  1676. cuda/cccl/headers/include/thrust/system/omp/detail/adjacent_difference.h +54 -0
  1677. cuda/cccl/headers/include/thrust/system/omp/detail/assign_value.h +30 -0
  1678. cuda/cccl/headers/include/thrust/system/omp/detail/binary_search.h +77 -0
  1679. cuda/cccl/headers/include/thrust/system/omp/detail/copy.h +50 -0
  1680. cuda/cccl/headers/include/thrust/system/omp/detail/copy.inl +74 -0
  1681. cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.h +56 -0
  1682. cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.inl +59 -0
  1683. cuda/cccl/headers/include/thrust/system/omp/detail/count.h +30 -0
  1684. cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.h +50 -0
  1685. cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.inl +65 -0
  1686. cuda/cccl/headers/include/thrust/system/omp/detail/equal.h +30 -0
  1687. cuda/cccl/headers/include/thrust/system/omp/detail/execution_policy.h +113 -0
  1688. cuda/cccl/headers/include/thrust/system/omp/detail/extrema.h +66 -0
  1689. cuda/cccl/headers/include/thrust/system/omp/detail/fill.h +30 -0
  1690. cuda/cccl/headers/include/thrust/system/omp/detail/find.h +53 -0
  1691. cuda/cccl/headers/include/thrust/system/omp/detail/for_each.h +56 -0
  1692. cuda/cccl/headers/include/thrust/system/omp/detail/for_each.inl +87 -0
  1693. cuda/cccl/headers/include/thrust/system/omp/detail/gather.h +30 -0
  1694. cuda/cccl/headers/include/thrust/system/omp/detail/generate.h +30 -0
  1695. cuda/cccl/headers/include/thrust/system/omp/detail/get_value.h +30 -0
  1696. cuda/cccl/headers/include/thrust/system/omp/detail/inner_product.h +30 -0
  1697. cuda/cccl/headers/include/thrust/system/omp/detail/iter_swap.h +30 -0
  1698. cuda/cccl/headers/include/thrust/system/omp/detail/logical.h +30 -0
  1699. cuda/cccl/headers/include/thrust/system/omp/detail/malloc_and_free.h +30 -0
  1700. cuda/cccl/headers/include/thrust/system/omp/detail/memory.inl +93 -0
  1701. cuda/cccl/headers/include/thrust/system/omp/detail/merge.h +30 -0
  1702. cuda/cccl/headers/include/thrust/system/omp/detail/mismatch.h +30 -0
  1703. cuda/cccl/headers/include/thrust/system/omp/detail/par.h +62 -0
  1704. cuda/cccl/headers/include/thrust/system/omp/detail/partition.h +88 -0
  1705. cuda/cccl/headers/include/thrust/system/omp/detail/partition.inl +102 -0
  1706. cuda/cccl/headers/include/thrust/system/omp/detail/per_device_resource.h +29 -0
  1707. cuda/cccl/headers/include/thrust/system/omp/detail/pragma_omp.h +54 -0
  1708. cuda/cccl/headers/include/thrust/system/omp/detail/reduce.h +54 -0
  1709. cuda/cccl/headers/include/thrust/system/omp/detail/reduce.inl +78 -0
  1710. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.h +64 -0
  1711. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.inl +65 -0
  1712. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.h +59 -0
  1713. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.inl +103 -0
  1714. cuda/cccl/headers/include/thrust/system/omp/detail/remove.h +72 -0
  1715. cuda/cccl/headers/include/thrust/system/omp/detail/remove.inl +87 -0
  1716. cuda/cccl/headers/include/thrust/system/omp/detail/replace.h +30 -0
  1717. cuda/cccl/headers/include/thrust/system/omp/detail/reverse.h +30 -0
  1718. cuda/cccl/headers/include/thrust/system/omp/detail/scan.h +30 -0
  1719. cuda/cccl/headers/include/thrust/system/omp/detail/scan_by_key.h +30 -0
  1720. cuda/cccl/headers/include/thrust/system/omp/detail/scatter.h +30 -0
  1721. cuda/cccl/headers/include/thrust/system/omp/detail/sequence.h +30 -0
  1722. cuda/cccl/headers/include/thrust/system/omp/detail/set_operations.h +30 -0
  1723. cuda/cccl/headers/include/thrust/system/omp/detail/sort.h +60 -0
  1724. cuda/cccl/headers/include/thrust/system/omp/detail/sort.inl +265 -0
  1725. cuda/cccl/headers/include/thrust/system/omp/detail/swap_ranges.h +30 -0
  1726. cuda/cccl/headers/include/thrust/system/omp/detail/tabulate.h +30 -0
  1727. cuda/cccl/headers/include/thrust/system/omp/detail/temporary_buffer.h +29 -0
  1728. cuda/cccl/headers/include/thrust/system/omp/detail/transform.h +30 -0
  1729. cuda/cccl/headers/include/thrust/system/omp/detail/transform_reduce.h +30 -0
  1730. cuda/cccl/headers/include/thrust/system/omp/detail/transform_scan.h +30 -0
  1731. cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_copy.h +30 -0
  1732. cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_fill.h +30 -0
  1733. cuda/cccl/headers/include/thrust/system/omp/detail/unique.h +60 -0
  1734. cuda/cccl/headers/include/thrust/system/omp/detail/unique.inl +71 -0
  1735. cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.h +67 -0
  1736. cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.inl +75 -0
  1737. cuda/cccl/headers/include/thrust/system/omp/execution_policy.h +160 -0
  1738. cuda/cccl/headers/include/thrust/system/omp/memory.h +111 -0
  1739. cuda/cccl/headers/include/thrust/system/omp/memory_resource.h +75 -0
  1740. cuda/cccl/headers/include/thrust/system/omp/pointer.h +124 -0
  1741. cuda/cccl/headers/include/thrust/system/omp/vector.h +99 -0
  1742. cuda/cccl/headers/include/thrust/system/system_error.h +184 -0
  1743. cuda/cccl/headers/include/thrust/system/tbb/detail/adjacent_difference.h +54 -0
  1744. cuda/cccl/headers/include/thrust/system/tbb/detail/assign_value.h +30 -0
  1745. cuda/cccl/headers/include/thrust/system/tbb/detail/binary_search.h +30 -0
  1746. cuda/cccl/headers/include/thrust/system/tbb/detail/copy.h +50 -0
  1747. cuda/cccl/headers/include/thrust/system/tbb/detail/copy.inl +73 -0
  1748. cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.h +47 -0
  1749. cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.inl +136 -0
  1750. cuda/cccl/headers/include/thrust/system/tbb/detail/count.h +30 -0
  1751. cuda/cccl/headers/include/thrust/system/tbb/detail/equal.h +30 -0
  1752. cuda/cccl/headers/include/thrust/system/tbb/detail/execution_policy.h +92 -0
  1753. cuda/cccl/headers/include/thrust/system/tbb/detail/extrema.h +66 -0
  1754. cuda/cccl/headers/include/thrust/system/tbb/detail/fill.h +30 -0
  1755. cuda/cccl/headers/include/thrust/system/tbb/detail/find.h +49 -0
  1756. cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.h +51 -0
  1757. cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.inl +91 -0
  1758. cuda/cccl/headers/include/thrust/system/tbb/detail/gather.h +30 -0
  1759. cuda/cccl/headers/include/thrust/system/tbb/detail/generate.h +30 -0
  1760. cuda/cccl/headers/include/thrust/system/tbb/detail/get_value.h +30 -0
  1761. cuda/cccl/headers/include/thrust/system/tbb/detail/inner_product.h +30 -0
  1762. cuda/cccl/headers/include/thrust/system/tbb/detail/iter_swap.h +30 -0
  1763. cuda/cccl/headers/include/thrust/system/tbb/detail/logical.h +30 -0
  1764. cuda/cccl/headers/include/thrust/system/tbb/detail/malloc_and_free.h +30 -0
  1765. cuda/cccl/headers/include/thrust/system/tbb/detail/memory.inl +94 -0
  1766. cuda/cccl/headers/include/thrust/system/tbb/detail/merge.h +77 -0
  1767. cuda/cccl/headers/include/thrust/system/tbb/detail/merge.inl +327 -0
  1768. cuda/cccl/headers/include/thrust/system/tbb/detail/mismatch.h +30 -0
  1769. cuda/cccl/headers/include/thrust/system/tbb/detail/par.h +62 -0
  1770. cuda/cccl/headers/include/thrust/system/tbb/detail/partition.h +84 -0
  1771. cuda/cccl/headers/include/thrust/system/tbb/detail/partition.inl +98 -0
  1772. cuda/cccl/headers/include/thrust/system/tbb/detail/per_device_resource.h +29 -0
  1773. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.h +54 -0
  1774. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.inl +137 -0
  1775. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.h +61 -0
  1776. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.inl +400 -0
  1777. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_intervals.h +140 -0
  1778. cuda/cccl/headers/include/thrust/system/tbb/detail/remove.h +76 -0
  1779. cuda/cccl/headers/include/thrust/system/tbb/detail/remove.inl +87 -0
  1780. cuda/cccl/headers/include/thrust/system/tbb/detail/replace.h +30 -0
  1781. cuda/cccl/headers/include/thrust/system/tbb/detail/reverse.h +30 -0
  1782. cuda/cccl/headers/include/thrust/system/tbb/detail/scan.h +59 -0
  1783. cuda/cccl/headers/include/thrust/system/tbb/detail/scan.inl +312 -0
  1784. cuda/cccl/headers/include/thrust/system/tbb/detail/scan_by_key.h +30 -0
  1785. cuda/cccl/headers/include/thrust/system/tbb/detail/scatter.h +30 -0
  1786. cuda/cccl/headers/include/thrust/system/tbb/detail/sequence.h +30 -0
  1787. cuda/cccl/headers/include/thrust/system/tbb/detail/set_operations.h +30 -0
  1788. cuda/cccl/headers/include/thrust/system/tbb/detail/sort.h +60 -0
  1789. cuda/cccl/headers/include/thrust/system/tbb/detail/sort.inl +295 -0
  1790. cuda/cccl/headers/include/thrust/system/tbb/detail/swap_ranges.h +30 -0
  1791. cuda/cccl/headers/include/thrust/system/tbb/detail/tabulate.h +30 -0
  1792. cuda/cccl/headers/include/thrust/system/tbb/detail/temporary_buffer.h +29 -0
  1793. cuda/cccl/headers/include/thrust/system/tbb/detail/transform.h +30 -0
  1794. cuda/cccl/headers/include/thrust/system/tbb/detail/transform_reduce.h +30 -0
  1795. cuda/cccl/headers/include/thrust/system/tbb/detail/transform_scan.h +30 -0
  1796. cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_copy.h +30 -0
  1797. cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_fill.h +30 -0
  1798. cuda/cccl/headers/include/thrust/system/tbb/detail/unique.h +60 -0
  1799. cuda/cccl/headers/include/thrust/system/tbb/detail/unique.inl +71 -0
  1800. cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.h +67 -0
  1801. cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.inl +75 -0
  1802. cuda/cccl/headers/include/thrust/system/tbb/execution_policy.h +160 -0
  1803. cuda/cccl/headers/include/thrust/system/tbb/memory.h +111 -0
  1804. cuda/cccl/headers/include/thrust/system/tbb/memory_resource.h +75 -0
  1805. cuda/cccl/headers/include/thrust/system/tbb/pointer.h +124 -0
  1806. cuda/cccl/headers/include/thrust/system/tbb/vector.h +99 -0
  1807. cuda/cccl/headers/include/thrust/system_error.h +57 -0
  1808. cuda/cccl/headers/include/thrust/tabulate.h +125 -0
  1809. cuda/cccl/headers/include/thrust/transform.h +1045 -0
  1810. cuda/cccl/headers/include/thrust/transform_reduce.h +190 -0
  1811. cuda/cccl/headers/include/thrust/transform_scan.h +442 -0
  1812. cuda/cccl/headers/include/thrust/tuple.h +142 -0
  1813. cuda/cccl/headers/include/thrust/type_traits/integer_sequence.h +261 -0
  1814. cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +154 -0
  1815. cuda/cccl/headers/include/thrust/type_traits/is_execution_policy.h +65 -0
  1816. cuda/cccl/headers/include/thrust/type_traits/is_operator_less_or_greater_function_object.h +184 -0
  1817. cuda/cccl/headers/include/thrust/type_traits/is_operator_plus_function_object.h +116 -0
  1818. cuda/cccl/headers/include/thrust/type_traits/is_trivially_relocatable.h +336 -0
  1819. cuda/cccl/headers/include/thrust/type_traits/logical_metafunctions.h +42 -0
  1820. cuda/cccl/headers/include/thrust/type_traits/unwrap_contiguous_iterator.h +96 -0
  1821. cuda/cccl/headers/include/thrust/uninitialized_copy.h +300 -0
  1822. cuda/cccl/headers/include/thrust/uninitialized_fill.h +268 -0
  1823. cuda/cccl/headers/include/thrust/unique.h +1090 -0
  1824. cuda/cccl/headers/include/thrust/universal_allocator.h +93 -0
  1825. cuda/cccl/headers/include/thrust/universal_ptr.h +34 -0
  1826. cuda/cccl/headers/include/thrust/universal_vector.h +71 -0
  1827. cuda/cccl/headers/include/thrust/version.h +93 -0
  1828. cuda/cccl/headers/include/thrust/zip_function.h +176 -0
  1829. cuda/cccl/headers/include_paths.py +72 -0
  1830. cuda/cccl/parallel/__init__.py +9 -0
  1831. cuda/cccl/parallel/experimental/__init__.py +47 -0
  1832. cuda/cccl/parallel/experimental/_bindings.py +24 -0
  1833. cuda/cccl/parallel/experimental/_bindings.pyi +388 -0
  1834. cuda/cccl/parallel/experimental/_bindings_impl.cpython-313-x86_64-linux-gnu.so +0 -0
  1835. cuda/cccl/parallel/experimental/_bindings_impl.pyx +2158 -0
  1836. cuda/cccl/parallel/experimental/_caching.py +71 -0
  1837. cuda/cccl/parallel/experimental/_cccl_interop.py +382 -0
  1838. cuda/cccl/parallel/experimental/_utils/__init__.py +0 -0
  1839. cuda/cccl/parallel/experimental/_utils/protocols.py +132 -0
  1840. cuda/cccl/parallel/experimental/algorithms/__init__.py +28 -0
  1841. cuda/cccl/parallel/experimental/algorithms/_merge_sort.py +172 -0
  1842. cuda/cccl/parallel/experimental/algorithms/_radix_sort.py +244 -0
  1843. cuda/cccl/parallel/experimental/algorithms/_reduce.py +136 -0
  1844. cuda/cccl/parallel/experimental/algorithms/_scan.py +179 -0
  1845. cuda/cccl/parallel/experimental/algorithms/_segmented_reduce.py +183 -0
  1846. cuda/cccl/parallel/experimental/algorithms/_transform.py +213 -0
  1847. cuda/cccl/parallel/experimental/algorithms/_unique_by_key.py +179 -0
  1848. cuda/cccl/parallel/experimental/cccl/.gitkeep +0 -0
  1849. cuda/cccl/parallel/experimental/cccl/libcccl.c.parallel.so +0 -0
  1850. cuda/cccl/parallel/experimental/iterators/__init__.py +17 -0
  1851. cuda/cccl/parallel/experimental/iterators/_factories.py +157 -0
  1852. cuda/cccl/parallel/experimental/iterators/_iterators.py +650 -0
  1853. cuda/cccl/parallel/experimental/numba_utils.py +6 -0
  1854. cuda/cccl/parallel/experimental/struct.py +150 -0
  1855. cuda/cccl/parallel/experimental/typing.py +27 -0
  1856. cuda/cccl/py.typed +0 -0
  1857. cuda_cccl-0.1.3.1.0.dev1678.dist-info/METADATA +28 -0
  1858. cuda_cccl-0.1.3.1.0.dev1678.dist-info/RECORD +1860 -0
  1859. cuda_cccl-0.1.3.1.0.dev1678.dist-info/WHEEL +6 -0
  1860. cuda_cccl-0.1.3.1.0.dev1678.dist-info/licenses/LICENSE +1 -0
@@ -0,0 +1,2583 @@
1
+ /******************************************************************************
2
+ * Copyright (c) 2011, Duane Merrill. All rights reserved.
3
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ * * Redistributions of source code must retain the above copyright
8
+ * notice, this list of conditions and the following disclaimer.
9
+ * * Redistributions in binary form must reproduce the above copyright
10
+ * notice, this list of conditions and the following disclaimer in the
11
+ * documentation and/or other materials provided with the distribution.
12
+ * * Neither the name of the NVIDIA CORPORATION nor the
13
+ * names of its contributors may be used to endorse or promote products
14
+ * derived from this software without specific prior written permission.
15
+ *
16
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
+ *
27
+ ******************************************************************************/
28
+
29
+ //! @file
30
+ //! The cub::BlockScan class provides :ref:`collective <collective-primitives>` methods for computing a parallel prefix
31
+ //! sum/scan of items partitioned across a CUDA thread block.
32
+
33
+ #pragma once
34
+
35
+ #include <cub/config.cuh>
36
+
37
+ #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
38
+ # pragma GCC system_header
39
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
40
+ # pragma clang system_header
41
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
42
+ # pragma system_header
43
+ #endif // no system header
44
+
45
+ #include <cub/block/specializations/block_scan_raking.cuh>
46
+ #include <cub/block/specializations/block_scan_warp_scans.cuh>
47
+ #include <cub/util_ptx.cuh>
48
+ #include <cub/util_type.cuh>
49
+
50
+ #include <cuda/std/type_traits>
51
+
52
+ CUB_NAMESPACE_BEGIN
53
+
54
+ /******************************************************************************
55
+ * Algorithmic variants
56
+ ******************************************************************************/
57
+
58
+ //! @brief BlockScanAlgorithm enumerates alternative algorithms for cub::BlockScan to compute a
59
+ //! parallel prefix scan across a CUDA thread block.
60
enum BlockScanAlgorithm
{
  //! @rst
  //! Overview
  //! ++++++++++++++++++++++++++
  //!
  //! A "raking reduce-then-scan" prefix scan that favors efficiency. It proceeds in five phases:
  //!
  //! #. Upsweep: each thread sequentially reduces its items in registers (only when a thread contributes
  //!    more than one input) and then writes that partial reduction to shared memory.
  //! #. Upsweep: the threads of a single warp rake across segments of the shared partial reductions,
  //!    reducing them sequentially in shared memory.
  //! #. The raking warp performs a warp-synchronous, Kogge-Stone style exclusive scan.
  //! #. Downsweep: the threads of a single warp rake across the segments of shared partial reductions
  //!    again, computing a sequential exclusive scan in shared memory seeded with the warp-scan output.
  //! #. Downsweep: each thread sequentially scans its items in registers (only when a thread contributes
  //!    more than one input), seeded with the raking scan output.
  //!
  //! Performance Considerations
  //! ++++++++++++++++++++++++++
  //!
  //! - Turnaround latency may be longer when the GPU is under-occupied, but overall throughput across
  //!   the GPU is often higher when it is suitably occupied.
  //!
  //! @endrst
  BLOCK_SCAN_RAKING = 0,

  //! @rst
  //! Overview
  //! ++++++++++++++++++++++++++
  //!
  //! A variant of cub::BLOCK_SCAN_RAKING that trades higher register pressure for fewer shared memory
  //! reads. Raking threads keep their "upsweep" segment of values in registers during the
  //! warp-synchronous scan, so the "downsweep" does not have to re-read them from shared memory.
  //!
  //! @endrst
  BLOCK_SCAN_RAKING_MEMOIZE = 1,

  //! @rst
  //! Overview
  //! ++++++++++++++++++++++++++
  //!
  //! A "tiled warpscans" prefix scan that favors speed. It proceeds in four phases:
  //!
  //! #. Upsweep: each thread sequentially reduces its items in registers (only when a thread contributes
  //!    more than one input) and then writes that partial reduction to shared memory.
  //! #. Every warp computes a shallow, but inefficient, warp-synchronous Kogge-Stone style scan.
  //! #. Propagation: the warp-scan outputs in each warp are updated with the aggregate from each
  //!    preceding warp.
  //! #. Downsweep: each thread sequentially scans its items in registers (only when a thread contributes
  //!    more than one input), seeded with the scan output of the preceding phases.
  //!
  //! Performance Considerations
  //! ++++++++++++++++++++++++++
  //!
  //! - Overall throughput across the GPU may be lower due to the heavy reliance on inefficient
  //!   warpscans, but turnaround latency is often lower when the GPU is under-occupied.
  //!
  //! @endrst
  BLOCK_SCAN_WARP_SCANS = 2,
};
124
+
125
+ //! @rst
126
+ //! The BlockScan class provides :ref:`collective <collective-primitives>` methods for computing a parallel prefix
127
+ //! sum/scan of items partitioned across a CUDA thread block.
128
+ //!
129
+ //! Overview
130
+ //! +++++++++++++++++++++++++++++++++++++++++++++
131
+ //!
132
+ //! - Given a list of input elements and a binary reduction operator, a
133
+ //! `prefix scan <http://en.wikipedia.org/wiki/Prefix_sum>`_ produces an output list where each element is computed
134
+ //! to be the reduction of the elements occurring earlier in the input list. *Prefix sum* connotes a prefix scan with
135
+ //! the addition operator. The term *inclusive* indicates that the *i*\ :sup:`th` output reduction incorporates
136
+ //! the *i*\ :sup:`th` input. The term *exclusive* indicates the *i*\ :sup:`th` input is not incorporated into
137
+ //! the *i*\ :sup:`th` output reduction.
138
+ //! - @rowmajor
139
+ //! - BlockScan can be optionally specialized by algorithm to accommodate different workload profiles:
140
+ //!
141
+ //! #. :cpp:enumerator:`cub::BLOCK_SCAN_RAKING`:
142
+ //! An efficient (high throughput) "raking reduce-then-scan" prefix scan algorithm.
143
+ //! #. :cpp:enumerator:`cub::BLOCK_SCAN_RAKING_MEMOIZE`:
144
+ //! Similar to cub::BLOCK_SCAN_RAKING, but having higher throughput at the expense of additional
145
+ //! register pressure for intermediate storage.
146
+ //! #. :cpp:enumerator:`cub::BLOCK_SCAN_WARP_SCANS`:
147
+ //! A quick (low latency) "tiled warpscans" prefix scan algorithm.
148
+ //!
149
+ //! Performance Considerations
150
+ //! +++++++++++++++++++++++++++++++++++++++++++++
151
+ //!
152
+ //! - @granularity
153
+ //! - Uses special instructions when applicable (e.g., warp ``SHFL``)
154
+ //! - Uses synchronization-free communication between warp lanes when applicable
155
+ //! - Invokes a minimal number of block-wide synchronization barriers (only
156
+ //! one or two depending on algorithm selection)
157
+ //! - Incurs zero bank conflicts for most types
158
+ //! - Computation is slightly more efficient (i.e., having lower instruction overhead) for:
159
+ //!
160
+ //! - Prefix sum variants (vs. generic scan)
161
+ //! - @blocksize
162
+ //!
163
+ //! - See cub::BlockScanAlgorithm for performance details regarding algorithmic alternatives
164
+ //!
165
+ //! A Simple Example
166
+ //! +++++++++++++++++++++++++++++++++++++++++++++
167
+ //!
168
+ //! @blockcollective{BlockScan}
169
+ //!
170
+ //! The code snippet below illustrates an exclusive prefix sum of 512 integer items that
171
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128 threads
172
+ //! where each thread owns 4 consecutive items.
173
+ //!
174
+ //! .. code-block:: c++
175
+ //!
176
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
177
+ //!
178
+ //! __global__ void ExampleKernel(...)
179
+ //! {
180
+ //! // Specialize BlockScan for a 1D block of 128 threads of type int
181
+ //! using BlockScan = cub::BlockScan<int, 128>;
182
+ //!
183
+ //! // Allocate shared memory for BlockScan
184
+ //! __shared__ typename BlockScan::TempStorage temp_storage;
185
+ //!
186
+ //! // Obtain a segment of consecutive items that are blocked across threads
187
+ //! int thread_data[4];
188
+ //! ...
189
+ //!
190
+ //! // Collectively compute the block-wide exclusive prefix sum
191
+ //! BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data);
192
+ //!
193
+ //! Suppose the set of input ``thread_data`` across the block of threads is
194
+ //! ``{[1,1,1,1], [1,1,1,1], ..., [1,1,1,1]}``.
195
+ //! The corresponding output ``thread_data`` in those threads will be
196
+ //! ``{[0,1,2,3], [4,5,6,7], ..., [508,509,510,511]}``.
197
+ //!
198
+ //! Re-using dynamically allocated shared memory
199
+ //! +++++++++++++++++++++++++++++++++++++++++++++
200
+ //!
201
+ //! The ``block/example_block_reduce_dyn_smem.cu`` example illustrates usage of dynamically allocated shared memory with
202
+ //! BlockReduce and how to re-purpose the same memory region.
203
+ //! This example can be easily adapted to the storage required by BlockScan.
204
+ //!
205
+ //! @endrst
206
+ //!
207
+ //! @tparam T
208
+ //! Data type being scanned
209
+ //!
210
+ //! @tparam BLOCK_DIM_X
211
+ //! The thread block length in threads along the X dimension
212
+ //!
213
+ //! @tparam ALGORITHM
214
+ //! **[optional]** cub::BlockScanAlgorithm enumerator specifying the underlying algorithm to use
215
+ //! (default: cub::BLOCK_SCAN_RAKING)
216
+ //!
217
+ //! @tparam BLOCK_DIM_Y
218
+ //! **[optional]** The thread block length in threads along the Y dimension
219
+ //! (default: 1)
220
+ //!
221
+ //! @tparam BLOCK_DIM_Z
222
+ //! **[optional]** The thread block length in threads along the Z dimension (default: 1)
223
+ //!
224
+ template <typename T,
225
+ int BLOCK_DIM_X,
226
+ BlockScanAlgorithm ALGORITHM = BLOCK_SCAN_RAKING,
227
+ int BLOCK_DIM_Y = 1,
228
+ int BLOCK_DIM_Z = 1>
229
+ class BlockScan
230
+ {
231
+ private:
232
+ /// Constants
233
+ enum
234
+ {
235
+ /// The thread block size in threads
236
+ BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
237
+ };
238
+
239
+ /**
240
+ * Ensure the template parameterization meets the requirements of the
241
+ * specified algorithm. Currently, the BLOCK_SCAN_WARP_SCANS policy
242
+ * cannot be used with thread block sizes not a multiple of the
243
+ * architectural warp size.
244
+ */
245
+ static constexpr BlockScanAlgorithm SAFE_ALGORITHM =
246
+ ((ALGORITHM == BLOCK_SCAN_WARP_SCANS) && (BLOCK_THREADS % detail::warp_threads != 0))
247
+ ? BLOCK_SCAN_RAKING
248
+ : ALGORITHM;
249
+
250
+ using WarpScans = detail::BlockScanWarpScans<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z>;
251
+ using Raking =
252
+ detail::BlockScanRaking<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, (SAFE_ALGORITHM == BLOCK_SCAN_RAKING_MEMOIZE)>;
253
+
254
+ /// Define the delegate type for the desired algorithm
255
+ using InternalBlockScan = ::cuda::std::_If<SAFE_ALGORITHM == BLOCK_SCAN_WARP_SCANS, WarpScans, Raking>;
256
+
257
+ /// Shared memory storage layout type for BlockScan
258
+ using _TempStorage = typename InternalBlockScan::TempStorage;
259
+
260
+ /// Shared storage reference
261
+ _TempStorage& temp_storage;
262
+
263
+ /// Linear thread-id
264
+ unsigned int linear_tid;
265
+
266
+ /// Internal storage allocator
267
+ _CCCL_DEVICE _CCCL_FORCEINLINE _TempStorage& PrivateStorage()
268
+ {
269
+ __shared__ _TempStorage private_storage;
270
+ return private_storage;
271
+ }
272
+
273
+ public:
274
+ /// @smemstorage{BlockScan}
275
+ struct TempStorage : Uninitialized<_TempStorage> // public wrapper over the private _TempStorage layout; the storage-taking constructor unwraps it via Alias()
275
+ {}; // declares no members of its own
277
+
278
+ //! @name Collective constructors
279
+ //! @{
280
+
281
+ //! @brief Collective constructor using a private static allocation of shared memory as temporary storage.
282
+ _CCCL_DEVICE _CCCL_FORCEINLINE BlockScan()
283
+ : temp_storage(PrivateStorage()) // bind the reference to the function-local __shared__ _TempStorage returned by PrivateStorage()
284
+ , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) // cache the linear thread-id computed by RowMajorTid from the compile-time block dimensions
285
+ {}
286
+
287
+ /**
288
+ * @brief Collective constructor using the specified memory allocation as temporary storage.
289
+ *
290
+ * @param[in] temp_storage
291
+ * Reference to memory allocation having layout type TempStorage
292
+ */
293
+ _CCCL_DEVICE _CCCL_FORCEINLINE BlockScan(TempStorage& temp_storage)
294
+ : temp_storage(temp_storage.Alias()) // the parameter shadows the member here: Alias() unwraps the caller's TempStorage into the internal _TempStorage reference
295
+ , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) // cache the linear thread-id computed by RowMajorTid from the compile-time block dimensions
296
+ {}
297
+
298
+ //! @} end member group
299
+ //! @name Exclusive prefix sum operations
300
+ //! @{
301
+
302
+ //! @rst
303
+ //! Computes an exclusive block-wide prefix scan using addition (+) as the scan operator.
304
+ //! Each thread contributes one input element. The value of 0 is applied as the initial value, and is assigned
305
+ //! to ``output`` in *thread*\ :sub:`0`.
306
+ //!
307
+ //! - @identityzero
308
+ //! - @rowmajor
309
+ //! - @smemreuse
310
+ //!
311
+ //! Snippet
312
+ //! +++++++
313
+ //!
314
+ //! The code snippet below illustrates an exclusive prefix sum of 128 integer items that
315
+ //! are partitioned across 128 threads.
316
+ //!
317
+ //! .. code-block:: c++
318
+ //!
319
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
320
+ //!
321
+ //! __global__ void ExampleKernel(...)
322
+ //! {
323
+ //! // Specialize BlockScan for a 1D block of 128 threads of type int
324
+ //! using BlockScan = cub::BlockScan<int, 128>;
325
+ //!
326
+ //! // Allocate shared memory for BlockScan
327
+ //! __shared__ typename BlockScan::TempStorage temp_storage;
328
+ //!
329
+ //! // Obtain input item for each thread
330
+ //! int thread_data;
331
+ //! ...
332
+ //!
333
+ //! // Collectively compute the block-wide exclusive prefix sum
334
+ //! BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data);
335
+ //!
336
+ //! Suppose the set of input ``thread_data`` across the block of threads is ``1, 1, ..., 1``.
337
+ //! The corresponding output ``thread_data`` in those threads will be ``0, 1, ..., 127``.
338
+ //!
339
+ //! @endrst
340
+ //!
341
+ //! @param[in] input
342
+ //! Calling thread's input item
343
+ //!
344
+ //! @param[out] output
345
+ //! Calling thread's output item (may be aliased to `input`)
346
+ _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveSum(T input, T& output)
347
+ {
348
+ T initial_value{};
349
+
350
+ ExclusiveScan(input, output, initial_value, ::cuda::std::plus<>{});
351
+ }
352
+
353
+ //! @rst
354
+ //! Computes an exclusive block-wide prefix scan using addition (+) as the scan operator.
355
+ //! Each thread contributes one input element.
356
+ //! The value of 0 is applied as the initial value, and is assigned to ``output`` in *thread*\ :sub:`0`.
357
+ //! Also provides every thread with the block-wide ``block_aggregate`` of all inputs.
358
+ //!
359
+ //! - @identityzero
360
+ //! - @rowmajor
361
+ //! - @smemreuse
362
+ //!
363
+ //! Snippet
364
+ //! +++++++
365
+ //!
366
+ //! The code snippet below illustrates an exclusive prefix sum of 128 integer items that
367
+ //! are partitioned across 128 threads.
368
+ //!
369
+ //! .. code-block:: c++
370
+ //!
371
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
372
+ //!
373
+ //! __global__ void ExampleKernel(...)
374
+ //! {
375
+ //! // Specialize BlockScan for a 1D block of 128 threads of type int
376
+ //! using BlockScan = cub::BlockScan<int, 128>;
377
+ //!
378
+ //! // Allocate shared memory for BlockScan
379
+ //! __shared__ typename BlockScan::TempStorage temp_storage;
380
+ //!
381
+ //! // Obtain input item for each thread
382
+ //! int thread_data;
383
+ //! ...
384
+ //!
385
+ //! // Collectively compute the block-wide exclusive prefix sum
386
+ //! int block_aggregate;
387
+ //! BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data, block_aggregate);
388
+ //!
389
+ //! Suppose the set of input ``thread_data`` across the block of threads is ``1, 1, ..., 1``.
390
+ //! The corresponding output ``thread_data`` in those threads will be ``0, 1, ..., 127``.
391
+ //! Furthermore the value ``128`` will be stored in ``block_aggregate`` for all threads.
392
+ //!
393
+ //! @endrst
394
+ //!
395
+ //! @param[in] input
396
+ //! Calling thread's input item
397
+ //!
398
+ //! @param[out] output
399
+ //! Calling thread's output item (may be aliased to `input`)
400
+ //!
401
+ //! @param[out] block_aggregate
402
+ //! block-wide aggregate reduction of input items
403
+ _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveSum(T input, T& output, T& block_aggregate)
404
+ {
405
+ T initial_value{};
406
+
407
+ ExclusiveScan(input, output, initial_value, ::cuda::std::plus<>{}, block_aggregate);
408
+ }
409
+
410
+ //! @rst
411
+ //! Computes an exclusive block-wide prefix scan using addition (+) as the scan operator.
412
+ //! Each thread contributes one input element. Instead of using 0 as the block-wide prefix, the call-back functor
413
+ //! ``block_prefix_callback_op`` is invoked by the first warp in the block, and the value returned by
414
+ //! *lane*\ :sub:`0` in that warp is used as the "seed" value that logically prefixes the thread block's
415
+ //! scan inputs.
416
+ //!
417
+ //! - @identityzero
418
+ //! - The ``block_prefix_callback_op`` functor must implement a member function
419
+ //! ``T operator()(T block_aggregate)``. The functor will be invoked by the first warp of threads in the block,
420
+ //! however only the return value from *lane*\ :sub:`0` is applied as the block-wide prefix. Can be stateful.
421
+ //! - @rowmajor
422
+ //! - @smemreuse
423
+ //!
424
+ //! Snippet
425
+ //! +++++++
426
+ //!
427
+ //! The code snippet below illustrates a single thread block that progressively
428
+ //! computes an exclusive prefix sum over multiple "tiles" of input using a
429
+ //! prefix functor to maintain a running total between block-wide scans. Each tile consists
430
+ //! of 128 integer items that are partitioned across 128 threads.
431
+ //!
432
+ //! .. code-block:: c++
433
+ //!
434
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
435
+ //!
436
+ //! // A stateful callback functor that maintains a running prefix to be applied
437
+ //! // during consecutive scan operations.
438
+ //! struct BlockPrefixCallbackOp
439
+ //! {
440
+ //! // Running prefix
441
+ //! int running_total;
442
+ //!
443
+ //! // Constructor
444
+ //! __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {}
445
+ //!
446
+ //! // Callback operator to be entered by the first warp of threads in the block.
447
+ //! // Thread-0 is responsible for returning a value for seeding the block-wide scan.
448
+ //! __device__ int operator()(int block_aggregate)
449
+ //! {
450
+ //! int old_prefix = running_total;
451
+ //! running_total += block_aggregate;
452
+ //! return old_prefix;
453
+ //! }
454
+ //! };
455
+ //!
456
+ //! __global__ void ExampleKernel(int *d_data, int num_items, ...)
457
+ //! {
458
+ //! // Specialize BlockScan for a 1D block of 128 threads
459
+ //! using BlockScan = cub::BlockScan<int, 128>;
460
+ //!
461
+ //! // Allocate shared memory for BlockScan
462
+ //! __shared__ typename BlockScan::TempStorage temp_storage;
463
+ //!
464
+ //! // Initialize running total
465
+ //! BlockPrefixCallbackOp prefix_op(0);
466
+ //!
467
+ //! // Have the block iterate over segments of items
468
+ //! for (int block_offset = 0; block_offset < num_items; block_offset += 128)
469
+ //! {
470
+ //! // Load a segment of consecutive items that are blocked across threads
471
+ //! int thread_data = d_data[block_offset + threadIdx.x];
472
+ //!
473
+ //! // Collectively compute the block-wide exclusive prefix sum
474
+ //! BlockScan(temp_storage).ExclusiveSum(
475
+ //! thread_data, thread_data, prefix_op);
476
+ //! __syncthreads();
477
+ //!
478
+ //! // Store scanned items to output segment
479
+ //! d_data[block_offset + threadIdx.x] = thread_data;
480
+ //! }
481
+ //!
482
+ //! Suppose the input ``d_data`` is ``1, 1, 1, 1, 1, 1, 1, 1, ...``.
483
+ //! The corresponding output for the first segment will be ``0, 1, ..., 127``.
484
+ //! The output for the second segment will be ``128, 129, ..., 255``.
485
+ //!
486
+ //! @endrst
487
+ //!
488
+ //! @tparam BlockPrefixCallbackOp
489
+ //! **[inferred]** Call-back functor type having member `T operator()(T block_aggregate)`
490
+ //!
491
+ //! @param[in] input
492
+ //! Calling thread's input item
493
+ //!
494
+ //! @param[out] output
495
+ //! Calling thread's output item (may be aliased to `input`)
496
+ //!
497
+ //! @param[in,out] block_prefix_callback_op
498
+ //! @rst
499
+ //! *warp*\ :sub:`0` only call-back functor for specifying a block-wide prefix to be applied to
500
+ //! the logical input sequence.
501
+ //! @endrst
502
+ template <typename BlockPrefixCallbackOp>
503
+ _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveSum(T input, T& output, BlockPrefixCallbackOp& block_prefix_callback_op)
504
+ {
505
+ ExclusiveScan(input, output, ::cuda::std::plus<>{}, block_prefix_callback_op);
506
+ }
507
+
508
+ //! @} end member group
509
+ //! @name Exclusive prefix sum operations (multiple data per thread)
510
+ //! @{
511
+
512
+ //! @rst
513
+ //! Computes an exclusive block-wide prefix scan using addition (+) as the scan operator.
514
+ //! Each thread contributes an array of consecutive input elements.
515
+ //! The value of 0 is applied as the initial value, and is assigned to ``output[0]`` in *thread*\ :sub:`0`.
516
+ //!
517
+ //! - @identityzero
518
+ //! - @blocked
519
+ //! - @granularity
520
+ //! - @smemreuse
521
+ //!
522
+ //! Snippet
523
+ //! +++++++
524
+ //!
525
+ //! The code snippet below illustrates an exclusive prefix sum of 512 integer items that
526
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128 threads
527
+ //! where each thread owns 4 consecutive items.
528
+ //!
529
+ //! .. code-block:: c++
530
+ //!
531
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
532
+ //!
533
+ //! __global__ void ExampleKernel(...)
534
+ //! {
535
+ //! // Specialize BlockScan for a 1D block of 128 threads of type int
536
+ //! using BlockScan = cub::BlockScan<int, 128>;
537
+ //!
538
+ //! // Allocate shared memory for BlockScan
539
+ //! __shared__ typename BlockScan::TempStorage temp_storage;
540
+ //!
541
+ //! // Obtain a segment of consecutive items that are blocked across threads
542
+ //! int thread_data[4];
543
+ //! ...
544
+ //!
545
+ //! // Collectively compute the block-wide exclusive prefix sum
546
+ //! BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data);
547
+ //!
548
+ //! Suppose the set of input ``thread_data`` across the block of threads is
549
+ //! ``{ [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }``.
550
+ //! The corresponding output ``thread_data`` in those threads will be
551
+ //! ``{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }``.
552
+ //!
553
+ //! @endrst
554
+ //!
555
+ //! @tparam ITEMS_PER_THREAD
556
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
557
+ //!
558
+ //! @param[in] input
559
+ //! Calling thread's input items
560
+ //!
561
+ //! @param[out] output
562
+ //! Calling thread's output items (may be aliased to `input`)
563
+ template <int ITEMS_PER_THREAD>
564
+ _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveSum(T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD])
565
+ {
566
+ T initial_value{};
567
+
568
+ ExclusiveScan(input, output, initial_value, ::cuda::std::plus<>{});
569
+ }
570
+
571
+ //! @rst
572
+ //! Computes an exclusive block-wide prefix scan using addition (+) as the scan operator.
573
+ //! Each thread contributes an array of consecutive input elements.
574
+ //! The value of 0 is applied as the initial value, and is assigned to ``output[0]`` in *thread*\ :sub:`0`.
575
+ //! Also provides every thread with the block-wide ``block_aggregate`` of all inputs.
576
+ //!
577
+ //! - @identityzero
578
+ //! - @blocked
579
+ //! - @granularity
580
+ //! - @smemreuse
581
+ //!
582
+ //! Snippet
583
+ //! +++++++
584
+ //!
585
+ //! The code snippet below illustrates an exclusive prefix sum of 512 integer items that are partitioned in
586
+ //! a :ref:`blocked arrangement <flexible-data-arrangement>` across 128 threads where each thread owns
587
+ //! 4 consecutive items.
588
+ //!
589
+ //! .. code-block:: c++
590
+ //!
591
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
592
+ //!
593
+ //! __global__ void ExampleKernel(...)
594
+ //! {
595
+ //! // Specialize BlockScan for a 1D block of 128 threads of type int
596
+ //! using BlockScan = cub::BlockScan<int, 128>;
597
+ //!
598
+ //! // Allocate shared memory for BlockScan
599
+ //! __shared__ typename BlockScan::TempStorage temp_storage;
600
+ //!
601
+ //! // Obtain a segment of consecutive items that are blocked across threads
602
+ //! int thread_data[4];
603
+ //! ...
604
+ //!
605
+ //! // Collectively compute the block-wide exclusive prefix sum
606
+ //! int block_aggregate;
607
+ //! BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data, block_aggregate);
608
+ //!
609
+ //! Suppose the set of input ``thread_data`` across the block of threads is
610
+ //! ``{ [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }``.
611
+ //! The corresponding output ``thread_data`` in those threads will be
612
+ //! ``{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }``.
613
+ //! Furthermore the value ``512`` will be stored in ``block_aggregate`` for all threads.
614
+ //!
615
+ //! @endrst
616
+ //!
617
+ //! @tparam ITEMS_PER_THREAD
618
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
619
+ //!
620
+ //! @param[in] input
621
+ //! Calling thread's input items
622
+ //!
623
+ //! @param[out] output
624
+ //! Calling thread's output items (may be aliased to `input`)
625
+ //!
626
+ //! @param[out] block_aggregate
627
+ //! block-wide aggregate reduction of input items
628
+ template <int ITEMS_PER_THREAD>
629
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
630
+ ExclusiveSum(T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], T& block_aggregate)
631
+ {
632
+ // Reduce consecutive thread items in registers
633
+ T initial_value{};
634
+
635
+ ExclusiveScan(input, output, initial_value, ::cuda::std::plus<>{}, block_aggregate);
636
+ }
637
+
638
+ //! @rst
639
+ //! Computes an exclusive block-wide prefix scan using addition (+) as the scan operator.
640
+ //! Each thread contributes an array of consecutive input elements.
641
+ //! Instead of using 0 as the block-wide prefix, the call-back functor ``block_prefix_callback_op`` is invoked by
642
+ //! the first warp in the block, and the value returned by *lane*\ :sub:`0` in that warp is used as the "seed"
643
+ //! value that logically prefixes the thread block's scan inputs.
644
+ //!
645
+ //! - @identityzero
646
+ //! - The ``block_prefix_callback_op`` functor must implement a member function ``T operator()(T block_aggregate)``.
647
+ //! The functor will be invoked by the first warp of threads in the block, however only the return value from
648
+ //! *lane*\ :sub:`0` is applied as the block-wide prefix. Can be stateful.
649
+ //! - @blocked
650
+ //! - @granularity
651
+ //! - @smemreuse
652
+ //!
653
+ //!
654
+ //! Snippet
655
+ //! +++++++
656
+ //!
657
+ //! The code snippet below illustrates a single thread block that progressively
658
+ //! computes an exclusive prefix sum over multiple "tiles" of input using a
659
+ //! prefix functor to maintain a running total between block-wide scans. Each tile consists
660
+ //! of 512 integer items that are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>`
661
+ //! across 128 threads where each thread owns 4 consecutive items.
662
+ //!
663
+ //! .. code-block:: c++
664
+ //!
665
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
666
+ //!
667
+ //! // A stateful callback functor that maintains a running prefix to be applied
668
+ //! // during consecutive scan operations.
669
+ //! struct BlockPrefixCallbackOp
670
+ //! {
671
+ //! // Running prefix
672
+ //! int running_total;
673
+ //!
674
+ //! // Constructor
675
+ //! __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {}
676
+ //!
677
+ //! // Callback operator to be entered by the first warp of threads in the block.
678
+ //! // Thread-0 is responsible for returning a value for seeding the block-wide scan.
679
+ //! __device__ int operator()(int block_aggregate)
680
+ //! {
681
+ //! int old_prefix = running_total;
682
+ //! running_total += block_aggregate;
683
+ //! return old_prefix;
684
+ //! }
685
+ //! };
686
+ //!
687
+ //! __global__ void ExampleKernel(int *d_data, int num_items, ...)
688
+ //! {
689
+ //! // Specialize BlockLoad, BlockStore, and BlockScan for a 1D block of 128 threads, 4 ints per thread
690
+ //! using BlockLoad = cub::BlockLoad<int, 128, 4, cub::BLOCK_LOAD_TRANSPOSE>;
691
+ //! using BlockStore = cub::BlockStore<int, 128, 4, cub::BLOCK_STORE_TRANSPOSE>;
692
+ //! using BlockScan = cub::BlockScan<int, 128>;
693
+ //!
694
+ //! // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan
695
+ //! __shared__ union {
696
+ //! typename BlockLoad::TempStorage load;
697
+ //! typename BlockScan::TempStorage scan;
698
+ //! typename BlockStore::TempStorage store;
699
+ //! } temp_storage;
700
+ //!
701
+ //! // Initialize running total
702
+ //! BlockPrefixCallbackOp prefix_op(0);
703
+ //!
704
+ //! // Have the block iterate over segments of items
705
+ //! for (int block_offset = 0; block_offset < num_items; block_offset += 128 * 4)
706
+ //! {
707
+ //! // Load a segment of consecutive items that are blocked across threads
708
+ //! int thread_data[4];
709
+ //! BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data);
710
+ //! __syncthreads();
711
+ //!
712
+ //! // Collectively compute the block-wide exclusive prefix sum
713
+ //! int block_aggregate;
714
+ //! BlockScan(temp_storage.scan).ExclusiveSum(
715
+ //! thread_data, thread_data, prefix_op);
716
+ //! __syncthreads();
717
+ //!
718
+ //! // Store scanned items to output segment
719
+ //! BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data);
720
+ //! __syncthreads();
721
+ //! }
722
+ //!
723
+ //! Suppose the input ``d_data`` is ``1, 1, 1, 1, 1, 1, 1, 1, ...``.
724
+ //! The corresponding output for the first segment will be ``0, 1, 2, 3, ..., 510, 511``.
725
+ //! The output for the second segment will be ``512, 513, 514, 515, ..., 1022, 1023``.
726
+ //!
727
+ //! @endrst
728
+ //!
729
+ //! @tparam ITEMS_PER_THREAD
730
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
731
+ //!
732
+ //! @tparam BlockPrefixCallbackOp
733
+ //! **[inferred]** Call-back functor type having member
734
+ //! `T operator()(T block_aggregate)`
735
+ //!
736
+ //! @param[in] input
737
+ //! Calling thread's input items
738
+ //!
739
+ //! @param[out] output
740
+ //! Calling thread's output items (may be aliased to `input`)
741
+ //!
742
+ //! @param[in,out] block_prefix_callback_op
743
+ //! @rst
744
+ //! *warp*\ :sub:`0` only call-back functor for specifying a block-wide prefix to be applied to
745
+ //! the logical input sequence.
746
+ //! @endrst
747
+ template <int ITEMS_PER_THREAD, typename BlockPrefixCallbackOp>
748
+ _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveSum(
749
+ T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], BlockPrefixCallbackOp& block_prefix_callback_op)
750
+ {
751
+ ExclusiveScan(input, output, ::cuda::std::plus<>{}, block_prefix_callback_op);
752
+ }
753
+
754
+ //! @} end member group // Exclusive prefix sums (multiple data per thread)
755
+ //! @name Exclusive prefix scan operations
756
+ //! @{
757
+
758
+ //! @rst
759
+ //! Computes an exclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
760
+ //! Each thread contributes one input element.
761
+ //!
762
+ //! - Supports non-commutative scan operators.
763
+ //! - @rowmajor
764
+ //! - @smemreuse
765
+ //!
766
+ //! Snippet
767
+ //! +++++++
768
+ //!
769
+ //! The code snippet below illustrates an exclusive prefix max scan of 128 integer items that
770
+ //! are partitioned across 128 threads.
771
+ //!
772
+ //! .. code-block:: c++
773
+ //!
774
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
775
+ //!
776
+ //! __global__ void ExampleKernel(...)
777
+ //! {
778
+ //! // Specialize BlockScan for a 1D block of 128 threads of type int
779
+ //! using BlockScan = cub::BlockScan<int, 128>;
780
+ //!
781
+ //! // Allocate shared memory for BlockScan
782
+ //! __shared__ typename BlockScan::TempStorage temp_storage;
783
+ //!
784
+ //! // Obtain input item for each thread
785
+ //! int thread_data;
786
+ //! ...
787
+ //!
788
+ //! // Collectively compute the block-wide exclusive prefix max scan
789
+ //! BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cuda::maximum<>{});
790
+ //!
791
+ //! Suppose the set of input ``thread_data`` across the block of threads is ``0, -1, 2, -3, ..., 126, -127``.
792
+ //! The corresponding output ``thread_data`` in those threads will be ``INT_MIN, 0, 0, 2, ..., 124, 126``.
793
+ //!
794
+ //! @endrst
795
+ //!
796
+ //! @tparam ScanOp
797
+ //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
798
+ //!
799
+ //! @param[in] input
800
+ //! Calling thread's input item
801
+ //!
802
+ //! @param[out] output
803
+ //! Calling thread's output item (may be aliased to `input`)
804
+ //!
805
+ //! @param[in] initial_value
806
+ //! @rst
807
+ //! Initial value to seed the exclusive scan (and is assigned to `output[0]` in *thread*\ :sub:`0`)
808
+ //! @endrst
809
+ //!
810
+ //! @param[in] scan_op
811
+ //! Binary scan functor
812
+ template <typename ScanOp>
813
+ _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveScan(T input, T& output, T initial_value, ScanOp scan_op)
814
+ {
815
+ InternalBlockScan(temp_storage).ExclusiveScan(input, output, initial_value, scan_op);
816
+ }
817
+
818
+ //! @rst
819
+ //! Computes an exclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
820
+ //! Each thread contributes one input element.
821
+ //! Also provides every thread with the block-wide ``block_aggregate`` of all inputs.
822
+ //!
823
+ //! - Supports non-commutative scan operators.
824
+ //! - @rowmajor
825
+ //! - @smemreuse
826
+ //!
827
+ //! Snippet
828
+ //! +++++++
829
+ //!
830
+ //! The code snippet below illustrates an exclusive prefix max scan of 128 integer items that
831
+ //! are partitioned across 128 threads.
832
+ //!
833
+ //! .. code-block:: c++
834
+ //!
835
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
836
+ //!
837
+ //! __global__ void ExampleKernel(...)
838
+ //! {
839
+ //! // Specialize BlockScan for a 1D block of 128 threads of type int
840
+ //! using BlockScan = cub::BlockScan<int, 128>;
841
+ //!
842
+ //! // Allocate shared memory for BlockScan
843
+ //! __shared__ typename BlockScan::TempStorage temp_storage;
844
+ //!
845
+ //! // Obtain input item for each thread
846
+ //! int thread_data;
847
+ //! ...
848
+ //!
849
+ //! // Collectively compute the block-wide exclusive prefix max scan
850
+ //! int block_aggregate;
851
+ //! BlockScan(temp_storage).ExclusiveScan(
852
+ //! thread_data, thread_data, INT_MIN, cuda::maximum<>{}, block_aggregate);
853
+ //!
854
+ //! Suppose the set of input ``thread_data`` across the block of threads is ``0, -1, 2, -3, ..., 126, -127``.
855
+ //! The corresponding output ``thread_data`` in those threads will be ``INT_MIN, 0, 0, 2, ..., 124, 126``.
856
+ //! Furthermore the value ``126`` will be stored in ``block_aggregate`` for all threads.
857
+ //!
858
+ //! .. note::
859
+ //!
860
+ //! ``initial_value`` is not applied to the block-wide aggregate.
861
+ //!
862
+ //! @endrst
863
+ //!
864
+ //! @tparam ScanOp
865
+ //! **[inferred]** Binary scan functor type having member ``T operator()(const T &a, const T &b)``
866
+ //!
867
+ //! @param[in] input
868
+ //! Calling thread's input item
869
+ //!
870
+ //! @param[out] output
871
+ //! Calling thread's output item (may be aliased to ``input``)
872
+ //!
873
+ //! @param[in] initial_value
874
+ //! @rst
875
+ //! Initial value to seed the exclusive scan (and is assigned to ``output[0]`` in *thread*\ :sub:`0`). It is not
876
+ //! taken into account for ``block_aggregate``.
877
+ //!
878
+ //! @endrst
879
+ //!
880
+ //! @param[in] scan_op
881
+ //! Binary scan functor
882
+ //!
883
+ //! @param[out] block_aggregate
884
+ //! block-wide aggregate reduction of input items
885
+ template <typename ScanOp>
886
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
887
+ ExclusiveScan(T input, T& output, T initial_value, ScanOp scan_op, T& block_aggregate)
888
+ {
889
+ InternalBlockScan(temp_storage).ExclusiveScan(input, output, initial_value, scan_op, block_aggregate);
890
+ }
891
+
892
+ //! @rst
893
+ //! Computes an exclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
894
+ //! Each thread contributes one input element. The call-back functor ``block_prefix_callback_op`` is invoked by
895
+ //! the first warp in the block, and the value returned by *lane*\ :sub:`0` in that warp is used as
896
+ //! the "seed" value that logically prefixes the thread block's scan inputs.
897
+ //!
898
+ //! - The ``block_prefix_callback_op`` functor must implement a member function ``T operator()(T block_aggregate)``.
899
+ //! The functor will be invoked by the first warp of threads in the block, however only the return value from
900
+ //! *lane*\ :sub:`0` is applied as the block-wide prefix. Can be stateful.
901
+ //! - Supports non-commutative scan operators.
902
+ //! - @rowmajor
903
+ //! - @smemreuse
904
+ //!
905
+ //! Snippet
906
+ //! +++++++
907
+ //!
908
+ //! The code snippet below illustrates a single thread block that progressively
909
+ //! computes an exclusive prefix max scan over multiple "tiles" of input using a
910
+ //! prefix functor to maintain a running total between block-wide scans.
911
+ //! Each tile consists of 128 integer items that are partitioned across 128 threads.
912
+ //!
913
+ //! .. code-block:: c++
914
+ //!
915
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
916
+ //!
917
+ //! // A stateful callback functor that maintains a running prefix to be applied
918
+ //! // during consecutive scan operations.
919
+ //! struct BlockPrefixCallbackOp
920
+ //! {
921
+ //! // Running prefix
922
+ //! int running_total;
923
+ //!
924
+ //! // Constructor
925
+ //! __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {}
926
+ //!
927
+ //! // Callback operator to be entered by the first warp of threads in the block.
928
+ //! // Thread-0 is responsible for returning a value for seeding the block-wide scan.
929
+ //! __device__ int operator()(int block_aggregate)
930
+ //! {
931
+ //! int old_prefix = running_total;
932
+ //! running_total = (block_aggregate > old_prefix) ? block_aggregate : old_prefix;
933
+ //! return old_prefix;
934
+ //! }
935
+ //! };
936
+ //!
937
+ //! __global__ void ExampleKernel(int *d_data, int num_items, ...)
938
+ //! {
939
+ //! // Specialize BlockScan for a 1D block of 128 threads
940
+ //! using BlockScan = cub::BlockScan<int, 128>;
941
+ //!
942
+ //! // Allocate shared memory for BlockScan
943
+ //! __shared__ typename BlockScan::TempStorage temp_storage;
944
+ //!
945
+ //! // Initialize running total
946
+ //! BlockPrefixCallbackOp prefix_op(INT_MIN);
947
+ //!
948
+ //! // Have the block iterate over segments of items
949
+ //! for (int block_offset = 0; block_offset < num_items; block_offset += 128)
950
+ //! {
951
+ //! // Load a segment of consecutive items that are blocked across threads
952
+ //! int thread_data = d_data[block_offset + threadIdx.x];
953
+ //!
954
+ //! // Collectively compute the block-wide exclusive prefix max scan
955
+ //! BlockScan(temp_storage).ExclusiveScan(
956
+ //! thread_data, thread_data, cuda::maximum<>{}, prefix_op);
957
+ //! __syncthreads();
958
+ //!
959
+ //! // Store scanned items to output segment
960
+ //! d_data[block_offset + threadIdx.x] = thread_data;
961
+ //! }
962
+ //!
963
+ //! Suppose the input ``d_data`` is ``0, -1, 2, -3, 4, -5, ...``.
964
+ //! The corresponding output for the first segment will be ``INT_MIN, 0, 0, 2, ..., 124, 126``.
965
+ //! The output for the second segment will be ``126, 128, 128, 130, ..., 252, 254``.
966
+ //!
967
+ //! @endrst
968
+ //!
969
+ //! @tparam ScanOp
970
+ //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
971
+ //!
972
+ //! @tparam BlockPrefixCallbackOp
973
+ //! **[inferred]** Call-back functor type having member `T operator()(T block_aggregate)`
974
+ //!
975
+ //! @param[in] input
976
+ //! Calling thread's input item
977
+ //!
978
+ //! @param[out] output
979
+ //! Calling thread's output item (may be aliased to `input`)
980
+ //!
981
+ //! @param[in] scan_op
982
+ //! Binary scan functor
983
+ //!
984
+ //! @param[in,out] block_prefix_callback_op
985
+ //! @rst
986
+ //! *warp*\ :sub:`0` only call-back functor for specifying a block-wide prefix to be applied to
987
+ //! the logical input sequence.
988
+ //! @endrst
989
+ template <typename ScanOp, typename BlockPrefixCallbackOp>
990
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
991
+ ExclusiveScan(T input, T& output, ScanOp scan_op, BlockPrefixCallbackOp& block_prefix_callback_op)
992
+ {
993
+ InternalBlockScan(temp_storage).ExclusiveScan(input, output, scan_op, block_prefix_callback_op);
994
+ }
995
+
996
+ //! @} end member group // Exclusive prefix scans
997
+ //! @name Exclusive prefix scan operations (multiple data per thread)
998
+ //! @{
999
+
1000
+ //! @rst
1001
+ //! Computes an exclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
1002
+ //! Each thread contributes an array of consecutive input elements.
1003
+ //!
1004
+ //! - Supports non-commutative scan operators.
1005
+ //! - @blocked
1006
+ //! - @granularity
1007
+ //! - @smemreuse
1008
+ //!
1009
+ //! Snippet
1010
+ //! +++++++
1011
+ //!
1012
+ //! The code snippet below illustrates an exclusive prefix max scan of 512 integer
1013
+ //! items that are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>`
1014
+ //! across 128 threads where each thread owns 4 consecutive items.
1015
+ //!
1016
+ //! .. code-block:: c++
1017
+ //!
1018
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
1019
+ //!
1020
+ //! __global__ void ExampleKernel(...)
1021
+ //! {
1022
+ //! // Specialize BlockScan for a 1D block of 128 threads of type int
1023
+ //! using BlockScan = cub::BlockScan<int, 128>;
1024
+ //!
1025
+ //! // Allocate shared memory for BlockScan
1026
+ //! __shared__ typename BlockScan::TempStorage temp_storage;
1027
+ //!
1028
+ //! // Obtain a segment of consecutive items that are blocked across threads
1029
+ //! int thread_data[4];
1030
+ //! ...
1031
+ //!
1032
+ //! // Collectively compute the block-wide exclusive prefix max scan
1033
+ //! BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cuda::maximum<>{});
1034
+ //!
1035
+ //! Suppose the set of input ``thread_data`` across the block of threads is
1036
+ //! ``{ [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }``.
1037
+ //! The corresponding output ``thread_data`` in those threads will be
1038
+ //! ``{ [INT_MIN,0,0,2], [2,4,4,6], ..., [506,508,508,510] }``.
1039
+ //!
1040
+ //! @endrst
1041
+ //!
1042
+ //! @tparam ITEMS_PER_THREAD
1043
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
1044
+ //!
1045
+ //! @tparam ScanOp
1046
+ //! **[inferred]** Binary scan functor type having member
1047
+ //! `T operator()(const T &a, const T &b)`
1048
+ //!
1049
+ //! @param[in] input
1050
+ //! Calling thread's input items
1051
+ //!
1052
+ //! @param[out] output
1053
+ //! Calling thread's output items (may be aliased to `input`)
1054
+ //!
1055
+ //! @param[in] initial_value
1056
+ //! @rst
1057
+ //! Initial value to seed the exclusive scan (and is assigned to `output[0]` in *thread*\ :sub:`0`)
1058
+ //! @endrst
1059
+ //!
1060
+ //! @param[in] scan_op
1061
+ //! Binary scan functor
1062
+ template <int ITEMS_PER_THREAD, typename ScanOp>
1063
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
1064
+ ExclusiveScan(T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], T initial_value, ScanOp scan_op)
1065
+ {
1066
+ // Reduce consecutive thread items in registers
1067
+ T thread_prefix = cub::ThreadReduce(input, scan_op);
1068
+
1069
+ // Exclusive thread block-scan
1070
+ ExclusiveScan(thread_prefix, thread_prefix, initial_value, scan_op);
1071
+
1072
+ // Exclusive scan in registers with prefix as seed
1073
+ detail::ThreadScanExclusive(input, output, scan_op, thread_prefix);
1074
+ }
1075
+
1076
+ //! @rst
1077
+ //! Computes an exclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
1078
+ //! Each thread contributes an array of consecutive input elements.
1079
+ //! Also provides every thread with the block-wide ``block_aggregate`` of all inputs.
1080
+ //!
1081
+ //! - Supports non-commutative scan operators.
1082
+ //! - @blocked
1083
+ //! - @granularity
1084
+ //! - @smemreuse
1085
+ //!
1086
+ //! Snippet
1087
+ //! +++++++
1088
+ //!
1089
+ //! The code snippet below illustrates an exclusive prefix max scan of 512 integer items that are partitioned in
1090
+ //! a :ref:`blocked arrangement <flexible-data-arrangement>` across 128 threads where each thread owns
1091
+ //! 4 consecutive items.
1092
+ //!
1093
+ //! .. code-block:: c++
1094
+ //!
1095
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
1096
+ //!
1097
+ //! __global__ void ExampleKernel(...)
1098
+ //! {
1099
+ //! // Specialize BlockScan for a 1D block of 128 threads of type int
1100
+ //! using BlockScan = cub::BlockScan<int, 128>;
1101
+ //!
1102
+ //! // Allocate shared memory for BlockScan
1103
+ //! __shared__ typename BlockScan::TempStorage temp_storage;
1104
+ //!
1105
+ //! // Obtain a segment of consecutive items that are blocked across threads
1106
+ //! int thread_data[4];
1107
+ //! ...
1108
+ //!
1109
+ //! // Collectively compute the block-wide exclusive prefix max scan
1110
+ //! int block_aggregate;
1111
+ //! BlockScan(temp_storage).ExclusiveScan(
1112
+ //! thread_data, thread_data, INT_MIN, cuda::maximum<>{}, block_aggregate);
1113
+ //!
1114
+ //! Suppose the set of input ``thread_data`` across the block of threads is
1115
+ //! ``{ [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }``.
1116
+ //! The corresponding output ``thread_data`` in those threads will be
1117
+ //! ``{ [INT_MIN,0,0,2], [2,4,4,6], ..., [506,508,508,510] }``.
1118
+ //! Furthermore the value ``510`` will be stored in ``block_aggregate`` for all threads.
1119
+ //!
1120
+ //! .. note::
1121
+ //!
1122
+ //! ``initial_value`` is not applied to the block-wide aggregate.
1123
+ //!
1124
+ //! @endrst
1125
+ //!
1126
+ //! @tparam ITEMS_PER_THREAD
1127
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
1128
+ //!
1129
+ //! @tparam ScanOp
1130
+ //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
1131
+ //!
1132
+ //! @param[in] input
1133
+ //! Calling thread's input items
1134
+ //!
1135
+ //! @param[out] output
1136
+ //! Calling thread's output items (may be aliased to `input`)
1137
+ //!
1138
+ //! @param[in] initial_value
1139
+ //! @rst
1140
+ //! Initial value to seed the exclusive scan (and is assigned to `output[0]` in *thread*\ :sub:`0`). It is not taken
1141
+ //! into account for ``block_aggregate``.
1142
+ //! @endrst
1143
+ //!
1144
+ //! @param[in] scan_op
1145
+ //! Binary scan functor
1146
+ //!
1147
+ //! @param[out] block_aggregate
1148
+ //! block-wide aggregate reduction of input items
1149
+ template <int ITEMS_PER_THREAD, typename ScanOp>
1150
+ _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveScan(
1151
+ T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], T initial_value, ScanOp scan_op, T& block_aggregate)
1152
+ {
1153
+ // Reduce consecutive thread items in registers
1154
+ T thread_prefix = cub::ThreadReduce(input, scan_op);
1155
+
1156
+ // Exclusive thread block-scan
1157
+ ExclusiveScan(thread_prefix, thread_prefix, initial_value, scan_op, block_aggregate);
1158
+
1159
+ // Exclusive scan in registers with prefix as seed
1160
+ detail::ThreadScanExclusive(input, output, scan_op, thread_prefix);
1161
+ }
1162
+
1163
+ //! @rst
1164
+ //! Computes an exclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
1165
+ //! Each thread contributes an array of consecutive input elements.
1166
+ //! The call-back functor ``block_prefix_callback_op`` is invoked by the first warp in the block, and the value
1167
+ //! returned by *lane*\ :sub:`0` in that warp is used as the "seed" value that logically prefixes the thread
1168
+ //! block's scan inputs.
1169
+ //!
1170
+ //! - The ``block_prefix_callback_op`` functor must implement a member function
1171
+ //! ``T operator()(T block_aggregate)``. The functor will be invoked by the
1172
+ //! first warp of threads in the block, however only the return value from
1173
+ //! *lane*\ :sub:`0` is applied as the block-wide prefix. Can be stateful.
1174
+ //! - Supports non-commutative scan operators.
1175
+ //! - @blocked
1176
+ //! - @granularity
1177
+ //! - @smemreuse
1178
+ //!
1179
+ //! Snippet
1180
+ //! +++++++
1181
+ //!
1182
+ //! The code snippet below illustrates a single thread block that progressively
1183
+ //! computes an exclusive prefix max scan over multiple "tiles" of input using a
1184
+ //! prefix functor to maintain a running total between block-wide scans. Each tile consists
1185
+ //! of 512 integer items that are partitioned across 128 threads, 4 consecutive items per thread.
1186
+ //!
1187
+ //! .. code-block:: c++
1188
+ //!
1189
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
1190
+ //!
1191
+ //! // A stateful callback functor that maintains a running prefix to be applied
1192
+ //! // during consecutive scan operations.
1193
+ //! struct BlockPrefixCallbackOp
1194
+ //! {
1195
+ //! // Running prefix
1196
+ //! int running_total;
1197
+ //!
1198
+ //! // Constructor
1199
+ //! __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {}
1200
+ //!
1201
+ //! // Callback operator to be entered by the first warp of threads in the block.
1202
+ //! // Thread-0 is responsible for returning a value for seeding the block-wide scan.
1203
+ //! __device__ int operator()(int block_aggregate)
1204
+ //! {
1205
+ //! int old_prefix = running_total;
1206
+ //! running_total = (block_aggregate > old_prefix) ? block_aggregate : old_prefix;
1207
+ //! return old_prefix;
1208
+ //! }
1209
+ //! };
1210
+ //!
1211
+ //! __global__ void ExampleKernel(int *d_data, int num_items, ...)
1212
+ //! {
1213
+ //! // Specialize BlockLoad, BlockStore, and BlockScan for a 1D block of 128 threads, 4 ints per thread
1214
+ //! using BlockLoad = cub::BlockLoad<int*, 128, 4, BLOCK_LOAD_TRANSPOSE> ;
1215
+ //! using BlockStore = cub::BlockStore<int, 128, 4, BLOCK_STORE_TRANSPOSE> ;
1216
+ //! using BlockScan = cub::BlockScan<int, 128> ;
1217
+ //!
1218
+ //! // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan
1219
+ //! __shared__ union {
1220
+ //! typename BlockLoad::TempStorage load;
1221
+ //! typename BlockScan::TempStorage scan;
1222
+ //! typename BlockStore::TempStorage store;
1223
+ //! } temp_storage;
1224
+ //!
1225
+ //! // Initialize running total to INT_MIN (the identity of max) so the first tile is seeded with it
1226
+ //! BlockPrefixCallbackOp prefix_op(INT_MIN);
1227
+ //!
1228
+ //! // Have the block iterate over segments of items
1229
+ //! for (int block_offset = 0; block_offset < num_items; block_offset += 128 * 4)
1230
+ //! {
1231
+ //! // Load a segment of consecutive items that are blocked across threads
1232
+ //! int thread_data[4];
1233
+ //! BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data);
1234
+ //! __syncthreads();
1235
+ //!
1236
+ //! // Collectively compute the block-wide exclusive prefix max scan
1237
+ //! BlockScan(temp_storage.scan).ExclusiveScan(
1238
+ //! thread_data, thread_data, cuda::maximum<>{}, prefix_op);
1239
+ //! __syncthreads();
1240
+ //!
1241
+ //! // Store scanned items to output segment
1242
+ //! BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data);
1243
+ //! __syncthreads();
1244
+ //! }
1245
+ //!
1246
+ //! Suppose the input ``d_data`` is ``0, -1, 2, -3, 4, -5, ...``.
1247
+ //! The corresponding output for the first segment will be
1248
+ //! ``INT_MIN, 0, 0, 2, 2, 4, ..., 508, 510``.
1249
+ //! The output for the second segment will be
1250
+ //! ``510, 512, 512, 514, 514, 516, ..., 1020, 1022``.
1251
+ //!
1252
+ //! @endrst
1253
+ //!
1254
+ //! @tparam ITEMS_PER_THREAD
1255
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
1256
+ //!
1257
+ //! @tparam ScanOp
1258
+ //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
1259
+ //!
1260
+ //! @tparam BlockPrefixCallbackOp
1261
+ //! **[inferred]** Call-back functor type having member `T operator()(T block_aggregate)`
1262
+ //!
1263
+ //! @param[in] input
1264
+ //! Calling thread's input items
1265
+ //!
1266
+ //! @param[out] output
1267
+ //! Calling thread's output items (may be aliased to `input`)
1268
+ //!
1269
+ //! @param[in] scan_op
1270
+ //! Binary scan functor
1271
+ //!
1272
+ //! @param[in,out] block_prefix_callback_op
1273
+ //! @rst
1274
+ //! *warp*\ :sub:`0` only call-back functor for specifying a block-wide prefix to be applied to
1275
+ //! the logical input sequence.
1276
+ //! @endrst
1277
+ template <int ITEMS_PER_THREAD, typename ScanOp, typename BlockPrefixCallbackOp>
1278
+ _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveScan(
1279
+   T (&input)[ITEMS_PER_THREAD],
1280
+   T (&output)[ITEMS_PER_THREAD],
1281
+   ScanOp scan_op,
1282
+   BlockPrefixCallbackOp& block_prefix_callback_op)
1283
+ {
1284
+   // Step 1: fold this thread's consecutive items into a single partial with scan_op
1285
+   T thread_seed = cub::ThreadReduce(input, scan_op);
1286
+
1287
+   // Step 2: exclusive scan of the per-thread partials via the single-item callback overload (in place)
1288
+   ExclusiveScan(thread_seed, thread_seed, scan_op, block_prefix_callback_op);
1289
+
1290
+   // Step 3: exclusive scan over this thread's own items, starting from the seed produced in step 2
1291
+   detail::ThreadScanExclusive(input, output, scan_op, thread_seed);
1292
+ }
1293
+
1294
+ //! @} end member group
1295
+ #ifndef _CCCL_DOXYGEN_INVOKED // Do not document no-initial-value scans
1296
+
1297
+ //! @name Exclusive prefix scan operations (no initial value, single datum per thread)
1298
+ //! @{
1299
+
1300
+ //! @rst
1301
+ //! Computes an exclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
1302
+ //! Each thread contributes one input element.
1303
+ //! With no initial value, the output computed for *thread*\ :sub:`0` is undefined.
1304
+ //!
1305
+ //! - Supports non-commutative scan operators.
1306
+ //! - @rowmajor
1307
+ //! - @smemreuse
1308
+ //!
1309
+ //! @endrst
1310
+ //!
1311
+ //! @tparam ScanOp
1312
+ //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
1313
+ //!
1314
+ //! @param[in] input
1315
+ //! Calling thread's input item
1316
+ //!
1317
+ //! @param[out] output
1318
+ //! Calling thread's output item (may be aliased to `input`)
1319
+ //!
1320
+ //! @param[in] scan_op
1321
+ //! Binary scan functor
1322
+ template <typename ScanOp>
1323
+ _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveScan(T input, T& output, ScanOp scan_op)
1324
+ {
1325
+ InternalBlockScan(temp_storage).ExclusiveScan(input, output, scan_op); // forwards unchanged to an InternalBlockScan built on this instance's temp_storage
1326
+ }
1327
+
1328
+ //! @rst
1329
+ //! Computes an exclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
1330
+ //! Each thread contributes one input element. Also provides every thread with the block-wide
1331
+ //! ``block_aggregate`` of all inputs. With no initial value, the output computed for
1332
+ //! *thread*\ :sub:`0` is undefined.
1333
+ //!
1334
+ //! - Supports non-commutative scan operators.
1335
+ //! - @rowmajor
1336
+ //! - @smemreuse
1337
+ //!
1338
+ //! @endrst
1339
+ //!
1340
+ //! @tparam ScanOp
1341
+ //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
1342
+ //!
1343
+ //! @param[in] input
1344
+ //! Calling thread's input item
1345
+ //!
1346
+ //! @param[out] output
1347
+ //! Calling thread's output item (may be aliased to `input`)
1348
+ //!
1349
+ //! @param[in] scan_op
1350
+ //! Binary scan functor
1351
+ //!
1352
+ //! @param[out] block_aggregate
1353
+ //! block-wide aggregate reduction of input items
1354
+ template <typename ScanOp>
1355
+ _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveScan(T input, T& output, ScanOp scan_op, T& block_aggregate)
1356
+ {
1357
+ InternalBlockScan(temp_storage).ExclusiveScan(input, output, scan_op, block_aggregate); // forwards unchanged to an InternalBlockScan built on temp_storage; block_aggregate is passed through by reference
1358
+ }
1359
+
1360
+ //! @} end member group // Exclusive prefix scans (no initial value, single datum per thread)
1361
+ //! @name Exclusive prefix scan operations (no initial value, multiple data per thread)
1362
+ //! @{
1363
+
1364
+ //! @rst
1365
+ //! Computes an exclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
1366
+ //! Each thread contributes an array of consecutive input elements. With no initial value, the
1367
+ //! output computed for *thread*\ :sub:`0` is undefined.
1368
+ //!
1369
+ //! - Supports non-commutative scan operators.
1370
+ //! - @blocked
1371
+ //! - @granularity
1372
+ //! - @smemreuse
1373
+ //!
1374
+ //! @endrst
1375
+ //!
1376
+ //! @tparam ITEMS_PER_THREAD
1377
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
1378
+ //!
1379
+ //! @tparam ScanOp
1380
+ //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
1381
+ //!
1382
+ //! @param[in] input
1383
+ //! Calling thread's input items
1384
+ //!
1385
+ //! @param[out] output
1386
+ //! Calling thread's output items (may be aliased to `input`)
1387
+ //!
1388
+ //! @param[in] scan_op
1389
+ //! Binary scan functor
1390
+ template <int ITEMS_PER_THREAD, typename ScanOp>
1391
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
1392
+ ExclusiveScan(T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], ScanOp scan_op)
1393
+ {
1394
+   // Step 1: fold this thread's consecutive items into a single partial with scan_op
1395
+   T thread_seed = cub::ThreadReduce(input, scan_op);
1396
+
1397
+   // Step 2: exclusive scan of the per-thread partials via the single-item, no-initial-value overload (in place)
1398
+   ExclusiveScan(thread_seed, thread_seed, scan_op);
1399
+
1400
+   // Step 3: exclusive scan over this thread's own items with the seed; the last argument is false only for linear_tid == 0
1401
+   detail::ThreadScanExclusive(input, output, scan_op, thread_seed, linear_tid != 0);
1402
+ }
1403
+
1404
+ //! @rst
1405
+ //! Computes an exclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
1406
+ //! Each thread contributes an array of consecutive input elements. Also provides every thread
1407
+ //! with the block-wide ``block_aggregate`` of all inputs.
1408
+ //! With no initial value, the output computed for *thread*\ :sub:`0` is undefined.
1409
+ //!
1410
+ //! - Supports non-commutative scan operators.
1411
+ //! - @blocked
1412
+ //! - @granularity
1413
+ //! - @smemreuse
1414
+ //!
1415
+ //! @endrst
1416
+ //!
1417
+ //! @tparam ITEMS_PER_THREAD
1418
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
1419
+ //!
1420
+ //! @tparam ScanOp
1421
+ //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
1422
+ //!
1423
+ //! @param[in] input
1424
+ //! Calling thread's input items
1425
+ //!
1426
+ //! @param[out] output
1427
+ //! Calling thread's output items (may be aliased to `input`)
1428
+ //!
1429
+ //! @param[in] scan_op
1430
+ //! Binary scan functor
1431
+ //!
1432
+ //! @param[out] block_aggregate
1433
+ //! block-wide aggregate reduction of input items
1434
+ template <int ITEMS_PER_THREAD, typename ScanOp>
1435
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
1436
+ ExclusiveScan(T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], ScanOp scan_op, T& block_aggregate)
1437
+ {
1438
+   // Step 1: fold this thread's consecutive items into a single partial with scan_op
1439
+   T thread_seed = cub::ThreadReduce(input, scan_op);
1440
+
1441
+   // Step 2: exclusive scan of the per-thread partials via the single-item overload, which also fills block_aggregate
1442
+   ExclusiveScan(thread_seed, thread_seed, scan_op, block_aggregate);
1443
+
1444
+   // Step 3: exclusive scan over this thread's own items with the seed; the last argument is false only for linear_tid == 0
1445
+   detail::ThreadScanExclusive(input, output, scan_op, thread_seed, linear_tid != 0);
1446
+ }
1447
+
1448
+ //! @} end member group // Exclusive prefix scans (no initial value, multiple data per thread)
1449
+ #endif // _CCCL_DOXYGEN_INVOKED // Do not document no-initial-value scans
1450
+
1451
+ //! @name Inclusive prefix sum operations
1452
+ //! @{
1453
+
1454
+ //! @rst
1455
+ //! Computes an inclusive block-wide prefix scan using addition (+)
1456
+ //! as the scan operator. Each thread contributes one input element.
1457
+ //!
1458
+ //! - @rowmajor
1459
+ //! - @smemreuse
1460
+ //!
1461
+ //! Snippet
1462
+ //! +++++++
1463
+ //!
1464
+ //! The code snippet below illustrates an inclusive prefix sum of 128 integer items that
1465
+ //! are partitioned across 128 threads.
1466
+ //!
1467
+ //! .. code-block:: c++
1468
+ //!
1469
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
1470
+ //!
1471
+ //! __global__ void ExampleKernel(...)
1472
+ //! {
1473
+ //! // Specialize BlockScan for a 1D block of 128 threads of type int
1474
+ //! using BlockScan = cub::BlockScan<int, 128>;
1475
+ //!
1476
+ //! // Allocate shared memory for BlockScan
1477
+ //! __shared__ typename BlockScan::TempStorage temp_storage;
1478
+ //!
1479
+ //! // Obtain input item for each thread
1480
+ //! int thread_data;
1481
+ //! ...
1482
+ //!
1483
+ //! // Collectively compute the block-wide inclusive prefix sum
1484
+ //! BlockScan(temp_storage).InclusiveSum(thread_data, thread_data);
1485
+ //!
1486
+ //! Suppose the set of input ``thread_data`` across the block of threads is ``1, 1, ..., 1``.
1487
+ //! The corresponding output ``thread_data`` in those threads will be ``1, 2, ..., 128``.
1488
+ //!
1489
+ //! @endrst
1490
+ //!
1491
+ //! @param[in] input
1492
+ //! Calling thread's input item
1493
+ //!
1494
+ //! @param[out] output
1495
+ //! Calling thread's output item (may be aliased to `input`)
1496
+ _CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveSum(T input, T& output)
1497
+ {
1498
+ InclusiveScan(input, output, ::cuda::std::plus<>{}); // inclusive sum is InclusiveScan with ::cuda::std::plus<> as the scan operator
1499
+ }
1500
+
1501
+ //! @rst
1502
+ //! Computes an inclusive block-wide prefix scan using addition (+) as the scan operator.
1503
+ //! Each thread contributes one input element.
1504
+ //! Also provides every thread with the block-wide ``block_aggregate`` of all inputs.
1505
+ //!
1506
+ //! - @rowmajor
1507
+ //! - @smemreuse
1508
+ //!
1509
+ //! Snippet
1510
+ //! +++++++
1511
+ //!
1512
+ //! The code snippet below illustrates an inclusive prefix sum of 128 integer items that
1513
+ //! are partitioned across 128 threads.
1514
+ //!
1515
+ //! .. code-block:: c++
1516
+ //!
1517
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
1518
+ //!
1519
+ //! __global__ void ExampleKernel(...)
1520
+ //! {
1521
+ //! // Specialize BlockScan for a 1D block of 128 threads of type int
1522
+ //! using BlockScan = cub::BlockScan<int, 128>;
1523
+ //!
1524
+ //! // Allocate shared memory for BlockScan
1525
+ //! __shared__ typename BlockScan::TempStorage temp_storage;
1526
+ //!
1527
+ //! // Obtain input item for each thread
1528
+ //! int thread_data;
1529
+ //! ...
1530
+ //!
1531
+ //! // Collectively compute the block-wide inclusive prefix sum
1532
+ //! int block_aggregate;
1533
+ //! BlockScan(temp_storage).InclusiveSum(thread_data, thread_data, block_aggregate);
1534
+ //!
1535
+ //! Suppose the set of input ``thread_data`` across the block of threads is ``1, 1, ..., 1``.
1536
+ //! The corresponding output ``thread_data`` in those threads will be ``1, 2, ..., 128``.
1537
+ //! Furthermore the value ``128`` will be stored in ``block_aggregate`` for all threads.
1538
+ //!
1539
+ //! @endrst
1540
+ //!
1541
+ //! @param[in] input
1542
+ //! Calling thread's input item
1543
+ //!
1544
+ //! @param[out] output
1545
+ //! Calling thread's output item (may be aliased to `input`)
1546
+ //!
1547
+ //! @param[out] block_aggregate
1548
+ //! block-wide aggregate reduction of input items
1549
+ _CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveSum(T input, T& output, T& block_aggregate)
1550
+ {
1551
+ InclusiveScan(input, output, ::cuda::std::plus<>{}, block_aggregate); // InclusiveScan with ::cuda::std::plus<>; block_aggregate is passed through by reference
1552
+ }
1553
+
1554
+ //! @rst
1555
+ //! Computes an inclusive block-wide prefix scan using addition (+) as the scan operator.
1556
+ //! Each thread contributes one input element. Instead of using 0 as the block-wide prefix, the call-back functor
1557
+ //! ``block_prefix_callback_op`` is invoked by the first warp in the block, and the value returned by
1558
+ //! *lane*\ :sub:`0` in that warp is used as the "seed" value that logically prefixes the thread block's
1559
+ //! scan inputs.
1560
+ //!
1561
+ //! - The ``block_prefix_callback_op`` functor must implement a member function
1562
+ //! ``T operator()(T block_aggregate)``. The functor will be invoked by the first warp of threads in the block,
1563
+ //! however only the return value from *lane*\ :sub:`0` is applied as the block-wide prefix. Can be stateful.
1564
+ //! - @rowmajor
1565
+ //! - @smemreuse
1566
+ //!
1567
+ //! Snippet
1568
+ //! +++++++
1569
+ //!
1570
+ //! The code snippet below illustrates a single thread block that progressively
1571
+ //! computes an inclusive prefix sum over multiple "tiles" of input using a
1572
+ //! prefix functor to maintain a running total between block-wide scans.
1573
+ //! Each tile consists of 128 integer items that are partitioned across 128 threads.
1574
+ //!
1575
+ //! .. code-block:: c++
1576
+ //!
1577
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
1578
+ //!
1579
+ //! // A stateful callback functor that maintains a running prefix to be applied
1580
+ //! // during consecutive scan operations.
1581
+ //! struct BlockPrefixCallbackOp
1582
+ //! {
1583
+ //! // Running prefix
1584
+ //! int running_total;
1585
+ //!
1586
+ //! // Constructor
1587
+ //! __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {}
1588
+ //!
1589
+ //! // Callback operator to be entered by the first warp of threads in the block.
1590
+ //! // Thread-0 is responsible for returning a value for seeding the block-wide scan.
1591
+ //! __device__ int operator()(int block_aggregate)
1592
+ //! {
1593
+ //! int old_prefix = running_total;
1594
+ //! running_total += block_aggregate;
1595
+ //! return old_prefix;
1596
+ //! }
1597
+ //! };
1598
+ //!
1599
+ //! __global__ void ExampleKernel(int *d_data, int num_items, ...)
1600
+ //! {
1601
+ //! // Specialize BlockScan for a 1D block of 128 threads
1602
+ //! using BlockScan = cub::BlockScan<int, 128>;
1603
+ //!
1604
+ //! // Allocate shared memory for BlockScan
1605
+ //! __shared__ typename BlockScan::TempStorage temp_storage;
1606
+ //!
1607
+ //! // Initialize running total
1608
+ //! BlockPrefixCallbackOp prefix_op(0);
1609
+ //!
1610
+ //! // Have the block iterate over segments of items
1611
+ //! for (int block_offset = 0; block_offset < num_items; block_offset += 128)
1612
+ //! {
1613
+ //! // Load a segment of consecutive items that are blocked across threads
1614
+ //! int thread_data = d_data[block_offset + threadIdx.x];
1615
+ //!
1616
+ //! // Collectively compute the block-wide inclusive prefix sum
1617
+ //! BlockScan(temp_storage).InclusiveSum(
1618
+ //! thread_data, thread_data, prefix_op);
1619
+ //! __syncthreads();
1620
+ //!
1621
+ //! // Store scanned items to output segment
1622
+ //! d_data[block_offset + threadIdx.x] = thread_data;
1623
+ //! }
1624
+ //!
1625
+ //! Suppose the input ``d_data`` is ``1, 1, 1, 1, 1, 1, 1, 1, ...``.
1626
+ //! The corresponding output for the first segment will be ``1, 2, ..., 128``.
1627
+ //! The output for the second segment will be ``129, 130, ..., 256``.
1628
+ //!
1629
+ //! @endrst
1630
+ //!
1631
+ //! @tparam BlockPrefixCallbackOp
1632
+ //! **[inferred]** Call-back functor type having member `T operator()(T block_aggregate)`
1633
+ //!
1634
+ //! @param[in] input
1635
+ //! Calling thread's input item
1636
+ //!
1637
+ //! @param[out] output
1638
+ //! Calling thread's output item (may be aliased to `input`)
1639
+ //!
1640
+ //! @param[in,out] block_prefix_callback_op
1641
+ //! @rst
1642
+ //! *warp*\ :sub:`0` only call-back functor for specifying a block-wide prefix to be applied
1643
+ //! to the logical input sequence.
1644
+ //! @endrst
1645
+ template <typename BlockPrefixCallbackOp>
1646
+ _CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveSum(T input, T& output, BlockPrefixCallbackOp& block_prefix_callback_op)
1647
+ {
1648
+ InclusiveScan(input, output, ::cuda::std::plus<>{}, block_prefix_callback_op); // InclusiveScan with ::cuda::std::plus<>; the callback functor is forwarded by reference
1649
+ }
1650
+
1651
+ //! @} end member group
1652
+ //! @name Inclusive prefix sum operations (multiple data per thread)
1653
+ //! @{
1654
+
1655
+ //! @rst
1656
+ //! Computes an inclusive block-wide prefix scan using addition (+) as the scan operator.
1657
+ //! Each thread contributes an array of consecutive input elements.
1658
+ //!
1659
+ //! - @blocked
1660
+ //! - @granularity
1661
+ //! - @smemreuse
1662
+ //!
1663
+ //! Snippet
1664
+ //! +++++++
1665
+ //!
1666
+ //! The code snippet below illustrates an inclusive prefix sum of 512 integer items that
1667
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128 threads
1668
+ //! where each thread owns 4 consecutive items.
1669
+ //!
1670
+ //! .. code-block:: c++
1671
+ //!
1672
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
1673
+ //!
1674
+ //! __global__ void ExampleKernel(...)
1675
+ //! {
1676
+ //! // Specialize BlockScan for a 1D block of 128 threads of type int
1677
+ //! using BlockScan = cub::BlockScan<int, 128>;
1678
+ //!
1679
+ //! // Allocate shared memory for BlockScan
1680
+ //! __shared__ typename BlockScan::TempStorage temp_storage;
1681
+ //!
1682
+ //! // Obtain a segment of consecutive items that are blocked across threads
1683
+ //! int thread_data[4];
1684
+ //! ...
1685
+ //!
1686
+ //! // Collectively compute the block-wide inclusive prefix sum
1687
+ //! BlockScan(temp_storage).InclusiveSum(thread_data, thread_data);
1688
+ //!
1689
+ //! Suppose the set of input ``thread_data`` across the block of threads is
1690
+ //! ``{ [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }``. The corresponding output
1691
+ //! ``thread_data`` in those threads will be ``{ [1,2,3,4], [5,6,7,8], ..., [509,510,511,512] }``.
1692
+ //!
1693
+ //! @endrst
1694
+ //!
1695
+ //! @tparam ITEMS_PER_THREAD
1696
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
1697
+ //!
1698
+ //! @param[in] input
1699
+ //! Calling thread's input items
1700
+ //!
1701
+ //! @param[out] output
1702
+ //! Calling thread's output items (may be aliased to `input`)
1703
+ template <int ITEMS_PER_THREAD>
1704
+ _CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveSum(T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD])
1705
+ {
1706
+   if (ITEMS_PER_THREAD != 1)
1707
+   {
1708
+     // Sum this thread's consecutive items into a single partial
1709
+     ::cuda::std::plus<> sum_op;
1710
+     T thread_seed = cub::ThreadReduce(input, sum_op);
1711
+
1712
+     // Exclusive block-wide sum of the per-thread partials (in place)
1713
+     ExclusiveSum(thread_seed, thread_seed);
1714
+
1715
+     // Inclusive scan of this thread's own items with the seed; the last argument is false only for linear_tid == 0
1716
+     detail::ThreadScanInclusive(input, output, sum_op, thread_seed, linear_tid != 0);
1717
+   }
1718
+   else // exactly one item per thread: use the single-item overload
1719
+   {
1720
+     InclusiveSum(input[0], output[0]);
1721
+   }
1722
+ }
1723
+
1724
+ //! @rst
1725
+ //! Computes an inclusive block-wide prefix scan using addition (+) as the scan operator.
1726
+ //! Each thread contributes an array of consecutive input elements.
1727
+ //! Also provides every thread with the block-wide ``block_aggregate`` of all inputs.
1728
+ //!
1729
+ //! - @blocked
1730
+ //! - @granularity
1731
+ //! - @smemreuse
1732
+ //!
1733
+ //! Snippet
1734
+ //! +++++++
1735
+ //!
1736
+ //! The code snippet below illustrates an inclusive prefix sum of 512 integer items that
1737
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128 threads
1738
+ //! where each thread owns 4 consecutive items.
1739
+ //!
1740
+ //! .. code-block:: c++
1741
+ //!
1742
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
1743
+ //!
1744
+ //! __global__ void ExampleKernel(...)
1745
+ //! {
1746
+ //! // Specialize BlockScan for a 1D block of 128 threads of type int
1747
+ //! using BlockScan = cub::BlockScan<int, 128>;
1748
+ //!
1749
+ //! // Allocate shared memory for BlockScan
1750
+ //! __shared__ typename BlockScan::TempStorage temp_storage;
1751
+ //!
1752
+ //! // Obtain a segment of consecutive items that are blocked across threads
1753
+ //! int thread_data[4];
1754
+ //! ...
1755
+ //!
1756
+ //! // Collectively compute the block-wide inclusive prefix sum
1757
+ //! int block_aggregate;
1758
+ //! BlockScan(temp_storage).InclusiveSum(thread_data, thread_data, block_aggregate);
1759
+ //!
1760
+ //! Suppose the set of input ``thread_data`` across the block of threads is
1761
+ //! ``{ [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }``. The
1762
+ //! corresponding output ``thread_data`` in those threads will be
1763
+ //! ``{ [1,2,3,4], [5,6,7,8], ..., [509,510,511,512] }``.
1764
+ //! Furthermore the value ``512`` will be stored in ``block_aggregate`` for all threads.
1765
+ //!
1766
+ //! @endrst
1767
+ //!
1768
+ //! @tparam ITEMS_PER_THREAD
1769
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
1770
+ //!
1771
+ //! @param[in] input
1772
+ //! Calling thread's input items
1773
+ //!
1774
+ //! @param[out] output
1775
+ //! Calling thread's output items (may be aliased to `input`)
1776
+ //!
1777
+ //! @param[out] block_aggregate
1778
+ //! block-wide aggregate reduction of input items
1779
+ template <int ITEMS_PER_THREAD>
1780
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
1781
+ InclusiveSum(T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], T& block_aggregate)
1782
+ {
1783
+   if (ITEMS_PER_THREAD != 1)
1784
+   {
1785
+     // Sum this thread's consecutive items into a single partial
1786
+     ::cuda::std::plus<> sum_op;
1787
+     T thread_seed = cub::ThreadReduce(input, sum_op);
1788
+
1789
+     // Exclusive block-wide sum of the per-thread partials (in place); also fills block_aggregate
1790
+     ExclusiveSum(thread_seed, thread_seed, block_aggregate);
1791
+
1792
+     // Inclusive scan of this thread's own items with the seed; the last argument is false only for linear_tid == 0
1793
+     detail::ThreadScanInclusive(input, output, sum_op, thread_seed, linear_tid != 0);
1794
+   }
1795
+   else // exactly one item per thread: use the single-item overload
1796
+   {
1797
+     InclusiveSum(input[0], output[0], block_aggregate);
1798
+   }
1799
+ }
1800
+
1801
+ //! @rst
1802
+ //! Computes an inclusive block-wide prefix scan using addition (+) as the scan operator.
1803
+ //! Each thread contributes an array of consecutive input elements.
1804
+ //! Instead of using 0 as the block-wide prefix, the call-back functor ``block_prefix_callback_op`` is invoked by
1805
+ //! the first warp in the block, and the value returned by *lane*\ :sub:`0` in that warp is used as the "seed"
1806
+ //! value that logically prefixes the thread block's scan inputs.
1807
+ //!
1808
+ //! - The ``block_prefix_callback_op`` functor must implement a member function
1809
+ //! ``T operator()(T block_aggregate)``. The functor will be invoked by the first warp of threads in the block,
1810
+ //! however only the return value from *lane*\ :sub:`0` is applied as the block-wide prefix. Can be stateful.
1811
+ //! - @blocked
1812
+ //! - @granularity
1813
+ //! - @smemreuse
1814
+ //!
1815
+ //! Snippet
1816
+ //! +++++++
1817
+ //!
1818
+ //! The code snippet below illustrates a single thread block that progressively
1819
+ //! computes an inclusive prefix sum over multiple "tiles" of input using a
1820
+ //! prefix functor to maintain a running total between block-wide scans. Each tile consists
1821
+ //! of 512 integer items that are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>`
1822
+ //! across 128 threads where each thread owns 4 consecutive items.
1823
+ //!
1824
+ //! .. code-block:: c++
1825
+ //!
1826
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
1827
+ //!
1828
+ //! // A stateful callback functor that maintains a running prefix to be applied
1829
+ //! // during consecutive scan operations.
1830
+ //! struct BlockPrefixCallbackOp
1831
+ //! {
1832
+ //! // Running prefix
1833
+ //! int running_total;
1834
+ //!
1835
+ //! // Constructor
1836
+ //! __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {}
1837
+ //!
1838
+ //! // Callback operator to be entered by the first warp of threads in the block.
1839
+ //! // Thread-0 is responsible for returning a value for seeding the block-wide scan.
1840
+ //! __device__ int operator()(int block_aggregate)
1841
+ //! {
1842
+ //! int old_prefix = running_total;
1843
+ //! running_total += block_aggregate;
1844
+ //! return old_prefix;
1845
+ //! }
1846
+ //! };
1847
+ //!
1848
+ //! __global__ void ExampleKernel(int *d_data, int num_items, ...)
1849
+ //! {
1850
+ //! // Specialize BlockLoad, BlockStore, and BlockScan for a 1D block of 128 threads, 4 ints per thread
1851
+ //! using BlockLoad = cub::BlockLoad<int*, 128, 4, BLOCK_LOAD_TRANSPOSE> ;
1852
+ //! using BlockStore = cub::BlockStore<int, 128, 4, BLOCK_STORE_TRANSPOSE> ;
1853
+ //! using BlockScan = cub::BlockScan<int, 128> ;
1854
+ //!
1855
+ //! // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan
1856
+ //! __shared__ union {
1857
+ //! typename BlockLoad::TempStorage load;
1858
+ //! typename BlockScan::TempStorage scan;
1859
+ //! typename BlockStore::TempStorage store;
1860
+ //! } temp_storage;
1861
+ //!
1862
+ //! // Initialize running total
1863
+ //! BlockPrefixCallbackOp prefix_op(0);
1864
+ //!
1865
+ //! // Have the block iterate over segments of items
1866
+ //! for (int block_offset = 0; block_offset < num_items; block_offset += 128 * 4)
1867
+ //! {
1868
+ //! // Load a segment of consecutive items that are blocked across threads
1869
+ //! int thread_data[4];
1870
+ //! BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data);
1871
+ //! __syncthreads();
1872
+ //!
1873
+ //! // Collectively compute the block-wide inclusive prefix sum
1874
+ //! BlockScan(temp_storage.scan).InclusiveSum(
1875
+ //! thread_data, thread_data, prefix_op);
1876
+ //! __syncthreads();
1877
+ //!
1878
+ //! // Store scanned items to output segment
1879
+ //! BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data);
1880
+ //! __syncthreads();
1881
+ //! }
1882
+ //!
1883
+ //! Suppose the input ``d_data`` is ``1, 1, 1, 1, 1, 1, 1, 1, ...``.
1884
+ //! The corresponding output for the first segment will be
1885
+ //! ``1, 2, 3, 4, ..., 511, 512``. The output for the second segment will be
1886
+ //! ``513, 514, 515, 516, ..., 1023, 1024``.
1887
+ //!
1888
+ //! @endrst
1889
+ //!
1890
+ //! @tparam ITEMS_PER_THREAD
1891
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
1892
+ //!
1893
+ //! @tparam BlockPrefixCallbackOp
1894
+ //! **[inferred]** Call-back functor type having member `T operator()(T block_aggregate)`
1895
+ //!
1896
+ //! @param[in] input
1897
+ //! Calling thread's input items
1898
+ //!
1899
+ //! @param[out] output
1900
+ //! Calling thread's output items (may be aliased to `input`)
1901
+ //!
1902
+ //! @param[in,out] block_prefix_callback_op
1903
+ //! @rst
1904
+ //! *warp*\ :sub:`0` only call-back functor for specifying a block-wide prefix to be applied to the
1905
+ //! logical input sequence.
1906
+ //! @endrst
1907
+ template <int ITEMS_PER_THREAD, typename BlockPrefixCallbackOp>
1908
+ _CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveSum(
1909
+ T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], BlockPrefixCallbackOp& block_prefix_callback_op)
1910
+ {
1911
+   if (ITEMS_PER_THREAD != 1)
1912
+   {
1913
+     // Sum this thread's consecutive items into a single partial
1914
+     ::cuda::std::plus<> sum_op;
1915
+     T thread_seed = cub::ThreadReduce(input, sum_op);
1916
+
1917
+     // Exclusive block-wide sum of the per-thread partials via the callback overload (in place)
1918
+     ExclusiveSum(thread_seed, thread_seed, block_prefix_callback_op);
1919
+
1920
+     // Inclusive scan of this thread's own items, seeded unconditionally with thread_seed
1921
+     detail::ThreadScanInclusive(input, output, sum_op, thread_seed);
1922
+   }
1923
+   else // exactly one item per thread: use the single-item overload
1924
+   {
1925
+     InclusiveSum(input[0], output[0], block_prefix_callback_op);
1926
+   }
1927
+ }
1928
+
1929
+ //! @} end member group
1930
+ //! @name Inclusive prefix scan operations
1931
+ //! @{
1932
+
1933
+ //! @rst
1934
+ //! Computes an inclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
1935
+ //! Each thread contributes one input element.
1936
+ //!
1937
+ //! - Supports non-commutative scan operators.
1938
+ //! - @rowmajor
1939
+ //! - @smemreuse
1940
+ //!
1941
+ //! Snippet
1942
+ //! +++++++
1943
+ //!
1944
+ //! The code snippet below illustrates an inclusive prefix max scan of 128 integer items that
1945
+ //! are partitioned across 128 threads.
1946
+ //!
1947
+ //! .. code-block:: c++
1948
+ //!
1949
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
1950
+ //!
1951
+ //! __global__ void ExampleKernel(...)
1952
+ //! {
1953
+ //! // Specialize BlockScan for a 1D block of 128 threads of type int
1954
+ //! using BlockScan = cub::BlockScan<int, 128>;
1955
+ //!
1956
+ //! // Allocate shared memory for BlockScan
1957
+ //! __shared__ typename BlockScan::TempStorage temp_storage;
1958
+ //!
1959
+ //! // Obtain input item for each thread
1960
+ //! int thread_data;
1961
+ //! ...
1962
+ //!
1963
+ //! // Collectively compute the block-wide inclusive prefix max scan
1964
+ //! BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cuda::maximum<>{});
1965
+ //!
1966
+ //! Suppose the set of input ``thread_data`` across the block of threads is
1967
+ //! ``0, -1, 2, -3, ..., 126, -127``. The corresponding output ``thread_data``
1968
+ //! in those threads will be ``0, 0, 2, 2, ..., 126, 126``.
1969
+ //!
1970
+ //! @endrst
1971
+ //!
1972
+ //! @tparam ScanOp
1973
+ //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
1974
+ //!
1975
+ //! @param[in] input
1976
+ //! Calling thread's input item
1977
+ //!
1978
+ //! @param[out] output
1979
+ //! Calling thread's output item (may be aliased to `input`)
1980
+ //!
1981
+ //! @param[in] scan_op
1982
+ //! Binary scan functor
1983
+ template <typename ScanOp>
1984
+ _CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveScan(T input, T& output, ScanOp scan_op)
1985
+ {
1986
+ InternalBlockScan(temp_storage).InclusiveScan(input, output, scan_op); // forwards unchanged to an InternalBlockScan built over temp_storage
1987
+ }
1988
+
1989
+ //! @rst
1990
+ //! Computes an inclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
1991
+ //! Each thread contributes one input element. Also provides every thread with the block-wide
1992
+ //! ``block_aggregate`` of all inputs.
1993
+ //!
1994
+ //! - Supports non-commutative scan operators.
1995
+ //! - @rowmajor
1996
+ //! - @smemreuse
1997
+ //!
1998
+ //! Snippet
1999
+ //! +++++++
2000
+ //!
2001
+ //! The code snippet below illustrates an inclusive prefix max scan of 128
2002
+ //! integer items that are partitioned across 128 threads.
2003
+ //!
2004
+ //! .. code-block:: c++
2005
+ //!
2006
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
2007
+ //!
2008
+ //! __global__ void ExampleKernel(...)
2009
+ //! {
2010
+ //! // Specialize BlockScan for a 1D block of 128 threads of type int
2011
+ //! using BlockScan = cub::BlockScan<int, 128>;
2012
+ //!
2013
+ //! // Allocate shared memory for BlockScan
2014
+ //! __shared__ typename BlockScan::TempStorage temp_storage;
2015
+ //!
2016
+ //! // Obtain input item for each thread
2017
+ //! int thread_data;
2018
+ //! ...
2019
+ //!
2020
+ //! // Collectively compute the block-wide inclusive prefix max scan
2021
+ //! int block_aggregate;
2022
+ //! BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cuda::maximum<>{}, block_aggregate);
2023
+ //!
2024
+ //! Suppose the set of input ``thread_data`` across the block of threads is
2025
+ //! ``0, -1, 2, -3, ..., 126, -127``. The corresponding output ``thread_data``
2026
+ //! in those threads will be ``0, 0, 2, 2, ..., 126, 126``. Furthermore the value
2027
+ //! ``126`` will be stored in ``block_aggregate`` for all threads.
2028
+ //!
2029
+ //! @endrst
2030
+ //!
2031
+ //! @tparam ScanOp
2032
+ //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
2033
+ //!
2034
+ //! @param[in] input
2035
+ //! Calling thread's input item
2036
+ //!
2037
+ //! @param[out] output
2038
+ //! Calling thread's output item (may be aliased to `input`)
2039
+ //!
2040
+ //! @param[in] scan_op
2041
+ //! Binary scan functor
2042
+ //!
2043
+ //! @param[out] block_aggregate
2044
+ //! Block-wide aggregate reduction of input items
2045
+ template <typename ScanOp>
2046
+ _CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveScan(T input, T& output, ScanOp scan_op, T& block_aggregate)
2047
+ {
2048
+ InternalBlockScan(temp_storage).InclusiveScan(input, output, scan_op, block_aggregate); // forwards unchanged; block_aggregate is written by the InternalBlockScan call
2049
+ }
2050
+
2051
+ //! @rst
2052
+ //! Computes an inclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
2053
+ //! Each thread contributes one input element. The call-back functor ``block_prefix_callback_op``
2054
+ //! is invoked by the first warp in the block, and the value returned by *lane*\ :sub:`0` in that warp is used as
2055
+ //! the "seed" value that logically prefixes the thread block's scan inputs.
2056
+ //!
2057
+ //! - The ``block_prefix_callback_op`` functor must implement a member function
2058
+ //! ``T operator()(T block_aggregate)``. The functor's input parameter ``block_aggregate`` is the block-wide aggregate of the scan inputs.
2059
+ //! The functor will be invoked by the first warp of threads in the block,
2060
+ //! however only the return value from *lane*\ :sub:`0` is applied
2061
+ //! as the block-wide prefix. Can be stateful.
2062
+ //! - Supports non-commutative scan operators.
2063
+ //! - @rowmajor
2064
+ //! - @smemreuse
2065
+ //!
2066
+ //! Snippet
2067
+ //! +++++++
2068
+ //!
2069
+ //! The code snippet below illustrates a single thread block that progressively
2070
+ //! computes an inclusive prefix max scan over multiple "tiles" of input using a
2071
+ //! prefix functor to maintain a running total between block-wide scans. Each tile consists
2072
+ //! of 128 integer items that are partitioned across 128 threads.
2073
+ //!
2074
+ //! .. code-block:: c++
2075
+ //!
2076
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
2077
+ //!
2078
+ //! // A stateful callback functor that maintains a running prefix to be applied
2079
+ //! // during consecutive scan operations.
2080
+ //! struct BlockPrefixCallbackOp
2081
+ //! {
2082
+ //! // Running prefix
2083
+ //! int running_total;
2084
+ //!
2085
+ //! // Constructor
2086
+ //! __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {}
2087
+ //!
2088
+ //! // Callback operator to be entered by the first warp of threads in the block.
2089
+ //! // Thread-0 is responsible for returning a value for seeding the block-wide scan.
2090
+ //! __device__ int operator()(int block_aggregate)
2091
+ //! {
2092
+ //! int old_prefix = running_total;
2093
+ //! running_total = (block_aggregate > old_prefix) ? block_aggregate : old_prefix;
2094
+ //! return old_prefix;
2095
+ //! }
2096
+ //! };
2097
+ //!
2098
+ //! __global__ void ExampleKernel(int *d_data, int num_items, ...)
2099
+ //! {
2100
+ //! // Specialize BlockScan for a 1D block of 128 threads
2101
+ //! using BlockScan = cub::BlockScan<int, 128>;
2102
+ //!
2103
+ //! // Allocate shared memory for BlockScan
2104
+ //! __shared__ typename BlockScan::TempStorage temp_storage;
2105
+ //!
2106
+ //! // Initialize running total
2107
+ //! BlockPrefixCallbackOp prefix_op(INT_MIN);
2108
+ //!
2109
+ //! // Have the block iterate over segments of items
2110
+ //! for (int block_offset = 0; block_offset < num_items; block_offset += 128)
2111
+ //! {
2112
+ //! // Load a segment of consecutive items that are blocked across threads
2113
+ //! int thread_data = d_data[block_offset + threadIdx.x];
2114
+ //!
2115
+ //! // Collectively compute the block-wide inclusive prefix max scan
2116
+ //! BlockScan(temp_storage).InclusiveScan(
2117
+ //! thread_data, thread_data, cuda::maximum<>{}, prefix_op);
2118
+ //! __syncthreads();
2119
+ //!
2120
+ //! // Store scanned items to output segment
2121
+ //! d_data[block_offset + threadIdx.x] = thread_data;
2122
+ //! }
2123
+ //!
2124
+ //! Suppose the input ``d_data`` is ``0, -1, 2, -3, 4, -5, ...``.
2125
+ //! The corresponding output for the first segment will be
2126
+ //! ``0, 0, 2, 2, ..., 126, 126``. The output for the second segment
2127
+ //! will be ``128, 128, 130, 130, ..., 254, 254``.
2128
+ //!
2129
+ //! @endrst
2130
+ //!
2131
+ //! @tparam ScanOp
2132
+ //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
2133
+ //!
2134
+ //! @tparam BlockPrefixCallbackOp
2135
+ //! **[inferred]** Call-back functor type having member `T operator()(T block_aggregate)`
2136
+ //!
2137
+ //! @param[in] input
2138
+ //! Calling thread's input item
2139
+ //!
2140
+ //! @param[out] output
2141
+ //! Calling thread's output item (may be aliased to `input`)
2142
+ //!
2143
+ //! @param[in] scan_op
2144
+ //! Binary scan functor
2145
+ //!
2146
+ //! @param[in,out] block_prefix_callback_op
2147
+ //! @rst
2148
+ //! *warp*\ :sub:`0` only call-back functor for specifying a block-wide prefix to be applied to
2149
+ //! the logical input sequence.
2150
+ //! @endrst
2151
+ template <typename ScanOp, typename BlockPrefixCallbackOp>
2152
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
2153
+ InclusiveScan(T input, T& output, ScanOp scan_op, BlockPrefixCallbackOp& block_prefix_callback_op)
2154
+ {
2155
+ InternalBlockScan(temp_storage).InclusiveScan(input, output, scan_op, block_prefix_callback_op); // forwards unchanged; the callback is passed by reference, so a stateful functor keeps its updates
2156
+ }
2157
+
2158
+ //! @} end member group
2159
+ //! @name Inclusive prefix scan operations (multiple data per thread)
2160
+ //! @{
2161
+
2162
+ //! @rst
2163
+ //! Computes an inclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
2164
+ //! Each thread contributes an array of consecutive input elements.
2165
+ //!
2166
+ //! - Supports non-commutative scan operators.
2167
+ //! - @blocked
2168
+ //! - @granularity
2169
+ //! - @smemreuse
2170
+ //!
2171
+ //! Snippet
2172
+ //! +++++++
2173
+ //!
2174
+ //! The code snippet below illustrates an inclusive prefix max scan of 512 integer items that
2175
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128 threads
2176
+ //! where each thread owns 4 consecutive items.
2177
+ //!
2178
+ //! .. code-block:: c++
2179
+ //!
2180
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
2181
+ //!
2182
+ //! __global__ void ExampleKernel(...)
2183
+ //! {
2184
+ //! // Specialize BlockScan for a 1D block of 128 threads of type int
2185
+ //! using BlockScan = cub::BlockScan<int, 128>;
2186
+ //!
2187
+ //! // Allocate shared memory for BlockScan
2188
+ //! __shared__ typename BlockScan::TempStorage temp_storage;
2189
+ //!
2190
+ //! // Obtain a segment of consecutive items that are blocked across threads
2191
+ //! int thread_data[4];
2192
+ //! ...
2193
+ //!
2194
+ //! // Collectively compute the block-wide inclusive prefix max scan
2195
+ //! BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cuda::maximum<>{});
2196
+ //!
2197
+ //! Suppose the set of input ``thread_data`` across the block of threads is
2198
+ //! ``{ [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }``.
2199
+ //! The corresponding output ``thread_data`` in those threads will be
2200
+ //! ``{ [0,0,2,2], [4,4,6,6], ..., [508,508,510,510] }``.
2201
+ //!
2202
+ //! @endrst
2203
+ //!
2204
+ //! @tparam ITEMS_PER_THREAD
2205
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
2206
+ //!
2207
+ //! @tparam ScanOp
2208
+ //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
2209
+ //!
2210
+ //! @param[in] input
2211
+ //! Calling thread's input items
2212
+ //!
2213
+ //! @param[out] output
2214
+ //! Calling thread's output items (may be aliased to `input`)
2215
+ //!
2216
+ //! @param[in] scan_op
2217
+ //! Binary scan functor
2218
+ template <int ITEMS_PER_THREAD, typename ScanOp>
2219
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
2220
+ InclusiveScan(T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], ScanOp scan_op)
2221
+ {
2222
+ if (ITEMS_PER_THREAD == 1) // single-element arrays forward to the one-item-per-thread overload
2223
+ {
2224
+ InclusiveScan(input[0], output[0], scan_op);
2225
+ }
2226
+ else
2227
+ {
2228
+ // Reduce this thread's consecutive items in registers to one partial result (thread_prefix)
2229
+ T thread_prefix = cub::ThreadReduce(input, scan_op);
2230
+
2231
+ // Exclusive block-wide scan of the per-thread partials (no initial value); thread_prefix is overwritten in place
2232
+ ExclusiveScan(thread_prefix, thread_prefix, scan_op);
2233
+
2234
+ // Inclusive scan in registers with prefix as seed; (linear_tid != 0) turns seeding off for the first thread
2235
+ detail::ThreadScanInclusive(input, output, scan_op, thread_prefix, (linear_tid != 0));
2236
+ }
2237
+ }
2238
+
2239
+ //! @rst
2240
+ //! Computes an inclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
2241
+ //! Each thread contributes an array of consecutive input elements.
2242
+ //!
2243
+ //! - Supports non-commutative scan operators.
2244
+ //! - @blocked
2245
+ //! - @granularity
2246
+ //! - @smemreuse
2247
+ //!
2248
+ //! Snippet
2249
+ //! +++++++
2250
+ //!
2251
+ //! The code snippet below illustrates an inclusive prefix max scan of 128 integer items that
2252
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 64 threads
2253
+ //! where each thread owns 2 consecutive items.
2254
+ //!
2255
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_scan_api.cu
2256
+ //! :language: c++
2257
+ //! :dedent:
2258
+ //! :start-after: example-begin inclusive-scan-array-init-value
2259
+ //! :end-before: example-end inclusive-scan-array-init-value
2260
+ //!
2261
+ //!
2262
+ //! @endrst
2263
+ //!
2264
+ //! @tparam ITEMS_PER_THREAD
2265
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
2266
+ //!
2267
+ //! @tparam ScanOp
2268
+ //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
2269
+ //!
2270
+ //! @param[in] input
2271
+ //! Calling thread's input items
2272
+ //!
2273
+ //! @param[out] output
2274
+ //! Calling thread's output items (may be aliased to `input`)
2275
+ //!
2276
+ //! @param[in] initial_value
2277
+ //! Initial value to seed the inclusive scan (uniform across block)
2278
+ //!
2279
+ //! @param[in] scan_op
2280
+ //! Binary scan functor
2281
+ template <int ITEMS_PER_THREAD, typename ScanOp>
2282
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
2283
+ InclusiveScan(T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], T initial_value, ScanOp scan_op)
2284
+ {
2285
+ // Reduce this thread's consecutive items in registers to one partial result (thread_prefix)
2286
+ T thread_prefix = cub::ThreadReduce(input, scan_op);
2287
+
2288
+ // Exclusive block-wide scan of the per-thread partials, seeded with initial_value; thread_prefix is overwritten in place
2289
+ ExclusiveScan(thread_prefix, thread_prefix, initial_value, scan_op);
2290
+
2291
+ // Inclusive scan in registers with prefix as seed (every thread seeds; no linear_tid flag is passed here)
2292
+ detail::ThreadScanInclusive(input, output, scan_op, thread_prefix);
2293
+ }
2294
+
2295
+ //! @rst
2296
+ //! Computes an inclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
2297
+ //! Each thread contributes an array of consecutive input elements. Also provides every thread
2298
+ //! with the block-wide ``block_aggregate`` of all inputs.
2299
+ //!
2300
+ //! - Supports non-commutative scan operators.
2301
+ //! - @blocked
2302
+ //! - @granularity
2303
+ //! - @smemreuse
2304
+ //!
2305
+ //! Snippet
2306
+ //! +++++++
2307
+ //!
2308
+ //! The code snippet below illustrates an inclusive prefix max scan of 512 integer items that
2309
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128 threads
2310
+ //! where each thread owns 4 consecutive items.
2311
+ //!
2312
+ //! .. code-block:: c++
2313
+ //!
2314
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
2315
+ //!
2316
+ //! __global__ void ExampleKernel(...)
2317
+ //! {
2318
+ //! // Specialize BlockScan for a 1D block of 128 threads of type int
2319
+ //! using BlockScan = cub::BlockScan<int, 128>;
2320
+ //!
2321
+ //! // Allocate shared memory for BlockScan
2322
+ //! __shared__ typename BlockScan::TempStorage temp_storage;
2323
+ //!
2324
+ //! // Obtain a segment of consecutive items that are blocked across threads
2325
+ //! int thread_data[4];
2326
+ //! ...
2327
+ //!
2328
+ //! // Collectively compute the block-wide inclusive prefix max scan
2329
+ //! int block_aggregate;
2330
+ //! BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cuda::maximum<>{}, block_aggregate);
2331
+ //!
2332
+ //! Suppose the set of input ``thread_data`` across the block of threads is
2333
+ //! ``{ [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }``.
2334
+ //! The corresponding output ``thread_data`` in those threads will be
2335
+ //! ``{ [0,0,2,2], [4,4,6,6], ..., [508,508,510,510] }``.
2336
+ //! Furthermore the value ``510`` will be stored in ``block_aggregate`` for all threads.
2337
+ //!
2338
+ //! @endrst
2339
+ //!
2340
+ //! @tparam ITEMS_PER_THREAD
2341
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
2342
+ //!
2343
+ //! @tparam ScanOp
2344
+ //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
2345
+ //!
2346
+ //! @param[in] input
2347
+ //! Calling thread's input items
2348
+ //!
2349
+ //! @param[out] output
2350
+ //! Calling thread's output items (may be aliased to `input`)
2351
+ //!
2352
+ //! @param[in] scan_op
2353
+ //! Binary scan functor
2354
+ //!
2355
+ //! @param[out] block_aggregate
2356
+ //! Block-wide aggregate reduction of input items
2357
+ template <int ITEMS_PER_THREAD, typename ScanOp>
2358
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
2359
+ InclusiveScan(T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], ScanOp scan_op, T& block_aggregate)
2360
+ {
2361
+ if (ITEMS_PER_THREAD == 1) // single-element arrays forward to the one-item-per-thread overload
2362
+ {
2363
+ InclusiveScan(input[0], output[0], scan_op, block_aggregate);
2364
+ }
2365
+ else
2366
+ {
2367
+ // Reduce this thread's consecutive items in registers to one partial result (thread_prefix)
2368
+ T thread_prefix = cub::ThreadReduce(input, scan_op);
2369
+
2370
+ // Exclusive thread block-scan (with no initial value); this call is also what writes block_aggregate
2371
+ ExclusiveScan(thread_prefix, thread_prefix, scan_op, block_aggregate);
2372
+
2373
+ // Inclusive scan in registers with prefix as seed; (linear_tid != 0) turns seeding off for the first thread
2374
+ detail::ThreadScanInclusive(input, output, scan_op, thread_prefix, (linear_tid != 0));
2375
+ }
2376
+ }
2377
+
2378
+ //! @rst
2379
+ //! Computes an inclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
2380
+ //! Each thread contributes an array of consecutive input elements. Also provides every thread
2381
+ //! with the block-wide ``block_aggregate`` of all inputs.
2382
+ //!
2383
+ //! - Supports non-commutative scan operators.
2384
+ //! - @blocked
2385
+ //! - @granularity
2386
+ //! - @smemreuse
2387
+ //!
2388
+ //! Snippet
2389
+ //! +++++++
2390
+ //!
2391
+ //! The code snippet below illustrates an inclusive prefix max scan of 128 integer items that
2392
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 64 threads
2393
+ //! where each thread owns 2 consecutive items.
2394
+ //!
2395
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_scan_api.cu
2396
+ //! :language: c++
2397
+ //! :dedent:
2398
+ //! :start-after: example-begin inclusive-scan-array-aggregate-init-value
2399
+ //! :end-before: example-end inclusive-scan-array-aggregate-init-value
2400
+ //!
2401
+ //! The value ``126`` will be stored in ``block_aggregate`` for all threads.
2402
+ //!
2403
+ //! .. note::
2404
+ //!
2405
+ //! ``initial_value`` is not applied to the block-wide aggregate.
2406
+ //!
2407
+ //! @endrst
2408
+ //!
2409
+ //! @tparam ITEMS_PER_THREAD
2410
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
2411
+ //!
2412
+ //! @tparam ScanOp
2413
+ //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
2414
+ //!
2415
+ //! @param[in] input
2416
+ //! Calling thread's input items
2417
+ //!
2418
+ //! @param[out] output
2419
+ //! Calling thread's output items (may be aliased to `input`)
2420
+ //!
2421
+ //! @param[in] initial_value
2422
+ //! Initial value to seed the inclusive scan (uniform across block). It is not taken
2423
+ //! into account for ``block_aggregate``.
2424
+ //!
2425
+ //! @param[in] scan_op
2426
+ //! Binary scan functor
2427
+ //!
2428
+ //! @param[out] block_aggregate
2429
+ //! Block-wide aggregate reduction of input items
2430
+ template <int ITEMS_PER_THREAD, typename ScanOp>
2431
+ _CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveScan(
2432
+ T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], T initial_value, ScanOp scan_op, T& block_aggregate)
2433
+ {
2434
+ // Reduce this thread's consecutive items in registers to one partial result (thread_prefix)
2435
+ T thread_prefix = cub::ThreadReduce(input, scan_op);
2436
+
2437
+ // Exclusive block-wide scan of the per-thread partials, seeded with initial_value; this call is also what writes block_aggregate
2438
+ ExclusiveScan(thread_prefix, thread_prefix, initial_value, scan_op, block_aggregate);
2439
+
2440
+ // Inclusive scan in registers with prefix as seed (every thread seeds; no linear_tid flag is passed here)
2441
+ detail::ThreadScanInclusive(input, output, scan_op, thread_prefix);
2442
+ }
2443
+
2444
+ //! @rst
2445
+ //! Computes an inclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
2446
+ //! Each thread contributes an array of consecutive input elements.
2447
+ //! The call-back functor ``block_prefix_callback_op`` is invoked by the first warp in the block,
2448
+ //! and the value returned by *lane*\ :sub:`0` in that warp is used as the "seed" value that logically prefixes the
2449
+ //! thread block's scan inputs.
2450
+ //!
2451
+ //! - The ``block_prefix_callback_op`` functor must implement a member function ``T operator()(T block_aggregate)``.
2452
+ //! The functor will be invoked by the first warp of threads in the block, however only the return value
2453
+ //! from *lane*\ :sub:`0` is applied as the block-wide prefix. Can be stateful.
2454
+ //! - Supports non-commutative scan operators.
2455
+ //! - @blocked
2456
+ //! - @granularity
2457
+ //! - @smemreuse
2458
+ //!
2459
+ //! Snippet
2460
+ //! +++++++
2461
+ //!
2462
+ //! The code snippet below illustrates a single thread block that progressively
2463
+ //! computes an inclusive prefix max scan over multiple "tiles" of input using a
2464
+ //! prefix functor to maintain a running total between block-wide scans. Each tile consists
2465
+ //! of 128 integer items that are partitioned across 128 threads.
2466
+ //!
2467
+ //! .. code-block:: c++
2468
+ //!
2469
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
2470
+ //!
2471
+ //! // A stateful callback functor that maintains a running prefix to be applied
2472
+ //! // during consecutive scan operations.
2473
+ //! struct BlockPrefixCallbackOp
2474
+ //! {
2475
+ //! // Running prefix
2476
+ //! int running_total;
2477
+ //!
2478
+ //! // Constructor
2479
+ //! __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {}
2480
+ //!
2481
+ //! // Callback operator to be entered by the first warp of threads in the block.
2482
+ //! // Thread-0 is responsible for returning a value for seeding the block-wide scan.
2483
+ //! __device__ int operator()(int block_aggregate)
2484
+ //! {
2485
+ //! int old_prefix = running_total;
2486
+ //! running_total = (block_aggregate > old_prefix) ? block_aggregate : old_prefix;
2487
+ //! return old_prefix;
2488
+ //! }
2489
+ //! };
2490
+ //!
2491
+ //! __global__ void ExampleKernel(int *d_data, int num_items, ...)
2492
+ //! {
2493
+ //! // Specialize BlockLoad, BlockStore, and BlockScan for a 1D block of 128 threads, 4 ints per thread
2494
+ //! using BlockLoad = cub::BlockLoad<int, 128, 4, cub::BLOCK_LOAD_TRANSPOSE>;
2495
+ //! using BlockStore = cub::BlockStore<int, 128, 4, cub::BLOCK_STORE_TRANSPOSE>;
2496
+ //! using BlockScan = cub::BlockScan<int, 128>;
2497
+ //!
2498
+ //! // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan
2499
+ //! __shared__ union {
2500
+ //! typename BlockLoad::TempStorage load;
2501
+ //! typename BlockScan::TempStorage scan;
2502
+ //! typename BlockStore::TempStorage store;
2503
+ //! } temp_storage;
2504
+ //!
2505
+ //! // Initialize running total
2506
+ //! BlockPrefixCallbackOp prefix_op(INT_MIN);
2507
+ //!
2508
+ //! // Have the block iterate over segments of items
2509
+ //! for (int block_offset = 0; block_offset < num_items; block_offset += 128 * 4)
2510
+ //! {
2511
+ //! // Load a segment of consecutive items that are blocked across threads
2512
+ //! int thread_data[4];
2513
+ //! BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data);
2514
+ //! __syncthreads();
2515
+ //!
2516
+ //! // Collectively compute the block-wide inclusive prefix max scan
2517
+ //! BlockScan(temp_storage.scan).InclusiveScan(
2518
+ //! thread_data, thread_data, cuda::maximum<>{}, prefix_op);
2519
+ //! __syncthreads();
2520
+ //!
2521
+ //! // Store scanned items to output segment
2522
+ //! BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data);
2523
+ //! __syncthreads();
2524
+ //! }
2525
+ //!
2526
+ //! Suppose the input ``d_data`` is ``0, -1, 2, -3, 4, -5, ...``.
2527
+ //! The corresponding output for the first segment will be
2528
+ //! ``0, 0, 2, 2, 4, 4, ..., 510, 510``. The output for the second
2529
+ //! segment will be ``512, 512, 514, 514, 516, 516, ..., 1022, 1022``.
2530
+ //!
2531
+ //! @endrst
2532
+ //!
2533
+ //! @tparam ITEMS_PER_THREAD
2534
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
2535
+ //!
2536
+ //! @tparam ScanOp
2537
+ //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
2538
+ //!
2539
+ //! @tparam BlockPrefixCallbackOp
2540
+ //! **[inferred]** Call-back functor type having member `T operator()(T block_aggregate)`
2541
+ //!
2542
+ //! @param[in] input
2543
+ //! Calling thread's input items
2544
+ //!
2545
+ //! @param[out] output
2546
+ //! Calling thread's output items (may be aliased to `input`)
2547
+ //!
2548
+ //! @param[in] scan_op
2549
+ //! Binary scan functor
2550
+ //!
2551
+ //! @param[in,out] block_prefix_callback_op
2552
+ //! @rst
2553
+ //! *warp*\ :sub:`0` only call-back functor for specifying a block-wide prefix to be applied to
2554
+ //! the logical input sequence.
2555
+ //! @endrst
2556
+ template <int ITEMS_PER_THREAD, typename ScanOp, typename BlockPrefixCallbackOp>
2557
+ _CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveScan(
2558
+ T (&input)[ITEMS_PER_THREAD],
2559
+ T (&output)[ITEMS_PER_THREAD],
2560
+ ScanOp scan_op,
2561
+ BlockPrefixCallbackOp& block_prefix_callback_op)
2562
+ {
2563
+ if (ITEMS_PER_THREAD == 1) // single-element arrays forward to the one-item-per-thread overload
2564
+ {
2565
+ InclusiveScan(input[0], output[0], scan_op, block_prefix_callback_op);
2566
+ }
2567
+ else
2568
+ {
2569
+ // Reduce this thread's consecutive items in registers to one partial result (thread_prefix)
2570
+ T thread_prefix = cub::ThreadReduce(input, scan_op);
2571
+
2572
+ // Exclusive block-wide scan of the per-thread partials; the prefix callback is handed to this call, and thread_prefix is overwritten in place
2573
+ ExclusiveScan(thread_prefix, thread_prefix, scan_op, block_prefix_callback_op);
2574
+
2575
+ // Inclusive scan in registers with prefix as seed (every thread seeds; no linear_tid flag is passed here)
2576
+ detail::ThreadScanInclusive(input, output, scan_op, thread_prefix);
2577
+ }
2578
+ }
2579
+
2580
+ //! @} end member group
2581
+ };
2582
+
2583
+ CUB_NAMESPACE_END