cuda-cccl 0.1.3.1.0.dev1486__cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cuda-cccl might be problematic. Click here for more details.

Files changed (1819):
  1. cuda/cccl/__init__.py +14 -0
  2. cuda/cccl/cooperative/__init__.py +3 -0
  3. cuda/cccl/cooperative/experimental/__init__.py +8 -0
  4. cuda/cccl/cooperative/experimental/_caching.py +48 -0
  5. cuda/cccl/cooperative/experimental/_common.py +276 -0
  6. cuda/cccl/cooperative/experimental/_nvrtc.py +91 -0
  7. cuda/cccl/cooperative/experimental/_scan_op.py +181 -0
  8. cuda/cccl/cooperative/experimental/_types.py +953 -0
  9. cuda/cccl/cooperative/experimental/_typing.py +107 -0
  10. cuda/cccl/cooperative/experimental/block/__init__.py +33 -0
  11. cuda/cccl/cooperative/experimental/block/_block_load_store.py +215 -0
  12. cuda/cccl/cooperative/experimental/block/_block_merge_sort.py +125 -0
  13. cuda/cccl/cooperative/experimental/block/_block_radix_sort.py +214 -0
  14. cuda/cccl/cooperative/experimental/block/_block_reduce.py +294 -0
  15. cuda/cccl/cooperative/experimental/block/_block_scan.py +983 -0
  16. cuda/cccl/cooperative/experimental/warp/__init__.py +9 -0
  17. cuda/cccl/cooperative/experimental/warp/_warp_merge_sort.py +98 -0
  18. cuda/cccl/cooperative/experimental/warp/_warp_reduce.py +153 -0
  19. cuda/cccl/cooperative/experimental/warp/_warp_scan.py +78 -0
  20. cuda/cccl/headers/__init__.py +7 -0
  21. cuda/cccl/headers/include/__init__.py +1 -0
  22. cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +261 -0
  23. cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +1181 -0
  24. cuda/cccl/headers/include/cub/agent/agent_for.cuh +84 -0
  25. cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +919 -0
  26. cuda/cccl/headers/include/cub/agent/agent_merge.cuh +227 -0
  27. cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +752 -0
  28. cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +766 -0
  29. cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +286 -0
  30. cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +704 -0
  31. cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +557 -0
  32. cuda/cccl/headers/include/cub/agent/agent_reduce.cuh +678 -0
  33. cuda/cccl/headers/include/cub/agent/agent_reduce_by_key.cuh +804 -0
  34. cuda/cccl/headers/include/cub/agent/agent_rle.cuh +997 -0
  35. cuda/cccl/headers/include/cub/agent/agent_scan.cuh +561 -0
  36. cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +473 -0
  37. cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +292 -0
  38. cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +1032 -0
  39. cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +342 -0
  40. cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +592 -0
  41. cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +614 -0
  42. cuda/cccl/headers/include/cub/agent/single_pass_scan_operators.cuh +1346 -0
  43. cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +965 -0
  44. cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +1217 -0
  45. cuda/cccl/headers/include/cub/block/block_exchange.cuh +1306 -0
  46. cuda/cccl/headers/include/cub/block/block_histogram.cuh +420 -0
  47. cuda/cccl/headers/include/cub/block/block_load.cuh +1259 -0
  48. cuda/cccl/headers/include/cub/block/block_merge_sort.cuh +787 -0
  49. cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +1218 -0
  50. cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +2193 -0
  51. cuda/cccl/headers/include/cub/block/block_raking_layout.cuh +150 -0
  52. cuda/cccl/headers/include/cub/block/block_reduce.cuh +629 -0
  53. cuda/cccl/headers/include/cub/block/block_run_length_decode.cuh +437 -0
  54. cuda/cccl/headers/include/cub/block/block_scan.cuh +2600 -0
  55. cuda/cccl/headers/include/cub/block/block_shuffle.cuh +346 -0
  56. cuda/cccl/headers/include/cub/block/block_store.cuh +1246 -0
  57. cuda/cccl/headers/include/cub/block/radix_rank_sort_operations.cuh +620 -0
  58. cuda/cccl/headers/include/cub/block/specializations/block_histogram_atomic.cuh +86 -0
  59. cuda/cccl/headers/include/cub/block/specializations/block_histogram_sort.cuh +240 -0
  60. cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking.cuh +252 -0
  61. cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking_commutative_only.cuh +238 -0
  62. cuda/cccl/headers/include/cub/block/specializations/block_reduce_warp_reductions.cuh +259 -0
  63. cuda/cccl/headers/include/cub/block/specializations/block_scan_raking.cuh +790 -0
  64. cuda/cccl/headers/include/cub/block/specializations/block_scan_warp_scans.cuh +538 -0
  65. cuda/cccl/headers/include/cub/config.cuh +60 -0
  66. cuda/cccl/headers/include/cub/cub.cuh +112 -0
  67. cuda/cccl/headers/include/cub/detail/array_utils.cuh +77 -0
  68. cuda/cccl/headers/include/cub/detail/choose_offset.cuh +155 -0
  69. cuda/cccl/headers/include/cub/detail/detect_cuda_runtime.cuh +93 -0
  70. cuda/cccl/headers/include/cub/detail/device_double_buffer.cuh +96 -0
  71. cuda/cccl/headers/include/cub/detail/fast_modulo_division.cuh +246 -0
  72. cuda/cccl/headers/include/cub/detail/launcher/cuda_driver.cuh +120 -0
  73. cuda/cccl/headers/include/cub/detail/launcher/cuda_runtime.cuh +74 -0
  74. cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +118 -0
  75. cuda/cccl/headers/include/cub/detail/ptx-json/README.md +71 -0
  76. cuda/cccl/headers/include/cub/detail/ptx-json/array.h +68 -0
  77. cuda/cccl/headers/include/cub/detail/ptx-json/json.h +61 -0
  78. cuda/cccl/headers/include/cub/detail/ptx-json/object.h +100 -0
  79. cuda/cccl/headers/include/cub/detail/ptx-json/string.h +71 -0
  80. cuda/cccl/headers/include/cub/detail/ptx-json/value.h +93 -0
  81. cuda/cccl/headers/include/cub/detail/ptx-json-parser.h +63 -0
  82. cuda/cccl/headers/include/cub/detail/rfa.cuh +724 -0
  83. cuda/cccl/headers/include/cub/detail/strong_load.cuh +189 -0
  84. cuda/cccl/headers/include/cub/detail/strong_store.cuh +220 -0
  85. cuda/cccl/headers/include/cub/detail/temporary_storage.cuh +355 -0
  86. cuda/cccl/headers/include/cub/detail/type_traits.cuh +206 -0
  87. cuda/cccl/headers/include/cub/detail/uninitialized_copy.cuh +72 -0
  88. cuda/cccl/headers/include/cub/detail/unsafe_bitcast.cuh +56 -0
  89. cuda/cccl/headers/include/cub/device/device_adjacent_difference.cuh +596 -0
  90. cuda/cccl/headers/include/cub/device/device_copy.cuh +187 -0
  91. cuda/cccl/headers/include/cub/device/device_for.cuh +994 -0
  92. cuda/cccl/headers/include/cub/device/device_histogram.cuh +1507 -0
  93. cuda/cccl/headers/include/cub/device/device_memcpy.cuh +195 -0
  94. cuda/cccl/headers/include/cub/device/device_merge.cuh +202 -0
  95. cuda/cccl/headers/include/cub/device/device_merge_sort.cuh +979 -0
  96. cuda/cccl/headers/include/cub/device/device_partition.cuh +664 -0
  97. cuda/cccl/headers/include/cub/device/device_radix_sort.cuh +3431 -0
  98. cuda/cccl/headers/include/cub/device/device_reduce.cuh +1387 -0
  99. cuda/cccl/headers/include/cub/device/device_run_length_encode.cuh +368 -0
  100. cuda/cccl/headers/include/cub/device/device_scan.cuh +1901 -0
  101. cuda/cccl/headers/include/cub/device/device_segmented_radix_sort.cuh +1496 -0
  102. cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +1512 -0
  103. cuda/cccl/headers/include/cub/device/device_segmented_sort.cuh +2811 -0
  104. cuda/cccl/headers/include/cub/device/device_select.cuh +1224 -0
  105. cuda/cccl/headers/include/cub/device/device_transform.cuh +313 -0
  106. cuda/cccl/headers/include/cub/device/dispatch/dispatch_adjacent_difference.cuh +314 -0
  107. cuda/cccl/headers/include/cub/device/dispatch/dispatch_advance_iterators.cuh +109 -0
  108. cuda/cccl/headers/include/cub/device/dispatch/dispatch_batch_memcpy.cuh +718 -0
  109. cuda/cccl/headers/include/cub/device/dispatch/dispatch_common.cuh +45 -0
  110. cuda/cccl/headers/include/cub/device/dispatch/dispatch_for.cuh +197 -0
  111. cuda/cccl/headers/include/cub/device/dispatch/dispatch_histogram.cuh +1051 -0
  112. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +305 -0
  113. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge_sort.cuh +473 -0
  114. cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +1748 -0
  115. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +1316 -0
  116. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_by_key.cuh +625 -0
  117. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +502 -0
  118. cuda/cccl/headers/include/cub/device/dispatch/dispatch_rle.cuh +548 -0
  119. cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan.cuh +497 -0
  120. cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan_by_key.cuh +598 -0
  121. cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +1374 -0
  122. cuda/cccl/headers/include/cub/device/dispatch/dispatch_select_if.cuh +838 -0
  123. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +341 -0
  124. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +439 -0
  125. cuda/cccl/headers/include/cub/device/dispatch/dispatch_three_way_partition.cuh +552 -0
  126. cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +397 -0
  127. cuda/cccl/headers/include/cub/device/dispatch/dispatch_unique_by_key.cuh +543 -0
  128. cuda/cccl/headers/include/cub/device/dispatch/kernels/for_each.cuh +218 -0
  129. cuda/cccl/headers/include/cub/device/dispatch/kernels/histogram.cuh +505 -0
  130. cuda/cccl/headers/include/cub/device/dispatch/kernels/merge_sort.cuh +338 -0
  131. cuda/cccl/headers/include/cub/device/dispatch/kernels/radix_sort.cuh +799 -0
  132. cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +523 -0
  133. cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +194 -0
  134. cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +330 -0
  135. cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +437 -0
  136. cuda/cccl/headers/include/cub/device/dispatch/kernels/unique_by_key.cuh +176 -0
  137. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +70 -0
  138. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +121 -0
  139. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +63 -0
  140. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +278 -0
  141. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +91 -0
  142. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +118 -0
  143. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +1068 -0
  144. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce.cuh +397 -0
  145. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +945 -0
  146. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +675 -0
  147. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +555 -0
  148. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +1013 -0
  149. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +249 -0
  150. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_select_if.cuh +1587 -0
  151. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +407 -0
  152. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +283 -0
  153. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +874 -0
  154. cuda/cccl/headers/include/cub/grid/grid_even_share.cuh +215 -0
  155. cuda/cccl/headers/include/cub/grid/grid_mapping.cuh +106 -0
  156. cuda/cccl/headers/include/cub/grid/grid_queue.cuh +202 -0
  157. cuda/cccl/headers/include/cub/iterator/arg_index_input_iterator.cuh +256 -0
  158. cuda/cccl/headers/include/cub/iterator/cache_modified_input_iterator.cuh +238 -0
  159. cuda/cccl/headers/include/cub/iterator/cache_modified_output_iterator.cuh +252 -0
  160. cuda/cccl/headers/include/cub/iterator/tex_obj_input_iterator.cuh +322 -0
  161. cuda/cccl/headers/include/cub/thread/thread_load.cuh +347 -0
  162. cuda/cccl/headers/include/cub/thread/thread_operators.cuh +629 -0
  163. cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +504 -0
  164. cuda/cccl/headers/include/cub/thread/thread_scan.cuh +340 -0
  165. cuda/cccl/headers/include/cub/thread/thread_search.cuh +198 -0
  166. cuda/cccl/headers/include/cub/thread/thread_simd.cuh +406 -0
  167. cuda/cccl/headers/include/cub/thread/thread_sort.cuh +101 -0
  168. cuda/cccl/headers/include/cub/thread/thread_store.cuh +364 -0
  169. cuda/cccl/headers/include/cub/util_allocator.cuh +921 -0
  170. cuda/cccl/headers/include/cub/util_arch.cuh +163 -0
  171. cuda/cccl/headers/include/cub/util_cpp_dialect.cuh +95 -0
  172. cuda/cccl/headers/include/cub/util_debug.cuh +207 -0
  173. cuda/cccl/headers/include/cub/util_device.cuh +779 -0
  174. cuda/cccl/headers/include/cub/util_macro.cuh +91 -0
  175. cuda/cccl/headers/include/cub/util_math.cuh +115 -0
  176. cuda/cccl/headers/include/cub/util_namespace.cuh +176 -0
  177. cuda/cccl/headers/include/cub/util_policy_wrapper_t.cuh +55 -0
  178. cuda/cccl/headers/include/cub/util_ptx.cuh +513 -0
  179. cuda/cccl/headers/include/cub/util_temporary_storage.cuh +122 -0
  180. cuda/cccl/headers/include/cub/util_type.cuh +1111 -0
  181. cuda/cccl/headers/include/cub/util_vsmem.cuh +251 -0
  182. cuda/cccl/headers/include/cub/version.cuh +89 -0
  183. cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_shfl.cuh +329 -0
  184. cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_smem.cuh +177 -0
  185. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +729 -0
  186. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +405 -0
  187. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +688 -0
  188. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_smem.cuh +437 -0
  189. cuda/cccl/headers/include/cub/warp/warp_exchange.cuh +405 -0
  190. cuda/cccl/headers/include/cub/warp/warp_load.cuh +614 -0
  191. cuda/cccl/headers/include/cub/warp/warp_merge_sort.cuh +169 -0
  192. cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +822 -0
  193. cuda/cccl/headers/include/cub/warp/warp_scan.cuh +1156 -0
  194. cuda/cccl/headers/include/cub/warp/warp_store.cuh +520 -0
  195. cuda/cccl/headers/include/cuda/__annotated_ptr/access_property.h +169 -0
  196. cuda/cccl/headers/include/cuda/__annotated_ptr/access_property_encoding.h +172 -0
  197. cuda/cccl/headers/include/cuda/__annotated_ptr/annotated_ptr.h +210 -0
  198. cuda/cccl/headers/include/cuda/__annotated_ptr/annotated_ptr_base.h +100 -0
  199. cuda/cccl/headers/include/cuda/__annotated_ptr/apply_access_property.h +84 -0
  200. cuda/cccl/headers/include/cuda/__annotated_ptr/associate_access_property.h +127 -0
  201. cuda/cccl/headers/include/cuda/__annotated_ptr/createpolicy.h +209 -0
  202. cuda/cccl/headers/include/cuda/__atomic/atomic.h +145 -0
  203. cuda/cccl/headers/include/cuda/__barrier/aligned_size.h +61 -0
  204. cuda/cccl/headers/include/cuda/__barrier/async_contract_fulfillment.h +39 -0
  205. cuda/cccl/headers/include/cuda/__barrier/barrier.h +66 -0
  206. cuda/cccl/headers/include/cuda/__barrier/barrier_arrive_tx.h +100 -0
  207. cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +454 -0
  208. cuda/cccl/headers/include/cuda/__barrier/barrier_expect_tx.h +72 -0
  209. cuda/cccl/headers/include/cuda/__barrier/barrier_native_handle.h +45 -0
  210. cuda/cccl/headers/include/cuda/__barrier/barrier_thread_scope.h +61 -0
  211. cuda/cccl/headers/include/cuda/__bit/bit_reverse.h +171 -0
  212. cuda/cccl/headers/include/cuda/__bit/bitfield.h +122 -0
  213. cuda/cccl/headers/include/cuda/__bit/bitmask.h +88 -0
  214. cuda/cccl/headers/include/cuda/__cccl_config +36 -0
  215. cuda/cccl/headers/include/cuda/__cmath/ceil_div.h +126 -0
  216. cuda/cccl/headers/include/cuda/__cmath/ilog.h +195 -0
  217. cuda/cccl/headers/include/cuda/__cmath/ipow.h +107 -0
  218. cuda/cccl/headers/include/cuda/__cmath/isqrt.h +80 -0
  219. cuda/cccl/headers/include/cuda/__cmath/neg.h +47 -0
  220. cuda/cccl/headers/include/cuda/__cmath/pow2.h +74 -0
  221. cuda/cccl/headers/include/cuda/__cmath/round_down.h +104 -0
  222. cuda/cccl/headers/include/cuda/__cmath/round_up.h +106 -0
  223. cuda/cccl/headers/include/cuda/__cmath/uabs.h +57 -0
  224. cuda/cccl/headers/include/cuda/__execution/determinism.h +90 -0
  225. cuda/cccl/headers/include/cuda/__execution/require.h +67 -0
  226. cuda/cccl/headers/include/cuda/__execution/tune.h +62 -0
  227. cuda/cccl/headers/include/cuda/__functional/address_stability.h +131 -0
  228. cuda/cccl/headers/include/cuda/__functional/for_each_canceled.h +279 -0
  229. cuda/cccl/headers/include/cuda/__functional/get_device_address.h +58 -0
  230. cuda/cccl/headers/include/cuda/__functional/maximum.h +58 -0
  231. cuda/cccl/headers/include/cuda/__functional/minimum.h +58 -0
  232. cuda/cccl/headers/include/cuda/__functional/proclaim_return_type.h +108 -0
  233. cuda/cccl/headers/include/cuda/__fwd/barrier.h +38 -0
  234. cuda/cccl/headers/include/cuda/__fwd/barrier_native_handle.h +42 -0
  235. cuda/cccl/headers/include/cuda/__fwd/get_stream.h +38 -0
  236. cuda/cccl/headers/include/cuda/__fwd/pipeline.h +37 -0
  237. cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +261 -0
  238. cuda/cccl/headers/include/cuda/__iterator/counting_iterator.h +407 -0
  239. cuda/cccl/headers/include/cuda/__iterator/discard_iterator.h +314 -0
  240. cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +323 -0
  241. cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +481 -0
  242. cuda/cccl/headers/include/cuda/__latch/latch.h +44 -0
  243. cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +457 -0
  244. cuda/cccl/headers/include/cuda/__mdspan/host_device_mdspan.h +63 -0
  245. cuda/cccl/headers/include/cuda/__mdspan/restrict_accessor.h +123 -0
  246. cuda/cccl/headers/include/cuda/__mdspan/restrict_mdspan.h +51 -0
  247. cuda/cccl/headers/include/cuda/__memcpy_async/check_preconditions.h +79 -0
  248. cuda/cccl/headers/include/cuda/__memcpy_async/completion_mechanism.h +47 -0
  249. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_bulk_shared_global.h +60 -0
  250. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_fallback.h +72 -0
  251. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_shared_global.h +98 -0
  252. cuda/cccl/headers/include/cuda/__memcpy_async/dispatch_memcpy_async.h +162 -0
  253. cuda/cccl/headers/include/cuda/__memcpy_async/is_local_smem_barrier.h +49 -0
  254. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async.h +179 -0
  255. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_barrier.h +99 -0
  256. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_tx.h +99 -0
  257. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_completion.h +170 -0
  258. cuda/cccl/headers/include/cuda/__memcpy_async/try_get_barrier_handle.h +59 -0
  259. cuda/cccl/headers/include/cuda/__memory/address_space.h +86 -0
  260. cuda/cccl/headers/include/cuda/__memory_resource/get_memory_resource.h +94 -0
  261. cuda/cccl/headers/include/cuda/__memory_resource/get_property.h +158 -0
  262. cuda/cccl/headers/include/cuda/__memory_resource/properties.h +73 -0
  263. cuda/cccl/headers/include/cuda/__memory_resource/resource.h +129 -0
  264. cuda/cccl/headers/include/cuda/__memory_resource/resource_ref.h +653 -0
  265. cuda/cccl/headers/include/cuda/__numeric/narrow.h +108 -0
  266. cuda/cccl/headers/include/cuda/__numeric/overflow_cast.h +57 -0
  267. cuda/cccl/headers/include/cuda/__numeric/overflow_result.h +43 -0
  268. cuda/cccl/headers/include/cuda/__nvtx/nvtx.h +101 -0
  269. cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +2982 -0
  270. cuda/cccl/headers/include/cuda/__ptx/instructions/barrier_cluster.h +43 -0
  271. cuda/cccl/headers/include/cuda/__ptx/instructions/bfind.h +41 -0
  272. cuda/cccl/headers/include/cuda/__ptx/instructions/bmsk.h +41 -0
  273. cuda/cccl/headers/include/cuda/__ptx/instructions/clusterlaunchcontrol.h +41 -0
  274. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk.h +44 -0
  275. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_commit_group.h +43 -0
  276. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h +45 -0
  277. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_wait_group.h +43 -0
  278. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_mbarrier_arrive.h +42 -0
  279. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_reduce_async_bulk.h +60 -0
  280. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_reduce_async_bulk_tensor.h +43 -0
  281. cuda/cccl/headers/include/cuda/__ptx/instructions/elect_sync.h +41 -0
  282. cuda/cccl/headers/include/cuda/__ptx/instructions/exit.h +41 -0
  283. cuda/cccl/headers/include/cuda/__ptx/instructions/fence.h +49 -0
  284. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/barrier_cluster.h +115 -0
  285. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/bfind.h +190 -0
  286. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/bmsk.h +54 -0
  287. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/clusterlaunchcontrol.h +240 -0
  288. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk.h +193 -0
  289. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.h +25 -0
  290. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.h +52 -0
  291. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.h +957 -0
  292. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_gather_scatter.h +288 -0
  293. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.h +596 -0
  294. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.h +46 -0
  295. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive.h +26 -0
  296. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive_noinc.h +26 -0
  297. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.h +1445 -0
  298. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.h +132 -0
  299. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.h +117 -0
  300. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.h +601 -0
  301. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/elect_sync.h +36 -0
  302. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/exit.h +25 -0
  303. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence.h +208 -0
  304. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.h +31 -0
  305. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_alias.h +25 -0
  306. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_async.h +58 -0
  307. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_async_generic_sync_restrict.h +62 -0
  308. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.h +101 -0
  309. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_sync_restrict.h +62 -0
  310. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/get_sreg.h +949 -0
  311. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/getctarank.h +32 -0
  312. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/ld.h +15074 -0
  313. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive.h +385 -0
  314. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.h +176 -0
  315. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.h +34 -0
  316. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_expect_tx.h +94 -0
  317. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_init.h +27 -0
  318. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.h +137 -0
  319. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.h +138 -0
  320. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.h +280 -0
  321. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.h +282 -0
  322. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_ld_reduce.h +2148 -0
  323. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_red.h +1272 -0
  324. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_st.h +228 -0
  325. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/prmt.h +230 -0
  326. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/red_async.h +430 -0
  327. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/shl.h +96 -0
  328. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/shr.h +168 -0
  329. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st.h +1830 -0
  330. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st_async.h +123 -0
  331. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st_bulk.h +31 -0
  332. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_alloc.h +105 -0
  333. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_commit.h +81 -0
  334. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_cp.h +612 -0
  335. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_fence.h +44 -0
  336. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_ld.h +4446 -0
  337. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_mma.h +4061 -0
  338. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_mma_ws.h +6438 -0
  339. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_shift.h +36 -0
  340. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_st.h +4582 -0
  341. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_wait.h +44 -0
  342. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.h +67 -0
  343. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tensormap_replace.h +750 -0
  344. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/trap.h +25 -0
  345. cuda/cccl/headers/include/cuda/__ptx/instructions/get_sreg.h +43 -0
  346. cuda/cccl/headers/include/cuda/__ptx/instructions/getctarank.h +43 -0
  347. cuda/cccl/headers/include/cuda/__ptx/instructions/ld.h +41 -0
  348. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_arrive.h +45 -0
  349. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_expect_tx.h +41 -0
  350. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_init.h +43 -0
  351. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_wait.h +46 -0
  352. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_ld_reduce.h +41 -0
  353. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_red.h +41 -0
  354. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_st.h +41 -0
  355. cuda/cccl/headers/include/cuda/__ptx/instructions/prmt.h +41 -0
  356. cuda/cccl/headers/include/cuda/__ptx/instructions/red_async.h +43 -0
  357. cuda/cccl/headers/include/cuda/__ptx/instructions/shfl_sync.h +275 -0
  358. cuda/cccl/headers/include/cuda/__ptx/instructions/shl.h +41 -0
  359. cuda/cccl/headers/include/cuda/__ptx/instructions/shr.h +41 -0
  360. cuda/cccl/headers/include/cuda/__ptx/instructions/st.h +41 -0
  361. cuda/cccl/headers/include/cuda/__ptx/instructions/st_async.h +43 -0
  362. cuda/cccl/headers/include/cuda/__ptx/instructions/st_bulk.h +41 -0
  363. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_alloc.h +41 -0
  364. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_commit.h +41 -0
  365. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_cp.h +41 -0
  366. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_fence.h +41 -0
  367. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_ld.h +41 -0
  368. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_mma.h +41 -0
  369. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_mma_ws.h +41 -0
  370. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_shift.h +41 -0
  371. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_st.h +41 -0
  372. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_wait.h +41 -0
  373. cuda/cccl/headers/include/cuda/__ptx/instructions/tensormap_cp_fenceproxy.h +43 -0
  374. cuda/cccl/headers/include/cuda/__ptx/instructions/tensormap_replace.h +43 -0
  375. cuda/cccl/headers/include/cuda/__ptx/instructions/trap.h +41 -0
  376. cuda/cccl/headers/include/cuda/__ptx/ptx_dot_variants.h +230 -0
  377. cuda/cccl/headers/include/cuda/__ptx/ptx_helper_functions.h +151 -0
  378. cuda/cccl/headers/include/cuda/__semaphore/counting_semaphore.h +53 -0
  379. cuda/cccl/headers/include/cuda/__stream/get_stream.h +97 -0
  380. cuda/cccl/headers/include/cuda/__stream/stream_ref.h +165 -0
  381. cuda/cccl/headers/include/cuda/__type_traits/is_floating_point.h +47 -0
  382. cuda/cccl/headers/include/cuda/__warp/lane_mask.h +326 -0
  383. cuda/cccl/headers/include/cuda/__warp/warp_match_all.h +66 -0
  384. cuda/cccl/headers/include/cuda/__warp/warp_shuffle.h +249 -0
  385. cuda/cccl/headers/include/cuda/access_property +26 -0
  386. cuda/cccl/headers/include/cuda/annotated_ptr +29 -0
  387. cuda/cccl/headers/include/cuda/atomic +27 -0
  388. cuda/cccl/headers/include/cuda/barrier +262 -0
  389. cuda/cccl/headers/include/cuda/bit +29 -0
  390. cuda/cccl/headers/include/cuda/cmath +35 -0
  391. cuda/cccl/headers/include/cuda/discard_memory +61 -0
  392. cuda/cccl/headers/include/cuda/functional +31 -0
  393. cuda/cccl/headers/include/cuda/iterator +31 -0
  394. cuda/cccl/headers/include/cuda/latch +27 -0
  395. cuda/cccl/headers/include/cuda/mdspan +28 -0
  396. cuda/cccl/headers/include/cuda/memory +28 -0
  397. cuda/cccl/headers/include/cuda/memory_resource +41 -0
  398. cuda/cccl/headers/include/cuda/numeric +28 -0
  399. cuda/cccl/headers/include/cuda/pipeline +579 -0
  400. cuda/cccl/headers/include/cuda/ptx +118 -0
  401. cuda/cccl/headers/include/cuda/semaphore +31 -0
  402. cuda/cccl/headers/include/cuda/std/__algorithm/adjacent_find.h +60 -0
  403. cuda/cccl/headers/include/cuda/std/__algorithm/all_of.h +46 -0
  404. cuda/cccl/headers/include/cuda/std/__algorithm/any_of.h +46 -0
  405. cuda/cccl/headers/include/cuda/std/__algorithm/binary_search.h +52 -0
  406. cuda/cccl/headers/include/cuda/std/__algorithm/clamp.h +48 -0
  407. cuda/cccl/headers/include/cuda/std/__algorithm/comp.h +64 -0
  408. cuda/cccl/headers/include/cuda/std/__algorithm/comp_ref_type.h +85 -0
  409. cuda/cccl/headers/include/cuda/std/__algorithm/copy.h +143 -0
  410. cuda/cccl/headers/include/cuda/std/__algorithm/copy_backward.h +79 -0
  411. cuda/cccl/headers/include/cuda/std/__algorithm/copy_if.h +47 -0
  412. cuda/cccl/headers/include/cuda/std/__algorithm/copy_n.h +74 -0
  413. cuda/cccl/headers/include/cuda/std/__algorithm/count.h +49 -0
  414. cuda/cccl/headers/include/cuda/std/__algorithm/count_if.h +49 -0
  415. cuda/cccl/headers/include/cuda/std/__algorithm/equal.h +129 -0
  416. cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +101 -0
  417. cuda/cccl/headers/include/cuda/std/__algorithm/fill.h +58 -0
  418. cuda/cccl/headers/include/cuda/std/__algorithm/fill_n.h +51 -0
  419. cuda/cccl/headers/include/cuda/std/__algorithm/find.h +64 -0
  420. cuda/cccl/headers/include/cuda/std/__algorithm/find_end.h +225 -0
  421. cuda/cccl/headers/include/cuda/std/__algorithm/find_first_of.h +73 -0
  422. cuda/cccl/headers/include/cuda/std/__algorithm/find_if.h +46 -0
  423. cuda/cccl/headers/include/cuda/std/__algorithm/find_if_not.h +46 -0
  424. cuda/cccl/headers/include/cuda/std/__algorithm/for_each.h +42 -0
  425. cuda/cccl/headers/include/cuda/std/__algorithm/for_each_n.h +48 -0
  426. cuda/cccl/headers/include/cuda/std/__algorithm/generate.h +41 -0
  427. cuda/cccl/headers/include/cuda/std/__algorithm/generate_n.h +46 -0
  428. cuda/cccl/headers/include/cuda/std/__algorithm/half_positive.h +49 -0
  429. cuda/cccl/headers/include/cuda/std/__algorithm/includes.h +92 -0
  430. cuda/cccl/headers/include/cuda/std/__algorithm/is_heap.h +51 -0
  431. cuda/cccl/headers/include/cuda/std/__algorithm/is_heap_until.h +83 -0
  432. cuda/cccl/headers/include/cuda/std/__algorithm/is_partitioned.h +58 -0
  433. cuda/cccl/headers/include/cuda/std/__algorithm/is_permutation.h +252 -0
  434. cuda/cccl/headers/include/cuda/std/__algorithm/is_sorted.h +50 -0
  435. cuda/cccl/headers/include/cuda/std/__algorithm/is_sorted_until.h +69 -0
  436. cuda/cccl/headers/include/cuda/std/__algorithm/iter_swap.h +82 -0
  437. cuda/cccl/headers/include/cuda/std/__algorithm/iterator_operations.h +188 -0
  438. cuda/cccl/headers/include/cuda/std/__algorithm/lexicographical_compare.h +68 -0
  439. cuda/cccl/headers/include/cuda/std/__algorithm/lower_bound.h +83 -0
  440. cuda/cccl/headers/include/cuda/std/__algorithm/make_heap.h +72 -0
  441. cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +96 -0
  442. cuda/cccl/headers/include/cuda/std/__algorithm/max.h +62 -0
  443. cuda/cccl/headers/include/cuda/std/__algorithm/max_element.h +70 -0
  444. cuda/cccl/headers/include/cuda/std/__algorithm/merge.h +89 -0
  445. cuda/cccl/headers/include/cuda/std/__algorithm/min.h +62 -0
  446. cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +88 -0
  447. cuda/cccl/headers/include/cuda/std/__algorithm/minmax.h +71 -0
  448. cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +141 -0
  449. cuda/cccl/headers/include/cuda/std/__algorithm/mismatch.h +83 -0
  450. cuda/cccl/headers/include/cuda/std/__algorithm/move.h +88 -0
  451. cuda/cccl/headers/include/cuda/std/__algorithm/move_backward.h +84 -0
  452. cuda/cccl/headers/include/cuda/std/__algorithm/next_permutation.h +89 -0
  453. cuda/cccl/headers/include/cuda/std/__algorithm/none_of.h +46 -0
  454. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort.h +102 -0
  455. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort_copy.h +122 -0
  456. cuda/cccl/headers/include/cuda/std/__algorithm/partition.h +121 -0
  457. cuda/cccl/headers/include/cuda/std/__algorithm/partition_copy.h +59 -0
  458. cuda/cccl/headers/include/cuda/std/__algorithm/partition_point.h +61 -0
  459. cuda/cccl/headers/include/cuda/std/__algorithm/pop_heap.h +95 -0
  460. cuda/cccl/headers/include/cuda/std/__algorithm/prev_permutation.h +89 -0
  461. cuda/cccl/headers/include/cuda/std/__algorithm/push_heap.h +103 -0
  462. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_iterator_concept.h +65 -0
  463. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_min.h +99 -0
  464. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_min_element.h +69 -0
  465. cuda/cccl/headers/include/cuda/std/__algorithm/remove.h +55 -0
  466. cuda/cccl/headers/include/cuda/std/__algorithm/remove_copy.h +47 -0
  467. cuda/cccl/headers/include/cuda/std/__algorithm/remove_copy_if.h +47 -0
  468. cuda/cccl/headers/include/cuda/std/__algorithm/remove_if.h +56 -0
  469. cuda/cccl/headers/include/cuda/std/__algorithm/replace.h +45 -0
  470. cuda/cccl/headers/include/cuda/std/__algorithm/replace_copy.h +54 -0
  471. cuda/cccl/headers/include/cuda/std/__algorithm/replace_copy_if.h +50 -0
  472. cuda/cccl/headers/include/cuda/std/__algorithm/replace_if.h +45 -0
  473. cuda/cccl/headers/include/cuda/std/__algorithm/reverse.h +81 -0
  474. cuda/cccl/headers/include/cuda/std/__algorithm/reverse_copy.h +43 -0
  475. cuda/cccl/headers/include/cuda/std/__algorithm/rotate.h +264 -0
  476. cuda/cccl/headers/include/cuda/std/__algorithm/rotate_copy.h +40 -0
  477. cuda/cccl/headers/include/cuda/std/__algorithm/search.h +185 -0
  478. cuda/cccl/headers/include/cuda/std/__algorithm/search_n.h +163 -0
  479. cuda/cccl/headers/include/cuda/std/__algorithm/set_difference.h +95 -0
  480. cuda/cccl/headers/include/cuda/std/__algorithm/set_intersection.h +123 -0
  481. cuda/cccl/headers/include/cuda/std/__algorithm/set_symmetric_difference.h +135 -0
  482. cuda/cccl/headers/include/cuda/std/__algorithm/set_union.h +129 -0
  483. cuda/cccl/headers/include/cuda/std/__algorithm/shift_left.h +84 -0
  484. cuda/cccl/headers/include/cuda/std/__algorithm/shift_right.h +144 -0
  485. cuda/cccl/headers/include/cuda/std/__algorithm/sift_down.h +139 -0
  486. cuda/cccl/headers/include/cuda/std/__algorithm/sort_heap.h +72 -0
  487. cuda/cccl/headers/include/cuda/std/__algorithm/swap_ranges.h +78 -0
  488. cuda/cccl/headers/include/cuda/std/__algorithm/transform.h +59 -0
  489. cuda/cccl/headers/include/cuda/std/__algorithm/unique.h +77 -0
  490. cuda/cccl/headers/include/cuda/std/__algorithm/unique_copy.h +156 -0
  491. cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_iter.h +96 -0
  492. cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_range.h +127 -0
  493. cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +83 -0
  494. cuda/cccl/headers/include/cuda/std/__algorithm_ +26 -0
  495. cuda/cccl/headers/include/cuda/std/__atomic/api/common.h +192 -0
  496. cuda/cccl/headers/include/cuda/std/__atomic/api/owned.h +138 -0
  497. cuda/cccl/headers/include/cuda/std/__atomic/api/reference.h +118 -0
  498. cuda/cccl/headers/include/cuda/std/__atomic/functions/common.h +58 -0
  499. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_local.h +218 -0
  500. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_derived.h +401 -0
  501. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated.h +3971 -0
  502. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h +177 -0
  503. cuda/cccl/headers/include/cuda/std/__atomic/functions/host.h +211 -0
  504. cuda/cccl/headers/include/cuda/std/__atomic/functions.h +33 -0
  505. cuda/cccl/headers/include/cuda/std/__atomic/order.h +159 -0
  506. cuda/cccl/headers/include/cuda/std/__atomic/platform/msvc_to_builtins.h +654 -0
  507. cuda/cccl/headers/include/cuda/std/__atomic/platform.h +93 -0
  508. cuda/cccl/headers/include/cuda/std/__atomic/scopes.h +105 -0
  509. cuda/cccl/headers/include/cuda/std/__atomic/types/base.h +250 -0
  510. cuda/cccl/headers/include/cuda/std/__atomic/types/common.h +105 -0
  511. cuda/cccl/headers/include/cuda/std/__atomic/types/locked.h +225 -0
  512. cuda/cccl/headers/include/cuda/std/__atomic/types/reference.h +73 -0
  513. cuda/cccl/headers/include/cuda/std/__atomic/types/small.h +228 -0
  514. cuda/cccl/headers/include/cuda/std/__atomic/types.h +52 -0
  515. cuda/cccl/headers/include/cuda/std/__atomic/wait/notify_wait.h +95 -0
  516. cuda/cccl/headers/include/cuda/std/__atomic/wait/polling.h +65 -0
  517. cuda/cccl/headers/include/cuda/std/__barrier/barrier.h +227 -0
  518. cuda/cccl/headers/include/cuda/std/__barrier/empty_completion.h +37 -0
  519. cuda/cccl/headers/include/cuda/std/__barrier/poll_tester.h +84 -0
  520. cuda/cccl/headers/include/cuda/std/__bit/bit_cast.h +77 -0
  521. cuda/cccl/headers/include/cuda/std/__bit/byteswap.h +183 -0
  522. cuda/cccl/headers/include/cuda/std/__bit/countl.h +167 -0
  523. cuda/cccl/headers/include/cuda/std/__bit/countr.h +185 -0
  524. cuda/cccl/headers/include/cuda/std/__bit/endian.h +39 -0
  525. cuda/cccl/headers/include/cuda/std/__bit/has_single_bit.h +43 -0
  526. cuda/cccl/headers/include/cuda/std/__bit/integral.h +124 -0
  527. cuda/cccl/headers/include/cuda/std/__bit/popcount.h +154 -0
  528. cuda/cccl/headers/include/cuda/std/__bit/reference.h +1274 -0
  529. cuda/cccl/headers/include/cuda/std/__bit/rotate.h +94 -0
  530. cuda/cccl/headers/include/cuda/std/__cccl/architecture.h +78 -0
  531. cuda/cccl/headers/include/cuda/std/__cccl/assert.h +146 -0
  532. cuda/cccl/headers/include/cuda/std/__cccl/attributes.h +207 -0
  533. cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +1343 -0
  534. cuda/cccl/headers/include/cuda/std/__cccl/compiler.h +216 -0
  535. cuda/cccl/headers/include/cuda/std/__cccl/cuda_capabilities.h +43 -0
  536. cuda/cccl/headers/include/cuda/std/__cccl/cuda_toolkit.h +53 -0
  537. cuda/cccl/headers/include/cuda/std/__cccl/deprecated.h +69 -0
  538. cuda/cccl/headers/include/cuda/std/__cccl/diagnostic.h +129 -0
  539. cuda/cccl/headers/include/cuda/std/__cccl/dialect.h +124 -0
  540. cuda/cccl/headers/include/cuda/std/__cccl/epilogue.h +326 -0
  541. cuda/cccl/headers/include/cuda/std/__cccl/exceptions.h +35 -0
  542. cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +68 -0
  543. cuda/cccl/headers/include/cuda/std/__cccl/extended_data_types.h +129 -0
  544. cuda/cccl/headers/include/cuda/std/__cccl/is_non_narrowing_convertible.h +73 -0
  545. cuda/cccl/headers/include/cuda/std/__cccl/os.h +48 -0
  546. cuda/cccl/headers/include/cuda/std/__cccl/preprocessor.h +1234 -0
  547. cuda/cccl/headers/include/cuda/std/__cccl/prologue.h +267 -0
  548. cuda/cccl/headers/include/cuda/std/__cccl/ptx_isa.h +176 -0
  549. cuda/cccl/headers/include/cuda/std/__cccl/rtti.h +72 -0
  550. cuda/cccl/headers/include/cuda/std/__cccl/sequence_access.h +87 -0
  551. cuda/cccl/headers/include/cuda/std/__cccl/system_header.h +38 -0
  552. cuda/cccl/headers/include/cuda/std/__cccl/unreachable.h +31 -0
  553. cuda/cccl/headers/include/cuda/std/__cccl/version.h +26 -0
  554. cuda/cccl/headers/include/cuda/std/__cccl/visibility.h +112 -0
  555. cuda/cccl/headers/include/cuda/std/__charconv/chars_format.h +81 -0
  556. cuda/cccl/headers/include/cuda/std/__charconv/from_chars_result.h +56 -0
  557. cuda/cccl/headers/include/cuda/std/__charconv/to_chars.h +148 -0
  558. cuda/cccl/headers/include/cuda/std/__charconv/to_chars_result.h +56 -0
  559. cuda/cccl/headers/include/cuda/std/__charconv_ +30 -0
  560. cuda/cccl/headers/include/cuda/std/__cmath/abs.h +240 -0
  561. cuda/cccl/headers/include/cuda/std/__cmath/copysign.h +187 -0
  562. cuda/cccl/headers/include/cuda/std/__cmath/exponential_functions.h +620 -0
  563. cuda/cccl/headers/include/cuda/std/__cmath/fpclassify.h +207 -0
  564. cuda/cccl/headers/include/cuda/std/__cmath/gamma.h +181 -0
  565. cuda/cccl/headers/include/cuda/std/__cmath/hyperbolic_functions.h +250 -0
  566. cuda/cccl/headers/include/cuda/std/__cmath/hypot.h +213 -0
  567. cuda/cccl/headers/include/cuda/std/__cmath/inverse_hyperbolic_functions.h +250 -0
  568. cuda/cccl/headers/include/cuda/std/__cmath/inverse_trigonometric_functions.h +323 -0
  569. cuda/cccl/headers/include/cuda/std/__cmath/isfinite.h +163 -0
  570. cuda/cccl/headers/include/cuda/std/__cmath/isinf.h +201 -0
  571. cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +176 -0
  572. cuda/cccl/headers/include/cuda/std/__cmath/isnormal.h +129 -0
  573. cuda/cccl/headers/include/cuda/std/__cmath/lerp.h +106 -0
  574. cuda/cccl/headers/include/cuda/std/__cmath/logarithms.h +503 -0
  575. cuda/cccl/headers/include/cuda/std/__cmath/min_max.h +236 -0
  576. cuda/cccl/headers/include/cuda/std/__cmath/nvbf16.h +58 -0
  577. cuda/cccl/headers/include/cuda/std/__cmath/nvfp16.h +58 -0
  578. cuda/cccl/headers/include/cuda/std/__cmath/roots.h +180 -0
  579. cuda/cccl/headers/include/cuda/std/__cmath/rounding_functions.h +877 -0
  580. cuda/cccl/headers/include/cuda/std/__cmath/signbit.h +155 -0
  581. cuda/cccl/headers/include/cuda/std/__cmath/traits.h +170 -0
  582. cuda/cccl/headers/include/cuda/std/__cmath/trigonometric_functions.h +292 -0
  583. cuda/cccl/headers/include/cuda/std/__complex/nvbf16.h +351 -0
  584. cuda/cccl/headers/include/cuda/std/__complex/nvfp16.h +350 -0
  585. cuda/cccl/headers/include/cuda/std/__complex/vector_support.h +135 -0
  586. cuda/cccl/headers/include/cuda/std/__concepts/arithmetic.h +56 -0
  587. cuda/cccl/headers/include/cuda/std/__concepts/assignable.h +64 -0
  588. cuda/cccl/headers/include/cuda/std/__concepts/boolean_testable.h +63 -0
  589. cuda/cccl/headers/include/cuda/std/__concepts/class_or_enum.h +46 -0
  590. cuda/cccl/headers/include/cuda/std/__concepts/common_reference_with.h +69 -0
  591. cuda/cccl/headers/include/cuda/std/__concepts/common_with.h +82 -0
  592. cuda/cccl/headers/include/cuda/std/__concepts/concept_macros.h +274 -0
  593. cuda/cccl/headers/include/cuda/std/__concepts/constructible.h +107 -0
  594. cuda/cccl/headers/include/cuda/std/__concepts/convertible_to.h +71 -0
  595. cuda/cccl/headers/include/cuda/std/__concepts/copyable.h +60 -0
  596. cuda/cccl/headers/include/cuda/std/__concepts/derived_from.h +57 -0
  597. cuda/cccl/headers/include/cuda/std/__concepts/destructible.h +76 -0
  598. cuda/cccl/headers/include/cuda/std/__concepts/different_from.h +38 -0
  599. cuda/cccl/headers/include/cuda/std/__concepts/equality_comparable.h +100 -0
  600. cuda/cccl/headers/include/cuda/std/__concepts/invocable.h +80 -0
  601. cuda/cccl/headers/include/cuda/std/__concepts/movable.h +58 -0
  602. cuda/cccl/headers/include/cuda/std/__concepts/predicate.h +54 -0
  603. cuda/cccl/headers/include/cuda/std/__concepts/regular.h +54 -0
  604. cuda/cccl/headers/include/cuda/std/__concepts/relation.h +77 -0
  605. cuda/cccl/headers/include/cuda/std/__concepts/same_as.h +42 -0
  606. cuda/cccl/headers/include/cuda/std/__concepts/semiregular.h +54 -0
  607. cuda/cccl/headers/include/cuda/std/__concepts/swappable.h +206 -0
  608. cuda/cccl/headers/include/cuda/std/__concepts/totally_ordered.h +101 -0
  609. cuda/cccl/headers/include/cuda/std/__cstddef/byte.h +113 -0
  610. cuda/cccl/headers/include/cuda/std/__cstddef/types.h +52 -0
  611. cuda/cccl/headers/include/cuda/std/__cstdlib/abs.h +57 -0
  612. cuda/cccl/headers/include/cuda/std/__cstdlib/aligned_alloc.h +66 -0
  613. cuda/cccl/headers/include/cuda/std/__cstdlib/div.h +96 -0
  614. cuda/cccl/headers/include/cuda/std/__cstdlib/malloc.h +69 -0
  615. cuda/cccl/headers/include/cuda/std/__cuda/api_wrapper.h +62 -0
  616. cuda/cccl/headers/include/cuda/std/__cuda/ensure_current_device.h +72 -0
  617. cuda/cccl/headers/include/cuda/std/__exception/cuda_error.h +143 -0
  618. cuda/cccl/headers/include/cuda/std/__exception/terminate.h +73 -0
  619. cuda/cccl/headers/include/cuda/std/__execution/env.h +436 -0
  620. cuda/cccl/headers/include/cuda/std/__expected/bad_expected_access.h +127 -0
  621. cuda/cccl/headers/include/cuda/std/__expected/expected.h +2002 -0
  622. cuda/cccl/headers/include/cuda/std/__expected/expected_base.h +1078 -0
  623. cuda/cccl/headers/include/cuda/std/__expected/unexpect.h +37 -0
  624. cuda/cccl/headers/include/cuda/std/__expected/unexpected.h +178 -0
  625. cuda/cccl/headers/include/cuda/std/__floating_point/arithmetic.h +56 -0
  626. cuda/cccl/headers/include/cuda/std/__floating_point/cast.h +809 -0
  627. cuda/cccl/headers/include/cuda/std/__floating_point/cccl_fp.h +125 -0
  628. cuda/cccl/headers/include/cuda/std/__floating_point/common_type.h +48 -0
  629. cuda/cccl/headers/include/cuda/std/__floating_point/constants.h +172 -0
  630. cuda/cccl/headers/include/cuda/std/__floating_point/conversion_rank_order.h +103 -0
  631. cuda/cccl/headers/include/cuda/std/__floating_point/format.h +162 -0
  632. cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +39 -0
  633. cuda/cccl/headers/include/cuda/std/__floating_point/mask.h +64 -0
  634. cuda/cccl/headers/include/cuda/std/__floating_point/native_type.h +81 -0
  635. cuda/cccl/headers/include/cuda/std/__floating_point/nvfp_types.h +58 -0
  636. cuda/cccl/headers/include/cuda/std/__floating_point/overflow_handler.h +139 -0
  637. cuda/cccl/headers/include/cuda/std/__floating_point/properties.h +229 -0
  638. cuda/cccl/headers/include/cuda/std/__floating_point/storage.h +248 -0
  639. cuda/cccl/headers/include/cuda/std/__floating_point/traits.h +172 -0
  640. cuda/cccl/headers/include/cuda/std/__functional/binary_function.h +63 -0
  641. cuda/cccl/headers/include/cuda/std/__functional/binary_negate.h +65 -0
  642. cuda/cccl/headers/include/cuda/std/__functional/bind.h +352 -0
  643. cuda/cccl/headers/include/cuda/std/__functional/bind_back.h +88 -0
  644. cuda/cccl/headers/include/cuda/std/__functional/bind_front.h +73 -0
  645. cuda/cccl/headers/include/cuda/std/__functional/binder1st.h +75 -0
  646. cuda/cccl/headers/include/cuda/std/__functional/binder2nd.h +75 -0
  647. cuda/cccl/headers/include/cuda/std/__functional/compose.h +69 -0
  648. cuda/cccl/headers/include/cuda/std/__functional/default_searcher.h +75 -0
  649. cuda/cccl/headers/include/cuda/std/__functional/function.h +1277 -0
  650. cuda/cccl/headers/include/cuda/std/__functional/hash.h +650 -0
  651. cuda/cccl/headers/include/cuda/std/__functional/identity.h +61 -0
  652. cuda/cccl/headers/include/cuda/std/__functional/invoke.h +560 -0
  653. cuda/cccl/headers/include/cuda/std/__functional/is_transparent.h +43 -0
  654. cuda/cccl/headers/include/cuda/std/__functional/mem_fn.h +67 -0
  655. cuda/cccl/headers/include/cuda/std/__functional/mem_fun_ref.h +214 -0
  656. cuda/cccl/headers/include/cuda/std/__functional/not_fn.h +121 -0
  657. cuda/cccl/headers/include/cuda/std/__functional/operations.h +534 -0
  658. cuda/cccl/headers/include/cuda/std/__functional/perfect_forward.h +127 -0
  659. cuda/cccl/headers/include/cuda/std/__functional/pointer_to_binary_function.h +65 -0
  660. cuda/cccl/headers/include/cuda/std/__functional/pointer_to_unary_function.h +64 -0
  661. cuda/cccl/headers/include/cuda/std/__functional/ranges_operations.h +113 -0
  662. cuda/cccl/headers/include/cuda/std/__functional/reference_wrapper.h +113 -0
  663. cuda/cccl/headers/include/cuda/std/__functional/unary_function.h +62 -0
  664. cuda/cccl/headers/include/cuda/std/__functional/unary_negate.h +67 -0
  665. cuda/cccl/headers/include/cuda/std/__functional/unwrap_ref.h +56 -0
  666. cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +278 -0
  667. cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +35 -0
  668. cuda/cccl/headers/include/cuda/std/__fwd/array.h +36 -0
  669. cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +49 -0
  670. cuda/cccl/headers/include/cuda/std/__fwd/complex.h +34 -0
  671. cuda/cccl/headers/include/cuda/std/__fwd/fp.h +37 -0
  672. cuda/cccl/headers/include/cuda/std/__fwd/get.h +123 -0
  673. cuda/cccl/headers/include/cuda/std/__fwd/hash.h +34 -0
  674. cuda/cccl/headers/include/cuda/std/__fwd/iterator_traits.h +40 -0
  675. cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +73 -0
  676. cuda/cccl/headers/include/cuda/std/__fwd/memory_resource.h +37 -0
  677. cuda/cccl/headers/include/cuda/std/__fwd/pair.h +34 -0
  678. cuda/cccl/headers/include/cuda/std/__fwd/reference_wrapper.h +34 -0
  679. cuda/cccl/headers/include/cuda/std/__fwd/span.h +38 -0
  680. cuda/cccl/headers/include/cuda/std/__fwd/string.h +83 -0
  681. cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +59 -0
  682. cuda/cccl/headers/include/cuda/std/__fwd/subrange.h +55 -0
  683. cuda/cccl/headers/include/cuda/std/__fwd/tuple.h +34 -0
  684. cuda/cccl/headers/include/cuda/std/__internal/cpp_dialect.h +44 -0
  685. cuda/cccl/headers/include/cuda/std/__internal/features.h +71 -0
  686. cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +102 -0
  687. cuda/cccl/headers/include/cuda/std/__iterator/access.h +132 -0
  688. cuda/cccl/headers/include/cuda/std/__iterator/advance.h +230 -0
  689. cuda/cccl/headers/include/cuda/std/__iterator/back_insert_iterator.h +103 -0
  690. cuda/cccl/headers/include/cuda/std/__iterator/bounded_iter.h +264 -0
  691. cuda/cccl/headers/include/cuda/std/__iterator/concepts.h +608 -0
  692. cuda/cccl/headers/include/cuda/std/__iterator/counted_iterator.h +469 -0
  693. cuda/cccl/headers/include/cuda/std/__iterator/data.h +63 -0
  694. cuda/cccl/headers/include/cuda/std/__iterator/default_sentinel.h +36 -0
  695. cuda/cccl/headers/include/cuda/std/__iterator/distance.h +126 -0
  696. cuda/cccl/headers/include/cuda/std/__iterator/empty.h +54 -0
  697. cuda/cccl/headers/include/cuda/std/__iterator/erase_if_container.h +53 -0
  698. cuda/cccl/headers/include/cuda/std/__iterator/front_insert_iterator.h +98 -0
  699. cuda/cccl/headers/include/cuda/std/__iterator/incrementable_traits.h +152 -0
  700. cuda/cccl/headers/include/cuda/std/__iterator/indirectly_comparable.h +55 -0
  701. cuda/cccl/headers/include/cuda/std/__iterator/insert_iterator.h +105 -0
  702. cuda/cccl/headers/include/cuda/std/__iterator/istream_iterator.h +141 -0
  703. cuda/cccl/headers/include/cuda/std/__iterator/istreambuf_iterator.h +161 -0
  704. cuda/cccl/headers/include/cuda/std/__iterator/iter_move.h +161 -0
  705. cuda/cccl/headers/include/cuda/std/__iterator/iter_swap.h +163 -0
  706. cuda/cccl/headers/include/cuda/std/__iterator/iterator.h +44 -0
  707. cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +935 -0
  708. cuda/cccl/headers/include/cuda/std/__iterator/mergeable.h +72 -0
  709. cuda/cccl/headers/include/cuda/std/__iterator/move_iterator.h +401 -0
  710. cuda/cccl/headers/include/cuda/std/__iterator/move_sentinel.h +73 -0
  711. cuda/cccl/headers/include/cuda/std/__iterator/next.h +102 -0
  712. cuda/cccl/headers/include/cuda/std/__iterator/ostream_iterator.h +99 -0
  713. cuda/cccl/headers/include/cuda/std/__iterator/ostreambuf_iterator.h +101 -0
  714. cuda/cccl/headers/include/cuda/std/__iterator/permutable.h +54 -0
  715. cuda/cccl/headers/include/cuda/std/__iterator/prev.h +92 -0
  716. cuda/cccl/headers/include/cuda/std/__iterator/projected.h +61 -0
  717. cuda/cccl/headers/include/cuda/std/__iterator/readable_traits.h +185 -0
  718. cuda/cccl/headers/include/cuda/std/__iterator/reverse_access.h +146 -0
  719. cuda/cccl/headers/include/cuda/std/__iterator/reverse_iterator.h +615 -0
  720. cuda/cccl/headers/include/cuda/std/__iterator/size.h +69 -0
  721. cuda/cccl/headers/include/cuda/std/__iterator/sortable.h +55 -0
  722. cuda/cccl/headers/include/cuda/std/__iterator/unreachable_sentinel.h +88 -0
  723. cuda/cccl/headers/include/cuda/std/__iterator/wrap_iter.h +259 -0
  724. cuda/cccl/headers/include/cuda/std/__latch/latch.h +88 -0
  725. cuda/cccl/headers/include/cuda/std/__limits/numeric_limits.h +617 -0
  726. cuda/cccl/headers/include/cuda/std/__limits/numeric_limits_ext.h +781 -0
  727. cuda/cccl/headers/include/cuda/std/__linalg/conj_if_needed.h +78 -0
  728. cuda/cccl/headers/include/cuda/std/__linalg/conjugate_transposed.h +55 -0
  729. cuda/cccl/headers/include/cuda/std/__linalg/conjugated.h +140 -0
  730. cuda/cccl/headers/include/cuda/std/__linalg/scaled.h +134 -0
  731. cuda/cccl/headers/include/cuda/std/__linalg/transposed.h +328 -0
  732. cuda/cccl/headers/include/cuda/std/__mdspan/aligned_accessor.h +100 -0
  733. cuda/cccl/headers/include/cuda/std/__mdspan/concepts.h +139 -0
  734. cuda/cccl/headers/include/cuda/std/__mdspan/default_accessor.h +74 -0
  735. cuda/cccl/headers/include/cuda/std/__mdspan/empty_base.h +363 -0
  736. cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +765 -0
  737. cuda/cccl/headers/include/cuda/std/__mdspan/layout_left.h +317 -0
  738. cuda/cccl/headers/include/cuda/std/__mdspan/layout_right.h +310 -0
  739. cuda/cccl/headers/include/cuda/std/__mdspan/layout_stride.h +615 -0
  740. cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +512 -0
  741. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_extents.h +193 -0
  742. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_helper.h +190 -0
  743. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_mapping.h +347 -0
  744. cuda/cccl/headers/include/cuda/std/__memory/addressof.h +64 -0
  745. cuda/cccl/headers/include/cuda/std/__memory/align.h +87 -0
  746. cuda/cccl/headers/include/cuda/std/__memory/allocate_at_least.h +81 -0
  747. cuda/cccl/headers/include/cuda/std/__memory/allocation_guard.h +100 -0
  748. cuda/cccl/headers/include/cuda/std/__memory/allocator.h +320 -0
  749. cuda/cccl/headers/include/cuda/std/__memory/allocator_arg_t.h +84 -0
  750. cuda/cccl/headers/include/cuda/std/__memory/allocator_destructor.h +59 -0
  751. cuda/cccl/headers/include/cuda/std/__memory/allocator_traits.h +569 -0
  752. cuda/cccl/headers/include/cuda/std/__memory/assume_aligned.h +60 -0
  753. cuda/cccl/headers/include/cuda/std/__memory/builtin_new_allocator.h +87 -0
  754. cuda/cccl/headers/include/cuda/std/__memory/compressed_pair.h +231 -0
  755. cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +248 -0
  756. cuda/cccl/headers/include/cuda/std/__memory/destruct_n.h +91 -0
  757. cuda/cccl/headers/include/cuda/std/__memory/is_sufficiently_aligned.h +43 -0
  758. cuda/cccl/headers/include/cuda/std/__memory/pointer_traits.h +260 -0
  759. cuda/cccl/headers/include/cuda/std/__memory/temporary_buffer.h +92 -0
  760. cuda/cccl/headers/include/cuda/std/__memory/uninitialized_algorithms.h +686 -0
  761. cuda/cccl/headers/include/cuda/std/__memory/unique_ptr.h +771 -0
  762. cuda/cccl/headers/include/cuda/std/__memory/uses_allocator.h +55 -0
  763. cuda/cccl/headers/include/cuda/std/__memory/voidify.h +41 -0
  764. cuda/cccl/headers/include/cuda/std/__memory_ +34 -0
  765. cuda/cccl/headers/include/cuda/std/__new/allocate.h +126 -0
  766. cuda/cccl/headers/include/cuda/std/__new/bad_alloc.h +57 -0
  767. cuda/cccl/headers/include/cuda/std/__new/launder.h +49 -0
  768. cuda/cccl/headers/include/cuda/std/__new_ +29 -0
  769. cuda/cccl/headers/include/cuda/std/__numeric/accumulate.h +57 -0
  770. cuda/cccl/headers/include/cuda/std/__numeric/adjacent_difference.h +72 -0
  771. cuda/cccl/headers/include/cuda/std/__numeric/exclusive_scan.h +66 -0
  772. cuda/cccl/headers/include/cuda/std/__numeric/gcd_lcm.h +80 -0
  773. cuda/cccl/headers/include/cuda/std/__numeric/inclusive_scan.h +73 -0
  774. cuda/cccl/headers/include/cuda/std/__numeric/inner_product.h +62 -0
  775. cuda/cccl/headers/include/cuda/std/__numeric/iota.h +42 -0
  776. cuda/cccl/headers/include/cuda/std/__numeric/midpoint.h +100 -0
  777. cuda/cccl/headers/include/cuda/std/__numeric/partial_sum.h +70 -0
  778. cuda/cccl/headers/include/cuda/std/__numeric/reduce.h +61 -0
  779. cuda/cccl/headers/include/cuda/std/__numeric/transform_exclusive_scan.h +51 -0
  780. cuda/cccl/headers/include/cuda/std/__numeric/transform_inclusive_scan.h +65 -0
  781. cuda/cccl/headers/include/cuda/std/__numeric/transform_reduce.h +72 -0
  782. cuda/cccl/headers/include/cuda/std/__ranges/access.h +304 -0
  783. cuda/cccl/headers/include/cuda/std/__ranges/all.h +97 -0
  784. cuda/cccl/headers/include/cuda/std/__ranges/concepts.h +313 -0
  785. cuda/cccl/headers/include/cuda/std/__ranges/counted.h +90 -0
  786. cuda/cccl/headers/include/cuda/std/__ranges/dangling.h +54 -0
  787. cuda/cccl/headers/include/cuda/std/__ranges/data.h +136 -0
  788. cuda/cccl/headers/include/cuda/std/__ranges/empty.h +111 -0
  789. cuda/cccl/headers/include/cuda/std/__ranges/empty_view.h +77 -0
  790. cuda/cccl/headers/include/cuda/std/__ranges/enable_borrowed_range.h +41 -0
  791. cuda/cccl/headers/include/cuda/std/__ranges/enable_view.h +77 -0
  792. cuda/cccl/headers/include/cuda/std/__ranges/from_range.h +36 -0
  793. cuda/cccl/headers/include/cuda/std/__ranges/iota_view.h +271 -0
  794. cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +410 -0
  795. cuda/cccl/headers/include/cuda/std/__ranges/owning_view.h +161 -0
  796. cuda/cccl/headers/include/cuda/std/__ranges/range_adaptor.h +114 -0
  797. cuda/cccl/headers/include/cuda/std/__ranges/rbegin.h +175 -0
  798. cuda/cccl/headers/include/cuda/std/__ranges/ref_view.h +121 -0
  799. cuda/cccl/headers/include/cuda/std/__ranges/rend.h +182 -0
  800. cuda/cccl/headers/include/cuda/std/__ranges/repeat_view.h +343 -0
  801. cuda/cccl/headers/include/cuda/std/__ranges/single_view.h +156 -0
  802. cuda/cccl/headers/include/cuda/std/__ranges/size.h +200 -0
  803. cuda/cccl/headers/include/cuda/std/__ranges/subrange.h +513 -0
  804. cuda/cccl/headers/include/cuda/std/__ranges/take_while_view.h +263 -0
  805. cuda/cccl/headers/include/cuda/std/__ranges/transform_view.h +531 -0
  806. cuda/cccl/headers/include/cuda/std/__ranges/unwrap_end.h +53 -0
  807. cuda/cccl/headers/include/cuda/std/__ranges/view_interface.h +181 -0
  808. cuda/cccl/headers/include/cuda/std/__ranges/views.h +38 -0
  809. cuda/cccl/headers/include/cuda/std/__semaphore/atomic_semaphore.h +233 -0
  810. cuda/cccl/headers/include/cuda/std/__semaphore/counting_semaphore.h +51 -0
  811. cuda/cccl/headers/include/cuda/std/__string/char_traits.h +191 -0
  812. cuda/cccl/headers/include/cuda/std/__string/constexpr_c_functions.h +591 -0
  813. cuda/cccl/headers/include/cuda/std/__string/helper_functions.h +299 -0
  814. cuda/cccl/headers/include/cuda/std/__string/string_view.h +244 -0
  815. cuda/cccl/headers/include/cuda/std/__string_ +29 -0
  816. cuda/cccl/headers/include/cuda/std/__system_error/errc.h +51 -0
  817. cuda/cccl/headers/include/cuda/std/__system_error_ +26 -0
  818. cuda/cccl/headers/include/cuda/std/__thread/threading_support.h +105 -0
  819. cuda/cccl/headers/include/cuda/std/__thread/threading_support_cuda.h +47 -0
  820. cuda/cccl/headers/include/cuda/std/__thread/threading_support_external.h +41 -0
  821. cuda/cccl/headers/include/cuda/std/__thread/threading_support_pthread.h +144 -0
  822. cuda/cccl/headers/include/cuda/std/__thread/threading_support_win32.h +87 -0
  823. cuda/cccl/headers/include/cuda/std/__tuple_dir/ignore.h +51 -0
  824. cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +98 -0
  825. cuda/cccl/headers/include/cuda/std/__tuple_dir/sfinae_helpers.h +236 -0
  826. cuda/cccl/headers/include/cuda/std/__tuple_dir/structured_bindings.h +216 -0
  827. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_element.h +70 -0
  828. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_indices.h +44 -0
  829. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +90 -0
  830. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +73 -0
  831. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_size.h +79 -0
  832. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_types.h +35 -0
  833. cuda/cccl/headers/include/cuda/std/__tuple_dir/vector_types.h +242 -0
  834. cuda/cccl/headers/include/cuda/std/__type_traits/add_const.h +40 -0
  835. cuda/cccl/headers/include/cuda/std/__type_traits/add_cv.h +40 -0
  836. cuda/cccl/headers/include/cuda/std/__type_traits/add_lvalue_reference.h +62 -0
  837. cuda/cccl/headers/include/cuda/std/__type_traits/add_pointer.h +65 -0
  838. cuda/cccl/headers/include/cuda/std/__type_traits/add_rvalue_reference.h +62 -0
  839. cuda/cccl/headers/include/cuda/std/__type_traits/add_volatile.h +40 -0
  840. cuda/cccl/headers/include/cuda/std/__type_traits/aligned_storage.h +149 -0
  841. cuda/cccl/headers/include/cuda/std/__type_traits/aligned_union.h +62 -0
  842. cuda/cccl/headers/include/cuda/std/__type_traits/alignment_of.h +41 -0
  843. cuda/cccl/headers/include/cuda/std/__type_traits/always_false.h +35 -0
  844. cuda/cccl/headers/include/cuda/std/__type_traits/can_extract_key.h +69 -0
  845. cuda/cccl/headers/include/cuda/std/__type_traits/common_reference.h +262 -0
  846. cuda/cccl/headers/include/cuda/std/__type_traits/common_type.h +174 -0
  847. cuda/cccl/headers/include/cuda/std/__type_traits/conditional.h +65 -0
  848. cuda/cccl/headers/include/cuda/std/__type_traits/conjunction.h +67 -0
  849. cuda/cccl/headers/include/cuda/std/__type_traits/copy_cv.h +50 -0
  850. cuda/cccl/headers/include/cuda/std/__type_traits/copy_cvref.h +148 -0
  851. cuda/cccl/headers/include/cuda/std/__type_traits/decay.h +83 -0
  852. cuda/cccl/headers/include/cuda/std/__type_traits/dependent_type.h +35 -0
  853. cuda/cccl/headers/include/cuda/std/__type_traits/disjunction.h +77 -0
  854. cuda/cccl/headers/include/cuda/std/__type_traits/enable_if.h +43 -0
  855. cuda/cccl/headers/include/cuda/std/__type_traits/extent.h +68 -0
  856. cuda/cccl/headers/include/cuda/std/__type_traits/fold.h +47 -0
  857. cuda/cccl/headers/include/cuda/std/__type_traits/has_unique_object_representation.h +47 -0
  858. cuda/cccl/headers/include/cuda/std/__type_traits/has_virtual_destructor.h +51 -0
  859. cuda/cccl/headers/include/cuda/std/__type_traits/integral_constant.h +62 -0
  860. cuda/cccl/headers/include/cuda/std/__type_traits/is_abstract.h +40 -0
  861. cuda/cccl/headers/include/cuda/std/__type_traits/is_aggregate.h +44 -0
  862. cuda/cccl/headers/include/cuda/std/__type_traits/is_allocator.h +46 -0
  863. cuda/cccl/headers/include/cuda/std/__type_traits/is_arithmetic.h +42 -0
  864. cuda/cccl/headers/include/cuda/std/__type_traits/is_array.h +62 -0
  865. cuda/cccl/headers/include/cuda/std/__type_traits/is_assignable.h +78 -0
  866. cuda/cccl/headers/include/cuda/std/__type_traits/is_base_of.h +83 -0
  867. cuda/cccl/headers/include/cuda/std/__type_traits/is_bounded_array.h +44 -0
  868. cuda/cccl/headers/include/cuda/std/__type_traits/is_callable.h +60 -0
  869. cuda/cccl/headers/include/cuda/std/__type_traits/is_char_like_type.h +38 -0
  870. cuda/cccl/headers/include/cuda/std/__type_traits/is_class.h +68 -0
  871. cuda/cccl/headers/include/cuda/std/__type_traits/is_compound.h +54 -0
  872. cuda/cccl/headers/include/cuda/std/__type_traits/is_const.h +56 -0
  873. cuda/cccl/headers/include/cuda/std/__type_traits/is_constant_evaluated.h +51 -0
  874. cuda/cccl/headers/include/cuda/std/__type_traits/is_constructible.h +174 -0
  875. cuda/cccl/headers/include/cuda/std/__type_traits/is_convertible.h +214 -0
  876. cuda/cccl/headers/include/cuda/std/__type_traits/is_copy_assignable.h +43 -0
  877. cuda/cccl/headers/include/cuda/std/__type_traits/is_copy_constructible.h +43 -0
  878. cuda/cccl/headers/include/cuda/std/__type_traits/is_core_convertible.h +47 -0
  879. cuda/cccl/headers/include/cuda/std/__type_traits/is_corresponding_member.h +43 -0
  880. cuda/cccl/headers/include/cuda/std/__type_traits/is_default_constructible.h +40 -0
  881. cuda/cccl/headers/include/cuda/std/__type_traits/is_destructible.h +115 -0
  882. cuda/cccl/headers/include/cuda/std/__type_traits/is_empty.h +73 -0
  883. cuda/cccl/headers/include/cuda/std/__type_traits/is_enum.h +68 -0
  884. cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_arithmetic.h +38 -0
  885. cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_floating_point.h +81 -0
  886. cuda/cccl/headers/include/cuda/std/__type_traits/is_final.h +56 -0
  887. cuda/cccl/headers/include/cuda/std/__type_traits/is_floating_point.h +53 -0
  888. cuda/cccl/headers/include/cuda/std/__type_traits/is_function.h +61 -0
  889. cuda/cccl/headers/include/cuda/std/__type_traits/is_fundamental.h +56 -0
  890. cuda/cccl/headers/include/cuda/std/__type_traits/is_implicitly_default_constructible.h +57 -0
  891. cuda/cccl/headers/include/cuda/std/__type_traits/is_integer.h +45 -0
  892. cuda/cccl/headers/include/cuda/std/__type_traits/is_integral.h +123 -0
  893. cuda/cccl/headers/include/cuda/std/__type_traits/is_layout_compatible.h +45 -0
  894. cuda/cccl/headers/include/cuda/std/__type_traits/is_literal_type.h +59 -0
  895. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_function_pointer.h +79 -0
  896. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_object_pointer.h +57 -0
  897. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_pointer.h +57 -0
  898. cuda/cccl/headers/include/cuda/std/__type_traits/is_move_assignable.h +43 -0
  899. cuda/cccl/headers/include/cuda/std/__type_traits/is_move_constructible.h +42 -0
  900. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_assignable.h +70 -0
  901. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_constructible.h +84 -0
  902. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_convertible.h +59 -0
  903. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_copy_assignable.h +60 -0
  904. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_copy_constructible.h +43 -0
  905. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_default_constructible.h +54 -0
  906. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_destructible.h +79 -0
  907. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_move_assignable.h +60 -0
  908. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_move_constructible.h +42 -0
  909. cuda/cccl/headers/include/cuda/std/__type_traits/is_null_pointer.h +43 -0
  910. cuda/cccl/headers/include/cuda/std/__type_traits/is_object.h +57 -0
  911. cuda/cccl/headers/include/cuda/std/__type_traits/is_one_of.h +37 -0
  912. cuda/cccl/headers/include/cuda/std/__type_traits/is_pod.h +62 -0
  913. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer.h +60 -0
  914. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer_interconvertible_base_of.h +87 -0
  915. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer_interconvertible_with_class.h +43 -0
  916. cuda/cccl/headers/include/cuda/std/__type_traits/is_polymorphic.h +63 -0
  917. cuda/cccl/headers/include/cuda/std/__type_traits/is_primary_template.h +119 -0
  918. cuda/cccl/headers/include/cuda/std/__type_traits/is_reference.h +95 -0
  919. cuda/cccl/headers/include/cuda/std/__type_traits/is_reference_wrapper.h +50 -0
  920. cuda/cccl/headers/include/cuda/std/__type_traits/is_referenceable.h +55 -0
  921. cuda/cccl/headers/include/cuda/std/__type_traits/is_same.h +84 -0
  922. cuda/cccl/headers/include/cuda/std/__type_traits/is_scalar.h +60 -0
  923. cuda/cccl/headers/include/cuda/std/__type_traits/is_scoped_enum.h +49 -0
  924. cuda/cccl/headers/include/cuda/std/__type_traits/is_signed.h +65 -0
  925. cuda/cccl/headers/include/cuda/std/__type_traits/is_signed_integer.h +59 -0
  926. cuda/cccl/headers/include/cuda/std/__type_traits/is_standard_layout.h +57 -0
  927. cuda/cccl/headers/include/cuda/std/__type_traits/is_swappable.h +203 -0
  928. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivial.h +56 -0
  929. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_assignable.h +70 -0
  930. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_constructible.h +82 -0
  931. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_assignable.h +60 -0
  932. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_constructible.h +61 -0
  933. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copyable.h +56 -0
  934. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_default_constructible.h +55 -0
  935. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_destructible.h +73 -0
  936. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_assignable.h +60 -0
  937. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_constructible.h +58 -0
  938. cuda/cccl/headers/include/cuda/std/__type_traits/is_unbounded_array.h +43 -0
  939. cuda/cccl/headers/include/cuda/std/__type_traits/is_union.h +57 -0
  940. cuda/cccl/headers/include/cuda/std/__type_traits/is_unsigned.h +66 -0
  941. cuda/cccl/headers/include/cuda/std/__type_traits/is_unsigned_integer.h +59 -0
  942. cuda/cccl/headers/include/cuda/std/__type_traits/is_valid_expansion.h +41 -0
  943. cuda/cccl/headers/include/cuda/std/__type_traits/is_void.h +55 -0
  944. cuda/cccl/headers/include/cuda/std/__type_traits/is_volatile.h +56 -0
  945. cuda/cccl/headers/include/cuda/std/__type_traits/lazy.h +35 -0
  946. cuda/cccl/headers/include/cuda/std/__type_traits/make_const_lvalue_ref.h +36 -0
  947. cuda/cccl/headers/include/cuda/std/__type_traits/make_nbit_int.h +107 -0
  948. cuda/cccl/headers/include/cuda/std/__type_traits/make_signed.h +140 -0
  949. cuda/cccl/headers/include/cuda/std/__type_traits/make_unsigned.h +151 -0
  950. cuda/cccl/headers/include/cuda/std/__type_traits/maybe_const.h +36 -0
  951. cuda/cccl/headers/include/cuda/std/__type_traits/nat.h +39 -0
  952. cuda/cccl/headers/include/cuda/std/__type_traits/negation.h +44 -0
  953. cuda/cccl/headers/include/cuda/std/__type_traits/num_bits.h +123 -0
  954. cuda/cccl/headers/include/cuda/std/__type_traits/promote.h +163 -0
  955. cuda/cccl/headers/include/cuda/std/__type_traits/rank.h +60 -0
  956. cuda/cccl/headers/include/cuda/std/__type_traits/reference_constructs_from_temporary.h +57 -0
  957. cuda/cccl/headers/include/cuda/std/__type_traits/reference_converts_from_temporary.h +56 -0
  958. cuda/cccl/headers/include/cuda/std/__type_traits/remove_all_extents.h +66 -0
  959. cuda/cccl/headers/include/cuda/std/__type_traits/remove_const.h +59 -0
  960. cuda/cccl/headers/include/cuda/std/__type_traits/remove_const_ref.h +37 -0
  961. cuda/cccl/headers/include/cuda/std/__type_traits/remove_cv.h +57 -0
  962. cuda/cccl/headers/include/cuda/std/__type_traits/remove_cvref.h +57 -0
  963. cuda/cccl/headers/include/cuda/std/__type_traits/remove_extent.h +65 -0
  964. cuda/cccl/headers/include/cuda/std/__type_traits/remove_pointer.h +73 -0
  965. cuda/cccl/headers/include/cuda/std/__type_traits/remove_reference.h +72 -0
  966. cuda/cccl/headers/include/cuda/std/__type_traits/remove_volatile.h +58 -0
  967. cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +47 -0
  968. cuda/cccl/headers/include/cuda/std/__type_traits/type_identity.h +40 -0
  969. cuda/cccl/headers/include/cuda/std/__type_traits/type_list.h +1069 -0
  970. cuda/cccl/headers/include/cuda/std/__type_traits/type_set.h +132 -0
  971. cuda/cccl/headers/include/cuda/std/__type_traits/underlying_type.h +66 -0
  972. cuda/cccl/headers/include/cuda/std/__type_traits/void_t.h +34 -0
  973. cuda/cccl/headers/include/cuda/std/__utility/as_const.h +52 -0
  974. cuda/cccl/headers/include/cuda/std/__utility/auto_cast.h +32 -0
  975. cuda/cccl/headers/include/cuda/std/__utility/cmp.h +116 -0
  976. cuda/cccl/headers/include/cuda/std/__utility/convert_to_integral.h +103 -0
  977. cuda/cccl/headers/include/cuda/std/__utility/declval.h +63 -0
  978. cuda/cccl/headers/include/cuda/std/__utility/exception_guard.h +162 -0
  979. cuda/cccl/headers/include/cuda/std/__utility/exchange.h +46 -0
  980. cuda/cccl/headers/include/cuda/std/__utility/forward.h +59 -0
  981. cuda/cccl/headers/include/cuda/std/__utility/forward_like.h +56 -0
  982. cuda/cccl/headers/include/cuda/std/__utility/in_place.h +77 -0
  983. cuda/cccl/headers/include/cuda/std/__utility/integer_sequence.h +251 -0
  984. cuda/cccl/headers/include/cuda/std/__utility/monostate.h +99 -0
  985. cuda/cccl/headers/include/cuda/std/__utility/move.h +75 -0
  986. cuda/cccl/headers/include/cuda/std/__utility/pair.h +808 -0
  987. cuda/cccl/headers/include/cuda/std/__utility/piecewise_construct.h +37 -0
  988. cuda/cccl/headers/include/cuda/std/__utility/pod_tuple.h +763 -0
  989. cuda/cccl/headers/include/cuda/std/__utility/priority_tag.h +40 -0
  990. cuda/cccl/headers/include/cuda/std/__utility/rel_ops.h +63 -0
  991. cuda/cccl/headers/include/cuda/std/__utility/swap.h +65 -0
  992. cuda/cccl/headers/include/cuda/std/__utility/to_underlying.h +40 -0
  993. cuda/cccl/headers/include/cuda/std/__utility/typeid.h +425 -0
  994. cuda/cccl/headers/include/cuda/std/__utility/unreachable.h +37 -0
  995. cuda/cccl/headers/include/cuda/std/array +527 -0
  996. cuda/cccl/headers/include/cuda/std/atomic +823 -0
  997. cuda/cccl/headers/include/cuda/std/barrier +43 -0
  998. cuda/cccl/headers/include/cuda/std/bit +35 -0
  999. cuda/cccl/headers/include/cuda/std/bitset +1026 -0
  1000. cuda/cccl/headers/include/cuda/std/cassert +28 -0
  1001. cuda/cccl/headers/include/cuda/std/ccomplex +15 -0
  1002. cuda/cccl/headers/include/cuda/std/cfloat +59 -0
  1003. cuda/cccl/headers/include/cuda/std/chrono +26 -0
  1004. cuda/cccl/headers/include/cuda/std/climits +61 -0
  1005. cuda/cccl/headers/include/cuda/std/cmath +25 -0
  1006. cuda/cccl/headers/include/cuda/std/complex +25 -0
  1007. cuda/cccl/headers/include/cuda/std/concepts +48 -0
  1008. cuda/cccl/headers/include/cuda/std/cstddef +28 -0
  1009. cuda/cccl/headers/include/cuda/std/cstdint +178 -0
  1010. cuda/cccl/headers/include/cuda/std/cstdlib +30 -0
  1011. cuda/cccl/headers/include/cuda/std/cstring +111 -0
  1012. cuda/cccl/headers/include/cuda/std/ctime +147 -0
  1013. cuda/cccl/headers/include/cuda/std/detail/__config +45 -0
  1014. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +258 -0
  1015. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/algorithm +2692 -0
  1016. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/chrono +3689 -0
  1017. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/cmath +685 -0
  1018. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/complex +1610 -0
  1019. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/iosfwd +128 -0
  1020. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/optional +1786 -0
  1021. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/stdexcept +120 -0
  1022. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/tuple +1378 -0
  1023. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/variant +2160 -0
  1024. cuda/cccl/headers/include/cuda/std/execution +27 -0
  1025. cuda/cccl/headers/include/cuda/std/expected +30 -0
  1026. cuda/cccl/headers/include/cuda/std/functional +56 -0
  1027. cuda/cccl/headers/include/cuda/std/initializer_list +36 -0
  1028. cuda/cccl/headers/include/cuda/std/inplace_vector +2171 -0
  1029. cuda/cccl/headers/include/cuda/std/iterator +70 -0
  1030. cuda/cccl/headers/include/cuda/std/latch +34 -0
  1031. cuda/cccl/headers/include/cuda/std/limits +28 -0
  1032. cuda/cccl/headers/include/cuda/std/linalg +30 -0
  1033. cuda/cccl/headers/include/cuda/std/mdspan +38 -0
  1034. cuda/cccl/headers/include/cuda/std/memory +39 -0
  1035. cuda/cccl/headers/include/cuda/std/numbers +335 -0
  1036. cuda/cccl/headers/include/cuda/std/numeric +41 -0
  1037. cuda/cccl/headers/include/cuda/std/optional +25 -0
  1038. cuda/cccl/headers/include/cuda/std/ranges +68 -0
  1039. cuda/cccl/headers/include/cuda/std/ratio +417 -0
  1040. cuda/cccl/headers/include/cuda/std/semaphore +31 -0
  1041. cuda/cccl/headers/include/cuda/std/source_location +83 -0
  1042. cuda/cccl/headers/include/cuda/std/span +640 -0
  1043. cuda/cccl/headers/include/cuda/std/string_view +814 -0
  1044. cuda/cccl/headers/include/cuda/std/tuple +26 -0
  1045. cuda/cccl/headers/include/cuda/std/type_traits +176 -0
  1046. cuda/cccl/headers/include/cuda/std/utility +70 -0
  1047. cuda/cccl/headers/include/cuda/std/variant +25 -0
  1048. cuda/cccl/headers/include/cuda/std/version +245 -0
  1049. cuda/cccl/headers/include/cuda/stream_ref +54 -0
  1050. cuda/cccl/headers/include/cuda/type_traits +27 -0
  1051. cuda/cccl/headers/include/cuda/version +16 -0
  1052. cuda/cccl/headers/include/cuda/warp +28 -0
  1053. cuda/cccl/headers/include/cuda/work_stealing +26 -0
  1054. cuda/cccl/headers/include/nv/detail/__preprocessor +169 -0
  1055. cuda/cccl/headers/include/nv/detail/__target_macros +599 -0
  1056. cuda/cccl/headers/include/nv/target +229 -0
  1057. cuda/cccl/headers/include/thrust/addressof.h +22 -0
  1058. cuda/cccl/headers/include/thrust/adjacent_difference.h +254 -0
  1059. cuda/cccl/headers/include/thrust/advance.h +59 -0
  1060. cuda/cccl/headers/include/thrust/allocate_unique.h +299 -0
  1061. cuda/cccl/headers/include/thrust/binary_search.h +1910 -0
  1062. cuda/cccl/headers/include/thrust/complex.h +859 -0
  1063. cuda/cccl/headers/include/thrust/copy.h +506 -0
  1064. cuda/cccl/headers/include/thrust/count.h +245 -0
  1065. cuda/cccl/headers/include/thrust/detail/adjacent_difference.inl +95 -0
  1066. cuda/cccl/headers/include/thrust/detail/algorithm_wrapper.h +37 -0
  1067. cuda/cccl/headers/include/thrust/detail/alignment.h +81 -0
  1068. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +350 -0
  1069. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.inl +371 -0
  1070. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +45 -0
  1071. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.inl +242 -0
  1072. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +39 -0
  1073. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.inl +137 -0
  1074. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +39 -0
  1075. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.inl +99 -0
  1076. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +53 -0
  1077. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.inl +68 -0
  1078. cuda/cccl/headers/include/thrust/detail/allocator/no_throw_allocator.h +76 -0
  1079. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +102 -0
  1080. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.inl +86 -0
  1081. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +79 -0
  1082. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.inl +81 -0
  1083. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.h +39 -0
  1084. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.inl +98 -0
  1085. cuda/cccl/headers/include/thrust/detail/allocator_aware_execution_policy.h +99 -0
  1086. cuda/cccl/headers/include/thrust/detail/binary_search.inl +525 -0
  1087. cuda/cccl/headers/include/thrust/detail/caching_allocator.h +47 -0
  1088. cuda/cccl/headers/include/thrust/detail/complex/arithmetic.h +255 -0
  1089. cuda/cccl/headers/include/thrust/detail/complex/c99math.h +64 -0
  1090. cuda/cccl/headers/include/thrust/detail/complex/catrig.h +875 -0
  1091. cuda/cccl/headers/include/thrust/detail/complex/catrigf.h +589 -0
  1092. cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +233 -0
  1093. cuda/cccl/headers/include/thrust/detail/complex/ccoshf.h +161 -0
  1094. cuda/cccl/headers/include/thrust/detail/complex/cexp.h +195 -0
  1095. cuda/cccl/headers/include/thrust/detail/complex/cexpf.h +173 -0
  1096. cuda/cccl/headers/include/thrust/detail/complex/clog.h +223 -0
  1097. cuda/cccl/headers/include/thrust/detail/complex/clogf.h +210 -0
  1098. cuda/cccl/headers/include/thrust/detail/complex/complex.inl +263 -0
  1099. cuda/cccl/headers/include/thrust/detail/complex/cpow.h +50 -0
  1100. cuda/cccl/headers/include/thrust/detail/complex/cproj.h +81 -0
  1101. cuda/cccl/headers/include/thrust/detail/complex/csinh.h +228 -0
  1102. cuda/cccl/headers/include/thrust/detail/complex/csinhf.h +168 -0
  1103. cuda/cccl/headers/include/thrust/detail/complex/csqrt.h +178 -0
  1104. cuda/cccl/headers/include/thrust/detail/complex/csqrtf.h +174 -0
  1105. cuda/cccl/headers/include/thrust/detail/complex/ctanh.h +208 -0
  1106. cuda/cccl/headers/include/thrust/detail/complex/ctanhf.h +133 -0
  1107. cuda/cccl/headers/include/thrust/detail/complex/math_private.h +138 -0
  1108. cuda/cccl/headers/include/thrust/detail/complex/stream.h +73 -0
  1109. cuda/cccl/headers/include/thrust/detail/config/compiler.h +38 -0
  1110. cuda/cccl/headers/include/thrust/detail/config/config.h +43 -0
  1111. cuda/cccl/headers/include/thrust/detail/config/cpp_dialect.h +78 -0
  1112. cuda/cccl/headers/include/thrust/detail/config/device_system.h +55 -0
  1113. cuda/cccl/headers/include/thrust/detail/config/host_system.h +48 -0
  1114. cuda/cccl/headers/include/thrust/detail/config/memory_resource.h +41 -0
  1115. cuda/cccl/headers/include/thrust/detail/config/namespace.h +162 -0
  1116. cuda/cccl/headers/include/thrust/detail/config/simple_defines.h +48 -0
  1117. cuda/cccl/headers/include/thrust/detail/config.h +36 -0
  1118. cuda/cccl/headers/include/thrust/detail/contiguous_storage.h +228 -0
  1119. cuda/cccl/headers/include/thrust/detail/contiguous_storage.inl +273 -0
  1120. cuda/cccl/headers/include/thrust/detail/copy.h +72 -0
  1121. cuda/cccl/headers/include/thrust/detail/copy.inl +129 -0
  1122. cuda/cccl/headers/include/thrust/detail/copy_if.h +62 -0
  1123. cuda/cccl/headers/include/thrust/detail/copy_if.inl +102 -0
  1124. cuda/cccl/headers/include/thrust/detail/count.h +55 -0
  1125. cuda/cccl/headers/include/thrust/detail/count.inl +89 -0
  1126. cuda/cccl/headers/include/thrust/detail/device_delete.inl +52 -0
  1127. cuda/cccl/headers/include/thrust/detail/device_free.inl +47 -0
  1128. cuda/cccl/headers/include/thrust/detail/device_malloc.inl +60 -0
  1129. cuda/cccl/headers/include/thrust/detail/device_new.inl +61 -0
  1130. cuda/cccl/headers/include/thrust/detail/device_ptr.inl +48 -0
  1131. cuda/cccl/headers/include/thrust/detail/equal.inl +93 -0
  1132. cuda/cccl/headers/include/thrust/detail/event_error.h +160 -0
  1133. cuda/cccl/headers/include/thrust/detail/execute_with_allocator.h +80 -0
  1134. cuda/cccl/headers/include/thrust/detail/execute_with_allocator_fwd.h +61 -0
  1135. cuda/cccl/headers/include/thrust/detail/execution_policy.h +80 -0
  1136. cuda/cccl/headers/include/thrust/detail/extrema.inl +184 -0
  1137. cuda/cccl/headers/include/thrust/detail/fill.inl +86 -0
  1138. cuda/cccl/headers/include/thrust/detail/find.inl +113 -0
  1139. cuda/cccl/headers/include/thrust/detail/for_each.inl +84 -0
  1140. cuda/cccl/headers/include/thrust/detail/function.h +49 -0
  1141. cuda/cccl/headers/include/thrust/detail/functional/actor.h +214 -0
  1142. cuda/cccl/headers/include/thrust/detail/functional/operators.h +386 -0
  1143. cuda/cccl/headers/include/thrust/detail/gather.inl +173 -0
  1144. cuda/cccl/headers/include/thrust/detail/generate.inl +86 -0
  1145. cuda/cccl/headers/include/thrust/detail/get_iterator_value.h +62 -0
  1146. cuda/cccl/headers/include/thrust/detail/inner_product.inl +118 -0
  1147. cuda/cccl/headers/include/thrust/detail/integer_math.h +130 -0
  1148. cuda/cccl/headers/include/thrust/detail/internal_functional.h +285 -0
  1149. cuda/cccl/headers/include/thrust/detail/logical.inl +113 -0
  1150. cuda/cccl/headers/include/thrust/detail/malloc_and_free.h +92 -0
  1151. cuda/cccl/headers/include/thrust/detail/malloc_and_free_fwd.h +45 -0
  1152. cuda/cccl/headers/include/thrust/detail/memory_algorithms.h +209 -0
  1153. cuda/cccl/headers/include/thrust/detail/memory_wrapper.h +40 -0
  1154. cuda/cccl/headers/include/thrust/detail/merge.inl +276 -0
  1155. cuda/cccl/headers/include/thrust/detail/mismatch.inl +94 -0
  1156. cuda/cccl/headers/include/thrust/detail/mpl/math.h +164 -0
  1157. cuda/cccl/headers/include/thrust/detail/numeric_wrapper.h +37 -0
  1158. cuda/cccl/headers/include/thrust/detail/overlapped_copy.h +124 -0
  1159. cuda/cccl/headers/include/thrust/detail/partition.inl +378 -0
  1160. cuda/cccl/headers/include/thrust/detail/pointer.h +217 -0
  1161. cuda/cccl/headers/include/thrust/detail/pointer.inl +172 -0
  1162. cuda/cccl/headers/include/thrust/detail/preprocessor.h +652 -0
  1163. cuda/cccl/headers/include/thrust/detail/random_bijection.h +177 -0
  1164. cuda/cccl/headers/include/thrust/detail/range/head_flags.h +116 -0
  1165. cuda/cccl/headers/include/thrust/detail/range/tail_flags.h +130 -0
  1166. cuda/cccl/headers/include/thrust/detail/raw_pointer_cast.h +52 -0
  1167. cuda/cccl/headers/include/thrust/detail/raw_reference_cast.h +189 -0
  1168. cuda/cccl/headers/include/thrust/detail/reduce.inl +377 -0
  1169. cuda/cccl/headers/include/thrust/detail/reference.h +500 -0
  1170. cuda/cccl/headers/include/thrust/detail/reference_forward_declaration.h +35 -0
  1171. cuda/cccl/headers/include/thrust/detail/remove.inl +213 -0
  1172. cuda/cccl/headers/include/thrust/detail/replace.inl +231 -0
  1173. cuda/cccl/headers/include/thrust/detail/reverse.inl +88 -0
  1174. cuda/cccl/headers/include/thrust/detail/scan.inl +518 -0
  1175. cuda/cccl/headers/include/thrust/detail/scatter.inl +157 -0
  1176. cuda/cccl/headers/include/thrust/detail/seq.h +54 -0
  1177. cuda/cccl/headers/include/thrust/detail/sequence.inl +109 -0
  1178. cuda/cccl/headers/include/thrust/detail/set_operations.inl +981 -0
  1179. cuda/cccl/headers/include/thrust/detail/shuffle.inl +86 -0
  1180. cuda/cccl/headers/include/thrust/detail/sort.inl +373 -0
  1181. cuda/cccl/headers/include/thrust/detail/static_assert.h +58 -0
  1182. cuda/cccl/headers/include/thrust/detail/static_map.h +167 -0
  1183. cuda/cccl/headers/include/thrust/detail/swap_ranges.inl +65 -0
  1184. cuda/cccl/headers/include/thrust/detail/tabulate.inl +62 -0
  1185. cuda/cccl/headers/include/thrust/detail/temporary_array.h +153 -0
  1186. cuda/cccl/headers/include/thrust/detail/temporary_array.inl +138 -0
  1187. cuda/cccl/headers/include/thrust/detail/temporary_buffer.h +81 -0
  1188. cuda/cccl/headers/include/thrust/detail/transform.inl +250 -0
  1189. cuda/cccl/headers/include/thrust/detail/transform_reduce.inl +69 -0
  1190. cuda/cccl/headers/include/thrust/detail/transform_scan.inl +161 -0
  1191. cuda/cccl/headers/include/thrust/detail/trivial_sequence.h +131 -0
  1192. cuda/cccl/headers/include/thrust/detail/tuple_meta_transform.h +61 -0
  1193. cuda/cccl/headers/include/thrust/detail/type_deduction.h +62 -0
  1194. cuda/cccl/headers/include/thrust/detail/type_traits/has_member_function.h +47 -0
  1195. cuda/cccl/headers/include/thrust/detail/type_traits/has_nested_type.h +43 -0
  1196. cuda/cccl/headers/include/thrust/detail/type_traits/is_call_possible.h +167 -0
  1197. cuda/cccl/headers/include/thrust/detail/type_traits/is_commutative.h +69 -0
  1198. cuda/cccl/headers/include/thrust/detail/type_traits/is_metafunction_defined.h +39 -0
  1199. cuda/cccl/headers/include/thrust/detail/type_traits/is_thrust_pointer.h +60 -0
  1200. cuda/cccl/headers/include/thrust/detail/type_traits/iterator/is_discard_iterator.h +44 -0
  1201. cuda/cccl/headers/include/thrust/detail/type_traits/iterator/is_output_iterator.h +46 -0
  1202. cuda/cccl/headers/include/thrust/detail/type_traits/minimum_type.h +89 -0
  1203. cuda/cccl/headers/include/thrust/detail/type_traits/pointer_traits.h +332 -0
  1204. cuda/cccl/headers/include/thrust/detail/type_traits.h +136 -0
  1205. cuda/cccl/headers/include/thrust/detail/uninitialized_copy.inl +90 -0
  1206. cuda/cccl/headers/include/thrust/detail/uninitialized_fill.inl +86 -0
  1207. cuda/cccl/headers/include/thrust/detail/unique.inl +373 -0
  1208. cuda/cccl/headers/include/thrust/detail/use_default.h +34 -0
  1209. cuda/cccl/headers/include/thrust/detail/util/align.h +59 -0
  1210. cuda/cccl/headers/include/thrust/detail/vector_base.h +630 -0
  1211. cuda/cccl/headers/include/thrust/detail/vector_base.inl +1242 -0
  1212. cuda/cccl/headers/include/thrust/device_allocator.h +134 -0
  1213. cuda/cccl/headers/include/thrust/device_delete.h +59 -0
  1214. cuda/cccl/headers/include/thrust/device_free.h +72 -0
  1215. cuda/cccl/headers/include/thrust/device_make_unique.h +56 -0
  1216. cuda/cccl/headers/include/thrust/device_malloc.h +108 -0
  1217. cuda/cccl/headers/include/thrust/device_malloc_allocator.h +190 -0
  1218. cuda/cccl/headers/include/thrust/device_new.h +91 -0
  1219. cuda/cccl/headers/include/thrust/device_new_allocator.h +179 -0
  1220. cuda/cccl/headers/include/thrust/device_ptr.h +202 -0
  1221. cuda/cccl/headers/include/thrust/device_reference.h +986 -0
  1222. cuda/cccl/headers/include/thrust/device_vector.h +574 -0
  1223. cuda/cccl/headers/include/thrust/distance.h +43 -0
  1224. cuda/cccl/headers/include/thrust/equal.h +247 -0
  1225. cuda/cccl/headers/include/thrust/execution_policy.h +384 -0
  1226. cuda/cccl/headers/include/thrust/extrema.h +657 -0
  1227. cuda/cccl/headers/include/thrust/fill.h +201 -0
  1228. cuda/cccl/headers/include/thrust/find.h +382 -0
  1229. cuda/cccl/headers/include/thrust/for_each.h +261 -0
  1230. cuda/cccl/headers/include/thrust/functional.h +396 -0
  1231. cuda/cccl/headers/include/thrust/gather.h +464 -0
  1232. cuda/cccl/headers/include/thrust/generate.h +193 -0
  1233. cuda/cccl/headers/include/thrust/host_vector.h +576 -0
  1234. cuda/cccl/headers/include/thrust/inner_product.h +264 -0
  1235. cuda/cccl/headers/include/thrust/iterator/constant_iterator.h +219 -0
  1236. cuda/cccl/headers/include/thrust/iterator/counting_iterator.h +335 -0
  1237. cuda/cccl/headers/include/thrust/iterator/detail/any_assign.h +48 -0
  1238. cuda/cccl/headers/include/thrust/iterator/detail/any_system_tag.h +43 -0
  1239. cuda/cccl/headers/include/thrust/iterator/detail/device_system_tag.h +38 -0
  1240. cuda/cccl/headers/include/thrust/iterator/detail/host_system_tag.h +38 -0
  1241. cuda/cccl/headers/include/thrust/iterator/detail/iterator_adaptor_base.h +81 -0
  1242. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_system.h +51 -0
  1243. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_traversal.h +62 -0
  1244. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h +57 -0
  1245. cuda/cccl/headers/include/thrust/iterator/detail/iterator_facade_category.h +199 -0
  1246. cuda/cccl/headers/include/thrust/iterator/detail/iterator_traversal_tags.h +50 -0
  1247. cuda/cccl/headers/include/thrust/iterator/detail/minimum_system.h +53 -0
  1248. cuda/cccl/headers/include/thrust/iterator/detail/normal_iterator.h +69 -0
  1249. cuda/cccl/headers/include/thrust/iterator/detail/retag.h +104 -0
  1250. cuda/cccl/headers/include/thrust/iterator/detail/tagged_iterator.h +81 -0
  1251. cuda/cccl/headers/include/thrust/iterator/detail/tuple_of_iterator_references.h +174 -0
  1252. cuda/cccl/headers/include/thrust/iterator/discard_iterator.h +164 -0
  1253. cuda/cccl/headers/include/thrust/iterator/iterator_adaptor.h +251 -0
  1254. cuda/cccl/headers/include/thrust/iterator/iterator_categories.h +215 -0
  1255. cuda/cccl/headers/include/thrust/iterator/iterator_facade.h +660 -0
  1256. cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +245 -0
  1257. cuda/cccl/headers/include/thrust/iterator/offset_iterator.h +192 -0
  1258. cuda/cccl/headers/include/thrust/iterator/permutation_iterator.h +204 -0
  1259. cuda/cccl/headers/include/thrust/iterator/retag.h +74 -0
  1260. cuda/cccl/headers/include/thrust/iterator/reverse_iterator.h +221 -0
  1261. cuda/cccl/headers/include/thrust/iterator/shuffle_iterator.h +184 -0
  1262. cuda/cccl/headers/include/thrust/iterator/strided_iterator.h +152 -0
  1263. cuda/cccl/headers/include/thrust/iterator/tabulate_output_iterator.h +149 -0
  1264. cuda/cccl/headers/include/thrust/iterator/transform_input_output_iterator.h +226 -0
  1265. cuda/cccl/headers/include/thrust/iterator/transform_iterator.h +351 -0
  1266. cuda/cccl/headers/include/thrust/iterator/transform_output_iterator.h +190 -0
  1267. cuda/cccl/headers/include/thrust/iterator/zip_iterator.h +357 -0
  1268. cuda/cccl/headers/include/thrust/logical.h +290 -0
  1269. cuda/cccl/headers/include/thrust/memory.h +395 -0
  1270. cuda/cccl/headers/include/thrust/merge.h +725 -0
  1271. cuda/cccl/headers/include/thrust/mismatch.h +261 -0
  1272. cuda/cccl/headers/include/thrust/mr/allocator.h +229 -0
  1273. cuda/cccl/headers/include/thrust/mr/device_memory_resource.h +41 -0
  1274. cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +526 -0
  1275. cuda/cccl/headers/include/thrust/mr/disjoint_sync_pool.h +118 -0
  1276. cuda/cccl/headers/include/thrust/mr/disjoint_tls_pool.h +68 -0
  1277. cuda/cccl/headers/include/thrust/mr/fancy_pointer_resource.h +67 -0
  1278. cuda/cccl/headers/include/thrust/mr/host_memory_resource.h +38 -0
  1279. cuda/cccl/headers/include/thrust/mr/memory_resource.h +217 -0
  1280. cuda/cccl/headers/include/thrust/mr/new.h +100 -0
  1281. cuda/cccl/headers/include/thrust/mr/polymorphic_adaptor.h +63 -0
  1282. cuda/cccl/headers/include/thrust/mr/pool.h +526 -0
  1283. cuda/cccl/headers/include/thrust/mr/pool_options.h +174 -0
  1284. cuda/cccl/headers/include/thrust/mr/sync_pool.h +114 -0
  1285. cuda/cccl/headers/include/thrust/mr/tls_pool.h +65 -0
  1286. cuda/cccl/headers/include/thrust/mr/universal_memory_resource.h +29 -0
  1287. cuda/cccl/headers/include/thrust/mr/validator.h +56 -0
  1288. cuda/cccl/headers/include/thrust/pair.h +102 -0
  1289. cuda/cccl/headers/include/thrust/partition.h +1383 -0
  1290. cuda/cccl/headers/include/thrust/per_device_resource.h +98 -0
  1291. cuda/cccl/headers/include/thrust/random/detail/discard_block_engine.inl +184 -0
  1292. cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine.inl +155 -0
  1293. cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine_discard.h +104 -0
  1294. cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine.inl +151 -0
  1295. cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h +53 -0
  1296. cuda/cccl/headers/include/thrust/random/detail/mod.h +101 -0
  1297. cuda/cccl/headers/include/thrust/random/detail/normal_distribution.inl +187 -0
  1298. cuda/cccl/headers/include/thrust/random/detail/normal_distribution_base.h +160 -0
  1299. cuda/cccl/headers/include/thrust/random/detail/random_core_access.h +63 -0
  1300. cuda/cccl/headers/include/thrust/random/detail/subtract_with_carry_engine.inl +201 -0
  1301. cuda/cccl/headers/include/thrust/random/detail/uniform_int_distribution.inl +198 -0
  1302. cuda/cccl/headers/include/thrust/random/detail/uniform_real_distribution.inl +198 -0
  1303. cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine.inl +183 -0
  1304. cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine_max.h +217 -0
  1305. cuda/cccl/headers/include/thrust/random/discard_block_engine.h +240 -0
  1306. cuda/cccl/headers/include/thrust/random/linear_congruential_engine.h +289 -0
  1307. cuda/cccl/headers/include/thrust/random/linear_feedback_shift_engine.h +217 -0
  1308. cuda/cccl/headers/include/thrust/random/normal_distribution.h +257 -0
  1309. cuda/cccl/headers/include/thrust/random/subtract_with_carry_engine.h +247 -0
  1310. cuda/cccl/headers/include/thrust/random/uniform_int_distribution.h +261 -0
  1311. cuda/cccl/headers/include/thrust/random/uniform_real_distribution.h +258 -0
  1312. cuda/cccl/headers/include/thrust/random/xor_combine_engine.h +255 -0
  1313. cuda/cccl/headers/include/thrust/random.h +120 -0
  1314. cuda/cccl/headers/include/thrust/reduce.h +1112 -0
  1315. cuda/cccl/headers/include/thrust/remove.h +768 -0
  1316. cuda/cccl/headers/include/thrust/replace.h +827 -0
  1317. cuda/cccl/headers/include/thrust/reverse.h +213 -0
  1318. cuda/cccl/headers/include/thrust/scan.h +1671 -0
  1319. cuda/cccl/headers/include/thrust/scatter.h +446 -0
  1320. cuda/cccl/headers/include/thrust/sequence.h +277 -0
  1321. cuda/cccl/headers/include/thrust/set_operations.h +3026 -0
  1322. cuda/cccl/headers/include/thrust/shuffle.h +182 -0
  1323. cuda/cccl/headers/include/thrust/sort.h +1320 -0
  1324. cuda/cccl/headers/include/thrust/swap.h +147 -0
  1325. cuda/cccl/headers/include/thrust/system/cpp/detail/adjacent_difference.h +30 -0
  1326. cuda/cccl/headers/include/thrust/system/cpp/detail/assign_value.h +30 -0
  1327. cuda/cccl/headers/include/thrust/system/cpp/detail/binary_search.h +32 -0
  1328. cuda/cccl/headers/include/thrust/system/cpp/detail/copy.h +30 -0
  1329. cuda/cccl/headers/include/thrust/system/cpp/detail/copy_if.h +30 -0
  1330. cuda/cccl/headers/include/thrust/system/cpp/detail/count.h +29 -0
  1331. cuda/cccl/headers/include/thrust/system/cpp/detail/equal.h +29 -0
  1332. cuda/cccl/headers/include/thrust/system/cpp/detail/execution_policy.h +90 -0
  1333. cuda/cccl/headers/include/thrust/system/cpp/detail/extrema.h +30 -0
  1334. cuda/cccl/headers/include/thrust/system/cpp/detail/fill.h +29 -0
  1335. cuda/cccl/headers/include/thrust/system/cpp/detail/find.h +30 -0
  1336. cuda/cccl/headers/include/thrust/system/cpp/detail/for_each.h +30 -0
  1337. cuda/cccl/headers/include/thrust/system/cpp/detail/gather.h +29 -0
  1338. cuda/cccl/headers/include/thrust/system/cpp/detail/generate.h +29 -0
  1339. cuda/cccl/headers/include/thrust/system/cpp/detail/get_value.h +30 -0
  1340. cuda/cccl/headers/include/thrust/system/cpp/detail/inner_product.h +29 -0
  1341. cuda/cccl/headers/include/thrust/system/cpp/detail/iter_swap.h +30 -0
  1342. cuda/cccl/headers/include/thrust/system/cpp/detail/logical.h +29 -0
  1343. cuda/cccl/headers/include/thrust/system/cpp/detail/malloc_and_free.h +30 -0
  1344. cuda/cccl/headers/include/thrust/system/cpp/detail/memory.inl +60 -0
  1345. cuda/cccl/headers/include/thrust/system/cpp/detail/merge.h +30 -0
  1346. cuda/cccl/headers/include/thrust/system/cpp/detail/mismatch.h +29 -0
  1347. cuda/cccl/headers/include/thrust/system/cpp/detail/par.h +62 -0
  1348. cuda/cccl/headers/include/thrust/system/cpp/detail/partition.h +30 -0
  1349. cuda/cccl/headers/include/thrust/system/cpp/detail/per_device_resource.h +29 -0
  1350. cuda/cccl/headers/include/thrust/system/cpp/detail/reduce.h +30 -0
  1351. cuda/cccl/headers/include/thrust/system/cpp/detail/reduce_by_key.h +30 -0
  1352. cuda/cccl/headers/include/thrust/system/cpp/detail/remove.h +30 -0
  1353. cuda/cccl/headers/include/thrust/system/cpp/detail/replace.h +29 -0
  1354. cuda/cccl/headers/include/thrust/system/cpp/detail/reverse.h +29 -0
  1355. cuda/cccl/headers/include/thrust/system/cpp/detail/scan.h +30 -0
  1356. cuda/cccl/headers/include/thrust/system/cpp/detail/scan_by_key.h +30 -0
  1357. cuda/cccl/headers/include/thrust/system/cpp/detail/scatter.h +29 -0
  1358. cuda/cccl/headers/include/thrust/system/cpp/detail/sequence.h +29 -0
  1359. cuda/cccl/headers/include/thrust/system/cpp/detail/set_operations.h +30 -0
  1360. cuda/cccl/headers/include/thrust/system/cpp/detail/sort.h +30 -0
  1361. cuda/cccl/headers/include/thrust/system/cpp/detail/swap_ranges.h +29 -0
  1362. cuda/cccl/headers/include/thrust/system/cpp/detail/tabulate.h +29 -0
  1363. cuda/cccl/headers/include/thrust/system/cpp/detail/temporary_buffer.h +29 -0
  1364. cuda/cccl/headers/include/thrust/system/cpp/detail/transform.h +29 -0
  1365. cuda/cccl/headers/include/thrust/system/cpp/detail/transform_reduce.h +29 -0
  1366. cuda/cccl/headers/include/thrust/system/cpp/detail/transform_scan.h +29 -0
  1367. cuda/cccl/headers/include/thrust/system/cpp/detail/uninitialized_copy.h +29 -0
  1368. cuda/cccl/headers/include/thrust/system/cpp/detail/uninitialized_fill.h +29 -0
  1369. cuda/cccl/headers/include/thrust/system/cpp/detail/unique.h +30 -0
  1370. cuda/cccl/headers/include/thrust/system/cpp/detail/unique_by_key.h +30 -0
  1371. cuda/cccl/headers/include/thrust/system/cpp/detail/vector.inl +130 -0
  1372. cuda/cccl/headers/include/thrust/system/cpp/execution_policy.h +161 -0
  1373. cuda/cccl/headers/include/thrust/system/cpp/memory.h +109 -0
  1374. cuda/cccl/headers/include/thrust/system/cpp/memory_resource.h +75 -0
  1375. cuda/cccl/headers/include/thrust/system/cpp/pointer.h +119 -0
  1376. cuda/cccl/headers/include/thrust/system/cpp/vector.h +99 -0
  1377. cuda/cccl/headers/include/thrust/system/cuda/config.h +123 -0
  1378. cuda/cccl/headers/include/thrust/system/cuda/detail/adjacent_difference.h +219 -0
  1379. cuda/cccl/headers/include/thrust/system/cuda/detail/assign_value.h +124 -0
  1380. cuda/cccl/headers/include/thrust/system/cuda/detail/binary_search.h +29 -0
  1381. cuda/cccl/headers/include/thrust/system/cuda/detail/cdp_dispatch.h +72 -0
  1382. cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +129 -0
  1383. cuda/cccl/headers/include/thrust/system/cuda/detail/copy_if.h +255 -0
  1384. cuda/cccl/headers/include/thrust/system/cuda/detail/core/agent_launcher.h +289 -0
  1385. cuda/cccl/headers/include/thrust/system/cuda/detail/core/load_iterator.h +58 -0
  1386. cuda/cccl/headers/include/thrust/system/cuda/detail/core/make_load_iterator.h +60 -0
  1387. cuda/cccl/headers/include/thrust/system/cuda/detail/core/triple_chevron_launch.h +191 -0
  1388. cuda/cccl/headers/include/thrust/system/cuda/detail/core/util.h +630 -0
  1389. cuda/cccl/headers/include/thrust/system/cuda/detail/count.h +75 -0
  1390. cuda/cccl/headers/include/thrust/system/cuda/detail/cross_system.h +243 -0
  1391. cuda/cccl/headers/include/thrust/system/cuda/detail/dispatch.h +210 -0
  1392. cuda/cccl/headers/include/thrust/system/cuda/detail/equal.h +64 -0
  1393. cuda/cccl/headers/include/thrust/system/cuda/detail/error.inl +96 -0
  1394. cuda/cccl/headers/include/thrust/system/cuda/detail/execution_policy.h +113 -0
  1395. cuda/cccl/headers/include/thrust/system/cuda/detail/extrema.h +476 -0
  1396. cuda/cccl/headers/include/thrust/system/cuda/detail/fill.h +82 -0
  1397. cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +272 -0
  1398. cuda/cccl/headers/include/thrust/system/cuda/detail/for_each.h +83 -0
  1399. cuda/cccl/headers/include/thrust/system/cuda/detail/gather.h +91 -0
  1400. cuda/cccl/headers/include/thrust/system/cuda/detail/generate.h +85 -0
  1401. cuda/cccl/headers/include/thrust/system/cuda/detail/get_value.h +65 -0
  1402. cuda/cccl/headers/include/thrust/system/cuda/detail/inner_product.h +75 -0
  1403. cuda/cccl/headers/include/thrust/system/cuda/detail/internal/copy_cross_system.h +204 -0
  1404. cuda/cccl/headers/include/thrust/system/cuda/detail/internal/copy_device_to_device.h +98 -0
  1405. cuda/cccl/headers/include/thrust/system/cuda/detail/iter_swap.h +69 -0
  1406. cuda/cccl/headers/include/thrust/system/cuda/detail/logical.h +29 -0
  1407. cuda/cccl/headers/include/thrust/system/cuda/detail/make_unsigned_special.h +61 -0
  1408. cuda/cccl/headers/include/thrust/system/cuda/detail/malloc_and_free.h +121 -0
  1409. cuda/cccl/headers/include/thrust/system/cuda/detail/memory.inl +57 -0
  1410. cuda/cccl/headers/include/thrust/system/cuda/detail/merge.h +228 -0
  1411. cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +217 -0
  1412. cuda/cccl/headers/include/thrust/system/cuda/detail/par.h +237 -0
  1413. cuda/cccl/headers/include/thrust/system/cuda/detail/par_to_seq.h +95 -0
  1414. cuda/cccl/headers/include/thrust/system/cuda/detail/parallel_for.h +81 -0
  1415. cuda/cccl/headers/include/thrust/system/cuda/detail/partition.h +405 -0
  1416. cuda/cccl/headers/include/thrust/system/cuda/detail/per_device_resource.h +72 -0
  1417. cuda/cccl/headers/include/thrust/system/cuda/detail/reduce.h +961 -0
  1418. cuda/cccl/headers/include/thrust/system/cuda/detail/reduce_by_key.h +1000 -0
  1419. cuda/cccl/headers/include/thrust/system/cuda/detail/remove.h +107 -0
  1420. cuda/cccl/headers/include/thrust/system/cuda/detail/replace.h +164 -0
  1421. cuda/cccl/headers/include/thrust/system/cuda/detail/reverse.h +88 -0
  1422. cuda/cccl/headers/include/thrust/system/cuda/detail/scan.h +342 -0
  1423. cuda/cccl/headers/include/thrust/system/cuda/detail/scan_by_key.h +415 -0
  1424. cuda/cccl/headers/include/thrust/system/cuda/detail/scatter.h +79 -0
  1425. cuda/cccl/headers/include/thrust/system/cuda/detail/sequence.h +29 -0
  1426. cuda/cccl/headers/include/thrust/system/cuda/detail/set_operations.h +1736 -0
  1427. cuda/cccl/headers/include/thrust/system/cuda/detail/sort.h +482 -0
  1428. cuda/cccl/headers/include/thrust/system/cuda/detail/swap_ranges.h +75 -0
  1429. cuda/cccl/headers/include/thrust/system/cuda/detail/tabulate.h +75 -0
  1430. cuda/cccl/headers/include/thrust/system/cuda/detail/temporary_buffer.h +132 -0
  1431. cuda/cccl/headers/include/thrust/system/cuda/detail/terminate.h +53 -0
  1432. cuda/cccl/headers/include/thrust/system/cuda/detail/transform.h +403 -0
  1433. cuda/cccl/headers/include/thrust/system/cuda/detail/transform_reduce.h +143 -0
  1434. cuda/cccl/headers/include/thrust/system/cuda/detail/transform_scan.h +119 -0
  1435. cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_copy.h +94 -0
  1436. cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_fill.h +91 -0
  1437. cuda/cccl/headers/include/thrust/system/cuda/detail/unique.h +648 -0
  1438. cuda/cccl/headers/include/thrust/system/cuda/detail/unique_by_key.h +311 -0
  1439. cuda/cccl/headers/include/thrust/system/cuda/detail/util.h +251 -0
  1440. cuda/cccl/headers/include/thrust/system/cuda/error.h +175 -0
  1441. cuda/cccl/headers/include/thrust/system/cuda/execution_policy.h +39 -0
  1442. cuda/cccl/headers/include/thrust/system/cuda/memory.h +122 -0
  1443. cuda/cccl/headers/include/thrust/system/cuda/memory_resource.h +122 -0
  1444. cuda/cccl/headers/include/thrust/system/cuda/pointer.h +140 -0
  1445. cuda/cccl/headers/include/thrust/system/cuda/vector.h +108 -0
  1446. cuda/cccl/headers/include/thrust/system/detail/adl/adjacent_difference.h +51 -0
  1447. cuda/cccl/headers/include/thrust/system/detail/adl/assign_value.h +51 -0
  1448. cuda/cccl/headers/include/thrust/system/detail/adl/binary_search.h +51 -0
  1449. cuda/cccl/headers/include/thrust/system/detail/adl/copy.h +51 -0
  1450. cuda/cccl/headers/include/thrust/system/detail/adl/copy_if.h +52 -0
  1451. cuda/cccl/headers/include/thrust/system/detail/adl/count.h +51 -0
  1452. cuda/cccl/headers/include/thrust/system/detail/adl/equal.h +51 -0
  1453. cuda/cccl/headers/include/thrust/system/detail/adl/extrema.h +51 -0
  1454. cuda/cccl/headers/include/thrust/system/detail/adl/fill.h +51 -0
  1455. cuda/cccl/headers/include/thrust/system/detail/adl/find.h +51 -0
  1456. cuda/cccl/headers/include/thrust/system/detail/adl/for_each.h +51 -0
  1457. cuda/cccl/headers/include/thrust/system/detail/adl/gather.h +51 -0
  1458. cuda/cccl/headers/include/thrust/system/detail/adl/generate.h +51 -0
  1459. cuda/cccl/headers/include/thrust/system/detail/adl/get_value.h +51 -0
  1460. cuda/cccl/headers/include/thrust/system/detail/adl/inner_product.h +51 -0
  1461. cuda/cccl/headers/include/thrust/system/detail/adl/iter_swap.h +51 -0
  1462. cuda/cccl/headers/include/thrust/system/detail/adl/logical.h +51 -0
  1463. cuda/cccl/headers/include/thrust/system/detail/adl/malloc_and_free.h +51 -0
  1464. cuda/cccl/headers/include/thrust/system/detail/adl/merge.h +51 -0
  1465. cuda/cccl/headers/include/thrust/system/detail/adl/mismatch.h +51 -0
  1466. cuda/cccl/headers/include/thrust/system/detail/adl/partition.h +51 -0
  1467. cuda/cccl/headers/include/thrust/system/detail/adl/per_device_resource.h +48 -0
  1468. cuda/cccl/headers/include/thrust/system/detail/adl/reduce.h +51 -0
  1469. cuda/cccl/headers/include/thrust/system/detail/adl/reduce_by_key.h +51 -0
  1470. cuda/cccl/headers/include/thrust/system/detail/adl/remove.h +51 -0
  1471. cuda/cccl/headers/include/thrust/system/detail/adl/replace.h +51 -0
  1472. cuda/cccl/headers/include/thrust/system/detail/adl/reverse.h +51 -0
  1473. cuda/cccl/headers/include/thrust/system/detail/adl/scan.h +51 -0
  1474. cuda/cccl/headers/include/thrust/system/detail/adl/scan_by_key.h +51 -0
  1475. cuda/cccl/headers/include/thrust/system/detail/adl/scatter.h +51 -0
  1476. cuda/cccl/headers/include/thrust/system/detail/adl/sequence.h +51 -0
  1477. cuda/cccl/headers/include/thrust/system/detail/adl/set_operations.h +51 -0
  1478. cuda/cccl/headers/include/thrust/system/detail/adl/sort.h +51 -0
  1479. cuda/cccl/headers/include/thrust/system/detail/adl/swap_ranges.h +51 -0
  1480. cuda/cccl/headers/include/thrust/system/detail/adl/tabulate.h +51 -0
  1481. cuda/cccl/headers/include/thrust/system/detail/adl/temporary_buffer.h +51 -0
  1482. cuda/cccl/headers/include/thrust/system/detail/adl/transform.h +51 -0
  1483. cuda/cccl/headers/include/thrust/system/detail/adl/transform_reduce.h +51 -0
  1484. cuda/cccl/headers/include/thrust/system/detail/adl/transform_scan.h +51 -0
  1485. cuda/cccl/headers/include/thrust/system/detail/adl/uninitialized_copy.h +51 -0
  1486. cuda/cccl/headers/include/thrust/system/detail/adl/uninitialized_fill.h +51 -0
  1487. cuda/cccl/headers/include/thrust/system/detail/adl/unique.h +51 -0
  1488. cuda/cccl/headers/include/thrust/system/detail/adl/unique_by_key.h +51 -0
  1489. cuda/cccl/headers/include/thrust/system/detail/bad_alloc.h +64 -0
  1490. cuda/cccl/headers/include/thrust/system/detail/errno.h +125 -0
  1491. cuda/cccl/headers/include/thrust/system/detail/error_category.inl +302 -0
  1492. cuda/cccl/headers/include/thrust/system/detail/error_code.inl +173 -0
  1493. cuda/cccl/headers/include/thrust/system/detail/error_condition.inl +121 -0
  1494. cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.h +59 -0
  1495. cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.inl +85 -0
  1496. cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.h +167 -0
  1497. cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.inl +391 -0
  1498. cuda/cccl/headers/include/thrust/system/detail/generic/copy.h +51 -0
  1499. cuda/cccl/headers/include/thrust/system/detail/generic/copy.inl +70 -0
  1500. cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.h +64 -0
  1501. cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.inl +152 -0
  1502. cuda/cccl/headers/include/thrust/system/detail/generic/count.h +54 -0
  1503. cuda/cccl/headers/include/thrust/system/detail/generic/count.inl +90 -0
  1504. cuda/cccl/headers/include/thrust/system/detail/generic/equal.h +55 -0
  1505. cuda/cccl/headers/include/thrust/system/detail/generic/equal.inl +66 -0
  1506. cuda/cccl/headers/include/thrust/system/detail/generic/extrema.h +72 -0
  1507. cuda/cccl/headers/include/thrust/system/detail/generic/extrema.inl +258 -0
  1508. cuda/cccl/headers/include/thrust/system/detail/generic/fill.h +60 -0
  1509. cuda/cccl/headers/include/thrust/system/detail/generic/find.h +55 -0
  1510. cuda/cccl/headers/include/thrust/system/detail/generic/find.inl +143 -0
  1511. cuda/cccl/headers/include/thrust/system/detail/generic/for_each.h +64 -0
  1512. cuda/cccl/headers/include/thrust/system/detail/generic/gather.h +79 -0
  1513. cuda/cccl/headers/include/thrust/system/detail/generic/gather.inl +102 -0
  1514. cuda/cccl/headers/include/thrust/system/detail/generic/generate.h +51 -0
  1515. cuda/cccl/headers/include/thrust/system/detail/generic/generate.inl +63 -0
  1516. cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.h +66 -0
  1517. cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.inl +78 -0
  1518. cuda/cccl/headers/include/thrust/system/detail/generic/logical.h +65 -0
  1519. cuda/cccl/headers/include/thrust/system/detail/generic/memory.h +70 -0
  1520. cuda/cccl/headers/include/thrust/system/detail/generic/memory.inl +83 -0
  1521. cuda/cccl/headers/include/thrust/system/detail/generic/merge.h +105 -0
  1522. cuda/cccl/headers/include/thrust/system/detail/generic/merge.inl +154 -0
  1523. cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.h +55 -0
  1524. cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.inl +74 -0
  1525. cuda/cccl/headers/include/thrust/system/detail/generic/partition.h +135 -0
  1526. cuda/cccl/headers/include/thrust/system/detail/generic/partition.inl +213 -0
  1527. cuda/cccl/headers/include/thrust/system/detail/generic/per_device_resource.h +49 -0
  1528. cuda/cccl/headers/include/thrust/system/detail/generic/reduce.h +77 -0
  1529. cuda/cccl/headers/include/thrust/system/detail/generic/reduce.inl +106 -0
  1530. cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.h +89 -0
  1531. cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.inl +192 -0
  1532. cuda/cccl/headers/include/thrust/system/detail/generic/remove.h +92 -0
  1533. cuda/cccl/headers/include/thrust/system/detail/generic/remove.inl +127 -0
  1534. cuda/cccl/headers/include/thrust/system/detail/generic/replace.h +101 -0
  1535. cuda/cccl/headers/include/thrust/system/detail/generic/replace.inl +181 -0
  1536. cuda/cccl/headers/include/thrust/system/detail/generic/reverse.h +54 -0
  1537. cuda/cccl/headers/include/thrust/system/detail/generic/reverse.inl +72 -0
  1538. cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.h +78 -0
  1539. cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.inl +141 -0
  1540. cuda/cccl/headers/include/thrust/system/detail/generic/scan.h +78 -0
  1541. cuda/cccl/headers/include/thrust/system/detail/generic/scan.inl +91 -0
  1542. cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.h +132 -0
  1543. cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.inl +238 -0
  1544. cuda/cccl/headers/include/thrust/system/detail/generic/scatter.h +79 -0
  1545. cuda/cccl/headers/include/thrust/system/detail/generic/scatter.inl +91 -0
  1546. cuda/cccl/headers/include/thrust/system/detail/generic/select_system.h +96 -0
  1547. cuda/cccl/headers/include/thrust/system/detail/generic/sequence.h +55 -0
  1548. cuda/cccl/headers/include/thrust/system/detail/generic/sequence.inl +95 -0
  1549. cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.h +288 -0
  1550. cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.inl +482 -0
  1551. cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.h +60 -0
  1552. cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.inl +131 -0
  1553. cuda/cccl/headers/include/thrust/system/detail/generic/sort.h +119 -0
  1554. cuda/cccl/headers/include/thrust/system/detail/generic/sort.inl +181 -0
  1555. cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.h +50 -0
  1556. cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.inl +82 -0
  1557. cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.h +47 -0
  1558. cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.inl +60 -0
  1559. cuda/cccl/headers/include/thrust/system/detail/generic/tag.h +53 -0
  1560. cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.h +60 -0
  1561. cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.inl +88 -0
  1562. cuda/cccl/headers/include/thrust/system/detail/generic/transform.h +109 -0
  1563. cuda/cccl/headers/include/thrust/system/detail/generic/transform.inl +185 -0
  1564. cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.h +56 -0
  1565. cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.inl +62 -0
  1566. cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.h +86 -0
  1567. cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.inl +119 -0
  1568. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.h +51 -0
  1569. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.inl +172 -0
  1570. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.h +51 -0
  1571. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.inl +121 -0
  1572. cuda/cccl/headers/include/thrust/system/detail/generic/unique.h +77 -0
  1573. cuda/cccl/headers/include/thrust/system/detail/generic/unique.inl +119 -0
  1574. cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.h +87 -0
  1575. cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.inl +132 -0
  1576. cuda/cccl/headers/include/thrust/system/detail/internal/decompose.h +123 -0
  1577. cuda/cccl/headers/include/thrust/system/detail/sequential/adjacent_difference.h +76 -0
  1578. cuda/cccl/headers/include/thrust/system/detail/sequential/assign_value.h +48 -0
  1579. cuda/cccl/headers/include/thrust/system/detail/sequential/binary_search.h +142 -0
  1580. cuda/cccl/headers/include/thrust/system/detail/sequential/copy.h +55 -0
  1581. cuda/cccl/headers/include/thrust/system/detail/sequential/copy.inl +125 -0
  1582. cuda/cccl/headers/include/thrust/system/detail/sequential/copy_backward.h +55 -0
  1583. cuda/cccl/headers/include/thrust/system/detail/sequential/copy_if.h +77 -0
  1584. cuda/cccl/headers/include/thrust/system/detail/sequential/count.h +29 -0
  1585. cuda/cccl/headers/include/thrust/system/detail/sequential/equal.h +29 -0
  1586. cuda/cccl/headers/include/thrust/system/detail/sequential/execution_policy.h +78 -0
  1587. cuda/cccl/headers/include/thrust/system/detail/sequential/extrema.h +116 -0
  1588. cuda/cccl/headers/include/thrust/system/detail/sequential/fill.h +29 -0
  1589. cuda/cccl/headers/include/thrust/system/detail/sequential/find.h +68 -0
  1590. cuda/cccl/headers/include/thrust/system/detail/sequential/for_each.h +80 -0
  1591. cuda/cccl/headers/include/thrust/system/detail/sequential/gather.h +29 -0
  1592. cuda/cccl/headers/include/thrust/system/detail/sequential/general_copy.h +129 -0
  1593. cuda/cccl/headers/include/thrust/system/detail/sequential/generate.h +29 -0
  1594. cuda/cccl/headers/include/thrust/system/detail/sequential/get_value.h +49 -0
  1595. cuda/cccl/headers/include/thrust/system/detail/sequential/inner_product.h +29 -0
  1596. cuda/cccl/headers/include/thrust/system/detail/sequential/insertion_sort.h +147 -0
  1597. cuda/cccl/headers/include/thrust/system/detail/sequential/iter_swap.h +51 -0
  1598. cuda/cccl/headers/include/thrust/system/detail/sequential/logical.h +29 -0
  1599. cuda/cccl/headers/include/thrust/system/detail/sequential/malloc_and_free.h +56 -0
  1600. cuda/cccl/headers/include/thrust/system/detail/sequential/merge.h +81 -0
  1601. cuda/cccl/headers/include/thrust/system/detail/sequential/merge.inl +151 -0
  1602. cuda/cccl/headers/include/thrust/system/detail/sequential/mismatch.h +29 -0
  1603. cuda/cccl/headers/include/thrust/system/detail/sequential/partition.h +309 -0
  1604. cuda/cccl/headers/include/thrust/system/detail/sequential/per_device_resource.h +29 -0
  1605. cuda/cccl/headers/include/thrust/system/detail/sequential/reduce.h +70 -0
  1606. cuda/cccl/headers/include/thrust/system/detail/sequential/reduce_by_key.h +104 -0
  1607. cuda/cccl/headers/include/thrust/system/detail/sequential/remove.h +185 -0
  1608. cuda/cccl/headers/include/thrust/system/detail/sequential/replace.h +29 -0
  1609. cuda/cccl/headers/include/thrust/system/detail/sequential/reverse.h +29 -0
  1610. cuda/cccl/headers/include/thrust/system/detail/sequential/scan.h +160 -0
  1611. cuda/cccl/headers/include/thrust/system/detail/sequential/scan_by_key.h +151 -0
  1612. cuda/cccl/headers/include/thrust/system/detail/sequential/scatter.h +29 -0
  1613. cuda/cccl/headers/include/thrust/system/detail/sequential/sequence.h +29 -0
  1614. cuda/cccl/headers/include/thrust/system/detail/sequential/set_operations.h +212 -0
  1615. cuda/cccl/headers/include/thrust/system/detail/sequential/sort.h +65 -0
  1616. cuda/cccl/headers/include/thrust/system/detail/sequential/sort.inl +187 -0
  1617. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.h +61 -0
  1618. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.inl +362 -0
  1619. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.h +54 -0
  1620. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.inl +130 -0
  1621. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.h +54 -0
  1622. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.inl +592 -0
  1623. cuda/cccl/headers/include/thrust/system/detail/sequential/swap_ranges.h +29 -0
  1624. cuda/cccl/headers/include/thrust/system/detail/sequential/tabulate.h +29 -0
  1625. cuda/cccl/headers/include/thrust/system/detail/sequential/temporary_buffer.h +29 -0
  1626. cuda/cccl/headers/include/thrust/system/detail/sequential/transform.h +29 -0
  1627. cuda/cccl/headers/include/thrust/system/detail/sequential/transform_reduce.h +29 -0
  1628. cuda/cccl/headers/include/thrust/system/detail/sequential/transform_scan.h +29 -0
  1629. cuda/cccl/headers/include/thrust/system/detail/sequential/trivial_copy.h +64 -0
  1630. cuda/cccl/headers/include/thrust/system/detail/sequential/uninitialized_copy.h +29 -0
  1631. cuda/cccl/headers/include/thrust/system/detail/sequential/uninitialized_fill.h +29 -0
  1632. cuda/cccl/headers/include/thrust/system/detail/sequential/unique.h +121 -0
  1633. cuda/cccl/headers/include/thrust/system/detail/sequential/unique_by_key.h +112 -0
  1634. cuda/cccl/headers/include/thrust/system/detail/system_error.inl +108 -0
  1635. cuda/cccl/headers/include/thrust/system/error_code.h +512 -0
  1636. cuda/cccl/headers/include/thrust/system/omp/detail/adjacent_difference.h +54 -0
  1637. cuda/cccl/headers/include/thrust/system/omp/detail/assign_value.h +30 -0
  1638. cuda/cccl/headers/include/thrust/system/omp/detail/binary_search.h +77 -0
  1639. cuda/cccl/headers/include/thrust/system/omp/detail/copy.h +50 -0
  1640. cuda/cccl/headers/include/thrust/system/omp/detail/copy.inl +74 -0
  1641. cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.h +56 -0
  1642. cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.inl +59 -0
  1643. cuda/cccl/headers/include/thrust/system/omp/detail/count.h +30 -0
  1644. cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.h +50 -0
  1645. cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.inl +65 -0
  1646. cuda/cccl/headers/include/thrust/system/omp/detail/equal.h +30 -0
  1647. cuda/cccl/headers/include/thrust/system/omp/detail/execution_policy.h +113 -0
  1648. cuda/cccl/headers/include/thrust/system/omp/detail/extrema.h +66 -0
  1649. cuda/cccl/headers/include/thrust/system/omp/detail/fill.h +30 -0
  1650. cuda/cccl/headers/include/thrust/system/omp/detail/find.h +53 -0
  1651. cuda/cccl/headers/include/thrust/system/omp/detail/for_each.h +56 -0
  1652. cuda/cccl/headers/include/thrust/system/omp/detail/for_each.inl +87 -0
  1653. cuda/cccl/headers/include/thrust/system/omp/detail/gather.h +30 -0
  1654. cuda/cccl/headers/include/thrust/system/omp/detail/generate.h +30 -0
  1655. cuda/cccl/headers/include/thrust/system/omp/detail/get_value.h +30 -0
  1656. cuda/cccl/headers/include/thrust/system/omp/detail/inner_product.h +30 -0
  1657. cuda/cccl/headers/include/thrust/system/omp/detail/iter_swap.h +30 -0
  1658. cuda/cccl/headers/include/thrust/system/omp/detail/logical.h +30 -0
  1659. cuda/cccl/headers/include/thrust/system/omp/detail/malloc_and_free.h +30 -0
  1660. cuda/cccl/headers/include/thrust/system/omp/detail/memory.inl +93 -0
  1661. cuda/cccl/headers/include/thrust/system/omp/detail/merge.h +30 -0
  1662. cuda/cccl/headers/include/thrust/system/omp/detail/mismatch.h +30 -0
  1663. cuda/cccl/headers/include/thrust/system/omp/detail/par.h +62 -0
  1664. cuda/cccl/headers/include/thrust/system/omp/detail/partition.h +88 -0
  1665. cuda/cccl/headers/include/thrust/system/omp/detail/partition.inl +102 -0
  1666. cuda/cccl/headers/include/thrust/system/omp/detail/per_device_resource.h +29 -0
  1667. cuda/cccl/headers/include/thrust/system/omp/detail/pragma_omp.h +54 -0
  1668. cuda/cccl/headers/include/thrust/system/omp/detail/reduce.h +54 -0
  1669. cuda/cccl/headers/include/thrust/system/omp/detail/reduce.inl +78 -0
  1670. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.h +64 -0
  1671. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.inl +65 -0
  1672. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.h +59 -0
  1673. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.inl +103 -0
  1674. cuda/cccl/headers/include/thrust/system/omp/detail/remove.h +72 -0
  1675. cuda/cccl/headers/include/thrust/system/omp/detail/remove.inl +87 -0
  1676. cuda/cccl/headers/include/thrust/system/omp/detail/replace.h +30 -0
  1677. cuda/cccl/headers/include/thrust/system/omp/detail/reverse.h +30 -0
  1678. cuda/cccl/headers/include/thrust/system/omp/detail/scan.h +30 -0
  1679. cuda/cccl/headers/include/thrust/system/omp/detail/scan_by_key.h +30 -0
  1680. cuda/cccl/headers/include/thrust/system/omp/detail/scatter.h +30 -0
  1681. cuda/cccl/headers/include/thrust/system/omp/detail/sequence.h +30 -0
  1682. cuda/cccl/headers/include/thrust/system/omp/detail/set_operations.h +30 -0
  1683. cuda/cccl/headers/include/thrust/system/omp/detail/sort.h +60 -0
  1684. cuda/cccl/headers/include/thrust/system/omp/detail/sort.inl +259 -0
  1685. cuda/cccl/headers/include/thrust/system/omp/detail/swap_ranges.h +30 -0
  1686. cuda/cccl/headers/include/thrust/system/omp/detail/tabulate.h +30 -0
  1687. cuda/cccl/headers/include/thrust/system/omp/detail/temporary_buffer.h +29 -0
  1688. cuda/cccl/headers/include/thrust/system/omp/detail/transform.h +30 -0
  1689. cuda/cccl/headers/include/thrust/system/omp/detail/transform_reduce.h +30 -0
  1690. cuda/cccl/headers/include/thrust/system/omp/detail/transform_scan.h +30 -0
  1691. cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_copy.h +30 -0
  1692. cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_fill.h +30 -0
  1693. cuda/cccl/headers/include/thrust/system/omp/detail/unique.h +60 -0
  1694. cuda/cccl/headers/include/thrust/system/omp/detail/unique.inl +71 -0
  1695. cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.h +67 -0
  1696. cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.inl +75 -0
  1697. cuda/cccl/headers/include/thrust/system/omp/execution_policy.h +160 -0
  1698. cuda/cccl/headers/include/thrust/system/omp/memory.h +111 -0
  1699. cuda/cccl/headers/include/thrust/system/omp/memory_resource.h +75 -0
  1700. cuda/cccl/headers/include/thrust/system/omp/pointer.h +120 -0
  1701. cuda/cccl/headers/include/thrust/system/omp/vector.h +99 -0
  1702. cuda/cccl/headers/include/thrust/system/system_error.h +184 -0
  1703. cuda/cccl/headers/include/thrust/system/tbb/detail/adjacent_difference.h +54 -0
  1704. cuda/cccl/headers/include/thrust/system/tbb/detail/assign_value.h +30 -0
  1705. cuda/cccl/headers/include/thrust/system/tbb/detail/binary_search.h +30 -0
  1706. cuda/cccl/headers/include/thrust/system/tbb/detail/copy.h +50 -0
  1707. cuda/cccl/headers/include/thrust/system/tbb/detail/copy.inl +73 -0
  1708. cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.h +47 -0
  1709. cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.inl +136 -0
  1710. cuda/cccl/headers/include/thrust/system/tbb/detail/count.h +30 -0
  1711. cuda/cccl/headers/include/thrust/system/tbb/detail/equal.h +30 -0
  1712. cuda/cccl/headers/include/thrust/system/tbb/detail/execution_policy.h +92 -0
  1713. cuda/cccl/headers/include/thrust/system/tbb/detail/extrema.h +66 -0
  1714. cuda/cccl/headers/include/thrust/system/tbb/detail/fill.h +30 -0
  1715. cuda/cccl/headers/include/thrust/system/tbb/detail/find.h +49 -0
  1716. cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.h +51 -0
  1717. cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.inl +91 -0
  1718. cuda/cccl/headers/include/thrust/system/tbb/detail/gather.h +30 -0
  1719. cuda/cccl/headers/include/thrust/system/tbb/detail/generate.h +30 -0
  1720. cuda/cccl/headers/include/thrust/system/tbb/detail/get_value.h +30 -0
  1721. cuda/cccl/headers/include/thrust/system/tbb/detail/inner_product.h +30 -0
  1722. cuda/cccl/headers/include/thrust/system/tbb/detail/iter_swap.h +30 -0
  1723. cuda/cccl/headers/include/thrust/system/tbb/detail/logical.h +30 -0
  1724. cuda/cccl/headers/include/thrust/system/tbb/detail/malloc_and_free.h +30 -0
  1725. cuda/cccl/headers/include/thrust/system/tbb/detail/memory.inl +94 -0
  1726. cuda/cccl/headers/include/thrust/system/tbb/detail/merge.h +77 -0
  1727. cuda/cccl/headers/include/thrust/system/tbb/detail/merge.inl +327 -0
  1728. cuda/cccl/headers/include/thrust/system/tbb/detail/mismatch.h +30 -0
  1729. cuda/cccl/headers/include/thrust/system/tbb/detail/par.h +62 -0
  1730. cuda/cccl/headers/include/thrust/system/tbb/detail/partition.h +84 -0
  1731. cuda/cccl/headers/include/thrust/system/tbb/detail/partition.inl +98 -0
  1732. cuda/cccl/headers/include/thrust/system/tbb/detail/per_device_resource.h +29 -0
  1733. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.h +54 -0
  1734. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.inl +137 -0
  1735. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.h +61 -0
  1736. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.inl +400 -0
  1737. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_intervals.h +140 -0
  1738. cuda/cccl/headers/include/thrust/system/tbb/detail/remove.h +76 -0
  1739. cuda/cccl/headers/include/thrust/system/tbb/detail/remove.inl +87 -0
  1740. cuda/cccl/headers/include/thrust/system/tbb/detail/replace.h +30 -0
  1741. cuda/cccl/headers/include/thrust/system/tbb/detail/reverse.h +30 -0
  1742. cuda/cccl/headers/include/thrust/system/tbb/detail/scan.h +59 -0
  1743. cuda/cccl/headers/include/thrust/system/tbb/detail/scan.inl +312 -0
  1744. cuda/cccl/headers/include/thrust/system/tbb/detail/scan_by_key.h +30 -0
  1745. cuda/cccl/headers/include/thrust/system/tbb/detail/scatter.h +30 -0
  1746. cuda/cccl/headers/include/thrust/system/tbb/detail/sequence.h +30 -0
  1747. cuda/cccl/headers/include/thrust/system/tbb/detail/set_operations.h +30 -0
  1748. cuda/cccl/headers/include/thrust/system/tbb/detail/sort.h +60 -0
  1749. cuda/cccl/headers/include/thrust/system/tbb/detail/sort.inl +295 -0
  1750. cuda/cccl/headers/include/thrust/system/tbb/detail/swap_ranges.h +30 -0
  1751. cuda/cccl/headers/include/thrust/system/tbb/detail/tabulate.h +30 -0
  1752. cuda/cccl/headers/include/thrust/system/tbb/detail/temporary_buffer.h +29 -0
  1753. cuda/cccl/headers/include/thrust/system/tbb/detail/transform.h +30 -0
  1754. cuda/cccl/headers/include/thrust/system/tbb/detail/transform_reduce.h +30 -0
  1755. cuda/cccl/headers/include/thrust/system/tbb/detail/transform_scan.h +30 -0
  1756. cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_copy.h +30 -0
  1757. cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_fill.h +30 -0
  1758. cuda/cccl/headers/include/thrust/system/tbb/detail/unique.h +60 -0
  1759. cuda/cccl/headers/include/thrust/system/tbb/detail/unique.inl +71 -0
  1760. cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.h +67 -0
  1761. cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.inl +75 -0
  1762. cuda/cccl/headers/include/thrust/system/tbb/execution_policy.h +160 -0
  1763. cuda/cccl/headers/include/thrust/system/tbb/memory.h +111 -0
  1764. cuda/cccl/headers/include/thrust/system/tbb/memory_resource.h +75 -0
  1765. cuda/cccl/headers/include/thrust/system/tbb/pointer.h +120 -0
  1766. cuda/cccl/headers/include/thrust/system/tbb/vector.h +99 -0
  1767. cuda/cccl/headers/include/thrust/system_error.h +57 -0
  1768. cuda/cccl/headers/include/thrust/tabulate.h +125 -0
  1769. cuda/cccl/headers/include/thrust/transform.h +903 -0
  1770. cuda/cccl/headers/include/thrust/transform_reduce.h +190 -0
  1771. cuda/cccl/headers/include/thrust/transform_scan.h +442 -0
  1772. cuda/cccl/headers/include/thrust/tuple.h +142 -0
  1773. cuda/cccl/headers/include/thrust/type_traits/integer_sequence.h +261 -0
  1774. cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +182 -0
  1775. cuda/cccl/headers/include/thrust/type_traits/is_execution_policy.h +65 -0
  1776. cuda/cccl/headers/include/thrust/type_traits/is_operator_less_or_greater_function_object.h +184 -0
  1777. cuda/cccl/headers/include/thrust/type_traits/is_operator_plus_function_object.h +116 -0
  1778. cuda/cccl/headers/include/thrust/type_traits/is_trivially_relocatable.h +306 -0
  1779. cuda/cccl/headers/include/thrust/type_traits/logical_metafunctions.h +42 -0
  1780. cuda/cccl/headers/include/thrust/type_traits/unwrap_contiguous_iterator.h +93 -0
  1781. cuda/cccl/headers/include/thrust/uninitialized_copy.h +300 -0
  1782. cuda/cccl/headers/include/thrust/uninitialized_fill.h +268 -0
  1783. cuda/cccl/headers/include/thrust/unique.h +1090 -0
  1784. cuda/cccl/headers/include/thrust/universal_allocator.h +90 -0
  1785. cuda/cccl/headers/include/thrust/universal_ptr.h +34 -0
  1786. cuda/cccl/headers/include/thrust/universal_vector.h +71 -0
  1787. cuda/cccl/headers/include/thrust/version.h +93 -0
  1788. cuda/cccl/headers/include/thrust/zip_function.h +176 -0
  1789. cuda/cccl/headers/include_paths.py +72 -0
  1790. cuda/cccl/parallel/__init__.py +3 -0
  1791. cuda/cccl/parallel/experimental/__init__.py +3 -0
  1792. cuda/cccl/parallel/experimental/_bindings.py +24 -0
  1793. cuda/cccl/parallel/experimental/_bindings.pyi +388 -0
  1794. cuda/cccl/parallel/experimental/_bindings_impl.cpython-312-x86_64-linux-gnu.so +0 -0
  1795. cuda/cccl/parallel/experimental/_bindings_impl.pyx +2158 -0
  1796. cuda/cccl/parallel/experimental/_caching.py +71 -0
  1797. cuda/cccl/parallel/experimental/_cccl_interop.py +371 -0
  1798. cuda/cccl/parallel/experimental/_utils/__init__.py +0 -0
  1799. cuda/cccl/parallel/experimental/_utils/protocols.py +132 -0
  1800. cuda/cccl/parallel/experimental/algorithms/__init__.py +28 -0
  1801. cuda/cccl/parallel/experimental/algorithms/_merge_sort.py +172 -0
  1802. cuda/cccl/parallel/experimental/algorithms/_radix_sort.py +244 -0
  1803. cuda/cccl/parallel/experimental/algorithms/_reduce.py +136 -0
  1804. cuda/cccl/parallel/experimental/algorithms/_scan.py +179 -0
  1805. cuda/cccl/parallel/experimental/algorithms/_segmented_reduce.py +183 -0
  1806. cuda/cccl/parallel/experimental/algorithms/_transform.py +213 -0
  1807. cuda/cccl/parallel/experimental/algorithms/_unique_by_key.py +179 -0
  1808. cuda/cccl/parallel/experimental/cccl/.gitkeep +0 -0
  1809. cuda/cccl/parallel/experimental/cccl/libcccl.c.parallel.so +0 -0
  1810. cuda/cccl/parallel/experimental/iterators/__init__.py +157 -0
  1811. cuda/cccl/parallel/experimental/iterators/_iterators.py +650 -0
  1812. cuda/cccl/parallel/experimental/numba_utils.py +6 -0
  1813. cuda/cccl/parallel/experimental/struct.py +150 -0
  1814. cuda/cccl/parallel/experimental/typing.py +27 -0
  1815. cuda/cccl/py.typed +0 -0
  1816. cuda_cccl-0.1.3.1.0.dev1486.dist-info/METADATA +29 -0
  1817. cuda_cccl-0.1.3.1.0.dev1486.dist-info/RECORD +1819 -0
  1818. cuda_cccl-0.1.3.1.0.dev1486.dist-info/WHEEL +6 -0
  1819. cuda_cccl-0.1.3.1.0.dev1486.dist-info/licenses/LICENSE +1 -0
@@ -0,0 +1,2600 @@
1
+ /******************************************************************************
2
+ * Copyright (c) 2011, Duane Merrill. All rights reserved.
3
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ * * Redistributions of source code must retain the above copyright
8
+ * notice, this list of conditions and the following disclaimer.
9
+ * * Redistributions in binary form must reproduce the above copyright
10
+ * notice, this list of conditions and the following disclaimer in the
11
+ * documentation and/or other materials provided with the distribution.
12
+ * * Neither the name of the NVIDIA CORPORATION nor the
13
+ * names of its contributors may be used to endorse or promote products
14
+ * derived from this software without specific prior written permission.
15
+ *
16
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
+ *
27
+ ******************************************************************************/
28
+
29
+ //! @file
30
+ //! The cub::BlockScan class provides :ref:`collective <collective-primitives>` methods for computing a parallel prefix
31
+ //! sum/scan of items partitioned across a CUDA thread block.
32
+
33
+ #pragma once
34
+
35
+ #include <cub/config.cuh>
36
+
37
+ #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
38
+ # pragma GCC system_header
39
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
40
+ # pragma clang system_header
41
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
42
+ # pragma system_header
43
+ #endif // no system header
44
+
45
+ #include <cub/block/specializations/block_scan_raking.cuh>
46
+ #include <cub/block/specializations/block_scan_warp_scans.cuh>
47
+ #include <cub/util_ptx.cuh>
48
+ #include <cub/util_type.cuh>
49
+
50
+ #include <cuda/std/type_traits>
51
+
52
+ CUB_NAMESPACE_BEGIN
53
+
54
+ /******************************************************************************
55
+ * Algorithmic variants
56
+ ******************************************************************************/
57
+
58
+ //! @brief BlockScanAlgorithm enumerates alternative algorithms for cub::BlockScan to compute a
59
+ //! parallel prefix scan across a CUDA thread block.
60
+ enum BlockScanAlgorithm
61
+ {
62
+
63
+ //! @rst
64
+ //! Overview
65
+ //! ++++++++++++++++++++++++++
66
+ //!
67
+ //! An efficient "raking reduce-then-scan" prefix scan algorithm. Execution is comprised of five phases:
68
+ //!
69
+ //! #. Upsweep sequential reduction in registers (if threads contribute more than one input each).
70
+ //! Each thread then places the partial reduction of its item(s) into shared memory.
71
+ //! #. Upsweep sequential reduction in shared memory.
72
+ //! Threads within a single warp rake across segments of shared partial reductions.
73
+ //! #. A warp-synchronous Kogge-Stone style exclusive scan within the raking warp.
74
+ //! #. Downsweep sequential exclusive scan in shared memory.
75
+ //! Threads within a single warp rake across segments of shared partial reductions,
76
+ //! seeded with the warp-scan output.
77
+ //! #. Downsweep sequential scan in registers (if threads contribute more than one input),
78
+ //! seeded with the raking scan output.
79
+ //!
80
+ //! Performance Considerations
81
+ //! ++++++++++++++++++++++++++
82
+ //!
83
+ //! - Although this variant may suffer longer turnaround latencies when the
84
+ //! GPU is under-occupied, it can often provide higher overall throughput
85
+ //! across the GPU when suitably occupied.
86
+ //!
87
+ //! @endrst
88
+ BLOCK_SCAN_RAKING,
89
+
90
+ //! @rst
91
+ //! Overview
92
+ //! ++++++++++++++++++++++++++
93
+ //!
94
+ //! Similar to cub::BLOCK_SCAN_RAKING, but with fewer shared memory reads at the expense of higher
95
+ //! register pressure. Raking threads preserve their "upsweep" segment of values in registers while performing
96
+ //! warp-synchronous scan, allowing the "downsweep" not to re-read them from shared memory.
97
+ //!
98
+ //! @endrst
99
+ BLOCK_SCAN_RAKING_MEMOIZE,
100
+
101
+ //! @rst
102
+ //! Overview
103
+ //! ++++++++++++++++++++++++++
104
+ //!
105
+ //! A quick "tiled warpscans" prefix scan algorithm. Execution is comprised of four phases:
106
+ //! #. Upsweep sequential reduction in registers (if threads contribute more than one input each).
107
+ //! Each thread then places the partial reduction of its item(s) into shared memory.
108
+ //! #. Compute a shallow, but inefficient warp-synchronous Kogge-Stone style scan within each warp.
109
+ //! #. A propagation phase where the warp scan outputs in each warp are updated with the aggregate
110
+ //! from each preceding warp.
111
+ //! #. Downsweep sequential scan in registers (if threads contribute more than one input),
112
+ //! seeded with the raking scan output.
113
+ //!
114
+ //! Performance Considerations
115
+ //! ++++++++++++++++++++++++++
116
+ //!
117
+ //! - Although this variant may suffer lower overall throughput across the
118
+ //! GPU due to a heavy reliance on inefficient warpscans, it can
119
+ //! often provide lower turnaround latencies when the GPU is under-occupied.
120
+ //!
121
+ //! @endrst
122
+ BLOCK_SCAN_WARP_SCANS,
123
+ };
124
+
125
+ //! @rst
126
+ //! The BlockScan class provides :ref:`collective <collective-primitives>` methods for computing a parallel prefix
127
+ //! sum/scan of items partitioned across a CUDA thread block.
128
+ //!
129
+ //! Overview
130
+ //! +++++++++++++++++++++++++++++++++++++++++++++
131
+ //!
132
+ //! - Given a list of input elements and a binary reduction operator, a
133
+ //! `prefix scan <http://en.wikipedia.org/wiki/Prefix_sum>`_ produces an output list where each element is computed
134
+ //! to be the reduction of the elements occurring earlier in the input list. *Prefix sum* connotes a prefix scan with
135
+ //! the addition operator. The term *inclusive* indicates that the *i*\ :sup:`th` output reduction incorporates
136
+ //! the *i*\ :sup:`th` input. The term *exclusive* indicates the *i*\ :sup:`th` input is not incorporated into
137
+ //! the *i*\ :sup:`th` output reduction.
138
+ //! - @rowmajor
139
+ //! - BlockScan can be optionally specialized by algorithm to accommodate different workload profiles:
140
+ //!
141
+ //! #. :cpp:enumerator:`cub::BLOCK_SCAN_RAKING`:
142
+ //! An efficient (high throughput) "raking reduce-then-scan" prefix scan algorithm.
143
+ //! #. :cpp:enumerator:`cub::BLOCK_SCAN_RAKING_MEMOIZE`:
144
+ //! Similar to cub::BLOCK_SCAN_RAKING, but having higher throughput at the expense of additional
145
+ //! register pressure for intermediate storage.
146
+ //! #. :cpp:enumerator:`cub::BLOCK_SCAN_WARP_SCANS`:
147
+ //! A quick (low latency) "tiled warpscans" prefix scan algorithm.
148
+ //!
149
+ //! Performance Considerations
150
+ //! +++++++++++++++++++++++++++++++++++++++++++++
151
+ //!
152
+ //! - @granularity
153
+ //! - Uses special instructions when applicable (e.g., warp ``SHFL``)
154
+ //! - Uses synchronization-free communication between warp lanes when applicable
155
+ //! - Invokes a minimal number of block-wide synchronization barriers (only
156
+ //! one or two depending on algorithm selection)
157
+ //! - Incurs zero bank conflicts for most types
158
+ //! - Computation is slightly more efficient (i.e., having lower instruction overhead) for:
159
+ //!
160
+ //! - Prefix sum variants (vs. generic scan)
161
+ //! - @blocksize
162
+ //!
163
+ //! - See cub::BlockScanAlgorithm for performance details regarding algorithmic alternatives
164
+ //!
165
+ //! A Simple Example
166
+ //! +++++++++++++++++++++++++++++++++++++++++++++
167
+ //!
168
+ //! @blockcollective{BlockScan}
169
+ //!
170
+ //! The code snippet below illustrates an exclusive prefix sum of 512 integer items that
171
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128 threads
172
+ //! where each thread owns 4 consecutive items.
173
+ //!
174
+ //! .. code-block:: c++
175
+ //!
176
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
177
+ //!
178
+ //! __global__ void ExampleKernel(...)
179
+ //! {
180
+ //! // Specialize BlockScan for a 1D block of 128 threads of type int
181
+ //! using BlockScan = cub::BlockScan<int, 128>;
182
+ //!
183
+ //! // Allocate shared memory for BlockScan
184
+ //! __shared__ typename BlockScan::TempStorage temp_storage;
185
+ //!
186
+ //! // Obtain a segment of consecutive items that are blocked across threads
187
+ //! int thread_data[4];
188
+ //! ...
189
+ //!
190
+ //! // Collectively compute the block-wide exclusive prefix sum
191
+ //! BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data);
192
+ //!
193
+ //! Suppose the set of input ``thread_data`` across the block of threads is
194
+ //! ``{[1,1,1,1], [1,1,1,1], ..., [1,1,1,1]}``.
195
+ //! The corresponding output ``thread_data`` in those threads will be
196
+ //! ``{[0,1,2,3], [4,5,6,7], ..., [508,509,510,511]}``.
197
+ //!
198
+ //! Re-using dynamically allocated shared memory
199
+ //! +++++++++++++++++++++++++++++++++++++++++++++
200
+ //!
201
+ //! The ``block/example_block_reduce_dyn_smem.cu`` example illustrates usage of dynamically allocated shared memory with
202
+ //! BlockReduce and how to re-purpose the same memory region.
203
+ //! This example can be easily adapted to the storage required by BlockScan.
204
+ //!
205
+ //! @endrst
206
+ //!
207
+ //! @tparam T
208
+ //! Data type being scanned
209
+ //!
210
+ //! @tparam BLOCK_DIM_X
211
+ //! The thread block length in threads along the X dimension
212
+ //!
213
+ //! @tparam ALGORITHM
214
+ //! **[optional]** cub::BlockScanAlgorithm enumerator specifying the underlying algorithm to use
215
+ //! (default: cub::BLOCK_SCAN_RAKING)
216
+ //!
217
+ //! @tparam BLOCK_DIM_Y
218
+ //! **[optional]** The thread block length in threads along the Y dimension
219
+ //! (default: 1)
220
+ //!
221
+ //! @tparam BLOCK_DIM_Z
222
+ //! **[optional]** The thread block length in threads along the Z dimension (default: 1)
223
+ //!
224
+ template <typename T,
225
+ int BLOCK_DIM_X,
226
+ BlockScanAlgorithm ALGORITHM = BLOCK_SCAN_RAKING,
227
+ int BLOCK_DIM_Y = 1,
228
+ int BLOCK_DIM_Z = 1>
229
+ class BlockScan
230
+ {
231
+ private:
232
+ /// Constants
233
+ enum
234
+ {
235
+ /// The thread block size in threads
236
+ BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
237
+ };
238
+
239
+ /**
240
+ * Ensure the template parameterization meets the requirements of the
241
+ * specified algorithm. Currently, the BLOCK_SCAN_WARP_SCANS policy
242
+ * cannot be used with thread block sizes not a multiple of the
243
+ * architectural warp size.
244
+ */
245
+ static constexpr BlockScanAlgorithm SAFE_ALGORITHM =
246
+ ((ALGORITHM == BLOCK_SCAN_WARP_SCANS) && (BLOCK_THREADS % detail::warp_threads != 0))
247
+ ? BLOCK_SCAN_RAKING
248
+ : ALGORITHM;
249
+
250
+ using WarpScans = detail::BlockScanWarpScans<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z>;
251
+ using Raking =
252
+ detail::BlockScanRaking<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, (SAFE_ALGORITHM == BLOCK_SCAN_RAKING_MEMOIZE)>;
253
+
254
+ /// Define the delegate type for the desired algorithm
255
+ using InternalBlockScan = ::cuda::std::_If<SAFE_ALGORITHM == BLOCK_SCAN_WARP_SCANS, WarpScans, Raking>;
256
+
257
+ /// Shared memory storage layout type for BlockScan
258
+ using _TempStorage = typename InternalBlockScan::TempStorage;
259
+
260
+ /// Shared storage reference
261
+ _TempStorage& temp_storage;
262
+
263
+ /// Linear thread-id
264
+ unsigned int linear_tid;
265
+
266
+ /// Internal storage allocator
267
+ _CCCL_DEVICE _CCCL_FORCEINLINE _TempStorage& PrivateStorage()
268
+ {
269
+ __shared__ _TempStorage private_storage;
270
+ return private_storage;
271
+ }
272
+
273
+ public:
274
+ /// @smemstorage{BlockScan}
275
+ struct TempStorage : Uninitialized<_TempStorage>
276
+ {};
277
+
278
+ //! @name Collective constructors
279
+ //! @{
280
+
281
+ //! @brief Collective constructor using a private static allocation of shared memory as temporary storage.
282
+ _CCCL_DEVICE _CCCL_FORCEINLINE BlockScan()
283
+ : temp_storage(PrivateStorage())
284
+ , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
285
+ {}
286
+
287
+ /**
288
+ * @brief Collective constructor using the specified memory allocation as temporary storage.
289
+ *
290
+ * @param[in] temp_storage
291
+ * Reference to memory allocation having layout type TempStorage
292
+ */
293
+ _CCCL_DEVICE _CCCL_FORCEINLINE BlockScan(TempStorage& temp_storage)
294
+ : temp_storage(temp_storage.Alias())
295
+ , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
296
+ {}
297
+
298
+ //! @} end member group
299
+ //! @name Exclusive prefix sum operations
300
+ //! @{
301
+
302
+ //! @rst
303
+ //! Computes an exclusive block-wide prefix scan using addition (+) as the scan operator.
304
+ //! Each thread contributes one input element. The value of 0 is applied as the initial value, and is assigned
305
+ //! to ``output`` in *thread*\ :sub:`0`.
306
+ //!
307
+ //! - @identityzero
308
+ //! - @rowmajor
309
+ //! - @smemreuse
310
+ //!
311
+ //! Snippet
312
+ //! +++++++
313
+ //!
314
+ //! The code snippet below illustrates an exclusive prefix sum of 128 integer items that
315
+ //! are partitioned across 128 threads.
316
+ //!
317
+ //! .. code-block:: c++
318
+ //!
319
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
320
+ //!
321
+ //! __global__ void ExampleKernel(...)
322
+ //! {
323
+ //! // Specialize BlockScan for a 1D block of 128 threads of type int
324
+ //! using BlockScan = cub::BlockScan<int, 128>;
325
+ //!
326
+ //! // Allocate shared memory for BlockScan
327
+ //! __shared__ typename BlockScan::TempStorage temp_storage;
328
+ //!
329
+ //! // Obtain input item for each thread
330
+ //! int thread_data;
331
+ //! ...
332
+ //!
333
+ //! // Collectively compute the block-wide exclusive prefix sum
334
+ //! BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data);
335
+ //!
336
+ //! Suppose the set of input ``thread_data`` across the block of threads is ``1, 1, ..., 1``.
337
+ //! The corresponding output ``thread_data`` in those threads will be ``0, 1, ..., 127``.
338
+ //!
339
+ //! @endrst
340
+ //!
341
+ //! @param[in] input
342
+ //! Calling thread's input item
343
+ //!
344
+ //! @param[out] output
345
+ //! Calling thread's output item (may be aliased to `input`)
346
+ _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveSum(T input, T& output)
347
+ {
348
+ T initial_value{};
349
+
350
+ ExclusiveScan(input, output, initial_value, ::cuda::std::plus<>{});
351
+ }
352
+
353
+ //! @rst
354
+ //! Computes an exclusive block-wide prefix scan using addition (+) as the scan operator.
355
+ //! Each thread contributes one input element.
356
+ //! The value of 0 is applied as the initial value, and is assigned to ``output`` in *thread*\ :sub:`0`.
357
+ //! Also provides every thread with the block-wide ``block_aggregate`` of all inputs.
358
+ //!
359
+ //! - @identityzero
360
+ //! - @rowmajor
361
+ //! - @smemreuse
362
+ //!
363
+ //! Snippet
364
+ //! +++++++
365
+ //!
366
+ //! The code snippet below illustrates an exclusive prefix sum of 128 integer items that
367
+ //! are partitioned across 128 threads.
368
+ //!
369
+ //! .. code-block:: c++
370
+ //!
371
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
372
+ //!
373
+ //! __global__ void ExampleKernel(...)
374
+ //! {
375
+ //! // Specialize BlockScan for a 1D block of 128 threads of type int
376
+ //! using BlockScan = cub::BlockScan<int, 128>;
377
+ //!
378
+ //! // Allocate shared memory for BlockScan
379
+ //! __shared__ typename BlockScan::TempStorage temp_storage;
380
+ //!
381
+ //! // Obtain input item for each thread
382
+ //! int thread_data;
383
+ //! ...
384
+ //!
385
+ //! // Collectively compute the block-wide exclusive prefix sum
386
+ //! int block_aggregate;
387
+ //! BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data, block_aggregate);
388
+ //!
389
+ //! Suppose the set of input ``thread_data`` across the block of threads is ``1, 1, ..., 1``.
390
+ //! The corresponding output ``thread_data`` in those threads will be ``0, 1, ..., 127``.
391
+ //! Furthermore the value ``128`` will be stored in ``block_aggregate`` for all threads.
392
+ //!
393
+ //! @endrst
394
+ //!
395
+ //! @param[in] input
396
+ //! Calling thread's input item
397
+ //!
398
+ //! @param[out] output
399
+ //! Calling thread's output item (may be aliased to `input`)
400
+ //!
401
+ //! @param[out] block_aggregate
402
+ //! block-wide aggregate reduction of input items
403
+ _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveSum(T input, T& output, T& block_aggregate)
404
+ {
405
+ T initial_value{};
406
+
407
+ ExclusiveScan(input, output, initial_value, ::cuda::std::plus<>{}, block_aggregate);
408
+ }
409
+
410
+ //! @rst
411
+ //! Computes an exclusive block-wide prefix scan using addition (+) as the scan operator.
412
+ //! Each thread contributes one input element. Instead of using 0 as the block-wide prefix, the call-back functor
413
+ //! ``block_prefix_callback_op`` is invoked by the first warp in the block, and the value returned by
414
+ //! *lane*\ :sub:`0` in that warp is used as the "seed" value that logically prefixes the thread block's
415
+ //! scan inputs. The block-wide aggregate of all inputs is passed to the functor as its ``block_aggregate`` argument.
416
+ //!
417
+ //! - @identityzero
418
+ //! - The ``block_prefix_callback_op`` functor must implement a member function
419
+ //! ``T operator()(T block_aggregate)``. The functor's input parameter ``block_aggregate`` is the same value
420
+ //! also returned by the scan operation. The functor will be invoked by the first warp of threads in the block,
421
+ //! however only the return value from *lane*\ :sub:`0` is applied as the block-wide prefix. Can be stateful.
422
+ //! - @rowmajor
423
+ //! - @smemreuse
424
+ //!
425
+ //! Snippet
426
+ //! +++++++
427
+ //!
428
+ //! The code snippet below illustrates a single thread block that progressively
429
+ //! computes an exclusive prefix sum over multiple "tiles" of input using a
430
+ //! prefix functor to maintain a running total between block-wide scans. Each tile consists
431
+ //! of 128 integer items that are partitioned across 128 threads.
432
+ //!
433
+ //! .. code-block:: c++
434
+ //!
435
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
436
+ //!
437
+ //! // A stateful callback functor that maintains a running prefix to be applied
438
+ //! // during consecutive scan operations.
439
+ //! struct BlockPrefixCallbackOp
440
+ //! {
441
+ //! // Running prefix
442
+ //! int running_total;
443
+ //!
444
+ //! // Constructor
445
+ //! __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {}
446
+ //!
447
+ //! // Callback operator to be entered by the first warp of threads in the block.
448
+ //! // Thread-0 is responsible for returning a value for seeding the block-wide scan.
449
+ //! __device__ int operator()(int block_aggregate)
450
+ //! {
451
+ //! int old_prefix = running_total;
452
+ //! running_total += block_aggregate;
453
+ //! return old_prefix;
454
+ //! }
455
+ //! };
456
+ //!
457
+ //! __global__ void ExampleKernel(int *d_data, int num_items, ...)
458
+ //! {
459
+ //! // Specialize BlockScan for a 1D block of 128 threads
460
+ //! using BlockScan = cub::BlockScan<int, 128>;
461
+ //!
462
+ //! // Allocate shared memory for BlockScan
463
+ //! __shared__ typename BlockScan::TempStorage temp_storage;
464
+ //!
465
+ //! // Initialize running total
466
+ //! BlockPrefixCallbackOp prefix_op(0);
467
+ //!
468
+ //! // Have the block iterate over segments of items
469
+ //! for (int block_offset = 0; block_offset < num_items; block_offset += 128)
470
+ //! {
471
+ //! // Load a segment of consecutive items that are blocked across threads
472
+ //! int thread_data = d_data[block_offset + threadIdx.x];
473
+ //!
474
+ //! // Collectively compute the block-wide exclusive prefix sum
475
+ //! BlockScan(temp_storage).ExclusiveSum(
476
+ //! thread_data, thread_data, prefix_op);
477
+ //! __syncthreads();
478
+ //!
479
+ //! // Store scanned items to output segment
480
+ //! d_data[block_offset + threadIdx.x] = thread_data;
481
+ //! }
482
+ //!
483
+ //! Suppose the input ``d_data`` is ``1, 1, 1, 1, 1, 1, 1, 1, ...``.
484
+ //! The corresponding output for the first segment will be ``0, 1, ..., 127``.
485
+ //! The output for the second segment will be ``128, 129, ..., 255``.
486
+ //!
487
+ //! @endrst
488
+ //!
489
+ //! @tparam BlockPrefixCallbackOp
490
+ //! **[inferred]** Call-back functor type having member `T operator()(T block_aggregate)`
491
+ //!
492
+ //! @param[in] input
493
+ //! Calling thread's input item
494
+ //!
495
+ //! @param[out] output
496
+ //! Calling thread's output item (may be aliased to `input`)
497
+ //!
498
+ //! @param[in,out] block_prefix_callback_op
499
+ //! @rst
500
+ //! *warp*\ :sub:`0` only call-back functor for specifying a block-wide prefix to be applied to
501
+ //! the logical input sequence.
502
+ //! @endrst
503
+ template <typename BlockPrefixCallbackOp>
504
+ _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveSum(T input, T& output, BlockPrefixCallbackOp& block_prefix_callback_op)
505
+ {
506
+ ExclusiveScan(input, output, ::cuda::std::plus<>{}, block_prefix_callback_op); // forward to the generic scan with addition as the operator; the callback supplies the block-wide prefix
507
+ }
508
+
509
+ //! @} end member group
510
+ //! @name Exclusive prefix sum operations (multiple data per thread)
511
+ //! @{
512
+
513
+ //! @rst
514
+ //! Computes an exclusive block-wide prefix scan using addition (+) as the scan operator.
515
+ //! Each thread contributes an array of consecutive input elements.
516
+ //! The value of 0 is applied as the initial value, and is assigned to ``output[0]`` in *thread*\ :sub:`0`.
517
+ //!
518
+ //! - @identityzero
519
+ //! - @blocked
520
+ //! - @granularity
521
+ //! - @smemreuse
522
+ //!
523
+ //! Snippet
524
+ //! +++++++
525
+ //!
526
+ //! The code snippet below illustrates an exclusive prefix sum of 512 integer items that
527
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128 threads
528
+ //! where each thread owns 4 consecutive items.
529
+ //!
530
+ //! .. code-block:: c++
531
+ //!
532
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
533
+ //!
534
+ //! __global__ void ExampleKernel(...)
535
+ //! {
536
+ //! // Specialize BlockScan for a 1D block of 128 threads of type int
537
+ //! using BlockScan = cub::BlockScan<int, 128>;
538
+ //!
539
+ //! // Allocate shared memory for BlockScan
540
+ //! __shared__ typename BlockScan::TempStorage temp_storage;
541
+ //!
542
+ //! // Obtain a segment of consecutive items that are blocked across threads
543
+ //! int thread_data[4];
544
+ //! ...
545
+ //!
546
+ //! // Collectively compute the block-wide exclusive prefix sum
547
+ //! BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data);
548
+ //!
549
+ //! Suppose the set of input ``thread_data`` across the block of threads is
550
+ //! ``{ [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }``.
551
+ //! The corresponding output ``thread_data`` in those threads will be
552
+ //! ``{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }``.
553
+ //!
554
+ //! @endrst
555
+ //!
556
+ //! @tparam ITEMS_PER_THREAD
557
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
558
+ //!
559
+ //! @param[in] input
560
+ //! Calling thread's input items
561
+ //!
562
+ //! @param[out] output
563
+ //! Calling thread's output items (may be aliased to `input`)
564
+ template <int ITEMS_PER_THREAD>
565
+ _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveSum(T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD])
566
+ {
567
+ T initial_value{}; // value-initialized T is the seed of the sum (documented above as 0)
568
+ // Forward to the multi-item ExclusiveScan overload with addition as the scan operator
569
+ ExclusiveScan(input, output, initial_value, ::cuda::std::plus<>{});
570
+ }
571
+
572
+ //! @rst
573
+ //! Computes an exclusive block-wide prefix scan using addition (+) as the scan operator.
574
+ //! Each thread contributes an array of consecutive input elements.
575
+ //! The value of 0 is applied as the initial value, and is assigned to ``output[0]`` in *thread*\ :sub:`0`.
576
+ //! Also provides every thread with the block-wide ``block_aggregate`` of all inputs.
577
+ //!
578
+ //! - @identityzero
579
+ //! - @blocked
580
+ //! - @granularity
581
+ //! - @smemreuse
582
+ //!
583
+ //! Snippet
584
+ //! +++++++
585
+ //!
586
+ //! The code snippet below illustrates an exclusive prefix sum of 512 integer items that are partitioned in
587
+ //! a :ref:`blocked arrangement <flexible-data-arrangement>` across 128 threads where each thread owns
588
+ //! 4 consecutive items.
589
+ //!
590
+ //! .. code-block:: c++
591
+ //!
592
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
593
+ //!
594
+ //! __global__ void ExampleKernel(...)
595
+ //! {
596
+ //! // Specialize BlockScan for a 1D block of 128 threads of type int
597
+ //! using BlockScan = cub::BlockScan<int, 128>;
598
+ //!
599
+ //! // Allocate shared memory for BlockScan
600
+ //! __shared__ typename BlockScan::TempStorage temp_storage;
601
+ //!
602
+ //! // Obtain a segment of consecutive items that are blocked across threads
603
+ //! int thread_data[4];
604
+ //! ...
605
+ //!
606
+ //! // Collectively compute the block-wide exclusive prefix sum
607
+ //! int block_aggregate;
608
+ //! BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data, block_aggregate);
609
+ //!
610
+ //! Suppose the set of input ``thread_data`` across the block of threads is
611
+ //! ``{ [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }``.
612
+ //! The corresponding output ``thread_data`` in those threads will be
613
+ //! ``{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }``.
614
+ //! Furthermore the value ``512`` will be stored in ``block_aggregate`` for all threads.
615
+ //!
616
+ //! @endrst
617
+ //!
618
+ //! @tparam ITEMS_PER_THREAD
619
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
620
+ //!
621
+ //! @param[in] input
622
+ //! Calling thread's input items
623
+ //!
624
+ //! @param[out] output
625
+ //! Calling thread's output items (may be aliased to `input`)
626
+ //!
627
+ //! @param[out] block_aggregate
628
+ //! block-wide aggregate reduction of input items
629
+ template <int ITEMS_PER_THREAD>
630
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
631
+ ExclusiveSum(T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], T& block_aggregate)
632
+ {
633
+ // Seed the sum with a value-initialized T (documented above as 0); no per-thread reduction happens here
634
+ T initial_value{};
635
+ // Forward to the multi-item ExclusiveScan overload with addition; it also writes block_aggregate
636
+ ExclusiveScan(input, output, initial_value, ::cuda::std::plus<>{}, block_aggregate);
637
+ }
638
+
639
+ //! @rst
640
+ //! Computes an exclusive block-wide prefix scan using addition (+) as the scan operator.
641
+ //! Each thread contributes an array of consecutive input elements.
642
+ //! Instead of using 0 as the block-wide prefix, the call-back functor ``block_prefix_callback_op`` is invoked by
643
+ //! the first warp in the block, and the value returned by *lane*\ :sub:`0` in that warp is used as the "seed"
644
+ //! value that logically prefixes the thread block's scan inputs.
645
+ //! Also provides every thread with the block-wide ``block_aggregate`` of all inputs.
646
+ //!
647
+ //! - @identityzero
648
+ //! - The ``block_prefix_callback_op`` functor must implement a member function ``T operator()(T block_aggregate)``.
649
+ //! The functor's input parameter ``block_aggregate`` is the same value also returned
650
+ //! by the scan operation. The functor will be invoked by the first warp of threads in
651
+ //! the block, however only the return value from *lane*\ :sub:`0` is applied as the block-wide prefix.
652
+ //! Can be stateful.
653
+ //! - @blocked
654
+ //! - @granularity
655
+ //! - @smemreuse
656
+ //!
657
+ //!
658
+ //! Snippet
659
+ //! +++++++
660
+ //!
661
+ //! The code snippet below illustrates a single thread block that progressively
662
+ //! computes an exclusive prefix sum over multiple "tiles" of input using a
663
+ //! prefix functor to maintain a running total between block-wide scans. Each tile consists
664
+ //! of 512 integer items that are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>`
665
+ //! across 128 threads where each thread owns 4 consecutive items.
666
+ //!
667
+ //! .. code-block:: c++
668
+ //!
669
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
670
+ //!
671
+ //! // A stateful callback functor that maintains a running prefix to be applied
672
+ //! // during consecutive scan operations.
673
+ //! struct BlockPrefixCallbackOp
674
+ //! {
675
+ //! // Running prefix
676
+ //! int running_total;
677
+ //!
678
+ //! // Constructor
679
+ //! __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {}
679
+ //!
680
+ //! // Callback operator to be entered by the first warp of threads in the block.
681
+ //! // Thread-0 is responsible for returning a value for seeding the block-wide scan.
682
+ //! __device__ int operator()(int block_aggregate)
684
+ //! {
685
+ //! int old_prefix = running_total;
686
+ //! running_total += block_aggregate;
687
+ //! return old_prefix;
688
+ //! }
689
+ //! };
690
+ //!
691
+ //! __global__ void ExampleKernel(int *d_data, int num_items, ...)
692
+ //! {
693
+ //! // Specialize BlockLoad, BlockStore, and BlockScan for a 1D block of 128 threads, 4 ints per thread
694
+ //! using BlockLoad = cub::BlockLoad<int*, 128, 4, BLOCK_LOAD_TRANSPOSE>;
695
+ //! using BlockStore = cub::BlockStore<int, 128, 4, BLOCK_STORE_TRANSPOSE>;
696
+ //! using BlockScan = cub::BlockScan<int, 128>;
697
+ //!
698
+ //! // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan
699
+ //! __shared__ union {
700
+ //! typename BlockLoad::TempStorage load;
701
+ //! typename BlockScan::TempStorage scan;
702
+ //! typename BlockStore::TempStorage store;
703
+ //! } temp_storage;
704
+ //!
705
+ //! // Initialize running total
706
+ //! BlockPrefixCallbackOp prefix_op(0);
707
+ //!
708
+ //! // Have the block iterate over segments of items
709
+ //! for (int block_offset = 0; block_offset < num_items; block_offset += 128 * 4)
710
+ //! {
711
+ //! // Load a segment of consecutive items that are blocked across threads
712
+ //! int thread_data[4];
713
+ //! BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data);
714
+ //! __syncthreads();
715
+ //!
716
+ //! // Collectively compute the block-wide exclusive prefix sum
717
+ //! int block_aggregate;
718
+ //! BlockScan(temp_storage.scan).ExclusiveSum(
719
+ //! thread_data, thread_data, prefix_op);
720
+ //! __syncthreads();
721
+ //!
722
+ //! // Store scanned items to output segment
723
+ //! BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data);
724
+ //! __syncthreads();
725
+ //! }
726
+ //!
727
+ //! Suppose the input ``d_data`` is ``1, 1, 1, 1, 1, 1, 1, 1, ...``.
728
+ //! The corresponding output for the first segment will be ``0, 1, 2, 3, ..., 510, 511``.
729
+ //! The output for the second segment will be ``512, 513, 514, 515, ..., 1022, 1023``.
730
+ //!
731
+ //! @endrst
732
+ //!
733
+ //! @tparam ITEMS_PER_THREAD
734
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
735
+ //!
736
+ //! @tparam BlockPrefixCallbackOp
737
+ //! **[inferred]** Call-back functor type having member
738
+ //! `T operator()(T block_aggregate)`
739
+ //!
740
+ //! @param[in] input
741
+ //! Calling thread's input items
742
+ //!
743
+ //! @param[out] output
744
+ //! Calling thread's output items (may be aliased to `input`)
745
+ //!
746
+ //! @param[in,out] block_prefix_callback_op
747
+ //! @rst
748
+ //! *warp*\ :sub:`0` only call-back functor for specifying a block-wide prefix to be applied to
749
+ //! the logical input sequence.
750
+ //! @endrst
751
+ template <int ITEMS_PER_THREAD, typename BlockPrefixCallbackOp>
752
+ _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveSum(
753
+ T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], BlockPrefixCallbackOp& block_prefix_callback_op)
754
+ {
755
+ ExclusiveScan(input, output, ::cuda::std::plus<>{}, block_prefix_callback_op); // forward to the multi-item callback overload of ExclusiveScan with addition as the operator
756
+ }
757
+
758
+ //! @} end member group // Exclusive prefix sums
759
+ //! @name Exclusive prefix scan operations
760
+ //! @{
761
+
762
+ //! @rst
763
+ //! Computes an exclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
764
+ //! Each thread contributes one input element.
765
+ //!
766
+ //! - Supports non-commutative scan operators.
767
+ //! - @rowmajor
768
+ //! - @smemreuse
769
+ //!
770
+ //! Snippet
771
+ //! +++++++
772
+ //!
773
+ //! The code snippet below illustrates an exclusive prefix max scan of 128 integer items that
774
+ //! are partitioned across 128 threads.
775
+ //!
776
+ //! .. code-block:: c++
777
+ //!
778
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
779
+ //!
780
+ //! __global__ void ExampleKernel(...)
781
+ //! {
782
+ //! // Specialize BlockScan for a 1D block of 128 threads of type int
783
+ //! using BlockScan = cub::BlockScan<int, 128>;
784
+ //!
785
+ //! // Allocate shared memory for BlockScan
786
+ //! __shared__ typename BlockScan::TempStorage temp_storage;
787
+ //!
788
+ //! // Obtain input item for each thread
789
+ //! int thread_data;
790
+ //! ...
791
+ //!
792
+ //! // Collectively compute the block-wide exclusive prefix max scan
793
+ //! BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cuda::maximum<>{});
794
+ //!
795
+ //! Suppose the set of input ``thread_data`` across the block of threads is ``0, -1, 2, -3, ..., 126, -127``.
796
+ //! The corresponding output ``thread_data`` in those threads will be ``INT_MIN, 0, 0, 2, ..., 124, 126``.
797
+ //!
798
+ //! @endrst
799
+ //!
800
+ //! @tparam ScanOp
801
+ //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
802
+ //!
803
+ //! @param[in] input
804
+ //! Calling thread's input item
805
+ //!
806
+ //! @param[out] output
807
+ //! Calling thread's output item (may be aliased to `input`)
808
+ //!
809
+ //! @param[in] initial_value
810
+ //! @rst
811
+ //! Initial value to seed the exclusive scan (and is assigned to `output[0]` in *thread*\ :sub:`0`)
812
+ //! @endrst
813
+ //!
814
+ //! @param[in] scan_op
815
+ //! Binary scan functor
816
+ template <typename ScanOp>
817
+ _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveScan(T input, T& output, T initial_value, ScanOp scan_op)
818
+ {
819
+ InternalBlockScan(temp_storage).ExclusiveScan(input, output, initial_value, scan_op); // delegate to an InternalBlockScan constructed from temp_storage; all arguments pass through unchanged
820
+ }
821
+
822
+ //! @rst
823
+ //! Computes an exclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
824
+ //! Each thread contributes one input element.
825
+ //! Also provides every thread with the block-wide ``block_aggregate`` of all inputs.
826
+ //!
827
+ //! - Supports non-commutative scan operators.
828
+ //! - @rowmajor
829
+ //! - @smemreuse
830
+ //!
831
+ //! Snippet
832
+ //! +++++++
833
+ //!
834
+ //! The code snippet below illustrates an exclusive prefix max scan of 128 integer items that
835
+ //! are partitioned across 128 threads.
836
+ //!
837
+ //! .. code-block:: c++
838
+ //!
839
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
840
+ //!
841
+ //! __global__ void ExampleKernel(...)
842
+ //! {
843
+ //! // Specialize BlockScan for a 1D block of 128 threads of type int
844
+ //! using BlockScan = cub::BlockScan<int, 128>;
845
+ //!
846
+ //! // Allocate shared memory for BlockScan
847
+ //! __shared__ typename BlockScan::TempStorage temp_storage;
848
+ //!
849
+ //! // Obtain input item for each thread
850
+ //! int thread_data;
851
+ //! ...
852
+ //!
853
+ //! // Collectively compute the block-wide exclusive prefix max scan
854
+ //! int block_aggregate;
855
+ //! BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cuda::maximum<>{},
856
+ //! block_aggregate);
857
+ //!
858
+ //! Suppose the set of input ``thread_data`` across the block of threads is ``0, -1, 2, -3, ..., 126, -127``.
859
+ //! The corresponding output ``thread_data`` in those threads will be ``INT_MIN, 0, 0, 2, ..., 124, 126``.
860
+ //! Furthermore the value ``126`` will be stored in ``block_aggregate`` for all threads.
861
+ //!
862
+ //! .. note::
863
+ //!
864
+ //! ``initial_value`` is not applied to the block-wide aggregate.
865
+ //!
866
+ //! @endrst
867
+ //!
868
+ //! @tparam ScanOp
869
+ //! **[inferred]** Binary scan functor type having member ``T operator()(const T &a, const T &b)``
870
+ //!
871
+ //! @param[in] input
872
+ //! Calling thread's input items
873
+ //!
874
+ //! @param[out] output
875
+ //! Calling thread's output items (may be aliased to ``input``)
876
+ //!
877
+ //! @param[in] initial_value
878
+ //! @rst
879
+ //! Initial value to seed the exclusive scan (and is assigned to ``output[0]`` in *thread*\ :sub:`0`). It is not
880
+ //! taken into account for ``block_aggregate``.
881
+ //!
882
+ //! @endrst
883
+ //!
884
+ //! @param[in] scan_op
885
+ //! Binary scan functor
886
+ //!
887
+ //! @param[out] block_aggregate
888
+ //! block-wide aggregate reduction of input items
889
+ template <typename ScanOp>
890
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
891
+ ExclusiveScan(T input, T& output, T initial_value, ScanOp scan_op, T& block_aggregate)
892
+ {
893
+ InternalBlockScan(temp_storage).ExclusiveScan(input, output, initial_value, scan_op, block_aggregate); // delegate to InternalBlockScan, which also writes block_aggregate (documented above as excluding initial_value)
894
+ }
895
+
896
+ //! @rst
897
+ //! Computes an exclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
898
+ //! Each thread contributes one input element. The call-back functor ``block_prefix_callback_op`` is invoked by
899
+ //! the first warp in the block, and the value returned by *lane*\ :sub:`0` in that warp is used as
900
+ //! the "seed" value that logically prefixes the thread block's scan inputs.
901
+ //! Also provides every thread with the block-wide ``block_aggregate`` of all inputs.
902
+ //!
903
+ //! - The ``block_prefix_callback_op`` functor must implement a member function ``T operator()(T block_aggregate)``.
904
+ //! The functor's input parameter ``block_aggregate`` is the same value also returned by the scan operation.
905
+ //! The functor will be invoked by the first warp of threads in the block, however only the return value from
906
+ //! *lane*\ :sub:`0` is applied as the block-wide prefix. Can be stateful.
907
+ //! - Supports non-commutative scan operators.
908
+ //! - @rowmajor
909
+ //! - @smemreuse
910
+ //!
911
+ //! Snippet
912
+ //! +++++++
913
+ //!
914
+ //! The code snippet below illustrates a single thread block that progressively
915
+ //! computes an exclusive prefix max scan over multiple "tiles" of input using a
916
+ //! prefix functor to maintain a running total between block-wide scans.
917
+ //! Each tile consists of 128 integer items that are partitioned across 128 threads.
918
+ //!
919
+ //! .. code-block:: c++
920
+ //!
921
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
922
+ //!
923
+ //! // A stateful callback functor that maintains a running prefix to be applied
924
+ //! // during consecutive scan operations.
925
+ //! struct BlockPrefixCallbackOp
926
+ //! {
927
+ //! // Running prefix
928
+ //! int running_total;
929
+ //!
930
+ //! // Constructor
931
+ //! __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {}
931
+ //!
932
+ //! // Callback operator to be entered by the first warp of threads in the block.
933
+ //! // Thread-0 is responsible for returning a value for seeding the block-wide scan.
934
+ //! __device__ int operator()(int block_aggregate)
936
+ //! {
937
+ //! int old_prefix = running_total;
938
+ //! running_total = (block_aggregate > old_prefix) ? block_aggregate : old_prefix;
939
+ //! return old_prefix;
940
+ //! }
941
+ //! };
942
+ //!
943
+ //! __global__ void ExampleKernel(int *d_data, int num_items, ...)
944
+ //! {
945
+ //! // Specialize BlockScan for a 1D block of 128 threads
946
+ //! using BlockScan = cub::BlockScan<int, 128>;
947
+ //!
948
+ //! // Allocate shared memory for BlockScan
949
+ //! __shared__ typename BlockScan::TempStorage temp_storage;
950
+ //!
951
+ //! // Initialize running total
952
+ //! BlockPrefixCallbackOp prefix_op(INT_MIN);
953
+ //!
954
+ //! // Have the block iterate over segments of items
955
+ //! for (int block_offset = 0; block_offset < num_items; block_offset += 128)
956
+ //! {
957
+ //! // Load a segment of consecutive items that are blocked across threads
958
+ //! int thread_data = d_data[block_offset + threadIdx.x];
959
+ //!
960
+ //! // Collectively compute the block-wide exclusive prefix max scan
961
+ //! BlockScan(temp_storage).ExclusiveScan(
962
+ //! thread_data, thread_data, cuda::maximum<>{}, prefix_op);
963
+ //! __syncthreads();
964
+ //!
965
+ //! // Store scanned items to output segment
966
+ //! d_data[block_offset + threadIdx.x] = thread_data;
967
+ //! }
968
+ //!
969
+ //! Suppose the input ``d_data`` is ``0, -1, 2, -3, 4, -5, ...``.
970
+ //! The corresponding output for the first segment will be ``INT_MIN, 0, 0, 2, ..., 124, 126``.
971
+ //! The output for the second segment will be ``126, 128, 128, 130, ..., 252, 254``.
972
+ //!
973
+ //! @endrst
974
+ //!
975
+ //! @tparam ScanOp
976
+ //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
977
+ //!
978
+ //! @tparam BlockPrefixCallbackOp
979
+ //! **[inferred]** Call-back functor type having member `T operator()(T block_aggregate)`
980
+ //!
981
+ //! @param[in] input
982
+ //! Calling thread's input item
983
+ //!
984
+ //! @param[out] output
985
+ //! Calling thread's output item (may be aliased to `input`)
986
+ //!
987
+ //! @param[in] scan_op
988
+ //! Binary scan functor
989
+ //!
990
+ //! @param[in,out] block_prefix_callback_op
991
+ //! @rst
992
+ //! *warp*\ :sub:`0` only call-back functor for specifying a block-wide prefix to be applied to
993
+ //! the logical input sequence.
994
+ //! @endrst
995
+ template <typename ScanOp, typename BlockPrefixCallbackOp>
996
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
997
+ ExclusiveScan(T input, T& output, ScanOp scan_op, BlockPrefixCallbackOp& block_prefix_callback_op)
998
+ {
999
+ InternalBlockScan(temp_storage).ExclusiveScan(input, output, scan_op, block_prefix_callback_op); // delegate to InternalBlockScan; this overload takes no initial_value, the callback supplies the prefix
1000
+ }
1001
+
1002
+ //! @} end member group // Exclusive prefix scans
1003
+ //! @name Exclusive prefix scan operations (multiple data per thread)
1004
+ //! @{
1005
+
1006
+ //! @rst
1007
+ //! Computes an exclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
1008
+ //! Each thread contributes an array of consecutive input elements.
1009
+ //!
1010
+ //! - Supports non-commutative scan operators.
1011
+ //! - @blocked
1012
+ //! - @granularity
1013
+ //! - @smemreuse
1014
+ //!
1015
+ //! Snippet
1016
+ //! +++++++
1017
+ //!
1018
+ //! The code snippet below illustrates an exclusive prefix max scan of 512 integer
1019
+ //! items that are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>`
1020
+ //! across 128 threads where each thread owns 4 consecutive items.
1021
+ //!
1022
+ //! .. code-block:: c++
1023
+ //!
1024
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
1025
+ //!
1026
+ //! __global__ void ExampleKernel(...)
1027
+ //! {
1028
+ //! // Specialize BlockScan for a 1D block of 128 threads of type int
1029
+ //! using BlockScan = cub::BlockScan<int, 128>;
1030
+ //!
1031
+ //! // Allocate shared memory for BlockScan
1032
+ //! __shared__ typename BlockScan::TempStorage temp_storage;
1033
+ //!
1034
+ //! // Obtain a segment of consecutive items that are blocked across threads
1035
+ //! int thread_data[4];
1036
+ //! ...
1037
+ //!
1038
+ //! // Collectively compute the block-wide exclusive prefix max scan
1039
+ //! BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cuda::maximum<>{});
1040
+ //!
1041
+ //! Suppose the set of input ``thread_data`` across the block of threads is
1042
+ //! ``{ [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }``.
1043
+ //! The corresponding output ``thread_data`` in those threads will be
1044
+ //! ``{ [INT_MIN,0,0,2], [2,4,4,6], ..., [506,508,508,510] }``.
1045
+ //!
1046
+ //! @endrst
1047
+ //!
1048
+ //! @tparam ITEMS_PER_THREAD
1049
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
1050
+ //!
1051
+ //! @tparam ScanOp
1052
+ //! **[inferred]** Binary scan functor type having member
1053
+ //! `T operator()(const T &a, const T &b)`
1054
+ //!
1055
+ //! @param[in] input
1056
+ //! Calling thread's input items
1057
+ //!
1058
+ //! @param[out] output
1059
+ //! Calling thread's output items (may be aliased to `input`)
1060
+ //!
1061
+ //! @param[in] initial_value
1062
+ //! @rst
1063
+ //! Initial value to seed the exclusive scan (and is assigned to `output[0]` in *thread*\ :sub:`0`)
1064
+ //! @endrst
1065
+ //!
1066
+ //! @param[in] scan_op
1067
+ //! Binary scan functor
1068
+ template <int ITEMS_PER_THREAD, typename ScanOp>
1069
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
1070
+ ExclusiveScan(T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], T initial_value, ScanOp scan_op)
1071
+ {
1072
+ // Step 1: reduce this thread's ITEMS_PER_THREAD consecutive items into one partial using scan_op
1073
+ T thread_prefix = cub::ThreadReduce(input, scan_op);
1074
+
1075
+ // Step 2: exclusive block-wide scan of the per-thread partials (single-item overload), seeded with initial_value; thread_prefix is overwritten with this thread's prefix
1076
+ ExclusiveScan(thread_prefix, thread_prefix, initial_value, scan_op);
1077
+
1078
+ // Step 3: exclusive scan over this thread's own items, seeded with thread_prefix, written to output
1079
+ detail::ThreadScanExclusive(input, output, scan_op, thread_prefix);
1080
+ }
1081
+
1082
+ //! @rst
1083
+ //! Computes an exclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
1084
+ //! Each thread contributes an array of consecutive input elements.
1085
+ //! Also provides every thread with the block-wide ``block_aggregate`` of all inputs.
1086
+ //!
1087
+ //! - Supports non-commutative scan operators.
1088
+ //! - @blocked
1089
+ //! - @granularity
1090
+ //! - @smemreuse
1091
+ //!
1092
+ //! Snippet
1093
+ //! +++++++
1094
+ //!
1095
+ //! The code snippet below illustrates an exclusive prefix max scan of 512 integer items that are partitioned in
1096
+ //! a :ref:`blocked arrangement <flexible-data-arrangement>` across 128 threads where each thread owns
1097
+ //! 4 consecutive items.
1098
+ //!
1099
+ //! .. code-block:: c++
1100
+ //!
1101
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
1102
+ //!
1103
+ //! __global__ void ExampleKernel(...)
1104
+ //! {
1105
+ //! // Specialize BlockScan for a 1D block of 128 threads of type int
1106
+ //! using BlockScan = cub::BlockScan<int, 128>;
1107
+ //!
1108
+ //! // Allocate shared memory for BlockScan
1109
+ //! __shared__ typename BlockScan::TempStorage temp_storage;
1110
+ //!
1111
+ //! // Obtain a segment of consecutive items that are blocked across threads
1112
+ //! int thread_data[4];
1113
+ //! ...
1114
+ //!
1115
+ //! // Collectively compute the block-wide exclusive prefix max scan
1116
+ //! int block_aggregate;
1117
+ //! BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cuda::maximum<>{},
1118
+ //! block_aggregate);
1119
+ //!
1120
+ //! Suppose the set of input ``thread_data`` across the block of threads is
1121
+ //! ``{ [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }``.
1122
+ //! The corresponding output ``thread_data`` in those threads will be
1123
+ //! ``{ [INT_MIN,0,0,2], [2,4,4,6], ..., [506,508,508,510] }``.
1124
+ //! Furthermore the value ``510`` will be stored in ``block_aggregate`` for all threads.
1125
+ //!
1126
+ //! .. note::
1127
+ //!
1128
+ //! ``initial_value`` is not applied to the block-wide aggregate.
1129
+ //!
1130
+ //! @endrst
1131
+ //!
1132
+ //! @tparam ITEMS_PER_THREAD
1133
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
1134
+ //!
1135
+ //! @tparam ScanOp
1136
+ //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
1137
+ //!
1138
+ //! @param[in] input
1139
+ //! Calling thread's input items
1140
+ //!
1141
+ //! @param[out] output
1142
+ //! Calling thread's output items (may be aliased to `input`)
1143
+ //!
1144
+ //! @param[in] initial_value
1145
+ //! @rst
1146
+ //! Initial value to seed the exclusive scan (and is assigned to `output[0]` in *thread*\ :sub:`0`). It is not taken
1147
+ //! into account for ``block_aggregate``.
1148
+ //! @endrst
1149
+ //!
1150
+ //! @param[in] scan_op
1151
+ //! Binary scan functor
1152
+ //!
1153
+ //! @param[out] block_aggregate
1154
+ //! block-wide aggregate reduction of input items
1155
+ template <int ITEMS_PER_THREAD, typename ScanOp>
1156
+ _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveScan(
1157
+ T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], T initial_value, ScanOp scan_op, T& block_aggregate)
1158
+ {
1159
+ // Step 1: reduce this thread's ITEMS_PER_THREAD consecutive items into one partial using scan_op
1160
+ T thread_prefix = cub::ThreadReduce(input, scan_op);
1161
+
1162
+ // Step 2: exclusive block-wide scan of the per-thread partials (single-item overload), seeded with initial_value; also writes block_aggregate
1163
+ ExclusiveScan(thread_prefix, thread_prefix, initial_value, scan_op, block_aggregate);
1164
+
1165
+ // Step 3: exclusive scan over this thread's own items, seeded with thread_prefix, written to output
1166
+ detail::ThreadScanExclusive(input, output, scan_op, thread_prefix);
1167
+ }
1168
+
1169
+ //! @rst
1170
+ //! Computes an exclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
1171
+ //! Each thread contributes an array of consecutive input elements.
1172
+ //! The call-back functor ``block_prefix_callback_op`` is invoked by the first warp in the block, and the value
1173
+ //! returned by *lane*\ :sub:`0` in that warp is used as the "seed" value that logically prefixes the thread
1174
+ //! block's scan inputs. Also provides every thread with the block-wide ``block_aggregate`` of all inputs.
1175
+ //!
1176
+ //! - The ``block_prefix_callback_op`` functor must implement a member function
1177
+ //! ``T operator()(T block_aggregate)``. The functor's input parameter ``block_aggregate``
1178
+ //! is the same value also returned by the scan operation. The functor will be invoked by the
1179
+ //! first warp of threads in the block, however only the return value from
1180
+ //! *lane*\ :sub:`0` is applied as the block-wide prefix. Can be stateful.
1181
+ //! - Supports non-commutative scan operators.
1182
+ //! - @blocked
1183
+ //! - @granularity
1184
+ //! - @smemreuse
1185
+ //!
1186
+ //! Snippet
1187
+ //! +++++++
1188
+ //!
1189
+ //! The code snippet below illustrates a single thread block that progressively
1190
+ //! computes an exclusive prefix max scan over multiple "tiles" of input using a
1191
+ //! prefix functor to maintain a running total between block-wide scans. Each tile consists
1192
+ //! of 512 integer items that are partitioned in a blocked arrangement across 128 threads where each thread owns 4 consecutive items.
1193
+ //!
1194
+ //! .. code-block:: c++
1195
+ //!
1196
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
1197
+ //!
1198
+ //! // A stateful callback functor that maintains a running prefix to be applied
1199
+ //! // during consecutive scan operations.
1200
+ //! struct BlockPrefixCallbackOp
1201
+ //! {
1202
+ //! // Running prefix
1203
+ //! int running_total;
1204
+ //!
1205
+ //! // Constructor
1206
+ //! __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {}
1206
+ //!
1207
+ //! // Callback operator to be entered by the first warp of threads in the block.
1208
+ //! // Thread-0 is responsible for returning a value for seeding the block-wide scan.
1209
+ //! __device__ int operator()(int block_aggregate)
1211
+ //! {
1212
+ //! int old_prefix = running_total;
1213
+ //! running_total = (block_aggregate > old_prefix) ? block_aggregate : old_prefix;
1214
+ //! return old_prefix;
1215
+ //! }
1216
+ //! };
1217
+ //!
1218
+ //! __global__ void ExampleKernel(int *d_data, int num_items, ...)
1219
+ //! {
1220
+ //! // Specialize BlockLoad, BlockStore, and BlockScan for a 1D block of 128 threads, 4 ints per thread
1221
+ //! using BlockLoad = cub::BlockLoad<int, 128, 4, BLOCK_LOAD_TRANSPOSE> ;
1222
+ //! using BlockStore = cub::BlockStore<int, 128, 4, BLOCK_STORE_TRANSPOSE> ;
1223
+ //! using BlockScan = cub::BlockScan<int, 128> ;
1224
+ //!
1225
+ //! // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan
1226
+ //! __shared__ union {
1227
+ //! typename BlockLoad::TempStorage load;
1228
+ //! typename BlockScan::TempStorage scan;
1229
+ //! typename BlockStore::TempStorage store;
1230
+ //! } temp_storage;
1231
+ //!
1232
+ //! // Initialize running total
1233
+ //! BlockPrefixCallbackOp prefix_op(INT_MIN);
1233
+ //!
1234
+ //! // Have the block iterate over segments of items
1235
+ //! for (int block_offset = 0; block_offset < num_items; block_offset += 128 * 4)
1236
+ //! {
1237
+ //! // Load a segment of consecutive items that are blocked across threads
1238
+ //! int thread_data[4];
1239
+ //! BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data);
1240
+ //! __syncthreads();
1241
+ //!
1242
+ //! // Collectively compute the block-wide exclusive prefix max scan
1243
+ //! BlockScan(temp_storage.scan).ExclusiveScan(
1244
+ //! thread_data, thread_data, cuda::maximum<>{}, prefix_op);
1246
+ //! __syncthreads();
1247
+ //!
1248
+ //! // Store scanned items to output segment
1249
+ //! BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data);
1250
+ //! __syncthreads();
1251
+ //! }
1252
+ //!
1253
+ //! Suppose the input ``d_data`` is ``0, -1, 2, -3, 4, -5, ...``.
1254
+ //! The corresponding output for the first segment will be
1255
+ //! ``INT_MIN, 0, 0, 2, 2, 4, ..., 508, 510``.
1256
+ //! The output for the second segment will be
1257
+ //! ``510, 512, 512, 514, 514, 516, ..., 1020, 1022``.
1258
+ //!
1259
+ //! @endrst
1260
+ //!
1261
+ //! @tparam ITEMS_PER_THREAD
1262
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
1263
+ //!
1264
+ //! @tparam ScanOp
1265
+ //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
1266
+ //!
1267
+ //! @tparam BlockPrefixCallbackOp
1268
+ //! **[inferred]** Call-back functor type having member `T operator()(T block_aggregate)`
1269
+ //!
1270
+ //! @param[in] input
1271
+ //! Calling thread's input items
1272
+ //!
1273
+ //! @param[out] output
1274
+ //! Calling thread's output items (may be aliased to `input`)
1275
+ //!
1276
+ //! @param[in] scan_op
1277
+ //! Binary scan functor
1278
+ //!
1279
+ //! @param[in,out] block_prefix_callback_op
1280
+ //! @rst
1281
+ //! *warp*\ :sub:`0` only call-back functor for specifying a block-wide prefix to be applied to
1282
+ //! the logical input sequence.
1283
+ //! @endrst
1284
template <int ITEMS_PER_THREAD, typename ScanOp, typename BlockPrefixCallbackOp>
_CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveScan(
  T (&input)[ITEMS_PER_THREAD],
  T (&output)[ITEMS_PER_THREAD],
  ScanOp scan_op,
  BlockPrefixCallbackOp& block_prefix_callback_op)
{
  // Step 1: fold this thread's items into one partial with scan_op.
  T running = cub::ThreadReduce(input, scan_op);

  // Step 2: exclusive scan of the per-thread partials (in place) through the
  // single-item overload, forwarding the prefix call-back functor.
  ExclusiveScan(running, running, scan_op, block_prefix_callback_op);

  // Step 3: exclusive scan over this thread's own items, seeded with the scanned partial.
  detail::ThreadScanExclusive(input, output, scan_op, running);
}
1300
+
1301
+ //! @} end member group
1302
+ #ifndef _CCCL_DOXYGEN_INVOKED // Do not document no-initial-value scans
1303
+
1304
+ //! @name Exclusive prefix scan operations (no initial value, single datum per thread)
1305
+ //! @{
1306
+
1307
+ //! @rst
1308
+ //! Computes an exclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
1309
+ //! Each thread contributes one input element.
1310
+ //! With no initial value, the output computed for *thread*\ :sub:`0` is undefined.
1311
+ //!
1312
+ //! - Supports non-commutative scan operators.
1313
+ //! - @rowmajor
1314
+ //! - @smemreuse
1315
+ //!
1316
+ //! @endrst
1317
+ //!
1318
+ //! @tparam ScanOp
1319
+ //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
1320
+ //!
1321
+ //! @param[in] input
1322
+ //! Calling thread's input item
1323
+ //!
1324
+ //! @param[out] output
1325
+ //! Calling thread's output item (may be aliased to `input`)
1326
+ //!
1327
+ //! @param[in] scan_op
1328
+ //! Binary scan functor
1329
template <typename ScanOp>
_CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveScan(T input, T& output, ScanOp scan_op)
{
  // Construct the internal scan object over temp_storage and delegate the scan to it.
  InternalBlockScan internal_scan(temp_storage);
  internal_scan.ExclusiveScan(input, output, scan_op);
}
1334
+
1335
+ //! @rst
1336
+ //! Computes an exclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
1337
+ //! Each thread contributes one input element. Also provides every thread with the block-wide
1338
+ //! ``block_aggregate`` of all inputs. With no initial value, the output computed for
1339
+ //! *thread*\ :sub:`0` is undefined.
1340
+ //!
1341
+ //! - Supports non-commutative scan operators.
1342
+ //! - @rowmajor
1343
+ //! - @smemreuse
1344
+ //!
1345
+ //! @endrst
1346
+ //!
1347
+ //! @tparam ScanOp
1348
+ //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
1349
+ //!
1350
+ //! @param[in] input
1351
+ //! Calling thread's input item
1352
+ //!
1353
+ //! @param[out] output
1354
+ //! Calling thread's output item (may be aliased to `input`)
1355
+ //!
1356
+ //! @param[in] scan_op
1357
+ //! Binary scan functor
1358
+ //!
1359
+ //! @param[out] block_aggregate
1360
+ //! block-wide aggregate reduction of input items
1361
template <typename ScanOp>
_CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveScan(T input, T& output, ScanOp scan_op, T& block_aggregate)
{
  // Construct the internal scan object over temp_storage and delegate the scan to it;
  // block_aggregate is passed through by reference for the internal scan to fill in.
  InternalBlockScan internal_scan(temp_storage);
  internal_scan.ExclusiveScan(input, output, scan_op, block_aggregate);
}
1366
+
1367
+ //! @} end member group
1368
+ //! @name Exclusive prefix scan operations (no initial value, multiple data per thread)
1369
+ //! @{
1370
+
1371
+ //! @rst
1372
+ //! Computes an exclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
1373
+ //! Each thread contributes an array of consecutive input elements. With no initial value, the
1374
+ //! output computed for *thread*\ :sub:`0` is undefined.
1375
+ //!
1376
+ //! - Supports non-commutative scan operators.
1377
+ //! - @blocked
1378
+ //! - @granularity
1379
+ //! - @smemreuse
1380
+ //!
1381
+ //! @endrst
1382
+ //!
1383
+ //! @tparam ITEMS_PER_THREAD
1384
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
1385
+ //!
1386
+ //! @tparam ScanOp
1387
+ //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
1388
+ //!
1389
+ //! @param[in] input
1390
+ //! Calling thread's input items
1391
+ //!
1392
+ //! @param[out] output
1393
+ //! Calling thread's output items (may be aliased to `input`)
1394
+ //!
1395
+ //! @param[in] scan_op
1396
+ //! Binary scan functor
1397
template <int ITEMS_PER_THREAD, typename ScanOp>
_CCCL_DEVICE _CCCL_FORCEINLINE void
ExclusiveScan(T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], ScanOp scan_op)
{
  // Step 1: fold this thread's items into one partial with scan_op.
  T partial = cub::ThreadReduce(input, scan_op);

  // Step 2: exclusive scan of the per-thread partials (in place) through the single-item overload.
  ExclusiveScan(partial, partial, scan_op);

  // Step 3: exclusive scan over this thread's own items. The trailing flag is false only
  // for linear thread 0, which has no preceding partial to apply.
  const bool has_predecessor = (linear_tid != 0);
  detail::ThreadScanExclusive(input, output, scan_op, partial, has_predecessor);
}
1410
+
1411
+ //! @rst
1412
+ //! Computes an exclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
1413
+ //! Each thread contributes an array of consecutive input elements. Also provides every thread
1414
+ //! with the block-wide ``block_aggregate`` of all inputs.
1415
+ //! With no initial value, the output computed for *thread*\ :sub:`0` is undefined.
1416
+ //!
1417
+ //! - Supports non-commutative scan operators.
1418
+ //! - @blocked
1419
+ //! - @granularity
1420
+ //! - @smemreuse
1421
+ //!
1422
+ //! @endrst
1423
+ //!
1424
+ //! @tparam ITEMS_PER_THREAD
1425
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
1426
+ //!
1427
+ //! @tparam ScanOp
1428
+ //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
1429
+ //!
1430
+ //! @param[in] input
1431
+ //! Calling thread's input items
1432
+ //!
1433
+ //! @param[out] output
1434
+ //! Calling thread's output items (may be aliased to `input`)
1435
+ //!
1436
+ //! @param[in] scan_op
1437
+ //! Binary scan functor
1438
+ //!
1439
+ //! @param[out] block_aggregate
1440
+ //! block-wide aggregate reduction of input items
1441
template <int ITEMS_PER_THREAD, typename ScanOp>
_CCCL_DEVICE _CCCL_FORCEINLINE void
ExclusiveScan(T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], ScanOp scan_op, T& block_aggregate)
{
  // Step 1: fold this thread's items into one partial with scan_op.
  T partial = cub::ThreadReduce(input, scan_op);

  // Step 2: exclusive scan of the per-thread partials (in place) through the single-item
  // overload, which also receives block_aggregate by reference.
  ExclusiveScan(partial, partial, scan_op, block_aggregate);

  // Step 3: exclusive scan over this thread's own items. The trailing flag is false only
  // for linear thread 0, which has no preceding partial to apply.
  const bool has_predecessor = (linear_tid != 0);
  detail::ThreadScanExclusive(input, output, scan_op, partial, has_predecessor);
}
1454
+
1455
+ //! @} end member group
1456
+ #endif // _CCCL_DOXYGEN_INVOKED // Do not document no-initial-value scans
1457
+
1458
+ //! @name Inclusive prefix sum operations
1459
+ //! @{
1460
+
1461
+ //! @rst
1462
+ //! Computes an inclusive block-wide prefix scan using addition (+)
1463
+ //! as the scan operator. Each thread contributes one input element.
1464
+ //!
1465
+ //! - @rowmajor
1466
+ //! - @smemreuse
1467
+ //!
1468
+ //! Snippet
1469
+ //! +++++++
1470
+ //!
1471
+ //! The code snippet below illustrates an inclusive prefix sum of 128 integer items that
1472
+ //! are partitioned across 128 threads.
1473
+ //!
1474
+ //! .. code-block:: c++
1475
+ //!
1476
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
1477
+ //!
1478
+ //! __global__ void ExampleKernel(...)
1479
+ //! {
1480
+ //! // Specialize BlockScan for a 1D block of 128 threads of type int
1481
+ //! using BlockScan = cub::BlockScan<int, 128>;
1482
+ //!
1483
+ //! // Allocate shared memory for BlockScan
1484
+ //! __shared__ typename BlockScan::TempStorage temp_storage;
1485
+ //!
1486
+ //! // Obtain input item for each thread
1487
+ //! int thread_data;
1488
+ //! ...
1489
+ //!
1490
+ //! // Collectively compute the block-wide inclusive prefix sum
1491
+ //! BlockScan(temp_storage).InclusiveSum(thread_data, thread_data);
1492
+ //!
1493
+ //! Suppose the set of input ``thread_data`` across the block of threads is ``1, 1, ..., 1``.
1494
+ //! The corresponding output ``thread_data`` in those threads will be ``1, 2, ..., 128``.
1495
+ //!
1496
+ //! @endrst
1497
+ //!
1498
+ //! @param[in] input
1499
+ //! Calling thread's input item
1500
+ //!
1501
+ //! @param[out] output
1502
+ //! Calling thread's output item (may be aliased to `input`)
1503
_CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveSum(T input, T& output)
{
  // An inclusive sum is the generic inclusive scan with addition as the operator.
  using sum_op_t = ::cuda::std::plus<>;
  InclusiveScan(input, output, sum_op_t{});
}
1507
+
1508
+ //! @rst
1509
+ //! Computes an inclusive block-wide prefix scan using addition (+) as the scan operator.
1510
+ //! Each thread contributes one input element.
1511
+ //! Also provides every thread with the block-wide ``block_aggregate`` of all inputs.
1512
+ //!
1513
+ //! - @rowmajor
1514
+ //! - @smemreuse
1515
+ //!
1516
+ //! Snippet
1517
+ //! +++++++
1518
+ //!
1519
+ //! The code snippet below illustrates an inclusive prefix sum of 128 integer items that
1520
+ //! are partitioned across 128 threads.
1521
+ //!
1522
+ //! .. code-block:: c++
1523
+ //!
1524
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
1525
+ //!
1526
+ //! __global__ void ExampleKernel(...)
1527
+ //! {
1528
+ //! // Specialize BlockScan for a 1D block of 128 threads of type int
1529
+ //! using BlockScan = cub::BlockScan<int, 128>;
1530
+ //!
1531
+ //! // Allocate shared memory for BlockScan
1532
+ //! __shared__ typename BlockScan::TempStorage temp_storage;
1533
+ //!
1534
+ //! // Obtain input item for each thread
1535
+ //! int thread_data;
1536
+ //! ...
1537
+ //!
1538
+ //! // Collectively compute the block-wide inclusive prefix sum
1539
+ //! int block_aggregate;
1540
+ //! BlockScan(temp_storage).InclusiveSum(thread_data, thread_data, block_aggregate);
1541
+ //!
1542
+ //! Suppose the set of input ``thread_data`` across the block of threads is ``1, 1, ..., 1``.
1543
+ //! The corresponding output ``thread_data`` in those threads will be ``1, 2, ..., 128``.
1544
+ //! Furthermore the value ``128`` will be stored in ``block_aggregate`` for all threads.
1545
+ //!
1546
+ //! @endrst
1547
+ //!
1548
+ //! @param[in] input
1549
+ //! Calling thread's input item
1550
+ //!
1551
+ //! @param[out] output
1552
+ //! Calling thread's output item (may be aliased to `input`)
1553
+ //!
1554
+ //! @param[out] block_aggregate
1555
+ //! block-wide aggregate reduction of input items
1556
_CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveSum(T input, T& output, T& block_aggregate)
{
  // An inclusive sum is the generic inclusive scan with addition as the operator;
  // block_aggregate is forwarded by reference to that scan.
  using sum_op_t = ::cuda::std::plus<>;
  InclusiveScan(input, output, sum_op_t{}, block_aggregate);
}
1560
+
1561
+ //! @rst
1562
+ //! Computes an inclusive block-wide prefix scan using addition (+) as the scan operator.
1563
+ //! Each thread contributes one input element. Instead of using 0 as the block-wide prefix, the call-back functor
1564
+ //! ``block_prefix_callback_op`` is invoked by the first warp in the block, and the value returned by
1565
+ //! *lane*\ :sub:`0` in that warp is used as the "seed" value that logically prefixes the thread block's
1566
+ //! scan inputs. Also provides every thread with the block-wide ``block_aggregate`` of all inputs.
1567
+ //!
1568
+ //! - The ``block_prefix_callback_op`` functor must implement a member function
1569
+ //! ``T operator()(T block_aggregate)``. The functor's input parameter
1570
+ //! ``block_aggregate`` is the same value also returned by the scan operation.
1571
+ //! The functor will be invoked by the first warp of threads in the block,
1572
+ //! however only the return value from *lane*\ :sub:`0` is applied
1573
+ //! as the block-wide prefix. Can be stateful.
1574
+ //! - @rowmajor
1575
+ //! - @smemreuse
1576
+ //!
1577
+ //! Snippet
1578
+ //! +++++++
1579
+ //!
1580
+ //! The code snippet below illustrates a single thread block that progressively
1581
+ //! computes an inclusive prefix sum over multiple "tiles" of input using a
1582
+ //! prefix functor to maintain a running total between block-wide scans.
1583
+ //! Each tile consists of 128 integer items that are partitioned across 128 threads.
1584
+ //!
1585
+ //! .. code-block:: c++
1586
+ //!
1587
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
1588
+ //!
1589
+ //! // A stateful callback functor that maintains a running prefix to be applied
1590
+ //! // during consecutive scan operations.
1591
+ //! struct BlockPrefixCallbackOp
1592
+ //! {
1593
+ //! // Running prefix
1594
+ //! int running_total;
1595
+ //!
1596
+ //! // Constructor
1597
+ //! __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {}
1597
+ //!
1598
+ //! // Callback operator to be entered by the first warp of threads in the block.
1599
+ //! // Thread-0 is responsible for returning a value for seeding the block-wide scan.
1600
+ //! __device__ int operator()(int block_aggregate)
1602
+ //! {
1603
+ //! int old_prefix = running_total;
1604
+ //! running_total += block_aggregate;
1605
+ //! return old_prefix;
1606
+ //! }
1607
+ //! };
1608
+ //!
1609
+ //! __global__ void ExampleKernel(int *d_data, int num_items, ...)
1610
+ //! {
1611
+ //! // Specialize BlockScan for a 1D block of 128 threads
1612
+ //! using BlockScan = cub::BlockScan<int, 128>;
1613
+ //!
1614
+ //! // Allocate shared memory for BlockScan
1615
+ //! __shared__ typename BlockScan::TempStorage temp_storage;
1616
+ //!
1617
+ //! // Initialize running total
1618
+ //! BlockPrefixCallbackOp prefix_op(0);
1619
+ //!
1620
+ //! // Have the block iterate over segments of items
1621
+ //! for (int block_offset = 0; block_offset < num_items; block_offset += 128)
1622
+ //! {
1623
+ //! // Load a segment of consecutive items that are blocked across threads
1624
+ //! int thread_data = d_data[block_offset + threadIdx.x];
1625
+ //!
1626
+ //! // Collectively compute the block-wide inclusive prefix sum
1627
+ //! BlockScan(temp_storage).InclusiveSum(
1628
+ //! thread_data, thread_data, prefix_op);
1629
+ //! __syncthreads();
1630
+ //!
1631
+ //! // Store scanned items to output segment
1632
+ //! d_data[block_offset + threadIdx.x] = thread_data;
1633
+ //! }
1634
+ //!
1635
+ //! Suppose the input ``d_data`` is ``1, 1, 1, 1, 1, 1, 1, 1, ...``.
1636
+ //! The corresponding output for the first segment will be ``1, 2, ..., 128``.
1637
+ //! The output for the second segment will be ``129, 130, ..., 256``.
1638
+ //!
1639
+ //! @endrst
1640
+ //!
1641
+ //! @tparam BlockPrefixCallbackOp
1642
+ //! **[inferred]** Call-back functor type having member `T operator()(T block_aggregate)`
1643
+ //!
1644
+ //! @param[in] input
1645
+ //! Calling thread's input item
1646
+ //!
1647
+ //! @param[out] output
1648
+ //! Calling thread's output item (may be aliased to `input`)
1649
+ //!
1650
+ //! @param[in,out] block_prefix_callback_op
1651
+ //! @rst
1652
+ //! *warp*\ :sub:`0` only call-back functor for specifying a block-wide prefix to be applied
1653
+ //! to the logical input sequence.
1654
+ //! @endrst
1655
template <typename BlockPrefixCallbackOp>
_CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveSum(T input, T& output, BlockPrefixCallbackOp& block_prefix_callback_op)
{
  // An inclusive sum is the generic inclusive scan with addition as the operator;
  // the prefix call-back functor is forwarded by reference to that scan.
  using sum_op_t = ::cuda::std::plus<>;
  InclusiveScan(input, output, sum_op_t{}, block_prefix_callback_op);
}
1660
+
1661
+ //! @} end member group
1662
+ //! @name Inclusive prefix sum operations (multiple data per thread)
1663
+ //! @{
1664
+
1665
+ //! @rst
1666
+ //! Computes an inclusive block-wide prefix scan using addition (+) as the scan operator.
1667
+ //! Each thread contributes an array of consecutive input elements.
1668
+ //!
1669
+ //! - @blocked
1670
+ //! - @granularity
1671
+ //! - @smemreuse
1672
+ //!
1673
+ //! Snippet
1674
+ //! +++++++
1675
+ //!
1676
+ //! The code snippet below illustrates an inclusive prefix sum of 512 integer items that
1677
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128 threads
1678
+ //! where each thread owns 4 consecutive items.
1679
+ //!
1680
+ //! .. code-block:: c++
1681
+ //!
1682
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
1683
+ //!
1684
+ //! __global__ void ExampleKernel(...)
1685
+ //! {
1686
+ //! // Specialize BlockScan for a 1D block of 128 threads of type int
1687
+ //! using BlockScan = cub::BlockScan<int, 128>;
1688
+ //!
1689
+ //! // Allocate shared memory for BlockScan
1690
+ //! __shared__ typename BlockScan::TempStorage temp_storage;
1691
+ //!
1692
+ //! // Obtain a segment of consecutive items that are blocked across threads
1693
+ //! int thread_data[4];
1694
+ //! ...
1695
+ //!
1696
+ //! // Collectively compute the block-wide inclusive prefix sum
1697
+ //! BlockScan(temp_storage).InclusiveSum(thread_data, thread_data);
1698
+ //!
1699
+ //! Suppose the set of input ``thread_data`` across the block of threads is
1700
+ //! ``{ [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }``. The corresponding output
1701
+ //! ``thread_data`` in those threads will be ``{ [1,2,3,4], [5,6,7,8], ..., [509,510,511,512] }``.
1702
+ //!
1703
+ //! @endrst
1704
+ //!
1705
+ //! @tparam ITEMS_PER_THREAD
1706
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
1707
+ //!
1708
+ //! @param[in] input
1709
+ //! Calling thread's input items
1710
+ //!
1711
+ //! @param[out] output
1712
+ //! Calling thread's output items (may be aliased to `input`)
1713
template <int ITEMS_PER_THREAD>
_CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveSum(T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD])
{
  if (ITEMS_PER_THREAD != 1)
  {
    // Fold this thread's items into one partial sum.
    ::cuda::std::plus<> sum_op;
    T partial = cub::ThreadReduce(input, sum_op);

    // Exclusive sum of the per-thread partials (in place) through the single-item overload.
    ExclusiveSum(partial, partial);

    // Inclusive scan over this thread's own items. The trailing flag is false only
    // for linear thread 0, which has no preceding partial to apply.
    const bool has_predecessor = (linear_tid != 0);
    detail::ThreadScanInclusive(input, output, sum_op, partial, has_predecessor);
  }
  else
  {
    // A single item per thread is exactly the scalar overload.
    InclusiveSum(input[0], output[0]);
  }
}
1733
+
1734
+ //! @rst
1735
+ //! Computes an inclusive block-wide prefix scan using addition (+) as the scan operator.
1736
+ //! Each thread contributes an array of consecutive input elements.
1737
+ //! Also provides every thread with the block-wide ``block_aggregate`` of all inputs.
1738
+ //!
1739
+ //! - @blocked
1740
+ //! - @granularity
1741
+ //! - @smemreuse
1742
+ //!
1743
+ //! Snippet
1744
+ //! +++++++
1745
+ //!
1746
+ //! The code snippet below illustrates an inclusive prefix sum of 512 integer items that
1747
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128 threads
1748
+ //! where each thread owns 4 consecutive items.
1749
+ //!
1750
+ //! .. code-block:: c++
1751
+ //!
1752
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
1753
+ //!
1754
+ //! __global__ void ExampleKernel(...)
1755
+ //! {
1756
+ //! // Specialize BlockScan for a 1D block of 128 threads of type int
1757
+ //! using BlockScan = cub::BlockScan<int, 128>;
1758
+ //!
1759
+ //! // Allocate shared memory for BlockScan
1760
+ //! __shared__ typename BlockScan::TempStorage temp_storage;
1761
+ //!
1762
+ //! // Obtain a segment of consecutive items that are blocked across threads
1763
+ //! int thread_data[4];
1764
+ //! ...
1765
+ //!
1766
+ //! // Collectively compute the block-wide inclusive prefix sum
1767
+ //! int block_aggregate;
1768
+ //! BlockScan(temp_storage).InclusiveSum(thread_data, thread_data, block_aggregate);
1769
+ //!
1770
+ //! Suppose the set of input ``thread_data`` across the block of threads is
1771
+ //! ``{ [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }``. The
1772
+ //! corresponding output ``thread_data`` in those threads will be
1773
+ //! ``{ [1,2,3,4], [5,6,7,8], ..., [509,510,511,512] }``.
1774
+ //! Furthermore the value ``512`` will be stored in ``block_aggregate`` for all threads.
1775
+ //!
1776
+ //! @endrst
1777
+ //!
1778
+ //! @tparam ITEMS_PER_THREAD
1779
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
1780
+ //!
1781
+ //! @param[in] input
1782
+ //! Calling thread's input items
1783
+ //!
1784
+ //! @param[out] output
1785
+ //! Calling thread's output items (may be aliased to `input`)
1786
+ //!
1787
+ //! @param[out] block_aggregate
1788
+ //! block-wide aggregate reduction of input items
1789
template <int ITEMS_PER_THREAD>
_CCCL_DEVICE _CCCL_FORCEINLINE void
InclusiveSum(T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], T& block_aggregate)
{
  if (ITEMS_PER_THREAD != 1)
  {
    // Fold this thread's items into one partial sum.
    ::cuda::std::plus<> sum_op;
    T partial = cub::ThreadReduce(input, sum_op);

    // Exclusive sum of the per-thread partials (in place) through the single-item
    // overload, which also receives block_aggregate by reference.
    ExclusiveSum(partial, partial, block_aggregate);

    // Inclusive scan over this thread's own items. The trailing flag is false only
    // for linear thread 0, which has no preceding partial to apply.
    const bool has_predecessor = (linear_tid != 0);
    detail::ThreadScanInclusive(input, output, sum_op, partial, has_predecessor);
  }
  else
  {
    // A single item per thread is exactly the scalar overload.
    InclusiveSum(input[0], output[0], block_aggregate);
  }
}
1810
+
1811
+ //! @rst
1812
+ //! Computes an inclusive block-wide prefix scan using addition (+) as the scan operator.
1813
+ //! Each thread contributes an array of consecutive input elements.
1814
+ //! Instead of using 0 as the block-wide prefix, the call-back functor ``block_prefix_callback_op`` is invoked by
1815
+ //! the first warp in the block, and the value returned by *lane*\ :sub:`0` in that warp is used as the "seed"
1816
+ //! value that logically prefixes the thread block's scan inputs. Also provides every thread with the
1817
+ //! block-wide ``block_aggregate`` of all inputs.
1818
+ //!
1819
+ //! - The ``block_prefix_callback_op`` functor must implement a member function
1820
+ //! ``T operator()(T block_aggregate)``. The functor's input parameter
1821
+ //! ``block_aggregate`` is the same value also returned by the scan operation.
1822
+ //! The functor will be invoked by the first warp of threads in the block,
1823
+ //! however only the return value from *lane*\ :sub:`0` is applied
1824
+ //! as the block-wide prefix. Can be stateful.
1825
+ //! - @blocked
1826
+ //! - @granularity
1827
+ //! - @smemreuse
1828
+ //!
1829
+ //! Snippet
1830
+ //! +++++++
1831
+ //!
1832
+ //! The code snippet below illustrates a single thread block that progressively
1833
+ //! computes an inclusive prefix sum over multiple "tiles" of input using a
1834
+ //! prefix functor to maintain a running total between block-wide scans. Each tile consists
1835
+ //! of 512 integer items that are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>`
1836
+ //! across 128 threads where each thread owns 4 consecutive items.
1837
+ //!
1838
+ //! .. code-block:: c++
1839
+ //!
1840
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
1841
+ //!
1842
+ //! // A stateful callback functor that maintains a running prefix to be applied
1843
+ //! // during consecutive scan operations.
1844
+ //! struct BlockPrefixCallbackOp
1845
+ //! {
1846
+ //! // Running prefix
1847
+ //! int running_total;
1848
+ //!
1849
+ //! // Constructor
1850
+ //! __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {}
1850
+ //!
1851
+ //! // Callback operator to be entered by the first warp of threads in the block.
1852
+ //! // Thread-0 is responsible for returning a value for seeding the block-wide scan.
1853
+ //! __device__ int operator()(int block_aggregate)
1855
+ //! {
1856
+ //! int old_prefix = running_total;
1857
+ //! running_total += block_aggregate;
1858
+ //! return old_prefix;
1859
+ //! }
1860
+ //! };
1861
+ //!
1862
+ //! __global__ void ExampleKernel(int *d_data, int num_items, ...)
1863
+ //! {
1864
+ //! // Specialize BlockLoad, BlockStore, and BlockScan for a 1D block of 128 threads, 4 ints per thread
1865
+ //! using BlockLoad = cub::BlockLoad<int, 128, 4, BLOCK_LOAD_TRANSPOSE> ;
1866
+ //! using BlockStore = cub::BlockStore<int, 128, 4, BLOCK_STORE_TRANSPOSE> ;
1867
+ //! using BlockScan = cub::BlockScan<int, 128> ;
1868
+ //!
1869
+ //! // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan
1870
+ //! __shared__ union {
1871
+ //! typename BlockLoad::TempStorage load;
1872
+ //! typename BlockScan::TempStorage scan;
1873
+ //! typename BlockStore::TempStorage store;
1874
+ //! } temp_storage;
1875
+ //!
1876
+ //! // Initialize running total
1877
+ //! BlockPrefixCallbackOp prefix_op(0);
1878
+ //!
1879
+ //! // Have the block iterate over segments of items
1880
+ //! for (int block_offset = 0; block_offset < num_items; block_offset += 128 * 4)
1881
+ //! {
1882
+ //! // Load a segment of consecutive items that are blocked across threads
1883
+ //! int thread_data[4];
1884
+ //! BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data);
1885
+ //! __syncthreads();
1886
+ //!
1887
+ //! // Collectively compute the block-wide inclusive prefix sum
1888
+ //! BlockScan(temp_storage.scan).InclusiveSum(
1889
+ //! thread_data, thread_data, prefix_op);
1890
+ //! __syncthreads();
1891
+ //!
1892
+ //! // Store scanned items to output segment
1893
+ //! BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data);
1894
+ //! __syncthreads();
1895
+ //! }
1896
+ //!
1897
+ //! Suppose the input ``d_data`` is ``1, 1, 1, 1, 1, 1, 1, 1, ...``.
1898
+ //! The corresponding output for the first segment will be
1899
+ //! ``1, 2, 3, 4, ..., 511, 512``. The output for the second segment will be
1900
+ //! ``513, 514, 515, 516, ..., 1023, 1024``.
1901
+ //!
1902
+ //! @endrst
1903
+ //!
1904
+ //! @tparam ITEMS_PER_THREAD
1905
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
1906
+ //!
1907
+ //! @tparam BlockPrefixCallbackOp
1908
+ //! **[inferred]** Call-back functor type having member `T operator()(T block_aggregate)`
1909
+ //!
1910
+ //! @param[in] input
1911
+ //! Calling thread's input items
1912
+ //!
1913
+ //! @param[out] output
1914
+ //! Calling thread's output items (may be aliased to `input`)
1915
+ //!
1916
+ //! @param[in,out] block_prefix_callback_op
1917
+ //! @rst
1918
+ //! *warp*\ :sub:`0` only call-back functor for specifying a block-wide prefix to be applied to the
1919
+ //! logical input sequence.
1920
+ //! @endrst
1921
template <int ITEMS_PER_THREAD, typename BlockPrefixCallbackOp>
_CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveSum(
  T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], BlockPrefixCallbackOp& block_prefix_callback_op)
{
  if (ITEMS_PER_THREAD != 1)
  {
    // Fold this thread's items into one partial sum.
    ::cuda::std::plus<> sum_op;
    T partial = cub::ThreadReduce(input, sum_op);

    // Exclusive sum of the per-thread partials (in place) through the single-item
    // overload, forwarding the prefix call-back functor.
    ExclusiveSum(partial, partial, block_prefix_callback_op);

    // Inclusive scan over this thread's own items, seeded with the scanned partial.
    detail::ThreadScanInclusive(input, output, sum_op, partial);
  }
  else
  {
    // A single item per thread is exactly the scalar overload.
    InclusiveSum(input[0], output[0], block_prefix_callback_op);
  }
}
1942
+
1943
+ //! @} end member group
1944
+ //! @name Inclusive prefix scan operations
1945
+ //! @{
1946
+
1947
+ //! @rst
1948
+ //! Computes an inclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
1949
+ //! Each thread contributes one input element.
1950
+ //!
1951
+ //! - Supports non-commutative scan operators.
1952
+ //! - @rowmajor
1953
+ //! - @smemreuse
1954
+ //!
1955
+ //! Snippet
1956
+ //! +++++++
1957
+ //!
1958
+ //! The code snippet below illustrates an inclusive prefix max scan of 128 integer items that
1959
+ //! are partitioned across 128 threads.
1960
+ //!
1961
+ //! .. code-block:: c++
1962
+ //!
1963
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
1964
+ //!
1965
+ //! __global__ void ExampleKernel(...)
1966
+ //! {
1967
+ //! // Specialize BlockScan for a 1D block of 128 threads of type int
1968
+ //! using BlockScan = cub::BlockScan<int, 128>;
1969
+ //!
1970
+ //! // Allocate shared memory for BlockScan
1971
+ //! __shared__ typename BlockScan::TempStorage temp_storage;
1972
+ //!
1973
+ //! // Obtain input item for each thread
1974
+ //! int thread_data;
1975
+ //! ...
1976
+ //!
1977
+ //! // Collectively compute the block-wide inclusive prefix max scan
1978
+ //! BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cuda::maximum<>{});
1979
+ //!
1980
+ //! Suppose the set of input ``thread_data`` across the block of threads is
1981
+ //! ``0, -1, 2, -3, ..., 126, -127``. The corresponding output ``thread_data``
1982
+ //! in those threads will be ``0, 0, 2, 2, ..., 126, 126``.
1983
+ //!
1984
+ //! @endrst
1985
+ //!
1986
+ //! @tparam ScanOp
1987
+ //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
1988
+ //!
1989
+ //! @param[in] input
1990
+ //! Calling thread's input item
1991
+ //!
1992
+ //! @param[out] output
1993
+ //! Calling thread's output item (may be aliased to `input`)
1994
+ //!
1995
+ //! @param[in] scan_op
1996
+ //! Binary scan functor
1997
+ template <typename ScanOp>
1998
+ _CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveScan(T input, T& output, ScanOp scan_op)
1999
+ {
2000
+ InternalBlockScan(temp_storage).InclusiveScan(input, output, scan_op);
2001
+ }
2002
+
2003
+ //! @rst
2004
+ //! Computes an inclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
2005
+ //! Each thread contributes one input element. Also provides every thread with the block-wide
2006
+ //! ``block_aggregate`` of all inputs.
2007
+ //!
2008
+ //! - Supports non-commutative scan operators.
2009
+ //! - @rowmajor
2010
+ //! - @smemreuse
2011
+ //!
2012
+ //! Snippet
2013
+ //! +++++++
2014
+ //!
2015
+ //! The code snippet below illustrates an inclusive prefix max scan of 128
2016
+ //! integer items that are partitioned across 128 threads.
2017
+ //!
2018
+ //! .. code-block:: c++
2019
+ //!
2020
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
2021
+ //!
2022
+ //! __global__ void ExampleKernel(...)
2023
+ //! {
2024
+ //! // Specialize BlockScan for a 1D block of 128 threads of type int
2025
+ //! using BlockScan = cub::BlockScan<int, 128>;
2026
+ //!
2027
+ //! // Allocate shared memory for BlockScan
2028
+ //! __shared__ typename BlockScan::TempStorage temp_storage;
2029
+ //!
2030
+ //! // Obtain input item for each thread
2031
+ //! int thread_data;
2032
+ //! ...
2033
+ //!
2034
+ //! // Collectively compute the block-wide inclusive prefix max scan
2035
+ //! int block_aggregate;
2036
+ //! BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cuda::maximum<>{}, block_aggregate);
2037
+ //!
2038
+ //! Suppose the set of input ``thread_data`` across the block of threads is
2039
+ //! ``0, -1, 2, -3, ..., 126, -127``. The corresponding output ``thread_data``
2040
+ //! in those threads will be ``0, 0, 2, 2, ..., 126, 126``. Furthermore the value
2041
+ //! ``126`` will be stored in ``block_aggregate`` for all threads.
2042
+ //!
2043
+ //! @endrst
2044
+ //!
2045
+ //! @tparam ScanOp
2046
+ //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
2047
+ //!
2048
+ //! @param[in] input
2049
+ //! Calling thread's input item
2050
+ //!
2051
+ //! @param[out] output
2052
+ //! Calling thread's output item (may be aliased to `input`)
2053
+ //!
2054
+ //! @param[in] scan_op
2055
+ //! Binary scan functor
2056
+ //!
2057
+ //! @param[out] block_aggregate
2058
+ //! Block-wide aggregate reduction of input items
2059
+ template <typename ScanOp>
2060
+ _CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveScan(T input, T& output, ScanOp scan_op, T& block_aggregate)
2061
+ {
2062
+ InternalBlockScan(temp_storage).InclusiveScan(input, output, scan_op, block_aggregate);
2063
+ }
2064
+
2065
+ //! @rst
2066
+ //! Computes an inclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
2067
+ //! Each thread contributes one input element. The call-back functor ``block_prefix_callback_op``
2068
+ //! is invoked by the first warp in the block, and the value returned by *lane*\ :sub:`0` in that warp is used as
2069
+ //! the "seed" value that logically prefixes the thread block's scan inputs.
2070
+ //! Also provides every thread with the block-wide ``block_aggregate`` of all inputs.
2071
+ //!
2072
+ //! - The ``block_prefix_callback_op`` functor must implement a member function
2073
+ //! ``T operator()(T block_aggregate)``. The functor's input parameter
2074
+ //! ``block_aggregate`` is the same value also returned by the scan operation.
2075
+ //! The functor will be invoked by the first warp of threads in the block,
2076
+ //! however only the return value from *lane*\ :sub:`0` is applied
2077
+ //! as the block-wide prefix. Can be stateful.
2078
+ //! - Supports non-commutative scan operators.
2079
+ //! - @rowmajor
2080
+ //! - @smemreuse
2081
+ //!
2082
+ //! Snippet
2083
+ //! +++++++
2084
+ //!
2085
+ //! The code snippet below illustrates a single thread block that progressively
2086
+ //! computes an inclusive prefix max scan over multiple "tiles" of input using a
2087
+ //! prefix functor to maintain a running total between block-wide scans. Each tile consists
2088
+ //! of 128 integer items that are partitioned across 128 threads.
2089
+ //!
2090
+ //! .. code-block:: c++
2091
+ //!
2092
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
2093
+ //!
2094
+ //! // A stateful callback functor that maintains a running prefix to be applied
2095
+ //! // during consecutive scan operations.
2096
+ //! struct BlockPrefixCallbackOp
2097
+ //! {
2098
+ //! // Running prefix
2099
+ //! int running_total;
2100
+ //!
2101
+ //! // Constructor
2102
+ //! __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {}
2103
+ //!
2104
+ //! // Callback operator to be entered by the first warp of threads in the block.
2105
+ //! // Thread-0 is responsible for returning a value for seeding the block-wide scan.
2106
+ //! __device__ int operator()(int block_aggregate)
2107
+ //! {
2108
+ //! int old_prefix = running_total;
2109
+ //! running_total = (block_aggregate > old_prefix) ? block_aggregate : old_prefix;
2110
+ //! return old_prefix;
2111
+ //! }
2112
+ //! };
2113
+ //!
2114
+ //! __global__ void ExampleKernel(int *d_data, int num_items, ...)
2115
+ //! {
2116
+ //! // Specialize BlockScan for a 1D block of 128 threads
2117
+ //! using BlockScan = cub::BlockScan<int, 128>;
2118
+ //!
2119
+ //! // Allocate shared memory for BlockScan
2120
+ //! __shared__ typename BlockScan::TempStorage temp_storage;
2121
+ //!
2122
+ //! // Initialize running total
2123
+ //! BlockPrefixCallbackOp prefix_op(INT_MIN);
2124
+ //!
2125
+ //! // Have the block iterate over segments of items
2126
+ //! for (int block_offset = 0; block_offset < num_items; block_offset += 128)
2127
+ //! {
2128
+ //! // Load a segment of consecutive items that are blocked across threads
2129
+ //! int thread_data = d_data[block_offset + threadIdx.x];
2130
+ //!
2131
+ //! // Collectively compute the block-wide inclusive prefix max scan
2132
+ //! BlockScan(temp_storage).InclusiveScan(
2133
+ //! thread_data, thread_data, cuda::maximum<>{}, prefix_op);
2134
+ //! __syncthreads();
2135
+ //!
2136
+ //! // Store scanned items to output segment
2137
+ //! d_data[block_offset + threadIdx.x] = thread_data;
2138
+ //! }
2139
+ //!
2140
+ //! Suppose the input ``d_data`` is ``0, -1, 2, -3, 4, -5, ...``.
2141
+ //! The corresponding output for the first segment will be
2142
+ //! ``0, 0, 2, 2, ..., 126, 126``. The output for the second segment
2143
+ //! will be ``128, 128, 130, 130, ..., 254, 254``.
2144
+ //!
2145
+ //! @endrst
2146
+ //!
2147
+ //! @tparam ScanOp
2148
+ //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
2149
+ //!
2150
+ //! @tparam BlockPrefixCallbackOp
2151
+ //! **[inferred]** Call-back functor type having member `T operator()(T block_aggregate)`
2152
+ //!
2153
+ //! @param[in] input
2154
+ //! Calling thread's input item
2155
+ //!
2156
+ //! @param[out] output
2157
+ //! Calling thread's output item (may be aliased to `input`)
2158
+ //!
2159
+ //! @param[in] scan_op
2160
+ //! Binary scan functor
2161
+ //!
2162
+ //! @param[in,out] block_prefix_callback_op
2163
+ //! @rst
2164
+ //! *warp*\ :sub:`0` only call-back functor for specifying a block-wide prefix to be applied to
2165
+ //! the logical input sequence.
2166
+ //! @endrst
2167
+ template <typename ScanOp, typename BlockPrefixCallbackOp>
2168
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
2169
+ InclusiveScan(T input, T& output, ScanOp scan_op, BlockPrefixCallbackOp& block_prefix_callback_op)
2170
+ {
2171
+ InternalBlockScan(temp_storage).InclusiveScan(input, output, scan_op, block_prefix_callback_op);
2172
+ }
2173
+
2174
+ //! @} end member group
2175
+ //! @name Inclusive prefix scan operations (multiple data per thread)
2176
+ //! @{
2177
+
2178
+ //! @rst
2179
+ //! Computes an inclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
2180
+ //! Each thread contributes an array of consecutive input elements.
2181
+ //!
2182
+ //! - Supports non-commutative scan operators.
2183
+ //! - @blocked
2184
+ //! - @granularity
2185
+ //! - @smemreuse
2186
+ //!
2187
+ //! Snippet
2188
+ //! +++++++
2189
+ //!
2190
+ //! The code snippet below illustrates an inclusive prefix max scan of 512 integer items that
2191
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128 threads
2192
+ //! where each thread owns 4 consecutive items.
2193
+ //!
2194
+ //! .. code-block:: c++
2195
+ //!
2196
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
2197
+ //!
2198
+ //! __global__ void ExampleKernel(...)
2199
+ //! {
2200
+ //! // Specialize BlockScan for a 1D block of 128 threads of type int
2201
+ //! using BlockScan = cub::BlockScan<int, 128>;
2202
+ //!
2203
+ //! // Allocate shared memory for BlockScan
2204
+ //! __shared__ typename BlockScan::TempStorage temp_storage;
2205
+ //!
2206
+ //! // Obtain a segment of consecutive items that are blocked across threads
2207
+ //! int thread_data[4];
2208
+ //! ...
2209
+ //!
2210
+ //! // Collectively compute the block-wide inclusive prefix max scan
2211
+ //! BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cuda::maximum<>{});
2212
+ //!
2213
+ //! Suppose the set of input ``thread_data`` across the block of threads is
2214
+ //! ``{ [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }``.
2215
+ //! The corresponding output ``thread_data`` in those threads will be
2216
+ //! ``{ [0,0,2,2], [4,4,6,6], ..., [508,508,510,510] }``.
2217
+ //!
2218
+ //! @endrst
2219
+ //!
2220
+ //! @tparam ITEMS_PER_THREAD
2221
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
2222
+ //!
2223
+ //! @tparam ScanOp
2224
+ //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
2225
+ //!
2226
+ //! @param[in] input
2227
+ //! Calling thread's input items
2228
+ //!
2229
+ //! @param[out] output
2230
+ //! Calling thread's output items (may be aliased to `input`)
2231
+ //!
2232
+ //! @param[in] scan_op
2233
+ //! Binary scan functor
2234
+ template <int ITEMS_PER_THREAD, typename ScanOp>
2235
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
2236
+ InclusiveScan(T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], ScanOp scan_op)
2237
+ {
2238
+ if (ITEMS_PER_THREAD == 1)
2239
+ {
2240
+ InclusiveScan(input[0], output[0], scan_op);
2241
+ }
2242
+ else
2243
+ {
2244
+ // Reduce consecutive thread items in registers
2245
+ T thread_prefix = cub::ThreadReduce(input, scan_op);
2246
+
2247
+ // Exclusive thread block-scan
2248
+ ExclusiveScan(thread_prefix, thread_prefix, scan_op);
2249
+
2250
+ // Inclusive scan in registers with prefix as seed (first thread does not seed)
2251
+ detail::ThreadScanInclusive(input, output, scan_op, thread_prefix, (linear_tid != 0));
2252
+ }
2253
+ }
2254
+
2255
+ //! @rst
2256
+ //! Computes an inclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
2257
+ //! Each thread contributes an array of consecutive input elements.
2258
+ //!
2259
+ //! - Supports non-commutative scan operators.
2260
+ //! - @blocked
2261
+ //! - @granularity
2262
+ //! - @smemreuse
2263
+ //!
2264
+ //! Snippet
2265
+ //! +++++++
2266
+ //!
2267
+ //! The code snippet below illustrates an inclusive prefix max scan of 128 integer items that
2268
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 64 threads
2269
+ //! where each thread owns 2 consecutive items.
2270
+ //!
2271
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_scan_api.cu
2272
+ //! :language: c++
2273
+ //! :dedent:
2274
+ //! :start-after: example-begin inclusive-scan-array-init-value
2275
+ //! :end-before: example-end inclusive-scan-array-init-value
2276
+ //!
2277
+ //!
2278
+ //! @endrst
2279
+ //!
2280
+ //! @tparam ITEMS_PER_THREAD
2281
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
2282
+ //!
2283
+ //! @tparam ScanOp
2284
+ //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
2285
+ //!
2286
+ //! @param[in] input
2287
+ //! Calling thread's input items
2288
+ //!
2289
+ //! @param[out] output
2290
+ //! Calling thread's output items (may be aliased to `input`)
2291
+ //!
2292
+ //! @param[in] initial_value
2293
+ //! Initial value to seed the inclusive scan (uniform across block)
2294
+ //!
2295
+ //! @param[in] scan_op
2296
+ //! Binary scan functor
2297
+ template <int ITEMS_PER_THREAD, typename ScanOp>
2298
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
2299
+ InclusiveScan(T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], T initial_value, ScanOp scan_op)
2300
+ {
2301
+ // Reduce consecutive thread items in registers
2302
+ T thread_prefix = cub::ThreadReduce(input, scan_op);
2303
+
2304
+ // Exclusive thread block-scan
2305
+ ExclusiveScan(thread_prefix, thread_prefix, initial_value, scan_op);
2306
+
2307
+ // Inclusive scan in registers with prefix as seed
2308
+ detail::ThreadScanInclusive(input, output, scan_op, thread_prefix);
2309
+ }
2310
+
2311
+ //! @rst
2312
+ //! Computes an inclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
2313
+ //! Each thread contributes an array of consecutive input elements. Also provides every thread
2314
+ //! with the block-wide ``block_aggregate`` of all inputs.
2315
+ //!
2316
+ //! - Supports non-commutative scan operators.
2317
+ //! - @blocked
2318
+ //! - @granularity
2319
+ //! - @smemreuse
2320
+ //!
2321
+ //! Snippet
2322
+ //! +++++++
2323
+ //!
2324
+ //! The code snippet below illustrates an inclusive prefix max scan of 512 integer items that
2325
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128 threads
2326
+ //! where each thread owns 4 consecutive items.
2327
+ //!
2328
+ //! .. code-block:: c++
2329
+ //!
2330
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
2331
+ //!
2332
+ //! __global__ void ExampleKernel(...)
2333
+ //! {
2334
+ //! // Specialize BlockScan for a 1D block of 128 threads of type int
2335
+ //! using BlockScan = cub::BlockScan<int, 128>;
2336
+ //!
2337
+ //! // Allocate shared memory for BlockScan
2338
+ //! __shared__ typename BlockScan::TempStorage temp_storage;
2339
+ //!
2340
+ //! // Obtain a segment of consecutive items that are blocked across threads
2341
+ //! int thread_data[4];
2342
+ //! ...
2343
+ //!
2344
+ //! // Collectively compute the block-wide inclusive prefix max scan
2345
+ //! int block_aggregate;
2346
+ //! BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cuda::maximum<>{}, block_aggregate);
2347
+ //!
2348
+ //! Suppose the set of input ``thread_data`` across the block of threads is
2349
+ //! ``{ [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }``.
2350
+ //! The corresponding output ``thread_data`` in those threads will be
2351
+ //! ``{ [0,0,2,2], [4,4,6,6], ..., [508,508,510,510] }``.
2352
+ //! Furthermore the value ``510`` will be stored in ``block_aggregate`` for all threads.
2353
+ //!
2354
+ //! @endrst
2355
+ //!
2356
+ //! @tparam ITEMS_PER_THREAD
2357
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
2358
+ //!
2359
+ //! @tparam ScanOp
2360
+ //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
2361
+ //!
2362
+ //! @param[in] input
2363
+ //! Calling thread's input items
2364
+ //!
2365
+ //! @param[out] output
2366
+ //! Calling thread's output items (may be aliased to `input`)
2367
+ //!
2368
+ //! @param[in] scan_op
2369
+ //! Binary scan functor
2370
+ //!
2371
+ //! @param[out] block_aggregate
2372
+ //! Block-wide aggregate reduction of input items
2373
+ template <int ITEMS_PER_THREAD, typename ScanOp>
2374
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
2375
+ InclusiveScan(T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], ScanOp scan_op, T& block_aggregate)
2376
+ {
2377
+ if (ITEMS_PER_THREAD == 1)
2378
+ {
2379
+ InclusiveScan(input[0], output[0], scan_op, block_aggregate);
2380
+ }
2381
+ else
2382
+ {
2383
+ // Reduce consecutive thread items in registers
2384
+ T thread_prefix = cub::ThreadReduce(input, scan_op);
2385
+
2386
+ // Exclusive thread block-scan (with no initial value)
2387
+ ExclusiveScan(thread_prefix, thread_prefix, scan_op, block_aggregate);
2388
+
2389
+ // Inclusive scan in registers with prefix as seed (first thread does not seed)
2390
+ detail::ThreadScanInclusive(input, output, scan_op, thread_prefix, (linear_tid != 0));
2391
+ }
2392
+ }
2393
+
2394
+ //! @rst
2395
+ //! Computes an inclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
2396
+ //! Each thread contributes an array of consecutive input elements. Also provides every thread
2397
+ //! with the block-wide ``block_aggregate`` of all inputs.
2398
+ //!
2399
+ //! - Supports non-commutative scan operators.
2400
+ //! - @blocked
2401
+ //! - @granularity
2402
+ //! - @smemreuse
2403
+ //!
2404
+ //! Snippet
2405
+ //! +++++++
2406
+ //!
2407
+ //! The code snippet below illustrates an inclusive prefix max scan of 128 integer items that
2408
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 64 threads
2409
+ //! where each thread owns 2 consecutive items.
2410
+ //!
2411
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_scan_api.cu
2412
+ //! :language: c++
2413
+ //! :dedent:
2414
+ //! :start-after: example-begin inclusive-scan-array-aggregate-init-value
2415
+ //! :end-before: example-end inclusive-scan-array-aggregate-init-value
2416
+ //!
2417
+ //! The value ``126`` will be stored in ``block_aggregate`` for all threads.
2418
+ //!
2419
+ //! .. note::
2420
+ //!
2421
+ //! ``initial_value`` is not applied to the block-wide aggregate.
2422
+ //!
2423
+ //! @endrst
2424
+ //!
2425
+ //! @tparam ITEMS_PER_THREAD
2426
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
2427
+ //!
2428
+ //! @tparam ScanOp
2429
+ //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
2430
+ //!
2431
+ //! @param[in] input
2432
+ //! Calling thread's input items
2433
+ //!
2434
+ //! @param[out] output
2435
+ //! Calling thread's output items (may be aliased to `input`)
2436
+ //!
2437
+ //! @param[in] initial_value
2438
+ //! Initial value to seed the inclusive scan (uniform across block). It is not taken
2439
+ //! into account for ``block_aggregate``.
2440
+ //!
2441
+ //! @param[in] scan_op
2442
+ //! Binary scan functor
2443
+ //!
2444
+ //! @param[out] block_aggregate
2445
+ //! Block-wide aggregate reduction of input items
2446
+ template <int ITEMS_PER_THREAD, typename ScanOp>
2447
+ _CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveScan(
2448
+ T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], T initial_value, ScanOp scan_op, T& block_aggregate)
2449
+ {
2450
+ // Reduce consecutive thread items in registers
2451
+ T thread_prefix = cub::ThreadReduce(input, scan_op);
2452
+
2453
+ // Exclusive thread block-scan
2454
+ ExclusiveScan(thread_prefix, thread_prefix, initial_value, scan_op, block_aggregate);
2455
+
2456
+ // Inclusive scan in registers with prefix as seed
2457
+ detail::ThreadScanInclusive(input, output, scan_op, thread_prefix);
2458
+ }
2459
+
2460
+ //! @rst
2461
+ //! Computes an inclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
2462
+ //! Each thread contributes an array of consecutive input elements.
2463
+ //! The call-back functor ``block_prefix_callback_op`` is invoked by the first warp in the block,
2464
+ //! and the value returned by *lane*\ :sub:`0` in that warp is used as the "seed" value that logically prefixes the
2465
+ //! thread block's scan inputs. Also provides every thread with the block-wide ``block_aggregate`` of all inputs.
2466
+ //!
2467
+ //! - The ``block_prefix_callback_op`` functor must implement a member function ``T operator()(T block_aggregate)``.
2468
+ //! The functor's input parameter ``block_aggregate`` is the same value also returned by the scan operation.
2469
+ //! The functor will be invoked by the first warp of threads in the block, however only the return value
2470
+ //! from *lane*\ :sub:`0` is applied as the block-wide prefix. Can be stateful.
2471
+ //! - Supports non-commutative scan operators.
2472
+ //! - @blocked
2473
+ //! - @granularity
2474
+ //! - @smemreuse
2475
+ //!
2476
+ //! Snippet
2477
+ //! +++++++
2478
+ //!
2479
+ //! The code snippet below illustrates a single thread block that progressively
2480
+ //! computes an inclusive prefix max scan over multiple "tiles" of input using a
2481
+ //! prefix functor to maintain a running total between block-wide scans. Each tile consists
2482
+ //! of 512 integer items that are partitioned in a blocked arrangement across 128 threads (4 items per thread).
2483
+ //!
2484
+ //! .. code-block:: c++
2485
+ //!
2486
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
2487
+ //!
2488
+ //! // A stateful callback functor that maintains a running prefix to be applied
2489
+ //! // during consecutive scan operations.
2490
+ //! struct BlockPrefixCallbackOp
2491
+ //! {
2492
+ //! // Running prefix
2493
+ //! int running_total;
2494
+ //!
2495
+ //! // Constructor
2496
+ //! __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {}
2497
+ //!
2498
+ //! // Callback operator to be entered by the first warp of threads in the block.
2499
+ //! // Thread-0 is responsible for returning a value for seeding the block-wide scan.
2500
+ //! __device__ int operator()(int block_aggregate)
2501
+ //! {
2502
+ //! int old_prefix = running_total;
2503
+ //! running_total = (block_aggregate > old_prefix) ? block_aggregate : old_prefix;
2504
+ //! return old_prefix;
2505
+ //! }
2506
+ //! };
2507
+ //!
2508
+ //! __global__ void ExampleKernel(int *d_data, int num_items, ...)
2509
+ //! {
2510
+ //! // Specialize BlockLoad, BlockStore, and BlockScan for a 1D block of 128 threads, 4 ints per thread
2511
+ //! using BlockLoad = cub::BlockLoad<int, 128, 4, cub::BLOCK_LOAD_TRANSPOSE>;
2512
+ //! using BlockStore = cub::BlockStore<int, 128, 4, cub::BLOCK_STORE_TRANSPOSE>;
2513
+ //! using BlockScan = cub::BlockScan<int, 128> ;
2514
+ //!
2515
+ //! // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan
2516
+ //! __shared__ union {
2517
+ //! typename BlockLoad::TempStorage load;
2518
+ //! typename BlockScan::TempStorage scan;
2519
+ //! typename BlockStore::TempStorage store;
2520
+ //! } temp_storage;
2521
+ //!
2522
+ //! // Initialize running total
2523
+ //! BlockPrefixCallbackOp prefix_op(INT_MIN);
2524
+ //!
2525
+ //! // Have the block iterate over segments of items
2526
+ //! for (int block_offset = 0; block_offset < num_items; block_offset += 128 * 4)
2527
+ //! {
2528
+ //! // Load a segment of consecutive items that are blocked across threads
2529
+ //! int thread_data[4];
2530
+ //! BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data);
2531
+ //! __syncthreads();
2532
+ //!
2533
+ //! // Collectively compute the block-wide inclusive prefix max scan
2534
+ //! BlockScan(temp_storage.scan).InclusiveScan(
2535
+ //! thread_data, thread_data, cuda::maximum<>{}, prefix_op);
2536
+ //! __syncthreads();
2537
+ //!
2538
+ //! // Store scanned items to output segment
2539
+ //! BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data);
2540
+ //! __syncthreads();
2541
+ //! }
2542
+ //!
2543
+ //! Suppose the input ``d_data`` is ``0, -1, 2, -3, 4, -5, ...``.
2544
+ //! The corresponding output for the first segment will be
2545
+ //! ``0, 0, 2, 2, 4, 4, ..., 510, 510``. The output for the second
2546
+ //! segment will be ``512, 512, 514, 514, 516, 516, ..., 1022, 1022``.
2547
+ //!
2548
+ //! @endrst
2549
+ //!
2550
+ //! @tparam ITEMS_PER_THREAD
2551
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
2552
+ //!
2553
+ //! @tparam ScanOp
2554
+ //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
2555
+ //!
2556
+ //! @tparam BlockPrefixCallbackOp
2557
+ //! **[inferred]** Call-back functor type having member `T operator()(T block_aggregate)`
2558
+ //!
2559
+ //! @param[in] input
2560
+ //! Calling thread's input items
2561
+ //!
2562
+ //! @param[out] output
2563
+ //! Calling thread's output items (may be aliased to `input`)
2564
+ //!
2565
+ //! @param[in] scan_op
2566
+ //! Binary scan functor
2567
+ //!
2568
+ //! @param[in,out] block_prefix_callback_op
2569
+ //! @rst
2570
+ //! *warp*\ :sub:`0` only call-back functor for specifying a block-wide prefix to be applied to
2571
+ //! the logical input sequence.
2572
+ //! @endrst
2573
+ template <int ITEMS_PER_THREAD, typename ScanOp, typename BlockPrefixCallbackOp>
2574
+ _CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveScan(
2575
+ T (&input)[ITEMS_PER_THREAD],
2576
+ T (&output)[ITEMS_PER_THREAD],
2577
+ ScanOp scan_op,
2578
+ BlockPrefixCallbackOp& block_prefix_callback_op)
2579
+ {
2580
+ if (ITEMS_PER_THREAD == 1)
2581
+ {
2582
+ InclusiveScan(input[0], output[0], scan_op, block_prefix_callback_op);
2583
+ }
2584
+ else
2585
+ {
2586
+ // Reduce consecutive thread items in registers
2587
+ T thread_prefix = cub::ThreadReduce(input, scan_op);
2588
+
2589
+ // Exclusive thread block-scan
2590
+ ExclusiveScan(thread_prefix, thread_prefix, scan_op, block_prefix_callback_op);
2591
+
2592
+ // Inclusive scan in registers with prefix as seed
2593
+ detail::ThreadScanInclusive(input, output, scan_op, thread_prefix);
2594
+ }
2595
+ }
2596
+
2597
+ //! @} end member group
2598
+ };
2599
+
2600
+ CUB_NAMESPACE_END