cuda-cccl 0.3.3__cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cuda-cccl might be problematic. Click here for more details.

Files changed (1968):
  1. cuda/cccl/__init__.py +27 -0
  2. cuda/cccl/_cuda_version_utils.py +24 -0
  3. cuda/cccl/cooperative/__init__.py +9 -0
  4. cuda/cccl/cooperative/experimental/__init__.py +24 -0
  5. cuda/cccl/headers/__init__.py +7 -0
  6. cuda/cccl/headers/include/__init__.py +1 -0
  7. cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +259 -0
  8. cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +1182 -0
  9. cuda/cccl/headers/include/cub/agent/agent_for.cuh +81 -0
  10. cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +709 -0
  11. cuda/cccl/headers/include/cub/agent/agent_merge.cuh +234 -0
  12. cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +748 -0
  13. cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +786 -0
  14. cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +286 -0
  15. cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +703 -0
  16. cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +555 -0
  17. cuda/cccl/headers/include/cub/agent/agent_reduce.cuh +619 -0
  18. cuda/cccl/headers/include/cub/agent/agent_reduce_by_key.cuh +806 -0
  19. cuda/cccl/headers/include/cub/agent/agent_rle.cuh +1124 -0
  20. cuda/cccl/headers/include/cub/agent/agent_scan.cuh +589 -0
  21. cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +474 -0
  22. cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +289 -0
  23. cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +1117 -0
  24. cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +346 -0
  25. cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +606 -0
  26. cuda/cccl/headers/include/cub/agent/agent_topk.cuh +764 -0
  27. cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +631 -0
  28. cuda/cccl/headers/include/cub/agent/single_pass_scan_operators.cuh +1424 -0
  29. cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +963 -0
  30. cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +1227 -0
  31. cuda/cccl/headers/include/cub/block/block_exchange.cuh +1313 -0
  32. cuda/cccl/headers/include/cub/block/block_histogram.cuh +424 -0
  33. cuda/cccl/headers/include/cub/block/block_load.cuh +1264 -0
  34. cuda/cccl/headers/include/cub/block/block_load_to_shared.cuh +432 -0
  35. cuda/cccl/headers/include/cub/block/block_merge_sort.cuh +800 -0
  36. cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +1225 -0
  37. cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +2196 -0
  38. cuda/cccl/headers/include/cub/block/block_raking_layout.cuh +150 -0
  39. cuda/cccl/headers/include/cub/block/block_reduce.cuh +667 -0
  40. cuda/cccl/headers/include/cub/block/block_run_length_decode.cuh +434 -0
  41. cuda/cccl/headers/include/cub/block/block_scan.cuh +2315 -0
  42. cuda/cccl/headers/include/cub/block/block_shuffle.cuh +346 -0
  43. cuda/cccl/headers/include/cub/block/block_store.cuh +1247 -0
  44. cuda/cccl/headers/include/cub/block/radix_rank_sort_operations.cuh +624 -0
  45. cuda/cccl/headers/include/cub/block/specializations/block_histogram_atomic.cuh +86 -0
  46. cuda/cccl/headers/include/cub/block/specializations/block_histogram_sort.cuh +240 -0
  47. cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking.cuh +252 -0
  48. cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking_commutative_only.cuh +238 -0
  49. cuda/cccl/headers/include/cub/block/specializations/block_reduce_warp_reductions.cuh +281 -0
  50. cuda/cccl/headers/include/cub/block/specializations/block_scan_raking.cuh +790 -0
  51. cuda/cccl/headers/include/cub/block/specializations/block_scan_warp_scans.cuh +538 -0
  52. cuda/cccl/headers/include/cub/config.cuh +53 -0
  53. cuda/cccl/headers/include/cub/cub.cuh +120 -0
  54. cuda/cccl/headers/include/cub/detail/array_utils.cuh +78 -0
  55. cuda/cccl/headers/include/cub/detail/choose_offset.cuh +161 -0
  56. cuda/cccl/headers/include/cub/detail/detect_cuda_runtime.cuh +74 -0
  57. cuda/cccl/headers/include/cub/detail/device_double_buffer.cuh +96 -0
  58. cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +62 -0
  59. cuda/cccl/headers/include/cub/detail/fast_modulo_division.cuh +253 -0
  60. cuda/cccl/headers/include/cub/detail/integer_utils.cuh +88 -0
  61. cuda/cccl/headers/include/cub/detail/launcher/cuda_driver.cuh +142 -0
  62. cuda/cccl/headers/include/cub/detail/launcher/cuda_runtime.cuh +100 -0
  63. cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +114 -0
  64. cuda/cccl/headers/include/cub/detail/ptx-json/README.md +71 -0
  65. cuda/cccl/headers/include/cub/detail/ptx-json/array.h +68 -0
  66. cuda/cccl/headers/include/cub/detail/ptx-json/json.h +62 -0
  67. cuda/cccl/headers/include/cub/detail/ptx-json/object.h +100 -0
  68. cuda/cccl/headers/include/cub/detail/ptx-json/string.h +53 -0
  69. cuda/cccl/headers/include/cub/detail/ptx-json/value.h +95 -0
  70. cuda/cccl/headers/include/cub/detail/ptx-json-parser.h +63 -0
  71. cuda/cccl/headers/include/cub/detail/rfa.cuh +731 -0
  72. cuda/cccl/headers/include/cub/detail/strong_load.cuh +189 -0
  73. cuda/cccl/headers/include/cub/detail/strong_store.cuh +220 -0
  74. cuda/cccl/headers/include/cub/detail/temporary_storage.cuh +384 -0
  75. cuda/cccl/headers/include/cub/detail/type_traits.cuh +187 -0
  76. cuda/cccl/headers/include/cub/detail/uninitialized_copy.cuh +73 -0
  77. cuda/cccl/headers/include/cub/detail/unsafe_bitcast.cuh +56 -0
  78. cuda/cccl/headers/include/cub/device/device_adjacent_difference.cuh +596 -0
  79. cuda/cccl/headers/include/cub/device/device_copy.cuh +276 -0
  80. cuda/cccl/headers/include/cub/device/device_for.cuh +1063 -0
  81. cuda/cccl/headers/include/cub/device/device_histogram.cuh +1509 -0
  82. cuda/cccl/headers/include/cub/device/device_memcpy.cuh +195 -0
  83. cuda/cccl/headers/include/cub/device/device_merge.cuh +203 -0
  84. cuda/cccl/headers/include/cub/device/device_merge_sort.cuh +979 -0
  85. cuda/cccl/headers/include/cub/device/device_partition.cuh +668 -0
  86. cuda/cccl/headers/include/cub/device/device_radix_sort.cuh +3437 -0
  87. cuda/cccl/headers/include/cub/device/device_reduce.cuh +2518 -0
  88. cuda/cccl/headers/include/cub/device/device_run_length_encode.cuh +370 -0
  89. cuda/cccl/headers/include/cub/device/device_scan.cuh +2212 -0
  90. cuda/cccl/headers/include/cub/device/device_segmented_radix_sort.cuh +1496 -0
  91. cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +1430 -0
  92. cuda/cccl/headers/include/cub/device/device_segmented_sort.cuh +2811 -0
  93. cuda/cccl/headers/include/cub/device/device_select.cuh +1228 -0
  94. cuda/cccl/headers/include/cub/device/device_topk.cuh +511 -0
  95. cuda/cccl/headers/include/cub/device/device_transform.cuh +668 -0
  96. cuda/cccl/headers/include/cub/device/dispatch/dispatch_adjacent_difference.cuh +315 -0
  97. cuda/cccl/headers/include/cub/device/dispatch/dispatch_batch_memcpy.cuh +719 -0
  98. cuda/cccl/headers/include/cub/device/dispatch/dispatch_common.cuh +43 -0
  99. cuda/cccl/headers/include/cub/device/dispatch/dispatch_copy_mdspan.cuh +79 -0
  100. cuda/cccl/headers/include/cub/device/dispatch/dispatch_for.cuh +198 -0
  101. cuda/cccl/headers/include/cub/device/dispatch/dispatch_histogram.cuh +1046 -0
  102. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +303 -0
  103. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge_sort.cuh +473 -0
  104. cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +1744 -0
  105. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +1310 -0
  106. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_by_key.cuh +655 -0
  107. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +531 -0
  108. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +313 -0
  109. cuda/cccl/headers/include/cub/device/dispatch/dispatch_rle.cuh +615 -0
  110. cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan.cuh +517 -0
  111. cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan_by_key.cuh +602 -0
  112. cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +975 -0
  113. cuda/cccl/headers/include/cub/device/dispatch/dispatch_select_if.cuh +842 -0
  114. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +341 -0
  115. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +440 -0
  116. cuda/cccl/headers/include/cub/device/dispatch/dispatch_three_way_partition.cuh +389 -0
  117. cuda/cccl/headers/include/cub/device/dispatch/dispatch_topk.cuh +627 -0
  118. cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +569 -0
  119. cuda/cccl/headers/include/cub/device/dispatch/dispatch_unique_by_key.cuh +545 -0
  120. cuda/cccl/headers/include/cub/device/dispatch/kernels/for_each.cuh +261 -0
  121. cuda/cccl/headers/include/cub/device/dispatch/kernels/histogram.cuh +505 -0
  122. cuda/cccl/headers/include/cub/device/dispatch/kernels/merge_sort.cuh +334 -0
  123. cuda/cccl/headers/include/cub/device/dispatch/kernels/radix_sort.cuh +803 -0
  124. cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +583 -0
  125. cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +189 -0
  126. cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +321 -0
  127. cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_sort.cuh +522 -0
  128. cuda/cccl/headers/include/cub/device/dispatch/kernels/three_way_partition.cuh +201 -0
  129. cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +1028 -0
  130. cuda/cccl/headers/include/cub/device/dispatch/kernels/unique_by_key.cuh +176 -0
  131. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +67 -0
  132. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +118 -0
  133. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +60 -0
  134. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +275 -0
  135. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +76 -0
  136. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +126 -0
  137. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +1065 -0
  138. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce.cuh +493 -0
  139. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +942 -0
  140. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +673 -0
  141. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +618 -0
  142. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +1010 -0
  143. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +398 -0
  144. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_select_if.cuh +1588 -0
  145. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +440 -0
  146. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_topk.cuh +85 -0
  147. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +481 -0
  148. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +884 -0
  149. cuda/cccl/headers/include/cub/grid/grid_even_share.cuh +227 -0
  150. cuda/cccl/headers/include/cub/grid/grid_mapping.cuh +106 -0
  151. cuda/cccl/headers/include/cub/grid/grid_queue.cuh +202 -0
  152. cuda/cccl/headers/include/cub/iterator/arg_index_input_iterator.cuh +254 -0
  153. cuda/cccl/headers/include/cub/iterator/cache_modified_input_iterator.cuh +259 -0
  154. cuda/cccl/headers/include/cub/iterator/cache_modified_output_iterator.cuh +250 -0
  155. cuda/cccl/headers/include/cub/iterator/tex_obj_input_iterator.cuh +320 -0
  156. cuda/cccl/headers/include/cub/thread/thread_load.cuh +349 -0
  157. cuda/cccl/headers/include/cub/thread/thread_operators.cuh +688 -0
  158. cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +548 -0
  159. cuda/cccl/headers/include/cub/thread/thread_scan.cuh +498 -0
  160. cuda/cccl/headers/include/cub/thread/thread_search.cuh +199 -0
  161. cuda/cccl/headers/include/cub/thread/thread_simd.cuh +458 -0
  162. cuda/cccl/headers/include/cub/thread/thread_sort.cuh +102 -0
  163. cuda/cccl/headers/include/cub/thread/thread_store.cuh +365 -0
  164. cuda/cccl/headers/include/cub/util_allocator.cuh +921 -0
  165. cuda/cccl/headers/include/cub/util_arch.cuh +167 -0
  166. cuda/cccl/headers/include/cub/util_cpp_dialect.cuh +95 -0
  167. cuda/cccl/headers/include/cub/util_debug.cuh +207 -0
  168. cuda/cccl/headers/include/cub/util_device.cuh +800 -0
  169. cuda/cccl/headers/include/cub/util_macro.cuh +97 -0
  170. cuda/cccl/headers/include/cub/util_math.cuh +118 -0
  171. cuda/cccl/headers/include/cub/util_namespace.cuh +176 -0
  172. cuda/cccl/headers/include/cub/util_policy_wrapper_t.cuh +55 -0
  173. cuda/cccl/headers/include/cub/util_ptx.cuh +513 -0
  174. cuda/cccl/headers/include/cub/util_temporary_storage.cuh +122 -0
  175. cuda/cccl/headers/include/cub/util_type.cuh +1120 -0
  176. cuda/cccl/headers/include/cub/util_vsmem.cuh +253 -0
  177. cuda/cccl/headers/include/cub/version.cuh +89 -0
  178. cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_shfl.cuh +329 -0
  179. cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_smem.cuh +177 -0
  180. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +737 -0
  181. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +408 -0
  182. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +952 -0
  183. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_smem.cuh +715 -0
  184. cuda/cccl/headers/include/cub/warp/warp_exchange.cuh +405 -0
  185. cuda/cccl/headers/include/cub/warp/warp_load.cuh +614 -0
  186. cuda/cccl/headers/include/cub/warp/warp_merge_sort.cuh +169 -0
  187. cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +829 -0
  188. cuda/cccl/headers/include/cub/warp/warp_scan.cuh +1890 -0
  189. cuda/cccl/headers/include/cub/warp/warp_store.cuh +521 -0
  190. cuda/cccl/headers/include/cub/warp/warp_utils.cuh +61 -0
  191. cuda/cccl/headers/include/cuda/__algorithm/common.h +68 -0
  192. cuda/cccl/headers/include/cuda/__algorithm/copy.h +196 -0
  193. cuda/cccl/headers/include/cuda/__algorithm/fill.h +107 -0
  194. cuda/cccl/headers/include/cuda/__annotated_ptr/access_property.h +165 -0
  195. cuda/cccl/headers/include/cuda/__annotated_ptr/access_property_encoding.h +172 -0
  196. cuda/cccl/headers/include/cuda/__annotated_ptr/annotated_ptr.h +217 -0
  197. cuda/cccl/headers/include/cuda/__annotated_ptr/annotated_ptr_base.h +100 -0
  198. cuda/cccl/headers/include/cuda/__annotated_ptr/apply_access_property.h +83 -0
  199. cuda/cccl/headers/include/cuda/__annotated_ptr/associate_access_property.h +128 -0
  200. cuda/cccl/headers/include/cuda/__annotated_ptr/createpolicy.h +210 -0
  201. cuda/cccl/headers/include/cuda/__atomic/atomic.h +145 -0
  202. cuda/cccl/headers/include/cuda/__barrier/async_contract_fulfillment.h +39 -0
  203. cuda/cccl/headers/include/cuda/__barrier/barrier.h +65 -0
  204. cuda/cccl/headers/include/cuda/__barrier/barrier_arrive_tx.h +102 -0
  205. cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +487 -0
  206. cuda/cccl/headers/include/cuda/__barrier/barrier_expect_tx.h +74 -0
  207. cuda/cccl/headers/include/cuda/__barrier/barrier_native_handle.h +45 -0
  208. cuda/cccl/headers/include/cuda/__barrier/barrier_thread_scope.h +60 -0
  209. cuda/cccl/headers/include/cuda/__bit/bit_reverse.h +171 -0
  210. cuda/cccl/headers/include/cuda/__bit/bitfield.h +122 -0
  211. cuda/cccl/headers/include/cuda/__bit/bitmask.h +90 -0
  212. cuda/cccl/headers/include/cuda/__cccl_config +37 -0
  213. cuda/cccl/headers/include/cuda/__cmath/ceil_div.h +124 -0
  214. cuda/cccl/headers/include/cuda/__cmath/fast_modulo_division.h +178 -0
  215. cuda/cccl/headers/include/cuda/__cmath/ilog.h +195 -0
  216. cuda/cccl/headers/include/cuda/__cmath/ipow.h +107 -0
  217. cuda/cccl/headers/include/cuda/__cmath/isqrt.h +80 -0
  218. cuda/cccl/headers/include/cuda/__cmath/mul_hi.h +146 -0
  219. cuda/cccl/headers/include/cuda/__cmath/neg.h +47 -0
  220. cuda/cccl/headers/include/cuda/__cmath/pow2.h +74 -0
  221. cuda/cccl/headers/include/cuda/__cmath/round_down.h +102 -0
  222. cuda/cccl/headers/include/cuda/__cmath/round_up.h +104 -0
  223. cuda/cccl/headers/include/cuda/__cmath/uabs.h +57 -0
  224. cuda/cccl/headers/include/cuda/__complex/complex.h +238 -0
  225. cuda/cccl/headers/include/cuda/__complex/get_real_imag.h +89 -0
  226. cuda/cccl/headers/include/cuda/__complex/traits.h +64 -0
  227. cuda/cccl/headers/include/cuda/__complex_ +28 -0
  228. cuda/cccl/headers/include/cuda/__device/all_devices.h +140 -0
  229. cuda/cccl/headers/include/cuda/__device/arch_id.h +176 -0
  230. cuda/cccl/headers/include/cuda/__device/arch_traits.h +537 -0
  231. cuda/cccl/headers/include/cuda/__device/attributes.h +772 -0
  232. cuda/cccl/headers/include/cuda/__device/compute_capability.h +171 -0
  233. cuda/cccl/headers/include/cuda/__device/device_ref.h +156 -0
  234. cuda/cccl/headers/include/cuda/__device/physical_device.h +172 -0
  235. cuda/cccl/headers/include/cuda/__driver/driver_api.h +835 -0
  236. cuda/cccl/headers/include/cuda/__event/event.h +171 -0
  237. cuda/cccl/headers/include/cuda/__event/event_ref.h +157 -0
  238. cuda/cccl/headers/include/cuda/__event/timed_event.h +120 -0
  239. cuda/cccl/headers/include/cuda/__execution/determinism.h +91 -0
  240. cuda/cccl/headers/include/cuda/__execution/output_ordering.h +89 -0
  241. cuda/cccl/headers/include/cuda/__execution/require.h +75 -0
  242. cuda/cccl/headers/include/cuda/__execution/tune.h +70 -0
  243. cuda/cccl/headers/include/cuda/__functional/address_stability.h +131 -0
  244. cuda/cccl/headers/include/cuda/__functional/for_each_canceled.h +321 -0
  245. cuda/cccl/headers/include/cuda/__functional/maximum.h +58 -0
  246. cuda/cccl/headers/include/cuda/__functional/minimum.h +58 -0
  247. cuda/cccl/headers/include/cuda/__functional/proclaim_return_type.h +108 -0
  248. cuda/cccl/headers/include/cuda/__fwd/barrier.h +38 -0
  249. cuda/cccl/headers/include/cuda/__fwd/barrier_native_handle.h +42 -0
  250. cuda/cccl/headers/include/cuda/__fwd/complex.h +48 -0
  251. cuda/cccl/headers/include/cuda/__fwd/devices.h +44 -0
  252. cuda/cccl/headers/include/cuda/__fwd/get_stream.h +38 -0
  253. cuda/cccl/headers/include/cuda/__fwd/pipeline.h +37 -0
  254. cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +58 -0
  255. cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +315 -0
  256. cuda/cccl/headers/include/cuda/__iterator/counting_iterator.h +483 -0
  257. cuda/cccl/headers/include/cuda/__iterator/discard_iterator.h +324 -0
  258. cuda/cccl/headers/include/cuda/__iterator/permutation_iterator.h +456 -0
  259. cuda/cccl/headers/include/cuda/__iterator/shuffle_iterator.h +334 -0
  260. cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +418 -0
  261. cuda/cccl/headers/include/cuda/__iterator/tabulate_output_iterator.h +367 -0
  262. cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +528 -0
  263. cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +527 -0
  264. cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +486 -0
  265. cuda/cccl/headers/include/cuda/__iterator/zip_common.h +148 -0
  266. cuda/cccl/headers/include/cuda/__iterator/zip_function.h +112 -0
  267. cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +557 -0
  268. cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +592 -0
  269. cuda/cccl/headers/include/cuda/__latch/latch.h +44 -0
  270. cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +533 -0
  271. cuda/cccl/headers/include/cuda/__mdspan/host_device_mdspan.h +238 -0
  272. cuda/cccl/headers/include/cuda/__mdspan/restrict_accessor.h +152 -0
  273. cuda/cccl/headers/include/cuda/__mdspan/restrict_mdspan.h +117 -0
  274. cuda/cccl/headers/include/cuda/__memcpy_async/check_preconditions.h +79 -0
  275. cuda/cccl/headers/include/cuda/__memcpy_async/completion_mechanism.h +47 -0
  276. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_bulk_shared_global.h +60 -0
  277. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_fallback.h +72 -0
  278. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_shared_global.h +148 -0
  279. cuda/cccl/headers/include/cuda/__memcpy_async/dispatch_memcpy_async.h +165 -0
  280. cuda/cccl/headers/include/cuda/__memcpy_async/is_local_smem_barrier.h +53 -0
  281. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async.h +179 -0
  282. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_barrier.h +99 -0
  283. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_tx.h +104 -0
  284. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_completion.h +170 -0
  285. cuda/cccl/headers/include/cuda/__memcpy_async/try_get_barrier_handle.h +59 -0
  286. cuda/cccl/headers/include/cuda/__memory/address_space.h +227 -0
  287. cuda/cccl/headers/include/cuda/__memory/align_down.h +56 -0
  288. cuda/cccl/headers/include/cuda/__memory/align_up.h +56 -0
  289. cuda/cccl/headers/include/cuda/__memory/aligned_size.h +61 -0
  290. cuda/cccl/headers/include/cuda/__memory/check_address.h +111 -0
  291. cuda/cccl/headers/include/cuda/__memory/discard_memory.h +64 -0
  292. cuda/cccl/headers/include/cuda/__memory/get_device_address.h +58 -0
  293. cuda/cccl/headers/include/cuda/__memory/is_aligned.h +47 -0
  294. cuda/cccl/headers/include/cuda/__memory/ptr_in_range.h +93 -0
  295. cuda/cccl/headers/include/cuda/__memory/ptr_rebind.h +75 -0
  296. cuda/cccl/headers/include/cuda/__memory_resource/get_memory_resource.h +82 -0
  297. cuda/cccl/headers/include/cuda/__memory_resource/get_property.h +153 -0
  298. cuda/cccl/headers/include/cuda/__memory_resource/properties.h +113 -0
  299. cuda/cccl/headers/include/cuda/__memory_resource/resource.h +125 -0
  300. cuda/cccl/headers/include/cuda/__memory_resource/resource_ref.h +652 -0
  301. cuda/cccl/headers/include/cuda/__numeric/add_overflow.h +306 -0
  302. cuda/cccl/headers/include/cuda/__numeric/narrow.h +108 -0
  303. cuda/cccl/headers/include/cuda/__numeric/overflow_cast.h +59 -0
  304. cuda/cccl/headers/include/cuda/__numeric/overflow_result.h +43 -0
  305. cuda/cccl/headers/include/cuda/__nvtx/nvtx.h +120 -0
  306. cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +2983 -0
  307. cuda/cccl/headers/include/cuda/__ptx/instructions/barrier_cluster.h +43 -0
  308. cuda/cccl/headers/include/cuda/__ptx/instructions/bfind.h +41 -0
  309. cuda/cccl/headers/include/cuda/__ptx/instructions/bmsk.h +41 -0
  310. cuda/cccl/headers/include/cuda/__ptx/instructions/clusterlaunchcontrol.h +41 -0
  311. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk.h +44 -0
  312. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_commit_group.h +43 -0
  313. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h +45 -0
  314. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_wait_group.h +43 -0
  315. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_mbarrier_arrive.h +42 -0
  316. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_reduce_async_bulk.h +60 -0
  317. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_reduce_async_bulk_tensor.h +43 -0
  318. cuda/cccl/headers/include/cuda/__ptx/instructions/elect_sync.h +41 -0
  319. cuda/cccl/headers/include/cuda/__ptx/instructions/exit.h +41 -0
  320. cuda/cccl/headers/include/cuda/__ptx/instructions/fence.h +49 -0
  321. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/barrier_cluster.h +115 -0
  322. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/bfind.h +190 -0
  323. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/bmsk.h +54 -0
  324. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/clusterlaunchcontrol.h +242 -0
  325. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk.h +197 -0
  326. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.h +25 -0
  327. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.h +54 -0
  328. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.h +997 -0
  329. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_gather_scatter.h +318 -0
  330. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.h +671 -0
  331. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.h +46 -0
  332. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive.h +26 -0
  333. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive_noinc.h +26 -0
  334. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.h +1470 -0
  335. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.h +132 -0
  336. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.h +132 -0
  337. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.h +601 -0
  338. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/elect_sync.h +36 -0
  339. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/exit.h +25 -0
  340. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence.h +208 -0
  341. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.h +31 -0
  342. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_alias.h +25 -0
  343. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_async.h +58 -0
  344. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_async_generic_sync_restrict.h +64 -0
  345. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.h +102 -0
  346. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_sync_restrict.h +64 -0
  347. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/get_sreg.h +949 -0
  348. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/getctarank.h +32 -0
  349. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/ld.h +5542 -0
  350. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive.h +399 -0
  351. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.h +184 -0
  352. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.h +34 -0
  353. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_expect_tx.h +102 -0
  354. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_init.h +27 -0
  355. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.h +143 -0
  356. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.h +144 -0
  357. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.h +286 -0
  358. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.h +290 -0
  359. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_ld_reduce.h +2202 -0
  360. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_red.h +1362 -0
  361. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_st.h +236 -0
  362. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/prmt.h +230 -0
  363. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/red_async.h +460 -0
  364. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/shl.h +96 -0
  365. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/shr.h +168 -0
  366. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st.h +1490 -0
  367. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st_async.h +123 -0
  368. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st_bulk.h +31 -0
  369. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_alloc.h +132 -0
  370. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_commit.h +99 -0
  371. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_cp.h +765 -0
  372. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_fence.h +58 -0
  373. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_ld.h +4927 -0
  374. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_mma.h +4291 -0
  375. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_mma_ws.h +7110 -0
  376. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_shift.h +42 -0
  377. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_st.h +5063 -0
  378. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_wait.h +56 -0
  379. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.h +71 -0
  380. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tensormap_replace.h +1030 -0
  381. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/trap.h +25 -0
  382. cuda/cccl/headers/include/cuda/__ptx/instructions/get_sreg.h +43 -0
  383. cuda/cccl/headers/include/cuda/__ptx/instructions/getctarank.h +43 -0
  384. cuda/cccl/headers/include/cuda/__ptx/instructions/ld.h +41 -0
  385. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_arrive.h +45 -0
  386. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_expect_tx.h +41 -0
  387. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_init.h +43 -0
  388. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_wait.h +46 -0
  389. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_ld_reduce.h +41 -0
  390. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_red.h +41 -0
  391. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_st.h +41 -0
  392. cuda/cccl/headers/include/cuda/__ptx/instructions/prmt.h +41 -0
  393. cuda/cccl/headers/include/cuda/__ptx/instructions/red_async.h +43 -0
  394. cuda/cccl/headers/include/cuda/__ptx/instructions/shfl_sync.h +244 -0
  395. cuda/cccl/headers/include/cuda/__ptx/instructions/shl.h +41 -0
  396. cuda/cccl/headers/include/cuda/__ptx/instructions/shr.h +41 -0
  397. cuda/cccl/headers/include/cuda/__ptx/instructions/st.h +41 -0
  398. cuda/cccl/headers/include/cuda/__ptx/instructions/st_async.h +43 -0
  399. cuda/cccl/headers/include/cuda/__ptx/instructions/st_bulk.h +41 -0
  400. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_alloc.h +41 -0
  401. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_commit.h +41 -0
  402. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_cp.h +41 -0
  403. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_fence.h +41 -0
  404. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_ld.h +41 -0
  405. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_mma.h +41 -0
  406. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_mma_ws.h +41 -0
  407. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_shift.h +41 -0
  408. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_st.h +41 -0
  409. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_wait.h +41 -0
  410. cuda/cccl/headers/include/cuda/__ptx/instructions/tensormap_cp_fenceproxy.h +43 -0
  411. cuda/cccl/headers/include/cuda/__ptx/instructions/tensormap_replace.h +43 -0
  412. cuda/cccl/headers/include/cuda/__ptx/instructions/trap.h +41 -0
  413. cuda/cccl/headers/include/cuda/__ptx/pragmas/enable_smem_spilling.h +47 -0
  414. cuda/cccl/headers/include/cuda/__ptx/ptx_dot_variants.h +230 -0
  415. cuda/cccl/headers/include/cuda/__ptx/ptx_helper_functions.h +176 -0
  416. cuda/cccl/headers/include/cuda/__random/feistel_bijection.h +105 -0
  417. cuda/cccl/headers/include/cuda/__random/random_bijection.h +88 -0
  418. cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +99 -0
  419. cuda/cccl/headers/include/cuda/__runtime/types.h +41 -0
  420. cuda/cccl/headers/include/cuda/__semaphore/counting_semaphore.h +53 -0
  421. cuda/cccl/headers/include/cuda/__stream/get_stream.h +110 -0
  422. cuda/cccl/headers/include/cuda/__stream/stream.h +141 -0
  423. cuda/cccl/headers/include/cuda/__stream/stream_ref.h +303 -0
  424. cuda/cccl/headers/include/cuda/__type_traits/is_floating_point.h +47 -0
  425. cuda/cccl/headers/include/cuda/__type_traits/is_specialization_of.h +37 -0
  426. cuda/cccl/headers/include/cuda/__utility/__basic_any/access.h +88 -0
  427. cuda/cccl/headers/include/cuda/__utility/__basic_any/any_cast.h +83 -0
  428. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_base.h +148 -0
  429. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_from.h +96 -0
  430. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_fwd.h +128 -0
  431. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_ptr.h +304 -0
  432. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_ref.h +337 -0
  433. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_value.h +590 -0
  434. cuda/cccl/headers/include/cuda/__utility/__basic_any/conversions.h +169 -0
  435. cuda/cccl/headers/include/cuda/__utility/__basic_any/dynamic_any_cast.h +107 -0
  436. cuda/cccl/headers/include/cuda/__utility/__basic_any/interfaces.h +359 -0
  437. cuda/cccl/headers/include/cuda/__utility/__basic_any/iset.h +142 -0
  438. cuda/cccl/headers/include/cuda/__utility/__basic_any/overrides.h +64 -0
  439. cuda/cccl/headers/include/cuda/__utility/__basic_any/rtti.h +257 -0
  440. cuda/cccl/headers/include/cuda/__utility/__basic_any/semiregular.h +322 -0
  441. cuda/cccl/headers/include/cuda/__utility/__basic_any/storage.h +79 -0
  442. cuda/cccl/headers/include/cuda/__utility/__basic_any/tagged_ptr.h +58 -0
  443. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtcall.h +162 -0
  444. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_functions.h +184 -0
  445. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_ptrs.h +80 -0
  446. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_tables.h +155 -0
  447. cuda/cccl/headers/include/cuda/__utility/basic_any.h +507 -0
  448. cuda/cccl/headers/include/cuda/__utility/immovable.h +50 -0
  449. cuda/cccl/headers/include/cuda/__utility/in_range.h +65 -0
  450. cuda/cccl/headers/include/cuda/__utility/inherit.h +36 -0
  451. cuda/cccl/headers/include/cuda/__utility/no_init.h +29 -0
  452. cuda/cccl/headers/include/cuda/__utility/static_for.h +79 -0
  453. cuda/cccl/headers/include/cuda/__warp/lane_mask.h +326 -0
  454. cuda/cccl/headers/include/cuda/__warp/warp_match_all.h +65 -0
  455. cuda/cccl/headers/include/cuda/__warp/warp_shuffle.h +251 -0
  456. cuda/cccl/headers/include/cuda/access_property +26 -0
  457. cuda/cccl/headers/include/cuda/algorithm +27 -0
  458. cuda/cccl/headers/include/cuda/annotated_ptr +29 -0
  459. cuda/cccl/headers/include/cuda/atomic +27 -0
  460. cuda/cccl/headers/include/cuda/barrier +267 -0
  461. cuda/cccl/headers/include/cuda/bit +29 -0
  462. cuda/cccl/headers/include/cuda/cmath +37 -0
  463. cuda/cccl/headers/include/cuda/devices +33 -0
  464. cuda/cccl/headers/include/cuda/discard_memory +32 -0
  465. cuda/cccl/headers/include/cuda/functional +32 -0
  466. cuda/cccl/headers/include/cuda/iterator +39 -0
  467. cuda/cccl/headers/include/cuda/latch +27 -0
  468. cuda/cccl/headers/include/cuda/mdspan +28 -0
  469. cuda/cccl/headers/include/cuda/memory +35 -0
  470. cuda/cccl/headers/include/cuda/memory_resource +35 -0
  471. cuda/cccl/headers/include/cuda/numeric +29 -0
  472. cuda/cccl/headers/include/cuda/pipeline +579 -0
  473. cuda/cccl/headers/include/cuda/ptx +129 -0
  474. cuda/cccl/headers/include/cuda/semaphore +31 -0
  475. cuda/cccl/headers/include/cuda/std/__algorithm/adjacent_find.h +59 -0
  476. cuda/cccl/headers/include/cuda/std/__algorithm/all_of.h +45 -0
  477. cuda/cccl/headers/include/cuda/std/__algorithm/any_of.h +45 -0
  478. cuda/cccl/headers/include/cuda/std/__algorithm/binary_search.h +53 -0
  479. cuda/cccl/headers/include/cuda/std/__algorithm/clamp.h +48 -0
  480. cuda/cccl/headers/include/cuda/std/__algorithm/comp.h +58 -0
  481. cuda/cccl/headers/include/cuda/std/__algorithm/comp_ref_type.h +85 -0
  482. cuda/cccl/headers/include/cuda/std/__algorithm/copy.h +142 -0
  483. cuda/cccl/headers/include/cuda/std/__algorithm/copy_backward.h +80 -0
  484. cuda/cccl/headers/include/cuda/std/__algorithm/copy_if.h +47 -0
  485. cuda/cccl/headers/include/cuda/std/__algorithm/copy_n.h +73 -0
  486. cuda/cccl/headers/include/cuda/std/__algorithm/count.h +49 -0
  487. cuda/cccl/headers/include/cuda/std/__algorithm/count_if.h +49 -0
  488. cuda/cccl/headers/include/cuda/std/__algorithm/equal.h +128 -0
  489. cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +101 -0
  490. cuda/cccl/headers/include/cuda/std/__algorithm/fill.h +58 -0
  491. cuda/cccl/headers/include/cuda/std/__algorithm/fill_n.h +51 -0
  492. cuda/cccl/headers/include/cuda/std/__algorithm/find.h +62 -0
  493. cuda/cccl/headers/include/cuda/std/__algorithm/find_end.h +225 -0
  494. cuda/cccl/headers/include/cuda/std/__algorithm/find_first_of.h +73 -0
  495. cuda/cccl/headers/include/cuda/std/__algorithm/find_if.h +46 -0
  496. cuda/cccl/headers/include/cuda/std/__algorithm/find_if_not.h +46 -0
  497. cuda/cccl/headers/include/cuda/std/__algorithm/for_each.h +42 -0
  498. cuda/cccl/headers/include/cuda/std/__algorithm/for_each_n.h +48 -0
  499. cuda/cccl/headers/include/cuda/std/__algorithm/generate.h +41 -0
  500. cuda/cccl/headers/include/cuda/std/__algorithm/generate_n.h +46 -0
  501. cuda/cccl/headers/include/cuda/std/__algorithm/half_positive.h +49 -0
  502. cuda/cccl/headers/include/cuda/std/__algorithm/in_fun_result.h +55 -0
  503. cuda/cccl/headers/include/cuda/std/__algorithm/includes.h +90 -0
  504. cuda/cccl/headers/include/cuda/std/__algorithm/is_heap.h +50 -0
  505. cuda/cccl/headers/include/cuda/std/__algorithm/is_heap_until.h +83 -0
  506. cuda/cccl/headers/include/cuda/std/__algorithm/is_partitioned.h +57 -0
  507. cuda/cccl/headers/include/cuda/std/__algorithm/is_permutation.h +252 -0
  508. cuda/cccl/headers/include/cuda/std/__algorithm/is_sorted.h +49 -0
  509. cuda/cccl/headers/include/cuda/std/__algorithm/is_sorted_until.h +68 -0
  510. cuda/cccl/headers/include/cuda/std/__algorithm/iter_swap.h +82 -0
  511. cuda/cccl/headers/include/cuda/std/__algorithm/iterator_operations.h +185 -0
  512. cuda/cccl/headers/include/cuda/std/__algorithm/lexicographical_compare.h +68 -0
  513. cuda/cccl/headers/include/cuda/std/__algorithm/lower_bound.h +82 -0
  514. cuda/cccl/headers/include/cuda/std/__algorithm/make_heap.h +70 -0
  515. cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +88 -0
  516. cuda/cccl/headers/include/cuda/std/__algorithm/max.h +62 -0
  517. cuda/cccl/headers/include/cuda/std/__algorithm/max_element.h +67 -0
  518. cuda/cccl/headers/include/cuda/std/__algorithm/merge.h +89 -0
  519. cuda/cccl/headers/include/cuda/std/__algorithm/min.h +62 -0
  520. cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +87 -0
  521. cuda/cccl/headers/include/cuda/std/__algorithm/minmax.h +66 -0
  522. cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +139 -0
  523. cuda/cccl/headers/include/cuda/std/__algorithm/mismatch.h +83 -0
  524. cuda/cccl/headers/include/cuda/std/__algorithm/move.h +86 -0
  525. cuda/cccl/headers/include/cuda/std/__algorithm/move_backward.h +84 -0
  526. cuda/cccl/headers/include/cuda/std/__algorithm/next_permutation.h +88 -0
  527. cuda/cccl/headers/include/cuda/std/__algorithm/none_of.h +45 -0
  528. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort.h +102 -0
  529. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort_copy.h +122 -0
  530. cuda/cccl/headers/include/cuda/std/__algorithm/partition.h +120 -0
  531. cuda/cccl/headers/include/cuda/std/__algorithm/partition_copy.h +59 -0
  532. cuda/cccl/headers/include/cuda/std/__algorithm/partition_point.h +61 -0
  533. cuda/cccl/headers/include/cuda/std/__algorithm/pop_heap.h +93 -0
  534. cuda/cccl/headers/include/cuda/std/__algorithm/prev_permutation.h +88 -0
  535. cuda/cccl/headers/include/cuda/std/__algorithm/push_heap.h +100 -0
  536. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_for_each.h +84 -0
  537. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_for_each_n.h +68 -0
  538. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_iterator_concept.h +65 -0
  539. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_min.h +98 -0
  540. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_min_element.h +68 -0
  541. cuda/cccl/headers/include/cuda/std/__algorithm/remove.h +55 -0
  542. cuda/cccl/headers/include/cuda/std/__algorithm/remove_copy.h +47 -0
  543. cuda/cccl/headers/include/cuda/std/__algorithm/remove_copy_if.h +47 -0
  544. cuda/cccl/headers/include/cuda/std/__algorithm/remove_if.h +56 -0
  545. cuda/cccl/headers/include/cuda/std/__algorithm/replace.h +45 -0
  546. cuda/cccl/headers/include/cuda/std/__algorithm/replace_copy.h +54 -0
  547. cuda/cccl/headers/include/cuda/std/__algorithm/replace_copy_if.h +50 -0
  548. cuda/cccl/headers/include/cuda/std/__algorithm/replace_if.h +45 -0
  549. cuda/cccl/headers/include/cuda/std/__algorithm/reverse.h +81 -0
  550. cuda/cccl/headers/include/cuda/std/__algorithm/reverse_copy.h +43 -0
  551. cuda/cccl/headers/include/cuda/std/__algorithm/rotate.h +261 -0
  552. cuda/cccl/headers/include/cuda/std/__algorithm/rotate_copy.h +40 -0
  553. cuda/cccl/headers/include/cuda/std/__algorithm/search.h +185 -0
  554. cuda/cccl/headers/include/cuda/std/__algorithm/search_n.h +163 -0
  555. cuda/cccl/headers/include/cuda/std/__algorithm/set_difference.h +95 -0
  556. cuda/cccl/headers/include/cuda/std/__algorithm/set_intersection.h +122 -0
  557. cuda/cccl/headers/include/cuda/std/__algorithm/set_symmetric_difference.h +134 -0
  558. cuda/cccl/headers/include/cuda/std/__algorithm/set_union.h +128 -0
  559. cuda/cccl/headers/include/cuda/std/__algorithm/shift_left.h +84 -0
  560. cuda/cccl/headers/include/cuda/std/__algorithm/shift_right.h +144 -0
  561. cuda/cccl/headers/include/cuda/std/__algorithm/sift_down.h +139 -0
  562. cuda/cccl/headers/include/cuda/std/__algorithm/sort_heap.h +70 -0
  563. cuda/cccl/headers/include/cuda/std/__algorithm/swap_ranges.h +78 -0
  564. cuda/cccl/headers/include/cuda/std/__algorithm/transform.h +59 -0
  565. cuda/cccl/headers/include/cuda/std/__algorithm/unique.h +76 -0
  566. cuda/cccl/headers/include/cuda/std/__algorithm/unique_copy.h +155 -0
  567. cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_iter.h +95 -0
  568. cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_range.h +126 -0
  569. cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +83 -0
  570. cuda/cccl/headers/include/cuda/std/__algorithm_ +26 -0
  571. cuda/cccl/headers/include/cuda/std/__atomic/api/common.h +192 -0
  572. cuda/cccl/headers/include/cuda/std/__atomic/api/owned.h +136 -0
  573. cuda/cccl/headers/include/cuda/std/__atomic/api/reference.h +118 -0
  574. cuda/cccl/headers/include/cuda/std/__atomic/functions/common.h +58 -0
  575. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_local.h +208 -0
  576. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_derived.h +401 -0
  577. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated.h +3971 -0
  578. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h +177 -0
  579. cuda/cccl/headers/include/cuda/std/__atomic/functions/host.h +211 -0
  580. cuda/cccl/headers/include/cuda/std/__atomic/functions.h +33 -0
  581. cuda/cccl/headers/include/cuda/std/__atomic/order.h +159 -0
  582. cuda/cccl/headers/include/cuda/std/__atomic/platform/msvc_to_builtins.h +654 -0
  583. cuda/cccl/headers/include/cuda/std/__atomic/platform.h +93 -0
  584. cuda/cccl/headers/include/cuda/std/__atomic/scopes.h +105 -0
  585. cuda/cccl/headers/include/cuda/std/__atomic/types/base.h +249 -0
  586. cuda/cccl/headers/include/cuda/std/__atomic/types/common.h +104 -0
  587. cuda/cccl/headers/include/cuda/std/__atomic/types/locked.h +225 -0
  588. cuda/cccl/headers/include/cuda/std/__atomic/types/reference.h +72 -0
  589. cuda/cccl/headers/include/cuda/std/__atomic/types/small.h +228 -0
  590. cuda/cccl/headers/include/cuda/std/__atomic/types.h +52 -0
  591. cuda/cccl/headers/include/cuda/std/__atomic/wait/notify_wait.h +95 -0
  592. cuda/cccl/headers/include/cuda/std/__atomic/wait/polling.h +65 -0
  593. cuda/cccl/headers/include/cuda/std/__barrier/barrier.h +227 -0
  594. cuda/cccl/headers/include/cuda/std/__barrier/empty_completion.h +37 -0
  595. cuda/cccl/headers/include/cuda/std/__barrier/poll_tester.h +82 -0
  596. cuda/cccl/headers/include/cuda/std/__bit/bit_cast.h +76 -0
  597. cuda/cccl/headers/include/cuda/std/__bit/byteswap.h +185 -0
  598. cuda/cccl/headers/include/cuda/std/__bit/countl.h +174 -0
  599. cuda/cccl/headers/include/cuda/std/__bit/countr.h +185 -0
  600. cuda/cccl/headers/include/cuda/std/__bit/endian.h +39 -0
  601. cuda/cccl/headers/include/cuda/std/__bit/has_single_bit.h +43 -0
  602. cuda/cccl/headers/include/cuda/std/__bit/integral.h +126 -0
  603. cuda/cccl/headers/include/cuda/std/__bit/popcount.h +154 -0
  604. cuda/cccl/headers/include/cuda/std/__bit/reference.h +1272 -0
  605. cuda/cccl/headers/include/cuda/std/__bit/rotate.h +94 -0
  606. cuda/cccl/headers/include/cuda/std/__cccl/algorithm_wrapper.h +36 -0
  607. cuda/cccl/headers/include/cuda/std/__cccl/architecture.h +78 -0
  608. cuda/cccl/headers/include/cuda/std/__cccl/assert.h +161 -0
  609. cuda/cccl/headers/include/cuda/std/__cccl/attributes.h +206 -0
  610. cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +673 -0
  611. cuda/cccl/headers/include/cuda/std/__cccl/compiler.h +217 -0
  612. cuda/cccl/headers/include/cuda/std/__cccl/cuda_capabilities.h +51 -0
  613. cuda/cccl/headers/include/cuda/std/__cccl/cuda_toolkit.h +56 -0
  614. cuda/cccl/headers/include/cuda/std/__cccl/deprecated.h +88 -0
  615. cuda/cccl/headers/include/cuda/std/__cccl/diagnostic.h +131 -0
  616. cuda/cccl/headers/include/cuda/std/__cccl/dialect.h +123 -0
  617. cuda/cccl/headers/include/cuda/std/__cccl/epilogue.h +344 -0
  618. cuda/cccl/headers/include/cuda/std/__cccl/exceptions.h +91 -0
  619. cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +74 -0
  620. cuda/cccl/headers/include/cuda/std/__cccl/extended_data_types.h +160 -0
  621. cuda/cccl/headers/include/cuda/std/__cccl/host_std_lib.h +52 -0
  622. cuda/cccl/headers/include/cuda/std/__cccl/is_non_narrowing_convertible.h +73 -0
  623. cuda/cccl/headers/include/cuda/std/__cccl/memory_wrapper.h +36 -0
  624. cuda/cccl/headers/include/cuda/std/__cccl/numeric_wrapper.h +36 -0
  625. cuda/cccl/headers/include/cuda/std/__cccl/os.h +54 -0
  626. cuda/cccl/headers/include/cuda/std/__cccl/preprocessor.h +1286 -0
  627. cuda/cccl/headers/include/cuda/std/__cccl/prologue.h +281 -0
  628. cuda/cccl/headers/include/cuda/std/__cccl/ptx_isa.h +253 -0
  629. cuda/cccl/headers/include/cuda/std/__cccl/rtti.h +72 -0
  630. cuda/cccl/headers/include/cuda/std/__cccl/sequence_access.h +87 -0
  631. cuda/cccl/headers/include/cuda/std/__cccl/system_header.h +38 -0
  632. cuda/cccl/headers/include/cuda/std/__cccl/unreachable.h +31 -0
  633. cuda/cccl/headers/include/cuda/std/__cccl/version.h +26 -0
  634. cuda/cccl/headers/include/cuda/std/__cccl/visibility.h +171 -0
  635. cuda/cccl/headers/include/cuda/std/__charconv/chars_format.h +81 -0
  636. cuda/cccl/headers/include/cuda/std/__charconv/from_chars.h +154 -0
  637. cuda/cccl/headers/include/cuda/std/__charconv/from_chars_result.h +56 -0
  638. cuda/cccl/headers/include/cuda/std/__charconv/to_chars.h +148 -0
  639. cuda/cccl/headers/include/cuda/std/__charconv/to_chars_result.h +56 -0
  640. cuda/cccl/headers/include/cuda/std/__charconv_ +31 -0
  641. cuda/cccl/headers/include/cuda/std/__chrono/calendar.h +54 -0
  642. cuda/cccl/headers/include/cuda/std/__chrono/day.h +162 -0
  643. cuda/cccl/headers/include/cuda/std/__chrono/duration.h +503 -0
  644. cuda/cccl/headers/include/cuda/std/__chrono/file_clock.h +55 -0
  645. cuda/cccl/headers/include/cuda/std/__chrono/high_resolution_clock.h +46 -0
  646. cuda/cccl/headers/include/cuda/std/__chrono/month.h +187 -0
  647. cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +60 -0
  648. cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +80 -0
  649. cuda/cccl/headers/include/cuda/std/__chrono/time_point.h +259 -0
  650. cuda/cccl/headers/include/cuda/std/__chrono/year.h +186 -0
  651. cuda/cccl/headers/include/cuda/std/__cmath/abs.h +127 -0
  652. cuda/cccl/headers/include/cuda/std/__cmath/copysign.h +88 -0
  653. cuda/cccl/headers/include/cuda/std/__cmath/error_functions.h +200 -0
  654. cuda/cccl/headers/include/cuda/std/__cmath/exponential_functions.h +784 -0
  655. cuda/cccl/headers/include/cuda/std/__cmath/fdim.h +118 -0
  656. cuda/cccl/headers/include/cuda/std/__cmath/fma.h +125 -0
  657. cuda/cccl/headers/include/cuda/std/__cmath/fpclassify.h +231 -0
  658. cuda/cccl/headers/include/cuda/std/__cmath/gamma.h +205 -0
  659. cuda/cccl/headers/include/cuda/std/__cmath/hyperbolic_functions.h +286 -0
  660. cuda/cccl/headers/include/cuda/std/__cmath/hypot.h +221 -0
  661. cuda/cccl/headers/include/cuda/std/__cmath/inverse_hyperbolic_functions.h +286 -0
  662. cuda/cccl/headers/include/cuda/std/__cmath/inverse_trigonometric_functions.h +371 -0
  663. cuda/cccl/headers/include/cuda/std/__cmath/isfinite.h +167 -0
  664. cuda/cccl/headers/include/cuda/std/__cmath/isinf.h +205 -0
  665. cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +186 -0
  666. cuda/cccl/headers/include/cuda/std/__cmath/isnormal.h +138 -0
  667. cuda/cccl/headers/include/cuda/std/__cmath/lerp.h +101 -0
  668. cuda/cccl/headers/include/cuda/std/__cmath/logarithms.h +534 -0
  669. cuda/cccl/headers/include/cuda/std/__cmath/min_max.h +287 -0
  670. cuda/cccl/headers/include/cuda/std/__cmath/modulo.h +208 -0
  671. cuda/cccl/headers/include/cuda/std/__cmath/nan.h +54 -0
  672. cuda/cccl/headers/include/cuda/std/__cmath/remainder.h +206 -0
  673. cuda/cccl/headers/include/cuda/std/__cmath/roots.h +199 -0
  674. cuda/cccl/headers/include/cuda/std/__cmath/rounding_functions.h +984 -0
  675. cuda/cccl/headers/include/cuda/std/__cmath/signbit.h +56 -0
  676. cuda/cccl/headers/include/cuda/std/__cmath/traits.h +238 -0
  677. cuda/cccl/headers/include/cuda/std/__cmath/trigonometric_functions.h +328 -0
  678. cuda/cccl/headers/include/cuda/std/__complex/arg.h +84 -0
  679. cuda/cccl/headers/include/cuda/std/__complex/complex.h +669 -0
  680. cuda/cccl/headers/include/cuda/std/__complex/exponential_functions.h +411 -0
  681. cuda/cccl/headers/include/cuda/std/__complex/hyperbolic_functions.h +117 -0
  682. cuda/cccl/headers/include/cuda/std/__complex/inverse_hyperbolic_functions.h +216 -0
  683. cuda/cccl/headers/include/cuda/std/__complex/inverse_trigonometric_functions.h +131 -0
  684. cuda/cccl/headers/include/cuda/std/__complex/literals.h +86 -0
  685. cuda/cccl/headers/include/cuda/std/__complex/logarithms.h +303 -0
  686. cuda/cccl/headers/include/cuda/std/__complex/math.h +159 -0
  687. cuda/cccl/headers/include/cuda/std/__complex/nvbf16.h +323 -0
  688. cuda/cccl/headers/include/cuda/std/__complex/nvfp16.h +322 -0
  689. cuda/cccl/headers/include/cuda/std/__complex/roots.h +214 -0
  690. cuda/cccl/headers/include/cuda/std/__complex/trigonometric_functions.h +61 -0
  691. cuda/cccl/headers/include/cuda/std/__complex/tuple.h +107 -0
  692. cuda/cccl/headers/include/cuda/std/__complex/vector_support.h +130 -0
  693. cuda/cccl/headers/include/cuda/std/__concepts/arithmetic.h +56 -0
  694. cuda/cccl/headers/include/cuda/std/__concepts/assignable.h +64 -0
  695. cuda/cccl/headers/include/cuda/std/__concepts/boolean_testable.h +63 -0
  696. cuda/cccl/headers/include/cuda/std/__concepts/class_or_enum.h +45 -0
  697. cuda/cccl/headers/include/cuda/std/__concepts/common_reference_with.h +69 -0
  698. cuda/cccl/headers/include/cuda/std/__concepts/common_with.h +82 -0
  699. cuda/cccl/headers/include/cuda/std/__concepts/concept_macros.h +341 -0
  700. cuda/cccl/headers/include/cuda/std/__concepts/constructible.h +174 -0
  701. cuda/cccl/headers/include/cuda/std/__concepts/convertible_to.h +70 -0
  702. cuda/cccl/headers/include/cuda/std/__concepts/copyable.h +60 -0
  703. cuda/cccl/headers/include/cuda/std/__concepts/derived_from.h +56 -0
  704. cuda/cccl/headers/include/cuda/std/__concepts/destructible.h +76 -0
  705. cuda/cccl/headers/include/cuda/std/__concepts/different_from.h +38 -0
  706. cuda/cccl/headers/include/cuda/std/__concepts/equality_comparable.h +100 -0
  707. cuda/cccl/headers/include/cuda/std/__concepts/invocable.h +80 -0
  708. cuda/cccl/headers/include/cuda/std/__concepts/movable.h +58 -0
  709. cuda/cccl/headers/include/cuda/std/__concepts/predicate.h +54 -0
  710. cuda/cccl/headers/include/cuda/std/__concepts/regular.h +54 -0
  711. cuda/cccl/headers/include/cuda/std/__concepts/relation.h +77 -0
  712. cuda/cccl/headers/include/cuda/std/__concepts/same_as.h +39 -0
  713. cuda/cccl/headers/include/cuda/std/__concepts/semiregular.h +54 -0
  714. cuda/cccl/headers/include/cuda/std/__concepts/swappable.h +206 -0
  715. cuda/cccl/headers/include/cuda/std/__concepts/totally_ordered.h +101 -0
  716. cuda/cccl/headers/include/cuda/std/__cstddef/byte.h +113 -0
  717. cuda/cccl/headers/include/cuda/std/__cstddef/types.h +52 -0
  718. cuda/cccl/headers/include/cuda/std/__cstdlib/abs.h +57 -0
  719. cuda/cccl/headers/include/cuda/std/__cstdlib/aligned_alloc.h +66 -0
  720. cuda/cccl/headers/include/cuda/std/__cstdlib/div.h +96 -0
  721. cuda/cccl/headers/include/cuda/std/__cstdlib/malloc.h +70 -0
  722. cuda/cccl/headers/include/cuda/std/__cstring/memcpy.h +61 -0
  723. cuda/cccl/headers/include/cuda/std/__cstring/memset.h +46 -0
  724. cuda/cccl/headers/include/cuda/std/__cuda/api_wrapper.h +62 -0
  725. cuda/cccl/headers/include/cuda/std/__exception/cuda_error.h +139 -0
  726. cuda/cccl/headers/include/cuda/std/__exception/terminate.h +73 -0
  727. cuda/cccl/headers/include/cuda/std/__execution/env.h +455 -0
  728. cuda/cccl/headers/include/cuda/std/__execution/policy.h +88 -0
  729. cuda/cccl/headers/include/cuda/std/__expected/bad_expected_access.h +127 -0
  730. cuda/cccl/headers/include/cuda/std/__expected/expected.h +1941 -0
  731. cuda/cccl/headers/include/cuda/std/__expected/expected_base.h +1050 -0
  732. cuda/cccl/headers/include/cuda/std/__expected/unexpect.h +37 -0
  733. cuda/cccl/headers/include/cuda/std/__expected/unexpected.h +165 -0
  734. cuda/cccl/headers/include/cuda/std/__floating_point/arithmetic.h +56 -0
  735. cuda/cccl/headers/include/cuda/std/__floating_point/cast.h +812 -0
  736. cuda/cccl/headers/include/cuda/std/__floating_point/cccl_fp.h +125 -0
  737. cuda/cccl/headers/include/cuda/std/__floating_point/common_type.h +48 -0
  738. cuda/cccl/headers/include/cuda/std/__floating_point/constants.h +376 -0
  739. cuda/cccl/headers/include/cuda/std/__floating_point/conversion_rank_order.h +124 -0
  740. cuda/cccl/headers/include/cuda/std/__floating_point/cuda_fp_types.h +116 -0
  741. cuda/cccl/headers/include/cuda/std/__floating_point/decompose.h +69 -0
  742. cuda/cccl/headers/include/cuda/std/__floating_point/format.h +162 -0
  743. cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +40 -0
  744. cuda/cccl/headers/include/cuda/std/__floating_point/mask.h +78 -0
  745. cuda/cccl/headers/include/cuda/std/__floating_point/native_type.h +81 -0
  746. cuda/cccl/headers/include/cuda/std/__floating_point/overflow_handler.h +139 -0
  747. cuda/cccl/headers/include/cuda/std/__floating_point/properties.h +229 -0
  748. cuda/cccl/headers/include/cuda/std/__floating_point/storage.h +248 -0
  749. cuda/cccl/headers/include/cuda/std/__floating_point/traits.h +172 -0
  750. cuda/cccl/headers/include/cuda/std/__format/buffer.h +48 -0
  751. cuda/cccl/headers/include/cuda/std/__format/concepts.h +69 -0
  752. cuda/cccl/headers/include/cuda/std/__format/format_arg.h +282 -0
  753. cuda/cccl/headers/include/cuda/std/__format/format_arg_store.h +279 -0
  754. cuda/cccl/headers/include/cuda/std/__format/format_args.h +122 -0
  755. cuda/cccl/headers/include/cuda/std/__format/format_context.h +92 -0
  756. cuda/cccl/headers/include/cuda/std/__format/format_error.h +76 -0
  757. cuda/cccl/headers/include/cuda/std/__format/format_integral.h +237 -0
  758. cuda/cccl/headers/include/cuda/std/__format/format_parse_context.h +124 -0
  759. cuda/cccl/headers/include/cuda/std/__format/format_spec_parser.h +1230 -0
  760. cuda/cccl/headers/include/cuda/std/__format/formatter.h +59 -0
  761. cuda/cccl/headers/include/cuda/std/__format/formatters/bool.h +101 -0
  762. cuda/cccl/headers/include/cuda/std/__format/formatters/char.h +124 -0
  763. cuda/cccl/headers/include/cuda/std/__format/formatters/fp.h +101 -0
  764. cuda/cccl/headers/include/cuda/std/__format/formatters/int.h +174 -0
  765. cuda/cccl/headers/include/cuda/std/__format/formatters/ptr.h +104 -0
  766. cuda/cccl/headers/include/cuda/std/__format/formatters/str.h +178 -0
  767. cuda/cccl/headers/include/cuda/std/__format/output_utils.h +272 -0
  768. cuda/cccl/headers/include/cuda/std/__format/parse_arg_id.h +138 -0
  769. cuda/cccl/headers/include/cuda/std/__format_ +45 -0
  770. cuda/cccl/headers/include/cuda/std/__functional/binary_function.h +63 -0
  771. cuda/cccl/headers/include/cuda/std/__functional/binary_negate.h +65 -0
  772. cuda/cccl/headers/include/cuda/std/__functional/bind.h +334 -0
  773. cuda/cccl/headers/include/cuda/std/__functional/bind_back.h +80 -0
  774. cuda/cccl/headers/include/cuda/std/__functional/bind_front.h +73 -0
  775. cuda/cccl/headers/include/cuda/std/__functional/binder1st.h +74 -0
  776. cuda/cccl/headers/include/cuda/std/__functional/binder2nd.h +74 -0
  777. cuda/cccl/headers/include/cuda/std/__functional/compose.h +68 -0
  778. cuda/cccl/headers/include/cuda/std/__functional/default_searcher.h +75 -0
  779. cuda/cccl/headers/include/cuda/std/__functional/function.h +1275 -0
  780. cuda/cccl/headers/include/cuda/std/__functional/hash.h +649 -0
  781. cuda/cccl/headers/include/cuda/std/__functional/identity.h +57 -0
  782. cuda/cccl/headers/include/cuda/std/__functional/invoke.h +296 -0
  783. cuda/cccl/headers/include/cuda/std/__functional/is_transparent.h +41 -0
  784. cuda/cccl/headers/include/cuda/std/__functional/mem_fn.h +66 -0
  785. cuda/cccl/headers/include/cuda/std/__functional/mem_fun_ref.h +211 -0
  786. cuda/cccl/headers/include/cuda/std/__functional/not_fn.h +120 -0
  787. cuda/cccl/headers/include/cuda/std/__functional/operations.h +534 -0
  788. cuda/cccl/headers/include/cuda/std/__functional/perfect_forward.h +128 -0
  789. cuda/cccl/headers/include/cuda/std/__functional/pointer_to_binary_function.h +64 -0
  790. cuda/cccl/headers/include/cuda/std/__functional/pointer_to_unary_function.h +63 -0
  791. cuda/cccl/headers/include/cuda/std/__functional/ranges_operations.h +113 -0
  792. cuda/cccl/headers/include/cuda/std/__functional/reference_wrapper.h +113 -0
  793. cuda/cccl/headers/include/cuda/std/__functional/unary_function.h +62 -0
  794. cuda/cccl/headers/include/cuda/std/__functional/unary_negate.h +65 -0
  795. cuda/cccl/headers/include/cuda/std/__functional/unwrap_ref.h +56 -0
  796. cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +262 -0
  797. cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +53 -0
  798. cuda/cccl/headers/include/cuda/std/__fwd/array.h +42 -0
  799. cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +74 -0
  800. cuda/cccl/headers/include/cuda/std/__fwd/complex.h +75 -0
  801. cuda/cccl/headers/include/cuda/std/__fwd/expected.h +46 -0
  802. cuda/cccl/headers/include/cuda/std/__fwd/format.h +84 -0
  803. cuda/cccl/headers/include/cuda/std/__fwd/fp.h +37 -0
  804. cuda/cccl/headers/include/cuda/std/__fwd/get.h +123 -0
  805. cuda/cccl/headers/include/cuda/std/__fwd/hash.h +34 -0
  806. cuda/cccl/headers/include/cuda/std/__fwd/iterator.h +43 -0
  807. cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +122 -0
  808. cuda/cccl/headers/include/cuda/std/__fwd/memory_resource.h +37 -0
  809. cuda/cccl/headers/include/cuda/std/__fwd/optional.h +39 -0
  810. cuda/cccl/headers/include/cuda/std/__fwd/pair.h +47 -0
  811. cuda/cccl/headers/include/cuda/std/__fwd/reference_wrapper.h +34 -0
  812. cuda/cccl/headers/include/cuda/std/__fwd/span.h +45 -0
  813. cuda/cccl/headers/include/cuda/std/__fwd/string.h +112 -0
  814. cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +91 -0
  815. cuda/cccl/headers/include/cuda/std/__fwd/subrange.h +55 -0
  816. cuda/cccl/headers/include/cuda/std/__fwd/tuple.h +34 -0
  817. cuda/cccl/headers/include/cuda/std/__fwd/unexpected.h +40 -0
  818. cuda/cccl/headers/include/cuda/std/__internal/cpp_dialect.h +44 -0
  819. cuda/cccl/headers/include/cuda/std/__internal/features.h +72 -0
  820. cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +143 -0
  821. cuda/cccl/headers/include/cuda/std/__iterator/access.h +128 -0
  822. cuda/cccl/headers/include/cuda/std/__iterator/advance.h +228 -0
  823. cuda/cccl/headers/include/cuda/std/__iterator/back_insert_iterator.h +163 -0
  824. cuda/cccl/headers/include/cuda/std/__iterator/bounded_iter.h +253 -0
  825. cuda/cccl/headers/include/cuda/std/__iterator/concepts.h +645 -0
  826. cuda/cccl/headers/include/cuda/std/__iterator/counted_iterator.h +464 -0
  827. cuda/cccl/headers/include/cuda/std/__iterator/data.h +61 -0
  828. cuda/cccl/headers/include/cuda/std/__iterator/default_sentinel.h +36 -0
  829. cuda/cccl/headers/include/cuda/std/__iterator/distance.h +126 -0
  830. cuda/cccl/headers/include/cuda/std/__iterator/empty.h +53 -0
  831. cuda/cccl/headers/include/cuda/std/__iterator/erase_if_container.h +53 -0
  832. cuda/cccl/headers/include/cuda/std/__iterator/front_insert_iterator.h +99 -0
  833. cuda/cccl/headers/include/cuda/std/__iterator/incrementable_traits.h +143 -0
  834. cuda/cccl/headers/include/cuda/std/__iterator/indirectly_comparable.h +55 -0
  835. cuda/cccl/headers/include/cuda/std/__iterator/insert_iterator.h +107 -0
  836. cuda/cccl/headers/include/cuda/std/__iterator/istream_iterator.h +146 -0
  837. cuda/cccl/headers/include/cuda/std/__iterator/istreambuf_iterator.h +161 -0
  838. cuda/cccl/headers/include/cuda/std/__iterator/iter_move.h +161 -0
  839. cuda/cccl/headers/include/cuda/std/__iterator/iter_swap.h +163 -0
  840. cuda/cccl/headers/include/cuda/std/__iterator/iterator.h +44 -0
  841. cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +847 -0
  842. cuda/cccl/headers/include/cuda/std/__iterator/mergeable.h +72 -0
  843. cuda/cccl/headers/include/cuda/std/__iterator/move_iterator.h +432 -0
  844. cuda/cccl/headers/include/cuda/std/__iterator/move_sentinel.h +73 -0
  845. cuda/cccl/headers/include/cuda/std/__iterator/next.h +101 -0
  846. cuda/cccl/headers/include/cuda/std/__iterator/ostream_iterator.h +95 -0
  847. cuda/cccl/headers/include/cuda/std/__iterator/ostreambuf_iterator.h +100 -0
  848. cuda/cccl/headers/include/cuda/std/__iterator/permutable.h +54 -0
  849. cuda/cccl/headers/include/cuda/std/__iterator/prev.h +90 -0
  850. cuda/cccl/headers/include/cuda/std/__iterator/projected.h +61 -0
  851. cuda/cccl/headers/include/cuda/std/__iterator/readable_traits.h +156 -0
  852. cuda/cccl/headers/include/cuda/std/__iterator/reverse_access.h +142 -0
  853. cuda/cccl/headers/include/cuda/std/__iterator/reverse_iterator.h +371 -0
  854. cuda/cccl/headers/include/cuda/std/__iterator/size.h +69 -0
  855. cuda/cccl/headers/include/cuda/std/__iterator/sortable.h +55 -0
  856. cuda/cccl/headers/include/cuda/std/__iterator/unreachable_sentinel.h +84 -0
  857. cuda/cccl/headers/include/cuda/std/__iterator/wrap_iter.h +245 -0
  858. cuda/cccl/headers/include/cuda/std/__latch/latch.h +88 -0
  859. cuda/cccl/headers/include/cuda/std/__limits/numeric_limits.h +617 -0
  860. cuda/cccl/headers/include/cuda/std/__limits/numeric_limits_ext.h +753 -0
  861. cuda/cccl/headers/include/cuda/std/__linalg/conj_if_needed.h +78 -0
  862. cuda/cccl/headers/include/cuda/std/__linalg/conjugate_transposed.h +54 -0
  863. cuda/cccl/headers/include/cuda/std/__linalg/conjugated.h +139 -0
  864. cuda/cccl/headers/include/cuda/std/__linalg/scaled.h +132 -0
  865. cuda/cccl/headers/include/cuda/std/__linalg/transposed.h +321 -0
  866. cuda/cccl/headers/include/cuda/std/__mdspan/aligned_accessor.h +97 -0
  867. cuda/cccl/headers/include/cuda/std/__mdspan/concepts.h +139 -0
  868. cuda/cccl/headers/include/cuda/std/__mdspan/default_accessor.h +73 -0
  869. cuda/cccl/headers/include/cuda/std/__mdspan/empty_base.h +352 -0
  870. cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +759 -0
  871. cuda/cccl/headers/include/cuda/std/__mdspan/layout_left.h +314 -0
  872. cuda/cccl/headers/include/cuda/std/__mdspan/layout_right.h +307 -0
  873. cuda/cccl/headers/include/cuda/std/__mdspan/layout_stride.h +605 -0
  874. cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +512 -0
  875. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_extents.h +193 -0
  876. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_helper.h +189 -0
  877. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_mapping.h +344 -0
  878. cuda/cccl/headers/include/cuda/std/__memory/addressof.h +67 -0
  879. cuda/cccl/headers/include/cuda/std/__memory/align.h +67 -0
  880. cuda/cccl/headers/include/cuda/std/__memory/allocate_at_least.h +81 -0
  881. cuda/cccl/headers/include/cuda/std/__memory/allocation_guard.h +100 -0
  882. cuda/cccl/headers/include/cuda/std/__memory/allocator.h +320 -0
  883. cuda/cccl/headers/include/cuda/std/__memory/allocator_arg_t.h +84 -0
  884. cuda/cccl/headers/include/cuda/std/__memory/allocator_destructor.h +59 -0
  885. cuda/cccl/headers/include/cuda/std/__memory/allocator_traits.h +525 -0
  886. cuda/cccl/headers/include/cuda/std/__memory/assume_aligned.h +60 -0
  887. cuda/cccl/headers/include/cuda/std/__memory/builtin_new_allocator.h +87 -0
  888. cuda/cccl/headers/include/cuda/std/__memory/compressed_pair.h +225 -0
  889. cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +246 -0
  890. cuda/cccl/headers/include/cuda/std/__memory/destruct_n.h +91 -0
  891. cuda/cccl/headers/include/cuda/std/__memory/is_sufficiently_aligned.h +46 -0
  892. cuda/cccl/headers/include/cuda/std/__memory/pointer_traits.h +246 -0
  893. cuda/cccl/headers/include/cuda/std/__memory/runtime_assume_aligned.h +62 -0
  894. cuda/cccl/headers/include/cuda/std/__memory/temporary_buffer.h +92 -0
  895. cuda/cccl/headers/include/cuda/std/__memory/uninitialized_algorithms.h +678 -0
  896. cuda/cccl/headers/include/cuda/std/__memory/unique_ptr.h +765 -0
  897. cuda/cccl/headers/include/cuda/std/__memory/uses_allocator.h +54 -0
  898. cuda/cccl/headers/include/cuda/std/__memory/voidify.h +41 -0
  899. cuda/cccl/headers/include/cuda/std/__memory_ +34 -0
  900. cuda/cccl/headers/include/cuda/std/__new/allocate.h +126 -0
  901. cuda/cccl/headers/include/cuda/std/__new/bad_alloc.h +57 -0
  902. cuda/cccl/headers/include/cuda/std/__new/launder.h +53 -0
  903. cuda/cccl/headers/include/cuda/std/__new_ +29 -0
  904. cuda/cccl/headers/include/cuda/std/__numeric/accumulate.h +56 -0
  905. cuda/cccl/headers/include/cuda/std/__numeric/adjacent_difference.h +72 -0
  906. cuda/cccl/headers/include/cuda/std/__numeric/exclusive_scan.h +66 -0
  907. cuda/cccl/headers/include/cuda/std/__numeric/gcd_lcm.h +78 -0
  908. cuda/cccl/headers/include/cuda/std/__numeric/inclusive_scan.h +73 -0
  909. cuda/cccl/headers/include/cuda/std/__numeric/inner_product.h +62 -0
  910. cuda/cccl/headers/include/cuda/std/__numeric/iota.h +42 -0
  911. cuda/cccl/headers/include/cuda/std/__numeric/midpoint.h +97 -0
  912. cuda/cccl/headers/include/cuda/std/__numeric/partial_sum.h +69 -0
  913. cuda/cccl/headers/include/cuda/std/__numeric/reduce.h +60 -0
  914. cuda/cccl/headers/include/cuda/std/__numeric/transform_exclusive_scan.h +51 -0
  915. cuda/cccl/headers/include/cuda/std/__numeric/transform_inclusive_scan.h +65 -0
  916. cuda/cccl/headers/include/cuda/std/__numeric/transform_reduce.h +72 -0
  917. cuda/cccl/headers/include/cuda/std/__optional/bad_optional_access.h +74 -0
  918. cuda/cccl/headers/include/cuda/std/__optional/hash.h +53 -0
  919. cuda/cccl/headers/include/cuda/std/__optional/make_optional.h +61 -0
  920. cuda/cccl/headers/include/cuda/std/__optional/nullopt.h +43 -0
  921. cuda/cccl/headers/include/cuda/std/__optional/optional.h +859 -0
  922. cuda/cccl/headers/include/cuda/std/__optional/optional_base.h +433 -0
  923. cuda/cccl/headers/include/cuda/std/__optional/optional_ref.h +324 -0
  924. cuda/cccl/headers/include/cuda/std/__random/generate_canonical.h +56 -0
  925. cuda/cccl/headers/include/cuda/std/__random/is_seed_sequence.h +39 -0
  926. cuda/cccl/headers/include/cuda/std/__random/is_valid.h +106 -0
  927. cuda/cccl/headers/include/cuda/std/__random/linear_congruential_engine.h +398 -0
  928. cuda/cccl/headers/include/cuda/std/__random/uniform_int_distribution.h +335 -0
  929. cuda/cccl/headers/include/cuda/std/__random/uniform_real_distribution.h +183 -0
  930. cuda/cccl/headers/include/cuda/std/__random_ +29 -0
  931. cuda/cccl/headers/include/cuda/std/__ranges/access.h +303 -0
  932. cuda/cccl/headers/include/cuda/std/__ranges/all.h +98 -0
  933. cuda/cccl/headers/include/cuda/std/__ranges/compressed_movable_box.h +892 -0
  934. cuda/cccl/headers/include/cuda/std/__ranges/concepts.h +302 -0
  935. cuda/cccl/headers/include/cuda/std/__ranges/counted.h +90 -0
  936. cuda/cccl/headers/include/cuda/std/__ranges/dangling.h +54 -0
  937. cuda/cccl/headers/include/cuda/std/__ranges/data.h +136 -0
  938. cuda/cccl/headers/include/cuda/std/__ranges/empty.h +109 -0
  939. cuda/cccl/headers/include/cuda/std/__ranges/empty_view.h +77 -0
  940. cuda/cccl/headers/include/cuda/std/__ranges/enable_borrowed_range.h +41 -0
  941. cuda/cccl/headers/include/cuda/std/__ranges/enable_view.h +78 -0
  942. cuda/cccl/headers/include/cuda/std/__ranges/from_range.h +36 -0
  943. cuda/cccl/headers/include/cuda/std/__ranges/iota_view.h +266 -0
  944. cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +410 -0
  945. cuda/cccl/headers/include/cuda/std/__ranges/owning_view.h +162 -0
  946. cuda/cccl/headers/include/cuda/std/__ranges/range_adaptor.h +110 -0
  947. cuda/cccl/headers/include/cuda/std/__ranges/rbegin.h +175 -0
  948. cuda/cccl/headers/include/cuda/std/__ranges/ref_view.h +121 -0
  949. cuda/cccl/headers/include/cuda/std/__ranges/rend.h +182 -0
  950. cuda/cccl/headers/include/cuda/std/__ranges/repeat_view.h +345 -0
  951. cuda/cccl/headers/include/cuda/std/__ranges/single_view.h +155 -0
  952. cuda/cccl/headers/include/cuda/std/__ranges/size.h +201 -0
  953. cuda/cccl/headers/include/cuda/std/__ranges/subrange.h +513 -0
  954. cuda/cccl/headers/include/cuda/std/__ranges/take_view.h +476 -0
  955. cuda/cccl/headers/include/cuda/std/__ranges/take_while_view.h +259 -0
  956. cuda/cccl/headers/include/cuda/std/__ranges/transform_view.h +522 -0
  957. cuda/cccl/headers/include/cuda/std/__ranges/unwrap_end.h +53 -0
  958. cuda/cccl/headers/include/cuda/std/__ranges/view_interface.h +183 -0
  959. cuda/cccl/headers/include/cuda/std/__ranges/views.h +38 -0
  960. cuda/cccl/headers/include/cuda/std/__semaphore/atomic_semaphore.h +234 -0
  961. cuda/cccl/headers/include/cuda/std/__semaphore/counting_semaphore.h +51 -0
  962. cuda/cccl/headers/include/cuda/std/__string/char_traits.h +191 -0
  963. cuda/cccl/headers/include/cuda/std/__string/constexpr_c_functions.h +581 -0
  964. cuda/cccl/headers/include/cuda/std/__string/helper_functions.h +296 -0
  965. cuda/cccl/headers/include/cuda/std/__string/string_view.h +244 -0
  966. cuda/cccl/headers/include/cuda/std/__string_ +29 -0
  967. cuda/cccl/headers/include/cuda/std/__system_error/errc.h +51 -0
  968. cuda/cccl/headers/include/cuda/std/__system_error_ +26 -0
  969. cuda/cccl/headers/include/cuda/std/__thread/threading_support.h +106 -0
  970. cuda/cccl/headers/include/cuda/std/__thread/threading_support_cuda.h +47 -0
  971. cuda/cccl/headers/include/cuda/std/__thread/threading_support_external.h +41 -0
  972. cuda/cccl/headers/include/cuda/std/__thread/threading_support_pthread.h +143 -0
  973. cuda/cccl/headers/include/cuda/std/__thread/threading_support_win32.h +87 -0
  974. cuda/cccl/headers/include/cuda/std/__tuple_dir/ignore.h +51 -0
  975. cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +120 -0
  976. cuda/cccl/headers/include/cuda/std/__tuple_dir/sfinae_helpers.h +260 -0
  977. cuda/cccl/headers/include/cuda/std/__tuple_dir/structured_bindings.h +212 -0
  978. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_element.h +70 -0
  979. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_indices.h +44 -0
  980. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +84 -0
  981. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +68 -0
  982. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_size.h +79 -0
  983. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_types.h +35 -0
  984. cuda/cccl/headers/include/cuda/std/__tuple_dir/vector_types.h +290 -0
  985. cuda/cccl/headers/include/cuda/std/__type_traits/add_const.h +40 -0
  986. cuda/cccl/headers/include/cuda/std/__type_traits/add_cv.h +40 -0
  987. cuda/cccl/headers/include/cuda/std/__type_traits/add_lvalue_reference.h +62 -0
  988. cuda/cccl/headers/include/cuda/std/__type_traits/add_pointer.h +65 -0
  989. cuda/cccl/headers/include/cuda/std/__type_traits/add_rvalue_reference.h +62 -0
  990. cuda/cccl/headers/include/cuda/std/__type_traits/add_volatile.h +40 -0
  991. cuda/cccl/headers/include/cuda/std/__type_traits/aligned_storage.h +149 -0
  992. cuda/cccl/headers/include/cuda/std/__type_traits/aligned_union.h +62 -0
  993. cuda/cccl/headers/include/cuda/std/__type_traits/alignment_of.h +41 -0
  994. cuda/cccl/headers/include/cuda/std/__type_traits/always_false.h +35 -0
  995. cuda/cccl/headers/include/cuda/std/__type_traits/can_extract_key.h +68 -0
  996. cuda/cccl/headers/include/cuda/std/__type_traits/common_reference.h +262 -0
  997. cuda/cccl/headers/include/cuda/std/__type_traits/common_type.h +173 -0
  998. cuda/cccl/headers/include/cuda/std/__type_traits/conditional.h +65 -0
  999. cuda/cccl/headers/include/cuda/std/__type_traits/conjunction.h +67 -0
  1000. cuda/cccl/headers/include/cuda/std/__type_traits/copy_cv.h +50 -0
  1001. cuda/cccl/headers/include/cuda/std/__type_traits/copy_cvref.h +148 -0
  1002. cuda/cccl/headers/include/cuda/std/__type_traits/decay.h +83 -0
  1003. cuda/cccl/headers/include/cuda/std/__type_traits/dependent_type.h +35 -0
  1004. cuda/cccl/headers/include/cuda/std/__type_traits/disjunction.h +77 -0
  1005. cuda/cccl/headers/include/cuda/std/__type_traits/enable_if.h +43 -0
  1006. cuda/cccl/headers/include/cuda/std/__type_traits/extent.h +68 -0
  1007. cuda/cccl/headers/include/cuda/std/__type_traits/fold.h +47 -0
  1008. cuda/cccl/headers/include/cuda/std/__type_traits/has_unique_object_representation.h +46 -0
  1009. cuda/cccl/headers/include/cuda/std/__type_traits/has_virtual_destructor.h +42 -0
  1010. cuda/cccl/headers/include/cuda/std/__type_traits/integral_constant.h +62 -0
  1011. cuda/cccl/headers/include/cuda/std/__type_traits/is_abstract.h +42 -0
  1012. cuda/cccl/headers/include/cuda/std/__type_traits/is_aggregate.h +42 -0
  1013. cuda/cccl/headers/include/cuda/std/__type_traits/is_allocator.h +46 -0
  1014. cuda/cccl/headers/include/cuda/std/__type_traits/is_arithmetic.h +42 -0
  1015. cuda/cccl/headers/include/cuda/std/__type_traits/is_array.h +62 -0
  1016. cuda/cccl/headers/include/cuda/std/__type_traits/is_assignable.h +78 -0
  1017. cuda/cccl/headers/include/cuda/std/__type_traits/is_base_of.h +42 -0
  1018. cuda/cccl/headers/include/cuda/std/__type_traits/is_bounded_array.h +44 -0
  1019. cuda/cccl/headers/include/cuda/std/__type_traits/is_callable.h +60 -0
  1020. cuda/cccl/headers/include/cuda/std/__type_traits/is_char_like_type.h +38 -0
  1021. cuda/cccl/headers/include/cuda/std/__type_traits/is_class.h +42 -0
  1022. cuda/cccl/headers/include/cuda/std/__type_traits/is_compound.h +58 -0
  1023. cuda/cccl/headers/include/cuda/std/__type_traits/is_const.h +56 -0
  1024. cuda/cccl/headers/include/cuda/std/__type_traits/is_constant_evaluated.h +51 -0
  1025. cuda/cccl/headers/include/cuda/std/__type_traits/is_constructible.h +174 -0
  1026. cuda/cccl/headers/include/cuda/std/__type_traits/is_convertible.h +211 -0
  1027. cuda/cccl/headers/include/cuda/std/__type_traits/is_copy_assignable.h +43 -0
  1028. cuda/cccl/headers/include/cuda/std/__type_traits/is_copy_constructible.h +43 -0
  1029. cuda/cccl/headers/include/cuda/std/__type_traits/is_core_convertible.h +47 -0
  1030. cuda/cccl/headers/include/cuda/std/__type_traits/is_corresponding_member.h +42 -0
  1031. cuda/cccl/headers/include/cuda/std/__type_traits/is_default_constructible.h +40 -0
  1032. cuda/cccl/headers/include/cuda/std/__type_traits/is_destructible.h +115 -0
  1033. cuda/cccl/headers/include/cuda/std/__type_traits/is_empty.h +42 -0
  1034. cuda/cccl/headers/include/cuda/std/__type_traits/is_enum.h +42 -0
  1035. cuda/cccl/headers/include/cuda/std/__type_traits/is_execution_policy.h +81 -0
  1036. cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_arithmetic.h +38 -0
  1037. cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_floating_point.h +79 -0
  1038. cuda/cccl/headers/include/cuda/std/__type_traits/is_final.h +42 -0
  1039. cuda/cccl/headers/include/cuda/std/__type_traits/is_floating_point.h +53 -0
  1040. cuda/cccl/headers/include/cuda/std/__type_traits/is_function.h +61 -0
  1041. cuda/cccl/headers/include/cuda/std/__type_traits/is_fundamental.h +56 -0
  1042. cuda/cccl/headers/include/cuda/std/__type_traits/is_implicitly_default_constructible.h +57 -0
  1043. cuda/cccl/headers/include/cuda/std/__type_traits/is_integer.h +45 -0
  1044. cuda/cccl/headers/include/cuda/std/__type_traits/is_integral.h +123 -0
  1045. cuda/cccl/headers/include/cuda/std/__type_traits/is_layout_compatible.h +45 -0
  1046. cuda/cccl/headers/include/cuda/std/__type_traits/is_literal_type.h +42 -0
  1047. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_function_pointer.h +79 -0
  1048. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_object_pointer.h +57 -0
  1049. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_pointer.h +57 -0
  1050. cuda/cccl/headers/include/cuda/std/__type_traits/is_move_assignable.h +43 -0
  1051. cuda/cccl/headers/include/cuda/std/__type_traits/is_move_constructible.h +42 -0
  1052. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_assignable.h +70 -0
  1053. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_constructible.h +84 -0
  1054. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_convertible.h +59 -0
  1055. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_copy_assignable.h +60 -0
  1056. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_copy_constructible.h +43 -0
  1057. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_default_constructible.h +54 -0
  1058. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_destructible.h +82 -0
  1059. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_move_assignable.h +60 -0
  1060. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_move_constructible.h +42 -0
  1061. cuda/cccl/headers/include/cuda/std/__type_traits/is_null_pointer.h +43 -0
  1062. cuda/cccl/headers/include/cuda/std/__type_traits/is_object.h +57 -0
  1063. cuda/cccl/headers/include/cuda/std/__type_traits/is_one_of.h +37 -0
  1064. cuda/cccl/headers/include/cuda/std/__type_traits/is_pod.h +42 -0
  1065. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer.h +60 -0
  1066. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer_interconvertible_base_of.h +84 -0
  1067. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer_interconvertible_with_class.h +42 -0
  1068. cuda/cccl/headers/include/cuda/std/__type_traits/is_polymorphic.h +42 -0
  1069. cuda/cccl/headers/include/cuda/std/__type_traits/is_primary_template.h +121 -0
  1070. cuda/cccl/headers/include/cuda/std/__type_traits/is_reference.h +95 -0
  1071. cuda/cccl/headers/include/cuda/std/__type_traits/is_reference_wrapper.h +50 -0
  1072. cuda/cccl/headers/include/cuda/std/__type_traits/is_referenceable.h +55 -0
  1073. cuda/cccl/headers/include/cuda/std/__type_traits/is_same.h +88 -0
  1074. cuda/cccl/headers/include/cuda/std/__type_traits/is_scalar.h +60 -0
  1075. cuda/cccl/headers/include/cuda/std/__type_traits/is_scoped_enum.h +49 -0
  1076. cuda/cccl/headers/include/cuda/std/__type_traits/is_signed.h +65 -0
  1077. cuda/cccl/headers/include/cuda/std/__type_traits/is_signed_integer.h +59 -0
  1078. cuda/cccl/headers/include/cuda/std/__type_traits/is_standard_layout.h +42 -0
  1079. cuda/cccl/headers/include/cuda/std/__type_traits/is_swappable.h +202 -0
  1080. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivial.h +42 -0
  1081. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_assignable.h +43 -0
  1082. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_constructible.h +43 -0
  1083. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_assignable.h +46 -0
  1084. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_constructible.h +45 -0
  1085. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copyable.h +42 -0
  1086. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_default_constructible.h +42 -0
  1087. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_destructible.h +58 -0
  1088. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_assignable.h +45 -0
  1089. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_constructible.h +44 -0
  1090. cuda/cccl/headers/include/cuda/std/__type_traits/is_unbounded_array.h +43 -0
  1091. cuda/cccl/headers/include/cuda/std/__type_traits/is_union.h +42 -0
  1092. cuda/cccl/headers/include/cuda/std/__type_traits/is_unsigned.h +66 -0
  1093. cuda/cccl/headers/include/cuda/std/__type_traits/is_unsigned_integer.h +59 -0
  1094. cuda/cccl/headers/include/cuda/std/__type_traits/is_valid_expansion.h +41 -0
  1095. cuda/cccl/headers/include/cuda/std/__type_traits/is_void.h +55 -0
  1096. cuda/cccl/headers/include/cuda/std/__type_traits/is_volatile.h +56 -0
  1097. cuda/cccl/headers/include/cuda/std/__type_traits/lazy.h +35 -0
  1098. cuda/cccl/headers/include/cuda/std/__type_traits/make_const_lvalue_ref.h +36 -0
  1099. cuda/cccl/headers/include/cuda/std/__type_traits/make_nbit_int.h +107 -0
  1100. cuda/cccl/headers/include/cuda/std/__type_traits/make_signed.h +140 -0
  1101. cuda/cccl/headers/include/cuda/std/__type_traits/make_unsigned.h +151 -0
  1102. cuda/cccl/headers/include/cuda/std/__type_traits/maybe_const.h +36 -0
  1103. cuda/cccl/headers/include/cuda/std/__type_traits/nat.h +39 -0
  1104. cuda/cccl/headers/include/cuda/std/__type_traits/negation.h +44 -0
  1105. cuda/cccl/headers/include/cuda/std/__type_traits/num_bits.h +122 -0
  1106. cuda/cccl/headers/include/cuda/std/__type_traits/promote.h +163 -0
  1107. cuda/cccl/headers/include/cuda/std/__type_traits/rank.h +60 -0
  1108. cuda/cccl/headers/include/cuda/std/__type_traits/reference_constructs_from_temporary.h +57 -0
  1109. cuda/cccl/headers/include/cuda/std/__type_traits/reference_converts_from_temporary.h +56 -0
  1110. cuda/cccl/headers/include/cuda/std/__type_traits/remove_all_extents.h +66 -0
  1111. cuda/cccl/headers/include/cuda/std/__type_traits/remove_const.h +59 -0
  1112. cuda/cccl/headers/include/cuda/std/__type_traits/remove_const_ref.h +37 -0
  1113. cuda/cccl/headers/include/cuda/std/__type_traits/remove_cv.h +57 -0
  1114. cuda/cccl/headers/include/cuda/std/__type_traits/remove_cvref.h +57 -0
  1115. cuda/cccl/headers/include/cuda/std/__type_traits/remove_extent.h +65 -0
  1116. cuda/cccl/headers/include/cuda/std/__type_traits/remove_pointer.h +73 -0
  1117. cuda/cccl/headers/include/cuda/std/__type_traits/remove_reference.h +72 -0
  1118. cuda/cccl/headers/include/cuda/std/__type_traits/remove_volatile.h +58 -0
  1119. cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +47 -0
  1120. cuda/cccl/headers/include/cuda/std/__type_traits/type_identity.h +40 -0
  1121. cuda/cccl/headers/include/cuda/std/__type_traits/type_list.h +1067 -0
  1122. cuda/cccl/headers/include/cuda/std/__type_traits/type_set.h +131 -0
  1123. cuda/cccl/headers/include/cuda/std/__type_traits/underlying_type.h +52 -0
  1124. cuda/cccl/headers/include/cuda/std/__type_traits/void_t.h +34 -0
  1125. cuda/cccl/headers/include/cuda/std/__utility/as_const.h +52 -0
  1126. cuda/cccl/headers/include/cuda/std/__utility/auto_cast.h +34 -0
  1127. cuda/cccl/headers/include/cuda/std/__utility/cmp.h +116 -0
  1128. cuda/cccl/headers/include/cuda/std/__utility/convert_to_integral.h +101 -0
  1129. cuda/cccl/headers/include/cuda/std/__utility/declval.h +76 -0
  1130. cuda/cccl/headers/include/cuda/std/__utility/exception_guard.h +161 -0
  1131. cuda/cccl/headers/include/cuda/std/__utility/exchange.h +46 -0
  1132. cuda/cccl/headers/include/cuda/std/__utility/forward.h +59 -0
  1133. cuda/cccl/headers/include/cuda/std/__utility/forward_like.h +55 -0
  1134. cuda/cccl/headers/include/cuda/std/__utility/in_place.h +86 -0
  1135. cuda/cccl/headers/include/cuda/std/__utility/integer_sequence.h +251 -0
  1136. cuda/cccl/headers/include/cuda/std/__utility/monostate.h +99 -0
  1137. cuda/cccl/headers/include/cuda/std/__utility/move.h +74 -0
  1138. cuda/cccl/headers/include/cuda/std/__utility/pair.h +791 -0
  1139. cuda/cccl/headers/include/cuda/std/__utility/piecewise_construct.h +37 -0
  1140. cuda/cccl/headers/include/cuda/std/__utility/pod_tuple.h +527 -0
  1141. cuda/cccl/headers/include/cuda/std/__utility/priority_tag.h +40 -0
  1142. cuda/cccl/headers/include/cuda/std/__utility/rel_ops.h +63 -0
  1143. cuda/cccl/headers/include/cuda/std/__utility/swap.h +64 -0
  1144. cuda/cccl/headers/include/cuda/std/__utility/to_underlying.h +40 -0
  1145. cuda/cccl/headers/include/cuda/std/__utility/typeid.h +421 -0
  1146. cuda/cccl/headers/include/cuda/std/__utility/undefined.h +34 -0
  1147. cuda/cccl/headers/include/cuda/std/__utility/unreachable.h +37 -0
  1148. cuda/cccl/headers/include/cuda/std/array +518 -0
  1149. cuda/cccl/headers/include/cuda/std/atomic +810 -0
  1150. cuda/cccl/headers/include/cuda/std/barrier +42 -0
  1151. cuda/cccl/headers/include/cuda/std/bit +35 -0
  1152. cuda/cccl/headers/include/cuda/std/bitset +994 -0
  1153. cuda/cccl/headers/include/cuda/std/cassert +28 -0
  1154. cuda/cccl/headers/include/cuda/std/ccomplex +15 -0
  1155. cuda/cccl/headers/include/cuda/std/cfloat +59 -0
  1156. cuda/cccl/headers/include/cuda/std/chrono +26 -0
  1157. cuda/cccl/headers/include/cuda/std/climits +61 -0
  1158. cuda/cccl/headers/include/cuda/std/cmath +87 -0
  1159. cuda/cccl/headers/include/cuda/std/complex +50 -0
  1160. cuda/cccl/headers/include/cuda/std/concepts +48 -0
  1161. cuda/cccl/headers/include/cuda/std/cstddef +28 -0
  1162. cuda/cccl/headers/include/cuda/std/cstdint +178 -0
  1163. cuda/cccl/headers/include/cuda/std/cstdlib +30 -0
  1164. cuda/cccl/headers/include/cuda/std/cstring +110 -0
  1165. cuda/cccl/headers/include/cuda/std/ctime +154 -0
  1166. cuda/cccl/headers/include/cuda/std/detail/__config +45 -0
  1167. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +207 -0
  1168. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/algorithm +1721 -0
  1169. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/chrono +2509 -0
  1170. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/iosfwd +128 -0
  1171. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/stdexcept +120 -0
  1172. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/tuple +1365 -0
  1173. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/variant +2144 -0
  1174. cuda/cccl/headers/include/cuda/std/execution +29 -0
  1175. cuda/cccl/headers/include/cuda/std/expected +30 -0
  1176. cuda/cccl/headers/include/cuda/std/functional +56 -0
  1177. cuda/cccl/headers/include/cuda/std/initializer_list +44 -0
  1178. cuda/cccl/headers/include/cuda/std/inplace_vector +2170 -0
  1179. cuda/cccl/headers/include/cuda/std/iterator +70 -0
  1180. cuda/cccl/headers/include/cuda/std/latch +34 -0
  1181. cuda/cccl/headers/include/cuda/std/limits +28 -0
  1182. cuda/cccl/headers/include/cuda/std/linalg +30 -0
  1183. cuda/cccl/headers/include/cuda/std/mdspan +38 -0
  1184. cuda/cccl/headers/include/cuda/std/memory +39 -0
  1185. cuda/cccl/headers/include/cuda/std/numbers +346 -0
  1186. cuda/cccl/headers/include/cuda/std/numeric +41 -0
  1187. cuda/cccl/headers/include/cuda/std/optional +31 -0
  1188. cuda/cccl/headers/include/cuda/std/ranges +69 -0
  1189. cuda/cccl/headers/include/cuda/std/ratio +416 -0
  1190. cuda/cccl/headers/include/cuda/std/semaphore +31 -0
  1191. cuda/cccl/headers/include/cuda/std/source_location +83 -0
  1192. cuda/cccl/headers/include/cuda/std/span +628 -0
  1193. cuda/cccl/headers/include/cuda/std/string_view +925 -0
  1194. cuda/cccl/headers/include/cuda/std/tuple +26 -0
  1195. cuda/cccl/headers/include/cuda/std/type_traits +177 -0
  1196. cuda/cccl/headers/include/cuda/std/utility +70 -0
  1197. cuda/cccl/headers/include/cuda/std/variant +25 -0
  1198. cuda/cccl/headers/include/cuda/std/version +240 -0
  1199. cuda/cccl/headers/include/cuda/stream +31 -0
  1200. cuda/cccl/headers/include/cuda/stream_ref +59 -0
  1201. cuda/cccl/headers/include/cuda/type_traits +27 -0
  1202. cuda/cccl/headers/include/cuda/utility +28 -0
  1203. cuda/cccl/headers/include/cuda/version +16 -0
  1204. cuda/cccl/headers/include/cuda/warp +28 -0
  1205. cuda/cccl/headers/include/cuda/work_stealing +26 -0
  1206. cuda/cccl/headers/include/nv/detail/__preprocessor +169 -0
  1207. cuda/cccl/headers/include/nv/detail/__target_macros +718 -0
  1208. cuda/cccl/headers/include/nv/target +240 -0
  1209. cuda/cccl/headers/include/thrust/addressof.h +22 -0
  1210. cuda/cccl/headers/include/thrust/adjacent_difference.h +254 -0
  1211. cuda/cccl/headers/include/thrust/advance.h +57 -0
  1212. cuda/cccl/headers/include/thrust/allocate_unique.h +299 -0
  1213. cuda/cccl/headers/include/thrust/binary_search.h +1910 -0
  1214. cuda/cccl/headers/include/thrust/complex.h +858 -0
  1215. cuda/cccl/headers/include/thrust/copy.h +506 -0
  1216. cuda/cccl/headers/include/thrust/count.h +245 -0
  1217. cuda/cccl/headers/include/thrust/detail/adjacent_difference.inl +95 -0
  1218. cuda/cccl/headers/include/thrust/detail/alignment.h +81 -0
  1219. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +626 -0
  1220. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +192 -0
  1221. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +96 -0
  1222. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +81 -0
  1223. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +78 -0
  1224. cuda/cccl/headers/include/thrust/detail/allocator/no_throw_allocator.h +76 -0
  1225. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +115 -0
  1226. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +116 -0
  1227. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.h +77 -0
  1228. cuda/cccl/headers/include/thrust/detail/allocator_aware_execution_policy.h +99 -0
  1229. cuda/cccl/headers/include/thrust/detail/binary_search.inl +525 -0
  1230. cuda/cccl/headers/include/thrust/detail/caching_allocator.h +47 -0
  1231. cuda/cccl/headers/include/thrust/detail/complex/arithmetic.h +255 -0
  1232. cuda/cccl/headers/include/thrust/detail/complex/c99math.h +64 -0
  1233. cuda/cccl/headers/include/thrust/detail/complex/catrig.h +875 -0
  1234. cuda/cccl/headers/include/thrust/detail/complex/catrigf.h +589 -0
  1235. cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +233 -0
  1236. cuda/cccl/headers/include/thrust/detail/complex/ccoshf.h +161 -0
  1237. cuda/cccl/headers/include/thrust/detail/complex/cexp.h +195 -0
  1238. cuda/cccl/headers/include/thrust/detail/complex/cexpf.h +173 -0
  1239. cuda/cccl/headers/include/thrust/detail/complex/clog.h +223 -0
  1240. cuda/cccl/headers/include/thrust/detail/complex/clogf.h +210 -0
  1241. cuda/cccl/headers/include/thrust/detail/complex/complex.inl +263 -0
  1242. cuda/cccl/headers/include/thrust/detail/complex/cpow.h +50 -0
  1243. cuda/cccl/headers/include/thrust/detail/complex/cproj.h +81 -0
  1244. cuda/cccl/headers/include/thrust/detail/complex/csinh.h +228 -0
  1245. cuda/cccl/headers/include/thrust/detail/complex/csinhf.h +168 -0
  1246. cuda/cccl/headers/include/thrust/detail/complex/csqrt.h +178 -0
  1247. cuda/cccl/headers/include/thrust/detail/complex/csqrtf.h +174 -0
  1248. cuda/cccl/headers/include/thrust/detail/complex/ctanh.h +208 -0
  1249. cuda/cccl/headers/include/thrust/detail/complex/ctanhf.h +133 -0
  1250. cuda/cccl/headers/include/thrust/detail/complex/math_private.h +138 -0
  1251. cuda/cccl/headers/include/thrust/detail/complex/stream.h +73 -0
  1252. cuda/cccl/headers/include/thrust/detail/config/compiler.h +38 -0
  1253. cuda/cccl/headers/include/thrust/detail/config/config.h +43 -0
  1254. cuda/cccl/headers/include/thrust/detail/config/cpp_dialect.h +78 -0
  1255. cuda/cccl/headers/include/thrust/detail/config/device_system.h +55 -0
  1256. cuda/cccl/headers/include/thrust/detail/config/host_system.h +48 -0
  1257. cuda/cccl/headers/include/thrust/detail/config/memory_resource.h +41 -0
  1258. cuda/cccl/headers/include/thrust/detail/config/namespace.h +162 -0
  1259. cuda/cccl/headers/include/thrust/detail/config/simple_defines.h +48 -0
  1260. cuda/cccl/headers/include/thrust/detail/config.h +36 -0
  1261. cuda/cccl/headers/include/thrust/detail/contiguous_storage.h +228 -0
  1262. cuda/cccl/headers/include/thrust/detail/contiguous_storage.inl +273 -0
  1263. cuda/cccl/headers/include/thrust/detail/copy.h +72 -0
  1264. cuda/cccl/headers/include/thrust/detail/copy.inl +129 -0
  1265. cuda/cccl/headers/include/thrust/detail/copy_if.h +62 -0
  1266. cuda/cccl/headers/include/thrust/detail/copy_if.inl +102 -0
  1267. cuda/cccl/headers/include/thrust/detail/count.h +55 -0
  1268. cuda/cccl/headers/include/thrust/detail/count.inl +89 -0
  1269. cuda/cccl/headers/include/thrust/detail/device_ptr.inl +48 -0
  1270. cuda/cccl/headers/include/thrust/detail/equal.inl +93 -0
  1271. cuda/cccl/headers/include/thrust/detail/event_error.h +160 -0
  1272. cuda/cccl/headers/include/thrust/detail/execute_with_allocator.h +81 -0
  1273. cuda/cccl/headers/include/thrust/detail/execute_with_allocator_fwd.h +61 -0
  1274. cuda/cccl/headers/include/thrust/detail/execution_policy.h +120 -0
  1275. cuda/cccl/headers/include/thrust/detail/extrema.inl +184 -0
  1276. cuda/cccl/headers/include/thrust/detail/fill.inl +86 -0
  1277. cuda/cccl/headers/include/thrust/detail/find.inl +113 -0
  1278. cuda/cccl/headers/include/thrust/detail/for_each.inl +84 -0
  1279. cuda/cccl/headers/include/thrust/detail/function.h +49 -0
  1280. cuda/cccl/headers/include/thrust/detail/functional/actor.h +214 -0
  1281. cuda/cccl/headers/include/thrust/detail/functional/operators.h +386 -0
  1282. cuda/cccl/headers/include/thrust/detail/gather.inl +173 -0
  1283. cuda/cccl/headers/include/thrust/detail/generate.inl +86 -0
  1284. cuda/cccl/headers/include/thrust/detail/get_iterator_value.h +62 -0
  1285. cuda/cccl/headers/include/thrust/detail/inner_product.inl +118 -0
  1286. cuda/cccl/headers/include/thrust/detail/internal_functional.h +328 -0
  1287. cuda/cccl/headers/include/thrust/detail/logical.inl +113 -0
  1288. cuda/cccl/headers/include/thrust/detail/malloc_and_free.h +77 -0
  1289. cuda/cccl/headers/include/thrust/detail/malloc_and_free_fwd.h +45 -0
  1290. cuda/cccl/headers/include/thrust/detail/memory_algorithms.h +209 -0
  1291. cuda/cccl/headers/include/thrust/detail/merge.inl +276 -0
  1292. cuda/cccl/headers/include/thrust/detail/mismatch.inl +94 -0
  1293. cuda/cccl/headers/include/thrust/detail/overlapped_copy.h +124 -0
  1294. cuda/cccl/headers/include/thrust/detail/partition.inl +378 -0
  1295. cuda/cccl/headers/include/thrust/detail/pointer.h +309 -0
  1296. cuda/cccl/headers/include/thrust/detail/preprocessor.h +652 -0
  1297. cuda/cccl/headers/include/thrust/detail/random_bijection.h +177 -0
  1298. cuda/cccl/headers/include/thrust/detail/range/head_flags.h +116 -0
  1299. cuda/cccl/headers/include/thrust/detail/range/tail_flags.h +130 -0
  1300. cuda/cccl/headers/include/thrust/detail/raw_pointer_cast.h +52 -0
  1301. cuda/cccl/headers/include/thrust/detail/raw_reference_cast.h +192 -0
  1302. cuda/cccl/headers/include/thrust/detail/reduce.inl +377 -0
  1303. cuda/cccl/headers/include/thrust/detail/reference.h +494 -0
  1304. cuda/cccl/headers/include/thrust/detail/reference_forward_declaration.h +35 -0
  1305. cuda/cccl/headers/include/thrust/detail/remove.inl +213 -0
  1306. cuda/cccl/headers/include/thrust/detail/replace.inl +231 -0
  1307. cuda/cccl/headers/include/thrust/detail/reverse.inl +88 -0
  1308. cuda/cccl/headers/include/thrust/detail/scan.inl +518 -0
  1309. cuda/cccl/headers/include/thrust/detail/scatter.inl +157 -0
  1310. cuda/cccl/headers/include/thrust/detail/seq.h +66 -0
  1311. cuda/cccl/headers/include/thrust/detail/sequence.inl +109 -0
  1312. cuda/cccl/headers/include/thrust/detail/set_operations.inl +981 -0
  1313. cuda/cccl/headers/include/thrust/detail/shuffle.inl +86 -0
  1314. cuda/cccl/headers/include/thrust/detail/sort.inl +373 -0
  1315. cuda/cccl/headers/include/thrust/detail/static_assert.h +58 -0
  1316. cuda/cccl/headers/include/thrust/detail/static_map.h +167 -0
  1317. cuda/cccl/headers/include/thrust/detail/swap_ranges.inl +65 -0
  1318. cuda/cccl/headers/include/thrust/detail/tabulate.inl +62 -0
  1319. cuda/cccl/headers/include/thrust/detail/temporary_array.h +153 -0
  1320. cuda/cccl/headers/include/thrust/detail/temporary_array.inl +120 -0
  1321. cuda/cccl/headers/include/thrust/detail/temporary_buffer.h +81 -0
  1322. cuda/cccl/headers/include/thrust/detail/transform_reduce.inl +69 -0
  1323. cuda/cccl/headers/include/thrust/detail/transform_scan.inl +161 -0
  1324. cuda/cccl/headers/include/thrust/detail/trivial_sequence.h +130 -0
  1325. cuda/cccl/headers/include/thrust/detail/tuple_meta_transform.h +61 -0
  1326. cuda/cccl/headers/include/thrust/detail/type_deduction.h +62 -0
  1327. cuda/cccl/headers/include/thrust/detail/type_traits/has_member_function.h +47 -0
  1328. cuda/cccl/headers/include/thrust/detail/type_traits/has_nested_type.h +43 -0
  1329. cuda/cccl/headers/include/thrust/detail/type_traits/is_call_possible.h +167 -0
  1330. cuda/cccl/headers/include/thrust/detail/type_traits/is_commutative.h +69 -0
  1331. cuda/cccl/headers/include/thrust/detail/type_traits/is_metafunction_defined.h +39 -0
  1332. cuda/cccl/headers/include/thrust/detail/type_traits/is_thrust_pointer.h +59 -0
  1333. cuda/cccl/headers/include/thrust/detail/type_traits/iterator/is_output_iterator.h +46 -0
  1334. cuda/cccl/headers/include/thrust/detail/type_traits/minimum_type.h +89 -0
  1335. cuda/cccl/headers/include/thrust/detail/type_traits/pointer_traits.h +332 -0
  1336. cuda/cccl/headers/include/thrust/detail/type_traits.h +136 -0
  1337. cuda/cccl/headers/include/thrust/detail/uninitialized_copy.inl +90 -0
  1338. cuda/cccl/headers/include/thrust/detail/uninitialized_fill.inl +86 -0
  1339. cuda/cccl/headers/include/thrust/detail/unique.inl +373 -0
  1340. cuda/cccl/headers/include/thrust/detail/use_default.h +34 -0
  1341. cuda/cccl/headers/include/thrust/detail/vector_base.h +613 -0
  1342. cuda/cccl/headers/include/thrust/detail/vector_base.inl +1210 -0
  1343. cuda/cccl/headers/include/thrust/device_allocator.h +134 -0
  1344. cuda/cccl/headers/include/thrust/device_delete.h +74 -0
  1345. cuda/cccl/headers/include/thrust/device_free.h +85 -0
  1346. cuda/cccl/headers/include/thrust/device_make_unique.h +56 -0
  1347. cuda/cccl/headers/include/thrust/device_malloc.h +84 -0
  1348. cuda/cccl/headers/include/thrust/device_malloc_allocator.h +190 -0
  1349. cuda/cccl/headers/include/thrust/device_new.h +112 -0
  1350. cuda/cccl/headers/include/thrust/device_new_allocator.h +179 -0
  1351. cuda/cccl/headers/include/thrust/device_ptr.h +196 -0
  1352. cuda/cccl/headers/include/thrust/device_reference.h +983 -0
  1353. cuda/cccl/headers/include/thrust/device_vector.h +576 -0
  1354. cuda/cccl/headers/include/thrust/distance.h +43 -0
  1355. cuda/cccl/headers/include/thrust/equal.h +247 -0
  1356. cuda/cccl/headers/include/thrust/execution_policy.h +251 -0
  1357. cuda/cccl/headers/include/thrust/extrema.h +657 -0
  1358. cuda/cccl/headers/include/thrust/fill.h +200 -0
  1359. cuda/cccl/headers/include/thrust/find.h +382 -0
  1360. cuda/cccl/headers/include/thrust/for_each.h +261 -0
  1361. cuda/cccl/headers/include/thrust/functional.h +395 -0
  1362. cuda/cccl/headers/include/thrust/gather.h +464 -0
  1363. cuda/cccl/headers/include/thrust/generate.h +193 -0
  1364. cuda/cccl/headers/include/thrust/host_vector.h +576 -0
  1365. cuda/cccl/headers/include/thrust/inner_product.h +264 -0
  1366. cuda/cccl/headers/include/thrust/iterator/constant_iterator.h +221 -0
  1367. cuda/cccl/headers/include/thrust/iterator/counting_iterator.h +335 -0
  1368. cuda/cccl/headers/include/thrust/iterator/detail/any_assign.h +48 -0
  1369. cuda/cccl/headers/include/thrust/iterator/detail/any_system_tag.h +43 -0
  1370. cuda/cccl/headers/include/thrust/iterator/detail/device_system_tag.h +38 -0
  1371. cuda/cccl/headers/include/thrust/iterator/detail/host_system_tag.h +38 -0
  1372. cuda/cccl/headers/include/thrust/iterator/detail/iterator_adaptor_base.h +81 -0
  1373. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_system.h +60 -0
  1374. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_traversal.h +65 -0
  1375. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h +57 -0
  1376. cuda/cccl/headers/include/thrust/iterator/detail/iterator_facade_category.h +182 -0
  1377. cuda/cccl/headers/include/thrust/iterator/detail/minimum_system.h +58 -0
  1378. cuda/cccl/headers/include/thrust/iterator/detail/normal_iterator.h +69 -0
  1379. cuda/cccl/headers/include/thrust/iterator/detail/retag.h +104 -0
  1380. cuda/cccl/headers/include/thrust/iterator/detail/tagged_iterator.h +81 -0
  1381. cuda/cccl/headers/include/thrust/iterator/detail/tuple_of_iterator_references.h +174 -0
  1382. cuda/cccl/headers/include/thrust/iterator/discard_iterator.h +163 -0
  1383. cuda/cccl/headers/include/thrust/iterator/iterator_adaptor.h +251 -0
  1384. cuda/cccl/headers/include/thrust/iterator/iterator_categories.h +211 -0
  1385. cuda/cccl/headers/include/thrust/iterator/iterator_facade.h +659 -0
  1386. cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +334 -0
  1387. cuda/cccl/headers/include/thrust/iterator/iterator_traversal_tags.h +64 -0
  1388. cuda/cccl/headers/include/thrust/iterator/offset_iterator.h +194 -0
  1389. cuda/cccl/headers/include/thrust/iterator/permutation_iterator.h +204 -0
  1390. cuda/cccl/headers/include/thrust/iterator/retag.h +72 -0
  1391. cuda/cccl/headers/include/thrust/iterator/reverse_iterator.h +51 -0
  1392. cuda/cccl/headers/include/thrust/iterator/shuffle_iterator.h +185 -0
  1393. cuda/cccl/headers/include/thrust/iterator/strided_iterator.h +152 -0
  1394. cuda/cccl/headers/include/thrust/iterator/tabulate_output_iterator.h +152 -0
  1395. cuda/cccl/headers/include/thrust/iterator/transform_input_output_iterator.h +226 -0
  1396. cuda/cccl/headers/include/thrust/iterator/transform_iterator.h +351 -0
  1397. cuda/cccl/headers/include/thrust/iterator/transform_output_iterator.h +190 -0
  1398. cuda/cccl/headers/include/thrust/iterator/zip_iterator.h +359 -0
  1399. cuda/cccl/headers/include/thrust/logical.h +290 -0
  1400. cuda/cccl/headers/include/thrust/memory.h +299 -0
  1401. cuda/cccl/headers/include/thrust/merge.h +725 -0
  1402. cuda/cccl/headers/include/thrust/mismatch.h +261 -0
  1403. cuda/cccl/headers/include/thrust/mr/allocator.h +229 -0
  1404. cuda/cccl/headers/include/thrust/mr/device_memory_resource.h +41 -0
  1405. cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +528 -0
  1406. cuda/cccl/headers/include/thrust/mr/disjoint_sync_pool.h +118 -0
  1407. cuda/cccl/headers/include/thrust/mr/disjoint_tls_pool.h +67 -0
  1408. cuda/cccl/headers/include/thrust/mr/fancy_pointer_resource.h +67 -0
  1409. cuda/cccl/headers/include/thrust/mr/host_memory_resource.h +38 -0
  1410. cuda/cccl/headers/include/thrust/mr/memory_resource.h +217 -0
  1411. cuda/cccl/headers/include/thrust/mr/new.h +100 -0
  1412. cuda/cccl/headers/include/thrust/mr/polymorphic_adaptor.h +63 -0
  1413. cuda/cccl/headers/include/thrust/mr/pool.h +528 -0
  1414. cuda/cccl/headers/include/thrust/mr/pool_options.h +174 -0
  1415. cuda/cccl/headers/include/thrust/mr/sync_pool.h +114 -0
  1416. cuda/cccl/headers/include/thrust/mr/tls_pool.h +64 -0
  1417. cuda/cccl/headers/include/thrust/mr/universal_memory_resource.h +29 -0
  1418. cuda/cccl/headers/include/thrust/mr/validator.h +56 -0
  1419. cuda/cccl/headers/include/thrust/pair.h +99 -0
  1420. cuda/cccl/headers/include/thrust/partition.h +1391 -0
  1421. cuda/cccl/headers/include/thrust/per_device_resource.h +98 -0
  1422. cuda/cccl/headers/include/thrust/random/detail/discard_block_engine.inl +184 -0
  1423. cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine.inl +155 -0
  1424. cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine_discard.h +104 -0
  1425. cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine.inl +151 -0
  1426. cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h +53 -0
  1427. cuda/cccl/headers/include/thrust/random/detail/mod.h +101 -0
  1428. cuda/cccl/headers/include/thrust/random/detail/normal_distribution.inl +187 -0
  1429. cuda/cccl/headers/include/thrust/random/detail/normal_distribution_base.h +160 -0
  1430. cuda/cccl/headers/include/thrust/random/detail/random_core_access.h +63 -0
  1431. cuda/cccl/headers/include/thrust/random/detail/subtract_with_carry_engine.inl +201 -0
  1432. cuda/cccl/headers/include/thrust/random/detail/uniform_int_distribution.inl +198 -0
  1433. cuda/cccl/headers/include/thrust/random/detail/uniform_real_distribution.inl +200 -0
  1434. cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine.inl +183 -0
  1435. cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine_max.h +187 -0
  1436. cuda/cccl/headers/include/thrust/random/discard_block_engine.h +240 -0
  1437. cuda/cccl/headers/include/thrust/random/linear_congruential_engine.h +289 -0
  1438. cuda/cccl/headers/include/thrust/random/linear_feedback_shift_engine.h +217 -0
  1439. cuda/cccl/headers/include/thrust/random/normal_distribution.h +257 -0
  1440. cuda/cccl/headers/include/thrust/random/subtract_with_carry_engine.h +247 -0
  1441. cuda/cccl/headers/include/thrust/random/uniform_int_distribution.h +261 -0
  1442. cuda/cccl/headers/include/thrust/random/uniform_real_distribution.h +258 -0
  1443. cuda/cccl/headers/include/thrust/random/xor_combine_engine.h +255 -0
  1444. cuda/cccl/headers/include/thrust/random.h +120 -0
  1445. cuda/cccl/headers/include/thrust/reduce.h +1113 -0
  1446. cuda/cccl/headers/include/thrust/remove.h +768 -0
  1447. cuda/cccl/headers/include/thrust/replace.h +826 -0
  1448. cuda/cccl/headers/include/thrust/reverse.h +215 -0
  1449. cuda/cccl/headers/include/thrust/scan.h +1671 -0
  1450. cuda/cccl/headers/include/thrust/scatter.h +446 -0
  1451. cuda/cccl/headers/include/thrust/sequence.h +277 -0
  1452. cuda/cccl/headers/include/thrust/set_operations.h +3026 -0
  1453. cuda/cccl/headers/include/thrust/shuffle.h +182 -0
  1454. cuda/cccl/headers/include/thrust/sort.h +1320 -0
  1455. cuda/cccl/headers/include/thrust/swap.h +147 -0
  1456. cuda/cccl/headers/include/thrust/system/cpp/detail/adjacent_difference.h +30 -0
  1457. cuda/cccl/headers/include/thrust/system/cpp/detail/assign_value.h +30 -0
  1458. cuda/cccl/headers/include/thrust/system/cpp/detail/binary_search.h +32 -0
  1459. cuda/cccl/headers/include/thrust/system/cpp/detail/copy.h +30 -0
  1460. cuda/cccl/headers/include/thrust/system/cpp/detail/copy_if.h +30 -0
  1461. cuda/cccl/headers/include/thrust/system/cpp/detail/count.h +29 -0
  1462. cuda/cccl/headers/include/thrust/system/cpp/detail/equal.h +29 -0
  1463. cuda/cccl/headers/include/thrust/system/cpp/detail/execution_policy.h +109 -0
  1464. cuda/cccl/headers/include/thrust/system/cpp/detail/extrema.h +30 -0
  1465. cuda/cccl/headers/include/thrust/system/cpp/detail/fill.h +29 -0
  1466. cuda/cccl/headers/include/thrust/system/cpp/detail/find.h +30 -0
  1467. cuda/cccl/headers/include/thrust/system/cpp/detail/for_each.h +30 -0
  1468. cuda/cccl/headers/include/thrust/system/cpp/detail/gather.h +29 -0
  1469. cuda/cccl/headers/include/thrust/system/cpp/detail/generate.h +29 -0
  1470. cuda/cccl/headers/include/thrust/system/cpp/detail/get_value.h +30 -0
  1471. cuda/cccl/headers/include/thrust/system/cpp/detail/inner_product.h +29 -0
  1472. cuda/cccl/headers/include/thrust/system/cpp/detail/iter_swap.h +30 -0
  1473. cuda/cccl/headers/include/thrust/system/cpp/detail/logical.h +29 -0
  1474. cuda/cccl/headers/include/thrust/system/cpp/detail/malloc_and_free.h +30 -0
  1475. cuda/cccl/headers/include/thrust/system/cpp/detail/memory.inl +60 -0
  1476. cuda/cccl/headers/include/thrust/system/cpp/detail/merge.h +30 -0
  1477. cuda/cccl/headers/include/thrust/system/cpp/detail/mismatch.h +29 -0
  1478. cuda/cccl/headers/include/thrust/system/cpp/detail/partition.h +30 -0
  1479. cuda/cccl/headers/include/thrust/system/cpp/detail/per_device_resource.h +29 -0
  1480. cuda/cccl/headers/include/thrust/system/cpp/detail/reduce.h +30 -0
  1481. cuda/cccl/headers/include/thrust/system/cpp/detail/reduce_by_key.h +30 -0
  1482. cuda/cccl/headers/include/thrust/system/cpp/detail/remove.h +30 -0
  1483. cuda/cccl/headers/include/thrust/system/cpp/detail/replace.h +29 -0
  1484. cuda/cccl/headers/include/thrust/system/cpp/detail/reverse.h +29 -0
  1485. cuda/cccl/headers/include/thrust/system/cpp/detail/scan.h +30 -0
  1486. cuda/cccl/headers/include/thrust/system/cpp/detail/scan_by_key.h +30 -0
  1487. cuda/cccl/headers/include/thrust/system/cpp/detail/scatter.h +29 -0
  1488. cuda/cccl/headers/include/thrust/system/cpp/detail/sequence.h +29 -0
  1489. cuda/cccl/headers/include/thrust/system/cpp/detail/set_operations.h +30 -0
  1490. cuda/cccl/headers/include/thrust/system/cpp/detail/sort.h +30 -0
  1491. cuda/cccl/headers/include/thrust/system/cpp/detail/swap_ranges.h +29 -0
  1492. cuda/cccl/headers/include/thrust/system/cpp/detail/tabulate.h +29 -0
  1493. cuda/cccl/headers/include/thrust/system/cpp/detail/temporary_buffer.h +29 -0
  1494. cuda/cccl/headers/include/thrust/system/cpp/detail/transform.h +29 -0
  1495. cuda/cccl/headers/include/thrust/system/cpp/detail/transform_reduce.h +29 -0
  1496. cuda/cccl/headers/include/thrust/system/cpp/detail/transform_scan.h +29 -0
  1497. cuda/cccl/headers/include/thrust/system/cpp/detail/uninitialized_copy.h +29 -0
  1498. cuda/cccl/headers/include/thrust/system/cpp/detail/uninitialized_fill.h +29 -0
  1499. cuda/cccl/headers/include/thrust/system/cpp/detail/unique.h +30 -0
  1500. cuda/cccl/headers/include/thrust/system/cpp/detail/unique_by_key.h +30 -0
  1501. cuda/cccl/headers/include/thrust/system/cpp/execution_policy.h +63 -0
  1502. cuda/cccl/headers/include/thrust/system/cpp/memory.h +106 -0
  1503. cuda/cccl/headers/include/thrust/system/cpp/memory_resource.h +72 -0
  1504. cuda/cccl/headers/include/thrust/system/cpp/pointer.h +120 -0
  1505. cuda/cccl/headers/include/thrust/system/cpp/vector.h +96 -0
  1506. cuda/cccl/headers/include/thrust/system/cuda/config.h +126 -0
  1507. cuda/cccl/headers/include/thrust/system/cuda/detail/adjacent_difference.h +219 -0
  1508. cuda/cccl/headers/include/thrust/system/cuda/detail/assign_value.h +124 -0
  1509. cuda/cccl/headers/include/thrust/system/cuda/detail/binary_search.h +29 -0
  1510. cuda/cccl/headers/include/thrust/system/cuda/detail/cdp_dispatch.h +72 -0
  1511. cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +273 -0
  1512. cuda/cccl/headers/include/thrust/system/cuda/detail/copy_if.h +255 -0
  1513. cuda/cccl/headers/include/thrust/system/cuda/detail/core/agent_launcher.h +289 -0
  1514. cuda/cccl/headers/include/thrust/system/cuda/detail/core/triple_chevron_launch.h +191 -0
  1515. cuda/cccl/headers/include/thrust/system/cuda/detail/core/util.h +593 -0
  1516. cuda/cccl/headers/include/thrust/system/cuda/detail/count.h +75 -0
  1517. cuda/cccl/headers/include/thrust/system/cuda/detail/cross_system.h +243 -0
  1518. cuda/cccl/headers/include/thrust/system/cuda/detail/dispatch.h +233 -0
  1519. cuda/cccl/headers/include/thrust/system/cuda/detail/equal.h +64 -0
  1520. cuda/cccl/headers/include/thrust/system/cuda/detail/error.inl +96 -0
  1521. cuda/cccl/headers/include/thrust/system/cuda/detail/execution_policy.h +264 -0
  1522. cuda/cccl/headers/include/thrust/system/cuda/detail/extrema.h +476 -0
  1523. cuda/cccl/headers/include/thrust/system/cuda/detail/fill.h +100 -0
  1524. cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +170 -0
  1525. cuda/cccl/headers/include/thrust/system/cuda/detail/for_each.h +83 -0
  1526. cuda/cccl/headers/include/thrust/system/cuda/detail/gather.h +91 -0
  1527. cuda/cccl/headers/include/thrust/system/cuda/detail/generate.h +60 -0
  1528. cuda/cccl/headers/include/thrust/system/cuda/detail/get_value.h +65 -0
  1529. cuda/cccl/headers/include/thrust/system/cuda/detail/inner_product.h +75 -0
  1530. cuda/cccl/headers/include/thrust/system/cuda/detail/iter_swap.h +80 -0
  1531. cuda/cccl/headers/include/thrust/system/cuda/detail/logical.h +29 -0
  1532. cuda/cccl/headers/include/thrust/system/cuda/detail/make_unsigned_special.h +61 -0
  1533. cuda/cccl/headers/include/thrust/system/cuda/detail/malloc_and_free.h +121 -0
  1534. cuda/cccl/headers/include/thrust/system/cuda/detail/memory.inl +57 -0
  1535. cuda/cccl/headers/include/thrust/system/cuda/detail/merge.h +228 -0
  1536. cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +223 -0
  1537. cuda/cccl/headers/include/thrust/system/cuda/detail/parallel_for.h +81 -0
  1538. cuda/cccl/headers/include/thrust/system/cuda/detail/partition.h +405 -0
  1539. cuda/cccl/headers/include/thrust/system/cuda/detail/per_device_resource.h +72 -0
  1540. cuda/cccl/headers/include/thrust/system/cuda/detail/reduce.h +785 -0
  1541. cuda/cccl/headers/include/thrust/system/cuda/detail/reduce_by_key.h +1001 -0
  1542. cuda/cccl/headers/include/thrust/system/cuda/detail/remove.h +107 -0
  1543. cuda/cccl/headers/include/thrust/system/cuda/detail/replace.h +122 -0
  1544. cuda/cccl/headers/include/thrust/system/cuda/detail/reverse.h +87 -0
  1545. cuda/cccl/headers/include/thrust/system/cuda/detail/scan.h +341 -0
  1546. cuda/cccl/headers/include/thrust/system/cuda/detail/scan_by_key.h +414 -0
  1547. cuda/cccl/headers/include/thrust/system/cuda/detail/scatter.h +91 -0
  1548. cuda/cccl/headers/include/thrust/system/cuda/detail/sequence.h +29 -0
  1549. cuda/cccl/headers/include/thrust/system/cuda/detail/set_operations.h +1734 -0
  1550. cuda/cccl/headers/include/thrust/system/cuda/detail/sort.h +469 -0
  1551. cuda/cccl/headers/include/thrust/system/cuda/detail/swap_ranges.h +98 -0
  1552. cuda/cccl/headers/include/thrust/system/cuda/detail/tabulate.h +61 -0
  1553. cuda/cccl/headers/include/thrust/system/cuda/detail/temporary_buffer.h +132 -0
  1554. cuda/cccl/headers/include/thrust/system/cuda/detail/terminate.h +53 -0
  1555. cuda/cccl/headers/include/thrust/system/cuda/detail/transform.h +429 -0
  1556. cuda/cccl/headers/include/thrust/system/cuda/detail/transform_reduce.h +143 -0
  1557. cuda/cccl/headers/include/thrust/system/cuda/detail/transform_scan.h +119 -0
  1558. cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_copy.h +117 -0
  1559. cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_fill.h +105 -0
  1560. cuda/cccl/headers/include/thrust/system/cuda/detail/unique.h +289 -0
  1561. cuda/cccl/headers/include/thrust/system/cuda/detail/unique_by_key.h +310 -0
  1562. cuda/cccl/headers/include/thrust/system/cuda/detail/util.h +253 -0
  1563. cuda/cccl/headers/include/thrust/system/cuda/error.h +168 -0
  1564. cuda/cccl/headers/include/thrust/system/cuda/execution_policy.h +15 -0
  1565. cuda/cccl/headers/include/thrust/system/cuda/memory.h +122 -0
  1566. cuda/cccl/headers/include/thrust/system/cuda/memory_resource.h +122 -0
  1567. cuda/cccl/headers/include/thrust/system/cuda/pointer.h +160 -0
  1568. cuda/cccl/headers/include/thrust/system/cuda/vector.h +108 -0
  1569. cuda/cccl/headers/include/thrust/system/detail/adl/adjacent_difference.h +51 -0
  1570. cuda/cccl/headers/include/thrust/system/detail/adl/assign_value.h +51 -0
  1571. cuda/cccl/headers/include/thrust/system/detail/adl/binary_search.h +51 -0
  1572. cuda/cccl/headers/include/thrust/system/detail/adl/copy.h +51 -0
  1573. cuda/cccl/headers/include/thrust/system/detail/adl/copy_if.h +52 -0
  1574. cuda/cccl/headers/include/thrust/system/detail/adl/count.h +51 -0
  1575. cuda/cccl/headers/include/thrust/system/detail/adl/equal.h +51 -0
  1576. cuda/cccl/headers/include/thrust/system/detail/adl/extrema.h +51 -0
  1577. cuda/cccl/headers/include/thrust/system/detail/adl/fill.h +51 -0
  1578. cuda/cccl/headers/include/thrust/system/detail/adl/find.h +51 -0
  1579. cuda/cccl/headers/include/thrust/system/detail/adl/for_each.h +51 -0
  1580. cuda/cccl/headers/include/thrust/system/detail/adl/gather.h +51 -0
  1581. cuda/cccl/headers/include/thrust/system/detail/adl/generate.h +51 -0
  1582. cuda/cccl/headers/include/thrust/system/detail/adl/get_value.h +51 -0
  1583. cuda/cccl/headers/include/thrust/system/detail/adl/inner_product.h +51 -0
  1584. cuda/cccl/headers/include/thrust/system/detail/adl/iter_swap.h +51 -0
  1585. cuda/cccl/headers/include/thrust/system/detail/adl/logical.h +51 -0
  1586. cuda/cccl/headers/include/thrust/system/detail/adl/malloc_and_free.h +51 -0
  1587. cuda/cccl/headers/include/thrust/system/detail/adl/merge.h +51 -0
  1588. cuda/cccl/headers/include/thrust/system/detail/adl/mismatch.h +51 -0
  1589. cuda/cccl/headers/include/thrust/system/detail/adl/partition.h +51 -0
  1590. cuda/cccl/headers/include/thrust/system/detail/adl/per_device_resource.h +51 -0
  1591. cuda/cccl/headers/include/thrust/system/detail/adl/reduce.h +51 -0
  1592. cuda/cccl/headers/include/thrust/system/detail/adl/reduce_by_key.h +51 -0
  1593. cuda/cccl/headers/include/thrust/system/detail/adl/remove.h +51 -0
  1594. cuda/cccl/headers/include/thrust/system/detail/adl/replace.h +51 -0
  1595. cuda/cccl/headers/include/thrust/system/detail/adl/reverse.h +51 -0
  1596. cuda/cccl/headers/include/thrust/system/detail/adl/scan.h +51 -0
  1597. cuda/cccl/headers/include/thrust/system/detail/adl/scan_by_key.h +51 -0
  1598. cuda/cccl/headers/include/thrust/system/detail/adl/scatter.h +51 -0
  1599. cuda/cccl/headers/include/thrust/system/detail/adl/sequence.h +51 -0
  1600. cuda/cccl/headers/include/thrust/system/detail/adl/set_operations.h +51 -0
  1601. cuda/cccl/headers/include/thrust/system/detail/adl/sort.h +51 -0
  1602. cuda/cccl/headers/include/thrust/system/detail/adl/swap_ranges.h +51 -0
  1603. cuda/cccl/headers/include/thrust/system/detail/adl/tabulate.h +51 -0
  1604. cuda/cccl/headers/include/thrust/system/detail/adl/temporary_buffer.h +51 -0
  1605. cuda/cccl/headers/include/thrust/system/detail/adl/transform.h +51 -0
  1606. cuda/cccl/headers/include/thrust/system/detail/adl/transform_reduce.h +51 -0
  1607. cuda/cccl/headers/include/thrust/system/detail/adl/transform_scan.h +51 -0
  1608. cuda/cccl/headers/include/thrust/system/detail/adl/uninitialized_copy.h +51 -0
  1609. cuda/cccl/headers/include/thrust/system/detail/adl/uninitialized_fill.h +51 -0
  1610. cuda/cccl/headers/include/thrust/system/detail/adl/unique.h +51 -0
  1611. cuda/cccl/headers/include/thrust/system/detail/adl/unique_by_key.h +51 -0
  1612. cuda/cccl/headers/include/thrust/system/detail/bad_alloc.h +61 -0
  1613. cuda/cccl/headers/include/thrust/system/detail/errno.h +120 -0
  1614. cuda/cccl/headers/include/thrust/system/detail/error_category.inl +302 -0
  1615. cuda/cccl/headers/include/thrust/system/detail/error_code.inl +173 -0
  1616. cuda/cccl/headers/include/thrust/system/detail/error_condition.inl +121 -0
  1617. cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.h +53 -0
  1618. cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.inl +79 -0
  1619. cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.h +161 -0
  1620. cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.inl +384 -0
  1621. cuda/cccl/headers/include/thrust/system/detail/generic/copy.h +45 -0
  1622. cuda/cccl/headers/include/thrust/system/detail/generic/copy.inl +64 -0
  1623. cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.h +58 -0
  1624. cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.inl +146 -0
  1625. cuda/cccl/headers/include/thrust/system/detail/generic/count.h +48 -0
  1626. cuda/cccl/headers/include/thrust/system/detail/generic/count.inl +84 -0
  1627. cuda/cccl/headers/include/thrust/system/detail/generic/equal.h +49 -0
  1628. cuda/cccl/headers/include/thrust/system/detail/generic/equal.inl +60 -0
  1629. cuda/cccl/headers/include/thrust/system/detail/generic/extrema.h +66 -0
  1630. cuda/cccl/headers/include/thrust/system/detail/generic/extrema.inl +252 -0
  1631. cuda/cccl/headers/include/thrust/system/detail/generic/fill.h +54 -0
  1632. cuda/cccl/headers/include/thrust/system/detail/generic/find.h +49 -0
  1633. cuda/cccl/headers/include/thrust/system/detail/generic/find.inl +137 -0
  1634. cuda/cccl/headers/include/thrust/system/detail/generic/for_each.h +58 -0
  1635. cuda/cccl/headers/include/thrust/system/detail/generic/gather.h +73 -0
  1636. cuda/cccl/headers/include/thrust/system/detail/generic/gather.inl +96 -0
  1637. cuda/cccl/headers/include/thrust/system/detail/generic/generate.h +45 -0
  1638. cuda/cccl/headers/include/thrust/system/detail/generic/generate.inl +63 -0
  1639. cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.h +60 -0
  1640. cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.inl +72 -0
  1641. cuda/cccl/headers/include/thrust/system/detail/generic/logical.h +59 -0
  1642. cuda/cccl/headers/include/thrust/system/detail/generic/memory.h +64 -0
  1643. cuda/cccl/headers/include/thrust/system/detail/generic/memory.inl +86 -0
  1644. cuda/cccl/headers/include/thrust/system/detail/generic/merge.h +99 -0
  1645. cuda/cccl/headers/include/thrust/system/detail/generic/merge.inl +148 -0
  1646. cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.h +49 -0
  1647. cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.inl +68 -0
  1648. cuda/cccl/headers/include/thrust/system/detail/generic/partition.h +129 -0
  1649. cuda/cccl/headers/include/thrust/system/detail/generic/partition.inl +207 -0
  1650. cuda/cccl/headers/include/thrust/system/detail/generic/per_device_resource.h +43 -0
  1651. cuda/cccl/headers/include/thrust/system/detail/generic/reduce.h +71 -0
  1652. cuda/cccl/headers/include/thrust/system/detail/generic/reduce.inl +100 -0
  1653. cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.h +83 -0
  1654. cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.inl +186 -0
  1655. cuda/cccl/headers/include/thrust/system/detail/generic/remove.h +86 -0
  1656. cuda/cccl/headers/include/thrust/system/detail/generic/remove.inl +121 -0
  1657. cuda/cccl/headers/include/thrust/system/detail/generic/replace.h +95 -0
  1658. cuda/cccl/headers/include/thrust/system/detail/generic/replace.inl +175 -0
  1659. cuda/cccl/headers/include/thrust/system/detail/generic/reverse.h +48 -0
  1660. cuda/cccl/headers/include/thrust/system/detail/generic/reverse.inl +67 -0
  1661. cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.h +63 -0
  1662. cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.inl +126 -0
  1663. cuda/cccl/headers/include/thrust/system/detail/generic/scan.h +72 -0
  1664. cuda/cccl/headers/include/thrust/system/detail/generic/scan.inl +85 -0
  1665. cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.h +126 -0
  1666. cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.inl +232 -0
  1667. cuda/cccl/headers/include/thrust/system/detail/generic/scatter.h +73 -0
  1668. cuda/cccl/headers/include/thrust/system/detail/generic/scatter.inl +85 -0
  1669. cuda/cccl/headers/include/thrust/system/detail/generic/select_system.h +104 -0
  1670. cuda/cccl/headers/include/thrust/system/detail/generic/sequence.h +70 -0
  1671. cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.h +282 -0
  1672. cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.inl +476 -0
  1673. cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.h +54 -0
  1674. cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.inl +125 -0
  1675. cuda/cccl/headers/include/thrust/system/detail/generic/sort.h +113 -0
  1676. cuda/cccl/headers/include/thrust/system/detail/generic/sort.inl +175 -0
  1677. cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.h +44 -0
  1678. cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.inl +76 -0
  1679. cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.h +41 -0
  1680. cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.inl +54 -0
  1681. cuda/cccl/headers/include/thrust/system/detail/generic/tag.h +47 -0
  1682. cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.h +54 -0
  1683. cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.inl +82 -0
  1684. cuda/cccl/headers/include/thrust/system/detail/generic/transform.h +395 -0
  1685. cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.h +50 -0
  1686. cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.inl +56 -0
  1687. cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.h +80 -0
  1688. cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.inl +113 -0
  1689. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.h +45 -0
  1690. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.inl +166 -0
  1691. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.h +45 -0
  1692. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.inl +115 -0
  1693. cuda/cccl/headers/include/thrust/system/detail/generic/unique.h +71 -0
  1694. cuda/cccl/headers/include/thrust/system/detail/generic/unique.inl +113 -0
  1695. cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.h +81 -0
  1696. cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.inl +126 -0
  1697. cuda/cccl/headers/include/thrust/system/detail/internal/decompose.h +117 -0
  1698. cuda/cccl/headers/include/thrust/system/detail/sequential/adjacent_difference.h +70 -0
  1699. cuda/cccl/headers/include/thrust/system/detail/sequential/assign_value.h +42 -0
  1700. cuda/cccl/headers/include/thrust/system/detail/sequential/binary_search.h +136 -0
  1701. cuda/cccl/headers/include/thrust/system/detail/sequential/copy.h +49 -0
  1702. cuda/cccl/headers/include/thrust/system/detail/sequential/copy.inl +119 -0
  1703. cuda/cccl/headers/include/thrust/system/detail/sequential/copy_backward.h +49 -0
  1704. cuda/cccl/headers/include/thrust/system/detail/sequential/copy_if.h +71 -0
  1705. cuda/cccl/headers/include/thrust/system/detail/sequential/count.h +29 -0
  1706. cuda/cccl/headers/include/thrust/system/detail/sequential/equal.h +29 -0
  1707. cuda/cccl/headers/include/thrust/system/detail/sequential/execution_policy.h +52 -0
  1708. cuda/cccl/headers/include/thrust/system/detail/sequential/extrema.h +110 -0
  1709. cuda/cccl/headers/include/thrust/system/detail/sequential/fill.h +29 -0
  1710. cuda/cccl/headers/include/thrust/system/detail/sequential/find.h +62 -0
  1711. cuda/cccl/headers/include/thrust/system/detail/sequential/for_each.h +74 -0
  1712. cuda/cccl/headers/include/thrust/system/detail/sequential/gather.h +29 -0
  1713. cuda/cccl/headers/include/thrust/system/detail/sequential/general_copy.h +123 -0
  1714. cuda/cccl/headers/include/thrust/system/detail/sequential/generate.h +29 -0
  1715. cuda/cccl/headers/include/thrust/system/detail/sequential/get_value.h +43 -0
  1716. cuda/cccl/headers/include/thrust/system/detail/sequential/inner_product.h +29 -0
  1717. cuda/cccl/headers/include/thrust/system/detail/sequential/insertion_sort.h +141 -0
  1718. cuda/cccl/headers/include/thrust/system/detail/sequential/iter_swap.h +45 -0
  1719. cuda/cccl/headers/include/thrust/system/detail/sequential/logical.h +29 -0
  1720. cuda/cccl/headers/include/thrust/system/detail/sequential/malloc_and_free.h +50 -0
  1721. cuda/cccl/headers/include/thrust/system/detail/sequential/merge.h +75 -0
  1722. cuda/cccl/headers/include/thrust/system/detail/sequential/merge.inl +145 -0
  1723. cuda/cccl/headers/include/thrust/system/detail/sequential/mismatch.h +29 -0
  1724. cuda/cccl/headers/include/thrust/system/detail/sequential/partition.h +301 -0
  1725. cuda/cccl/headers/include/thrust/system/detail/sequential/per_device_resource.h +29 -0
  1726. cuda/cccl/headers/include/thrust/system/detail/sequential/reduce.h +64 -0
  1727. cuda/cccl/headers/include/thrust/system/detail/sequential/reduce_by_key.h +98 -0
  1728. cuda/cccl/headers/include/thrust/system/detail/sequential/remove.h +179 -0
  1729. cuda/cccl/headers/include/thrust/system/detail/sequential/replace.h +29 -0
  1730. cuda/cccl/headers/include/thrust/system/detail/sequential/reverse.h +29 -0
  1731. cuda/cccl/headers/include/thrust/system/detail/sequential/scan.h +154 -0
  1732. cuda/cccl/headers/include/thrust/system/detail/sequential/scan_by_key.h +145 -0
  1733. cuda/cccl/headers/include/thrust/system/detail/sequential/scatter.h +29 -0
  1734. cuda/cccl/headers/include/thrust/system/detail/sequential/sequence.h +29 -0
  1735. cuda/cccl/headers/include/thrust/system/detail/sequential/set_operations.h +206 -0
  1736. cuda/cccl/headers/include/thrust/system/detail/sequential/sort.h +59 -0
  1737. cuda/cccl/headers/include/thrust/system/detail/sequential/sort.inl +116 -0
  1738. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.h +55 -0
  1739. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.inl +356 -0
  1740. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.h +48 -0
  1741. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.inl +124 -0
  1742. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.h +48 -0
  1743. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.inl +586 -0
  1744. cuda/cccl/headers/include/thrust/system/detail/sequential/swap_ranges.h +29 -0
  1745. cuda/cccl/headers/include/thrust/system/detail/sequential/tabulate.h +29 -0
  1746. cuda/cccl/headers/include/thrust/system/detail/sequential/temporary_buffer.h +29 -0
  1747. cuda/cccl/headers/include/thrust/system/detail/sequential/transform.h +29 -0
  1748. cuda/cccl/headers/include/thrust/system/detail/sequential/transform_reduce.h +29 -0
  1749. cuda/cccl/headers/include/thrust/system/detail/sequential/transform_scan.h +29 -0
  1750. cuda/cccl/headers/include/thrust/system/detail/sequential/trivial_copy.h +58 -0
  1751. cuda/cccl/headers/include/thrust/system/detail/sequential/uninitialized_copy.h +29 -0
  1752. cuda/cccl/headers/include/thrust/system/detail/sequential/uninitialized_fill.h +29 -0
  1753. cuda/cccl/headers/include/thrust/system/detail/sequential/unique.h +115 -0
  1754. cuda/cccl/headers/include/thrust/system/detail/sequential/unique_by_key.h +106 -0
  1755. cuda/cccl/headers/include/thrust/system/detail/system_error.inl +108 -0
  1756. cuda/cccl/headers/include/thrust/system/error_code.h +512 -0
  1757. cuda/cccl/headers/include/thrust/system/omp/detail/adjacent_difference.h +54 -0
  1758. cuda/cccl/headers/include/thrust/system/omp/detail/assign_value.h +30 -0
  1759. cuda/cccl/headers/include/thrust/system/omp/detail/binary_search.h +77 -0
  1760. cuda/cccl/headers/include/thrust/system/omp/detail/copy.h +50 -0
  1761. cuda/cccl/headers/include/thrust/system/omp/detail/copy.inl +74 -0
  1762. cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.h +56 -0
  1763. cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.inl +59 -0
  1764. cuda/cccl/headers/include/thrust/system/omp/detail/count.h +30 -0
  1765. cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.h +50 -0
  1766. cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.inl +65 -0
  1767. cuda/cccl/headers/include/thrust/system/omp/detail/equal.h +30 -0
  1768. cuda/cccl/headers/include/thrust/system/omp/detail/execution_policy.h +127 -0
  1769. cuda/cccl/headers/include/thrust/system/omp/detail/extrema.h +66 -0
  1770. cuda/cccl/headers/include/thrust/system/omp/detail/fill.h +30 -0
  1771. cuda/cccl/headers/include/thrust/system/omp/detail/find.h +53 -0
  1772. cuda/cccl/headers/include/thrust/system/omp/detail/for_each.h +56 -0
  1773. cuda/cccl/headers/include/thrust/system/omp/detail/for_each.inl +87 -0
  1774. cuda/cccl/headers/include/thrust/system/omp/detail/gather.h +30 -0
  1775. cuda/cccl/headers/include/thrust/system/omp/detail/generate.h +30 -0
  1776. cuda/cccl/headers/include/thrust/system/omp/detail/get_value.h +30 -0
  1777. cuda/cccl/headers/include/thrust/system/omp/detail/inner_product.h +30 -0
  1778. cuda/cccl/headers/include/thrust/system/omp/detail/iter_swap.h +30 -0
  1779. cuda/cccl/headers/include/thrust/system/omp/detail/logical.h +30 -0
  1780. cuda/cccl/headers/include/thrust/system/omp/detail/malloc_and_free.h +30 -0
  1781. cuda/cccl/headers/include/thrust/system/omp/detail/memory.inl +93 -0
  1782. cuda/cccl/headers/include/thrust/system/omp/detail/merge.h +30 -0
  1783. cuda/cccl/headers/include/thrust/system/omp/detail/mismatch.h +30 -0
  1784. cuda/cccl/headers/include/thrust/system/omp/detail/partition.h +88 -0
  1785. cuda/cccl/headers/include/thrust/system/omp/detail/partition.inl +102 -0
  1786. cuda/cccl/headers/include/thrust/system/omp/detail/per_device_resource.h +29 -0
  1787. cuda/cccl/headers/include/thrust/system/omp/detail/pragma_omp.h +54 -0
  1788. cuda/cccl/headers/include/thrust/system/omp/detail/reduce.h +54 -0
  1789. cuda/cccl/headers/include/thrust/system/omp/detail/reduce.inl +78 -0
  1790. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.h +64 -0
  1791. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.inl +65 -0
  1792. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.h +59 -0
  1793. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.inl +103 -0
  1794. cuda/cccl/headers/include/thrust/system/omp/detail/remove.h +72 -0
  1795. cuda/cccl/headers/include/thrust/system/omp/detail/remove.inl +87 -0
  1796. cuda/cccl/headers/include/thrust/system/omp/detail/replace.h +30 -0
  1797. cuda/cccl/headers/include/thrust/system/omp/detail/reverse.h +30 -0
  1798. cuda/cccl/headers/include/thrust/system/omp/detail/scan.h +73 -0
  1799. cuda/cccl/headers/include/thrust/system/omp/detail/scan.inl +172 -0
  1800. cuda/cccl/headers/include/thrust/system/omp/detail/scan_by_key.h +36 -0
  1801. cuda/cccl/headers/include/thrust/system/omp/detail/scatter.h +30 -0
  1802. cuda/cccl/headers/include/thrust/system/omp/detail/sequence.h +30 -0
  1803. cuda/cccl/headers/include/thrust/system/omp/detail/set_operations.h +30 -0
  1804. cuda/cccl/headers/include/thrust/system/omp/detail/sort.h +60 -0
  1805. cuda/cccl/headers/include/thrust/system/omp/detail/sort.inl +265 -0
  1806. cuda/cccl/headers/include/thrust/system/omp/detail/swap_ranges.h +30 -0
  1807. cuda/cccl/headers/include/thrust/system/omp/detail/tabulate.h +30 -0
  1808. cuda/cccl/headers/include/thrust/system/omp/detail/temporary_buffer.h +29 -0
  1809. cuda/cccl/headers/include/thrust/system/omp/detail/transform.h +30 -0
  1810. cuda/cccl/headers/include/thrust/system/omp/detail/transform_reduce.h +30 -0
  1811. cuda/cccl/headers/include/thrust/system/omp/detail/transform_scan.h +30 -0
  1812. cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_copy.h +30 -0
  1813. cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_fill.h +30 -0
  1814. cuda/cccl/headers/include/thrust/system/omp/detail/unique.h +60 -0
  1815. cuda/cccl/headers/include/thrust/system/omp/detail/unique.inl +71 -0
  1816. cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.h +67 -0
  1817. cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.inl +75 -0
  1818. cuda/cccl/headers/include/thrust/system/omp/execution_policy.h +62 -0
  1819. cuda/cccl/headers/include/thrust/system/omp/memory.h +111 -0
  1820. cuda/cccl/headers/include/thrust/system/omp/memory_resource.h +75 -0
  1821. cuda/cccl/headers/include/thrust/system/omp/pointer.h +124 -0
  1822. cuda/cccl/headers/include/thrust/system/omp/vector.h +99 -0
  1823. cuda/cccl/headers/include/thrust/system/system_error.h +185 -0
  1824. cuda/cccl/headers/include/thrust/system/tbb/detail/adjacent_difference.h +54 -0
  1825. cuda/cccl/headers/include/thrust/system/tbb/detail/assign_value.h +30 -0
  1826. cuda/cccl/headers/include/thrust/system/tbb/detail/binary_search.h +30 -0
  1827. cuda/cccl/headers/include/thrust/system/tbb/detail/copy.h +50 -0
  1828. cuda/cccl/headers/include/thrust/system/tbb/detail/copy.inl +73 -0
  1829. cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.h +47 -0
  1830. cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.inl +136 -0
  1831. cuda/cccl/headers/include/thrust/system/tbb/detail/count.h +30 -0
  1832. cuda/cccl/headers/include/thrust/system/tbb/detail/equal.h +30 -0
  1833. cuda/cccl/headers/include/thrust/system/tbb/detail/execution_policy.h +109 -0
  1834. cuda/cccl/headers/include/thrust/system/tbb/detail/extrema.h +66 -0
  1835. cuda/cccl/headers/include/thrust/system/tbb/detail/fill.h +30 -0
  1836. cuda/cccl/headers/include/thrust/system/tbb/detail/find.h +49 -0
  1837. cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.h +51 -0
  1838. cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.inl +91 -0
  1839. cuda/cccl/headers/include/thrust/system/tbb/detail/gather.h +30 -0
  1840. cuda/cccl/headers/include/thrust/system/tbb/detail/generate.h +30 -0
  1841. cuda/cccl/headers/include/thrust/system/tbb/detail/get_value.h +30 -0
  1842. cuda/cccl/headers/include/thrust/system/tbb/detail/inner_product.h +30 -0
  1843. cuda/cccl/headers/include/thrust/system/tbb/detail/iter_swap.h +30 -0
  1844. cuda/cccl/headers/include/thrust/system/tbb/detail/logical.h +30 -0
  1845. cuda/cccl/headers/include/thrust/system/tbb/detail/malloc_and_free.h +30 -0
  1846. cuda/cccl/headers/include/thrust/system/tbb/detail/memory.inl +94 -0
  1847. cuda/cccl/headers/include/thrust/system/tbb/detail/merge.h +77 -0
  1848. cuda/cccl/headers/include/thrust/system/tbb/detail/merge.inl +327 -0
  1849. cuda/cccl/headers/include/thrust/system/tbb/detail/mismatch.h +30 -0
  1850. cuda/cccl/headers/include/thrust/system/tbb/detail/partition.h +84 -0
  1851. cuda/cccl/headers/include/thrust/system/tbb/detail/partition.inl +98 -0
  1852. cuda/cccl/headers/include/thrust/system/tbb/detail/per_device_resource.h +29 -0
  1853. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.h +54 -0
  1854. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.inl +137 -0
  1855. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.h +61 -0
  1856. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.inl +400 -0
  1857. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_intervals.h +140 -0
  1858. cuda/cccl/headers/include/thrust/system/tbb/detail/remove.h +76 -0
  1859. cuda/cccl/headers/include/thrust/system/tbb/detail/remove.inl +87 -0
  1860. cuda/cccl/headers/include/thrust/system/tbb/detail/replace.h +30 -0
  1861. cuda/cccl/headers/include/thrust/system/tbb/detail/reverse.h +30 -0
  1862. cuda/cccl/headers/include/thrust/system/tbb/detail/scan.h +59 -0
  1863. cuda/cccl/headers/include/thrust/system/tbb/detail/scan.inl +312 -0
  1864. cuda/cccl/headers/include/thrust/system/tbb/detail/scan_by_key.h +33 -0
  1865. cuda/cccl/headers/include/thrust/system/tbb/detail/scatter.h +30 -0
  1866. cuda/cccl/headers/include/thrust/system/tbb/detail/sequence.h +30 -0
  1867. cuda/cccl/headers/include/thrust/system/tbb/detail/set_operations.h +30 -0
  1868. cuda/cccl/headers/include/thrust/system/tbb/detail/sort.h +60 -0
  1869. cuda/cccl/headers/include/thrust/system/tbb/detail/sort.inl +295 -0
  1870. cuda/cccl/headers/include/thrust/system/tbb/detail/swap_ranges.h +30 -0
  1871. cuda/cccl/headers/include/thrust/system/tbb/detail/tabulate.h +30 -0
  1872. cuda/cccl/headers/include/thrust/system/tbb/detail/temporary_buffer.h +29 -0
  1873. cuda/cccl/headers/include/thrust/system/tbb/detail/transform.h +30 -0
  1874. cuda/cccl/headers/include/thrust/system/tbb/detail/transform_reduce.h +30 -0
  1875. cuda/cccl/headers/include/thrust/system/tbb/detail/transform_scan.h +30 -0
  1876. cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_copy.h +30 -0
  1877. cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_fill.h +30 -0
  1878. cuda/cccl/headers/include/thrust/system/tbb/detail/unique.h +60 -0
  1879. cuda/cccl/headers/include/thrust/system/tbb/detail/unique.inl +71 -0
  1880. cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.h +67 -0
  1881. cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.inl +75 -0
  1882. cuda/cccl/headers/include/thrust/system/tbb/execution_policy.h +62 -0
  1883. cuda/cccl/headers/include/thrust/system/tbb/memory.h +111 -0
  1884. cuda/cccl/headers/include/thrust/system/tbb/memory_resource.h +75 -0
  1885. cuda/cccl/headers/include/thrust/system/tbb/pointer.h +124 -0
  1886. cuda/cccl/headers/include/thrust/system/tbb/vector.h +99 -0
  1887. cuda/cccl/headers/include/thrust/system_error.h +57 -0
  1888. cuda/cccl/headers/include/thrust/tabulate.h +125 -0
  1889. cuda/cccl/headers/include/thrust/transform.h +1045 -0
  1890. cuda/cccl/headers/include/thrust/transform_reduce.h +190 -0
  1891. cuda/cccl/headers/include/thrust/transform_scan.h +442 -0
  1892. cuda/cccl/headers/include/thrust/tuple.h +139 -0
  1893. cuda/cccl/headers/include/thrust/type_traits/integer_sequence.h +261 -0
  1894. cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +154 -0
  1895. cuda/cccl/headers/include/thrust/type_traits/is_execution_policy.h +65 -0
  1896. cuda/cccl/headers/include/thrust/type_traits/is_operator_less_or_greater_function_object.h +184 -0
  1897. cuda/cccl/headers/include/thrust/type_traits/is_operator_plus_function_object.h +116 -0
  1898. cuda/cccl/headers/include/thrust/type_traits/is_trivially_relocatable.h +336 -0
  1899. cuda/cccl/headers/include/thrust/type_traits/logical_metafunctions.h +42 -0
  1900. cuda/cccl/headers/include/thrust/type_traits/unwrap_contiguous_iterator.h +63 -0
  1901. cuda/cccl/headers/include/thrust/uninitialized_copy.h +300 -0
  1902. cuda/cccl/headers/include/thrust/uninitialized_fill.h +268 -0
  1903. cuda/cccl/headers/include/thrust/unique.h +1088 -0
  1904. cuda/cccl/headers/include/thrust/universal_allocator.h +93 -0
  1905. cuda/cccl/headers/include/thrust/universal_ptr.h +34 -0
  1906. cuda/cccl/headers/include/thrust/universal_vector.h +71 -0
  1907. cuda/cccl/headers/include/thrust/version.h +93 -0
  1908. cuda/cccl/headers/include/thrust/zip_function.h +176 -0
  1909. cuda/cccl/headers/include_paths.py +51 -0
  1910. cuda/cccl/parallel/__init__.py +9 -0
  1911. cuda/cccl/parallel/experimental/__init__.py +24 -0
  1912. cuda/cccl/py.typed +0 -0
  1913. cuda/compute/__init__.py +79 -0
  1914. cuda/compute/_bindings.py +79 -0
  1915. cuda/compute/_bindings.pyi +475 -0
  1916. cuda/compute/_bindings_impl.pyx +2273 -0
  1917. cuda/compute/_caching.py +71 -0
  1918. cuda/compute/_cccl_interop.py +422 -0
  1919. cuda/compute/_utils/__init__.py +0 -0
  1920. cuda/compute/_utils/protocols.py +132 -0
  1921. cuda/compute/_utils/temp_storage_buffer.py +86 -0
  1922. cuda/compute/algorithms/__init__.py +54 -0
  1923. cuda/compute/algorithms/_histogram.py +243 -0
  1924. cuda/compute/algorithms/_merge_sort.py +225 -0
  1925. cuda/compute/algorithms/_radix_sort.py +312 -0
  1926. cuda/compute/algorithms/_reduce.py +182 -0
  1927. cuda/compute/algorithms/_scan.py +331 -0
  1928. cuda/compute/algorithms/_segmented_reduce.py +257 -0
  1929. cuda/compute/algorithms/_three_way_partition.py +261 -0
  1930. cuda/compute/algorithms/_transform.py +329 -0
  1931. cuda/compute/algorithms/_unique_by_key.py +252 -0
  1932. cuda/compute/cccl/.gitkeep +0 -0
  1933. cuda/compute/cu12/_bindings_impl.cp313-win_amd64.pyd +0 -0
  1934. cuda/compute/cu12/cccl/cccl.c.parallel.dll +0 -0
  1935. cuda/compute/cu12/cccl/cccl.c.parallel.lib +0 -0
  1936. cuda/compute/cu13/_bindings_impl.cp313-win_amd64.pyd +0 -0
  1937. cuda/compute/cu13/cccl/cccl.c.parallel.dll +0 -0
  1938. cuda/compute/cu13/cccl/cccl.c.parallel.lib +0 -0
  1939. cuda/compute/iterators/__init__.py +21 -0
  1940. cuda/compute/iterators/_factories.py +219 -0
  1941. cuda/compute/iterators/_iterators.py +817 -0
  1942. cuda/compute/iterators/_zip_iterator.py +199 -0
  1943. cuda/compute/numba_utils.py +53 -0
  1944. cuda/compute/op.py +3 -0
  1945. cuda/compute/struct.py +272 -0
  1946. cuda/compute/typing.py +37 -0
  1947. cuda/coop/__init__.py +8 -0
  1948. cuda/coop/_caching.py +48 -0
  1949. cuda/coop/_common.py +275 -0
  1950. cuda/coop/_nvrtc.py +92 -0
  1951. cuda/coop/_scan_op.py +181 -0
  1952. cuda/coop/_types.py +937 -0
  1953. cuda/coop/_typing.py +107 -0
  1954. cuda/coop/block/__init__.py +39 -0
  1955. cuda/coop/block/_block_exchange.py +251 -0
  1956. cuda/coop/block/_block_load_store.py +215 -0
  1957. cuda/coop/block/_block_merge_sort.py +125 -0
  1958. cuda/coop/block/_block_radix_sort.py +214 -0
  1959. cuda/coop/block/_block_reduce.py +294 -0
  1960. cuda/coop/block/_block_scan.py +983 -0
  1961. cuda/coop/warp/__init__.py +9 -0
  1962. cuda/coop/warp/_warp_merge_sort.py +92 -0
  1963. cuda/coop/warp/_warp_reduce.py +153 -0
  1964. cuda/coop/warp/_warp_scan.py +78 -0
  1965. cuda_cccl-0.3.3.dist-info/METADATA +41 -0
  1966. cuda_cccl-0.3.3.dist-info/RECORD +1968 -0
  1967. cuda_cccl-0.3.3.dist-info/WHEEL +5 -0
  1968. cuda_cccl-0.3.3.dist-info/licenses/LICENSE +1 -0
@@ -0,0 +1,2315 @@
1
+ /******************************************************************************
2
+ * Copyright (c) 2011, Duane Merrill. All rights reserved.
3
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ * * Redistributions of source code must retain the above copyright
8
+ * notice, this list of conditions and the following disclaimer.
9
+ * * Redistributions in binary form must reproduce the above copyright
10
+ * notice, this list of conditions and the following disclaimer in the
11
+ * documentation and/or other materials provided with the distribution.
12
+ * * Neither the name of the NVIDIA CORPORATION nor the
13
+ * names of its contributors may be used to endorse or promote products
14
+ * derived from this software without specific prior written permission.
15
+ *
16
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
+ *
27
+ ******************************************************************************/
28
+
29
+ //! @file
30
+ //! The cub::BlockScan class provides :ref:`collective <collective-primitives>` methods for computing a parallel prefix
31
+ //! sum/scan of items partitioned across a CUDA thread block.
32
+
33
+ #pragma once
34
+
35
+ #include <cub/config.cuh>
36
+
37
+ #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
38
+ # pragma GCC system_header
39
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
40
+ # pragma clang system_header
41
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
42
+ # pragma system_header
43
+ #endif // no system header
44
+
45
+ #include <cub/block/specializations/block_scan_raking.cuh>
46
+ #include <cub/block/specializations/block_scan_warp_scans.cuh>
47
+ #include <cub/util_ptx.cuh>
48
+ #include <cub/util_type.cuh>
49
+
50
+ #include <cuda/std/__functional/operations.h>
51
+ #include <cuda/std/__type_traits/conditional.h>
52
+
53
+ CUB_NAMESPACE_BEGIN
54
+
55
+ /******************************************************************************
56
+ * Algorithmic variants
57
+ ******************************************************************************/
58
+
59
+ //! @brief BlockScanAlgorithm enumerates alternative algorithms for cub::BlockScan to compute a
60
+ //! parallel prefix scan across a CUDA thread block.
61
+ enum BlockScanAlgorithm
62
+ {
63
+
64
+ //! @rst
65
+ //! Overview
66
+ //! ++++++++++++++++++++++++++
67
+ //!
68
+ //! An efficient "raking reduce-then-scan" prefix scan algorithm. Execution is comprised of five phases:
69
+ //!
70
+ //! #. Upsweep sequential reduction in registers (if threads contribute more than one input each).
71
+ //! Each thread then places the partial reduction of its item(s) into shared memory.
72
+ //! #. Upsweep sequential reduction in shared memory.
73
+ //! Threads within a single warp rake across segments of shared partial reductions.
74
+ //! #. A warp-synchronous Kogge-Stone style exclusive scan within the raking warp.
75
+ //! #. Downsweep sequential exclusive scan in shared memory.
76
+ //! Threads within a single warp rake across segments of shared partial reductions,
77
+ //! seeded with the warp-scan output.
78
+ //! #. Downsweep sequential scan in registers (if threads contribute more than one input),
79
+ //! seeded with the raking scan output.
80
+ //!
81
+ //! Performance Considerations
82
+ //! ++++++++++++++++++++++++++
83
+ //!
84
+ //! - Although this variant may suffer longer turnaround latencies when the
85
+ //! GPU is under-occupied, it can often provide higher overall throughput
86
+ //! across the GPU when suitably occupied.
87
+ //!
88
+ //! @endrst
89
+ BLOCK_SCAN_RAKING,
90
+
91
+ //! @rst
92
+ //! Overview
93
+ //! ++++++++++++++++++++++++++
94
+ //!
95
+ //! Similar to cub::BLOCK_SCAN_RAKING, but with fewer shared memory reads at the expense of higher
96
+ //! register pressure. Raking threads preserve their "upsweep" segment of values in registers while performing
97
+ //! warp-synchronous scan, allowing the "downsweep" not to re-read them from shared memory.
98
+ //!
99
+ //! @endrst
100
+ BLOCK_SCAN_RAKING_MEMOIZE,
101
+
102
+ //! @rst
103
+ //! Overview
104
+ //! ++++++++++++++++++++++++++
105
+ //!
106
+ //! A quick "tiled warpscans" prefix scan algorithm. Execution is comprised of four phases:
107
+ //! #. Upsweep sequential reduction in registers (if threads contribute more than one input each).
108
+ //! Each thread then places the partial reduction of its item(s) into shared memory.
109
+ //! #. Compute a shallow, but inefficient warp-synchronous Kogge-Stone style scan within each warp.
110
+ //! #. A propagation phase where the warp scan outputs in each warp are updated with the aggregate
111
+ //! from each preceding warp.
112
+ //! #. Downsweep sequential scan in registers (if threads contribute more than one input),
113
+ //! seeded with the warp-scan output.
114
+ //!
115
+ //! Performance Considerations
116
+ //! ++++++++++++++++++++++++++
117
+ //!
118
+ //! - Although this variant may suffer lower overall throughput across the
119
+ //! GPU due to a heavy reliance on inefficient warpscans, it can
120
+ //! often provide lower turnaround latencies when the GPU is under-occupied.
121
+ //!
122
+ //! @endrst
123
+ BLOCK_SCAN_WARP_SCANS,
124
+ };
125
+
126
+ //! @rst
127
+ //! The BlockScan class provides :ref:`collective <collective-primitives>` methods for computing a parallel prefix
128
+ //! sum/scan of items partitioned across a CUDA thread block.
129
+ //!
130
+ //! Overview
131
+ //! +++++++++++++++++++++++++++++++++++++++++++++
132
+ //!
133
+ //! - Given a list of input elements and a binary reduction operator, a
134
+ //! `prefix scan <http://en.wikipedia.org/wiki/Prefix_sum>`_ produces an output list where each element is computed
135
+ //! to be the reduction of the elements occurring earlier in the input list. *Prefix sum* connotes a prefix scan with
136
+ //! the addition operator. The term *inclusive* indicates that the *i*\ :sup:`th` output reduction incorporates
137
+ //! the *i*\ :sup:`th` input. The term *exclusive* indicates the *i*\ :sup:`th` input is not incorporated into
138
+ //! the *i*\ :sup:`th` output reduction.
139
+ //! - @rowmajor
140
+ //! - BlockScan can be optionally specialized by algorithm to accommodate different workload profiles:
141
+ //!
142
+ //! #. :cpp:enumerator:`cub::BLOCK_SCAN_RAKING`:
143
+ //! An efficient (high throughput) "raking reduce-then-scan" prefix scan algorithm.
144
+ //! #. :cpp:enumerator:`cub::BLOCK_SCAN_RAKING_MEMOIZE`:
145
+ //! Similar to cub::BLOCK_SCAN_RAKING, but having higher throughput at the expense of additional
146
+ //! register pressure for intermediate storage.
147
+ //! #. :cpp:enumerator:`cub::BLOCK_SCAN_WARP_SCANS`:
148
+ //! A quick (low latency) "tiled warpscans" prefix scan algorithm.
149
+ //!
150
+ //! Performance Considerations
151
+ //! +++++++++++++++++++++++++++++++++++++++++++++
152
+ //!
153
+ //! - @granularity
154
+ //! - Uses special instructions when applicable (e.g., warp ``SHFL``)
155
+ //! - Uses synchronization-free communication between warp lanes when applicable
156
+ //! - Invokes a minimal number of block-wide synchronization barriers (only
157
+ //! one or two depending on algorithm selection)
158
+ //! - Incurs zero bank conflicts for most types
159
+ //! - Computation is slightly more efficient (i.e., having lower instruction overhead) for:
160
+ //!
161
+ //! - Prefix sum variants (vs. generic scan)
162
+ //! - @blocksize
163
+ //!
164
+ //! - See cub::BlockScanAlgorithm for performance details regarding algorithmic alternatives
165
+ //!
166
+ //! A Simple Example
167
+ //! +++++++++++++++++++++++++++++++++++++++++++++
168
+ //!
169
+ //! @blockcollective{BlockScan}
170
+ //!
171
+ //! The code snippet below illustrates an exclusive prefix sum of 512 integer items that
172
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128 threads
173
+ //! where each thread owns 4 consecutive items.
174
+ //!
175
+ //! .. literalinclude:: ../../examples/block/example_block_scan.cu
176
+ //! :language: c++
177
+ //! :dedent:
178
+ //! :start-after: example-begin exclusive-sum-array
179
+ //! :end-before: example-end exclusive-sum-array
180
+ //!
181
+ //! Suppose the set of input ``thread_data`` across the block of threads is
182
+ //! ``{[1,1,1,1], [1,1,1,1], ..., [1,1,1,1]}``.
183
+ //! The corresponding output ``thread_data`` in those threads will be
184
+ //! ``{[0,1,2,3], [4,5,6,7], ..., [508,509,510,511]}``.
185
+ //!
186
+ //! Re-using dynamically allocated shared memory
187
+ //! +++++++++++++++++++++++++++++++++++++++++++++
188
+ //!
189
+ //! The ``block/example_block_reduce_dyn_smem.cu`` example illustrates usage of dynamically allocated shared memory with
190
+ //! BlockReduce and how to re-purpose the same memory region.
191
+ //! This example can be easily adapted to the storage required by BlockScan.
192
+ //!
193
+ //! @endrst
194
+ //!
195
+ //! @tparam T
196
+ //! Data type being scanned
197
+ //!
198
+ //! @tparam BLOCK_DIM_X
199
+ //! The thread block length in threads along the X dimension
200
+ //!
201
+ //! @tparam ALGORITHM
202
+ //! **[optional]** cub::BlockScanAlgorithm enumerator specifying the underlying algorithm to use
203
+ //! (default: cub::BLOCK_SCAN_RAKING)
204
+ //!
205
+ //! @tparam BLOCK_DIM_Y
206
+ //! **[optional]** The thread block length in threads along the Y dimension
207
+ //! (default: 1)
208
+ //!
209
+ //! @tparam BLOCK_DIM_Z
210
+ //! **[optional]** The thread block length in threads along the Z dimension (default: 1)
211
+ //!
212
+ template <typename T,
213
+ int BLOCK_DIM_X,
214
+ BlockScanAlgorithm ALGORITHM = BLOCK_SCAN_RAKING,
215
+ int BLOCK_DIM_Y = 1,
216
+ int BLOCK_DIM_Z = 1>
217
+ class BlockScan
218
+ {
219
+ private:
220
+ /// Compile-time constants derived from the template parameters
221
+ enum
222
+ {
223
+ /// The thread block size in threads (product of the X, Y and Z block dimensions)
224
+ BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
225
+ };
226
+
227
+ /**
228
+ * Ensure the template parameterization meets the requirements of the
229
+ * specified algorithm. Currently, the BLOCK_SCAN_WARP_SCANS policy
230
+ * cannot be used with thread block sizes that are not a multiple of the
231
+ * architectural warp size; such requests fall back to BLOCK_SCAN_RAKING.
232
+ */
233
+ static constexpr BlockScanAlgorithm SAFE_ALGORITHM =
234
+ ((ALGORITHM == BLOCK_SCAN_WARP_SCANS) && (BLOCK_THREADS % detail::warp_threads != 0))
235
+ ? BLOCK_SCAN_RAKING
236
+ : ALGORITHM;
237
+
238
+ using WarpScans = detail::BlockScanWarpScans<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z>; // candidate delegate for BLOCK_SCAN_WARP_SCANS
239
+ using Raking =
240
+ detail::BlockScanRaking<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, (SAFE_ALGORITHM == BLOCK_SCAN_RAKING_MEMOIZE)>; // candidate delegate otherwise; last argument is true only for BLOCK_SCAN_RAKING_MEMOIZE
241
+
242
+ /// Delegate type for the desired algorithm: WarpScans when SAFE_ALGORITHM is BLOCK_SCAN_WARP_SCANS, otherwise Raking
243
+ using InternalBlockScan = ::cuda::std::_If<SAFE_ALGORITHM == BLOCK_SCAN_WARP_SCANS, WarpScans, Raking>;
244
+
245
+ /// Shared memory storage layout type for BlockScan (taken from the selected delegate)
246
+ using _TempStorage = typename InternalBlockScan::TempStorage;
247
+
248
+ /// Shared storage reference (bound by the constructors)
249
+ _TempStorage& temp_storage;
250
+
251
+ /// Linear thread-id (initialized from RowMajorTid in the constructors)
252
+ unsigned int linear_tid;
253
+
254
+ /// Internal storage allocator
255
+ _CCCL_DEVICE _CCCL_FORCEINLINE _TempStorage& PrivateStorage() // provides the storage used by the default constructor
256
+ {
257
+ __shared__ _TempStorage private_storage; // function-local __shared__ object of the delegate's storage type
258
+ return private_storage; // returned by reference; the default constructor binds temp_storage to it
259
+ }
260
+
261
+ public:
262
+ /// @smemstorage{BlockScan}
263
+ struct TempStorage : Uninitialized<_TempStorage> // public storage type; derives from Uninitialized<_TempStorage> and adds no members of its own
264
+ {};
265
+
266
+ //! @name Collective constructors
267
+ //! @{
268
+
269
+ //! @brief Collective constructor using a private static allocation of shared memory as temporary storage.
270
+ _CCCL_DEVICE _CCCL_FORCEINLINE BlockScan() // no caller-provided storage: uses the object returned by PrivateStorage()
271
+ : temp_storage(PrivateStorage()) // bind the storage reference to the private __shared__ object
272
+ , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) // linear thread-id as returned by RowMajorTid for the block dimensions
273
+ {}
274
+
275
+ /**
276
+ * @brief Collective constructor using the specified memory allocation as temporary storage.
277
+ *
278
+ * @param[in] temp_storage
279
+ * Reference to memory allocation having layout type TempStorage
280
+ */
281
+ _CCCL_DEVICE _CCCL_FORCEINLINE BlockScan(TempStorage& temp_storage) // caller supplies the storage to operate on
282
+ : temp_storage(temp_storage.Alias()) // the parameter shadows the member here: the member is bound to the result of the parameter's Alias()
283
+ , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) // linear thread-id as returned by RowMajorTid for the block dimensions
284
+ {}
285
+
286
+ //! @} end member group
287
+ //! @name Exclusive prefix sum operations
288
+ //! @{
289
+
290
+ //! @rst
291
+ //! Computes an exclusive block-wide prefix scan using addition (+) as the scan operator.
292
+ //! Each thread contributes one input element. The value of 0 is applied as the initial value, and is assigned
293
+ //! to ``output`` in *thread*\ :sub:`0`.
294
+ //!
295
+ //! - @identityzero
296
+ //! - @rowmajor
297
+ //! - @smemreuse
298
+ //!
299
+ //! Snippet
300
+ //! +++++++
301
+ //!
302
+ //! The code snippet below illustrates an exclusive prefix sum of 128 integer items that
303
+ //! are partitioned across 128 threads.
304
+ //!
305
+ //! .. literalinclude:: ../../examples/block/example_block_scan.cu
306
+ //! :language: c++
307
+ //! :dedent:
308
+ //! :start-after: example-begin exclusive-sum-single
309
+ //! :end-before: example-end exclusive-sum-single
310
+ //!
311
+ //! Suppose the set of input ``thread_data`` across the block of threads is ``1, 1, ..., 1``.
312
+ //! The corresponding output ``thread_data`` in those threads will be ``0, 1, ..., 127``.
313
+ //!
314
+ //! @endrst
315
+ //!
316
+ //! @param[in] input
317
+ //! Calling thread's input item
318
+ //!
319
+ //! @param[out] output
320
+ //! Calling thread's output item (may be aliased to `input`)
321
+ _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveSum(T input, T& output) // single-item exclusive sum; thin wrapper over ExclusiveScan
322
+ {
323
+ T initial_value{}; // value-initialized T, used as the seed of the scan
324
+
325
+ ExclusiveScan(input, output, initial_value, ::cuda::std::plus<>{}); // forward to the generic exclusive scan with addition as the operator
326
+ }
327
+
328
+ //! @rst
329
+ //! Computes an exclusive block-wide prefix scan using addition (+) as the scan operator.
330
+ //! Each thread contributes one input element.
331
+ //! The value of 0 is applied as the initial value, and is assigned to ``output`` in *thread*\ :sub:`0`.
332
+ //! Also provides every thread with the block-wide ``block_aggregate`` of all inputs.
333
+ //!
334
+ //! - @identityzero
335
+ //! - @rowmajor
336
+ //! - @smemreuse
337
+ //!
338
+ //! Snippet
339
+ //! +++++++
340
+ //!
341
+ //! The code snippet below illustrates an exclusive prefix sum of 128 integer items that
342
+ //! are partitioned across 128 threads.
343
+ //!
344
+ //! .. literalinclude:: ../../examples/block/example_block_scan.cu
345
+ //! :language: c++
346
+ //! :dedent:
347
+ //! :start-after: example-begin exclusive-sum-aggregate
348
+ //! :end-before: example-end exclusive-sum-aggregate
349
+ //!
350
+ //! Suppose the set of input ``thread_data`` across the block of threads is ``1, 1, ..., 1``.
351
+ //! The corresponding output ``thread_data`` in those threads will be ``0, 1, ..., 127``.
352
+ //! Furthermore the value ``128`` will be stored in ``block_aggregate`` for all threads.
353
+ //!
354
+ //! @endrst
355
+ //!
356
+ //! @param[in] input
357
+ //! Calling thread's input item
358
+ //!
359
+ //! @param[out] output
360
+ //! Calling thread's output item (may be aliased to `input`)
361
+ //!
362
+ //! @param[out] block_aggregate
363
+ //! block-wide aggregate reduction of input items
364
+ _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveSum(T input, T& output, T& block_aggregate) // single-item exclusive sum that also reports the block aggregate
365
+ {
366
+ T initial_value{}; // value-initialized T, used as the seed of the scan
367
+
368
+ ExclusiveScan(input, output, initial_value, ::cuda::std::plus<>{}, block_aggregate); // forward to the aggregate-returning exclusive scan with addition
369
+ }
370
+
371
+ //! @rst
372
+ //! Computes an exclusive block-wide prefix scan using addition (+) as the scan operator.
373
+ //! Each thread contributes one input element. Instead of using 0 as the block-wide prefix, the call-back functor
374
+ //! ``block_prefix_callback_op`` is invoked by the first warp in the block, and the value returned by
375
+ //! *lane*\ :sub:`0` in that warp is used as the "seed" value that logically prefixes the thread block's
376
+ //! scan inputs.
377
+ //!
378
+ //! - @identityzero
379
+ //! - The ``block_prefix_callback_op`` functor must implement a member function
380
+ //! ``T operator()(T block_aggregate)``. The functor will be invoked by the first warp of threads in the block,
381
+ //! however only the return value from *lane*\ :sub:`0` is applied as the block-wide prefix. Can be stateful.
382
+ //! - @rowmajor
383
+ //! - @smemreuse
384
+ //!
385
+ //! Snippet
386
+ //! +++++++
387
+ //!
388
+ //! The code snippet below illustrates a single thread block that progressively
389
+ //! computes an exclusive prefix sum over multiple "tiles" of input using a
390
+ //! prefix functor to maintain a running total between block-wide scans. Each tile consists
391
+ //! of 128 integer items that are partitioned across 128 threads.
392
+ //!
393
+ //! .. code-block:: c++
394
+ //!
395
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
396
+ //!
397
+ //! // A stateful callback functor that maintains a running prefix to be applied
398
+ //! // during consecutive scan operations.
399
+ //! struct BlockPrefixCallbackOp
400
+ //! {
401
+ //! // Running prefix
402
+ //! int running_total;
403
+ //!
404
+ //! // Constructor
405
+ //! __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {}
406
+ //!
407
+ //! // Callback operator to be entered by the first warp of threads in the block.
408
+ //! // Thread-0 is responsible for returning a value for seeding the block-wide scan.
409
+ //! __device__ int operator()(int block_aggregate)
410
+ //! {
411
+ //! int old_prefix = running_total;
412
+ //! running_total += block_aggregate;
413
+ //! return old_prefix;
414
+ //! }
415
+ //! };
416
+ //!
417
+ //! __global__ void ExampleKernel(int *d_data, int num_items, ...)
418
+ //! {
419
+ //! // Specialize BlockScan for a 1D block of 128 threads
420
+ //! using BlockScan = cub::BlockScan<int, 128>;
421
+ //!
422
+ //! // Allocate shared memory for BlockScan
423
+ //! __shared__ typename BlockScan::TempStorage temp_storage;
424
+ //!
425
+ //! // Initialize running total
426
+ //! BlockPrefixCallbackOp prefix_op(0);
427
+ //!
428
+ //! // Have the block iterate over segments of items
429
+ //! for (int block_offset = 0; block_offset < num_items; block_offset += 128)
430
+ //! {
431
+ //! // Load a segment of consecutive items that are blocked across threads
432
+ //! int thread_data = d_data[block_offset + threadIdx.x];
433
+ //!
434
+ //! // Collectively compute the block-wide exclusive prefix sum
435
+ //! BlockScan(temp_storage).ExclusiveSum(
436
+ //! thread_data, thread_data, prefix_op);
437
+ //! __syncthreads();
438
+ //!
439
+ //! // Store scanned items to output segment
440
+ //! d_data[block_offset + threadIdx.x] = thread_data;
441
+ //! }
442
+ //! }
443
+ //!
444
+ //! Suppose the input ``d_data`` is ``1, 1, 1, 1, 1, 1, 1, 1, ...``.
445
+ //! The corresponding output for the first segment will be ``0, 1, ..., 127``.
446
+ //! The output for the second segment will be ``128, 129, ..., 255``.
447
+ //!
448
+ //! @endrst
449
+ //!
450
+ //! @tparam BlockPrefixCallbackOp
451
+ //! **[inferred]** Call-back functor type having member `T operator()(T block_aggregate)`
452
+ //!
453
+ //! @param[in] input
454
+ //! Calling thread's input item
455
+ //!
456
+ //! @param[out] output
457
+ //! Calling thread's output item (may be aliased to `input`)
458
+ //!
459
+ //! @param[in,out] block_prefix_callback_op
460
+ //! @rst
461
+ //! *warp*\ :sub:`0` only call-back functor for specifying a block-wide prefix to be applied to
462
+ //! the logical input sequence.
463
+ //! @endrst
464
+ template <typename BlockPrefixCallbackOp>
465
+ _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveSum(T input, T& output, BlockPrefixCallbackOp& block_prefix_callback_op) // single-item exclusive sum seeded through a prefix callback
466
+ {
467
+ ExclusiveScan(input, output, ::cuda::std::plus<>{}, block_prefix_callback_op); // no explicit initial value: forward to the callback overload of ExclusiveScan with addition
468
+ }
469
+
470
+ //! @} end member group
471
+ //! @name Exclusive prefix sum operations (multiple data per thread)
472
+ //! @{
473
+
474
+ //! @rst
475
+ //! Computes an exclusive block-wide prefix scan using addition (+) as the scan operator.
476
+ //! Each thread contributes an array of consecutive input elements.
477
+ //! The value of 0 is applied as the initial value, and is assigned to ``output[0]`` in *thread*\ :sub:`0`.
478
+ //!
479
+ //! - @identityzero
480
+ //! - @blocked
481
+ //! - @granularity
482
+ //! - @smemreuse
483
+ //!
484
+ //! Snippet
485
+ //! +++++++
486
+ //!
487
+ //! The code snippet below illustrates an exclusive prefix sum of 512 integer items that
488
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128 threads
489
+ //! where each thread owns 4 consecutive items.
490
+ //!
491
+ //! .. code-block:: c++
492
+ //!
493
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
494
+ //!
495
+ //! __global__ void ExampleKernel(...)
496
+ //! {
497
+ //! // Specialize BlockScan for a 1D block of 128 threads of type int
498
+ //! using BlockScan = cub::BlockScan<int, 128>;
499
+ //!
500
+ //! // Allocate shared memory for BlockScan
501
+ //! __shared__ typename BlockScan::TempStorage temp_storage;
502
+ //!
503
+ //! // Obtain a segment of consecutive items that are blocked across threads
504
+ //! int thread_data[4];
505
+ //! ...
506
+ //!
507
+ //! // Collectively compute the block-wide exclusive prefix sum
508
+ //! BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data);
509
+ //! }
510
+ //!
511
+ //! Suppose the set of input ``thread_data`` across the block of threads is
512
+ //! ``{ [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }``.
513
+ //! The corresponding output ``thread_data`` in those threads will be
514
+ //! ``{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }``.
515
+ //!
516
+ //! @endrst
517
+ //!
518
+ //! @tparam ITEMS_PER_THREAD
519
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
520
+ //!
521
+ //! @param[in] input
522
+ //! Calling thread's input items
523
+ //!
524
+ //! @param[out] output
525
+ //! Calling thread's output items (may be aliased to `input`)
526
+ template <int ITEMS_PER_THREAD>
527
+ _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveSum(T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD]) // multi-item exclusive sum; thin wrapper over the array ExclusiveScan
528
+ {
529
+ T initial_value{}; // value-initialized T, used as the seed of the scan
530
+
531
+ ExclusiveScan(input, output, initial_value, ::cuda::std::plus<>{}); // forward to the array overload of ExclusiveScan with addition as the operator
532
+ }
533
+
534
+ //! @rst
535
+ //! Computes an exclusive block-wide prefix scan using addition (+) as the scan operator.
536
+ //! Each thread contributes an array of consecutive input elements.
537
+ //! The value of 0 is applied as the initial value, and is assigned to ``output[0]`` in *thread*\ :sub:`0`.
538
+ //! Also provides every thread with the block-wide ``block_aggregate`` of all inputs.
539
+ //!
540
+ //! - @identityzero
541
+ //! - @blocked
542
+ //! - @granularity
543
+ //! - @smemreuse
544
+ //!
545
+ //! Snippet
546
+ //! +++++++
547
+ //!
548
+ //! The code snippet below illustrates an exclusive prefix sum of 512 integer items that are partitioned in
549
+ //! a :ref:`blocked arrangement <flexible-data-arrangement>` across 128 threads where each thread owns
550
+ //! 4 consecutive items.
551
+ //!
552
+ //! .. code-block:: c++
553
+ //!
554
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
555
+ //!
556
+ //! __global__ void ExampleKernel(...)
557
+ //! {
558
+ //! // Specialize BlockScan for a 1D block of 128 threads of type int
559
+ //! using BlockScan = cub::BlockScan<int, 128>;
560
+ //!
561
+ //! // Allocate shared memory for BlockScan
562
+ //! __shared__ typename BlockScan::TempStorage temp_storage;
563
+ //!
564
+ //! // Obtain a segment of consecutive items that are blocked across threads
565
+ //! int thread_data[4];
566
+ //! ...
567
+ //!
568
+ //! // Collectively compute the block-wide exclusive prefix sum
569
+ //! int block_aggregate;
570
+ //! BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data, block_aggregate);
571
+ //! }
572
+ //!
573
+ //! Suppose the set of input ``thread_data`` across the block of threads is
574
+ //! ``{ [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }``.
575
+ //! The corresponding output ``thread_data`` in those threads will be
576
+ //! ``{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }``.
577
+ //! Furthermore the value ``512`` will be stored in ``block_aggregate`` for all threads.
578
+ //!
579
+ //! @endrst
580
+ //!
581
+ //! @tparam ITEMS_PER_THREAD
582
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
583
+ //!
584
+ //! @param[in] input
585
+ //! Calling thread's input items
586
+ //!
587
+ //! @param[out] output
588
+ //! Calling thread's output items (may be aliased to `input`)
589
+ //!
590
+ //! @param[out] block_aggregate
591
+ //! block-wide aggregate reduction of input items
592
+ template <int ITEMS_PER_THREAD>
593
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
594
+ ExclusiveSum(T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], T& block_aggregate)
595
+ {
596
+ // Value-initialized seed for the exclusive sum (the reduction itself happens inside ExclusiveScan)
597
+ T initial_value{};
598
+
599
+ ExclusiveScan(input, output, initial_value, ::cuda::std::plus<>{}, block_aggregate); // forward to the aggregate-returning array overload with addition
600
+ }
601
+
602
+ //! @rst
603
+ //! Computes an exclusive block-wide prefix scan using addition (+) as the scan operator.
604
+ //! Each thread contributes an array of consecutive input elements.
605
+ //! Instead of using 0 as the block-wide prefix, the call-back functor ``block_prefix_callback_op`` is invoked by
606
+ //! the first warp in the block, and the value returned by *lane*\ :sub:`0` in that warp is used as the "seed"
607
+ //! value that logically prefixes the thread block's scan inputs.
608
+ //!
609
+ //! - @identityzero
610
+ //! - The ``block_prefix_callback_op`` functor must implement a member function ``T operator()(T block_aggregate)``.
611
+ //! The functor will be invoked by the first warp of threads in the block, however only the return value from
612
+ //! *lane*\ :sub:`0` is applied as the block-wide prefix. Can be stateful.
613
+ //! - @blocked
614
+ //! - @granularity
615
+ //! - @smemreuse
616
+ //!
617
+ //!
618
+ //! Snippet
619
+ //! +++++++
620
+ //!
621
+ //! The code snippet below illustrates a single thread block that progressively
622
+ //! computes an exclusive prefix sum over multiple "tiles" of input using a
623
+ //! prefix functor to maintain a running total between block-wide scans. Each tile consists
624
+ //! of 512 integer items that are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>`
625
+ //! across 128 threads where each thread owns 4 consecutive items.
626
+ //!
627
+ //! .. literalinclude:: ../../examples/block/example_block_scan.cu
628
+ //! :language: c++
629
+ //! :dedent:
630
+ //! :start-after: example-begin block-prefix-callback-op
631
+ //! :end-before: example-end block-prefix-callback-op
632
+ //!
633
+ //! .. literalinclude:: ../../examples/block/example_block_scan.cu
634
+ //! :language: c++
635
+ //! :dedent:
636
+ //! :start-after: example-begin exclusive-sum-prefix-callback
637
+ //! :end-before: example-end exclusive-sum-prefix-callback
638
+ //!
639
+ //! Suppose the input ``d_data`` is ``1, 1, 1, 1, 1, 1, 1, 1, ...``.
640
+ //! The corresponding output for the first segment will be ``0, 1, 2, 3, ..., 510, 511``.
641
+ //! The output for the second segment will be ``512, 513, 514, 515, ..., 1022, 1023``.
642
+ //!
643
+ //! @endrst
644
+ //!
645
+ //! @tparam ITEMS_PER_THREAD
646
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
647
+ //!
648
+ //! @tparam BlockPrefixCallbackOp
649
+ //! **[inferred]** Call-back functor type having member
650
+ //! `T operator()(T block_aggregate)`
651
+ //!
652
+ //! @param[in] input
653
+ //! Calling thread's input items
654
+ //!
655
+ //! @param[out] output
656
+ //! Calling thread's output items (may be aliased to `input`)
657
+ //!
658
+ //! @param[in,out] block_prefix_callback_op
659
+ //! @rst
660
+ //! *warp*\ :sub:`0` only call-back functor for specifying a block-wide prefix to be applied to
661
+ //! the logical input sequence.
662
+ //! @endrst
663
+ template <int ITEMS_PER_THREAD, typename BlockPrefixCallbackOp>
664
+ _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveSum(
665
+ T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], BlockPrefixCallbackOp& block_prefix_callback_op) // multi-item exclusive sum seeded through a prefix callback
666
+ {
667
+ ExclusiveScan(input, output, ::cuda::std::plus<>{}, block_prefix_callback_op); // no explicit initial value: forward to the callback overload of ExclusiveScan with addition
668
+ }
669
+
670
+ //! @} end member group // Exclusive prefix sums (multiple data per thread)
671
+ //! @name Exclusive prefix scan operations
672
+ //! @{
673
+
674
+ //! @rst
675
+ //! Computes an exclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
676
+ //! Each thread contributes one input element.
677
+ //!
678
+ //! - Supports non-commutative scan operators.
679
+ //! - @rowmajor
680
+ //! - @smemreuse
681
+ //!
682
+ //! Snippet
683
+ //! +++++++
684
+ //!
685
+ //! The code snippet below illustrates an exclusive prefix max scan of 128 integer items that
686
+ //! are partitioned across 128 threads.
687
+ //!
688
+ //! .. code-block:: c++
689
+ //!
690
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
691
+ //!
692
+ //! __global__ void ExampleKernel(...)
693
+ //! {
694
+ //! // Specialize BlockScan for a 1D block of 128 threads of type int
695
+ //! using BlockScan = cub::BlockScan<int, 128>;
696
+ //!
697
+ //! // Allocate shared memory for BlockScan
698
+ //! __shared__ typename BlockScan::TempStorage temp_storage;
699
+ //!
700
+ //! // Obtain input item for each thread
701
+ //! int thread_data;
702
+ //! ...
703
+ //!
704
+ //! // Collectively compute the block-wide exclusive prefix max scan
705
+ //! BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cuda::maximum<>{});
706
+ //! }
707
+ //!
708
+ //! Suppose the set of input ``thread_data`` across the block of threads is ``0, -1, 2, -3, ..., 126, -127``.
709
+ //! The corresponding output ``thread_data`` in those threads will be ``INT_MIN, 0, 0, 2, ..., 124, 126``.
710
+ //!
711
+ //! @endrst
712
+ //!
713
+ //! @tparam ScanOp
714
+ //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
715
+ //!
716
+ //! @param[in] input
717
+ //! Calling thread's input item
718
+ //!
719
+ //! @param[out] output
720
+ //! Calling thread's output item (may be aliased to `input`)
721
+ //!
722
+ //! @param[in] initial_value
723
+ //! @rst
724
+ //! Initial value to seed the exclusive scan (and is assigned to ``output`` in *thread*\ :sub:`0`)
725
+ //! @endrst
726
+ //!
727
+ //! @param[in] scan_op
728
+ //! Binary scan functor
729
+ template <typename ScanOp>
730
+ _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveScan(T input, T& output, T initial_value, ScanOp scan_op) // single-item exclusive scan with an explicit initial value
731
+ {
732
+ InternalBlockScan(temp_storage).ExclusiveScan(input, output, initial_value, scan_op); // delegate to the implementation selected by SAFE_ALGORITHM, constructed over temp_storage
733
+ }
734
+
735
+ //! @rst
736
+ //! Computes an exclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
737
+ //! Each thread contributes one input element.
738
+ //! Also provides every thread with the block-wide ``block_aggregate`` of all inputs.
739
+ //!
740
+ //! - Supports non-commutative scan operators.
741
+ //! - @rowmajor
742
+ //! - @smemreuse
743
+ //!
744
+ //! Snippet
745
+ //! +++++++
746
+ //!
747
+ //! The code snippet below illustrates an exclusive prefix max scan of 128 integer items that
748
+ //! are partitioned across 128 threads.
749
+ //!
750
+ //! .. literalinclude:: ../../examples/block/example_block_scan.cu
751
+ //! :language: c++
752
+ //! :dedent:
753
+ //! :start-after: example-begin exclusive-scan-aggregate
754
+ //! :end-before: example-end exclusive-scan-aggregate
755
+ //!
756
+ //! Suppose the set of input ``thread_data`` across the block of threads is ``0, -1, 2, -3, ..., 126, -127``.
757
+ //! The corresponding output ``thread_data`` in those threads will be ``INT_MIN, 0, 0, 2, ..., 124, 126``.
758
+ //! Furthermore the value ``126`` will be stored in ``block_aggregate`` for all threads.
759
+ //!
760
+ //! .. note::
761
+ //!
762
+ //! ``initial_value`` is not applied to the block-wide aggregate.
763
+ //!
764
+ //! @endrst
765
+ //!
766
+ //! @tparam ScanOp
767
+ //! **[inferred]** Binary scan functor type having member ``T operator()(const T &a, const T &b)``
768
+ //!
769
+ //! @param[in] input
770
+ //! Calling thread's input item
771
+ //!
772
+ //! @param[out] output
773
+ //! Calling thread's output item (may be aliased to ``input``)
774
+ //!
775
+ //! @param[in] initial_value
776
+ //! @rst
777
+ //! Initial value to seed the exclusive scan (and is assigned to ``output`` in *thread*\ :sub:`0`). It is not
778
+ //! taken into account for ``block_aggregate``.
779
+ //!
780
+ //! @endrst
781
+ //!
782
+ //! @param[in] scan_op
783
+ //! Binary scan functor
784
+ //!
785
+ //! @param[out] block_aggregate
786
+ //! block-wide aggregate reduction of input items
787
+ template <typename ScanOp>
788
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
789
+ ExclusiveScan(T input, T& output, T initial_value, ScanOp scan_op, T& block_aggregate) // single-item exclusive scan that also reports the block aggregate
790
+ {
791
+ InternalBlockScan(temp_storage).ExclusiveScan(input, output, initial_value, scan_op, block_aggregate); // delegate to the implementation selected by SAFE_ALGORITHM, constructed over temp_storage
792
+ }
793
+
794
+ //! @rst
795
+ //! Computes an exclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
796
+ //! Each thread contributes one input element. The call-back functor ``block_prefix_callback_op`` is invoked by
797
+ //! the first warp in the block, and the value returned by *lane*\ :sub:`0` in that warp is used as
798
+ //! the "seed" value that logically prefixes the thread block's scan inputs.
799
+ //!
800
+ //! - The ``block_prefix_callback_op`` functor must implement a member function ``T operator()(T block_aggregate)``.
801
+ //! The functor will be invoked by the first warp of threads in the block, however only the return value from
802
+ //! *lane*\ :sub:`0` is applied as the block-wide prefix. Can be stateful.
803
+ //! - Supports non-commutative scan operators.
804
+ //! - @rowmajor
805
+ //! - @smemreuse
806
+ //!
807
+ //! Snippet
808
+ //! +++++++
809
+ //!
810
+ //! The code snippet below illustrates a single thread block that progressively
811
+ //! computes an exclusive prefix max scan over multiple "tiles" of input using a
812
+ //! prefix functor to maintain a running total between block-wide scans.
813
+ //! Each tile consists of 128 integer items that are partitioned across 128 threads.
814
+ //!
815
+ //! .. code-block:: c++
816
+ //!
817
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
818
+ //!
819
+ //! // A stateful callback functor that maintains a running prefix to be applied
820
+ //! // during consecutive scan operations.
821
+ //! struct BlockPrefixCallbackOp
822
+ //! {
823
+ //! // Running prefix
824
+ //! int running_total;
825
+ //!
826
+ //! // Constructor
827
+ //! __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {}
828
+ //!
829
+ //! // Callback operator to be entered by the first warp of threads in the block.
830
+ //! // Thread-0 is responsible for returning a value for seeding the block-wide scan.
831
+ //! __device__ int operator()(int block_aggregate)
832
+ //! {
833
+ //! int old_prefix = running_total;
834
+ //! running_total = (block_aggregate > old_prefix) ? block_aggregate : old_prefix;
835
+ //! return old_prefix;
836
+ //! }
837
+ //! };
838
+ //!
839
+ //! __global__ void ExampleKernel(int *d_data, int num_items, ...)
840
+ //! {
841
+ //! // Specialize BlockScan for a 1D block of 128 threads
842
+ //! using BlockScan = cub::BlockScan<int, 128>;
843
+ //!
844
+ //! // Allocate shared memory for BlockScan
845
+ //! __shared__ typename BlockScan::TempStorage temp_storage;
846
+ //!
847
+ //! // Initialize running total
848
+ //! BlockPrefixCallbackOp prefix_op(INT_MIN);
849
+ //!
850
+ //! // Have the block iterate over segments of items
851
+ //! for (int block_offset = 0; block_offset < num_items; block_offset += 128)
852
+ //! {
853
+ //! // Load a segment of consecutive items that are blocked across threads
854
+ //! int thread_data = d_data[block_offset + threadIdx.x];
855
+ //!
856
+ //! // Collectively compute the block-wide exclusive prefix max scan
857
+ //! BlockScan(temp_storage).ExclusiveScan(
858
+ //! thread_data, thread_data, cuda::maximum<>{}, prefix_op);
859
+ //! __syncthreads();
860
+ //!
861
+ //! // Store scanned items to output segment
862
+ //! d_data[block_offset + threadIdx.x] = thread_data;
863
+ //! }
864
+ //! }
865
+ //!
866
+ //! Suppose the input ``d_data`` is ``0, -1, 2, -3, 4, -5, ...``.
867
+ //! The corresponding output for the first segment will be ``INT_MIN, 0, 0, 2, ..., 124, 126``.
868
+ //! The output for the second segment will be ``126, 128, 128, 130, ..., 252, 254``.
869
+ //!
870
+ //! @endrst
871
+ //!
872
+ //! @tparam ScanOp
873
+ //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
874
+ //!
875
+ //! @tparam BlockPrefixCallbackOp
876
+ //! **[inferred]** Call-back functor type having member `T operator()(T block_aggregate)`
877
+ //!
878
+ //! @param[in] input
879
+ //! Calling thread's input item
880
+ //!
881
+ //! @param[out] output
882
+ //! Calling thread's output item (may be aliased to `input`)
883
+ //!
884
+ //! @param[in] scan_op
885
+ //! Binary scan functor
886
+ //!
887
+ //! @param[in,out] block_prefix_callback_op
888
+ //! @rst
889
+ //! *warp*\ :sub:`0` only call-back functor for specifying a block-wide prefix to be applied to
890
+ //! the logical input sequence.
891
+ //! @endrst
892
+ template <typename ScanOp, typename BlockPrefixCallbackOp>
893
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
894
+ ExclusiveScan(T input, T& output, ScanOp scan_op, BlockPrefixCallbackOp& block_prefix_callback_op) // single-item exclusive scan seeded through a prefix callback (takes no initial_value)
895
+ {
896
+ InternalBlockScan(temp_storage).ExclusiveScan(input, output, scan_op, block_prefix_callback_op); // delegate to the implementation selected by SAFE_ALGORITHM, constructed over temp_storage
897
+ }
898
+
899
+ //! @} end member group // Exclusive prefix scans
900
+ //! @name Exclusive prefix scan operations (multiple data per thread)
901
+ //! @{
902
+
903
+ //! @rst
904
+ //! Computes an exclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
905
+ //! Each thread contributes an array of consecutive input elements.
906
+ //!
907
+ //! - Supports non-commutative scan operators.
908
+ //! - @blocked
909
+ //! - @granularity
910
+ //! - @smemreuse
911
+ //!
912
+ //! Snippet
913
+ //! +++++++
914
+ //!
915
+ //! The code snippet below illustrates an exclusive prefix max scan of 512 integer
916
+ //! items that are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>`
917
+ //! across 128 threads where each thread owns 4 consecutive items.
918
+ //!
919
+ //! .. literalinclude:: ../../examples/block/example_block_scan.cu
920
+ //! :language: c++
921
+ //! :dedent:
922
+ //! :start-after: example-begin exclusive-scan-array
923
+ //! :end-before: example-end exclusive-scan-array
924
+ //!
925
+ //! Suppose the set of input ``thread_data`` across the block of threads is
926
+ //! ``{ [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }``.
927
+ //! The corresponding output ``thread_data`` in those threads will be
928
+ //! ``{ [INT_MIN,0,0,2], [2,4,4,6], ..., [506,508,508,510] }``.
929
+ //!
930
+ //! @endrst
931
+ //!
932
+ //! @tparam ITEMS_PER_THREAD
933
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
934
+ //!
935
+ //! @tparam ScanOp
936
+ //! **[inferred]** Binary scan functor type having member
937
+ //! `T operator()(const T &a, const T &b)`
938
+ //!
939
+ //! @param[in] input
940
+ //! Calling thread's input items
941
+ //!
942
+ //! @param[out] output
943
+ //! Calling thread's output items (may be aliased to `input`)
944
+ //!
945
+ //! @param[in] initial_value
946
+ //! @rst
947
+ //! Initial value to seed the exclusive scan (and is assigned to `output[0]` in *thread*\ :sub:`0`)
948
+ //! @endrst
949
+ //!
950
+ //! @param[in] scan_op
951
+ //! Binary scan functor
952
+ template <int ITEMS_PER_THREAD, typename ScanOp>
953
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
954
+ ExclusiveScan(T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], T initial_value, ScanOp scan_op)
955
+ {
956
+ // Step 1: reduce this thread's consecutive items to a single partial using scan_op
957
+ T thread_prefix = cub::ThreadReduce(input, scan_op);
958
+
959
+ // Step 2: exclusive block-wide scan of the per-thread partials, in place, seeded with initial_value
960
+ ExclusiveScan(thread_prefix, thread_prefix, initial_value, scan_op);
961
+
962
+ // Step 3: exclusive scan of this thread's own items, using the scanned thread_prefix as seed
963
+ detail::ThreadScanExclusive(input, output, scan_op, thread_prefix);
964
+ }
965
+
966
+ //! @rst
967
+ //! Computes an exclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
968
+ //! Each thread contributes an array of consecutive input elements.
969
+ //! Also provides every thread with the block-wide ``block_aggregate`` of all inputs.
970
+ //!
971
+ //! - Supports non-commutative scan operators.
972
+ //! - @blocked
973
+ //! - @granularity
974
+ //! - @smemreuse
975
+ //!
976
+ //! Snippet
977
+ //! +++++++
978
+ //!
979
+ //! The code snippet below illustrates an exclusive prefix max scan of 512 integer items that are partitioned in
980
+ //! a :ref:`blocked arrangement <flexible-data-arrangement>` across 128 threads where each thread owns
981
+ //! 4 consecutive items.
982
+ //!
983
+ //! .. code-block:: c++
984
+ //!
985
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
986
+ //!
987
+ //! __global__ void ExampleKernel(...)
988
+ //! {
989
+ //! // Specialize BlockScan for a 1D block of 128 threads of type int
990
+ //! using BlockScan = cub::BlockScan<int, 128>;
991
+ //!
992
+ //! // Allocate shared memory for BlockScan
993
+ //! __shared__ typename BlockScan::TempStorage temp_storage;
994
+ //!
995
+ //! // Obtain a segment of consecutive items that are blocked across threads
996
+ //! int thread_data[4];
997
+ //! ...
998
+ //!
999
+ //! // Collectively compute the block-wide exclusive prefix max scan
1000
+ //! int block_aggregate;
1001
+ //! BlockScan(temp_storage).ExclusiveScan(
1002
+ //! thread_data, thread_data, INT_MIN, cuda::maximum<>{}, block_aggregate);
+ //! }
1003
+ //!
1004
+ //! Suppose the set of input ``thread_data`` across the block of threads is
1005
+ //! ``{ [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }``.
1006
+ //! The corresponding output ``thread_data`` in those threads will be
1007
+ //! ``{ [INT_MIN,0,0,2], [2,4,4,6], ..., [506,508,508,510] }``.
1008
+ //! Furthermore the value ``510`` will be stored in ``block_aggregate`` for all threads.
1009
+ //!
1010
+ //! .. note::
1011
+ //!
1012
+ //! ``initial_value`` is not applied to the block-wide aggregate.
1013
+ //!
1014
+ //! @endrst
1015
+ //!
1016
+ //! @tparam ITEMS_PER_THREAD
1017
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
1018
+ //!
1019
+ //! @tparam ScanOp
1020
+ //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
1021
+ //!
1022
+ //! @param[in] input
1023
+ //! Calling thread's input items
1024
+ //!
1025
+ //! @param[out] output
1026
+ //! Calling thread's output items (may be aliased to `input`)
1027
+ //!
1028
+ //! @param[in] initial_value
1029
+ //! @rst
1030
+ //! Initial value to seed the exclusive scan (and is assigned to ``output[0]`` in *thread*\ :sub:`0`). It is not taken
1031
+ //! into account for ``block_aggregate``.
1032
+ //! @endrst
1033
+ //!
1034
+ //! @param[in] scan_op
1035
+ //! Binary scan functor
1036
+ //!
1037
+ //! @param[out] block_aggregate
1038
+ //! block-wide aggregate reduction of input items
1039
+ template <int ITEMS_PER_THREAD, typename ScanOp>
1040
+ _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveScan(
1041
+ T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], T initial_value, ScanOp scan_op, T& block_aggregate)
1042
+ { // Three phases: per-thread reduction, block-wide exclusive scan of the partials, per-thread exclusive scan
1043
+ // Phase 1: fold this thread's ITEMS_PER_THREAD consecutive inputs into a single partial with scan_op (in registers)
1044
+ T thread_prefix = cub::ThreadReduce(input, scan_op);
1045
+
1046
+ // Phase 2: exclusive block-wide scan of the per-thread partials via the single-item overload; thread_prefix is overwritten in place with this thread's exclusive prefix (seeded with initial_value) and block_aggregate is filled in by that call
1047
+ ExclusiveScan(thread_prefix, thread_prefix, initial_value, scan_op, block_aggregate);
1048
+
1049
+ // Phase 3: exclusive scan of this thread's own items in registers, using thread_prefix as the seed; writes output
1050
+ detail::ThreadScanExclusive(input, output, scan_op, thread_prefix);
1051
+ }
1052
+
1053
+ //! @rst
1054
+ //! Computes an exclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
1055
+ //! Each thread contributes an array of consecutive input elements.
1056
+ //! The call-back functor ``block_prefix_callback_op`` is invoked by the first warp in the block, and the value
1057
+ //! returned by *lane*\ :sub:`0` in that warp is used as the "seed" value that logically prefixes the thread
1058
+ //! block's scan inputs.
1059
+ //!
1060
+ //! - The ``block_prefix_callback_op`` functor must implement a member function
1061
+ //! ``T operator()(T block_aggregate)``. The functor will be invoked by the
1062
+ //! first warp of threads in the block, however only the return value from
1063
+ //! *lane*\ :sub:`0` is applied as the block-wide prefix. Can be stateful.
1064
+ //! - Supports non-commutative scan operators.
1065
+ //! - @blocked
1066
+ //! - @granularity
1067
+ //! - @smemreuse
1068
+ //!
1069
+ //! Snippet
1070
+ //! +++++++
1071
+ //!
1072
+ //! The code snippet below illustrates a single thread block that progressively
1073
+ //! computes an exclusive prefix max scan over multiple "tiles" of input using a
1074
+ //! prefix functor to maintain a running total between block-wide scans. Each tile consists
1075
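+ //! of 512 integer items that are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128 threads where each thread owns 4 consecutive items.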
1076
+ //!
1077
+ //! .. literalinclude:: ../../examples/block/example_block_scan.cu
1078
+ //! :language: c++
1079
+ //! :dedent:
1080
+ //! :start-after: example-begin block-prefix-callback-max-op
1081
+ //! :end-before: example-end block-prefix-callback-max-op
1082
+ //!
1083
+ //! .. literalinclude:: ../../examples/block/example_block_scan.cu
1084
+ //! :language: c++
1085
+ //! :dedent:
1086
+ //! :start-after: example-begin exclusive-scan-prefix-callback
1087
+ //! :end-before: example-end exclusive-scan-prefix-callback
1088
+ //!
1089
+ //! Suppose the input ``d_data`` is ``0, -1, 2, -3, 4, -5, ...``.
1090
+ //! The corresponding output for the first segment will be
1091
+ //! ``INT_MIN, 0, 0, 2, 2, 4, ..., 508, 510``.
1092
+ //! The output for the second segment will be
1093
+ //! ``510, 512, 512, 514, 514, 516, ..., 1020, 1022``.
1094
+ //!
1095
+ //! @endrst
1096
+ //!
1097
+ //! @tparam ITEMS_PER_THREAD
1098
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
1099
+ //!
1100
+ //! @tparam ScanOp
1101
+ //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
1102
+ //!
1103
+ //! @tparam BlockPrefixCallbackOp
1104
+ //! **[inferred]** Call-back functor type having member `T operator()(T block_aggregate)`
1105
+ //!
1106
+ //! @param[in] input
1107
+ //! Calling thread's input items
1108
+ //!
1109
+ //! @param[out] output
1110
+ //! Calling thread's output items (may be aliased to `input`)
1111
+ //!
1112
+ //! @param[in] scan_op
1113
+ //! Binary scan functor
1114
+ //!
1115
+ //! @param[in,out] block_prefix_callback_op
1116
+ //! @rst
1117
+ //! *warp*\ :sub:`0` only call-back functor for specifying a block-wide prefix to be applied to
1118
+ //! the logical input sequence.
1119
+ //! @endrst
1120
+ template <int ITEMS_PER_THREAD, typename ScanOp, typename BlockPrefixCallbackOp>
1121
+ _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveScan(
1122
+ T (&input)[ITEMS_PER_THREAD],
1123
+ T (&output)[ITEMS_PER_THREAD],
1124
+ ScanOp scan_op,
1125
+ BlockPrefixCallbackOp& block_prefix_callback_op) // taken by non-const reference, so a stateful functor can carry state across calls
1126
+ { // Three phases: per-thread reduction, block-wide exclusive scan of the partials, per-thread exclusive scan
1127
+ // Phase 1: fold this thread's ITEMS_PER_THREAD consecutive inputs into a single partial with scan_op (in registers)
1128
+ T thread_prefix = cub::ThreadReduce(input, scan_op);
1129
+
1130
+ // Phase 2: exclusive block-wide scan of the per-thread partials via the single-item callback overload; thread_prefix is overwritten in place, and the block-wide seed comes from block_prefix_callback_op
1131
+ ExclusiveScan(thread_prefix, thread_prefix, scan_op, block_prefix_callback_op);
1132
+
1133
+ // Phase 3: exclusive scan of this thread's own items in registers, using thread_prefix as the seed; writes output
1134
+ detail::ThreadScanExclusive(input, output, scan_op, thread_prefix);
1135
+ }
1136
+
1137
+ //! @} end member group
1138
+ #ifndef _CCCL_DOXYGEN_INVOKED // Do not document no-initial-value scans
1139
+
1140
+ //! @name Exclusive prefix scan operations (no initial value, single datum per thread)
1141
+ //! @{
1142
+
1143
+ //! @rst
1144
+ //! Computes an exclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
1145
+ //! Each thread contributes one input element.
1146
+ //! With no initial value, the output computed for *thread*\ :sub:`0` is undefined.
1147
+ //!
1148
+ //! - Supports non-commutative scan operators.
1149
+ //! - @rowmajor
1150
+ //! - @smemreuse
1151
+ //!
1152
+ //! @endrst
1153
+ //!
1154
+ //! @tparam ScanOp
1155
+ //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
1156
+ //!
1157
+ //! @param[in] input
1158
+ //! Calling thread's input item
1159
+ //!
1160
+ //! @param[out] output
1161
+ //! Calling thread's output item (may be aliased to `input`)
1162
+ //!
1163
+ //! @param[in] scan_op
1164
+ //! Binary scan functor
1165
+ template <typename ScanOp>
1166
+ _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveScan(T input, T& output, ScanOp scan_op) // single item per thread, no initial value
1167
+ {
1168
+ InternalBlockScan(temp_storage).ExclusiveScan(input, output, scan_op); // thin wrapper: builds an InternalBlockScan over temp_storage and forwards all arguments unchanged
1169
+ }
1170
+
1171
+ //! @rst
1172
+ //! Computes an exclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
1173
+ //! Each thread contributes one input element. Also provides every thread with the block-wide
1174
+ //! ``block_aggregate`` of all inputs. With no initial value, the output computed for
1175
+ //! *thread*\ :sub:`0` is undefined.
1176
+ //!
1177
+ //! - Supports non-commutative scan operators.
1178
+ //! - @rowmajor
1179
+ //! - @smemreuse
1180
+ //!
1181
+ //! @endrst
1182
+ //!
1183
+ //! @tparam ScanOp
1184
+ //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
1185
+ //!
1186
+ //! @param[in] input
1187
+ //! Calling thread's input item
1188
+ //!
1189
+ //! @param[out] output
1190
+ //! Calling thread's output item (may be aliased to `input`)
1191
+ //!
1192
+ //! @param[in] scan_op
1193
+ //! Binary scan functor
1194
+ //!
1195
+ //! @param[out] block_aggregate
1196
+ //! block-wide aggregate reduction of input items
1197
+ template <typename ScanOp>
1198
+ _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveScan(T input, T& output, ScanOp scan_op, T& block_aggregate) // single item per thread, no initial value, also reports the aggregate
1199
+ {
1200
+ InternalBlockScan(temp_storage).ExclusiveScan(input, output, scan_op, block_aggregate); // thin wrapper: builds an InternalBlockScan over temp_storage and forwards all arguments, including the block_aggregate out-reference
1201
+ }
1202
+
1203
+ //! @} end member group // Exclusive prefix scans (no initial value, single datum per thread)
1204
+ //! @name Exclusive prefix scan operations (no initial value, multiple data per thread)
1205
+ //! @{
1206
+
1207
+ //! @rst
1208
+ //! Computes an exclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
1209
+ //! Each thread contributes an array of consecutive input elements. With no initial value, the
1210
+ //! output computed for *thread*\ :sub:`0` is undefined.
1211
+ //!
1212
+ //! - Supports non-commutative scan operators.
1213
+ //! - @blocked
1214
+ //! - @granularity
1215
+ //! - @smemreuse
1216
+ //!
1217
+ //! @endrst
1218
+ //!
1219
+ //! @tparam ITEMS_PER_THREAD
1220
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
1221
+ //!
1222
+ //! @tparam ScanOp
1223
+ //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
1224
+ //!
1225
+ //! @param[in] input
1226
+ //! Calling thread's input items
1227
+ //!
1228
+ //! @param[out] output
1229
+ //! Calling thread's output items (may be aliased to `input`)
1230
+ //!
1231
+ //! @param[in] scan_op
1232
+ //! Binary scan functor
1233
+ template <int ITEMS_PER_THREAD, typename ScanOp>
1234
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
1235
+ ExclusiveScan(T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], ScanOp scan_op)
1236
+ { // Three phases: per-thread reduction, block-wide exclusive scan of the partials (no initial value), per-thread exclusive scan
1237
+ // Phase 1: fold this thread's ITEMS_PER_THREAD consecutive inputs into a single partial with scan_op (in registers)
1238
+ T thread_partial = cub::ThreadReduce(input, scan_op);
1239
+
1240
+ // Phase 2: exclusive block-wide scan of the per-thread partials via the single-item, no-initial-value overload; thread_partial is overwritten in place
1241
+ ExclusiveScan(thread_partial, thread_partial, scan_op);
1242
+
1243
+ // Phase 3: exclusive scan of this thread's own items in registers with thread_partial as prefix. NOTE(review): the trailing (linear_tid != 0) argument presumably tells the helper not to apply the prefix in the thread with linear_tid == 0, whose block-scan result is undefined without an initial value -- confirm against detail::ThreadScanExclusive
1244
+ detail::ThreadScanExclusive(input, output, scan_op, thread_partial, (linear_tid != 0));
1245
+ }
1246
+
1247
+ //! @rst
1248
+ //! Computes an exclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
1249
+ //! Each thread contributes an array of consecutive input elements. Also provides every thread
1250
+ //! with the block-wide ``block_aggregate`` of all inputs.
1251
+ //! With no initial value, the output computed for *thread*\ :sub:`0` is undefined.
1252
+ //!
1253
+ //! - Supports non-commutative scan operators.
1254
+ //! - @blocked
1255
+ //! - @granularity
1256
+ //! - @smemreuse
1257
+ //!
1258
+ //! @endrst
1259
+ //!
1260
+ //! @tparam ITEMS_PER_THREAD
1261
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
1262
+ //!
1263
+ //! @tparam ScanOp
1264
+ //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
1265
+ //!
1266
+ //! @param[in] input
1267
+ //! Calling thread's input items
1268
+ //!
1269
+ //! @param[out] output
1270
+ //! Calling thread's output items (may be aliased to `input`)
1271
+ //!
1272
+ //! @param[in] scan_op
1273
+ //! Binary scan functor
1274
+ //!
1275
+ //! @param[out] block_aggregate
1276
+ //! block-wide aggregate reduction of input items
1277
+ template <int ITEMS_PER_THREAD, typename ScanOp>
1278
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
1279
+ ExclusiveScan(T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], ScanOp scan_op, T& block_aggregate)
1280
+ { // Three phases: per-thread reduction, block-wide exclusive scan of the partials (no initial value), per-thread exclusive scan
1281
+ // Phase 1: fold this thread's ITEMS_PER_THREAD consecutive inputs into a single partial with scan_op (in registers)
1282
+ T thread_partial = cub::ThreadReduce(input, scan_op);
1283
+
1284
+ // Phase 2: exclusive block-wide scan of the per-thread partials via the single-item, no-initial-value overload; thread_partial is overwritten in place and block_aggregate is filled in by that call
1285
+ ExclusiveScan(thread_partial, thread_partial, scan_op, block_aggregate);
1286
+
1287
+ // Phase 3: exclusive scan of this thread's own items in registers with thread_partial as prefix. NOTE(review): the trailing (linear_tid != 0) argument presumably tells the helper not to apply the prefix in the thread with linear_tid == 0, whose block-scan result is undefined without an initial value -- confirm against detail::ThreadScanExclusive
1288
+ detail::ThreadScanExclusive(input, output, scan_op, thread_partial, (linear_tid != 0));
1289
+ }
1290
+
1291
+ //! @} end member group // Exclusive prefix scans (no initial value, multiple data per thread)
1292
+ #endif // _CCCL_DOXYGEN_INVOKED // Do not document no-initial-value scans
1293
+
1294
+ //! @name Inclusive prefix sum operations
1295
+ //! @{
1296
+
1297
+ //! @rst
1298
+ //! Computes an inclusive block-wide prefix scan using addition (+)
1299
+ //! as the scan operator. Each thread contributes one input element.
1300
+ //!
1301
+ //! - @rowmajor
1302
+ //! - @smemreuse
1303
+ //!
1304
+ //! Snippet
1305
+ //! +++++++
1306
+ //!
1307
+ //! The code snippet below illustrates an inclusive prefix sum of 128 integer items that
1308
+ //! are partitioned across 128 threads.
1309
+ //!
1310
+ //! .. literalinclude:: ../../examples/block/example_block_scan.cu
1311
+ //! :language: c++
1312
+ //! :dedent:
1313
+ //! :start-after: example-begin inclusive-sum-single
1314
+ //! :end-before: example-end inclusive-sum-single
1315
+ //!
1316
+ //! Suppose the set of input ``thread_data`` across the block of threads is ``1, 1, ..., 1``.
1317
+ //! The corresponding output ``thread_data`` in those threads will be ``1, 2, ..., 128``.
1318
+ //!
1319
+ //! @endrst
1320
+ //!
1321
+ //! @param[in] input
1322
+ //! Calling thread's input item
1323
+ //!
1324
+ //! @param[out] output
1325
+ //! Calling thread's output item (may be aliased to `input`)
1326
+ _CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveSum(T input, T& output) // single item per thread
1327
+ {
1328
+ InclusiveScan(input, output, ::cuda::std::plus<>{}); // inclusive sum expressed as the generic InclusiveScan with ::cuda::std::plus<> as the scan operator
1329
+ }
1330
+
1331
+ //! @rst
1332
+ //! Computes an inclusive block-wide prefix scan using addition (+) as the scan operator.
1333
+ //! Each thread contributes one input element.
1334
+ //! Also provides every thread with the block-wide ``block_aggregate`` of all inputs.
1335
+ //!
1336
+ //! - @rowmajor
1337
+ //! - @smemreuse
1338
+ //!
1339
+ //! Snippet
1340
+ //! +++++++
1341
+ //!
1342
+ //! The code snippet below illustrates an inclusive prefix sum of 128 integer items that
1343
+ //! are partitioned across 128 threads.
1344
+ //!
1345
+ //! .. code-block:: c++
1346
+ //!
1347
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
1348
+ //!
1349
+ //! __global__ void ExampleKernel(...)
1350
+ //! {
1351
+ //! // Specialize BlockScan for a 1D block of 128 threads of type int
1352
+ //! using BlockScan = cub::BlockScan<int, 128>;
1353
+ //!
1354
+ //! // Allocate shared memory for BlockScan
1355
+ //! __shared__ typename BlockScan::TempStorage temp_storage;
1356
+ //!
1357
+ //! // Obtain input item for each thread
1358
+ //! int thread_data;
1359
+ //! ...
1360
+ //!
1361
+ //! // Collectively compute the block-wide inclusive prefix sum
1362
+ //! int block_aggregate;
1363
+ //! BlockScan(temp_storage).InclusiveSum(thread_data, thread_data, block_aggregate);
1364
+ //!
1365
+ //! Suppose the set of input ``thread_data`` across the block of threads is ``1, 1, ..., 1``.
1366
+ //! The corresponding output ``thread_data`` in those threads will be ``1, 2, ..., 128``.
1367
+ //! Furthermore the value ``128`` will be stored in ``block_aggregate`` for all threads.
1368
+ //!
1369
+ //! @endrst
1370
+ //!
1371
+ //! @param[in] input
1372
+ //! Calling thread's input item
1373
+ //!
1374
+ //! @param[out] output
1375
+ //! Calling thread's output item (may be aliased to `input`)
1376
+ //!
1377
+ //! @param[out] block_aggregate
1378
+ //! block-wide aggregate reduction of input items
1379
+ _CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveSum(T input, T& output, T& block_aggregate) // single item per thread, also reports the aggregate
1380
+ {
1381
+ InclusiveScan(input, output, ::cuda::std::plus<>{}, block_aggregate); // inclusive sum expressed as the generic InclusiveScan with ::cuda::std::plus<>; block_aggregate is forwarded as the out-reference
1382
+ }
1383
+
1384
+ //! @rst
1385
+ //! Computes an inclusive block-wide prefix scan using addition (+) as the scan operator.
1386
+ //! Each thread contributes one input element. Instead of using 0 as the block-wide prefix, the call-back functor
1387
+ //! ``block_prefix_callback_op`` is invoked by the first warp in the block, and the value returned by
1388
+ //! *lane*\ :sub:`0` in that warp is used as the "seed" value that logically prefixes the thread block's
1389
+ //! scan inputs.
1390
+ //!
1391
+ //! - The ``block_prefix_callback_op`` functor must implement a member function
1392
+ //! ``T operator()(T block_aggregate)``. The functor will be invoked by the first warp of threads in the block,
1393
+ //! however only the return value from *lane*\ :sub:`0` is applied as the block-wide prefix. Can be stateful.
1394
+ //! - @rowmajor
1395
+ //! - @smemreuse
1396
+ //!
1397
+ //! Snippet
1398
+ //! +++++++
1399
+ //!
1400
+ //! The code snippet below illustrates a single thread block that progressively
1401
+ //! computes an inclusive prefix sum over multiple "tiles" of input using a
1402
+ //! prefix functor to maintain a running total between block-wide scans.
1403
+ //! Each tile consists of 128 integer items that are partitioned across 128 threads.
1404
+ //!
1405
+ //! .. code-block:: c++
1406
+ //!
1407
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
1408
+ //!
1409
+ //! // A stateful callback functor that maintains a running prefix to be applied
1410
+ //! // during consecutive scan operations.
1411
+ //! struct BlockPrefixCallbackOp
1412
+ //! {
1413
+ //! // Running prefix
1414
+ //! int running_total;
1415
+ //!
1416
+ //! // Constructor
1417
+ //! __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {}
1418
+ //!
1419
+ //! // Callback operator to be entered by the first warp of threads in the block.
1420
+ //! // Thread-0 is responsible for returning a value for seeding the block-wide scan.
1421
+ //! __device__ int operator()(int block_aggregate)
1422
+ //! {
1423
+ //! int old_prefix = running_total;
1424
+ //! running_total += block_aggregate;
1425
+ //! return old_prefix;
1426
+ //! }
1427
+ //! };
1428
+ //!
1429
+ //! __global__ void ExampleKernel(int *d_data, int num_items, ...)
1430
+ //! {
1431
+ //! // Specialize BlockScan for a 1D block of 128 threads
1432
+ //! using BlockScan = cub::BlockScan<int, 128>;
1433
+ //!
1434
+ //! // Allocate shared memory for BlockScan
1435
+ //! __shared__ typename BlockScan::TempStorage temp_storage;
1436
+ //!
1437
+ //! // Initialize running total
1438
+ //! BlockPrefixCallbackOp prefix_op(0);
1439
+ //!
1440
+ //! // Have the block iterate over segments of items
1441
+ //! for (int block_offset = 0; block_offset < num_items; block_offset += 128)
1442
+ //! {
1443
+ //! // Load a segment of consecutive items that are blocked across threads
1444
+ //! int thread_data = d_data[block_offset + threadIdx.x];
1445
+ //!
1446
+ //! // Collectively compute the block-wide inclusive prefix sum
1447
+ //! BlockScan(temp_storage).InclusiveSum(
1448
+ //! thread_data, thread_data, prefix_op);
1449
+ //! __syncthreads();
1450
+ //!
1451
+ //! // Store scanned items to output segment
1452
+ //! d_data[block_offset + threadIdx.x] = thread_data;
1453
+ //! }
1454
+ //!
1455
+ //! Suppose the input ``d_data`` is ``1, 1, 1, 1, 1, 1, 1, 1, ...``.
1456
+ //! The corresponding output for the first segment will be ``1, 2, ..., 128``.
1457
+ //! The output for the second segment will be ``129, 130, ..., 256``.
1458
+ //!
1459
+ //! @endrst
1460
+ //!
1461
+ //! @tparam BlockPrefixCallbackOp
1462
+ //! **[inferred]** Call-back functor type having member `T operator()(T block_aggregate)`
1463
+ //!
1464
+ //! @param[in] input
1465
+ //! Calling thread's input item
1466
+ //!
1467
+ //! @param[out] output
1468
+ //! Calling thread's output item (may be aliased to `input`)
1469
+ //!
1470
+ //! @param[in,out] block_prefix_callback_op
1471
+ //! @rst
1472
+ //! *warp*\ :sub:`0` only call-back functor for specifying a block-wide prefix to be applied
1473
+ //! to the logical input sequence.
1474
+ //! @endrst
1475
+ template <typename BlockPrefixCallbackOp>
1476
+ _CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveSum(T input, T& output, BlockPrefixCallbackOp& block_prefix_callback_op) // single item per thread; callback taken by non-const reference so it may be stateful
1477
+ {
1478
+ InclusiveScan(input, output, ::cuda::std::plus<>{}, block_prefix_callback_op); // inclusive sum expressed as the generic InclusiveScan with ::cuda::std::plus<>; the callback is forwarded unchanged
1479
+ }
1480
+
1481
+ //! @} end member group
1482
+ //! @name Inclusive prefix sum operations (multiple data per thread)
1483
+ //! @{
1484
+
1485
+ //! @rst
1486
+ //! Computes an inclusive block-wide prefix scan using addition (+) as the scan operator.
1487
+ //! Each thread contributes an array of consecutive input elements.
1488
+ //!
1489
+ //! - @blocked
1490
+ //! - @granularity
1491
+ //! - @smemreuse
1492
+ //!
1493
+ //! Snippet
1494
+ //! +++++++
1495
+ //!
1496
+ //! The code snippet below illustrates an inclusive prefix sum of 512 integer items that
1497
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128 threads
1498
+ //! where each thread owns 4 consecutive items.
1499
+ //!
1500
+ //! .. literalinclude:: ../../examples/block/example_block_scan.cu
1501
+ //! :language: c++
1502
+ //! :dedent:
1503
+ //! :start-after: example-begin inclusive-sum-array
1504
+ //! :end-before: example-end inclusive-sum-array
1505
+ //!
1506
+ //! Suppose the set of input ``thread_data`` across the block of threads is
1507
+ //! ``{ [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }``. The corresponding output
1508
+ //! ``thread_data`` in those threads will be ``{ [1,2,3,4], [5,6,7,8], ..., [509,510,511,512] }``.
1509
+ //!
1510
+ //! @endrst
1511
+ //!
1512
+ //! @tparam ITEMS_PER_THREAD
1513
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
1514
+ //!
1515
+ //! @param[in] input
1516
+ //! Calling thread's input items
1517
+ //!
1518
+ //! @param[out] output
1519
+ //! Calling thread's output items (may be aliased to `input`)
1520
+ template <int ITEMS_PER_THREAD>
1521
+ _CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveSum(T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD])
1522
+ {
1523
+ if (ITEMS_PER_THREAD == 1) // condition depends only on the template parameter
1524
+ {
1525
+ InclusiveSum(input[0], output[0]); // one item per thread: use the single-item overload directly
1526
+ }
1527
+ else
1528
+ {
1529
+ // Phase 1: sum this thread's ITEMS_PER_THREAD consecutive inputs into a single partial (in registers)
1530
+ ::cuda::std::plus<> scan_op;
1531
+ T thread_prefix = cub::ThreadReduce(input, scan_op);
1532
+
1533
+ // Phase 2: exclusive block-wide sum of the per-thread partials via the single-item ExclusiveSum; thread_prefix is overwritten in place
1534
+ ExclusiveSum(thread_prefix, thread_prefix);
1535
+
1536
+ // Phase 3: inclusive scan of this thread's own items in registers with thread_prefix as seed. NOTE(review): the trailing (linear_tid != 0) argument presumably tells the helper not to apply the prefix in the thread with linear_tid == 0 -- confirm against detail::ThreadScanInclusive
1537
+ detail::ThreadScanInclusive(input, output, scan_op, thread_prefix, (linear_tid != 0));
1538
+ }
1539
+ }
1540
+
1541
+ //! @rst
1542
+ //! Computes an inclusive block-wide prefix scan using addition (+) as the scan operator.
1543
+ //! Each thread contributes an array of consecutive input elements.
1544
+ //! Also provides every thread with the block-wide ``block_aggregate`` of all inputs.
1545
+ //!
1546
+ //! - @blocked
1547
+ //! - @granularity
1548
+ //! - @smemreuse
1549
+ //!
1550
+ //! Snippet
1551
+ //! +++++++
1552
+ //!
1553
+ //! The code snippet below illustrates an inclusive prefix sum of 512 integer items that
1554
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128 threads
1555
+ //! where each thread owns 4 consecutive items.
1556
+ //!
1557
+ //! .. code-block:: c++
1558
+ //!
1559
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
1560
+ //!
1561
+ //! __global__ void ExampleKernel(...)
1562
+ //! {
1563
+ //! // Specialize BlockScan for a 1D block of 128 threads of type int
1564
+ //! using BlockScan = cub::BlockScan<int, 128>;
1565
+ //!
1566
+ //! // Allocate shared memory for BlockScan
1567
+ //! __shared__ typename BlockScan::TempStorage temp_storage;
1568
+ //!
1569
+ //! // Obtain a segment of consecutive items that are blocked across threads
1570
+ //! int thread_data[4];
1571
+ //! ...
1572
+ //!
1573
+ //! // Collectively compute the block-wide inclusive prefix sum
1574
+ //! int block_aggregate;
1575
+ //! BlockScan(temp_storage).InclusiveSum(thread_data, thread_data, block_aggregate);
1576
+ //!
1577
+ //! Suppose the set of input ``thread_data`` across the block of threads is
1578
+ //! ``{ [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }``. The
1579
+ //! corresponding output ``thread_data`` in those threads will be
1580
+ //! ``{ [1,2,3,4], [5,6,7,8], ..., [509,510,511,512] }``.
1581
+ //! Furthermore the value ``512`` will be stored in ``block_aggregate`` for all threads.
1582
+ //!
1583
+ //! @endrst
1584
+ //!
1585
+ //! @tparam ITEMS_PER_THREAD
1586
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
1587
+ //!
1588
+ //! @param[in] input
1589
+ //! Calling thread's input items
1590
+ //!
1591
+ //! @param[out] output
1592
+ //! Calling thread's output items (may be aliased to `input`)
1593
+ //!
1594
+ //! @param[out] block_aggregate
1595
+ //! block-wide aggregate reduction of input items
1596
+ template <int ITEMS_PER_THREAD>
1597
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
1598
+ InclusiveSum(T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], T& block_aggregate)
1599
+ {
1600
+ if (ITEMS_PER_THREAD == 1) // condition depends only on the template parameter
1601
+ {
1602
+ InclusiveSum(input[0], output[0], block_aggregate); // one item per thread: use the single-item overload directly
1603
+ }
1604
+ else
1605
+ {
1606
+ // Phase 1: sum this thread's ITEMS_PER_THREAD consecutive inputs into a single partial (in registers)
1607
+ ::cuda::std::plus<> scan_op;
1608
+ T thread_prefix = cub::ThreadReduce(input, scan_op);
1609
+
1610
+ // Phase 2: exclusive block-wide sum of the per-thread partials via the single-item ExclusiveSum; thread_prefix is overwritten in place and block_aggregate is filled in by that call
1611
+ ExclusiveSum(thread_prefix, thread_prefix, block_aggregate);
1612
+
1613
+ // Phase 3: inclusive scan of this thread's own items in registers with thread_prefix as seed. NOTE(review): the trailing (linear_tid != 0) argument presumably tells the helper not to apply the prefix in the thread with linear_tid == 0 -- confirm against detail::ThreadScanInclusive
1614
+ detail::ThreadScanInclusive(input, output, scan_op, thread_prefix, (linear_tid != 0));
1615
+ }
1616
+ }
1617
+
1618
+ //! @rst
1619
+ //! Computes an inclusive block-wide prefix scan using addition (+) as the scan operator.
1620
+ //! Each thread contributes an array of consecutive input elements.
1621
+ //! Instead of using 0 as the block-wide prefix, the call-back functor ``block_prefix_callback_op`` is invoked by
1622
+ //! the first warp in the block, and the value returned by *lane*\ :sub:`0` in that warp is used as the "seed"
1623
+ //! value that logically prefixes the thread block's scan inputs.
1624
+ //!
1625
+ //! - The ``block_prefix_callback_op`` functor must implement a member function
1626
+ //! ``T operator()(T block_aggregate)``. The functor will be invoked by the first warp of threads in the block,
1627
+ //! however only the return value from *lane*\ :sub:`0` is applied as the block-wide prefix. Can be stateful.
1628
+ //! - @blocked
1629
+ //! - @granularity
1630
+ //! - @smemreuse
1631
+ //!
1632
+ //! Snippet
1633
+ //! +++++++
1634
+ //!
1635
+ //! The code snippet below illustrates a single thread block that progressively
1636
+ //! computes an inclusive prefix sum over multiple "tiles" of input using a
1637
+ //! prefix functor to maintain a running total between block-wide scans. Each tile consists
1638
+ //! of 512 integer items that are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>`
1639
+ //! across 128 threads where each thread owns 4 consecutive items.
1640
+ //!
1641
+ //! .. literalinclude:: ../../examples/block/example_block_scan.cu
1642
+ //! :language: c++
1643
+ //! :dedent:
1644
+ //! :start-after: example-begin block-prefix-callback-op
1645
+ //! :end-before: example-end block-prefix-callback-op
1646
+ //!
1647
+ //! .. literalinclude:: ../../examples/block/example_block_scan.cu
1648
+ //! :language: c++
1649
+ //! :dedent:
1650
+ //! :start-after: example-begin inclusive-scan-prefix-callback
1651
+ //! :end-before: example-end inclusive-scan-prefix-callback
1652
+ //!
1653
+ //! Suppose the input ``d_data`` is ``1, 1, 1, 1, 1, 1, 1, 1, ...``.
1654
+ //! The corresponding output for the first segment will be
1655
+ //! ``1, 2, 3, 4, ..., 511, 512``. The output for the second segment will be
1656
+ //! ``513, 514, 515, 516, ..., 1023, 1024``.
1657
+ //!
1658
+ //! @endrst
1659
+ //!
1660
+ //! @tparam ITEMS_PER_THREAD
1661
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
1662
+ //!
1663
+ //! @tparam BlockPrefixCallbackOp
1664
+ //! **[inferred]** Call-back functor type having member `T operator()(T block_aggregate)`
1665
+ //!
1666
+ //! @param[in] input
1667
+ //! Calling thread's input items
1668
+ //!
1669
+ //! @param[out] output
1670
+ //! Calling thread's output items (may be aliased to `input`)
1671
+ //!
1672
+ //! @param[in,out] block_prefix_callback_op
1673
+ //! @rst
1674
+ //! *warp*\ :sub:`0` only call-back functor for specifying a block-wide prefix to be applied to the
1675
+ //! logical input sequence.
1676
+ //! @endrst
1677
+ template <int ITEMS_PER_THREAD, typename BlockPrefixCallbackOp>
1678
+ _CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveSum(
1679
+ T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], BlockPrefixCallbackOp& block_prefix_callback_op)
1680
+ {
1681
+ if (ITEMS_PER_THREAD == 1) // condition depends only on the template parameter
1682
+ {
1683
+ InclusiveSum(input[0], output[0], block_prefix_callback_op); // one item per thread: use the single-item callback overload directly
1684
+ }
1685
+ else
1686
+ {
1687
+ // Phase 1: sum this thread's ITEMS_PER_THREAD consecutive inputs into a single partial (in registers)
1688
+ ::cuda::std::plus<> scan_op;
1689
+ T thread_prefix = cub::ThreadReduce(input, scan_op);
1690
+
1691
+ // Phase 2: exclusive block-wide sum of the per-thread partials via the single-item callback overload of ExclusiveSum; thread_prefix is overwritten in place, and the block-wide seed comes from block_prefix_callback_op
1692
+ ExclusiveSum(thread_prefix, thread_prefix, block_prefix_callback_op);
1693
+
1694
+ // Phase 3: inclusive scan of this thread's own items in registers with thread_prefix as seed; unlike the callback-free array overloads, no (linear_tid != 0) flag is passed here
1695
+ detail::ThreadScanInclusive(input, output, scan_op, thread_prefix);
1696
+ }
1697
+ }
1698
+
1699
+ //! @} end member group
1700
+ //! @name Inclusive prefix scan operations
1701
+ //! @{
1702
+
1703
+ //! @rst
1704
+ //! Computes an inclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
1705
+ //! Each thread contributes one input element.
1706
+ //!
1707
+ //! - Supports non-commutative scan operators.
1708
+ //! - @rowmajor
1709
+ //! - @smemreuse
1710
+ //!
1711
+ //! Snippet
1712
+ //! +++++++
1713
+ //!
1714
+ //! The code snippet below illustrates an inclusive prefix max scan of 128 integer items that
1715
+ //! are partitioned across 128 threads.
1716
+ //!
1717
+ //! .. code-block:: c++
1718
+ //!
1719
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
1720
+ //!
1721
+ //! __global__ void ExampleKernel(...)
1722
+ //! {
1723
+ //! // Specialize BlockScan for a 1D block of 128 threads of type int
1724
+ //! using BlockScan = cub::BlockScan<int, 128>;
1725
+ //!
1726
+ //! // Allocate shared memory for BlockScan
1727
+ //! __shared__ typename BlockScan::TempStorage temp_storage;
1728
+ //!
1729
+ //! // Obtain input item for each thread
1730
+ //! int thread_data;
1731
+ //! ...
1732
+ //!
1733
+ //! // Collectively compute the block-wide inclusive prefix max scan
1734
+ //! BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cuda::maximum<>{});
1735
+ //!
1736
+ //! Suppose the set of input ``thread_data`` across the block of threads is
1737
+ //! ``0, -1, 2, -3, ..., 126, -127``. The corresponding output ``thread_data``
1738
+ //! in those threads will be ``0, 0, 2, 2, ..., 126, 126``.
1739
+ //!
1740
+ //! @endrst
1741
+ //!
1742
+ //! @tparam ScanOp
1743
+ //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
1744
+ //!
1745
+ //! @param[in] input
1746
+ //! Calling thread's input item
1747
+ //!
1748
+ //! @param[out] output
1749
+ //! Calling thread's output item (may be aliased to `input`)
1750
+ //!
1751
+ //! @param[in] scan_op
1752
+ //! Binary scan functor
1753
+ template <typename ScanOp>
1754
+ _CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveScan(T input, T& output, ScanOp scan_op) // single item per thread
1755
+ {
1756
+ InternalBlockScan(temp_storage).InclusiveScan(input, output, scan_op); // thin wrapper: builds an InternalBlockScan over temp_storage and forwards all arguments unchanged
1757
+ }
1758
+
1759
+ //! @rst
1760
+ //! Computes an inclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
1761
+ //! Each thread contributes one input element. Also provides every thread with the block-wide
1762
+ //! ``block_aggregate`` of all inputs.
1763
+ //!
1764
+ //! - Supports non-commutative scan operators.
1765
+ //! - @rowmajor
1766
+ //! - @smemreuse
1767
+ //!
1768
+ //! Snippet
1769
+ //! +++++++
1770
+ //!
1771
+ //! The code snippet below illustrates an inclusive prefix max scan of 128
1772
+ //! integer items that are partitioned across 128 threads.
1773
+ //!
1774
+ //! .. code-block:: c++
1775
+ //!
1776
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
1777
+ //!
1778
+ //! __global__ void ExampleKernel(...)
1779
+ //! {
1780
+ //! // Specialize BlockScan for a 1D block of 128 threads of type int
1781
+ //! using BlockScan = cub::BlockScan<int, 128>;
1782
+ //!
1783
+ //! // Allocate shared memory for BlockScan
1784
+ //! __shared__ typename BlockScan::TempStorage temp_storage;
1785
+ //!
1786
+ //! // Obtain input item for each thread
1787
+ //! int thread_data;
1788
+ //! ...
1789
+ //!
1790
+ //! // Collectively compute the block-wide inclusive prefix max scan
1791
+ //! int block_aggregate;
1792
+ //! BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cuda::maximum<>{}, block_aggregate);
1793
+ //!
1794
+ //! Suppose the set of input ``thread_data`` across the block of threads is
1795
+ //! ``0, -1, 2, -3, ..., 126, -127``. The corresponding output ``thread_data``
1796
+ //! in those threads will be ``0, 0, 2, 2, ..., 126, 126``. Furthermore the value
1797
+ //! ``126`` will be stored in ``block_aggregate`` for all threads.
1798
+ //!
1799
+ //! @endrst
1800
+ //!
1801
+ //! @tparam ScanOp
1802
+ //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
1803
+ //!
1804
+ //! @param[in] input
1805
+ //! Calling thread's input item
1806
+ //!
1807
+ //! @param[out] output
1808
+ //! Calling thread's output item (may be aliased to `input`)
1809
+ //!
1810
+ //! @param[in] scan_op
1811
+ //! Binary scan functor
1812
+ //!
1813
+ //! @param[out] block_aggregate
1814
+ //! Block-wide aggregate reduction of input items
1815
+ template <typename ScanOp>
1816
+ _CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveScan(T input, T& output, ScanOp scan_op, T& block_aggregate)
1817
+ {
1818
+ InternalBlockScan(temp_storage).InclusiveScan(input, output, scan_op, block_aggregate); // forward to InternalBlockScan, which also fills block_aggregate
1819
+ }
1820
+
1821
+ //! @rst
1822
+ //! Computes an inclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
1823
+ //! Each thread contributes one input element. The call-back functor ``block_prefix_callback_op``
1824
+ //! is invoked by the first warp in the block, and the value returned by *lane*\ :sub:`0` in that warp is used as
1825
+ //! the "seed" value that logically prefixes the thread block's scan inputs.
1826
+ //!
1827
+ //! - The ``block_prefix_callback_op`` functor must implement a member function
1828
+ //! ``T operator()(T block_aggregate)``. The functor's input parameter ``block_aggregate`` is the block-wide aggregate of the scan inputs.
1829
+ //! The functor will be invoked by the first warp of threads in the block,
1830
+ //! however only the return value from *lane*\ :sub:`0` is applied
1831
+ //! as the block-wide prefix. Can be stateful.
1832
+ //! - Supports non-commutative scan operators.
1833
+ //! - @rowmajor
1834
+ //! - @smemreuse
1835
+ //!
1836
+ //! Snippet
1837
+ //! +++++++
1838
+ //!
1839
+ //! The code snippet below illustrates a single thread block that progressively
1840
+ //! computes an inclusive prefix max scan over multiple "tiles" of input using a
1841
+ //! prefix functor to maintain a running total between block-wide scans. Each tile consists
1842
+ //! of 128 integer items that are partitioned across 128 threads.
1843
+ //!
1844
+ //! .. literalinclude:: ../../examples/block/example_block_scan.cu
1845
+ //! :language: c++
1846
+ //! :dedent:
1847
+ //! :start-after: example-begin block-prefix-callback-max-op
1848
+ //! :end-before: example-end block-prefix-callback-max-op
1849
+ //!
1850
+ //! .. literalinclude:: ../../examples/block/example_block_scan.cu
1851
+ //! :language: c++
1852
+ //! :dedent:
1853
+ //! :start-after: example-begin inclusive-scan-prefix-callback-max
1854
+ //! :end-before: example-end inclusive-scan-prefix-callback-max
1855
+ //!
1856
+ //! Suppose the input ``d_data`` is ``0, -1, 2, -3, 4, -5, ...``.
1857
+ //! The corresponding output for the first segment will be
1858
+ //! ``0, 0, 2, 2, ..., 126, 126``. The output for the second segment
1859
+ //! will be ``128, 128, 130, 130, ..., 254, 254``.
1860
+ //!
1861
+ //! @endrst
1862
+ //!
1863
+ //! @tparam ScanOp
1864
+ //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
1865
+ //!
1866
+ //! @tparam BlockPrefixCallbackOp
1867
+ //! **[inferred]** Call-back functor type having member `T operator()(T block_aggregate)`
1868
+ //!
1869
+ //! @param[in] input
1870
+ //! Calling thread's input item
1871
+ //!
1872
+ //! @param[out] output
1873
+ //! Calling thread's output item (may be aliased to `input`)
1874
+ //!
1875
+ //! @param[in] scan_op
1876
+ //! Binary scan functor
1877
+ //!
1878
+ //! @param[in,out] block_prefix_callback_op
1879
+ //! @rst
1880
+ //! *warp*\ :sub:`0` only call-back functor for specifying a block-wide prefix to be applied to
1881
+ //! the logical input sequence.
1882
+ //! @endrst
1883
+ template <typename ScanOp, typename BlockPrefixCallbackOp>
1884
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
1885
+ InclusiveScan(T input, T& output, ScanOp scan_op, BlockPrefixCallbackOp& block_prefix_callback_op)
1886
+ {
1887
+ InternalBlockScan(temp_storage).InclusiveScan(input, output, scan_op, block_prefix_callback_op); // forward to InternalBlockScan; the callback is passed through by reference
1888
+ }
1889
+
1890
+ //! @} end member group
1891
+ //! @name Inclusive prefix scan operations (multiple data per thread)
1892
+ //! @{
1893
+
1894
+ //! @rst
1895
+ //! Computes an inclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
1896
+ //! Each thread contributes an array of consecutive input elements.
1897
+ //!
1898
+ //! - Supports non-commutative scan operators.
1899
+ //! - @blocked
1900
+ //! - @granularity
1901
+ //! - @smemreuse
1902
+ //!
1903
+ //! Snippet
1904
+ //! +++++++
1905
+ //!
1906
+ //! The code snippet below illustrates an inclusive prefix max scan of 512 integer items that
1907
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128 threads
1908
+ //! where each thread owns 4 consecutive items.
1909
+ //!
1910
+ //! .. code-block:: c++
1911
+ //!
1912
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
1913
+ //!
1914
+ //! __global__ void ExampleKernel(...)
1915
+ //! {
1916
+ //! // Specialize BlockScan for a 1D block of 128 threads of type int
1917
+ //! using BlockScan = cub::BlockScan<int, 128>;
1918
+ //!
1919
+ //! // Allocate shared memory for BlockScan
1920
+ //! __shared__ typename BlockScan::TempStorage temp_storage;
1921
+ //!
1922
+ //! // Obtain a segment of consecutive items that are blocked across threads
1923
+ //! int thread_data[4];
1924
+ //! ...
1925
+ //!
1926
+ //! // Collectively compute the block-wide inclusive prefix max scan
1927
+ //! BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cuda::maximum<>{});
1928
+ //!
1929
+ //! Suppose the set of input ``thread_data`` across the block of threads is
1930
+ //! ``{ [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }``.
1931
+ //! The corresponding output ``thread_data`` in those threads will be
1932
+ //! ``{ [0,0,2,2], [4,4,6,6], ..., [508,508,510,510] }``.
1933
+ //!
1934
+ //! @endrst
1935
+ //!
1936
+ //! @tparam ITEMS_PER_THREAD
1937
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
1938
+ //!
1939
+ //! @tparam ScanOp
1940
+ //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
1941
+ //!
1942
+ //! @param[in] input
1943
+ //! Calling thread's input items
1944
+ //!
1945
+ //! @param[out] output
1946
+ //! Calling thread's output items (may be aliased to `input`)
1947
+ //!
1948
+ //! @param[in] scan_op
1949
+ //! Binary scan functor
1950
+ template <int ITEMS_PER_THREAD, typename ScanOp>
1951
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
1952
+ InclusiveScan(T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], ScanOp scan_op)
1953
+ {
1954
+ if (ITEMS_PER_THREAD == 1) // single item per thread: use the one-item overload directly
1955
+ {
1956
+ InclusiveScan(input[0], output[0], scan_op);
1957
+ }
1958
+ else
1959
+ {
1960
+ // Reduce this thread's consecutive items in registers into one partial (thread_prefix)
1961
+ T thread_prefix = cub::ThreadReduce(input, scan_op);
1962
+
1963
+ // Exclusive block-wide scan of the per-thread partials (no initial value); result overwrites thread_prefix
1964
+ ExclusiveScan(thread_prefix, thread_prefix, scan_op);
1965
+
1966
+ // Inclusive scan in registers with thread_prefix as seed; (linear_tid != 0) means the first thread does not seed
1967
+ detail::ThreadScanInclusive(input, output, scan_op, thread_prefix, (linear_tid != 0));
1968
+ }
1969
+ }
1970
+
1971
+ //! @rst
1972
+ //! Computes an inclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
1973
+ //! Each thread contributes an array of consecutive input elements.
1974
+ //!
1975
+ //! - Supports non-commutative scan operators.
1976
+ //! - @blocked
1977
+ //! - @granularity
1978
+ //! - @smemreuse
1979
+ //!
1980
+ //! Snippet
1981
+ //! +++++++
1982
+ //!
1983
+ //! The code snippet below illustrates an inclusive prefix max scan of 128 integer items that
1984
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 64 threads
1985
+ //! where each thread owns 2 consecutive items.
1986
+ //!
1987
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_scan_api.cu
1988
+ //! :language: c++
1989
+ //! :dedent:
1990
+ //! :start-after: example-begin inclusive-scan-array-init-value
1991
+ //! :end-before: example-end inclusive-scan-array-init-value
1992
+ //!
1993
+ //!
1994
+ //! @endrst
1995
+ //!
1996
+ //! @tparam ITEMS_PER_THREAD
1997
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
1998
+ //!
1999
+ //! @tparam ScanOp
2000
+ //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
2001
+ //!
2002
+ //! @param[in] input
2003
+ //! Calling thread's input items
2004
+ //!
2005
+ //! @param[out] output
2006
+ //! Calling thread's output items (may be aliased to `input`)
2007
+ //!
2008
+ //! @param[in] initial_value
2009
+ //! Initial value to seed the inclusive scan (uniform across block)
2010
+ //!
2011
+ //! @param[in] scan_op
2012
+ //! Binary scan functor
2013
+ template <int ITEMS_PER_THREAD, typename ScanOp>
2014
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
2015
+ InclusiveScan(T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], T initial_value, ScanOp scan_op)
2016
+ {
2017
+ // Reduce this thread's consecutive items in registers into one partial (thread_prefix)
2018
+ T thread_prefix = cub::ThreadReduce(input, scan_op);
2019
+
2020
+ // Exclusive block-wide scan of the per-thread partials, seeded with initial_value; result overwrites thread_prefix
2021
+ ExclusiveScan(thread_prefix, thread_prefix, initial_value, scan_op);
2022
+
2023
+ // Inclusive scan in registers with thread_prefix as seed (no predicate is passed, so every thread seeds)
2024
+ detail::ThreadScanInclusive(input, output, scan_op, thread_prefix);
2025
+ }
2026
+
2027
+ //! @rst
2028
+ //! Computes an inclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
2029
+ //! Each thread contributes an array of consecutive input elements. Also provides every thread
2030
+ //! with the block-wide ``block_aggregate`` of all inputs.
2031
+ //!
2032
+ //! - Supports non-commutative scan operators.
2033
+ //! - @blocked
2034
+ //! - @granularity
2035
+ //! - @smemreuse
2036
+ //!
2037
+ //! Snippet
2038
+ //! +++++++
2039
+ //!
2040
+ //! The code snippet below illustrates an inclusive prefix max scan of 512 integer items that
2041
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128 threads
2042
+ //! where each thread owns 4 consecutive items.
2043
+ //!
2044
+ //! .. code-block:: c++
2045
+ //!
2046
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
2047
+ //!
2048
+ //! __global__ void ExampleKernel(...)
2049
+ //! {
2050
+ //! // Specialize BlockScan for a 1D block of 128 threads of type int
2051
+ //! using BlockScan = cub::BlockScan<int, 128>;
2052
+ //!
2053
+ //! // Allocate shared memory for BlockScan
2054
+ //! __shared__ typename BlockScan::TempStorage temp_storage;
2055
+ //!
2056
+ //! // Obtain a segment of consecutive items that are blocked across threads
2057
+ //! int thread_data[4];
2058
+ //! ...
2059
+ //!
2060
+ //! // Collectively compute the block-wide inclusive prefix max scan
2061
+ //! int block_aggregate;
2062
+ //! BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cuda::maximum<>{}, block_aggregate);
2063
+ //!
2064
+ //! Suppose the set of input ``thread_data`` across the block of threads is
2065
+ //! ``{ [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }``.
2066
+ //! The corresponding output ``thread_data`` in those threads will be
2067
+ //! ``{ [0,0,2,2], [4,4,6,6], ..., [508,508,510,510] }``.
2068
+ //! Furthermore the value ``510`` will be stored in ``block_aggregate`` for all threads.
2069
+ //!
2070
+ //! @endrst
2071
+ //!
2072
+ //! @tparam ITEMS_PER_THREAD
2073
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
2074
+ //!
2075
+ //! @tparam ScanOp
2076
+ //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
2077
+ //!
2078
+ //! @param[in] input
2079
+ //! Calling thread's input items
2080
+ //!
2081
+ //! @param[out] output
2082
+ //! Calling thread's output items (may be aliased to `input`)
2083
+ //!
2084
+ //! @param[in] scan_op
2085
+ //! Binary scan functor
2086
+ //!
2087
+ //! @param[out] block_aggregate
2088
+ //! Block-wide aggregate reduction of input items
2089
+ template <int ITEMS_PER_THREAD, typename ScanOp>
2090
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
2091
+ InclusiveScan(T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], ScanOp scan_op, T& block_aggregate)
2092
+ {
2093
+ if (ITEMS_PER_THREAD == 1) // single item per thread: use the one-item overload directly
2094
+ {
2095
+ InclusiveScan(input[0], output[0], scan_op, block_aggregate);
2096
+ }
2097
+ else
2098
+ {
2099
+ // Reduce this thread's consecutive items in registers into one partial (thread_prefix)
2100
+ T thread_prefix = cub::ThreadReduce(input, scan_op);
2101
+
2102
+ // Exclusive block-wide scan of the per-thread partials (with no initial value); also fills block_aggregate
2103
+ ExclusiveScan(thread_prefix, thread_prefix, scan_op, block_aggregate);
2104
+
2105
+ // Inclusive scan in registers with thread_prefix as seed; (linear_tid != 0) means the first thread does not seed
2106
+ detail::ThreadScanInclusive(input, output, scan_op, thread_prefix, (linear_tid != 0));
2107
+ }
2108
+ }
2109
+
2110
+ //! @rst
2111
+ //! Computes an inclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
2112
+ //! Each thread contributes an array of consecutive input elements. Also provides every thread
2113
+ //! with the block-wide ``block_aggregate`` of all inputs.
2114
+ //!
2115
+ //! - Supports non-commutative scan operators.
2116
+ //! - @blocked
2117
+ //! - @granularity
2118
+ //! - @smemreuse
2119
+ //!
2120
+ //! Snippet
2121
+ //! +++++++
2122
+ //!
2123
+ //! The code snippet below illustrates an inclusive prefix max scan of 128 integer items that
2124
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 64 threads
2125
+ //! where each thread owns 2 consecutive items.
2126
+ //!
2127
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_scan_api.cu
2128
+ //! :language: c++
2129
+ //! :dedent:
2130
+ //! :start-after: example-begin inclusive-scan-array-aggregate-init-value
2131
+ //! :end-before: example-end inclusive-scan-array-aggregate-init-value
2132
+ //!
2133
+ //! The value ``126`` will be stored in ``block_aggregate`` for all threads.
2134
+ //!
2135
+ //! .. note::
2136
+ //!
2137
+ //! ``initial_value`` is not applied to the block-wide aggregate.
2138
+ //!
2139
+ //! @endrst
2140
+ //!
2141
+ //! @tparam ITEMS_PER_THREAD
2142
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
2143
+ //!
2144
+ //! @tparam ScanOp
2145
+ //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
2146
+ //!
2147
+ //! @param[in] input
2148
+ //! Calling thread's input items
2149
+ //!
2150
+ //! @param[out] output
2151
+ //! Calling thread's output items (may be aliased to `input`)
2152
+ //!
2153
+ //! @param[in] initial_value
2154
+ //! Initial value to seed the inclusive scan (uniform across block). It is not taken
2155
+ //! into account for ``block_aggregate``.
2156
+ //!
2157
+ //! @param[in] scan_op
2158
+ //! Binary scan functor
2159
+ //!
2160
+ //! @param[out] block_aggregate
2161
+ //! Block-wide aggregate reduction of input items
2162
+ template <int ITEMS_PER_THREAD, typename ScanOp>
2163
+ _CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveScan(
2164
+ T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], T initial_value, ScanOp scan_op, T& block_aggregate)
2165
+ {
2166
+ // Reduce this thread's consecutive items in registers into one partial (thread_prefix)
2167
+ T thread_prefix = cub::ThreadReduce(input, scan_op);
2168
+
2169
+ // Exclusive block-wide scan of the per-thread partials, seeded with initial_value; also fills block_aggregate
2170
+ ExclusiveScan(thread_prefix, thread_prefix, initial_value, scan_op, block_aggregate);
2171
+
2172
+ // Inclusive scan in registers with thread_prefix as seed (no predicate is passed, so every thread seeds)
2173
+ detail::ThreadScanInclusive(input, output, scan_op, thread_prefix);
2174
+ }
2175
+
2176
+ //! @rst
2177
+ //! Computes an inclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
2178
+ //! Each thread contributes an array of consecutive input elements.
2179
+ //! The call-back functor ``block_prefix_callback_op`` is invoked by the first warp in the block,
2180
+ //! and the value returned by *lane*\ :sub:`0` in that warp is used as the "seed" value that logically prefixes the
2181
+ //! thread block's scan inputs.
2182
+ //!
2183
+ //! - The ``block_prefix_callback_op`` functor must implement a member function ``T operator()(T block_aggregate)``.
2184
+ //! The functor will be invoked by the first warp of threads in the block, however only the return value
2185
+ //! from *lane*\ :sub:`0` is applied as the block-wide prefix. Can be stateful.
2186
+ //! - Supports non-commutative scan operators.
2187
+ //! - @blocked
2188
+ //! - @granularity
2189
+ //! - @smemreuse
2190
+ //!
2191
+ //! Snippet
2192
+ //! +++++++
2193
+ //!
2194
+ //! The code snippet below illustrates a single thread block that progressively
2195
+ //! computes an inclusive prefix max scan over multiple "tiles" of input using a
2196
+ //! prefix functor to maintain a running total between block-wide scans. Each tile consists
2197
+ //! of 512 integer items that are partitioned in a blocked arrangement across 128 threads (4 consecutive items per thread).
2198
+ //!
2199
+ //! .. code-block:: c++
2200
+ //!
2201
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
2202
+ //!
2203
+ //! // A stateful callback functor that maintains a running prefix to be applied
2204
+ //! // during consecutive scan operations.
2205
+ //! struct BlockPrefixCallbackOp
2206
+ //! {
2207
+ //! // Running prefix
2208
+ //! int running_total;
2209
+ //!
2210
+ //! // Constructor
2211
+ //! __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {}
2212
+ //!
2213
+ //! // Callback operator to be entered by the first warp of threads in the block.
2214
+ //! // Thread-0 is responsible for returning a value for seeding the block-wide scan.
2215
+ //! __device__ int operator()(int block_aggregate)
2216
+ //! {
2217
+ //! int old_prefix = running_total;
2218
+ //! running_total = (block_aggregate > old_prefix) ? block_aggregate : old_prefix;
2219
+ //! return old_prefix;
2220
+ //! }
2221
+ //! };
2222
+ //!
2223
+ //! __global__ void ExampleKernel(int *d_data, int num_items, ...)
2224
+ //! {
2225
+ //! // Specialize BlockLoad, BlockStore, and BlockScan for a 1D block of 128 threads, 4 ints per thread
2226
+ //! using BlockLoad = cub::BlockLoad<int, 128, 4, cub::BLOCK_LOAD_TRANSPOSE> ;
2227
+ //! using BlockStore = cub::BlockStore<int, 128, 4, cub::BLOCK_STORE_TRANSPOSE> ;
2228
+ //! using BlockScan = cub::BlockScan<int, 128> ;
2229
+ //!
2230
+ //! // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan
2231
+ //! __shared__ union {
2232
+ //! typename BlockLoad::TempStorage load;
2233
+ //! typename BlockScan::TempStorage scan;
2234
+ //! typename BlockStore::TempStorage store;
2235
+ //! } temp_storage;
2236
+ //!
2237
+ //! // Initialize running total
2238
+ //! BlockPrefixCallbackOp prefix_op(0);
2239
+ //!
2240
+ //! // Have the block iterate over segments of items
2241
+ //! for (int block_offset = 0; block_offset < num_items; block_offset += 128 * 4)
2242
+ //! {
2243
+ //! // Load a segment of consecutive items that are blocked across threads
2244
+ //! int thread_data[4];
2245
+ //! BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data);
2246
+ //! __syncthreads();
2247
+ //!
2248
+ //! // Collectively compute the block-wide inclusive prefix max scan
2249
+ //! BlockScan(temp_storage.scan).InclusiveScan(
2250
+ //! thread_data, thread_data, cuda::maximum<>{}, prefix_op);
2251
+ //! __syncthreads();
2252
+ //!
2253
+ //! // Store scanned items to output segment
2254
+ //! BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data);
2255
+ //! __syncthreads();
2256
+ //! }
2257
+ //!
2258
+ //! Suppose the input ``d_data`` is ``0, -1, 2, -3, 4, -5, ...``.
2259
+ //! The corresponding output for the first segment will be
2260
+ //! ``0, 0, 2, 2, 4, 4, ..., 510, 510``. The output for the second
2261
+ //! segment will be ``512, 512, 514, 514, 516, 516, ..., 1022, 1022``.
2262
+ //!
2263
+ //! @endrst
2264
+ //!
2265
+ //! @tparam ITEMS_PER_THREAD
2266
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
2267
+ //!
2268
+ //! @tparam ScanOp
2269
+ //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
2270
+ //!
2271
+ //! @tparam BlockPrefixCallbackOp
2272
+ //! **[inferred]** Call-back functor type having member `T operator()(T block_aggregate)`
2273
+ //!
2274
+ //! @param[in] input
2275
+ //! Calling thread's input items
2276
+ //!
2277
+ //! @param[out] output
2278
+ //! Calling thread's output items (may be aliased to `input`)
2279
+ //!
2280
+ //! @param[in] scan_op
2281
+ //! Binary scan functor
2282
+ //!
2283
+ //! @param[in,out] block_prefix_callback_op
2284
+ //! @rst
2285
+ //! *warp*\ :sub:`0` only call-back functor for specifying a block-wide prefix to be applied to
2286
+ //! the logical input sequence.
2287
+ //! @endrst
2288
+ template <int ITEMS_PER_THREAD, typename ScanOp, typename BlockPrefixCallbackOp>
2289
+ _CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveScan(
2290
+ T (&input)[ITEMS_PER_THREAD],
2291
+ T (&output)[ITEMS_PER_THREAD],
2292
+ ScanOp scan_op,
2293
+ BlockPrefixCallbackOp& block_prefix_callback_op)
2294
+ {
2295
+ if (ITEMS_PER_THREAD == 1) // single item per thread: use the one-item overload directly
2296
+ {
2297
+ InclusiveScan(input[0], output[0], scan_op, block_prefix_callback_op);
2298
+ }
2299
+ else
2300
+ {
2301
+ // Reduce this thread's consecutive items in registers into one partial (thread_prefix)
2302
+ T thread_prefix = cub::ThreadReduce(input, scan_op);
2303
+
2304
+ // Exclusive block-wide scan of the per-thread partials, using the prefix callback; result overwrites thread_prefix
2305
+ ExclusiveScan(thread_prefix, thread_prefix, scan_op, block_prefix_callback_op);
2306
+
2307
+ // Inclusive scan in registers with thread_prefix as seed (no predicate is passed, so every thread seeds)
2308
+ detail::ThreadScanInclusive(input, output, scan_op, thread_prefix);
2309
+ }
2310
+ }
2311
+
2312
+ //! @} end member group
2313
+ };
2314
+
2315
+ CUB_NAMESPACE_END