cuda-cccl 0.3.4__cp311-cp311-manylinux_2_24_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cuda-cccl might be problematic. Click here for more details.

Files changed (1926):
  1. cuda/cccl/__init__.py +27 -0
  2. cuda/cccl/_cuda_version_utils.py +24 -0
  3. cuda/cccl/cooperative/__init__.py +9 -0
  4. cuda/cccl/cooperative/experimental/__init__.py +24 -0
  5. cuda/cccl/headers/__init__.py +7 -0
  6. cuda/cccl/headers/include/__init__.py +1 -0
  7. cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +233 -0
  8. cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +1158 -0
  9. cuda/cccl/headers/include/cub/agent/agent_for.cuh +55 -0
  10. cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +677 -0
  11. cuda/cccl/headers/include/cub/agent/agent_merge.cuh +234 -0
  12. cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +722 -0
  13. cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +761 -0
  14. cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +282 -0
  15. cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +702 -0
  16. cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +552 -0
  17. cuda/cccl/headers/include/cub/agent/agent_reduce.cuh +592 -0
  18. cuda/cccl/headers/include/cub/agent/agent_reduce_by_key.cuh +780 -0
  19. cuda/cccl/headers/include/cub/agent/agent_rle.cuh +1095 -0
  20. cuda/cccl/headers/include/cub/agent/agent_scan.cuh +562 -0
  21. cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +448 -0
  22. cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +263 -0
  23. cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +1088 -0
  24. cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +320 -0
  25. cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +584 -0
  26. cuda/cccl/headers/include/cub/agent/agent_topk.cuh +762 -0
  27. cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +605 -0
  28. cuda/cccl/headers/include/cub/agent/single_pass_scan_operators.cuh +1399 -0
  29. cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +939 -0
  30. cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +1203 -0
  31. cuda/cccl/headers/include/cub/block/block_exchange.cuh +1279 -0
  32. cuda/cccl/headers/include/cub/block/block_histogram.cuh +400 -0
  33. cuda/cccl/headers/include/cub/block/block_load.cuh +1242 -0
  34. cuda/cccl/headers/include/cub/block/block_load_to_shared.cuh +416 -0
  35. cuda/cccl/headers/include/cub/block/block_merge_sort.cuh +771 -0
  36. cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +1203 -0
  37. cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +2132 -0
  38. cuda/cccl/headers/include/cub/block/block_raking_layout.cuh +126 -0
  39. cuda/cccl/headers/include/cub/block/block_reduce.cuh +642 -0
  40. cuda/cccl/headers/include/cub/block/block_run_length_decode.cuh +406 -0
  41. cuda/cccl/headers/include/cub/block/block_scan.cuh +2287 -0
  42. cuda/cccl/headers/include/cub/block/block_shuffle.cuh +322 -0
  43. cuda/cccl/headers/include/cub/block/block_store.cuh +1223 -0
  44. cuda/cccl/headers/include/cub/block/radix_rank_sort_operations.cuh +597 -0
  45. cuda/cccl/headers/include/cub/block/specializations/block_histogram_atomic.cuh +62 -0
  46. cuda/cccl/headers/include/cub/block/specializations/block_histogram_sort.cuh +216 -0
  47. cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking.cuh +230 -0
  48. cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking_commutative_only.cuh +214 -0
  49. cuda/cccl/headers/include/cub/block/specializations/block_reduce_warp_reductions.cuh +257 -0
  50. cuda/cccl/headers/include/cub/block/specializations/block_scan_raking.cuh +766 -0
  51. cuda/cccl/headers/include/cub/block/specializations/block_scan_warp_scans.cuh +514 -0
  52. cuda/cccl/headers/include/cub/config.cuh +29 -0
  53. cuda/cccl/headers/include/cub/cub.cuh +96 -0
  54. cuda/cccl/headers/include/cub/detail/array_utils.cuh +54 -0
  55. cuda/cccl/headers/include/cub/detail/choose_offset.cuh +135 -0
  56. cuda/cccl/headers/include/cub/detail/detect_cuda_runtime.cuh +50 -0
  57. cuda/cccl/headers/include/cub/detail/device_double_buffer.cuh +94 -0
  58. cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +60 -0
  59. cuda/cccl/headers/include/cub/detail/fast_modulo_division.cuh +227 -0
  60. cuda/cccl/headers/include/cub/detail/integer_utils.cuh +86 -0
  61. cuda/cccl/headers/include/cub/detail/launcher/cuda_driver.cuh +140 -0
  62. cuda/cccl/headers/include/cub/detail/launcher/cuda_runtime.cuh +98 -0
  63. cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +112 -0
  64. cuda/cccl/headers/include/cub/detail/ptx-json/README.md +66 -0
  65. cuda/cccl/headers/include/cub/detail/ptx-json/array.h +41 -0
  66. cuda/cccl/headers/include/cub/detail/ptx-json/json.h +39 -0
  67. cuda/cccl/headers/include/cub/detail/ptx-json/object.h +71 -0
  68. cuda/cccl/headers/include/cub/detail/ptx-json/string.h +79 -0
  69. cuda/cccl/headers/include/cub/detail/ptx-json/value.h +95 -0
  70. cuda/cccl/headers/include/cub/detail/ptx-json-parser.h +39 -0
  71. cuda/cccl/headers/include/cub/detail/rfa.cuh +706 -0
  72. cuda/cccl/headers/include/cub/detail/strong_load.cuh +163 -0
  73. cuda/cccl/headers/include/cub/detail/strong_store.cuh +194 -0
  74. cuda/cccl/headers/include/cub/detail/temporary_storage.cuh +377 -0
  75. cuda/cccl/headers/include/cub/detail/type_traits.cuh +185 -0
  76. cuda/cccl/headers/include/cub/detail/uninitialized_copy.cuh +48 -0
  77. cuda/cccl/headers/include/cub/detail/unsafe_bitcast.cuh +33 -0
  78. cuda/cccl/headers/include/cub/device/device_adjacent_difference.cuh +572 -0
  79. cuda/cccl/headers/include/cub/device/device_copy.cuh +276 -0
  80. cuda/cccl/headers/include/cub/device/device_for.cuh +1061 -0
  81. cuda/cccl/headers/include/cub/device/device_histogram.cuh +1485 -0
  82. cuda/cccl/headers/include/cub/device/device_memcpy.cuh +171 -0
  83. cuda/cccl/headers/include/cub/device/device_merge.cuh +203 -0
  84. cuda/cccl/headers/include/cub/device/device_merge_sort.cuh +955 -0
  85. cuda/cccl/headers/include/cub/device/device_partition.cuh +644 -0
  86. cuda/cccl/headers/include/cub/device/device_radix_sort.cuh +3413 -0
  87. cuda/cccl/headers/include/cub/device/device_reduce.cuh +2497 -0
  88. cuda/cccl/headers/include/cub/device/device_run_length_encode.cuh +346 -0
  89. cuda/cccl/headers/include/cub/device/device_scan.cuh +2187 -0
  90. cuda/cccl/headers/include/cub/device/device_segmented_radix_sort.cuh +1472 -0
  91. cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +1406 -0
  92. cuda/cccl/headers/include/cub/device/device_segmented_sort.cuh +2787 -0
  93. cuda/cccl/headers/include/cub/device/device_select.cuh +1204 -0
  94. cuda/cccl/headers/include/cub/device/device_topk.cuh +511 -0
  95. cuda/cccl/headers/include/cub/device/device_transform.cuh +668 -0
  96. cuda/cccl/headers/include/cub/device/dispatch/dispatch_adjacent_difference.cuh +289 -0
  97. cuda/cccl/headers/include/cub/device/dispatch/dispatch_batch_memcpy.cuh +694 -0
  98. cuda/cccl/headers/include/cub/device/dispatch/dispatch_common.cuh +43 -0
  99. cuda/cccl/headers/include/cub/device/dispatch/dispatch_copy_mdspan.cuh +77 -0
  100. cuda/cccl/headers/include/cub/device/dispatch/dispatch_for.cuh +172 -0
  101. cuda/cccl/headers/include/cub/device/dispatch/dispatch_histogram.cuh +1026 -0
  102. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +303 -0
  103. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge_sort.cuh +449 -0
  104. cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +1719 -0
  105. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +1283 -0
  106. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_by_key.cuh +629 -0
  107. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +504 -0
  108. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +312 -0
  109. cuda/cccl/headers/include/cub/device/dispatch/dispatch_rle.cuh +603 -0
  110. cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan.cuh +491 -0
  111. cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan_by_key.cuh +577 -0
  112. cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +951 -0
  113. cuda/cccl/headers/include/cub/device/dispatch/dispatch_select_if.cuh +818 -0
  114. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +339 -0
  115. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +455 -0
  116. cuda/cccl/headers/include/cub/device/dispatch/dispatch_three_way_partition.cuh +364 -0
  117. cuda/cccl/headers/include/cub/device/dispatch/dispatch_topk.cuh +626 -0
  118. cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +541 -0
  119. cuda/cccl/headers/include/cub/device/dispatch/dispatch_unique_by_key.cuh +521 -0
  120. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_for_each.cuh +259 -0
  121. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_histogram.cuh +497 -0
  122. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_merge_sort.cuh +332 -0
  123. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_radix_sort.cuh +801 -0
  124. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_reduce.cuh +557 -0
  125. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_scan.cuh +163 -0
  126. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_segmented_reduce.cuh +295 -0
  127. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_segmented_sort.cuh +521 -0
  128. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_three_way_partition.cuh +200 -0
  129. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_transform.cuh +1028 -0
  130. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_unique_by_key.cuh +175 -0
  131. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +43 -0
  132. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +94 -0
  133. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +34 -0
  134. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +255 -0
  135. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +52 -0
  136. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +100 -0
  137. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +1063 -0
  138. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce.cuh +468 -0
  139. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +918 -0
  140. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +647 -0
  141. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +594 -0
  142. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +986 -0
  143. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +373 -0
  144. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_select_if.cuh +1563 -0
  145. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +415 -0
  146. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_topk.cuh +84 -0
  147. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +456 -0
  148. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +858 -0
  149. cuda/cccl/headers/include/cub/grid/grid_even_share.cuh +203 -0
  150. cuda/cccl/headers/include/cub/grid/grid_mapping.cuh +82 -0
  151. cuda/cccl/headers/include/cub/grid/grid_queue.cuh +178 -0
  152. cuda/cccl/headers/include/cub/iterator/arg_index_input_iterator.cuh +230 -0
  153. cuda/cccl/headers/include/cub/iterator/cache_modified_input_iterator.cuh +235 -0
  154. cuda/cccl/headers/include/cub/iterator/cache_modified_output_iterator.cuh +226 -0
  155. cuda/cccl/headers/include/cub/iterator/tex_obj_input_iterator.cuh +296 -0
  156. cuda/cccl/headers/include/cub/thread/thread_load.cuh +324 -0
  157. cuda/cccl/headers/include/cub/thread/thread_operators.cuh +664 -0
  158. cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +525 -0
  159. cuda/cccl/headers/include/cub/thread/thread_scan.cuh +472 -0
  160. cuda/cccl/headers/include/cub/thread/thread_search.cuh +175 -0
  161. cuda/cccl/headers/include/cub/thread/thread_simd.cuh +456 -0
  162. cuda/cccl/headers/include/cub/thread/thread_sort.cuh +78 -0
  163. cuda/cccl/headers/include/cub/thread/thread_store.cuh +341 -0
  164. cuda/cccl/headers/include/cub/util_allocator.cuh +897 -0
  165. cuda/cccl/headers/include/cub/util_arch.cuh +141 -0
  166. cuda/cccl/headers/include/cub/util_cpp_dialect.cuh +71 -0
  167. cuda/cccl/headers/include/cub/util_debug.cuh +183 -0
  168. cuda/cccl/headers/include/cub/util_device.cuh +759 -0
  169. cuda/cccl/headers/include/cub/util_macro.cuh +73 -0
  170. cuda/cccl/headers/include/cub/util_math.cuh +92 -0
  171. cuda/cccl/headers/include/cub/util_namespace.cuh +151 -0
  172. cuda/cccl/headers/include/cub/util_policy_wrapper_t.cuh +31 -0
  173. cuda/cccl/headers/include/cub/util_ptx.cuh +489 -0
  174. cuda/cccl/headers/include/cub/util_temporary_storage.cuh +96 -0
  175. cuda/cccl/headers/include/cub/util_type.cuh +1093 -0
  176. cuda/cccl/headers/include/cub/util_vsmem.cuh +227 -0
  177. cuda/cccl/headers/include/cub/version.cuh +65 -0
  178. cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_shfl.cuh +304 -0
  179. cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_smem.cuh +152 -0
  180. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +713 -0
  181. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +378 -0
  182. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +928 -0
  183. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_smem.cuh +691 -0
  184. cuda/cccl/headers/include/cub/warp/warp_exchange.cuh +381 -0
  185. cuda/cccl/headers/include/cub/warp/warp_load.cuh +591 -0
  186. cuda/cccl/headers/include/cub/warp/warp_merge_sort.cuh +145 -0
  187. cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +810 -0
  188. cuda/cccl/headers/include/cub/warp/warp_scan.cuh +1866 -0
  189. cuda/cccl/headers/include/cub/warp/warp_store.cuh +498 -0
  190. cuda/cccl/headers/include/cub/warp/warp_utils.cuh +59 -0
  191. cuda/cccl/headers/include/cuda/__algorithm/common.h +68 -0
  192. cuda/cccl/headers/include/cuda/__algorithm/copy.h +196 -0
  193. cuda/cccl/headers/include/cuda/__algorithm/fill.h +107 -0
  194. cuda/cccl/headers/include/cuda/__annotated_ptr/access_property.h +165 -0
  195. cuda/cccl/headers/include/cuda/__annotated_ptr/access_property_encoding.h +172 -0
  196. cuda/cccl/headers/include/cuda/__annotated_ptr/annotated_ptr.h +217 -0
  197. cuda/cccl/headers/include/cuda/__annotated_ptr/annotated_ptr_base.h +100 -0
  198. cuda/cccl/headers/include/cuda/__annotated_ptr/apply_access_property.h +83 -0
  199. cuda/cccl/headers/include/cuda/__annotated_ptr/associate_access_property.h +128 -0
  200. cuda/cccl/headers/include/cuda/__annotated_ptr/createpolicy.h +210 -0
  201. cuda/cccl/headers/include/cuda/__atomic/atomic.h +145 -0
  202. cuda/cccl/headers/include/cuda/__barrier/async_contract_fulfillment.h +39 -0
  203. cuda/cccl/headers/include/cuda/__barrier/barrier.h +65 -0
  204. cuda/cccl/headers/include/cuda/__barrier/barrier_arrive_tx.h +101 -0
  205. cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +529 -0
  206. cuda/cccl/headers/include/cuda/__barrier/barrier_expect_tx.h +73 -0
  207. cuda/cccl/headers/include/cuda/__barrier/barrier_thread_scope.h +60 -0
  208. cuda/cccl/headers/include/cuda/__bit/bit_reverse.h +171 -0
  209. cuda/cccl/headers/include/cuda/__bit/bitfield.h +122 -0
  210. cuda/cccl/headers/include/cuda/__bit/bitmask.h +90 -0
  211. cuda/cccl/headers/include/cuda/__cccl_config +37 -0
  212. cuda/cccl/headers/include/cuda/__cmath/ceil_div.h +124 -0
  213. cuda/cccl/headers/include/cuda/__cmath/fast_modulo_division.h +178 -0
  214. cuda/cccl/headers/include/cuda/__cmath/ilog.h +195 -0
  215. cuda/cccl/headers/include/cuda/__cmath/ipow.h +107 -0
  216. cuda/cccl/headers/include/cuda/__cmath/isqrt.h +80 -0
  217. cuda/cccl/headers/include/cuda/__cmath/mul_hi.h +146 -0
  218. cuda/cccl/headers/include/cuda/__cmath/neg.h +47 -0
  219. cuda/cccl/headers/include/cuda/__cmath/pow2.h +74 -0
  220. cuda/cccl/headers/include/cuda/__cmath/round_down.h +102 -0
  221. cuda/cccl/headers/include/cuda/__cmath/round_up.h +104 -0
  222. cuda/cccl/headers/include/cuda/__cmath/uabs.h +57 -0
  223. cuda/cccl/headers/include/cuda/__complex/complex.h +238 -0
  224. cuda/cccl/headers/include/cuda/__complex/get_real_imag.h +89 -0
  225. cuda/cccl/headers/include/cuda/__complex/traits.h +64 -0
  226. cuda/cccl/headers/include/cuda/__complex_ +28 -0
  227. cuda/cccl/headers/include/cuda/__device/all_devices.h +140 -0
  228. cuda/cccl/headers/include/cuda/__device/arch_id.h +176 -0
  229. cuda/cccl/headers/include/cuda/__device/arch_traits.h +537 -0
  230. cuda/cccl/headers/include/cuda/__device/attributes.h +772 -0
  231. cuda/cccl/headers/include/cuda/__device/compute_capability.h +171 -0
  232. cuda/cccl/headers/include/cuda/__device/device_ref.h +156 -0
  233. cuda/cccl/headers/include/cuda/__device/physical_device.h +172 -0
  234. cuda/cccl/headers/include/cuda/__driver/driver_api.h +848 -0
  235. cuda/cccl/headers/include/cuda/__event/event.h +171 -0
  236. cuda/cccl/headers/include/cuda/__event/event_ref.h +157 -0
  237. cuda/cccl/headers/include/cuda/__event/timed_event.h +120 -0
  238. cuda/cccl/headers/include/cuda/__execution/determinism.h +89 -0
  239. cuda/cccl/headers/include/cuda/__execution/output_ordering.h +87 -0
  240. cuda/cccl/headers/include/cuda/__execution/require.h +75 -0
  241. cuda/cccl/headers/include/cuda/__execution/tune.h +70 -0
  242. cuda/cccl/headers/include/cuda/__functional/address_stability.h +131 -0
  243. cuda/cccl/headers/include/cuda/__functional/for_each_canceled.h +321 -0
  244. cuda/cccl/headers/include/cuda/__functional/maximum.h +76 -0
  245. cuda/cccl/headers/include/cuda/__functional/minimum.h +76 -0
  246. cuda/cccl/headers/include/cuda/__functional/minimum_maximum_common.h +52 -0
  247. cuda/cccl/headers/include/cuda/__functional/proclaim_return_type.h +106 -0
  248. cuda/cccl/headers/include/cuda/__fwd/barrier.h +38 -0
  249. cuda/cccl/headers/include/cuda/__fwd/complex.h +48 -0
  250. cuda/cccl/headers/include/cuda/__fwd/devices.h +44 -0
  251. cuda/cccl/headers/include/cuda/__fwd/get_stream.h +38 -0
  252. cuda/cccl/headers/include/cuda/__fwd/pipeline.h +37 -0
  253. cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +58 -0
  254. cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +315 -0
  255. cuda/cccl/headers/include/cuda/__iterator/counting_iterator.h +492 -0
  256. cuda/cccl/headers/include/cuda/__iterator/discard_iterator.h +324 -0
  257. cuda/cccl/headers/include/cuda/__iterator/permutation_iterator.h +456 -0
  258. cuda/cccl/headers/include/cuda/__iterator/shuffle_iterator.h +334 -0
  259. cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +418 -0
  260. cuda/cccl/headers/include/cuda/__iterator/tabulate_output_iterator.h +367 -0
  261. cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +528 -0
  262. cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +527 -0
  263. cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +486 -0
  264. cuda/cccl/headers/include/cuda/__iterator/zip_common.h +148 -0
  265. cuda/cccl/headers/include/cuda/__iterator/zip_function.h +114 -0
  266. cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +557 -0
  267. cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +592 -0
  268. cuda/cccl/headers/include/cuda/__latch/latch.h +44 -0
  269. cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +532 -0
  270. cuda/cccl/headers/include/cuda/__mdspan/host_device_mdspan.h +238 -0
  271. cuda/cccl/headers/include/cuda/__mdspan/restrict_accessor.h +152 -0
  272. cuda/cccl/headers/include/cuda/__mdspan/restrict_mdspan.h +117 -0
  273. cuda/cccl/headers/include/cuda/__memcpy_async/check_preconditions.h +79 -0
  274. cuda/cccl/headers/include/cuda/__memcpy_async/completion_mechanism.h +47 -0
  275. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_bulk_shared_global.h +81 -0
  276. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_fallback.h +72 -0
  277. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_shared_global.h +148 -0
  278. cuda/cccl/headers/include/cuda/__memcpy_async/dispatch_memcpy_async.h +165 -0
  279. cuda/cccl/headers/include/cuda/__memcpy_async/elect_one.h +52 -0
  280. cuda/cccl/headers/include/cuda/__memcpy_async/is_local_smem_barrier.h +53 -0
  281. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async.h +179 -0
  282. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_barrier.h +99 -0
  283. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_tx.h +103 -0
  284. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_completion.h +164 -0
  285. cuda/cccl/headers/include/cuda/__memcpy_async/try_get_barrier_handle.h +58 -0
  286. cuda/cccl/headers/include/cuda/__memory/address_space.h +227 -0
  287. cuda/cccl/headers/include/cuda/__memory/align_down.h +56 -0
  288. cuda/cccl/headers/include/cuda/__memory/align_up.h +56 -0
  289. cuda/cccl/headers/include/cuda/__memory/aligned_size.h +61 -0
  290. cuda/cccl/headers/include/cuda/__memory/check_address.h +111 -0
  291. cuda/cccl/headers/include/cuda/__memory/discard_memory.h +64 -0
  292. cuda/cccl/headers/include/cuda/__memory/get_device_address.h +58 -0
  293. cuda/cccl/headers/include/cuda/__memory/is_aligned.h +47 -0
  294. cuda/cccl/headers/include/cuda/__memory/ptr_in_range.h +93 -0
  295. cuda/cccl/headers/include/cuda/__memory/ptr_rebind.h +75 -0
  296. cuda/cccl/headers/include/cuda/__memory/ranges_overlap.h +126 -0
  297. cuda/cccl/headers/include/cuda/__memory_resource/any_resource.h +898 -0
  298. cuda/cccl/headers/include/cuda/__memory_resource/device_memory_pool.h +149 -0
  299. cuda/cccl/headers/include/cuda/__memory_resource/get_memory_resource.h +82 -0
  300. cuda/cccl/headers/include/cuda/__memory_resource/get_property.h +153 -0
  301. cuda/cccl/headers/include/cuda/__memory_resource/legacy_managed_memory_resource.h +148 -0
  302. cuda/cccl/headers/include/cuda/__memory_resource/legacy_pinned_memory_resource.h +139 -0
  303. cuda/cccl/headers/include/cuda/__memory_resource/managed_memory_pool.h +146 -0
  304. cuda/cccl/headers/include/cuda/__memory_resource/memory_resource_base.h +578 -0
  305. cuda/cccl/headers/include/cuda/__memory_resource/pinned_memory_pool.h +188 -0
  306. cuda/cccl/headers/include/cuda/__memory_resource/properties.h +113 -0
  307. cuda/cccl/headers/include/cuda/__memory_resource/resource.h +159 -0
  308. cuda/cccl/headers/include/cuda/__numeric/add_overflow.h +316 -0
  309. cuda/cccl/headers/include/cuda/__numeric/div_overflow.h +150 -0
  310. cuda/cccl/headers/include/cuda/__numeric/narrow.h +108 -0
  311. cuda/cccl/headers/include/cuda/__numeric/overflow_cast.h +59 -0
  312. cuda/cccl/headers/include/cuda/__numeric/overflow_result.h +43 -0
  313. cuda/cccl/headers/include/cuda/__numeric/sub_overflow.h +344 -0
  314. cuda/cccl/headers/include/cuda/__nvtx/nvtx.h +120 -0
  315. cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +2977 -0
  316. cuda/cccl/headers/include/cuda/__ptx/instructions/barrier_cluster.h +43 -0
  317. cuda/cccl/headers/include/cuda/__ptx/instructions/bfind.h +41 -0
  318. cuda/cccl/headers/include/cuda/__ptx/instructions/bmsk.h +41 -0
  319. cuda/cccl/headers/include/cuda/__ptx/instructions/clusterlaunchcontrol.h +41 -0
  320. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk.h +44 -0
  321. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_commit_group.h +43 -0
  322. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h +45 -0
  323. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_wait_group.h +43 -0
  324. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_mbarrier_arrive.h +42 -0
  325. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_reduce_async_bulk.h +60 -0
  326. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_reduce_async_bulk_tensor.h +43 -0
  327. cuda/cccl/headers/include/cuda/__ptx/instructions/elect_sync.h +41 -0
  328. cuda/cccl/headers/include/cuda/__ptx/instructions/exit.h +41 -0
  329. cuda/cccl/headers/include/cuda/__ptx/instructions/fence.h +49 -0
  330. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/barrier_cluster.h +115 -0
  331. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/bfind.h +190 -0
  332. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/bmsk.h +54 -0
  333. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/clusterlaunchcontrol.h +242 -0
  334. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk.h +197 -0
  335. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.h +25 -0
  336. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.h +54 -0
  337. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.h +997 -0
  338. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_gather_scatter.h +318 -0
  339. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.h +671 -0
  340. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.h +46 -0
  341. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive.h +26 -0
  342. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive_noinc.h +26 -0
  343. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.h +1470 -0
  344. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.h +132 -0
  345. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.h +132 -0
  346. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.h +601 -0
  347. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/elect_sync.h +36 -0
  348. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/exit.h +25 -0
  349. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence.h +208 -0
  350. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.h +31 -0
  351. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_alias.h +25 -0
  352. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_async.h +58 -0
  353. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_async_generic_sync_restrict.h +64 -0
  354. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.h +102 -0
  355. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_sync_restrict.h +64 -0
  356. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/get_sreg.h +949 -0
  357. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/getctarank.h +32 -0
  358. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/ld.h +5542 -0
  359. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive.h +399 -0
  360. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.h +184 -0
  361. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.h +34 -0
  362. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_expect_tx.h +102 -0
  363. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_init.h +27 -0
  364. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.h +143 -0
  365. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.h +144 -0
  366. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.h +286 -0
  367. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.h +290 -0
  368. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_ld_reduce.h +2202 -0
  369. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_red.h +1362 -0
  370. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_st.h +236 -0
  371. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/prmt.h +230 -0
  372. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/red_async.h +460 -0
  373. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/shl.h +96 -0
  374. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/shr.h +168 -0
  375. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st.h +1490 -0
  376. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st_async.h +123 -0
  377. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st_bulk.h +31 -0
  378. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_alloc.h +132 -0
  379. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_commit.h +99 -0
  380. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_cp.h +765 -0
  381. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_fence.h +58 -0
  382. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_ld.h +4927 -0
  383. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_mma.h +4291 -0
  384. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_mma_ws.h +7110 -0
  385. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_shift.h +42 -0
  386. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_st.h +5063 -0
  387. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_wait.h +56 -0
  388. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.h +71 -0
  389. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tensormap_replace.h +1030 -0
  390. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/trap.h +25 -0
  391. cuda/cccl/headers/include/cuda/__ptx/instructions/get_sreg.h +43 -0
  392. cuda/cccl/headers/include/cuda/__ptx/instructions/getctarank.h +43 -0
  393. cuda/cccl/headers/include/cuda/__ptx/instructions/ld.h +41 -0
  394. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_arrive.h +45 -0
  395. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_expect_tx.h +41 -0
  396. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_init.h +43 -0
  397. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_wait.h +46 -0
  398. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_ld_reduce.h +41 -0
  399. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_red.h +41 -0
  400. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_st.h +41 -0
  401. cuda/cccl/headers/include/cuda/__ptx/instructions/prmt.h +41 -0
  402. cuda/cccl/headers/include/cuda/__ptx/instructions/red_async.h +43 -0
  403. cuda/cccl/headers/include/cuda/__ptx/instructions/shfl_sync.h +244 -0
  404. cuda/cccl/headers/include/cuda/__ptx/instructions/shl.h +41 -0
  405. cuda/cccl/headers/include/cuda/__ptx/instructions/shr.h +41 -0
  406. cuda/cccl/headers/include/cuda/__ptx/instructions/st.h +41 -0
  407. cuda/cccl/headers/include/cuda/__ptx/instructions/st_async.h +43 -0
  408. cuda/cccl/headers/include/cuda/__ptx/instructions/st_bulk.h +41 -0
  409. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_alloc.h +41 -0
  410. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_commit.h +41 -0
  411. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_cp.h +41 -0
  412. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_fence.h +41 -0
  413. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_ld.h +41 -0
  414. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_mma.h +41 -0
  415. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_mma_ws.h +41 -0
  416. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_shift.h +41 -0
  417. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_st.h +41 -0
  418. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_wait.h +41 -0
  419. cuda/cccl/headers/include/cuda/__ptx/instructions/tensormap_cp_fenceproxy.h +43 -0
  420. cuda/cccl/headers/include/cuda/__ptx/instructions/tensormap_replace.h +43 -0
  421. cuda/cccl/headers/include/cuda/__ptx/instructions/trap.h +41 -0
  422. cuda/cccl/headers/include/cuda/__ptx/pragmas/enable_smem_spilling.h +47 -0
  423. cuda/cccl/headers/include/cuda/__ptx/ptx_dot_variants.h +230 -0
  424. cuda/cccl/headers/include/cuda/__ptx/ptx_helper_functions.h +176 -0
  425. cuda/cccl/headers/include/cuda/__random/feistel_bijection.h +105 -0
  426. cuda/cccl/headers/include/cuda/__random/random_bijection.h +88 -0
  427. cuda/cccl/headers/include/cuda/__runtime/api_wrapper.h +62 -0
  428. cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +99 -0
  429. cuda/cccl/headers/include/cuda/__runtime/types.h +41 -0
  430. cuda/cccl/headers/include/cuda/__semaphore/counting_semaphore.h +53 -0
  431. cuda/cccl/headers/include/cuda/__stream/get_stream.h +109 -0
  432. cuda/cccl/headers/include/cuda/__stream/internal_streams.h +44 -0
  433. cuda/cccl/headers/include/cuda/__stream/stream.h +141 -0
  434. cuda/cccl/headers/include/cuda/__stream/stream_ref.h +303 -0
  435. cuda/cccl/headers/include/cuda/__type_traits/is_floating_point.h +47 -0
  436. cuda/cccl/headers/include/cuda/__type_traits/is_specialization_of.h +37 -0
  437. cuda/cccl/headers/include/cuda/__utility/__basic_any/access.h +88 -0
  438. cuda/cccl/headers/include/cuda/__utility/__basic_any/any_cast.h +83 -0
  439. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_base.h +148 -0
  440. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_from.h +96 -0
  441. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_fwd.h +128 -0
  442. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_ptr.h +304 -0
  443. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_ref.h +337 -0
  444. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_value.h +591 -0
  445. cuda/cccl/headers/include/cuda/__utility/__basic_any/conversions.h +169 -0
  446. cuda/cccl/headers/include/cuda/__utility/__basic_any/dynamic_any_cast.h +107 -0
  447. cuda/cccl/headers/include/cuda/__utility/__basic_any/interfaces.h +359 -0
  448. cuda/cccl/headers/include/cuda/__utility/__basic_any/iset.h +142 -0
  449. cuda/cccl/headers/include/cuda/__utility/__basic_any/overrides.h +64 -0
  450. cuda/cccl/headers/include/cuda/__utility/__basic_any/rtti.h +257 -0
  451. cuda/cccl/headers/include/cuda/__utility/__basic_any/semiregular.h +323 -0
  452. cuda/cccl/headers/include/cuda/__utility/__basic_any/storage.h +79 -0
  453. cuda/cccl/headers/include/cuda/__utility/__basic_any/tagged_ptr.h +58 -0
  454. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtcall.h +163 -0
  455. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_functions.h +184 -0
  456. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_ptrs.h +80 -0
  457. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_tables.h +155 -0
  458. cuda/cccl/headers/include/cuda/__utility/basic_any.h +507 -0
  459. cuda/cccl/headers/include/cuda/__utility/immovable.h +50 -0
  460. cuda/cccl/headers/include/cuda/__utility/in_range.h +65 -0
  461. cuda/cccl/headers/include/cuda/__utility/inherit.h +36 -0
  462. cuda/cccl/headers/include/cuda/__utility/no_init.h +29 -0
  463. cuda/cccl/headers/include/cuda/__utility/static_for.h +79 -0
  464. cuda/cccl/headers/include/cuda/__warp/lane_mask.h +326 -0
  465. cuda/cccl/headers/include/cuda/__warp/warp_match_all.h +65 -0
  466. cuda/cccl/headers/include/cuda/__warp/warp_shuffle.h +251 -0
  467. cuda/cccl/headers/include/cuda/access_property +26 -0
  468. cuda/cccl/headers/include/cuda/algorithm +27 -0
  469. cuda/cccl/headers/include/cuda/annotated_ptr +29 -0
  470. cuda/cccl/headers/include/cuda/atomic +27 -0
  471. cuda/cccl/headers/include/cuda/barrier +293 -0
  472. cuda/cccl/headers/include/cuda/bit +29 -0
  473. cuda/cccl/headers/include/cuda/cmath +37 -0
  474. cuda/cccl/headers/include/cuda/devices +33 -0
  475. cuda/cccl/headers/include/cuda/discard_memory +32 -0
  476. cuda/cccl/headers/include/cuda/functional +32 -0
  477. cuda/cccl/headers/include/cuda/iterator +39 -0
  478. cuda/cccl/headers/include/cuda/latch +27 -0
  479. cuda/cccl/headers/include/cuda/mdspan +28 -0
  480. cuda/cccl/headers/include/cuda/memory +36 -0
  481. cuda/cccl/headers/include/cuda/memory_resource +40 -0
  482. cuda/cccl/headers/include/cuda/numeric +31 -0
  483. cuda/cccl/headers/include/cuda/pipeline +580 -0
  484. cuda/cccl/headers/include/cuda/ptx +129 -0
  485. cuda/cccl/headers/include/cuda/semaphore +31 -0
  486. cuda/cccl/headers/include/cuda/std/__algorithm/adjacent_find.h +59 -0
  487. cuda/cccl/headers/include/cuda/std/__algorithm/all_of.h +45 -0
  488. cuda/cccl/headers/include/cuda/std/__algorithm/any_of.h +45 -0
  489. cuda/cccl/headers/include/cuda/std/__algorithm/binary_search.h +53 -0
  490. cuda/cccl/headers/include/cuda/std/__algorithm/clamp.h +48 -0
  491. cuda/cccl/headers/include/cuda/std/__algorithm/comp.h +58 -0
  492. cuda/cccl/headers/include/cuda/std/__algorithm/comp_ref_type.h +85 -0
  493. cuda/cccl/headers/include/cuda/std/__algorithm/copy.h +142 -0
  494. cuda/cccl/headers/include/cuda/std/__algorithm/copy_backward.h +80 -0
  495. cuda/cccl/headers/include/cuda/std/__algorithm/copy_if.h +47 -0
  496. cuda/cccl/headers/include/cuda/std/__algorithm/copy_n.h +73 -0
  497. cuda/cccl/headers/include/cuda/std/__algorithm/count.h +49 -0
  498. cuda/cccl/headers/include/cuda/std/__algorithm/count_if.h +49 -0
  499. cuda/cccl/headers/include/cuda/std/__algorithm/equal.h +128 -0
  500. cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +101 -0
  501. cuda/cccl/headers/include/cuda/std/__algorithm/fill.h +58 -0
  502. cuda/cccl/headers/include/cuda/std/__algorithm/fill_n.h +51 -0
  503. cuda/cccl/headers/include/cuda/std/__algorithm/find.h +62 -0
  504. cuda/cccl/headers/include/cuda/std/__algorithm/find_end.h +225 -0
  505. cuda/cccl/headers/include/cuda/std/__algorithm/find_first_of.h +73 -0
  506. cuda/cccl/headers/include/cuda/std/__algorithm/find_if.h +46 -0
  507. cuda/cccl/headers/include/cuda/std/__algorithm/find_if_not.h +46 -0
  508. cuda/cccl/headers/include/cuda/std/__algorithm/for_each.h +42 -0
  509. cuda/cccl/headers/include/cuda/std/__algorithm/for_each_n.h +48 -0
  510. cuda/cccl/headers/include/cuda/std/__algorithm/generate.h +41 -0
  511. cuda/cccl/headers/include/cuda/std/__algorithm/generate_n.h +46 -0
  512. cuda/cccl/headers/include/cuda/std/__algorithm/half_positive.h +49 -0
  513. cuda/cccl/headers/include/cuda/std/__algorithm/in_fun_result.h +55 -0
  514. cuda/cccl/headers/include/cuda/std/__algorithm/includes.h +90 -0
  515. cuda/cccl/headers/include/cuda/std/__algorithm/is_heap.h +50 -0
  516. cuda/cccl/headers/include/cuda/std/__algorithm/is_heap_until.h +83 -0
  517. cuda/cccl/headers/include/cuda/std/__algorithm/is_partitioned.h +57 -0
  518. cuda/cccl/headers/include/cuda/std/__algorithm/is_permutation.h +252 -0
  519. cuda/cccl/headers/include/cuda/std/__algorithm/is_sorted.h +49 -0
  520. cuda/cccl/headers/include/cuda/std/__algorithm/is_sorted_until.h +68 -0
  521. cuda/cccl/headers/include/cuda/std/__algorithm/iter_swap.h +82 -0
  522. cuda/cccl/headers/include/cuda/std/__algorithm/iterator_operations.h +185 -0
  523. cuda/cccl/headers/include/cuda/std/__algorithm/lexicographical_compare.h +68 -0
  524. cuda/cccl/headers/include/cuda/std/__algorithm/lower_bound.h +82 -0
  525. cuda/cccl/headers/include/cuda/std/__algorithm/make_heap.h +70 -0
  526. cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +88 -0
  527. cuda/cccl/headers/include/cuda/std/__algorithm/max.h +62 -0
  528. cuda/cccl/headers/include/cuda/std/__algorithm/max_element.h +67 -0
  529. cuda/cccl/headers/include/cuda/std/__algorithm/merge.h +89 -0
  530. cuda/cccl/headers/include/cuda/std/__algorithm/min.h +62 -0
  531. cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +87 -0
  532. cuda/cccl/headers/include/cuda/std/__algorithm/minmax.h +66 -0
  533. cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +139 -0
  534. cuda/cccl/headers/include/cuda/std/__algorithm/mismatch.h +83 -0
  535. cuda/cccl/headers/include/cuda/std/__algorithm/move.h +86 -0
  536. cuda/cccl/headers/include/cuda/std/__algorithm/move_backward.h +84 -0
  537. cuda/cccl/headers/include/cuda/std/__algorithm/next_permutation.h +88 -0
  538. cuda/cccl/headers/include/cuda/std/__algorithm/none_of.h +45 -0
  539. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort.h +102 -0
  540. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort_copy.h +122 -0
  541. cuda/cccl/headers/include/cuda/std/__algorithm/partition.h +120 -0
  542. cuda/cccl/headers/include/cuda/std/__algorithm/partition_copy.h +59 -0
  543. cuda/cccl/headers/include/cuda/std/__algorithm/partition_point.h +61 -0
  544. cuda/cccl/headers/include/cuda/std/__algorithm/pop_heap.h +93 -0
  545. cuda/cccl/headers/include/cuda/std/__algorithm/prev_permutation.h +88 -0
  546. cuda/cccl/headers/include/cuda/std/__algorithm/push_heap.h +100 -0
  547. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_for_each.h +84 -0
  548. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_for_each_n.h +68 -0
  549. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_iterator_concept.h +65 -0
  550. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_min.h +98 -0
  551. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_min_element.h +68 -0
  552. cuda/cccl/headers/include/cuda/std/__algorithm/remove.h +55 -0
  553. cuda/cccl/headers/include/cuda/std/__algorithm/remove_copy.h +47 -0
  554. cuda/cccl/headers/include/cuda/std/__algorithm/remove_copy_if.h +47 -0
  555. cuda/cccl/headers/include/cuda/std/__algorithm/remove_if.h +56 -0
  556. cuda/cccl/headers/include/cuda/std/__algorithm/replace.h +45 -0
  557. cuda/cccl/headers/include/cuda/std/__algorithm/replace_copy.h +54 -0
  558. cuda/cccl/headers/include/cuda/std/__algorithm/replace_copy_if.h +50 -0
  559. cuda/cccl/headers/include/cuda/std/__algorithm/replace_if.h +45 -0
  560. cuda/cccl/headers/include/cuda/std/__algorithm/reverse.h +81 -0
  561. cuda/cccl/headers/include/cuda/std/__algorithm/reverse_copy.h +43 -0
  562. cuda/cccl/headers/include/cuda/std/__algorithm/rotate.h +261 -0
  563. cuda/cccl/headers/include/cuda/std/__algorithm/rotate_copy.h +40 -0
  564. cuda/cccl/headers/include/cuda/std/__algorithm/search.h +185 -0
  565. cuda/cccl/headers/include/cuda/std/__algorithm/search_n.h +163 -0
  566. cuda/cccl/headers/include/cuda/std/__algorithm/set_difference.h +95 -0
  567. cuda/cccl/headers/include/cuda/std/__algorithm/set_intersection.h +122 -0
  568. cuda/cccl/headers/include/cuda/std/__algorithm/set_symmetric_difference.h +134 -0
  569. cuda/cccl/headers/include/cuda/std/__algorithm/set_union.h +128 -0
  570. cuda/cccl/headers/include/cuda/std/__algorithm/shift_left.h +84 -0
  571. cuda/cccl/headers/include/cuda/std/__algorithm/shift_right.h +144 -0
  572. cuda/cccl/headers/include/cuda/std/__algorithm/sift_down.h +139 -0
  573. cuda/cccl/headers/include/cuda/std/__algorithm/sort_heap.h +70 -0
  574. cuda/cccl/headers/include/cuda/std/__algorithm/swap_ranges.h +78 -0
  575. cuda/cccl/headers/include/cuda/std/__algorithm/transform.h +59 -0
  576. cuda/cccl/headers/include/cuda/std/__algorithm/unique.h +76 -0
  577. cuda/cccl/headers/include/cuda/std/__algorithm/unique_copy.h +153 -0
  578. cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_iter.h +95 -0
  579. cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_range.h +126 -0
  580. cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +83 -0
  581. cuda/cccl/headers/include/cuda/std/__algorithm_ +26 -0
  582. cuda/cccl/headers/include/cuda/std/__atomic/api/common.h +192 -0
  583. cuda/cccl/headers/include/cuda/std/__atomic/api/owned.h +136 -0
  584. cuda/cccl/headers/include/cuda/std/__atomic/api/reference.h +118 -0
  585. cuda/cccl/headers/include/cuda/std/__atomic/functions/common.h +58 -0
  586. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_local.h +208 -0
  587. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_derived.h +458 -0
  588. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated.h +4437 -0
  589. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h +184 -0
  590. cuda/cccl/headers/include/cuda/std/__atomic/functions/host.h +242 -0
  591. cuda/cccl/headers/include/cuda/std/__atomic/functions.h +33 -0
  592. cuda/cccl/headers/include/cuda/std/__atomic/order.h +159 -0
  593. cuda/cccl/headers/include/cuda/std/__atomic/platform/msvc_to_builtins.h +654 -0
  594. cuda/cccl/headers/include/cuda/std/__atomic/platform.h +93 -0
  595. cuda/cccl/headers/include/cuda/std/__atomic/scopes.h +105 -0
  596. cuda/cccl/headers/include/cuda/std/__atomic/types/base.h +249 -0
  597. cuda/cccl/headers/include/cuda/std/__atomic/types/common.h +104 -0
  598. cuda/cccl/headers/include/cuda/std/__atomic/types/locked.h +225 -0
  599. cuda/cccl/headers/include/cuda/std/__atomic/types/reference.h +72 -0
  600. cuda/cccl/headers/include/cuda/std/__atomic/types/small.h +238 -0
  601. cuda/cccl/headers/include/cuda/std/__atomic/types.h +51 -0
  602. cuda/cccl/headers/include/cuda/std/__atomic/wait/notify_wait.h +95 -0
  603. cuda/cccl/headers/include/cuda/std/__atomic/wait/polling.h +65 -0
  604. cuda/cccl/headers/include/cuda/std/__barrier/barrier.h +227 -0
  605. cuda/cccl/headers/include/cuda/std/__barrier/empty_completion.h +37 -0
  606. cuda/cccl/headers/include/cuda/std/__barrier/poll_tester.h +82 -0
  607. cuda/cccl/headers/include/cuda/std/__bit/bit_cast.h +76 -0
  608. cuda/cccl/headers/include/cuda/std/__bit/byteswap.h +209 -0
  609. cuda/cccl/headers/include/cuda/std/__bit/countl.h +174 -0
  610. cuda/cccl/headers/include/cuda/std/__bit/countr.h +185 -0
  611. cuda/cccl/headers/include/cuda/std/__bit/endian.h +39 -0
  612. cuda/cccl/headers/include/cuda/std/__bit/has_single_bit.h +43 -0
  613. cuda/cccl/headers/include/cuda/std/__bit/integral.h +126 -0
  614. cuda/cccl/headers/include/cuda/std/__bit/popcount.h +154 -0
  615. cuda/cccl/headers/include/cuda/std/__bit/reference.h +1272 -0
  616. cuda/cccl/headers/include/cuda/std/__bit/rotate.h +94 -0
  617. cuda/cccl/headers/include/cuda/std/__cccl/algorithm_wrapper.h +36 -0
  618. cuda/cccl/headers/include/cuda/std/__cccl/architecture.h +78 -0
  619. cuda/cccl/headers/include/cuda/std/__cccl/assert.h +161 -0
  620. cuda/cccl/headers/include/cuda/std/__cccl/attributes.h +206 -0
  621. cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +645 -0
  622. cuda/cccl/headers/include/cuda/std/__cccl/compiler.h +217 -0
  623. cuda/cccl/headers/include/cuda/std/__cccl/cuda_capabilities.h +51 -0
  624. cuda/cccl/headers/include/cuda/std/__cccl/cuda_toolkit.h +56 -0
  625. cuda/cccl/headers/include/cuda/std/__cccl/deprecated.h +88 -0
  626. cuda/cccl/headers/include/cuda/std/__cccl/diagnostic.h +131 -0
  627. cuda/cccl/headers/include/cuda/std/__cccl/dialect.h +130 -0
  628. cuda/cccl/headers/include/cuda/std/__cccl/epilogue.h +354 -0
  629. cuda/cccl/headers/include/cuda/std/__cccl/exceptions.h +36 -0
  630. cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +74 -0
  631. cuda/cccl/headers/include/cuda/std/__cccl/extended_data_types.h +160 -0
  632. cuda/cccl/headers/include/cuda/std/__cccl/host_std_lib.h +52 -0
  633. cuda/cccl/headers/include/cuda/std/__cccl/is_non_narrowing_convertible.h +71 -0
  634. cuda/cccl/headers/include/cuda/std/__cccl/memory_wrapper.h +36 -0
  635. cuda/cccl/headers/include/cuda/std/__cccl/numeric_wrapper.h +36 -0
  636. cuda/cccl/headers/include/cuda/std/__cccl/os.h +54 -0
  637. cuda/cccl/headers/include/cuda/std/__cccl/preprocessor.h +1286 -0
  638. cuda/cccl/headers/include/cuda/std/__cccl/prologue.h +289 -0
  639. cuda/cccl/headers/include/cuda/std/__cccl/ptx_isa.h +253 -0
  640. cuda/cccl/headers/include/cuda/std/__cccl/rtti.h +72 -0
  641. cuda/cccl/headers/include/cuda/std/__cccl/sequence_access.h +87 -0
  642. cuda/cccl/headers/include/cuda/std/__cccl/system_header.h +38 -0
  643. cuda/cccl/headers/include/cuda/std/__cccl/unreachable.h +31 -0
  644. cuda/cccl/headers/include/cuda/std/__cccl/version.h +26 -0
  645. cuda/cccl/headers/include/cuda/std/__cccl/visibility.h +171 -0
  646. cuda/cccl/headers/include/cuda/std/__charconv/chars_format.h +81 -0
  647. cuda/cccl/headers/include/cuda/std/__charconv/from_chars.h +154 -0
  648. cuda/cccl/headers/include/cuda/std/__charconv/from_chars_result.h +56 -0
  649. cuda/cccl/headers/include/cuda/std/__charconv/to_chars.h +148 -0
  650. cuda/cccl/headers/include/cuda/std/__charconv/to_chars_result.h +56 -0
  651. cuda/cccl/headers/include/cuda/std/__charconv_ +31 -0
  652. cuda/cccl/headers/include/cuda/std/__chrono/calendar.h +52 -0
  653. cuda/cccl/headers/include/cuda/std/__chrono/day.h +160 -0
  654. cuda/cccl/headers/include/cuda/std/__chrono/duration.h +499 -0
  655. cuda/cccl/headers/include/cuda/std/__chrono/file_clock.h +53 -0
  656. cuda/cccl/headers/include/cuda/std/__chrono/high_resolution_clock.h +44 -0
  657. cuda/cccl/headers/include/cuda/std/__chrono/month.h +185 -0
  658. cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +58 -0
  659. cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +78 -0
  660. cuda/cccl/headers/include/cuda/std/__chrono/time_point.h +255 -0
  661. cuda/cccl/headers/include/cuda/std/__chrono/year.h +184 -0
  662. cuda/cccl/headers/include/cuda/std/__cmath/abs.h +127 -0
  663. cuda/cccl/headers/include/cuda/std/__cmath/copysign.h +88 -0
  664. cuda/cccl/headers/include/cuda/std/__cmath/error_functions.h +204 -0
  665. cuda/cccl/headers/include/cuda/std/__cmath/exponential_functions.h +783 -0
  666. cuda/cccl/headers/include/cuda/std/__cmath/fdim.h +122 -0
  667. cuda/cccl/headers/include/cuda/std/__cmath/fma.h +129 -0
  668. cuda/cccl/headers/include/cuda/std/__cmath/fpclassify.h +230 -0
  669. cuda/cccl/headers/include/cuda/std/__cmath/gamma.h +204 -0
  670. cuda/cccl/headers/include/cuda/std/__cmath/hyperbolic_functions.h +285 -0
  671. cuda/cccl/headers/include/cuda/std/__cmath/hypot.h +220 -0
  672. cuda/cccl/headers/include/cuda/std/__cmath/inverse_hyperbolic_functions.h +285 -0
  673. cuda/cccl/headers/include/cuda/std/__cmath/inverse_trigonometric_functions.h +370 -0
  674. cuda/cccl/headers/include/cuda/std/__cmath/isfinite.h +166 -0
  675. cuda/cccl/headers/include/cuda/std/__cmath/isinf.h +204 -0
  676. cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +185 -0
  677. cuda/cccl/headers/include/cuda/std/__cmath/isnormal.h +138 -0
  678. cuda/cccl/headers/include/cuda/std/__cmath/lerp.h +101 -0
  679. cuda/cccl/headers/include/cuda/std/__cmath/logarithms.h +533 -0
  680. cuda/cccl/headers/include/cuda/std/__cmath/min_max.h +287 -0
  681. cuda/cccl/headers/include/cuda/std/__cmath/modulo.h +208 -0
  682. cuda/cccl/headers/include/cuda/std/__cmath/nan.h +54 -0
  683. cuda/cccl/headers/include/cuda/std/__cmath/remainder.h +210 -0
  684. cuda/cccl/headers/include/cuda/std/__cmath/roots.h +198 -0
  685. cuda/cccl/headers/include/cuda/std/__cmath/rounding_functions.h +983 -0
  686. cuda/cccl/headers/include/cuda/std/__cmath/signbit.h +56 -0
  687. cuda/cccl/headers/include/cuda/std/__cmath/traits.h +242 -0
  688. cuda/cccl/headers/include/cuda/std/__cmath/trigonometric_functions.h +327 -0
  689. cuda/cccl/headers/include/cuda/std/__complex/arg.h +84 -0
  690. cuda/cccl/headers/include/cuda/std/__complex/complex.h +669 -0
  691. cuda/cccl/headers/include/cuda/std/__complex/exponential_functions.h +411 -0
  692. cuda/cccl/headers/include/cuda/std/__complex/hyperbolic_functions.h +117 -0
  693. cuda/cccl/headers/include/cuda/std/__complex/inverse_hyperbolic_functions.h +216 -0
  694. cuda/cccl/headers/include/cuda/std/__complex/inverse_trigonometric_functions.h +131 -0
  695. cuda/cccl/headers/include/cuda/std/__complex/literals.h +86 -0
  696. cuda/cccl/headers/include/cuda/std/__complex/logarithms.h +303 -0
  697. cuda/cccl/headers/include/cuda/std/__complex/math.h +159 -0
  698. cuda/cccl/headers/include/cuda/std/__complex/nvbf16.h +323 -0
  699. cuda/cccl/headers/include/cuda/std/__complex/nvfp16.h +322 -0
  700. cuda/cccl/headers/include/cuda/std/__complex/roots.h +214 -0
  701. cuda/cccl/headers/include/cuda/std/__complex/trigonometric_functions.h +61 -0
  702. cuda/cccl/headers/include/cuda/std/__complex/tuple.h +107 -0
  703. cuda/cccl/headers/include/cuda/std/__complex/vector_support.h +130 -0
  704. cuda/cccl/headers/include/cuda/std/__concepts/arithmetic.h +56 -0
  705. cuda/cccl/headers/include/cuda/std/__concepts/assignable.h +64 -0
  706. cuda/cccl/headers/include/cuda/std/__concepts/boolean_testable.h +63 -0
  707. cuda/cccl/headers/include/cuda/std/__concepts/class_or_enum.h +45 -0
  708. cuda/cccl/headers/include/cuda/std/__concepts/common_reference_with.h +69 -0
  709. cuda/cccl/headers/include/cuda/std/__concepts/common_with.h +82 -0
  710. cuda/cccl/headers/include/cuda/std/__concepts/concept_macros.h +367 -0
  711. cuda/cccl/headers/include/cuda/std/__concepts/constructible.h +174 -0
  712. cuda/cccl/headers/include/cuda/std/__concepts/convertible_to.h +70 -0
  713. cuda/cccl/headers/include/cuda/std/__concepts/copyable.h +60 -0
  714. cuda/cccl/headers/include/cuda/std/__concepts/derived_from.h +56 -0
  715. cuda/cccl/headers/include/cuda/std/__concepts/destructible.h +76 -0
  716. cuda/cccl/headers/include/cuda/std/__concepts/different_from.h +38 -0
  717. cuda/cccl/headers/include/cuda/std/__concepts/equality_comparable.h +98 -0
  718. cuda/cccl/headers/include/cuda/std/__concepts/invocable.h +80 -0
  719. cuda/cccl/headers/include/cuda/std/__concepts/movable.h +58 -0
  720. cuda/cccl/headers/include/cuda/std/__concepts/predicate.h +54 -0
  721. cuda/cccl/headers/include/cuda/std/__concepts/regular.h +54 -0
  722. cuda/cccl/headers/include/cuda/std/__concepts/relation.h +77 -0
  723. cuda/cccl/headers/include/cuda/std/__concepts/same_as.h +39 -0
  724. cuda/cccl/headers/include/cuda/std/__concepts/semiregular.h +54 -0
  725. cuda/cccl/headers/include/cuda/std/__concepts/swappable.h +206 -0
  726. cuda/cccl/headers/include/cuda/std/__concepts/totally_ordered.h +101 -0
  727. cuda/cccl/headers/include/cuda/std/__cstddef/byte.h +113 -0
  728. cuda/cccl/headers/include/cuda/std/__cstddef/types.h +52 -0
  729. cuda/cccl/headers/include/cuda/std/__cstdlib/abs.h +57 -0
  730. cuda/cccl/headers/include/cuda/std/__cstdlib/aligned_alloc.h +66 -0
  731. cuda/cccl/headers/include/cuda/std/__cstdlib/div.h +96 -0
  732. cuda/cccl/headers/include/cuda/std/__cstdlib/malloc.h +70 -0
  733. cuda/cccl/headers/include/cuda/std/__cstring/memcpy.h +61 -0
  734. cuda/cccl/headers/include/cuda/std/__cstring/memset.h +46 -0
  735. cuda/cccl/headers/include/cuda/std/__exception/cuda_error.h +118 -0
  736. cuda/cccl/headers/include/cuda/std/__exception/exception_macros.h +93 -0
  737. cuda/cccl/headers/include/cuda/std/__exception/terminate.h +73 -0
  738. cuda/cccl/headers/include/cuda/std/__exception/throw_error.h +120 -0
  739. cuda/cccl/headers/include/cuda/std/__execution/env.h +455 -0
  740. cuda/cccl/headers/include/cuda/std/__execution/policy.h +88 -0
  741. cuda/cccl/headers/include/cuda/std/__expected/bad_expected_access.h +127 -0
  742. cuda/cccl/headers/include/cuda/std/__expected/expected.h +1941 -0
  743. cuda/cccl/headers/include/cuda/std/__expected/expected_base.h +1050 -0
  744. cuda/cccl/headers/include/cuda/std/__expected/unexpect.h +37 -0
  745. cuda/cccl/headers/include/cuda/std/__expected/unexpected.h +164 -0
  746. cuda/cccl/headers/include/cuda/std/__floating_point/arithmetic.h +56 -0
  747. cuda/cccl/headers/include/cuda/std/__floating_point/cast.h +812 -0
  748. cuda/cccl/headers/include/cuda/std/__floating_point/cccl_fp.h +125 -0
  749. cuda/cccl/headers/include/cuda/std/__floating_point/common_type.h +48 -0
  750. cuda/cccl/headers/include/cuda/std/__floating_point/constants.h +376 -0
  751. cuda/cccl/headers/include/cuda/std/__floating_point/conversion_rank_order.h +124 -0
  752. cuda/cccl/headers/include/cuda/std/__floating_point/cuda_fp_types.h +116 -0
  753. cuda/cccl/headers/include/cuda/std/__floating_point/decompose.h +69 -0
  754. cuda/cccl/headers/include/cuda/std/__floating_point/format.h +162 -0
  755. cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +40 -0
  756. cuda/cccl/headers/include/cuda/std/__floating_point/mask.h +78 -0
  757. cuda/cccl/headers/include/cuda/std/__floating_point/native_type.h +81 -0
  758. cuda/cccl/headers/include/cuda/std/__floating_point/overflow_handler.h +139 -0
  759. cuda/cccl/headers/include/cuda/std/__floating_point/properties.h +229 -0
  760. cuda/cccl/headers/include/cuda/std/__floating_point/storage.h +248 -0
  761. cuda/cccl/headers/include/cuda/std/__floating_point/traits.h +172 -0
  762. cuda/cccl/headers/include/cuda/std/__format/buffer.h +48 -0
  763. cuda/cccl/headers/include/cuda/std/__format/concepts.h +69 -0
  764. cuda/cccl/headers/include/cuda/std/__format/format_arg.h +282 -0
  765. cuda/cccl/headers/include/cuda/std/__format/format_arg_store.h +279 -0
  766. cuda/cccl/headers/include/cuda/std/__format/format_args.h +122 -0
  767. cuda/cccl/headers/include/cuda/std/__format/format_context.h +92 -0
  768. cuda/cccl/headers/include/cuda/std/__format/format_error.h +76 -0
  769. cuda/cccl/headers/include/cuda/std/__format/format_integral.h +237 -0
  770. cuda/cccl/headers/include/cuda/std/__format/format_parse_context.h +124 -0
  771. cuda/cccl/headers/include/cuda/std/__format/format_spec_parser.h +1230 -0
  772. cuda/cccl/headers/include/cuda/std/__format/formatter.h +59 -0
  773. cuda/cccl/headers/include/cuda/std/__format/formatters/bool.h +101 -0
  774. cuda/cccl/headers/include/cuda/std/__format/formatters/char.h +124 -0
  775. cuda/cccl/headers/include/cuda/std/__format/formatters/fp.h +101 -0
  776. cuda/cccl/headers/include/cuda/std/__format/formatters/int.h +174 -0
  777. cuda/cccl/headers/include/cuda/std/__format/formatters/ptr.h +104 -0
  778. cuda/cccl/headers/include/cuda/std/__format/formatters/str.h +178 -0
  779. cuda/cccl/headers/include/cuda/std/__format/output_utils.h +272 -0
  780. cuda/cccl/headers/include/cuda/std/__format/parse_arg_id.h +138 -0
  781. cuda/cccl/headers/include/cuda/std/__format_ +45 -0
  782. cuda/cccl/headers/include/cuda/std/__functional/binary_function.h +63 -0
  783. cuda/cccl/headers/include/cuda/std/__functional/binary_negate.h +65 -0
  784. cuda/cccl/headers/include/cuda/std/__functional/bind.h +334 -0
  785. cuda/cccl/headers/include/cuda/std/__functional/bind_back.h +80 -0
  786. cuda/cccl/headers/include/cuda/std/__functional/bind_front.h +73 -0
  787. cuda/cccl/headers/include/cuda/std/__functional/binder1st.h +74 -0
  788. cuda/cccl/headers/include/cuda/std/__functional/binder2nd.h +74 -0
  789. cuda/cccl/headers/include/cuda/std/__functional/compose.h +68 -0
  790. cuda/cccl/headers/include/cuda/std/__functional/default_searcher.h +75 -0
  791. cuda/cccl/headers/include/cuda/std/__functional/function.h +1271 -0
  792. cuda/cccl/headers/include/cuda/std/__functional/hash.h +649 -0
  793. cuda/cccl/headers/include/cuda/std/__functional/identity.h +57 -0
  794. cuda/cccl/headers/include/cuda/std/__functional/invoke.h +296 -0
  795. cuda/cccl/headers/include/cuda/std/__functional/is_transparent.h +41 -0
  796. cuda/cccl/headers/include/cuda/std/__functional/mem_fn.h +66 -0
  797. cuda/cccl/headers/include/cuda/std/__functional/mem_fun_ref.h +211 -0
  798. cuda/cccl/headers/include/cuda/std/__functional/not_fn.h +120 -0
  799. cuda/cccl/headers/include/cuda/std/__functional/operations.h +534 -0
  800. cuda/cccl/headers/include/cuda/std/__functional/perfect_forward.h +128 -0
  801. cuda/cccl/headers/include/cuda/std/__functional/pointer_to_binary_function.h +64 -0
  802. cuda/cccl/headers/include/cuda/std/__functional/pointer_to_unary_function.h +63 -0
  803. cuda/cccl/headers/include/cuda/std/__functional/ranges_operations.h +113 -0
  804. cuda/cccl/headers/include/cuda/std/__functional/reference_wrapper.h +113 -0
  805. cuda/cccl/headers/include/cuda/std/__functional/unary_function.h +62 -0
  806. cuda/cccl/headers/include/cuda/std/__functional/unary_negate.h +65 -0
  807. cuda/cccl/headers/include/cuda/std/__functional/unwrap_ref.h +56 -0
  808. cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +262 -0
  809. cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +53 -0
  810. cuda/cccl/headers/include/cuda/std/__fwd/array.h +42 -0
  811. cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +74 -0
  812. cuda/cccl/headers/include/cuda/std/__fwd/complex.h +75 -0
  813. cuda/cccl/headers/include/cuda/std/__fwd/expected.h +46 -0
  814. cuda/cccl/headers/include/cuda/std/__fwd/format.h +84 -0
  815. cuda/cccl/headers/include/cuda/std/__fwd/fp.h +37 -0
  816. cuda/cccl/headers/include/cuda/std/__fwd/get.h +122 -0
  817. cuda/cccl/headers/include/cuda/std/__fwd/hash.h +34 -0
  818. cuda/cccl/headers/include/cuda/std/__fwd/ios.h +123 -0
  819. cuda/cccl/headers/include/cuda/std/__fwd/iterator.h +43 -0
  820. cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +122 -0
  821. cuda/cccl/headers/include/cuda/std/__fwd/memory_resource.h +37 -0
  822. cuda/cccl/headers/include/cuda/std/__fwd/optional.h +39 -0
  823. cuda/cccl/headers/include/cuda/std/__fwd/pair.h +47 -0
  824. cuda/cccl/headers/include/cuda/std/__fwd/reference_wrapper.h +39 -0
  825. cuda/cccl/headers/include/cuda/std/__fwd/span.h +45 -0
  826. cuda/cccl/headers/include/cuda/std/__fwd/string.h +112 -0
  827. cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +91 -0
  828. cuda/cccl/headers/include/cuda/std/__fwd/subrange.h +55 -0
  829. cuda/cccl/headers/include/cuda/std/__fwd/tuple.h +37 -0
  830. cuda/cccl/headers/include/cuda/std/__fwd/unexpected.h +40 -0
  831. cuda/cccl/headers/include/cuda/std/__fwd/variant.h +51 -0
  832. cuda/cccl/headers/include/cuda/std/__internal/cpp_dialect.h +44 -0
  833. cuda/cccl/headers/include/cuda/std/__internal/features.h +86 -0
  834. cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +143 -0
  835. cuda/cccl/headers/include/cuda/std/__iterator/access.h +128 -0
  836. cuda/cccl/headers/include/cuda/std/__iterator/advance.h +228 -0
  837. cuda/cccl/headers/include/cuda/std/__iterator/back_insert_iterator.h +163 -0
  838. cuda/cccl/headers/include/cuda/std/__iterator/bounded_iter.h +253 -0
  839. cuda/cccl/headers/include/cuda/std/__iterator/concepts.h +645 -0
  840. cuda/cccl/headers/include/cuda/std/__iterator/counted_iterator.h +464 -0
  841. cuda/cccl/headers/include/cuda/std/__iterator/data.h +61 -0
  842. cuda/cccl/headers/include/cuda/std/__iterator/default_sentinel.h +36 -0
  843. cuda/cccl/headers/include/cuda/std/__iterator/distance.h +126 -0
  844. cuda/cccl/headers/include/cuda/std/__iterator/empty.h +53 -0
  845. cuda/cccl/headers/include/cuda/std/__iterator/erase_if_container.h +53 -0
  846. cuda/cccl/headers/include/cuda/std/__iterator/front_insert_iterator.h +99 -0
  847. cuda/cccl/headers/include/cuda/std/__iterator/incrementable_traits.h +143 -0
  848. cuda/cccl/headers/include/cuda/std/__iterator/indirectly_comparable.h +55 -0
  849. cuda/cccl/headers/include/cuda/std/__iterator/insert_iterator.h +107 -0
  850. cuda/cccl/headers/include/cuda/std/__iterator/istream_iterator.h +146 -0
  851. cuda/cccl/headers/include/cuda/std/__iterator/istreambuf_iterator.h +161 -0
  852. cuda/cccl/headers/include/cuda/std/__iterator/iter_move.h +161 -0
  853. cuda/cccl/headers/include/cuda/std/__iterator/iter_swap.h +181 -0
  854. cuda/cccl/headers/include/cuda/std/__iterator/iterator.h +44 -0
  855. cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +847 -0
  856. cuda/cccl/headers/include/cuda/std/__iterator/mergeable.h +72 -0
  857. cuda/cccl/headers/include/cuda/std/__iterator/move_iterator.h +432 -0
  858. cuda/cccl/headers/include/cuda/std/__iterator/move_sentinel.h +73 -0
  859. cuda/cccl/headers/include/cuda/std/__iterator/next.h +101 -0
  860. cuda/cccl/headers/include/cuda/std/__iterator/ostream_iterator.h +95 -0
  861. cuda/cccl/headers/include/cuda/std/__iterator/ostreambuf_iterator.h +100 -0
  862. cuda/cccl/headers/include/cuda/std/__iterator/permutable.h +54 -0
  863. cuda/cccl/headers/include/cuda/std/__iterator/prev.h +90 -0
  864. cuda/cccl/headers/include/cuda/std/__iterator/projected.h +61 -0
  865. cuda/cccl/headers/include/cuda/std/__iterator/readable_traits.h +156 -0
  866. cuda/cccl/headers/include/cuda/std/__iterator/reverse_access.h +142 -0
  867. cuda/cccl/headers/include/cuda/std/__iterator/reverse_iterator.h +366 -0
  868. cuda/cccl/headers/include/cuda/std/__iterator/size.h +69 -0
  869. cuda/cccl/headers/include/cuda/std/__iterator/sortable.h +55 -0
  870. cuda/cccl/headers/include/cuda/std/__iterator/unreachable_sentinel.h +84 -0
  871. cuda/cccl/headers/include/cuda/std/__iterator/wrap_iter.h +245 -0
  872. cuda/cccl/headers/include/cuda/std/__latch/latch.h +88 -0
  873. cuda/cccl/headers/include/cuda/std/__limits/numeric_limits.h +603 -0
  874. cuda/cccl/headers/include/cuda/std/__limits/numeric_limits_ext.h +753 -0
  875. cuda/cccl/headers/include/cuda/std/__linalg/conj_if_needed.h +77 -0
  876. cuda/cccl/headers/include/cuda/std/__linalg/conjugate_transposed.h +52 -0
  877. cuda/cccl/headers/include/cuda/std/__linalg/conjugated.h +137 -0
  878. cuda/cccl/headers/include/cuda/std/__linalg/scaled.h +128 -0
  879. cuda/cccl/headers/include/cuda/std/__linalg/transposed.h +316 -0
  880. cuda/cccl/headers/include/cuda/std/__mdspan/aligned_accessor.h +97 -0
  881. cuda/cccl/headers/include/cuda/std/__mdspan/concepts.h +137 -0
  882. cuda/cccl/headers/include/cuda/std/__mdspan/default_accessor.h +73 -0
  883. cuda/cccl/headers/include/cuda/std/__mdspan/empty_base.h +352 -0
  884. cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +753 -0
  885. cuda/cccl/headers/include/cuda/std/__mdspan/layout_left.h +314 -0
  886. cuda/cccl/headers/include/cuda/std/__mdspan/layout_right.h +307 -0
  887. cuda/cccl/headers/include/cuda/std/__mdspan/layout_stride.h +603 -0
  888. cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +512 -0
  889. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_extents.h +193 -0
  890. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_helper.h +189 -0
  891. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_mapping.h +344 -0
  892. cuda/cccl/headers/include/cuda/std/__memory/addressof.h +67 -0
  893. cuda/cccl/headers/include/cuda/std/__memory/align.h +67 -0
  894. cuda/cccl/headers/include/cuda/std/__memory/allocate_at_least.h +81 -0
  895. cuda/cccl/headers/include/cuda/std/__memory/allocation_guard.h +100 -0
  896. cuda/cccl/headers/include/cuda/std/__memory/allocator.h +320 -0
  897. cuda/cccl/headers/include/cuda/std/__memory/allocator_arg_t.h +85 -0
  898. cuda/cccl/headers/include/cuda/std/__memory/allocator_destructor.h +59 -0
  899. cuda/cccl/headers/include/cuda/std/__memory/allocator_traits.h +526 -0
  900. cuda/cccl/headers/include/cuda/std/__memory/assume_aligned.h +60 -0
  901. cuda/cccl/headers/include/cuda/std/__memory/builtin_new_allocator.h +87 -0
  902. cuda/cccl/headers/include/cuda/std/__memory/compressed_pair.h +225 -0
  903. cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +242 -0
  904. cuda/cccl/headers/include/cuda/std/__memory/destruct_n.h +91 -0
  905. cuda/cccl/headers/include/cuda/std/__memory/is_sufficiently_aligned.h +51 -0
  906. cuda/cccl/headers/include/cuda/std/__memory/pointer_traits.h +246 -0
  907. cuda/cccl/headers/include/cuda/std/__memory/runtime_assume_aligned.h +62 -0
  908. cuda/cccl/headers/include/cuda/std/__memory/temporary_buffer.h +92 -0
  909. cuda/cccl/headers/include/cuda/std/__memory/uninitialized_algorithms.h +679 -0
  910. cuda/cccl/headers/include/cuda/std/__memory/unique_ptr.h +765 -0
  911. cuda/cccl/headers/include/cuda/std/__memory/uses_allocator.h +59 -0
  912. cuda/cccl/headers/include/cuda/std/__memory/voidify.h +41 -0
  913. cuda/cccl/headers/include/cuda/std/__memory_ +34 -0
  914. cuda/cccl/headers/include/cuda/std/__new/allocate.h +131 -0
  915. cuda/cccl/headers/include/cuda/std/__new/bad_alloc.h +57 -0
  916. cuda/cccl/headers/include/cuda/std/__new/device_new.h +30 -0
  917. cuda/cccl/headers/include/cuda/std/__new/launder.h +53 -0
  918. cuda/cccl/headers/include/cuda/std/__new_ +30 -0
  919. cuda/cccl/headers/include/cuda/std/__numeric/accumulate.h +56 -0
  920. cuda/cccl/headers/include/cuda/std/__numeric/adjacent_difference.h +72 -0
  921. cuda/cccl/headers/include/cuda/std/__numeric/exclusive_scan.h +66 -0
  922. cuda/cccl/headers/include/cuda/std/__numeric/gcd_lcm.h +78 -0
  923. cuda/cccl/headers/include/cuda/std/__numeric/inclusive_scan.h +73 -0
  924. cuda/cccl/headers/include/cuda/std/__numeric/inner_product.h +62 -0
  925. cuda/cccl/headers/include/cuda/std/__numeric/iota.h +42 -0
  926. cuda/cccl/headers/include/cuda/std/__numeric/midpoint.h +97 -0
  927. cuda/cccl/headers/include/cuda/std/__numeric/partial_sum.h +69 -0
  928. cuda/cccl/headers/include/cuda/std/__numeric/reduce.h +60 -0
  929. cuda/cccl/headers/include/cuda/std/__numeric/transform_exclusive_scan.h +51 -0
  930. cuda/cccl/headers/include/cuda/std/__numeric/transform_inclusive_scan.h +65 -0
  931. cuda/cccl/headers/include/cuda/std/__numeric/transform_reduce.h +72 -0
  932. cuda/cccl/headers/include/cuda/std/__optional/bad_optional_access.h +74 -0
  933. cuda/cccl/headers/include/cuda/std/__optional/hash.h +53 -0
  934. cuda/cccl/headers/include/cuda/std/__optional/make_optional.h +61 -0
  935. cuda/cccl/headers/include/cuda/std/__optional/nullopt.h +43 -0
  936. cuda/cccl/headers/include/cuda/std/__optional/optional.h +860 -0
  937. cuda/cccl/headers/include/cuda/std/__optional/optional_base.h +433 -0
  938. cuda/cccl/headers/include/cuda/std/__optional/optional_ref.h +324 -0
  939. cuda/cccl/headers/include/cuda/std/__random/generate_canonical.h +56 -0
  940. cuda/cccl/headers/include/cuda/std/__random/is_seed_sequence.h +39 -0
  941. cuda/cccl/headers/include/cuda/std/__random/is_valid.h +106 -0
  942. cuda/cccl/headers/include/cuda/std/__random/linear_congruential_engine.h +398 -0
  943. cuda/cccl/headers/include/cuda/std/__random/philox_engine.h +562 -0
  944. cuda/cccl/headers/include/cuda/std/__random/seed_seq.h +204 -0
  945. cuda/cccl/headers/include/cuda/std/__random/uniform_int_distribution.h +335 -0
  946. cuda/cccl/headers/include/cuda/std/__random/uniform_real_distribution.h +183 -0
  947. cuda/cccl/headers/include/cuda/std/__random_ +31 -0
  948. cuda/cccl/headers/include/cuda/std/__ranges/access.h +303 -0
  949. cuda/cccl/headers/include/cuda/std/__ranges/all.h +98 -0
  950. cuda/cccl/headers/include/cuda/std/__ranges/compressed_movable_box.h +892 -0
  951. cuda/cccl/headers/include/cuda/std/__ranges/concepts.h +302 -0
  952. cuda/cccl/headers/include/cuda/std/__ranges/counted.h +90 -0
  953. cuda/cccl/headers/include/cuda/std/__ranges/dangling.h +54 -0
  954. cuda/cccl/headers/include/cuda/std/__ranges/data.h +136 -0
  955. cuda/cccl/headers/include/cuda/std/__ranges/empty.h +109 -0
  956. cuda/cccl/headers/include/cuda/std/__ranges/empty_view.h +77 -0
  957. cuda/cccl/headers/include/cuda/std/__ranges/enable_borrowed_range.h +41 -0
  958. cuda/cccl/headers/include/cuda/std/__ranges/enable_view.h +78 -0
  959. cuda/cccl/headers/include/cuda/std/__ranges/from_range.h +36 -0
  960. cuda/cccl/headers/include/cuda/std/__ranges/iota_view.h +266 -0
  961. cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +408 -0
  962. cuda/cccl/headers/include/cuda/std/__ranges/owning_view.h +162 -0
  963. cuda/cccl/headers/include/cuda/std/__ranges/range_adaptor.h +110 -0
  964. cuda/cccl/headers/include/cuda/std/__ranges/rbegin.h +175 -0
  965. cuda/cccl/headers/include/cuda/std/__ranges/ref_view.h +121 -0
  966. cuda/cccl/headers/include/cuda/std/__ranges/rend.h +182 -0
  967. cuda/cccl/headers/include/cuda/std/__ranges/repeat_view.h +345 -0
  968. cuda/cccl/headers/include/cuda/std/__ranges/single_view.h +155 -0
  969. cuda/cccl/headers/include/cuda/std/__ranges/size.h +201 -0
  970. cuda/cccl/headers/include/cuda/std/__ranges/subrange.h +513 -0
  971. cuda/cccl/headers/include/cuda/std/__ranges/take_view.h +476 -0
  972. cuda/cccl/headers/include/cuda/std/__ranges/take_while_view.h +259 -0
  973. cuda/cccl/headers/include/cuda/std/__ranges/transform_view.h +522 -0
  974. cuda/cccl/headers/include/cuda/std/__ranges/unwrap_end.h +53 -0
  975. cuda/cccl/headers/include/cuda/std/__ranges/view_interface.h +183 -0
  976. cuda/cccl/headers/include/cuda/std/__ranges/views.h +38 -0
  977. cuda/cccl/headers/include/cuda/std/__semaphore/atomic_semaphore.h +234 -0
  978. cuda/cccl/headers/include/cuda/std/__semaphore/counting_semaphore.h +51 -0
  979. cuda/cccl/headers/include/cuda/std/__string/char_traits.h +191 -0
  980. cuda/cccl/headers/include/cuda/std/__string/constexpr_c_functions.h +581 -0
  981. cuda/cccl/headers/include/cuda/std/__string/helper_functions.h +296 -0
  982. cuda/cccl/headers/include/cuda/std/__string/string_view.h +244 -0
  983. cuda/cccl/headers/include/cuda/std/__string_ +29 -0
  984. cuda/cccl/headers/include/cuda/std/__system_error/errc.h +51 -0
  985. cuda/cccl/headers/include/cuda/std/__system_error_ +26 -0
  986. cuda/cccl/headers/include/cuda/std/__thread/threading_support.h +106 -0
  987. cuda/cccl/headers/include/cuda/std/__thread/threading_support_cuda.h +47 -0
  988. cuda/cccl/headers/include/cuda/std/__thread/threading_support_external.h +41 -0
  989. cuda/cccl/headers/include/cuda/std/__thread/threading_support_pthread.h +143 -0
  990. cuda/cccl/headers/include/cuda/std/__thread/threading_support_win32.h +87 -0
  991. cuda/cccl/headers/include/cuda/std/__tuple_dir/apply.h +82 -0
  992. cuda/cccl/headers/include/cuda/std/__tuple_dir/get.h +122 -0
  993. cuda/cccl/headers/include/cuda/std/__tuple_dir/ignore.h +51 -0
  994. cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +120 -0
  995. cuda/cccl/headers/include/cuda/std/__tuple_dir/sfinae_helpers.h +100 -0
  996. cuda/cccl/headers/include/cuda/std/__tuple_dir/structured_bindings.h +212 -0
  997. cuda/cccl/headers/include/cuda/std/__tuple_dir/tie.h +55 -0
  998. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple.h +457 -0
  999. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_cat.h +158 -0
  1000. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_constraints.h +286 -0
  1001. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_element.h +77 -0
  1002. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_indices.h +44 -0
  1003. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_leaf.h +452 -0
  1004. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +83 -0
  1005. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +68 -0
  1006. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_size.h +79 -0
  1007. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_types.h +35 -0
  1008. cuda/cccl/headers/include/cuda/std/__tuple_dir/vector_types.h +290 -0
  1009. cuda/cccl/headers/include/cuda/std/__type_traits/add_const.h +40 -0
  1010. cuda/cccl/headers/include/cuda/std/__type_traits/add_cv.h +40 -0
  1011. cuda/cccl/headers/include/cuda/std/__type_traits/add_lvalue_reference.h +62 -0
  1012. cuda/cccl/headers/include/cuda/std/__type_traits/add_pointer.h +65 -0
  1013. cuda/cccl/headers/include/cuda/std/__type_traits/add_rvalue_reference.h +62 -0
  1014. cuda/cccl/headers/include/cuda/std/__type_traits/add_volatile.h +40 -0
  1015. cuda/cccl/headers/include/cuda/std/__type_traits/aligned_storage.h +149 -0
  1016. cuda/cccl/headers/include/cuda/std/__type_traits/aligned_union.h +62 -0
  1017. cuda/cccl/headers/include/cuda/std/__type_traits/alignment_of.h +41 -0
  1018. cuda/cccl/headers/include/cuda/std/__type_traits/always_false.h +35 -0
  1019. cuda/cccl/headers/include/cuda/std/__type_traits/can_extract_key.h +68 -0
  1020. cuda/cccl/headers/include/cuda/std/__type_traits/common_reference.h +262 -0
  1021. cuda/cccl/headers/include/cuda/std/__type_traits/common_type.h +173 -0
  1022. cuda/cccl/headers/include/cuda/std/__type_traits/conditional.h +65 -0
  1023. cuda/cccl/headers/include/cuda/std/__type_traits/conjunction.h +67 -0
  1024. cuda/cccl/headers/include/cuda/std/__type_traits/copy_cv.h +50 -0
  1025. cuda/cccl/headers/include/cuda/std/__type_traits/copy_cvref.h +148 -0
  1026. cuda/cccl/headers/include/cuda/std/__type_traits/decay.h +83 -0
  1027. cuda/cccl/headers/include/cuda/std/__type_traits/dependent_type.h +35 -0
  1028. cuda/cccl/headers/include/cuda/std/__type_traits/disjunction.h +77 -0
  1029. cuda/cccl/headers/include/cuda/std/__type_traits/enable_if.h +43 -0
  1030. cuda/cccl/headers/include/cuda/std/__type_traits/extent.h +68 -0
  1031. cuda/cccl/headers/include/cuda/std/__type_traits/fold.h +47 -0
  1032. cuda/cccl/headers/include/cuda/std/__type_traits/has_unique_object_representation.h +46 -0
  1033. cuda/cccl/headers/include/cuda/std/__type_traits/has_virtual_destructor.h +42 -0
  1034. cuda/cccl/headers/include/cuda/std/__type_traits/integral_constant.h +62 -0
  1035. cuda/cccl/headers/include/cuda/std/__type_traits/is_abstract.h +42 -0
  1036. cuda/cccl/headers/include/cuda/std/__type_traits/is_aggregate.h +42 -0
  1037. cuda/cccl/headers/include/cuda/std/__type_traits/is_allocator.h +46 -0
  1038. cuda/cccl/headers/include/cuda/std/__type_traits/is_arithmetic.h +42 -0
  1039. cuda/cccl/headers/include/cuda/std/__type_traits/is_array.h +62 -0
  1040. cuda/cccl/headers/include/cuda/std/__type_traits/is_assignable.h +78 -0
  1041. cuda/cccl/headers/include/cuda/std/__type_traits/is_base_of.h +42 -0
  1042. cuda/cccl/headers/include/cuda/std/__type_traits/is_bounded_array.h +44 -0
  1043. cuda/cccl/headers/include/cuda/std/__type_traits/is_callable.h +60 -0
  1044. cuda/cccl/headers/include/cuda/std/__type_traits/is_char_like_type.h +38 -0
  1045. cuda/cccl/headers/include/cuda/std/__type_traits/is_class.h +42 -0
  1046. cuda/cccl/headers/include/cuda/std/__type_traits/is_comparable.h +78 -0
  1047. cuda/cccl/headers/include/cuda/std/__type_traits/is_compound.h +58 -0
  1048. cuda/cccl/headers/include/cuda/std/__type_traits/is_const.h +56 -0
  1049. cuda/cccl/headers/include/cuda/std/__type_traits/is_constant_evaluated.h +51 -0
  1050. cuda/cccl/headers/include/cuda/std/__type_traits/is_constructible.h +174 -0
  1051. cuda/cccl/headers/include/cuda/std/__type_traits/is_convertible.h +211 -0
  1052. cuda/cccl/headers/include/cuda/std/__type_traits/is_copy_assignable.h +43 -0
  1053. cuda/cccl/headers/include/cuda/std/__type_traits/is_copy_constructible.h +43 -0
  1054. cuda/cccl/headers/include/cuda/std/__type_traits/is_core_convertible.h +47 -0
  1055. cuda/cccl/headers/include/cuda/std/__type_traits/is_corresponding_member.h +42 -0
  1056. cuda/cccl/headers/include/cuda/std/__type_traits/is_default_constructible.h +40 -0
  1057. cuda/cccl/headers/include/cuda/std/__type_traits/is_destructible.h +115 -0
  1058. cuda/cccl/headers/include/cuda/std/__type_traits/is_empty.h +42 -0
  1059. cuda/cccl/headers/include/cuda/std/__type_traits/is_enum.h +42 -0
  1060. cuda/cccl/headers/include/cuda/std/__type_traits/is_execution_policy.h +81 -0
  1061. cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_arithmetic.h +38 -0
  1062. cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_floating_point.h +79 -0
  1063. cuda/cccl/headers/include/cuda/std/__type_traits/is_final.h +42 -0
  1064. cuda/cccl/headers/include/cuda/std/__type_traits/is_floating_point.h +53 -0
  1065. cuda/cccl/headers/include/cuda/std/__type_traits/is_fully_bounded_array.h +47 -0
  1066. cuda/cccl/headers/include/cuda/std/__type_traits/is_function.h +61 -0
  1067. cuda/cccl/headers/include/cuda/std/__type_traits/is_fundamental.h +56 -0
  1068. cuda/cccl/headers/include/cuda/std/__type_traits/is_implicitly_default_constructible.h +57 -0
  1069. cuda/cccl/headers/include/cuda/std/__type_traits/is_integer.h +45 -0
  1070. cuda/cccl/headers/include/cuda/std/__type_traits/is_integral.h +123 -0
  1071. cuda/cccl/headers/include/cuda/std/__type_traits/is_layout_compatible.h +45 -0
  1072. cuda/cccl/headers/include/cuda/std/__type_traits/is_literal_type.h +42 -0
  1073. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_function_pointer.h +79 -0
  1074. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_object_pointer.h +57 -0
  1075. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_pointer.h +57 -0
  1076. cuda/cccl/headers/include/cuda/std/__type_traits/is_move_assignable.h +43 -0
  1077. cuda/cccl/headers/include/cuda/std/__type_traits/is_move_constructible.h +42 -0
  1078. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_assignable.h +70 -0
  1079. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_constructible.h +84 -0
  1080. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_convertible.h +59 -0
  1081. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_copy_assignable.h +60 -0
  1082. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_copy_constructible.h +43 -0
  1083. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_default_constructible.h +54 -0
  1084. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_destructible.h +82 -0
  1085. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_move_assignable.h +60 -0
  1086. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_move_constructible.h +42 -0
  1087. cuda/cccl/headers/include/cuda/std/__type_traits/is_null_pointer.h +43 -0
  1088. cuda/cccl/headers/include/cuda/std/__type_traits/is_object.h +57 -0
  1089. cuda/cccl/headers/include/cuda/std/__type_traits/is_one_of.h +37 -0
  1090. cuda/cccl/headers/include/cuda/std/__type_traits/is_pod.h +42 -0
  1091. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer.h +60 -0
  1092. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer_interconvertible_base_of.h +84 -0
  1093. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer_interconvertible_with_class.h +42 -0
  1094. cuda/cccl/headers/include/cuda/std/__type_traits/is_polymorphic.h +42 -0
  1095. cuda/cccl/headers/include/cuda/std/__type_traits/is_primary_template.h +121 -0
  1096. cuda/cccl/headers/include/cuda/std/__type_traits/is_reference.h +95 -0
  1097. cuda/cccl/headers/include/cuda/std/__type_traits/is_referenceable.h +55 -0
  1098. cuda/cccl/headers/include/cuda/std/__type_traits/is_same.h +88 -0
  1099. cuda/cccl/headers/include/cuda/std/__type_traits/is_scalar.h +60 -0
  1100. cuda/cccl/headers/include/cuda/std/__type_traits/is_scoped_enum.h +49 -0
  1101. cuda/cccl/headers/include/cuda/std/__type_traits/is_signed.h +65 -0
  1102. cuda/cccl/headers/include/cuda/std/__type_traits/is_signed_integer.h +59 -0
  1103. cuda/cccl/headers/include/cuda/std/__type_traits/is_standard_layout.h +42 -0
  1104. cuda/cccl/headers/include/cuda/std/__type_traits/is_swappable.h +200 -0
  1105. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivial.h +42 -0
  1106. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_assignable.h +43 -0
  1107. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_constructible.h +43 -0
  1108. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_assignable.h +46 -0
  1109. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_constructible.h +45 -0
  1110. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copyable.h +42 -0
  1111. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_default_constructible.h +42 -0
  1112. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_destructible.h +58 -0
  1113. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_assignable.h +45 -0
  1114. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_constructible.h +44 -0
  1115. cuda/cccl/headers/include/cuda/std/__type_traits/is_unbounded_array.h +43 -0
  1116. cuda/cccl/headers/include/cuda/std/__type_traits/is_union.h +42 -0
  1117. cuda/cccl/headers/include/cuda/std/__type_traits/is_unsigned.h +66 -0
  1118. cuda/cccl/headers/include/cuda/std/__type_traits/is_unsigned_integer.h +59 -0
  1119. cuda/cccl/headers/include/cuda/std/__type_traits/is_valid_expansion.h +41 -0
  1120. cuda/cccl/headers/include/cuda/std/__type_traits/is_void.h +55 -0
  1121. cuda/cccl/headers/include/cuda/std/__type_traits/is_volatile.h +56 -0
  1122. cuda/cccl/headers/include/cuda/std/__type_traits/lazy.h +35 -0
  1123. cuda/cccl/headers/include/cuda/std/__type_traits/make_const_lvalue_ref.h +36 -0
  1124. cuda/cccl/headers/include/cuda/std/__type_traits/make_nbit_int.h +107 -0
  1125. cuda/cccl/headers/include/cuda/std/__type_traits/make_signed.h +140 -0
  1126. cuda/cccl/headers/include/cuda/std/__type_traits/make_unsigned.h +151 -0
  1127. cuda/cccl/headers/include/cuda/std/__type_traits/maybe_const.h +36 -0
  1128. cuda/cccl/headers/include/cuda/std/__type_traits/nat.h +39 -0
  1129. cuda/cccl/headers/include/cuda/std/__type_traits/negation.h +44 -0
  1130. cuda/cccl/headers/include/cuda/std/__type_traits/num_bits.h +122 -0
  1131. cuda/cccl/headers/include/cuda/std/__type_traits/promote.h +163 -0
  1132. cuda/cccl/headers/include/cuda/std/__type_traits/rank.h +60 -0
  1133. cuda/cccl/headers/include/cuda/std/__type_traits/reference_constructs_from_temporary.h +57 -0
  1134. cuda/cccl/headers/include/cuda/std/__type_traits/reference_converts_from_temporary.h +56 -0
  1135. cuda/cccl/headers/include/cuda/std/__type_traits/remove_all_extents.h +66 -0
  1136. cuda/cccl/headers/include/cuda/std/__type_traits/remove_const.h +59 -0
  1137. cuda/cccl/headers/include/cuda/std/__type_traits/remove_const_ref.h +37 -0
  1138. cuda/cccl/headers/include/cuda/std/__type_traits/remove_cv.h +57 -0
  1139. cuda/cccl/headers/include/cuda/std/__type_traits/remove_cvref.h +57 -0
  1140. cuda/cccl/headers/include/cuda/std/__type_traits/remove_extent.h +65 -0
  1141. cuda/cccl/headers/include/cuda/std/__type_traits/remove_pointer.h +73 -0
  1142. cuda/cccl/headers/include/cuda/std/__type_traits/remove_reference.h +72 -0
  1143. cuda/cccl/headers/include/cuda/std/__type_traits/remove_volatile.h +58 -0
  1144. cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +47 -0
  1145. cuda/cccl/headers/include/cuda/std/__type_traits/type_identity.h +40 -0
  1146. cuda/cccl/headers/include/cuda/std/__type_traits/type_list.h +1067 -0
  1147. cuda/cccl/headers/include/cuda/std/__type_traits/type_set.h +131 -0
  1148. cuda/cccl/headers/include/cuda/std/__type_traits/underlying_type.h +52 -0
  1149. cuda/cccl/headers/include/cuda/std/__type_traits/void_t.h +34 -0
  1150. cuda/cccl/headers/include/cuda/std/__utility/as_const.h +52 -0
  1151. cuda/cccl/headers/include/cuda/std/__utility/auto_cast.h +34 -0
  1152. cuda/cccl/headers/include/cuda/std/__utility/cmp.h +116 -0
  1153. cuda/cccl/headers/include/cuda/std/__utility/convert_to_integral.h +101 -0
  1154. cuda/cccl/headers/include/cuda/std/__utility/declval.h +76 -0
  1155. cuda/cccl/headers/include/cuda/std/__utility/exception_guard.h +161 -0
  1156. cuda/cccl/headers/include/cuda/std/__utility/exchange.h +46 -0
  1157. cuda/cccl/headers/include/cuda/std/__utility/forward.h +59 -0
  1158. cuda/cccl/headers/include/cuda/std/__utility/forward_like.h +55 -0
  1159. cuda/cccl/headers/include/cuda/std/__utility/in_place.h +86 -0
  1160. cuda/cccl/headers/include/cuda/std/__utility/integer_sequence.h +249 -0
  1161. cuda/cccl/headers/include/cuda/std/__utility/monostate.h +99 -0
  1162. cuda/cccl/headers/include/cuda/std/__utility/move.h +74 -0
  1163. cuda/cccl/headers/include/cuda/std/__utility/pair.h +791 -0
  1164. cuda/cccl/headers/include/cuda/std/__utility/piecewise_construct.h +37 -0
  1165. cuda/cccl/headers/include/cuda/std/__utility/pod_tuple.h +527 -0
  1166. cuda/cccl/headers/include/cuda/std/__utility/priority_tag.h +40 -0
  1167. cuda/cccl/headers/include/cuda/std/__utility/rel_ops.h +61 -0
  1168. cuda/cccl/headers/include/cuda/std/__utility/swap.h +64 -0
  1169. cuda/cccl/headers/include/cuda/std/__utility/to_underlying.h +40 -0
  1170. cuda/cccl/headers/include/cuda/std/__utility/typeid.h +421 -0
  1171. cuda/cccl/headers/include/cuda/std/__utility/undefined.h +34 -0
  1172. cuda/cccl/headers/include/cuda/std/__utility/unreachable.h +37 -0
  1173. cuda/cccl/headers/include/cuda/std/__variant/bad_variant_access.h +74 -0
  1174. cuda/cccl/headers/include/cuda/std/__variant/comparison.h +207 -0
  1175. cuda/cccl/headers/include/cuda/std/__variant/get.h +192 -0
  1176. cuda/cccl/headers/include/cuda/std/__variant/hash.h +82 -0
  1177. cuda/cccl/headers/include/cuda/std/__variant/sfinae_helpers.h +89 -0
  1178. cuda/cccl/headers/include/cuda/std/__variant/variant.h +250 -0
  1179. cuda/cccl/headers/include/cuda/std/__variant/variant_access.h +70 -0
  1180. cuda/cccl/headers/include/cuda/std/__variant/variant_base.h +683 -0
  1181. cuda/cccl/headers/include/cuda/std/__variant/variant_constraints.h +135 -0
  1182. cuda/cccl/headers/include/cuda/std/__variant/variant_match.h +126 -0
  1183. cuda/cccl/headers/include/cuda/std/__variant/variant_traits.h +184 -0
  1184. cuda/cccl/headers/include/cuda/std/__variant/variant_visit.h +225 -0
  1185. cuda/cccl/headers/include/cuda/std/__variant/visit.h +148 -0
  1186. cuda/cccl/headers/include/cuda/std/array +518 -0
  1187. cuda/cccl/headers/include/cuda/std/atomic +810 -0
  1188. cuda/cccl/headers/include/cuda/std/barrier +42 -0
  1189. cuda/cccl/headers/include/cuda/std/bit +35 -0
  1190. cuda/cccl/headers/include/cuda/std/bitset +986 -0
  1191. cuda/cccl/headers/include/cuda/std/cassert +28 -0
  1192. cuda/cccl/headers/include/cuda/std/ccomplex +15 -0
  1193. cuda/cccl/headers/include/cuda/std/cfloat +59 -0
  1194. cuda/cccl/headers/include/cuda/std/chrono +26 -0
  1195. cuda/cccl/headers/include/cuda/std/climits +61 -0
  1196. cuda/cccl/headers/include/cuda/std/cmath +87 -0
  1197. cuda/cccl/headers/include/cuda/std/complex +50 -0
  1198. cuda/cccl/headers/include/cuda/std/concepts +48 -0
  1199. cuda/cccl/headers/include/cuda/std/cstddef +28 -0
  1200. cuda/cccl/headers/include/cuda/std/cstdint +178 -0
  1201. cuda/cccl/headers/include/cuda/std/cstdlib +30 -0
  1202. cuda/cccl/headers/include/cuda/std/cstring +110 -0
  1203. cuda/cccl/headers/include/cuda/std/ctime +154 -0
  1204. cuda/cccl/headers/include/cuda/std/detail/__config +45 -0
  1205. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +207 -0
  1206. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/algorithm +1718 -0
  1207. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/chrono +2506 -0
  1208. cuda/cccl/headers/include/cuda/std/execution +29 -0
  1209. cuda/cccl/headers/include/cuda/std/expected +30 -0
  1210. cuda/cccl/headers/include/cuda/std/functional +56 -0
  1211. cuda/cccl/headers/include/cuda/std/initializer_list +44 -0
  1212. cuda/cccl/headers/include/cuda/std/inplace_vector +2171 -0
  1213. cuda/cccl/headers/include/cuda/std/iterator +70 -0
  1214. cuda/cccl/headers/include/cuda/std/latch +34 -0
  1215. cuda/cccl/headers/include/cuda/std/limits +28 -0
  1216. cuda/cccl/headers/include/cuda/std/linalg +30 -0
  1217. cuda/cccl/headers/include/cuda/std/mdspan +38 -0
  1218. cuda/cccl/headers/include/cuda/std/memory +39 -0
  1219. cuda/cccl/headers/include/cuda/std/numbers +344 -0
  1220. cuda/cccl/headers/include/cuda/std/numeric +41 -0
  1221. cuda/cccl/headers/include/cuda/std/optional +31 -0
  1222. cuda/cccl/headers/include/cuda/std/ranges +69 -0
  1223. cuda/cccl/headers/include/cuda/std/ratio +416 -0
  1224. cuda/cccl/headers/include/cuda/std/semaphore +31 -0
  1225. cuda/cccl/headers/include/cuda/std/source_location +83 -0
  1226. cuda/cccl/headers/include/cuda/std/span +628 -0
  1227. cuda/cccl/headers/include/cuda/std/string_view +923 -0
  1228. cuda/cccl/headers/include/cuda/std/tuple +43 -0
  1229. cuda/cccl/headers/include/cuda/std/type_traits +176 -0
  1230. cuda/cccl/headers/include/cuda/std/utility +70 -0
  1231. cuda/cccl/headers/include/cuda/std/variant +32 -0
  1232. cuda/cccl/headers/include/cuda/std/version +240 -0
  1233. cuda/cccl/headers/include/cuda/stream +31 -0
  1234. cuda/cccl/headers/include/cuda/stream_ref +59 -0
  1235. cuda/cccl/headers/include/cuda/type_traits +27 -0
  1236. cuda/cccl/headers/include/cuda/utility +28 -0
  1237. cuda/cccl/headers/include/cuda/version +16 -0
  1238. cuda/cccl/headers/include/cuda/warp +28 -0
  1239. cuda/cccl/headers/include/cuda/work_stealing +26 -0
  1240. cuda/cccl/headers/include/nv/detail/__preprocessor +169 -0
  1241. cuda/cccl/headers/include/nv/detail/__target_macros +718 -0
  1242. cuda/cccl/headers/include/nv/target +236 -0
  1243. cuda/cccl/headers/include/thrust/addressof.h +22 -0
  1244. cuda/cccl/headers/include/thrust/adjacent_difference.h +254 -0
  1245. cuda/cccl/headers/include/thrust/advance.h +57 -0
  1246. cuda/cccl/headers/include/thrust/allocate_unique.h +299 -0
  1247. cuda/cccl/headers/include/thrust/binary_search.h +1910 -0
  1248. cuda/cccl/headers/include/thrust/complex.h +858 -0
  1249. cuda/cccl/headers/include/thrust/copy.h +506 -0
  1250. cuda/cccl/headers/include/thrust/count.h +245 -0
  1251. cuda/cccl/headers/include/thrust/detail/adjacent_difference.inl +108 -0
  1252. cuda/cccl/headers/include/thrust/detail/alignment.h +81 -0
  1253. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +624 -0
  1254. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +191 -0
  1255. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +95 -0
  1256. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +79 -0
  1257. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +76 -0
  1258. cuda/cccl/headers/include/thrust/detail/allocator/no_throw_allocator.h +74 -0
  1259. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +113 -0
  1260. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +114 -0
  1261. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.h +77 -0
  1262. cuda/cccl/headers/include/thrust/detail/allocator_aware_execution_policy.h +95 -0
  1263. cuda/cccl/headers/include/thrust/detail/binary_search.inl +537 -0
  1264. cuda/cccl/headers/include/thrust/detail/caching_allocator.h +47 -0
  1265. cuda/cccl/headers/include/thrust/detail/complex/arithmetic.h +250 -0
  1266. cuda/cccl/headers/include/thrust/detail/complex/c99math.h +58 -0
  1267. cuda/cccl/headers/include/thrust/detail/complex/catrig.h +869 -0
  1268. cuda/cccl/headers/include/thrust/detail/complex/catrigf.h +583 -0
  1269. cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +227 -0
  1270. cuda/cccl/headers/include/thrust/detail/complex/ccoshf.h +155 -0
  1271. cuda/cccl/headers/include/thrust/detail/complex/cexp.h +190 -0
  1272. cuda/cccl/headers/include/thrust/detail/complex/cexpf.h +167 -0
  1273. cuda/cccl/headers/include/thrust/detail/complex/clog.h +217 -0
  1274. cuda/cccl/headers/include/thrust/detail/complex/clogf.h +204 -0
  1275. cuda/cccl/headers/include/thrust/detail/complex/complex.inl +263 -0
  1276. cuda/cccl/headers/include/thrust/detail/complex/cpow.h +50 -0
  1277. cuda/cccl/headers/include/thrust/detail/complex/cproj.h +76 -0
  1278. cuda/cccl/headers/include/thrust/detail/complex/csinh.h +222 -0
  1279. cuda/cccl/headers/include/thrust/detail/complex/csinhf.h +162 -0
  1280. cuda/cccl/headers/include/thrust/detail/complex/csqrt.h +172 -0
  1281. cuda/cccl/headers/include/thrust/detail/complex/csqrtf.h +168 -0
  1282. cuda/cccl/headers/include/thrust/detail/complex/ctanh.h +202 -0
  1283. cuda/cccl/headers/include/thrust/detail/complex/ctanhf.h +127 -0
  1284. cuda/cccl/headers/include/thrust/detail/complex/math_private.h +132 -0
  1285. cuda/cccl/headers/include/thrust/detail/complex/stream.h +73 -0
  1286. cuda/cccl/headers/include/thrust/detail/config/compiler.h +38 -0
  1287. cuda/cccl/headers/include/thrust/detail/config/config.h +43 -0
  1288. cuda/cccl/headers/include/thrust/detail/config/cpp_dialect.h +78 -0
  1289. cuda/cccl/headers/include/thrust/detail/config/device_system.h +57 -0
  1290. cuda/cccl/headers/include/thrust/detail/config/host_system.h +50 -0
  1291. cuda/cccl/headers/include/thrust/detail/config/memory_resource.h +41 -0
  1292. cuda/cccl/headers/include/thrust/detail/config/namespace.h +161 -0
  1293. cuda/cccl/headers/include/thrust/detail/config/simple_defines.h +48 -0
  1294. cuda/cccl/headers/include/thrust/detail/config.h +36 -0
  1295. cuda/cccl/headers/include/thrust/detail/contiguous_storage.h +226 -0
  1296. cuda/cccl/headers/include/thrust/detail/contiguous_storage.inl +271 -0
  1297. cuda/cccl/headers/include/thrust/detail/copy.h +70 -0
  1298. cuda/cccl/headers/include/thrust/detail/copy.inl +139 -0
  1299. cuda/cccl/headers/include/thrust/detail/copy_if.h +62 -0
  1300. cuda/cccl/headers/include/thrust/detail/copy_if.inl +114 -0
  1301. cuda/cccl/headers/include/thrust/detail/count.h +55 -0
  1302. cuda/cccl/headers/include/thrust/detail/count.inl +101 -0
  1303. cuda/cccl/headers/include/thrust/detail/device_ptr.inl +48 -0
  1304. cuda/cccl/headers/include/thrust/detail/equal.inl +105 -0
  1305. cuda/cccl/headers/include/thrust/detail/event_error.h +160 -0
  1306. cuda/cccl/headers/include/thrust/detail/execute_with_allocator.h +79 -0
  1307. cuda/cccl/headers/include/thrust/detail/execute_with_allocator_fwd.h +61 -0
  1308. cuda/cccl/headers/include/thrust/detail/execution_policy.h +120 -0
  1309. cuda/cccl/headers/include/thrust/detail/extrema.inl +196 -0
  1310. cuda/cccl/headers/include/thrust/detail/fill.inl +98 -0
  1311. cuda/cccl/headers/include/thrust/detail/find.inl +125 -0
  1312. cuda/cccl/headers/include/thrust/detail/for_each.inl +96 -0
  1313. cuda/cccl/headers/include/thrust/detail/function.h +49 -0
  1314. cuda/cccl/headers/include/thrust/detail/functional/actor.h +211 -0
  1315. cuda/cccl/headers/include/thrust/detail/functional/operators.h +383 -0
  1316. cuda/cccl/headers/include/thrust/detail/gather.inl +185 -0
  1317. cuda/cccl/headers/include/thrust/detail/generate.inl +98 -0
  1318. cuda/cccl/headers/include/thrust/detail/get_iterator_value.h +60 -0
  1319. cuda/cccl/headers/include/thrust/detail/inner_product.inl +130 -0
  1320. cuda/cccl/headers/include/thrust/detail/internal_functional.h +329 -0
  1321. cuda/cccl/headers/include/thrust/detail/logical.inl +125 -0
  1322. cuda/cccl/headers/include/thrust/detail/malloc_and_free.h +89 -0
  1323. cuda/cccl/headers/include/thrust/detail/malloc_and_free_fwd.h +45 -0
  1324. cuda/cccl/headers/include/thrust/detail/memory_algorithms.h +209 -0
  1325. cuda/cccl/headers/include/thrust/detail/merge.inl +288 -0
  1326. cuda/cccl/headers/include/thrust/detail/mismatch.inl +106 -0
  1327. cuda/cccl/headers/include/thrust/detail/overlapped_copy.h +120 -0
  1328. cuda/cccl/headers/include/thrust/detail/partition.inl +390 -0
  1329. cuda/cccl/headers/include/thrust/detail/pointer.h +309 -0
  1330. cuda/cccl/headers/include/thrust/detail/preprocessor.h +652 -0
  1331. cuda/cccl/headers/include/thrust/detail/random_bijection.h +175 -0
  1332. cuda/cccl/headers/include/thrust/detail/range/head_flags.h +114 -0
  1333. cuda/cccl/headers/include/thrust/detail/range/tail_flags.h +128 -0
  1334. cuda/cccl/headers/include/thrust/detail/raw_pointer_cast.h +52 -0
  1335. cuda/cccl/headers/include/thrust/detail/raw_reference_cast.h +186 -0
  1336. cuda/cccl/headers/include/thrust/detail/reduce.inl +395 -0
  1337. cuda/cccl/headers/include/thrust/detail/reference.h +518 -0
  1338. cuda/cccl/headers/include/thrust/detail/reference_forward_declaration.h +35 -0
  1339. cuda/cccl/headers/include/thrust/detail/remove.inl +225 -0
  1340. cuda/cccl/headers/include/thrust/detail/replace.inl +243 -0
  1341. cuda/cccl/headers/include/thrust/detail/reverse.inl +100 -0
  1342. cuda/cccl/headers/include/thrust/detail/scan.inl +536 -0
  1343. cuda/cccl/headers/include/thrust/detail/scatter.inl +169 -0
  1344. cuda/cccl/headers/include/thrust/detail/seq.h +66 -0
  1345. cuda/cccl/headers/include/thrust/detail/sequence.inl +121 -0
  1346. cuda/cccl/headers/include/thrust/detail/set_operations.inl +993 -0
  1347. cuda/cccl/headers/include/thrust/detail/shuffle.inl +86 -0
  1348. cuda/cccl/headers/include/thrust/detail/sort.inl +385 -0
  1349. cuda/cccl/headers/include/thrust/detail/static_assert.h +56 -0
  1350. cuda/cccl/headers/include/thrust/detail/static_map.h +164 -0
  1351. cuda/cccl/headers/include/thrust/detail/swap_ranges.inl +77 -0
  1352. cuda/cccl/headers/include/thrust/detail/tabulate.inl +74 -0
  1353. cuda/cccl/headers/include/thrust/detail/temporary_array.h +149 -0
  1354. cuda/cccl/headers/include/thrust/detail/temporary_array.inl +119 -0
  1355. cuda/cccl/headers/include/thrust/detail/temporary_buffer.h +92 -0
  1356. cuda/cccl/headers/include/thrust/detail/transform_reduce.inl +81 -0
  1357. cuda/cccl/headers/include/thrust/detail/transform_scan.inl +173 -0
  1358. cuda/cccl/headers/include/thrust/detail/trivial_sequence.h +128 -0
  1359. cuda/cccl/headers/include/thrust/detail/tuple_meta_transform.h +59 -0
  1360. cuda/cccl/headers/include/thrust/detail/type_deduction.h +62 -0
  1361. cuda/cccl/headers/include/thrust/detail/type_traits/has_member_function.h +47 -0
  1362. cuda/cccl/headers/include/thrust/detail/type_traits/has_nested_type.h +43 -0
  1363. cuda/cccl/headers/include/thrust/detail/type_traits/is_call_possible.h +162 -0
  1364. cuda/cccl/headers/include/thrust/detail/type_traits/is_commutative.h +67 -0
  1365. cuda/cccl/headers/include/thrust/detail/type_traits/is_metafunction_defined.h +39 -0
  1366. cuda/cccl/headers/include/thrust/detail/type_traits/is_thrust_pointer.h +55 -0
  1367. cuda/cccl/headers/include/thrust/detail/type_traits/iterator/is_output_iterator.h +46 -0
  1368. cuda/cccl/headers/include/thrust/detail/type_traits/minimum_type.h +89 -0
  1369. cuda/cccl/headers/include/thrust/detail/type_traits/pointer_traits.h +328 -0
  1370. cuda/cccl/headers/include/thrust/detail/type_traits.h +136 -0
  1371. cuda/cccl/headers/include/thrust/detail/uninitialized_copy.inl +102 -0
  1372. cuda/cccl/headers/include/thrust/detail/uninitialized_fill.inl +98 -0
  1373. cuda/cccl/headers/include/thrust/detail/unique.inl +391 -0
  1374. cuda/cccl/headers/include/thrust/detail/use_default.h +34 -0
  1375. cuda/cccl/headers/include/thrust/detail/vector_base.h +611 -0
  1376. cuda/cccl/headers/include/thrust/detail/vector_base.inl +1208 -0
  1377. cuda/cccl/headers/include/thrust/device_allocator.h +134 -0
  1378. cuda/cccl/headers/include/thrust/device_delete.h +74 -0
  1379. cuda/cccl/headers/include/thrust/device_free.h +85 -0
  1380. cuda/cccl/headers/include/thrust/device_make_unique.h +56 -0
  1381. cuda/cccl/headers/include/thrust/device_malloc.h +84 -0
  1382. cuda/cccl/headers/include/thrust/device_malloc_allocator.h +190 -0
  1383. cuda/cccl/headers/include/thrust/device_new.h +112 -0
  1384. cuda/cccl/headers/include/thrust/device_new_allocator.h +179 -0
  1385. cuda/cccl/headers/include/thrust/device_ptr.h +196 -0
  1386. cuda/cccl/headers/include/thrust/device_reference.h +983 -0
  1387. cuda/cccl/headers/include/thrust/device_vector.h +576 -0
  1388. cuda/cccl/headers/include/thrust/distance.h +43 -0
  1389. cuda/cccl/headers/include/thrust/equal.h +247 -0
  1390. cuda/cccl/headers/include/thrust/execution_policy.h +252 -0
  1391. cuda/cccl/headers/include/thrust/extrema.h +657 -0
  1392. cuda/cccl/headers/include/thrust/fill.h +200 -0
  1393. cuda/cccl/headers/include/thrust/find.h +382 -0
  1394. cuda/cccl/headers/include/thrust/for_each.h +261 -0
  1395. cuda/cccl/headers/include/thrust/functional.h +393 -0
  1396. cuda/cccl/headers/include/thrust/gather.h +464 -0
  1397. cuda/cccl/headers/include/thrust/generate.h +193 -0
  1398. cuda/cccl/headers/include/thrust/host_vector.h +576 -0
  1399. cuda/cccl/headers/include/thrust/inner_product.h +264 -0
  1400. cuda/cccl/headers/include/thrust/iterator/constant_iterator.h +221 -0
  1401. cuda/cccl/headers/include/thrust/iterator/counting_iterator.h +335 -0
  1402. cuda/cccl/headers/include/thrust/iterator/detail/any_assign.h +48 -0
  1403. cuda/cccl/headers/include/thrust/iterator/detail/any_system_tag.h +43 -0
  1404. cuda/cccl/headers/include/thrust/iterator/detail/device_system_tag.h +43 -0
  1405. cuda/cccl/headers/include/thrust/iterator/detail/host_system_tag.h +42 -0
  1406. cuda/cccl/headers/include/thrust/iterator/detail/iterator_adaptor_base.h +80 -0
  1407. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_system.h +60 -0
  1408. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_traversal.h +65 -0
  1409. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h +56 -0
  1410. cuda/cccl/headers/include/thrust/iterator/detail/iterator_facade_category.h +181 -0
  1411. cuda/cccl/headers/include/thrust/iterator/detail/minimum_system.h +57 -0
  1412. cuda/cccl/headers/include/thrust/iterator/detail/normal_iterator.h +69 -0
  1413. cuda/cccl/headers/include/thrust/iterator/detail/retag.h +104 -0
  1414. cuda/cccl/headers/include/thrust/iterator/detail/tagged_iterator.h +80 -0
  1415. cuda/cccl/headers/include/thrust/iterator/detail/tuple_of_iterator_references.h +170 -0
  1416. cuda/cccl/headers/include/thrust/iterator/discard_iterator.h +163 -0
  1417. cuda/cccl/headers/include/thrust/iterator/iterator_adaptor.h +251 -0
  1418. cuda/cccl/headers/include/thrust/iterator/iterator_categories.h +211 -0
  1419. cuda/cccl/headers/include/thrust/iterator/iterator_facade.h +659 -0
  1420. cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +334 -0
  1421. cuda/cccl/headers/include/thrust/iterator/iterator_traversal_tags.h +64 -0
  1422. cuda/cccl/headers/include/thrust/iterator/offset_iterator.h +194 -0
  1423. cuda/cccl/headers/include/thrust/iterator/permutation_iterator.h +204 -0
  1424. cuda/cccl/headers/include/thrust/iterator/retag.h +72 -0
  1425. cuda/cccl/headers/include/thrust/iterator/reverse_iterator.h +51 -0
  1426. cuda/cccl/headers/include/thrust/iterator/shuffle_iterator.h +185 -0
  1427. cuda/cccl/headers/include/thrust/iterator/strided_iterator.h +152 -0
  1428. cuda/cccl/headers/include/thrust/iterator/tabulate_output_iterator.h +152 -0
  1429. cuda/cccl/headers/include/thrust/iterator/transform_input_output_iterator.h +225 -0
  1430. cuda/cccl/headers/include/thrust/iterator/transform_iterator.h +349 -0
  1431. cuda/cccl/headers/include/thrust/iterator/transform_output_iterator.h +190 -0
  1432. cuda/cccl/headers/include/thrust/iterator/zip_iterator.h +359 -0
  1433. cuda/cccl/headers/include/thrust/logical.h +290 -0
  1434. cuda/cccl/headers/include/thrust/memory.h +299 -0
  1435. cuda/cccl/headers/include/thrust/merge.h +725 -0
  1436. cuda/cccl/headers/include/thrust/mismatch.h +261 -0
  1437. cuda/cccl/headers/include/thrust/mr/allocator.h +227 -0
  1438. cuda/cccl/headers/include/thrust/mr/device_memory_resource.h +46 -0
  1439. cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +526 -0
  1440. cuda/cccl/headers/include/thrust/mr/disjoint_sync_pool.h +116 -0
  1441. cuda/cccl/headers/include/thrust/mr/disjoint_tls_pool.h +65 -0
  1442. cuda/cccl/headers/include/thrust/mr/fancy_pointer_resource.h +65 -0
  1443. cuda/cccl/headers/include/thrust/mr/host_memory_resource.h +42 -0
  1444. cuda/cccl/headers/include/thrust/mr/memory_resource.h +215 -0
  1445. cuda/cccl/headers/include/thrust/mr/new.h +98 -0
  1446. cuda/cccl/headers/include/thrust/mr/polymorphic_adaptor.h +61 -0
  1447. cuda/cccl/headers/include/thrust/mr/pool.h +526 -0
  1448. cuda/cccl/headers/include/thrust/mr/pool_options.h +172 -0
  1449. cuda/cccl/headers/include/thrust/mr/sync_pool.h +112 -0
  1450. cuda/cccl/headers/include/thrust/mr/tls_pool.h +62 -0
  1451. cuda/cccl/headers/include/thrust/mr/universal_memory_resource.h +29 -0
  1452. cuda/cccl/headers/include/thrust/mr/validator.h +54 -0
  1453. cuda/cccl/headers/include/thrust/pair.h +99 -0
  1454. cuda/cccl/headers/include/thrust/partition.h +1391 -0
  1455. cuda/cccl/headers/include/thrust/per_device_resource.h +110 -0
  1456. cuda/cccl/headers/include/thrust/random/detail/discard_block_engine.inl +182 -0
  1457. cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine.inl +153 -0
  1458. cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine_discard.h +97 -0
  1459. cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine.inl +149 -0
  1460. cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h +46 -0
  1461. cuda/cccl/headers/include/thrust/random/detail/mod.h +94 -0
  1462. cuda/cccl/headers/include/thrust/random/detail/normal_distribution.inl +185 -0
  1463. cuda/cccl/headers/include/thrust/random/detail/normal_distribution_base.h +155 -0
  1464. cuda/cccl/headers/include/thrust/random/detail/random_core_access.h +56 -0
  1465. cuda/cccl/headers/include/thrust/random/detail/subtract_with_carry_engine.inl +199 -0
  1466. cuda/cccl/headers/include/thrust/random/detail/uniform_int_distribution.inl +196 -0
  1467. cuda/cccl/headers/include/thrust/random/detail/uniform_real_distribution.inl +198 -0
  1468. cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine.inl +181 -0
  1469. cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine_max.h +187 -0
  1470. cuda/cccl/headers/include/thrust/random/discard_block_engine.h +238 -0
  1471. cuda/cccl/headers/include/thrust/random/linear_congruential_engine.h +287 -0
  1472. cuda/cccl/headers/include/thrust/random/linear_feedback_shift_engine.h +215 -0
  1473. cuda/cccl/headers/include/thrust/random/normal_distribution.h +255 -0
  1474. cuda/cccl/headers/include/thrust/random/subtract_with_carry_engine.h +245 -0
  1475. cuda/cccl/headers/include/thrust/random/uniform_int_distribution.h +259 -0
  1476. cuda/cccl/headers/include/thrust/random/uniform_real_distribution.h +256 -0
  1477. cuda/cccl/headers/include/thrust/random/xor_combine_engine.h +253 -0
  1478. cuda/cccl/headers/include/thrust/random.h +118 -0
  1479. cuda/cccl/headers/include/thrust/reduce.h +1113 -0
  1480. cuda/cccl/headers/include/thrust/remove.h +768 -0
  1481. cuda/cccl/headers/include/thrust/replace.h +826 -0
  1482. cuda/cccl/headers/include/thrust/reverse.h +215 -0
  1483. cuda/cccl/headers/include/thrust/scan.h +1671 -0
  1484. cuda/cccl/headers/include/thrust/scatter.h +446 -0
  1485. cuda/cccl/headers/include/thrust/sequence.h +277 -0
  1486. cuda/cccl/headers/include/thrust/set_operations.h +3026 -0
  1487. cuda/cccl/headers/include/thrust/shuffle.h +182 -0
  1488. cuda/cccl/headers/include/thrust/sort.h +1320 -0
  1489. cuda/cccl/headers/include/thrust/swap.h +147 -0
  1490. cuda/cccl/headers/include/thrust/system/cpp/detail/adjacent_difference.h +30 -0
  1491. cuda/cccl/headers/include/thrust/system/cpp/detail/assign_value.h +30 -0
  1492. cuda/cccl/headers/include/thrust/system/cpp/detail/binary_search.h +32 -0
  1493. cuda/cccl/headers/include/thrust/system/cpp/detail/copy.h +30 -0
  1494. cuda/cccl/headers/include/thrust/system/cpp/detail/copy_if.h +30 -0
  1495. cuda/cccl/headers/include/thrust/system/cpp/detail/count.h +29 -0
  1496. cuda/cccl/headers/include/thrust/system/cpp/detail/equal.h +29 -0
  1497. cuda/cccl/headers/include/thrust/system/cpp/detail/execution_policy.h +113 -0
  1498. cuda/cccl/headers/include/thrust/system/cpp/detail/extrema.h +30 -0
  1499. cuda/cccl/headers/include/thrust/system/cpp/detail/fill.h +29 -0
  1500. cuda/cccl/headers/include/thrust/system/cpp/detail/find.h +30 -0
  1501. cuda/cccl/headers/include/thrust/system/cpp/detail/for_each.h +30 -0
  1502. cuda/cccl/headers/include/thrust/system/cpp/detail/gather.h +29 -0
  1503. cuda/cccl/headers/include/thrust/system/cpp/detail/generate.h +29 -0
  1504. cuda/cccl/headers/include/thrust/system/cpp/detail/get_value.h +30 -0
  1505. cuda/cccl/headers/include/thrust/system/cpp/detail/inner_product.h +29 -0
  1506. cuda/cccl/headers/include/thrust/system/cpp/detail/iter_swap.h +30 -0
  1507. cuda/cccl/headers/include/thrust/system/cpp/detail/logical.h +29 -0
  1508. cuda/cccl/headers/include/thrust/system/cpp/detail/malloc_and_free.h +30 -0
  1509. cuda/cccl/headers/include/thrust/system/cpp/detail/memory.inl +55 -0
  1510. cuda/cccl/headers/include/thrust/system/cpp/detail/merge.h +30 -0
  1511. cuda/cccl/headers/include/thrust/system/cpp/detail/mismatch.h +29 -0
  1512. cuda/cccl/headers/include/thrust/system/cpp/detail/partition.h +30 -0
  1513. cuda/cccl/headers/include/thrust/system/cpp/detail/per_device_resource.h +29 -0
  1514. cuda/cccl/headers/include/thrust/system/cpp/detail/reduce.h +30 -0
  1515. cuda/cccl/headers/include/thrust/system/cpp/detail/reduce_by_key.h +30 -0
  1516. cuda/cccl/headers/include/thrust/system/cpp/detail/remove.h +30 -0
  1517. cuda/cccl/headers/include/thrust/system/cpp/detail/replace.h +29 -0
  1518. cuda/cccl/headers/include/thrust/system/cpp/detail/reverse.h +29 -0
  1519. cuda/cccl/headers/include/thrust/system/cpp/detail/scan.h +30 -0
  1520. cuda/cccl/headers/include/thrust/system/cpp/detail/scan_by_key.h +30 -0
  1521. cuda/cccl/headers/include/thrust/system/cpp/detail/scatter.h +29 -0
  1522. cuda/cccl/headers/include/thrust/system/cpp/detail/sequence.h +29 -0
  1523. cuda/cccl/headers/include/thrust/system/cpp/detail/set_operations.h +30 -0
  1524. cuda/cccl/headers/include/thrust/system/cpp/detail/sort.h +30 -0
  1525. cuda/cccl/headers/include/thrust/system/cpp/detail/swap_ranges.h +29 -0
  1526. cuda/cccl/headers/include/thrust/system/cpp/detail/tabulate.h +29 -0
  1527. cuda/cccl/headers/include/thrust/system/cpp/detail/temporary_buffer.h +29 -0
  1528. cuda/cccl/headers/include/thrust/system/cpp/detail/transform.h +29 -0
  1529. cuda/cccl/headers/include/thrust/system/cpp/detail/transform_reduce.h +29 -0
  1530. cuda/cccl/headers/include/thrust/system/cpp/detail/transform_scan.h +29 -0
  1531. cuda/cccl/headers/include/thrust/system/cpp/detail/uninitialized_copy.h +29 -0
  1532. cuda/cccl/headers/include/thrust/system/cpp/detail/uninitialized_fill.h +29 -0
  1533. cuda/cccl/headers/include/thrust/system/cpp/detail/unique.h +30 -0
  1534. cuda/cccl/headers/include/thrust/system/cpp/detail/unique_by_key.h +30 -0
  1535. cuda/cccl/headers/include/thrust/system/cpp/execution_policy.h +63 -0
  1536. cuda/cccl/headers/include/thrust/system/cpp/memory.h +105 -0
  1537. cuda/cccl/headers/include/thrust/system/cpp/memory_resource.h +70 -0
  1538. cuda/cccl/headers/include/thrust/system/cpp/pointer.h +118 -0
  1539. cuda/cccl/headers/include/thrust/system/cpp/vector.h +95 -0
  1540. cuda/cccl/headers/include/thrust/system/cuda/config.h +126 -0
  1541. cuda/cccl/headers/include/thrust/system/cuda/detail/adjacent_difference.h +215 -0
  1542. cuda/cccl/headers/include/thrust/system/cuda/detail/assign_value.h +124 -0
  1543. cuda/cccl/headers/include/thrust/system/cuda/detail/binary_search.h +29 -0
  1544. cuda/cccl/headers/include/thrust/system/cuda/detail/cdp_dispatch.h +72 -0
  1545. cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +272 -0
  1546. cuda/cccl/headers/include/thrust/system/cuda/detail/copy_if.h +251 -0
  1547. cuda/cccl/headers/include/thrust/system/cuda/detail/core/agent_launcher.h +282 -0
  1548. cuda/cccl/headers/include/thrust/system/cuda/detail/core/triple_chevron_launch.h +163 -0
  1549. cuda/cccl/headers/include/thrust/system/cuda/detail/core/util.h +586 -0
  1550. cuda/cccl/headers/include/thrust/system/cuda/detail/count.h +73 -0
  1551. cuda/cccl/headers/include/thrust/system/cuda/detail/cross_system.h +241 -0
  1552. cuda/cccl/headers/include/thrust/system/cuda/detail/dispatch.h +231 -0
  1553. cuda/cccl/headers/include/thrust/system/cuda/detail/equal.h +62 -0
  1554. cuda/cccl/headers/include/thrust/system/cuda/detail/error.inl +87 -0
  1555. cuda/cccl/headers/include/thrust/system/cuda/detail/execution_policy.h +266 -0
  1556. cuda/cccl/headers/include/thrust/system/cuda/detail/extrema.h +472 -0
  1557. cuda/cccl/headers/include/thrust/system/cuda/detail/fill.h +99 -0
  1558. cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +165 -0
  1559. cuda/cccl/headers/include/thrust/system/cuda/detail/for_each.h +82 -0
  1560. cuda/cccl/headers/include/thrust/system/cuda/detail/gather.h +89 -0
  1561. cuda/cccl/headers/include/thrust/system/cuda/detail/generate.h +58 -0
  1562. cuda/cccl/headers/include/thrust/system/cuda/detail/get_value.h +65 -0
  1563. cuda/cccl/headers/include/thrust/system/cuda/detail/inner_product.h +75 -0
  1564. cuda/cccl/headers/include/thrust/system/cuda/detail/iter_swap.h +79 -0
  1565. cuda/cccl/headers/include/thrust/system/cuda/detail/logical.h +29 -0
  1566. cuda/cccl/headers/include/thrust/system/cuda/detail/make_unsigned_special.h +55 -0
  1567. cuda/cccl/headers/include/thrust/system/cuda/detail/malloc_and_free.h +119 -0
  1568. cuda/cccl/headers/include/thrust/system/cuda/detail/memory.inl +55 -0
  1569. cuda/cccl/headers/include/thrust/system/cuda/detail/merge.h +204 -0
  1570. cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +88 -0
  1571. cuda/cccl/headers/include/thrust/system/cuda/detail/parallel_for.h +79 -0
  1572. cuda/cccl/headers/include/thrust/system/cuda/detail/partition.h +401 -0
  1573. cuda/cccl/headers/include/thrust/system/cuda/detail/per_device_resource.h +70 -0
  1574. cuda/cccl/headers/include/thrust/system/cuda/detail/reduce.h +780 -0
  1575. cuda/cccl/headers/include/thrust/system/cuda/detail/reduce_by_key.h +997 -0
  1576. cuda/cccl/headers/include/thrust/system/cuda/detail/remove.h +105 -0
  1577. cuda/cccl/headers/include/thrust/system/cuda/detail/replace.h +121 -0
  1578. cuda/cccl/headers/include/thrust/system/cuda/detail/reverse.h +83 -0
  1579. cuda/cccl/headers/include/thrust/system/cuda/detail/scan.h +338 -0
  1580. cuda/cccl/headers/include/thrust/system/cuda/detail/scan_by_key.h +411 -0
  1581. cuda/cccl/headers/include/thrust/system/cuda/detail/scatter.h +89 -0
  1582. cuda/cccl/headers/include/thrust/system/cuda/detail/sequence.h +29 -0
  1583. cuda/cccl/headers/include/thrust/system/cuda/detail/set_operations.h +1732 -0
  1584. cuda/cccl/headers/include/thrust/system/cuda/detail/sort.h +468 -0
  1585. cuda/cccl/headers/include/thrust/system/cuda/detail/swap_ranges.h +98 -0
  1586. cuda/cccl/headers/include/thrust/system/cuda/detail/tabulate.h +61 -0
  1587. cuda/cccl/headers/include/thrust/system/cuda/detail/temporary_buffer.h +130 -0
  1588. cuda/cccl/headers/include/thrust/system/cuda/detail/terminate.h +53 -0
  1589. cuda/cccl/headers/include/thrust/system/cuda/detail/transform.h +428 -0
  1590. cuda/cccl/headers/include/thrust/system/cuda/detail/transform_reduce.h +139 -0
  1591. cuda/cccl/headers/include/thrust/system/cuda/detail/transform_scan.h +117 -0
  1592. cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_copy.h +111 -0
  1593. cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_fill.h +100 -0
  1594. cuda/cccl/headers/include/thrust/system/cuda/detail/unique.h +286 -0
  1595. cuda/cccl/headers/include/thrust/system/cuda/detail/unique_by_key.h +306 -0
  1596. cuda/cccl/headers/include/thrust/system/cuda/detail/util.h +253 -0
  1597. cuda/cccl/headers/include/thrust/system/cuda/error.h +159 -0
  1598. cuda/cccl/headers/include/thrust/system/cuda/execution_policy.h +15 -0
  1599. cuda/cccl/headers/include/thrust/system/cuda/memory.h +118 -0
  1600. cuda/cccl/headers/include/thrust/system/cuda/memory_resource.h +115 -0
  1601. cuda/cccl/headers/include/thrust/system/cuda/pointer.h +155 -0
  1602. cuda/cccl/headers/include/thrust/system/cuda/vector.h +104 -0
  1603. cuda/cccl/headers/include/thrust/system/detail/bad_alloc.h +59 -0
  1604. cuda/cccl/headers/include/thrust/system/detail/errno.h +118 -0
  1605. cuda/cccl/headers/include/thrust/system/detail/error_category.inl +298 -0
  1606. cuda/cccl/headers/include/thrust/system/detail/error_code.inl +171 -0
  1607. cuda/cccl/headers/include/thrust/system/detail/error_condition.inl +119 -0
  1608. cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.h +51 -0
  1609. cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.inl +77 -0
  1610. cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.h +159 -0
  1611. cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.inl +381 -0
  1612. cuda/cccl/headers/include/thrust/system/detail/generic/copy.h +43 -0
  1613. cuda/cccl/headers/include/thrust/system/detail/generic/copy.inl +62 -0
  1614. cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.h +56 -0
  1615. cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.inl +143 -0
  1616. cuda/cccl/headers/include/thrust/system/detail/generic/count.h +46 -0
  1617. cuda/cccl/headers/include/thrust/system/detail/generic/count.inl +82 -0
  1618. cuda/cccl/headers/include/thrust/system/detail/generic/equal.h +47 -0
  1619. cuda/cccl/headers/include/thrust/system/detail/generic/equal.inl +58 -0
  1620. cuda/cccl/headers/include/thrust/system/detail/generic/extrema.h +64 -0
  1621. cuda/cccl/headers/include/thrust/system/detail/generic/extrema.inl +249 -0
  1622. cuda/cccl/headers/include/thrust/system/detail/generic/fill.h +52 -0
  1623. cuda/cccl/headers/include/thrust/system/detail/generic/find.h +47 -0
  1624. cuda/cccl/headers/include/thrust/system/detail/generic/find.inl +135 -0
  1625. cuda/cccl/headers/include/thrust/system/detail/generic/for_each.h +56 -0
  1626. cuda/cccl/headers/include/thrust/system/detail/generic/gather.h +71 -0
  1627. cuda/cccl/headers/include/thrust/system/detail/generic/gather.inl +94 -0
  1628. cuda/cccl/headers/include/thrust/system/detail/generic/generate.h +43 -0
  1629. cuda/cccl/headers/include/thrust/system/detail/generic/generate.inl +63 -0
  1630. cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.h +58 -0
  1631. cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.inl +70 -0
  1632. cuda/cccl/headers/include/thrust/system/detail/generic/logical.h +57 -0
  1633. cuda/cccl/headers/include/thrust/system/detail/generic/memory.h +62 -0
  1634. cuda/cccl/headers/include/thrust/system/detail/generic/memory.inl +83 -0
  1635. cuda/cccl/headers/include/thrust/system/detail/generic/merge.h +97 -0
  1636. cuda/cccl/headers/include/thrust/system/detail/generic/merge.inl +146 -0
  1637. cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.h +47 -0
  1638. cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.inl +66 -0
  1639. cuda/cccl/headers/include/thrust/system/detail/generic/partition.h +127 -0
  1640. cuda/cccl/headers/include/thrust/system/detail/generic/partition.inl +205 -0
  1641. cuda/cccl/headers/include/thrust/system/detail/generic/per_device_resource.h +41 -0
  1642. cuda/cccl/headers/include/thrust/system/detail/generic/reduce.h +69 -0
  1643. cuda/cccl/headers/include/thrust/system/detail/generic/reduce.inl +98 -0
  1644. cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.h +81 -0
  1645. cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.inl +183 -0
  1646. cuda/cccl/headers/include/thrust/system/detail/generic/remove.h +84 -0
  1647. cuda/cccl/headers/include/thrust/system/detail/generic/remove.inl +119 -0
  1648. cuda/cccl/headers/include/thrust/system/detail/generic/replace.h +93 -0
  1649. cuda/cccl/headers/include/thrust/system/detail/generic/replace.inl +172 -0
  1650. cuda/cccl/headers/include/thrust/system/detail/generic/reverse.h +46 -0
  1651. cuda/cccl/headers/include/thrust/system/detail/generic/reverse.inl +65 -0
  1652. cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.h +61 -0
  1653. cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.inl +124 -0
  1654. cuda/cccl/headers/include/thrust/system/detail/generic/scan.h +86 -0
  1655. cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.h +124 -0
  1656. cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.inl +229 -0
  1657. cuda/cccl/headers/include/thrust/system/detail/generic/scatter.h +71 -0
  1658. cuda/cccl/headers/include/thrust/system/detail/generic/scatter.inl +83 -0
  1659. cuda/cccl/headers/include/thrust/system/detail/generic/select_system.h +103 -0
  1660. cuda/cccl/headers/include/thrust/system/detail/generic/sequence.h +70 -0
  1661. cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.h +280 -0
  1662. cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.inl +474 -0
  1663. cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.h +52 -0
  1664. cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.inl +123 -0
  1665. cuda/cccl/headers/include/thrust/system/detail/generic/sort.h +111 -0
  1666. cuda/cccl/headers/include/thrust/system/detail/generic/sort.inl +173 -0
  1667. cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.h +42 -0
  1668. cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.inl +73 -0
  1669. cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.h +39 -0
  1670. cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.inl +52 -0
  1671. cuda/cccl/headers/include/thrust/system/detail/generic/tag.h +45 -0
  1672. cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.h +52 -0
  1673. cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.inl +80 -0
  1674. cuda/cccl/headers/include/thrust/system/detail/generic/transform.h +393 -0
  1675. cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.h +48 -0
  1676. cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.inl +54 -0
  1677. cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.h +78 -0
  1678. cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.inl +111 -0
  1679. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.h +43 -0
  1680. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.inl +164 -0
  1681. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.h +43 -0
  1682. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.inl +112 -0
  1683. cuda/cccl/headers/include/thrust/system/detail/generic/unique.h +69 -0
  1684. cuda/cccl/headers/include/thrust/system/detail/generic/unique.inl +111 -0
  1685. cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.h +79 -0
  1686. cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.inl +124 -0
  1687. cuda/cccl/headers/include/thrust/system/detail/internal/decompose.h +115 -0
  1688. cuda/cccl/headers/include/thrust/system/detail/sequential/adjacent_difference.h +68 -0
  1689. cuda/cccl/headers/include/thrust/system/detail/sequential/assign_value.h +40 -0
  1690. cuda/cccl/headers/include/thrust/system/detail/sequential/binary_search.h +134 -0
  1691. cuda/cccl/headers/include/thrust/system/detail/sequential/copy.h +120 -0
  1692. cuda/cccl/headers/include/thrust/system/detail/sequential/copy_backward.h +47 -0
  1693. cuda/cccl/headers/include/thrust/system/detail/sequential/copy_if.h +69 -0
  1694. cuda/cccl/headers/include/thrust/system/detail/sequential/count.h +29 -0
  1695. cuda/cccl/headers/include/thrust/system/detail/sequential/equal.h +29 -0
  1696. cuda/cccl/headers/include/thrust/system/detail/sequential/execution_policy.h +52 -0
  1697. cuda/cccl/headers/include/thrust/system/detail/sequential/extrema.h +108 -0
  1698. cuda/cccl/headers/include/thrust/system/detail/sequential/fill.h +29 -0
  1699. cuda/cccl/headers/include/thrust/system/detail/sequential/find.h +60 -0
  1700. cuda/cccl/headers/include/thrust/system/detail/sequential/for_each.h +72 -0
  1701. cuda/cccl/headers/include/thrust/system/detail/sequential/gather.h +29 -0
  1702. cuda/cccl/headers/include/thrust/system/detail/sequential/general_copy.h +120 -0
  1703. cuda/cccl/headers/include/thrust/system/detail/sequential/generate.h +29 -0
  1704. cuda/cccl/headers/include/thrust/system/detail/sequential/get_value.h +41 -0
  1705. cuda/cccl/headers/include/thrust/system/detail/sequential/inner_product.h +29 -0
  1706. cuda/cccl/headers/include/thrust/system/detail/sequential/insertion_sort.h +139 -0
  1707. cuda/cccl/headers/include/thrust/system/detail/sequential/iter_swap.h +43 -0
  1708. cuda/cccl/headers/include/thrust/system/detail/sequential/logical.h +29 -0
  1709. cuda/cccl/headers/include/thrust/system/detail/sequential/malloc_and_free.h +48 -0
  1710. cuda/cccl/headers/include/thrust/system/detail/sequential/merge.h +147 -0
  1711. cuda/cccl/headers/include/thrust/system/detail/sequential/mismatch.h +29 -0
  1712. cuda/cccl/headers/include/thrust/system/detail/sequential/partition.h +297 -0
  1713. cuda/cccl/headers/include/thrust/system/detail/sequential/per_device_resource.h +29 -0
  1714. cuda/cccl/headers/include/thrust/system/detail/sequential/reduce.h +62 -0
  1715. cuda/cccl/headers/include/thrust/system/detail/sequential/reduce_by_key.h +96 -0
  1716. cuda/cccl/headers/include/thrust/system/detail/sequential/remove.h +177 -0
  1717. cuda/cccl/headers/include/thrust/system/detail/sequential/replace.h +29 -0
  1718. cuda/cccl/headers/include/thrust/system/detail/sequential/reverse.h +29 -0
  1719. cuda/cccl/headers/include/thrust/system/detail/sequential/scan.h +152 -0
  1720. cuda/cccl/headers/include/thrust/system/detail/sequential/scan_by_key.h +143 -0
  1721. cuda/cccl/headers/include/thrust/system/detail/sequential/scatter.h +29 -0
  1722. cuda/cccl/headers/include/thrust/system/detail/sequential/sequence.h +29 -0
  1723. cuda/cccl/headers/include/thrust/system/detail/sequential/set_operations.h +204 -0
  1724. cuda/cccl/headers/include/thrust/system/detail/sequential/sort.h +120 -0
  1725. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.h +354 -0
  1726. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.h +121 -0
  1727. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.h +584 -0
  1728. cuda/cccl/headers/include/thrust/system/detail/sequential/swap_ranges.h +29 -0
  1729. cuda/cccl/headers/include/thrust/system/detail/sequential/tabulate.h +29 -0
  1730. cuda/cccl/headers/include/thrust/system/detail/sequential/temporary_buffer.h +29 -0
  1731. cuda/cccl/headers/include/thrust/system/detail/sequential/transform.h +29 -0
  1732. cuda/cccl/headers/include/thrust/system/detail/sequential/transform_reduce.h +29 -0
  1733. cuda/cccl/headers/include/thrust/system/detail/sequential/transform_scan.h +29 -0
  1734. cuda/cccl/headers/include/thrust/system/detail/sequential/trivial_copy.h +56 -0
  1735. cuda/cccl/headers/include/thrust/system/detail/sequential/uninitialized_copy.h +29 -0
  1736. cuda/cccl/headers/include/thrust/system/detail/sequential/uninitialized_fill.h +29 -0
  1737. cuda/cccl/headers/include/thrust/system/detail/sequential/unique.h +113 -0
  1738. cuda/cccl/headers/include/thrust/system/detail/sequential/unique_by_key.h +104 -0
  1739. cuda/cccl/headers/include/thrust/system/detail/system_error.inl +106 -0
  1740. cuda/cccl/headers/include/thrust/system/error_code.h +508 -0
  1741. cuda/cccl/headers/include/thrust/system/omp/detail/adjacent_difference.h +34 -0
  1742. cuda/cccl/headers/include/thrust/system/omp/detail/assign_value.h +17 -0
  1743. cuda/cccl/headers/include/thrust/system/omp/detail/binary_search.h +57 -0
  1744. cuda/cccl/headers/include/thrust/system/omp/detail/copy.h +61 -0
  1745. cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.h +39 -0
  1746. cuda/cccl/headers/include/thrust/system/omp/detail/count.h +17 -0
  1747. cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.h +48 -0
  1748. cuda/cccl/headers/include/thrust/system/omp/detail/equal.h +17 -0
  1749. cuda/cccl/headers/include/thrust/system/omp/detail/execution_policy.h +132 -0
  1750. cuda/cccl/headers/include/thrust/system/omp/detail/extrema.h +46 -0
  1751. cuda/cccl/headers/include/thrust/system/omp/detail/fill.h +17 -0
  1752. cuda/cccl/headers/include/thrust/system/omp/detail/find.h +33 -0
  1753. cuda/cccl/headers/include/thrust/system/omp/detail/for_each.h +73 -0
  1754. cuda/cccl/headers/include/thrust/system/omp/detail/gather.h +17 -0
  1755. cuda/cccl/headers/include/thrust/system/omp/detail/generate.h +17 -0
  1756. cuda/cccl/headers/include/thrust/system/omp/detail/get_value.h +17 -0
  1757. cuda/cccl/headers/include/thrust/system/omp/detail/inner_product.h +17 -0
  1758. cuda/cccl/headers/include/thrust/system/omp/detail/iter_swap.h +17 -0
  1759. cuda/cccl/headers/include/thrust/system/omp/detail/logical.h +17 -0
  1760. cuda/cccl/headers/include/thrust/system/omp/detail/malloc_and_free.h +17 -0
  1761. cuda/cccl/headers/include/thrust/system/omp/detail/merge.h +17 -0
  1762. cuda/cccl/headers/include/thrust/system/omp/detail/mismatch.h +17 -0
  1763. cuda/cccl/headers/include/thrust/system/omp/detail/partition.h +83 -0
  1764. cuda/cccl/headers/include/thrust/system/omp/detail/per_device_resource.h +16 -0
  1765. cuda/cccl/headers/include/thrust/system/omp/detail/pragma_omp.h +30 -0
  1766. cuda/cccl/headers/include/thrust/system/omp/detail/reduce.h +62 -0
  1767. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.h +49 -0
  1768. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.h +87 -0
  1769. cuda/cccl/headers/include/thrust/system/omp/detail/remove.h +67 -0
  1770. cuda/cccl/headers/include/thrust/system/omp/detail/replace.h +17 -0
  1771. cuda/cccl/headers/include/thrust/system/omp/detail/reverse.h +17 -0
  1772. cuda/cccl/headers/include/thrust/system/omp/detail/scan.h +189 -0
  1773. cuda/cccl/headers/include/thrust/system/omp/detail/scan_by_key.h +23 -0
  1774. cuda/cccl/headers/include/thrust/system/omp/detail/scatter.h +17 -0
  1775. cuda/cccl/headers/include/thrust/system/omp/detail/sequence.h +17 -0
  1776. cuda/cccl/headers/include/thrust/system/omp/detail/set_operations.h +17 -0
  1777. cuda/cccl/headers/include/thrust/system/omp/detail/sort.h +245 -0
  1778. cuda/cccl/headers/include/thrust/system/omp/detail/swap_ranges.h +17 -0
  1779. cuda/cccl/headers/include/thrust/system/omp/detail/tabulate.h +17 -0
  1780. cuda/cccl/headers/include/thrust/system/omp/detail/temporary_buffer.h +16 -0
  1781. cuda/cccl/headers/include/thrust/system/omp/detail/transform.h +17 -0
  1782. cuda/cccl/headers/include/thrust/system/omp/detail/transform_reduce.h +17 -0
  1783. cuda/cccl/headers/include/thrust/system/omp/detail/transform_scan.h +17 -0
  1784. cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_copy.h +17 -0
  1785. cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_fill.h +17 -0
  1786. cuda/cccl/headers/include/thrust/system/omp/detail/unique.h +51 -0
  1787. cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.h +55 -0
  1788. cuda/cccl/headers/include/thrust/system/omp/execution_policy.h +62 -0
  1789. cuda/cccl/headers/include/thrust/system/omp/memory.h +153 -0
  1790. cuda/cccl/headers/include/thrust/system/omp/memory_resource.h +71 -0
  1791. cuda/cccl/headers/include/thrust/system/omp/pointer.h +120 -0
  1792. cuda/cccl/headers/include/thrust/system/omp/vector.h +96 -0
  1793. cuda/cccl/headers/include/thrust/system/system_error.h +183 -0
  1794. cuda/cccl/headers/include/thrust/system/tbb/detail/adjacent_difference.h +33 -0
  1795. cuda/cccl/headers/include/thrust/system/tbb/detail/assign_value.h +17 -0
  1796. cuda/cccl/headers/include/thrust/system/tbb/detail/binary_search.h +17 -0
  1797. cuda/cccl/headers/include/thrust/system/tbb/detail/copy.h +59 -0
  1798. cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.h +114 -0
  1799. cuda/cccl/headers/include/thrust/system/tbb/detail/count.h +17 -0
  1800. cuda/cccl/headers/include/thrust/system/tbb/detail/equal.h +17 -0
  1801. cuda/cccl/headers/include/thrust/system/tbb/detail/execution_policy.h +113 -0
  1802. cuda/cccl/headers/include/thrust/system/tbb/detail/extrema.h +45 -0
  1803. cuda/cccl/headers/include/thrust/system/tbb/detail/fill.h +17 -0
  1804. cuda/cccl/headers/include/thrust/system/tbb/detail/find.h +28 -0
  1805. cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.h +70 -0
  1806. cuda/cccl/headers/include/thrust/system/tbb/detail/gather.h +17 -0
  1807. cuda/cccl/headers/include/thrust/system/tbb/detail/generate.h +17 -0
  1808. cuda/cccl/headers/include/thrust/system/tbb/detail/get_value.h +17 -0
  1809. cuda/cccl/headers/include/thrust/system/tbb/detail/inner_product.h +17 -0
  1810. cuda/cccl/headers/include/thrust/system/tbb/detail/iter_swap.h +17 -0
  1811. cuda/cccl/headers/include/thrust/system/tbb/detail/logical.h +17 -0
  1812. cuda/cccl/headers/include/thrust/system/tbb/detail/malloc_and_free.h +17 -0
  1813. cuda/cccl/headers/include/thrust/system/tbb/detail/merge.h +302 -0
  1814. cuda/cccl/headers/include/thrust/system/tbb/detail/mismatch.h +17 -0
  1815. cuda/cccl/headers/include/thrust/system/tbb/detail/partition.h +78 -0
  1816. cuda/cccl/headers/include/thrust/system/tbb/detail/per_device_resource.h +16 -0
  1817. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.h +120 -0
  1818. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.h +378 -0
  1819. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_intervals.h +118 -0
  1820. cuda/cccl/headers/include/thrust/system/tbb/detail/remove.h +66 -0
  1821. cuda/cccl/headers/include/thrust/system/tbb/detail/replace.h +17 -0
  1822. cuda/cccl/headers/include/thrust/system/tbb/detail/reverse.h +17 -0
  1823. cuda/cccl/headers/include/thrust/system/tbb/detail/scan.h +294 -0
  1824. cuda/cccl/headers/include/thrust/system/tbb/detail/scan_by_key.h +20 -0
  1825. cuda/cccl/headers/include/thrust/system/tbb/detail/scatter.h +17 -0
  1826. cuda/cccl/headers/include/thrust/system/tbb/detail/sequence.h +17 -0
  1827. cuda/cccl/headers/include/thrust/system/tbb/detail/set_operations.h +17 -0
  1828. cuda/cccl/headers/include/thrust/system/tbb/detail/sort.h +272 -0
  1829. cuda/cccl/headers/include/thrust/system/tbb/detail/swap_ranges.h +17 -0
  1830. cuda/cccl/headers/include/thrust/system/tbb/detail/tabulate.h +17 -0
  1831. cuda/cccl/headers/include/thrust/system/tbb/detail/temporary_buffer.h +16 -0
  1832. cuda/cccl/headers/include/thrust/system/tbb/detail/transform.h +17 -0
  1833. cuda/cccl/headers/include/thrust/system/tbb/detail/transform_reduce.h +17 -0
  1834. cuda/cccl/headers/include/thrust/system/tbb/detail/transform_scan.h +17 -0
  1835. cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_copy.h +17 -0
  1836. cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_fill.h +17 -0
  1837. cuda/cccl/headers/include/thrust/system/tbb/detail/unique.h +50 -0
  1838. cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.h +54 -0
  1839. cuda/cccl/headers/include/thrust/system/tbb/execution_policy.h +62 -0
  1840. cuda/cccl/headers/include/thrust/system/tbb/memory.h +139 -0
  1841. cuda/cccl/headers/include/thrust/system/tbb/memory_resource.h +57 -0
  1842. cuda/cccl/headers/include/thrust/system/tbb/pointer.h +106 -0
  1843. cuda/cccl/headers/include/thrust/system/tbb/vector.h +82 -0
  1844. cuda/cccl/headers/include/thrust/system_error.h +57 -0
  1845. cuda/cccl/headers/include/thrust/tabulate.h +125 -0
  1846. cuda/cccl/headers/include/thrust/transform.h +1056 -0
  1847. cuda/cccl/headers/include/thrust/transform_reduce.h +190 -0
  1848. cuda/cccl/headers/include/thrust/transform_scan.h +442 -0
  1849. cuda/cccl/headers/include/thrust/tuple.h +139 -0
  1850. cuda/cccl/headers/include/thrust/type_traits/integer_sequence.h +257 -0
  1851. cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +153 -0
  1852. cuda/cccl/headers/include/thrust/type_traits/is_execution_policy.h +65 -0
  1853. cuda/cccl/headers/include/thrust/type_traits/is_operator_less_or_greater_function_object.h +180 -0
  1854. cuda/cccl/headers/include/thrust/type_traits/is_operator_plus_function_object.h +112 -0
  1855. cuda/cccl/headers/include/thrust/type_traits/is_trivially_relocatable.h +332 -0
  1856. cuda/cccl/headers/include/thrust/type_traits/logical_metafunctions.h +42 -0
  1857. cuda/cccl/headers/include/thrust/type_traits/unwrap_contiguous_iterator.h +63 -0
  1858. cuda/cccl/headers/include/thrust/uninitialized_copy.h +300 -0
  1859. cuda/cccl/headers/include/thrust/uninitialized_fill.h +268 -0
  1860. cuda/cccl/headers/include/thrust/unique.h +1088 -0
  1861. cuda/cccl/headers/include/thrust/universal_allocator.h +101 -0
  1862. cuda/cccl/headers/include/thrust/universal_ptr.h +34 -0
  1863. cuda/cccl/headers/include/thrust/universal_vector.h +80 -0
  1864. cuda/cccl/headers/include/thrust/version.h +93 -0
  1865. cuda/cccl/headers/include/thrust/zip_function.h +150 -0
  1866. cuda/cccl/headers/include_paths.py +51 -0
  1867. cuda/cccl/parallel/__init__.py +9 -0
  1868. cuda/cccl/parallel/experimental/__init__.py +24 -0
  1869. cuda/cccl/py.typed +0 -0
  1870. cuda/compute/__init__.py +83 -0
  1871. cuda/compute/_bindings.py +79 -0
  1872. cuda/compute/_bindings.pyi +498 -0
  1873. cuda/compute/_bindings_impl.pyx +2415 -0
  1874. cuda/compute/_caching.py +71 -0
  1875. cuda/compute/_cccl_interop.py +422 -0
  1876. cuda/compute/_utils/__init__.py +0 -0
  1877. cuda/compute/_utils/protocols.py +132 -0
  1878. cuda/compute/_utils/temp_storage_buffer.py +86 -0
  1879. cuda/compute/algorithms/__init__.py +58 -0
  1880. cuda/compute/algorithms/_histogram.py +243 -0
  1881. cuda/compute/algorithms/_reduce.py +182 -0
  1882. cuda/compute/algorithms/_scan.py +331 -0
  1883. cuda/compute/algorithms/_segmented_reduce.py +257 -0
  1884. cuda/compute/algorithms/_sort/__init__.py +23 -0
  1885. cuda/compute/algorithms/_sort/_merge_sort.py +225 -0
  1886. cuda/compute/algorithms/_sort/_radix_sort.py +263 -0
  1887. cuda/compute/algorithms/_sort/_segmented_sort.py +288 -0
  1888. cuda/compute/algorithms/_sort/_sort_common.py +52 -0
  1889. cuda/compute/algorithms/_three_way_partition.py +261 -0
  1890. cuda/compute/algorithms/_transform.py +329 -0
  1891. cuda/compute/algorithms/_unique_by_key.py +252 -0
  1892. cuda/compute/cccl/.gitkeep +0 -0
  1893. cuda/compute/cu12/_bindings_impl.cpython-311-aarch64-linux-gnu.so +0 -0
  1894. cuda/compute/cu12/cccl/libcccl.c.parallel.so +0 -0
  1895. cuda/compute/cu13/_bindings_impl.cpython-311-aarch64-linux-gnu.so +0 -0
  1896. cuda/compute/cu13/cccl/libcccl.c.parallel.so +0 -0
  1897. cuda/compute/iterators/__init__.py +21 -0
  1898. cuda/compute/iterators/_factories.py +219 -0
  1899. cuda/compute/iterators/_iterators.py +817 -0
  1900. cuda/compute/iterators/_zip_iterator.py +199 -0
  1901. cuda/compute/numba_utils.py +53 -0
  1902. cuda/compute/op.py +3 -0
  1903. cuda/compute/struct.py +272 -0
  1904. cuda/compute/typing.py +37 -0
  1905. cuda/coop/__init__.py +8 -0
  1906. cuda/coop/_caching.py +48 -0
  1907. cuda/coop/_common.py +275 -0
  1908. cuda/coop/_nvrtc.py +92 -0
  1909. cuda/coop/_scan_op.py +181 -0
  1910. cuda/coop/_types.py +937 -0
  1911. cuda/coop/_typing.py +107 -0
  1912. cuda/coop/block/__init__.py +39 -0
  1913. cuda/coop/block/_block_exchange.py +251 -0
  1914. cuda/coop/block/_block_load_store.py +215 -0
  1915. cuda/coop/block/_block_merge_sort.py +125 -0
  1916. cuda/coop/block/_block_radix_sort.py +214 -0
  1917. cuda/coop/block/_block_reduce.py +294 -0
  1918. cuda/coop/block/_block_scan.py +983 -0
  1919. cuda/coop/warp/__init__.py +9 -0
  1920. cuda/coop/warp/_warp_merge_sort.py +92 -0
  1921. cuda/coop/warp/_warp_reduce.py +153 -0
  1922. cuda/coop/warp/_warp_scan.py +78 -0
  1923. cuda_cccl-0.3.4.dist-info/METADATA +78 -0
  1924. cuda_cccl-0.3.4.dist-info/RECORD +1926 -0
  1925. cuda_cccl-0.3.4.dist-info/WHEEL +5 -0
  1926. cuda_cccl-0.3.4.dist-info/licenses/LICENSE +1 -0
@@ -0,0 +1,2287 @@
1
+ // SPDX-FileCopyrightText: Copyright (c) 2011, Duane Merrill. All rights reserved.
2
+ // SPDX-FileCopyrightText: Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
3
+ // SPDX-License-Identifier: BSD-3
4
+
5
+ //! @file
6
+ //! The cub::BlockScan class provides :ref:`collective <collective-primitives>` methods for computing a parallel prefix
7
+ //! sum/scan of items partitioned across a CUDA thread block.
8
+
9
+ #pragma once
10
+
11
+ #include <cub/config.cuh>
12
+
13
+ #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
14
+ # pragma GCC system_header
15
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
16
+ # pragma clang system_header
17
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
18
+ # pragma system_header
19
+ #endif // no system header
20
+
21
+ #include <cub/block/specializations/block_scan_raking.cuh>
22
+ #include <cub/block/specializations/block_scan_warp_scans.cuh>
23
+ #include <cub/util_ptx.cuh>
24
+ #include <cub/util_type.cuh>
25
+
26
+ #include <cuda/std/__functional/operations.h>
27
+ #include <cuda/std/__type_traits/conditional.h>
28
+
29
+ CUB_NAMESPACE_BEGIN
30
+
31
+ /******************************************************************************
32
+ * Algorithmic variants
33
+ ******************************************************************************/
34
+
35
+ //! @brief BlockScanAlgorithm enumerates alternative algorithms for cub::BlockScan to compute a
36
+ //! parallel prefix scan across a CUDA thread block.
37
+ enum BlockScanAlgorithm
38
+ {
39
+
40
+ //! @rst
41
+ //! Overview
42
+ //! ++++++++++++++++++++++++++
43
+ //!
44
+ //! An efficient "raking reduce-then-scan" prefix scan algorithm. Execution is comprised of five phases:
45
+ //!
46
+ //! #. Upsweep sequential reduction in registers (if threads contribute more than one input each).
47
+ //! Each thread then places the partial reduction of its item(s) into shared memory.
48
+ //! #. Upsweep sequential reduction in shared memory.
49
+ //! Threads within a single warp rake across segments of shared partial reductions.
50
+ //! #. A warp-synchronous Kogge-Stone style exclusive scan within the raking warp.
51
+ //! #. Downsweep sequential exclusive scan in shared memory.
52
+ //! Threads within a single warp rake across segments of shared partial reductions,
53
+ //! seeded with the warp-scan output.
54
+ //! #. Downsweep sequential scan in registers (if threads contribute more than one input),
55
+ //! seeded with the raking scan output.
56
+ //!
57
+ //! Performance Considerations
58
+ //! ++++++++++++++++++++++++++
59
+ //!
60
+ //! - Although this variant may suffer longer turnaround latencies when the
61
+ //! GPU is under-occupied, it can often provide higher overall throughput
62
+ //! across the GPU when suitably occupied.
63
+ //!
64
+ //! @endrst
65
+ BLOCK_SCAN_RAKING,
66
+
67
+ //! @rst
68
+ //! Overview
69
+ //! ++++++++++++++++++++++++++
70
+ //!
71
+ //! Similar to cub::BLOCK_SCAN_RAKING, but with fewer shared memory reads at the expense of higher
72
+ //! register pressure. Raking threads preserve their "upsweep" segment of values in registers while performing
73
+ //! warp-synchronous scan, allowing the "downsweep" not to re-read them from shared memory.
74
+ //!
75
+ //! @endrst
76
+ BLOCK_SCAN_RAKING_MEMOIZE,
77
+
78
+ //! @rst
79
+ //! Overview
80
+ //! ++++++++++++++++++++++++++
81
+ //!
82
+ //! A quick "tiled warpscans" prefix scan algorithm. Execution is comprised of four phases:
83
+ //! #. Upsweep sequential reduction in registers (if threads contribute more than one input each).
84
+ //! Each thread then places the partial reduction of its item(s) into shared memory.
85
+ //! #. Compute a shallow, but inefficient warp-synchronous Kogge-Stone style scan within each warp.
86
+ //! #. A propagation phase where the warp scan outputs in each warp are updated with the aggregate
87
+ //! from each preceding warp.
88
+ //! #. Downsweep sequential scan in registers (if threads contribute more than one input),
89
+ //! seeded with the warp-scan output.
90
+ //!
91
+ //! Performance Considerations
92
+ //! ++++++++++++++++++++++++++
93
+ //!
94
+ //! - Although this variant may suffer lower overall throughput across the
95
+ //! GPU due to a heavy reliance on inefficient warpscans, it can
96
+ //! often provide lower turnaround latencies when the GPU is under-occupied.
97
+ //!
98
+ //! @endrst
99
+ BLOCK_SCAN_WARP_SCANS,
100
+ };
101
+
102
+ //! @rst
103
+ //! The BlockScan class provides :ref:`collective <collective-primitives>` methods for computing a parallel prefix
104
+ //! sum/scan of items partitioned across a CUDA thread block.
105
+ //!
106
+ //! Overview
107
+ //! +++++++++++++++++++++++++++++++++++++++++++++
108
+ //!
109
+ //! - Given a list of input elements and a binary reduction operator, a
110
+ //! `prefix scan <http://en.wikipedia.org/wiki/Prefix_sum>`_ produces an output list where each element is computed
111
+ //! to be the reduction of the elements occurring earlier in the input list. *Prefix sum* connotes a prefix scan with
112
+ //! the addition operator. The term *inclusive* indicates that the *i*\ :sup:`th` output reduction incorporates
113
+ //! the *i*\ :sup:`th` input. The term *exclusive* indicates the *i*\ :sup:`th` input is not incorporated into
114
+ //! the *i*\ :sup:`th` output reduction.
115
+ //! - @rowmajor
116
+ //! - BlockScan can be optionally specialized by algorithm to accommodate different workload profiles:
117
+ //!
118
+ //! #. :cpp:enumerator:`cub::BLOCK_SCAN_RAKING`:
119
+ //! An efficient (high throughput) "raking reduce-then-scan" prefix scan algorithm.
120
+ //! #. :cpp:enumerator:`cub::BLOCK_SCAN_RAKING_MEMOIZE`:
121
+ //! Similar to cub::BLOCK_SCAN_RAKING, but having higher throughput at the expense of additional
122
+ //! register pressure for intermediate storage.
123
+ //! #. :cpp:enumerator:`cub::BLOCK_SCAN_WARP_SCANS`:
124
+ //! A quick (low latency) "tiled warpscans" prefix scan algorithm.
125
+ //!
126
+ //! Performance Considerations
127
+ //! +++++++++++++++++++++++++++++++++++++++++++++
128
+ //!
129
+ //! - @granularity
130
+ //! - Uses special instructions when applicable (e.g., warp ``SHFL``)
131
+ //! - Uses synchronization-free communication between warp lanes when applicable
132
+ //! - Invokes a minimal number of block-wide synchronization barriers (only
133
+ //! one or two depending on algorithm selection)
134
+ //! - Incurs zero bank conflicts for most types
135
+ //! - Computation is slightly more efficient (i.e., having lower instruction overhead) for:
136
+ //!
137
+ //! - Prefix sum variants (vs. generic scan)
138
+ //! - @blocksize
139
+ //!
140
+ //! - See cub::BlockScanAlgorithm for performance details regarding algorithmic alternatives
141
+ //!
142
+ //! A Simple Example
143
+ //! +++++++++++++++++++++++++++++++++++++++++++++
144
+ //!
145
+ //! @blockcollective{BlockScan}
146
+ //!
147
+ //! The code snippet below illustrates an exclusive prefix sum of 512 integer items that
148
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128 threads
149
+ //! where each thread owns 4 consecutive items.
150
+ //!
151
+ //! .. literalinclude:: ../../examples/block/example_block_scan.cu
152
+ //! :language: c++
153
+ //! :dedent:
154
+ //! :start-after: example-begin exclusive-sum-array
155
+ //! :end-before: example-end exclusive-sum-array
156
+ //!
157
+ //! Suppose the set of input ``thread_data`` across the block of threads is
158
+ //! ``{[1,1,1,1], [1,1,1,1], ..., [1,1,1,1]}``.
159
+ //! The corresponding output ``thread_data`` in those threads will be
160
+ //! ``{[0,1,2,3], [4,5,6,7], ..., [508,509,510,511]}``.
161
+ //!
162
+ //! Re-using dynamically allocated shared memory
163
+ //! +++++++++++++++++++++++++++++++++++++++++++++
164
+ //!
165
+ //! The ``block/example_block_reduce_dyn_smem.cu`` example illustrates use of dynamically allocated shared memory with
166
+ //! BlockReduce and how to re-purpose the same memory region.
167
+ //! This example can be easily adapted to the storage required by BlockScan.
168
+ //!
169
+ //! @endrst
170
+ //!
171
+ //! @tparam T
172
+ //! Data type being scanned
173
+ //!
174
+ //! @tparam BlockDimX
175
+ //! The thread block length in threads along the X dimension
176
+ //!
177
+ //! @tparam Algorithm
178
+ //! **[optional]** cub::BlockScanAlgorithm enumerator specifying the underlying algorithm to use
179
+ //! (default: cub::BLOCK_SCAN_RAKING)
180
+ //!
181
+ //! @tparam BlockDimY
182
+ //! **[optional]** The thread block length in threads along the Y dimension
183
+ //! (default: 1)
184
+ //!
185
+ //! @tparam BlockDimZ
186
+ //! **[optional]** The thread block length in threads along the Z dimension (default: 1)
187
+ //!
188
+ template <typename T, int BlockDimX, BlockScanAlgorithm Algorithm = BLOCK_SCAN_RAKING, int BlockDimY = 1, int BlockDimZ = 1>
189
+ class BlockScan
190
+ {
191
+ private:
192
+ /// Constants
193
+ enum
194
+ {
195
+ /// The thread block size in threads
196
+ BLOCK_THREADS = BlockDimX * BlockDimY * BlockDimZ,
197
+ };
198
+
199
+ /**
200
+ * Ensure the template parameterization meets the requirements of the
201
+ * specified algorithm. Currently, the BLOCK_SCAN_WARP_SCANS policy
202
+ * cannot be used with thread block sizes not a multiple of the
203
+ * architectural warp size.
204
+ */
205
+ static constexpr BlockScanAlgorithm SAFE_ALGORITHM =
206
+ ((Algorithm == BLOCK_SCAN_WARP_SCANS) && (BLOCK_THREADS % detail::warp_threads != 0)) // warp-scans requested, but the block size is not a multiple of the warp size
207
+ ? BLOCK_SCAN_RAKING // substitute the raking algorithm in that case
208
+ : Algorithm; // otherwise honor the requested algorithm unchanged
209
+
210
+ using WarpScans = detail::BlockScanWarpScans<T, BlockDimX, BlockDimY, BlockDimZ>;
211
+ using Raking =
212
+ detail::BlockScanRaking<T, BlockDimX, BlockDimY, BlockDimZ, (SAFE_ALGORITHM == BLOCK_SCAN_RAKING_MEMOIZE)>;
213
+
214
+ /// Define the delegate type for the desired algorithm
215
+ using InternalBlockScan = ::cuda::std::_If<SAFE_ALGORITHM == BLOCK_SCAN_WARP_SCANS, WarpScans, Raking>;
216
+
217
+ /// Shared memory storage layout type for BlockScan
218
+ using _TempStorage = typename InternalBlockScan::TempStorage;
219
+
220
+ /// Shared storage reference
221
+ _TempStorage& temp_storage;
222
+
223
+ /// Linear thread-id
224
+ unsigned int linear_tid;
225
+
226
+ /// Internal storage allocator
227
+ _CCCL_DEVICE _CCCL_FORCEINLINE _TempStorage& PrivateStorage()
228
+ {
229
+ __shared__ _TempStorage private_storage; // function-local __shared__ instance used when the caller supplies no storage
230
+ return private_storage; // returned by reference; the default constructor binds `temp_storage` to it
231
+ }
232
+
233
+ public:
234
+ /// @smemstorage{BlockScan}
235
+ struct TempStorage : Uninitialized<_TempStorage> // public wrapper over the layout of the selected InternalBlockScan delegate
236
+ {};
237
+
238
+ //! @name Collective constructors
239
+ //! @{
240
+
241
+ //! @brief Collective constructor using a private static allocation of shared memory as temporary storage.
242
+ _CCCL_DEVICE _CCCL_FORCEINLINE BlockScan()
243
+ : temp_storage(PrivateStorage()) // bind to the __shared__ storage declared inside PrivateStorage()
244
+ , linear_tid(RowMajorTid(BlockDimX, BlockDimY, BlockDimZ)) // linear thread id obtained from RowMajorTid for the templated block dimensions
245
+ {}
246
+
247
+ /**
248
+ * @brief Collective constructor using the specified memory allocation as temporary storage.
249
+ *
250
+ * @param[in] temp_storage
251
+ * Reference to memory allocation having layout type TempStorage
252
+ */
253
+ _CCCL_DEVICE _CCCL_FORCEINLINE BlockScan(TempStorage& temp_storage)
254
+ : temp_storage(temp_storage.Alias()) // the parameter shadows the member here; Alias() supplies the _TempStorage reference the member binds to
255
+ , linear_tid(RowMajorTid(BlockDimX, BlockDimY, BlockDimZ)) // linear thread id obtained from RowMajorTid for the templated block dimensions
256
+ {}
257
+
258
+ //! @} end member group
259
+ //! @name Exclusive prefix sum operations
260
+ //! @{
261
+
262
+ //! @rst
263
+ //! Computes an exclusive block-wide prefix scan using addition (+) as the scan operator.
264
+ //! Each thread contributes one input element. The value of 0 is applied as the initial value, and is assigned
265
+ //! to ``output`` in *thread*\ :sub:`0`.
266
+ //!
267
+ //! - @identityzero
268
+ //! - @rowmajor
269
+ //! - @smemreuse
270
+ //!
271
+ //! Snippet
272
+ //! +++++++
273
+ //!
274
+ //! The code snippet below illustrates an exclusive prefix sum of 128 integer items that
275
+ //! are partitioned across 128 threads.
276
+ //!
277
+ //! .. literalinclude:: ../../examples/block/example_block_scan.cu
278
+ //! :language: c++
279
+ //! :dedent:
280
+ //! :start-after: example-begin exclusive-sum-single
281
+ //! :end-before: example-end exclusive-sum-single
282
+ //!
283
+ //! Suppose the set of input ``thread_data`` across the block of threads is ``1, 1, ..., 1``.
284
+ //! The corresponding output ``thread_data`` in those threads will be ``0, 1, ..., 127``.
285
+ //!
286
+ //! @endrst
287
+ //!
288
+ //! @param[in] input
289
+ //! Calling thread's input item
290
+ //!
291
+ //! @param[out] output
292
+ //! Calling thread's output item (may be aliased to `input`)
293
+ _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveSum(T input, T& output)
294
+ {
295
+ T initial_value{}; // value-initialized T: the documented "0" initial value for the sum
296
+
297
+ ExclusiveScan(input, output, initial_value, ::cuda::std::plus<>{}); // forward to ExclusiveScan with addition as the scan operator
298
+ }
299
+
300
+ //! @rst
301
+ //! Computes an exclusive block-wide prefix scan using addition (+) as the scan operator.
302
+ //! Each thread contributes one input element.
303
+ //! The value of 0 is applied as the initial value, and is assigned to ``output`` in *thread*\ :sub:`0`.
304
+ //! Also provides every thread with the block-wide ``block_aggregate`` of all inputs.
305
+ //!
306
+ //! - @identityzero
307
+ //! - @rowmajor
308
+ //! - @smemreuse
309
+ //!
310
+ //! Snippet
311
+ //! +++++++
312
+ //!
313
+ //! The code snippet below illustrates an exclusive prefix sum of 128 integer items that
314
+ //! are partitioned across 128 threads.
315
+ //!
316
+ //! .. literalinclude:: ../../examples/block/example_block_scan.cu
317
+ //! :language: c++
318
+ //! :dedent:
319
+ //! :start-after: example-begin exclusive-sum-aggregate
320
+ //! :end-before: example-end exclusive-sum-aggregate
321
+ //!
322
+ //! Suppose the set of input ``thread_data`` across the block of threads is ``1, 1, ..., 1``.
323
+ //! The corresponding output ``thread_data`` in those threads will be ``0, 1, ..., 127``.
324
+ //! Furthermore the value ``128`` will be stored in ``block_aggregate`` for all threads.
325
+ //!
326
+ //! @endrst
327
+ //!
328
+ //! @param[in] input
329
+ //! Calling thread's input item
330
+ //!
331
+ //! @param[out] output
332
+ //! Calling thread's output item (may be aliased to `input`)
333
+ //!
334
+ //! @param[out] block_aggregate
335
+ //! block-wide aggregate reduction of input items
336
+ _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveSum(T input, T& output, T& block_aggregate)
337
+ {
338
+ T initial_value{}; // value-initialized T: the documented "0" initial value for the sum
339
+
340
+ ExclusiveScan(input, output, initial_value, ::cuda::std::plus<>{}, block_aggregate); // same forwarding as the two-argument overload, additionally passing block_aggregate through
341
+ }
342
+
343
+ //! @rst
344
+ //! Computes an exclusive block-wide prefix scan using addition (+) as the scan operator.
345
+ //! Each thread contributes one input element. Instead of using 0 as the block-wide prefix, the call-back functor
346
+ //! ``block_prefix_callback_op`` is invoked by the first warp in the block, and the value returned by
347
+ //! *lane*\ :sub:`0` in that warp is used as the "seed" value that logically prefixes the thread block's
348
+ //! scan inputs.
349
+ //!
350
+ //! - @identityzero
351
+ //! - The ``block_prefix_callback_op`` functor must implement a member function
352
+ //! ``T operator()(T block_aggregate)``. The functor will be invoked by the first warp of threads in the block,
353
+ //! however only the return value from *lane*\ :sub:`0` is applied as the block-wide prefix. Can be stateful.
354
+ //! - @rowmajor
355
+ //! - @smemreuse
356
+ //!
357
+ //! Snippet
358
+ //! +++++++
359
+ //!
360
+ //! The code snippet below illustrates a single thread block that progressively
361
+ //! computes an exclusive prefix sum over multiple "tiles" of input using a
362
+ //! prefix functor to maintain a running total between block-wide scans. Each tile consists
363
+ //! of 128 integer items that are partitioned across 128 threads.
364
+ //!
365
+ //! .. code-block:: c++
366
+ //!
367
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
368
+ //!
369
+ //! // A stateful callback functor that maintains a running prefix to be applied
370
+ //! // during consecutive scan operations.
371
+ //! struct BlockPrefixCallbackOp
372
+ //! {
373
+ //! // Running prefix
374
+ //! int running_total;
375
+ //!
376
+ //! // Constructor
377
+ //! __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {}
378
+ //!
379
+ //! // Callback operator to be entered by the first warp of threads in the block.
380
+ //! // Thread-0 is responsible for returning a value for seeding the block-wide scan.
381
+ //! __device__ int operator()(int block_aggregate)
382
+ //! {
383
+ //! int old_prefix = running_total;
384
+ //! running_total += block_aggregate;
385
+ //! return old_prefix;
386
+ //! }
387
+ //! };
388
+ //!
389
+ //! __global__ void ExampleKernel(int *d_data, int num_items, ...)
390
+ //! {
391
+ //! // Specialize BlockScan for a 1D block of 128 threads
392
+ //! using BlockScan = cub::BlockScan<int, 128>;
393
+ //!
394
+ //! // Allocate shared memory for BlockScan
395
+ //! __shared__ typename BlockScan::TempStorage temp_storage;
396
+ //!
397
+ //! // Initialize running total
398
+ //! BlockPrefixCallbackOp prefix_op(0);
399
+ //!
400
+ //! // Have the block iterate over segments of items
401
+ //! for (int block_offset = 0; block_offset < num_items; block_offset += 128)
402
+ //! {
403
+ //! // Load a segment of consecutive items that are blocked across threads
404
+ //! int thread_data = d_data[block_offset + threadIdx.x];
405
+ //!
406
+ //! // Collectively compute the block-wide exclusive prefix sum
407
+ //! BlockScan(temp_storage).ExclusiveSum(
408
+ //! thread_data, thread_data, prefix_op);
409
+ //! __syncthreads();
410
+ //!
411
+ //! // Store scanned items to output segment
412
+ //! d_data[block_offset + threadIdx.x] = thread_data;
413
+ //! }
414
+ //! }
415
+ //!
416
+ //! Suppose the input ``d_data`` is ``1, 1, 1, 1, 1, 1, 1, 1, ...``.
417
+ //! The corresponding output for the first segment will be ``0, 1, ..., 127``.
418
+ //! The output for the second segment will be ``128, 129, ..., 255``.
419
+ //!
420
+ //! @endrst
421
+ //!
422
+ //! @tparam BlockPrefixCallbackOp
423
+ //! **[inferred]** Call-back functor type having member `T operator()(T block_aggregate)`
424
+ //!
425
+ //! @param[in] input
426
+ //! Calling thread's input item
427
+ //!
428
+ //! @param[out] output
429
+ //! Calling thread's output item (may be aliased to `input`)
430
+ //!
431
+ //! @param[in,out] block_prefix_callback_op
432
+ //! @rst
433
+ //! *warp*\ :sub:`0` only call-back functor for specifying a block-wide prefix to be applied to
434
+ //! the logical input sequence.
435
+ //! @endrst
436
template <typename BlockPrefixCallbackOp>
_CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveSum(T input, T& output, BlockPrefixCallbackOp& block_prefix_callback_op)
{
  // No local seed is created here: the caller's callback functor is passed
  // through by reference, together with the addition operator, to the
  // callback-taking scalar ExclusiveScan overload.
  using SumOp = ::cuda::std::plus<>;
  ExclusiveScan(input, output, SumOp{}, block_prefix_callback_op);
}
441
+
442
+ //! @} end member group
443
+ //! @name Exclusive prefix sum operations (multiple data per thread)
444
+ //! @{
445
+
446
+ //! @rst
447
+ //! Computes an exclusive block-wide prefix scan using addition (+) as the scan operator.
448
+ //! Each thread contributes an array of consecutive input elements.
449
+ //! The value of 0 is applied as the initial value, and is assigned to ``output[0]`` in *thread*\ :sub:`0`.
450
+ //!
451
+ //! - @identityzero
452
+ //! - @blocked
453
+ //! - @granularity
454
+ //! - @smemreuse
455
+ //!
456
+ //! Snippet
457
+ //! +++++++
458
+ //!
459
+ //! The code snippet below illustrates an exclusive prefix sum of 512 integer items that
460
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128 threads
461
+ //! where each thread owns 4 consecutive items.
462
+ //!
463
+ //! .. code-block:: c++
464
+ //!
465
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
466
+ //!
467
+ //! __global__ void ExampleKernel(...)
468
+ //! {
469
+ //! // Specialize BlockScan for a 1D block of 128 threads of type int
470
+ //! using BlockScan = cub::BlockScan<int, 128>;
471
+ //!
472
+ //! // Allocate shared memory for BlockScan
473
+ //! __shared__ typename BlockScan::TempStorage temp_storage;
474
+ //!
475
+ //! // Obtain a segment of consecutive items that are blocked across threads
476
+ //! int thread_data[4];
477
+ //! ...
478
+ //!
479
+ //! // Collectively compute the block-wide exclusive prefix sum
480
+ //! BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data);
481
+ //! }
482
+ //!
483
+ //! Suppose the set of input ``thread_data`` across the block of threads is
484
+ //! ``{ [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }``.
485
+ //! The corresponding output ``thread_data`` in those threads will be
486
+ //! ``{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }``.
487
+ //!
488
+ //! @endrst
489
+ //!
490
+ //! @tparam ITEMS_PER_THREAD
491
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
492
+ //!
493
+ //! @param[in] input
494
+ //! Calling thread's input items
495
+ //!
496
+ //! @param[out] output
497
+ //! Calling thread's output items (may be aliased to `input`)
498
template <int ITEMS_PER_THREAD>
_CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveSum(T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD])
{
  // Array flavour of the prefix sum: a value-initialized T is the seed and
  // addition is the operator; the array ExclusiveScan overload does the work.
  using SumOp = ::cuda::std::plus<>;
  T seed{};
  ExclusiveScan(input, output, seed, SumOp{});
}
505
+
506
+ //! @rst
507
+ //! Computes an exclusive block-wide prefix scan using addition (+) as the scan operator.
508
+ //! Each thread contributes an array of consecutive input elements.
509
+ //! The value of 0 is applied as the initial value, and is assigned to ``output[0]`` in *thread*\ :sub:`0`.
510
+ //! Also provides every thread with the block-wide ``block_aggregate`` of all inputs.
511
+ //!
512
+ //! - @identityzero
513
+ //! - @blocked
514
+ //! - @granularity
515
+ //! - @smemreuse
516
+ //!
517
+ //! Snippet
518
+ //! +++++++
519
+ //!
520
+ //! The code snippet below illustrates an exclusive prefix sum of 512 integer items that are partitioned in
521
+ //! a :ref:`blocked arrangement <flexible-data-arrangement>` across 128 threads where each thread owns
522
+ //! 4 consecutive items.
523
+ //!
524
+ //! .. code-block:: c++
525
+ //!
526
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
527
+ //!
528
+ //! __global__ void ExampleKernel(...)
529
+ //! {
530
+ //! // Specialize BlockScan for a 1D block of 128 threads of type int
531
+ //! using BlockScan = cub::BlockScan<int, 128>;
532
+ //!
533
+ //! // Allocate shared memory for BlockScan
534
+ //! __shared__ typename BlockScan::TempStorage temp_storage;
535
+ //!
536
+ //! // Obtain a segment of consecutive items that are blocked across threads
537
+ //! int thread_data[4];
538
+ //! ...
539
+ //!
540
+ //! // Collectively compute the block-wide exclusive prefix sum
541
+ //! int block_aggregate;
542
+ //! BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data, block_aggregate);
543
+ //! }
544
+ //!
545
+ //! Suppose the set of input ``thread_data`` across the block of threads is
546
+ //! ``{ [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }``.
547
+ //! The corresponding output ``thread_data`` in those threads will be
548
+ //! ``{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }``.
549
+ //! Furthermore the value ``512`` will be stored in ``block_aggregate`` for all threads.
550
+ //!
551
+ //! @endrst
552
+ //!
553
+ //! @tparam ITEMS_PER_THREAD
554
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
555
+ //!
556
+ //! @param[in] input
557
+ //! Calling thread's input items
558
+ //!
559
+ //! @param[out] output
560
+ //! Calling thread's output items (may be aliased to `input`)
561
+ //!
562
+ //! @param[out] block_aggregate
563
+ //! block-wide aggregate reduction of input items
564
template <int ITEMS_PER_THREAD>
_CCCL_DEVICE _CCCL_FORCEINLINE void
ExclusiveSum(T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], T& block_aggregate)
{
  // Pure forwarding: nothing is reduced in this function itself. The array
  // ExclusiveScan overload receives a value-initialized seed, the addition
  // operator, and the caller's block_aggregate reference.
  using SumOp = ::cuda::std::plus<>;
  T seed{};
  ExclusiveScan(input, output, seed, SumOp{}, block_aggregate);
}
573
+
574
+ //! @rst
575
+ //! Computes an exclusive block-wide prefix scan using addition (+) as the scan operator.
576
+ //! Each thread contributes an array of consecutive input elements.
577
+ //! Instead of using 0 as the block-wide prefix, the call-back functor ``block_prefix_callback_op`` is invoked by
578
+ //! the first warp in the block, and the value returned by *lane*\ :sub:`0` in that warp is used as the "seed"
579
+ //! value that logically prefixes the thread block's scan inputs.
580
+ //!
581
+ //! - @identityzero
582
+ //! - The ``block_prefix_callback_op`` functor must implement a member function ``T operator()(T block_aggregate)``.
583
+ //! The functor will be invoked by the first warp of threads in the block, however only the return value from
584
+ //! *lane*\ :sub:`0` is applied as the block-wide prefix. Can be stateful.
585
+ //! - @blocked
586
+ //! - @granularity
587
+ //! - @smemreuse
588
+ //!
589
+ //!
590
+ //! Snippet
591
+ //! +++++++
592
+ //!
593
+ //! The code snippet below illustrates a single thread block that progressively
594
+ //! computes an exclusive prefix sum over multiple "tiles" of input using a
595
+ //! prefix functor to maintain a running total between block-wide scans. Each tile consists
596
+ //! of 512 integer items that are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>`
597
+ //! across 128 threads where each thread owns 4 consecutive items.
598
+ //!
599
+ //! .. literalinclude:: ../../examples/block/example_block_scan.cu
600
+ //! :language: c++
601
+ //! :dedent:
602
+ //! :start-after: example-begin block-prefix-callback-op
603
+ //! :end-before: example-end block-prefix-callback-op
604
+ //!
605
+ //! .. literalinclude:: ../../examples/block/example_block_scan.cu
606
+ //! :language: c++
607
+ //! :dedent:
608
+ //! :start-after: example-begin exclusive-sum-prefix-callback
609
+ //! :end-before: example-end exclusive-sum-prefix-callback
610
+ //!
611
+ //! Suppose the input ``d_data`` is ``1, 1, 1, 1, 1, 1, 1, 1, ...``.
612
+ //! The corresponding output for the first segment will be ``0, 1, 2, 3, ..., 510, 511``.
613
+ //! The output for the second segment will be ``512, 513, 514, 515, ..., 1022, 1023``.
614
+ //!
615
+ //! @endrst
616
+ //!
617
+ //! @tparam ITEMS_PER_THREAD
618
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
619
+ //!
620
+ //! @tparam BlockPrefixCallbackOp
621
+ //! **[inferred]** Call-back functor type having member
622
+ //! `T operator()(T block_aggregate)`
623
+ //!
624
+ //! @param[in] input
625
+ //! Calling thread's input items
626
+ //!
627
+ //! @param[out] output
628
+ //! Calling thread's output items (may be aliased to `input`)
629
+ //!
630
+ //! @param[in,out] block_prefix_callback_op
631
+ //! @rst
632
+ //! *warp*\ :sub:`0` only call-back functor for specifying a block-wide prefix to be applied to
633
+ //! the logical input sequence.
634
+ //! @endrst
635
template <int ITEMS_PER_THREAD, typename BlockPrefixCallbackOp>
_CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveSum(
  T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], BlockPrefixCallbackOp& block_prefix_callback_op)
{
  // Array flavour with a prefix callback: the functor is forwarded by reference,
  // alongside the addition operator, to the callback-taking array ExclusiveScan.
  using SumOp = ::cuda::std::plus<>;
  ExclusiveScan(input, output, SumOp{}, block_prefix_callback_op);
}
641
+
642
+ //! @} end member group // Exclusive prefix sums (multiple data per thread)
643
+ //! @name Exclusive prefix scan operations
644
+ //! @{
645
+
646
+ //! @rst
647
+ //! Computes an exclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
648
+ //! Each thread contributes one input element.
649
+ //!
650
+ //! - Supports non-commutative scan operators.
651
+ //! - @rowmajor
652
+ //! - @smemreuse
653
+ //!
654
+ //! Snippet
655
+ //! +++++++
656
+ //!
657
+ //! The code snippet below illustrates an exclusive prefix max scan of 128 integer items that
658
+ //! are partitioned across 128 threads.
659
+ //!
660
+ //! .. code-block:: c++
661
+ //!
662
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
663
+ //!
664
+ //! __global__ void ExampleKernel(...)
665
+ //! {
666
+ //! // Specialize BlockScan for a 1D block of 128 threads of type int
667
+ //! using BlockScan = cub::BlockScan<int, 128>;
668
+ //!
669
+ //! // Allocate shared memory for BlockScan
670
+ //! __shared__ typename BlockScan::TempStorage temp_storage;
671
+ //!
672
+ //! // Obtain input item for each thread
673
+ //! int thread_data;
674
+ //! ...
675
+ //!
676
+ //! // Collectively compute the block-wide exclusive prefix max scan
677
+ //! BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cuda::maximum<>{});
678
+ //! }
679
+ //!
680
+ //! Suppose the set of input ``thread_data`` across the block of threads is ``0, -1, 2, -3, ..., 126, -127``.
681
+ //! The corresponding output ``thread_data`` in those threads will be ``INT_MIN, 0, 0, 2, ..., 124, 126``.
682
+ //!
683
+ //! @endrst
684
+ //!
685
+ //! @tparam ScanOp
686
+ //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
687
+ //!
688
+ //! @param[in] input
689
+ //! Calling thread's input item
690
+ //!
691
+ //! @param[out] output
692
+ //! Calling thread's output item (may be aliased to `input`)
693
+ //!
694
+ //! @param[in] initial_value
695
+ //! @rst
696
+ //! Initial value to seed the exclusive scan (and is assigned to ``output`` in *thread*\ :sub:`0`)
697
+ //! @endrst
698
+ //!
699
+ //! @param[in] scan_op
700
+ //! Binary scan functor
701
template <typename ScanOp>
_CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveScan(T input, T& output, T initial_value, ScanOp scan_op)
{
  // Build the internal scan implementation over this object's temp_storage,
  // then delegate the seeded scalar exclusive scan to it.
  InternalBlockScan internal_scan(temp_storage);
  internal_scan.ExclusiveScan(input, output, initial_value, scan_op);
}
706
+
707
+ //! @rst
708
+ //! Computes an exclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
709
+ //! Each thread contributes one input element.
710
+ //! Also provides every thread with the block-wide ``block_aggregate`` of all inputs.
711
+ //!
712
+ //! - Supports non-commutative scan operators.
713
+ //! - @rowmajor
714
+ //! - @smemreuse
715
+ //!
716
+ //! Snippet
717
+ //! +++++++
718
+ //!
719
+ //! The code snippet below illustrates an exclusive prefix max scan of 128 integer items that
720
+ //! are partitioned across 128 threads.
721
+ //!
722
+ //! .. literalinclude:: ../../examples/block/example_block_scan.cu
723
+ //! :language: c++
724
+ //! :dedent:
725
+ //! :start-after: example-begin exclusive-scan-aggregate
726
+ //! :end-before: example-end exclusive-scan-aggregate
727
+ //!
728
+ //! Suppose the set of input ``thread_data`` across the block of threads is ``0, -1, 2, -3, ..., 126, -127``.
729
+ //! The corresponding output ``thread_data`` in those threads will be ``INT_MIN, 0, 0, 2, ..., 124, 126``.
730
+ //! Furthermore the value ``126`` will be stored in ``block_aggregate`` for all threads.
731
+ //!
732
+ //! .. note::
733
+ //!
734
+ //! ``initial_value`` is not applied to the block-wide aggregate.
735
+ //!
736
+ //! @endrst
737
+ //!
738
+ //! @tparam ScanOp
739
+ //! **[inferred]** Binary scan functor type having member ``T operator()(const T &a, const T &b)``
740
+ //!
741
+ //! @param[in] input
742
+ //! Calling thread's input item
743
+ //!
744
+ //! @param[out] output
745
+ //! Calling thread's output item (may be aliased to ``input``)
746
+ //!
747
+ //! @param[in] initial_value
748
+ //! @rst
749
+ //! Initial value to seed the exclusive scan (and is assigned to ``output`` in *thread*\ :sub:`0`). It is not
750
+ //! taken into account for ``block_aggregate``.
751
+ //!
752
+ //! @endrst
753
+ //!
754
+ //! @param[in] scan_op
755
+ //! Binary scan functor
756
+ //!
757
+ //! @param[out] block_aggregate
758
+ //! block-wide aggregate reduction of input items
759
template <typename ScanOp>
_CCCL_DEVICE _CCCL_FORCEINLINE void
ExclusiveScan(T input, T& output, T initial_value, ScanOp scan_op, T& block_aggregate)
{
  // Build the internal scan implementation over this object's temp_storage and
  // delegate; block_aggregate is forwarded by reference for it to fill in.
  InternalBlockScan internal_scan(temp_storage);
  internal_scan.ExclusiveScan(input, output, initial_value, scan_op, block_aggregate);
}
765
+
766
+ //! @rst
767
+ //! Computes an exclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
768
+ //! Each thread contributes one input element. The call-back functor ``block_prefix_callback_op`` is invoked by
769
+ //! the first warp in the block, and the value returned by *lane*\ :sub:`0` in that warp is used as
770
+ //! the "seed" value that logically prefixes the thread block's scan inputs.
771
+ //!
772
+ //! - The ``block_prefix_callback_op`` functor must implement a member function ``T operator()(T block_aggregate)``.
773
+ //! The functor will be invoked by the first warp of threads in the block, however only the return value from
774
+ //! *lane*\ :sub:`0` is applied as the block-wide prefix. Can be stateful.
775
+ //! - Supports non-commutative scan operators.
776
+ //! - @rowmajor
777
+ //! - @smemreuse
778
+ //!
779
+ //! Snippet
780
+ //! +++++++
781
+ //!
782
+ //! The code snippet below illustrates a single thread block that progressively
783
+ //! computes an exclusive prefix max scan over multiple "tiles" of input using a
784
+ //! prefix functor to maintain a running total between block-wide scans.
785
+ //! Each tile consists of 128 integer items that are partitioned across 128 threads.
786
+ //!
787
+ //! .. code-block:: c++
788
+ //!
789
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
790
+ //!
791
+ //! // A stateful callback functor that maintains a running prefix to be applied
792
+ //! // during consecutive scan operations.
793
+ //! struct BlockPrefixCallbackOp
794
+ //! {
795
+ //! // Running prefix
796
+ //! int running_total;
797
+ //!
798
+ //! // Constructor
799
+ //! __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {}
800
+ //!
801
+ //! // Callback operator to be entered by the first warp of threads in the block.
802
+ //! // Thread-0 is responsible for returning a value for seeding the block-wide scan.
803
+ //! __device__ int operator()(int block_aggregate)
804
+ //! {
805
+ //! int old_prefix = running_total;
806
+ //! running_total = (block_aggregate > old_prefix) ? block_aggregate : old_prefix;
807
+ //! return old_prefix;
808
+ //! }
809
+ //! };
810
+ //!
811
+ //! __global__ void ExampleKernel(int *d_data, int num_items, ...)
812
+ //! {
813
+ //! // Specialize BlockScan for a 1D block of 128 threads
814
+ //! using BlockScan = cub::BlockScan<int, 128>;
815
+ //!
816
+ //! // Allocate shared memory for BlockScan
817
+ //! __shared__ typename BlockScan::TempStorage temp_storage;
818
+ //!
819
+ //! // Initialize running total
820
+ //! BlockPrefixCallbackOp prefix_op(INT_MIN);
821
+ //!
822
+ //! // Have the block iterate over segments of items
823
+ //! for (int block_offset = 0; block_offset < num_items; block_offset += 128)
824
+ //! {
825
+ //! // Load a segment of consecutive items that are blocked across threads
826
+ //! int thread_data = d_data[block_offset + threadIdx.x];
827
+ //!
828
+ //! // Collectively compute the block-wide exclusive prefix max scan
829
+ //! BlockScan(temp_storage).ExclusiveScan(
830
+ //! thread_data, thread_data, cuda::maximum<>{}, prefix_op);
831
+ //! __syncthreads();
832
+ //!
833
+ //! // Store scanned items to output segment
834
+ //! d_data[block_offset + threadIdx.x] = thread_data;
835
+ //! }
836
+ //! }
837
+ //!
838
+ //! Suppose the input ``d_data`` is ``0, -1, 2, -3, 4, -5, ...``.
839
+ //! The corresponding output for the first segment will be ``INT_MIN, 0, 0, 2, ..., 124, 126``.
840
+ //! The output for the second segment will be ``126, 128, 128, 130, ..., 252, 254``.
841
+ //!
842
+ //! @endrst
843
+ //!
844
+ //! @tparam ScanOp
845
+ //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
846
+ //!
847
+ //! @tparam BlockPrefixCallbackOp
848
+ //! **[inferred]** Call-back functor type having member `T operator()(T block_aggregate)`
849
+ //!
850
+ //! @param[in] input
851
+ //! Calling thread's input item
852
+ //!
853
+ //! @param[out] output
854
+ //! Calling thread's output item (may be aliased to `input`)
855
+ //!
856
+ //! @param[in] scan_op
857
+ //! Binary scan functor
858
+ //!
859
+ //! @param[in,out] block_prefix_callback_op
860
+ //! @rst
861
+ //! *warp*\ :sub:`0` only call-back functor for specifying a block-wide prefix to be applied to
862
+ //! the logical input sequence.
863
+ //! @endrst
864
template <typename ScanOp, typename BlockPrefixCallbackOp>
_CCCL_DEVICE _CCCL_FORCEINLINE void
ExclusiveScan(T input, T& output, ScanOp scan_op, BlockPrefixCallbackOp& block_prefix_callback_op)
{
  // This overload takes no initial value: the caller's callback functor is
  // forwarded by reference to the internal scan implementation instead.
  InternalBlockScan internal_scan(temp_storage);
  internal_scan.ExclusiveScan(input, output, scan_op, block_prefix_callback_op);
}
870
+
871
+ //! @} end member group // Exclusive prefix scans (single datum per thread)
872
+ //! @name Exclusive prefix scan operations (multiple data per thread)
873
+ //! @{
874
+
875
+ //! @rst
876
+ //! Computes an exclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
877
+ //! Each thread contributes an array of consecutive input elements.
878
+ //!
879
+ //! - Supports non-commutative scan operators.
880
+ //! - @blocked
881
+ //! - @granularity
882
+ //! - @smemreuse
883
+ //!
884
+ //! Snippet
885
+ //! +++++++
886
+ //!
887
+ //! The code snippet below illustrates an exclusive prefix max scan of 512 integer
888
+ //! items that are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>`
889
+ //! across 128 threads where each thread owns 4 consecutive items.
890
+ //!
891
+ //! .. literalinclude:: ../../examples/block/example_block_scan.cu
892
+ //! :language: c++
893
+ //! :dedent:
894
+ //! :start-after: example-begin exclusive-scan-array
895
+ //! :end-before: example-end exclusive-scan-array
896
+ //!
897
+ //! Suppose the set of input ``thread_data`` across the block of threads is
898
+ //! ``{ [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }``.
899
+ //! The corresponding output ``thread_data`` in those threads will be
900
+ //! ``{ [INT_MIN,0,0,2], [2,4,4,6], ..., [506,508,508,510] }``.
901
+ //!
902
+ //! @endrst
903
+ //!
904
+ //! @tparam ITEMS_PER_THREAD
905
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
906
+ //!
907
+ //! @tparam ScanOp
908
+ //! **[inferred]** Binary scan functor type having member
909
+ //! `T operator()(const T &a, const T &b)`
910
+ //!
911
+ //! @param[in] input
912
+ //! Calling thread's input items
913
+ //!
914
+ //! @param[out] output
915
+ //! Calling thread's output items (may be aliased to `input`)
916
+ //!
917
+ //! @param[in] initial_value
918
+ //! @rst
919
+ //! Initial value to seed the exclusive scan (and is assigned to `output[0]` in *thread*\ :sub:`0`)
920
+ //! @endrst
921
+ //!
922
+ //! @param[in] scan_op
923
+ //! Binary scan functor
924
template <int ITEMS_PER_THREAD, typename ScanOp>
_CCCL_DEVICE _CCCL_FORCEINLINE void
ExclusiveScan(T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], T initial_value, ScanOp scan_op)
{
  // Step 1: collapse this thread's items into a single partial using scan_op.
  T running = cub::ThreadReduce(input, scan_op);

  // Step 2: scalar block-wide exclusive scan over the per-thread partials.
  // It runs in place, so afterwards `running` holds this thread's prefix.
  ExclusiveScan(running, running, initial_value, scan_op);

  // Step 3: exclusive scan over this thread's own items, seeded with that prefix.
  detail::ThreadScanExclusive(input, output, scan_op, running);
}
937
+
938
+ //! @rst
939
+ //! Computes an exclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
940
+ //! Each thread contributes an array of consecutive input elements.
941
+ //! Also provides every thread with the block-wide ``block_aggregate`` of all inputs.
942
+ //!
943
+ //! - Supports non-commutative scan operators.
944
+ //! - @blocked
945
+ //! - @granularity
946
+ //! - @smemreuse
947
+ //!
948
+ //! Snippet
949
+ //! +++++++
950
+ //!
951
+ //! The code snippet below illustrates an exclusive prefix max scan of 512 integer items that are partitioned in
952
+ //! a :ref:`blocked arrangement <flexible-data-arrangement>` across 128 threads where each thread owns
953
+ //! 4 consecutive items.
954
+ //!
955
+ //! .. code-block:: c++
956
+ //!
957
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
958
+ //!
959
+ //! __global__ void ExampleKernel(...)
960
+ //! {
961
+ //! // Specialize BlockScan for a 1D block of 128 threads of type int
962
+ //! using BlockScan = cub::BlockScan<int, 128>;
963
+ //!
964
+ //! // Allocate shared memory for BlockScan
965
+ //! __shared__ typename BlockScan::TempStorage temp_storage;
966
+ //!
967
+ //! // Obtain a segment of consecutive items that are blocked across threads
968
+ //! int thread_data[4];
969
+ //! ...
970
+ //!
971
+ //! // Collectively compute the block-wide exclusive prefix max scan
972
+ //! int block_aggregate;
973
+ //! BlockScan(temp_storage).ExclusiveScan(
974
+ //! thread_data, thread_data, INT_MIN, cuda::maximum<>{}, block_aggregate);
+ //! }
975
+ //!
976
+ //! Suppose the set of input ``thread_data`` across the block of threads is
977
+ //! ``{ [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }``.
978
+ //! The corresponding output ``thread_data`` in those threads will be
979
+ //! ``{ [INT_MIN,0,0,2], [2,4,4,6], ..., [506,508,508,510] }``.
980
+ //! Furthermore the value ``510`` will be stored in ``block_aggregate`` for all threads.
981
+ //!
982
+ //! .. note::
983
+ //!
984
+ //! ``initial_value`` is not applied to the block-wide aggregate.
985
+ //!
986
+ //! @endrst
987
+ //!
988
+ //! @tparam ITEMS_PER_THREAD
989
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
990
+ //!
991
+ //! @tparam ScanOp
992
+ //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
993
+ //!
994
+ //! @param[in] input
995
+ //! Calling thread's input items
996
+ //!
997
+ //! @param[out] output
998
+ //! Calling thread's output items (may be aliased to `input`)
999
+ //!
1000
+ //! @param[in] initial_value
1001
+ //! @rst
1002
+ //! Initial value to seed the exclusive scan (and is assigned to `output[0]` in *thread*\ :sub:`0`). It is not taken
1003
+ //! into account for ``block_aggregate``.
1004
+ //! @endrst
1005
+ //!
1006
+ //! @param[in] scan_op
1007
+ //! Binary scan functor
1008
+ //!
1009
+ //! @param[out] block_aggregate
1010
+ //! block-wide aggregate reduction of input items
1011
template <int ITEMS_PER_THREAD, typename ScanOp>
_CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveScan(
  T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], T initial_value, ScanOp scan_op, T& block_aggregate)
{
  // Step 1: collapse this thread's items into a single partial using scan_op.
  T running = cub::ThreadReduce(input, scan_op);

  // Step 2: scalar block-wide exclusive scan over the per-thread partials, in
  // place; block_aggregate is forwarded by reference to that scalar overload.
  ExclusiveScan(running, running, initial_value, scan_op, block_aggregate);

  // Step 3: exclusive scan over this thread's own items, seeded with the
  // prefix now stored in `running`.
  detail::ThreadScanExclusive(input, output, scan_op, running);
}
1024
+
1025
+ //! @rst
1026
+ //! Computes an exclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
1027
+ //! Each thread contributes an array of consecutive input elements.
1028
+ //! The call-back functor ``block_prefix_callback_op`` is invoked by the first warp in the block, and the value
1029
+ //! returned by *lane*\ :sub:`0` in that warp is used as the "seed" value that logically prefixes the thread
1030
+ //! block's scan inputs.
1031
+ //!
1032
+ //! - The ``block_prefix_callback_op`` functor must implement a member function
1033
+ //! ``T operator()(T block_aggregate)``. The functor will be invoked by the
1034
+ //! first warp of threads in the block, however only the return value from
1035
+ //! *lane*\ :sub:`0` is applied as the block-wide prefix. Can be stateful.
1036
+ //! - Supports non-commutative scan operators.
1037
+ //! - @blocked
1038
+ //! - @granularity
1039
+ //! - @smemreuse
1040
+ //!
1041
+ //! Snippet
1042
+ //! +++++++
1043
+ //!
1044
+ //! The code snippet below illustrates a single thread block that progressively
1045
+ //! computes an exclusive prefix max scan over multiple "tiles" of input using a
1046
+ //! prefix functor to maintain a running total between block-wide scans. Each tile consists
1047
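+ //! of 512 integer items that are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128 threads where each thread owns 4 consecutive items.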
1048
+ //!
1049
+ //! .. literalinclude:: ../../examples/block/example_block_scan.cu
1050
+ //! :language: c++
1051
+ //! :dedent:
1052
+ //! :start-after: example-begin block-prefix-callback-max-op
1053
+ //! :end-before: example-end block-prefix-callback-max-op
1054
+ //!
1055
+ //! .. literalinclude:: ../../examples/block/example_block_scan.cu
1056
+ //! :language: c++
1057
+ //! :dedent:
1058
+ //! :start-after: example-begin exclusive-scan-prefix-callback
1059
+ //! :end-before: example-end exclusive-scan-prefix-callback
1060
+ //!
1061
+ //! Suppose the input ``d_data`` is ``0, -1, 2, -3, 4, -5, ...``.
1062
+ //! The corresponding output for the first segment will be
1063
+ //! ``INT_MIN, 0, 0, 2, 2, 4, ..., 508, 510``.
1064
+ //! The output for the second segment will be
1065
+ //! ``510, 512, 512, 514, 514, 516, ..., 1020, 1022``.
1066
+ //!
1067
+ //! @endrst
1068
+ //!
1069
+ //! @tparam ITEMS_PER_THREAD
1070
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
1071
+ //!
1072
+ //! @tparam ScanOp
1073
+ //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
1074
+ //!
1075
+ //! @tparam BlockPrefixCallbackOp
1076
+ //! **[inferred]** Call-back functor type having member `T operator()(T block_aggregate)`
1077
+ //!
1078
+ //! @param[in] input
1079
+ //! Calling thread's input items
1080
+ //!
1081
+ //! @param[out] output
1082
+ //! Calling thread's output items (may be aliased to `input`)
1083
+ //!
1084
+ //! @param[in] scan_op
1085
+ //! Binary scan functor
1086
+ //!
1087
+ //! @param[in,out] block_prefix_callback_op
1088
+ //! @rst
1089
+ //! *warp*\ :sub:`0` only call-back functor for specifying a block-wide prefix to be applied to
1090
+ //! the logical input sequence.
1091
+ //! @endrst
1092
+ template <int ITEMS_PER_THREAD, typename ScanOp, typename BlockPrefixCallbackOp>
1093
+ _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveScan(
1094
+ T (&input)[ITEMS_PER_THREAD],
1095
+ T (&output)[ITEMS_PER_THREAD],
1096
+ ScanOp scan_op,
1097
+ BlockPrefixCallbackOp& block_prefix_callback_op)
1098
+ { // Three phases: per-thread reduction, block-wide scan of the partials, per-thread rescan
1099
+ // Fold this thread's ITEMS_PER_THREAD inputs into a single partial using scan_op
1100
+ T thread_prefix = cub::ThreadReduce(input, scan_op);
1101
+
1102
+ // Exclusive block-wide scan over the per-thread partials (in place); the callback supplies the block-wide prefix
1103
+ ExclusiveScan(thread_prefix, thread_prefix, scan_op, block_prefix_callback_op);
1104
+
1105
+ // Exclusive scan over this thread's items, seeded with the scanned thread_prefix
1106
+ detail::ThreadScanExclusive(input, output, scan_op, thread_prefix);
1107
+ }
1108
+
1109
+ //! @} end member group
1110
+ #ifndef _CCCL_DOXYGEN_INVOKED // Do not document no-initial-value scans
1111
+
1112
+ //! @name Exclusive prefix scan operations (no initial value, single datum per thread)
1113
+ //! @{
1114
+
1115
+ //! @rst
1116
+ //! Computes an exclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
1117
+ //! Each thread contributes one input element.
1118
+ //! With no initial value, the output computed for *thread*\ :sub:`0` is undefined.
1119
+ //!
1120
+ //! - Supports non-commutative scan operators.
1121
+ //! - @rowmajor
1122
+ //! - @smemreuse
1123
+ //!
1124
+ //! @endrst
1125
+ //!
1126
+ //! @tparam ScanOp
1127
+ //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
1128
+ //!
1129
+ //! @param[in] input
1130
+ //! Calling thread's input item
1131
+ //!
1132
+ //! @param[out] output
1133
+ //! Calling thread's output item (may be aliased to `input`)
1134
+ //!
1135
+ //! @param[in] scan_op
1136
+ //! Binary scan functor
1137
+ template <typename ScanOp>
1138
+ _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveScan(T input, T& output, ScanOp scan_op)
1139
+ {
1140
+ InternalBlockScan(temp_storage).ExclusiveScan(input, output, scan_op); // Thin wrapper: forwards unchanged to an InternalBlockScan built from temp_storage
1141
+ }
1142
+
1143
+ //! @rst
1144
+ //! Computes an exclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
1145
+ //! Each thread contributes one input element. Also provides every thread with the block-wide
1146
+ //! ``block_aggregate`` of all inputs. With no initial value, the output computed for
1147
+ //! *thread*\ :sub:`0` is undefined.
1148
+ //!
1149
+ //! - Supports non-commutative scan operators.
1150
+ //! - @rowmajor
1151
+ //! - @smemreuse
1152
+ //!
1153
+ //! @endrst
1154
+ //!
1155
+ //! @tparam ScanOp
1156
+ //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
1157
+ //!
1158
+ //! @param[in] input
1159
+ //! Calling thread's input item
1160
+ //!
1161
+ //! @param[out] output
1162
+ //! Calling thread's output item (may be aliased to `input`)
1163
+ //!
1164
+ //! @param[in] scan_op
1165
+ //! Binary scan functor
1166
+ //!
1167
+ //! @param[out] block_aggregate
1168
+ //! block-wide aggregate reduction of input items
1169
+ template <typename ScanOp>
1170
+ _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveScan(T input, T& output, ScanOp scan_op, T& block_aggregate)
1171
+ {
1172
+ InternalBlockScan(temp_storage).ExclusiveScan(input, output, scan_op, block_aggregate); // Thin wrapper: forwards unchanged (including block_aggregate) to an InternalBlockScan built from temp_storage
1173
+ }
1174
+
1175
+ //! @} end member group // Exclusive prefix scans (no initial value, single datum per thread)
1176
+ //! @name Exclusive prefix scan operations (no initial value, multiple data per thread)
1177
+ //! @{
1178
+
1179
+ //! @rst
1180
+ //! Computes an exclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
1181
+ //! Each thread contributes an array of consecutive input elements. With no initial value, the
1182
+ //! output computed for *thread*\ :sub:`0` is undefined.
1183
+ //!
1184
+ //! - Supports non-commutative scan operators.
1185
+ //! - @blocked
1186
+ //! - @granularity
1187
+ //! - @smemreuse
1188
+ //!
1189
+ //! @endrst
1190
+ //!
1191
+ //! @tparam ITEMS_PER_THREAD
1192
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
1193
+ //!
1194
+ //! @tparam ScanOp
1195
+ //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
1196
+ //!
1197
+ //! @param[in] input
1198
+ //! Calling thread's input items
1199
+ //!
1200
+ //! @param[out] output
1201
+ //! Calling thread's output items (may be aliased to `input`)
1202
+ //!
1203
+ //! @param[in] scan_op
1204
+ //! Binary scan functor
1205
+ template <int ITEMS_PER_THREAD, typename ScanOp>
1206
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
1207
+ ExclusiveScan(T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], ScanOp scan_op)
1208
+ { // Three phases: per-thread reduction, block-wide scan of the partials, per-thread rescan
1209
+ // Fold this thread's ITEMS_PER_THREAD inputs into a single partial using scan_op
1210
+ T thread_partial = cub::ThreadReduce(input, scan_op);
1211
+
1212
+ // Exclusive block-wide scan over the per-thread partials (in place); with no initial value, thread 0's result is undefined
1213
+ ExclusiveScan(thread_partial, thread_partial, scan_op);
1214
+
1215
+ // Exclusive scan over this thread's items with thread_partial as prefix; the (linear_tid != 0) argument presumably disables the prefix for thread 0 - confirm in ThreadScanExclusive
1216
+ detail::ThreadScanExclusive(input, output, scan_op, thread_partial, (linear_tid != 0));
1217
+ }
1218
+
1219
+ //! @rst
1220
+ //! Computes an exclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
1221
+ //! Each thread contributes an array of consecutive input elements. Also provides every thread
1222
+ //! with the block-wide ``block_aggregate`` of all inputs.
1223
+ //! With no initial value, the output computed for *thread*\ :sub:`0` is undefined.
1224
+ //!
1225
+ //! - Supports non-commutative scan operators.
1226
+ //! - @blocked
1227
+ //! - @granularity
1228
+ //! - @smemreuse
1229
+ //!
1230
+ //! @endrst
1231
+ //!
1232
+ //! @tparam ITEMS_PER_THREAD
1233
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
1234
+ //!
1235
+ //! @tparam ScanOp
1236
+ //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
1237
+ //!
1238
+ //! @param[in] input
1239
+ //! Calling thread's input items
1240
+ //!
1241
+ //! @param[out] output
1242
+ //! Calling thread's output items (may be aliased to `input`)
1243
+ //!
1244
+ //! @param[in] scan_op
1245
+ //! Binary scan functor
1246
+ //!
1247
+ //! @param[out] block_aggregate
1248
+ //! block-wide aggregate reduction of input items
1249
+ template <int ITEMS_PER_THREAD, typename ScanOp>
1250
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
1251
+ ExclusiveScan(T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], ScanOp scan_op, T& block_aggregate)
1252
+ { // Three phases: per-thread reduction, block-wide scan of the partials, per-thread rescan
1253
+ // Fold this thread's ITEMS_PER_THREAD inputs into a single partial using scan_op
1254
+ T thread_partial = cub::ThreadReduce(input, scan_op);
1255
+
1256
+ // Exclusive block-wide scan over the per-thread partials (in place), also producing block_aggregate; thread 0's result is undefined
1257
+ ExclusiveScan(thread_partial, thread_partial, scan_op, block_aggregate);
1258
+
1259
+ // Exclusive scan over this thread's items with thread_partial as prefix; the (linear_tid != 0) argument presumably disables the prefix for thread 0 - confirm in ThreadScanExclusive
1260
+ detail::ThreadScanExclusive(input, output, scan_op, thread_partial, (linear_tid != 0));
1261
+ }
1262
+
1263
+ //! @} end member group // Exclusive prefix scans (no initial value, multiple data per thread)
1264
+ #endif // _CCCL_DOXYGEN_INVOKED // Do not document no-initial-value scans
1265
+
1266
+ //! @name Inclusive prefix sum operations
1267
+ //! @{
1268
+
1269
+ //! @rst
1270
+ //! Computes an inclusive block-wide prefix scan using addition (+)
1271
+ //! as the scan operator. Each thread contributes one input element.
1272
+ //!
1273
+ //! - @rowmajor
1274
+ //! - @smemreuse
1275
+ //!
1276
+ //! Snippet
1277
+ //! +++++++
1278
+ //!
1279
+ //! The code snippet below illustrates an inclusive prefix sum of 128 integer items that
1280
+ //! are partitioned across 128 threads.
1281
+ //!
1282
+ //! .. literalinclude:: ../../examples/block/example_block_scan.cu
1283
+ //! :language: c++
1284
+ //! :dedent:
1285
+ //! :start-after: example-begin inclusive-sum-single
1286
+ //! :end-before: example-end inclusive-sum-single
1287
+ //!
1288
+ //! Suppose the set of input ``thread_data`` across the block of threads is ``1, 1, ..., 1``.
1289
+ //! The corresponding output ``thread_data`` in those threads will be ``1, 2, ..., 128``.
1290
+ //!
1291
+ //! @endrst
1292
+ //!
1293
+ //! @param[in] input
1294
+ //! Calling thread's input item
1295
+ //!
1296
+ //! @param[out] output
1297
+ //! Calling thread's output item (may be aliased to `input`)
1298
+ _CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveSum(T input, T& output)
1299
+ {
1300
+ InclusiveScan(input, output, ::cuda::std::plus<>{}); // Inclusive sum is InclusiveScan with ::cuda::std::plus<> as the scan operator
1301
+ }
1302
+
1303
+ //! @rst
1304
+ //! Computes an inclusive block-wide prefix scan using addition (+) as the scan operator.
1305
+ //! Each thread contributes one input element.
1306
+ //! Also provides every thread with the block-wide ``block_aggregate`` of all inputs.
1307
+ //!
1308
+ //! - @rowmajor
1309
+ //! - @smemreuse
1310
+ //!
1311
+ //! Snippet
1312
+ //! +++++++
1313
+ //!
1314
+ //! The code snippet below illustrates an inclusive prefix sum of 128 integer items that
1315
+ //! are partitioned across 128 threads.
1316
+ //!
1317
+ //! .. code-block:: c++
1318
+ //!
1319
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
1320
+ //!
1321
+ //! __global__ void ExampleKernel(...)
1322
+ //! {
1323
+ //! // Specialize BlockScan for a 1D block of 128 threads of type int
1324
+ //! using BlockScan = cub::BlockScan<int, 128>;
1325
+ //!
1326
+ //! // Allocate shared memory for BlockScan
1327
+ //! __shared__ typename BlockScan::TempStorage temp_storage;
1328
+ //!
1329
+ //! // Obtain input item for each thread
1330
+ //! int thread_data;
1331
+ //! ...
1332
+ //!
1333
+ //! // Collectively compute the block-wide inclusive prefix sum
1334
+ //! int block_aggregate;
1335
+ //! BlockScan(temp_storage).InclusiveSum(thread_data, thread_data, block_aggregate);
1336
+ //!
1337
+ //! Suppose the set of input ``thread_data`` across the block of threads is ``1, 1, ..., 1``.
1338
+ //! The corresponding output ``thread_data`` in those threads will be ``1, 2, ..., 128``.
1339
+ //! Furthermore the value ``128`` will be stored in ``block_aggregate`` for all threads.
1340
+ //!
1341
+ //! @endrst
1342
+ //!
1343
+ //! @param[in] input
1344
+ //! Calling thread's input item
1345
+ //!
1346
+ //! @param[out] output
1347
+ //! Calling thread's output item (may be aliased to `input`)
1348
+ //!
1349
+ //! @param[out] block_aggregate
1350
+ //! block-wide aggregate reduction of input items
1351
+ _CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveSum(T input, T& output, T& block_aggregate)
1352
+ {
1353
+ InclusiveScan(input, output, ::cuda::std::plus<>{}, block_aggregate); // InclusiveScan with ::cuda::std::plus<> as the operator; block_aggregate is passed through as the output argument
1354
+ }
1355
+
1356
+ //! @rst
1357
+ //! Computes an inclusive block-wide prefix scan using addition (+) as the scan operator.
1358
+ //! Each thread contributes one input element. Instead of using 0 as the block-wide prefix, the call-back functor
1359
+ //! ``block_prefix_callback_op`` is invoked by the first warp in the block, and the value returned by
1360
+ //! *lane*\ :sub:`0` in that warp is used as the "seed" value that logically prefixes the thread block's
1361
+ //! scan inputs.
1362
+ //!
1363
+ //! - The ``block_prefix_callback_op`` functor must implement a member function
1364
+ //! ``T operator()(T block_aggregate)``. The functor will be invoked by the first warp of threads in the block,
1365
+ //! however only the return value from *lane*\ :sub:`0` is applied as the block-wide prefix. Can be stateful.
1366
+ //! - @rowmajor
1367
+ //! - @smemreuse
1368
+ //!
1369
+ //! Snippet
1370
+ //! +++++++
1371
+ //!
1372
+ //! The code snippet below illustrates a single thread block that progressively
1373
+ //! computes an inclusive prefix sum over multiple "tiles" of input using a
1374
+ //! prefix functor to maintain a running total between block-wide scans.
1375
+ //! Each tile consists of 128 integer items that are partitioned across 128 threads.
1376
+ //!
1377
+ //! .. code-block:: c++
1378
+ //!
1379
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
1380
+ //!
1381
+ //! // A stateful callback functor that maintains a running prefix to be applied
1382
+ //! // during consecutive scan operations.
1383
+ //! struct BlockPrefixCallbackOp
1384
+ //! {
1385
+ //! // Running prefix
1386
+ //! int running_total;
1387
+ //!
1388
+ //! // Constructor
1389
+ //! __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {}
1390
+ //!
1391
+ //! // Callback operator to be entered by the first warp of threads in the block.
1392
+ //! // Thread-0 is responsible for returning a value for seeding the block-wide scan.
1393
+ //! __device__ int operator()(int block_aggregate)
1394
+ //! {
1395
+ //! int old_prefix = running_total;
1396
+ //! running_total += block_aggregate;
1397
+ //! return old_prefix;
1398
+ //! }
1399
+ //! };
1400
+ //!
1401
+ //! __global__ void ExampleKernel(int *d_data, int num_items, ...)
1402
+ //! {
1403
+ //! // Specialize BlockScan for a 1D block of 128 threads
1404
+ //! using BlockScan = cub::BlockScan<int, 128>;
1405
+ //!
1406
+ //! // Allocate shared memory for BlockScan
1407
+ //! __shared__ typename BlockScan::TempStorage temp_storage;
1408
+ //!
1409
+ //! // Initialize running total
1410
+ //! BlockPrefixCallbackOp prefix_op(0);
1411
+ //!
1412
+ //! // Have the block iterate over segments of items
1413
+ //! for (int block_offset = 0; block_offset < num_items; block_offset += 128)
1414
+ //! {
1415
+ //! // Load a segment of consecutive items that are blocked across threads
1416
+ //! int thread_data = d_data[block_offset + threadIdx.x];
1417
+ //!
1418
+ //! // Collectively compute the block-wide inclusive prefix sum
1419
+ //! BlockScan(temp_storage).InclusiveSum(
1420
+ //! thread_data, thread_data, prefix_op);
1421
+ //! __syncthreads();
1422
+ //!
1423
+ //! // Store scanned items to output segment
1424
+ //! d_data[block_offset + threadIdx.x] = thread_data;
1425
+ //! }
1426
+ //!
1427
+ //! Suppose the input ``d_data`` is ``1, 1, 1, 1, 1, 1, 1, 1, ...``.
1428
+ //! The corresponding output for the first segment will be ``1, 2, ..., 128``.
1429
+ //! The output for the second segment will be ``129, 130, ..., 256``.
1430
+ //!
1431
+ //! @endrst
1432
+ //!
1433
+ //! @tparam BlockPrefixCallbackOp
1434
+ //! **[inferred]** Call-back functor type having member `T operator()(T block_aggregate)`
1435
+ //!
1436
+ //! @param[in] input
1437
+ //! Calling thread's input item
1438
+ //!
1439
+ //! @param[out] output
1440
+ //! Calling thread's output item (may be aliased to `input`)
1441
+ //!
1442
+ //! @param[in,out] block_prefix_callback_op
1443
+ //! @rst
1444
+ //! *warp*\ :sub:`0` only call-back functor for specifying a block-wide prefix to be applied
1445
+ //! to the logical input sequence.
1446
+ //! @endrst
1447
+ template <typename BlockPrefixCallbackOp>
1448
+ _CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveSum(T input, T& output, BlockPrefixCallbackOp& block_prefix_callback_op)
1449
+ {
1450
+ InclusiveScan(input, output, ::cuda::std::plus<>{}, block_prefix_callback_op); // InclusiveScan with ::cuda::std::plus<> as the operator; the callback is passed through by reference
1451
+ }
1452
+
1453
+ //! @} end member group
1454
+ //! @name Inclusive prefix sum operations (multiple data per thread)
1455
+ //! @{
1456
+
1457
+ //! @rst
1458
+ //! Computes an inclusive block-wide prefix scan using addition (+) as the scan operator.
1459
+ //! Each thread contributes an array of consecutive input elements.
1460
+ //!
1461
+ //! - @blocked
1462
+ //! - @granularity
1463
+ //! - @smemreuse
1464
+ //!
1465
+ //! Snippet
1466
+ //! +++++++
1467
+ //!
1468
+ //! The code snippet below illustrates an inclusive prefix sum of 512 integer items that
1469
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128 threads
1470
+ //! where each thread owns 4 consecutive items.
1471
+ //!
1472
+ //! .. literalinclude:: ../../examples/block/example_block_scan.cu
1473
+ //! :language: c++
1474
+ //! :dedent:
1475
+ //! :start-after: example-begin inclusive-sum-array
1476
+ //! :end-before: example-end inclusive-sum-array
1477
+ //!
1478
+ //! Suppose the set of input ``thread_data`` across the block of threads is
1479
+ //! ``{ [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }``. The corresponding output
1480
+ //! ``thread_data`` in those threads will be ``{ [1,2,3,4], [5,6,7,8], ..., [509,510,511,512] }``.
1481
+ //!
1482
+ //! @endrst
1483
+ //!
1484
+ //! @tparam ITEMS_PER_THREAD
1485
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
1486
+ //!
1487
+ //! @param[in] input
1488
+ //! Calling thread's input items
1489
+ //!
1490
+ //! @param[out] output
1491
+ //! Calling thread's output items (may be aliased to `input`)
1492
+ template <int ITEMS_PER_THREAD>
1493
+ _CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveSum(T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD])
1494
+ {
1495
+ if constexpr (ITEMS_PER_THREAD == 1) // Single item per thread: defer to the scalar overload
1496
+ {
1497
+ InclusiveSum(input[0], output[0]);
1498
+ }
1499
+ else
1500
+ {
1501
+ // Fold this thread's ITEMS_PER_THREAD inputs into a single partial sum
1502
+ ::cuda::std::plus<> scan_op;
1503
+ T thread_prefix = cub::ThreadReduce(input, scan_op);
1504
+
1505
+ // Exclusive block-wide sum over the per-thread partials (in place)
1506
+ ExclusiveSum(thread_prefix, thread_prefix);
1507
+
1508
+ // Inclusive scan over this thread's items with thread_prefix as seed; the (linear_tid != 0) argument presumably skips the prefix for thread 0 - confirm in ThreadScanInclusive
1509
+ detail::ThreadScanInclusive(input, output, scan_op, thread_prefix, (linear_tid != 0));
1510
+ }
1511
+ }
1512
+
1513
+ //! @rst
1514
+ //! Computes an inclusive block-wide prefix scan using addition (+) as the scan operator.
1515
+ //! Each thread contributes an array of consecutive input elements.
1516
+ //! Also provides every thread with the block-wide ``block_aggregate`` of all inputs.
1517
+ //!
1518
+ //! - @blocked
1519
+ //! - @granularity
1520
+ //! - @smemreuse
1521
+ //!
1522
+ //! Snippet
1523
+ //! +++++++
1524
+ //!
1525
+ //! The code snippet below illustrates an inclusive prefix sum of 512 integer items that
1526
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128 threads
1527
+ //! where each thread owns 4 consecutive items.
1528
+ //!
1529
+ //! .. code-block:: c++
1530
+ //!
1531
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
1532
+ //!
1533
+ //! __global__ void ExampleKernel(...)
1534
+ //! {
1535
+ //! // Specialize BlockScan for a 1D block of 128 threads of type int
1536
+ //! using BlockScan = cub::BlockScan<int, 128>;
1537
+ //!
1538
+ //! // Allocate shared memory for BlockScan
1539
+ //! __shared__ typename BlockScan::TempStorage temp_storage;
1540
+ //!
1541
+ //! // Obtain a segment of consecutive items that are blocked across threads
1542
+ //! int thread_data[4];
1543
+ //! ...
1544
+ //!
1545
+ //! // Collectively compute the block-wide inclusive prefix sum
1546
+ //! int block_aggregate;
1547
+ //! BlockScan(temp_storage).InclusiveSum(thread_data, thread_data, block_aggregate);
1548
+ //!
1549
+ //! Suppose the set of input ``thread_data`` across the block of threads is
1550
+ //! ``{ [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }``. The
1551
+ //! corresponding output ``thread_data`` in those threads will be
1552
+ //! ``{ [1,2,3,4], [5,6,7,8], ..., [509,510,511,512] }``.
1553
+ //! Furthermore the value ``512`` will be stored in ``block_aggregate`` for all threads.
1554
+ //!
1555
+ //! @endrst
1556
+ //!
1557
+ //! @tparam ITEMS_PER_THREAD
1558
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
1559
+ //!
1560
+ //! @param[in] input
1561
+ //! Calling thread's input items
1562
+ //!
1563
+ //! @param[out] output
1564
+ //! Calling thread's output items (may be aliased to `input`)
1565
+ //!
1566
+ //! @param[out] block_aggregate
1567
+ //! block-wide aggregate reduction of input items
1568
+ template <int ITEMS_PER_THREAD>
1569
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
1570
+ InclusiveSum(T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], T& block_aggregate)
1571
+ {
1572
+ if constexpr (ITEMS_PER_THREAD == 1) // Single item per thread: defer to the scalar overload
1573
+ {
1574
+ InclusiveSum(input[0], output[0], block_aggregate);
1575
+ }
1576
+ else
1577
+ {
1578
+ // Fold this thread's ITEMS_PER_THREAD inputs into a single partial sum
1579
+ ::cuda::std::plus<> scan_op;
1580
+ T thread_prefix = cub::ThreadReduce(input, scan_op);
1581
+
1582
+ // Exclusive block-wide sum over the per-thread partials (in place), also producing block_aggregate
1583
+ ExclusiveSum(thread_prefix, thread_prefix, block_aggregate);
1584
+
1585
+ // Inclusive scan over this thread's items with thread_prefix as seed; the (linear_tid != 0) argument presumably skips the prefix for thread 0 - confirm in ThreadScanInclusive
1586
+ detail::ThreadScanInclusive(input, output, scan_op, thread_prefix, (linear_tid != 0));
1587
+ }
1588
+ }
1589
+
1590
+ //! @rst
1591
+ //! Computes an inclusive block-wide prefix scan using addition (+) as the scan operator.
1592
+ //! Each thread contributes an array of consecutive input elements.
1593
+ //! Instead of using 0 as the block-wide prefix, the call-back functor ``block_prefix_callback_op`` is invoked by
1594
+ //! the first warp in the block, and the value returned by *lane*\ :sub:`0` in that warp is used as the "seed"
1595
+ //! value that logically prefixes the thread block's scan inputs.
1596
+ //!
1597
+ //! - The ``block_prefix_callback_op`` functor must implement a member function
1598
+ //! ``T operator()(T block_aggregate)``. The functor will be invoked by the first warp of threads in the block,
1599
+ //! however only the return value from *lane*\ :sub:`0` is applied as the block-wide prefix. Can be stateful.
1600
+ //! - @blocked
1601
+ //! - @granularity
1602
+ //! - @smemreuse
1603
+ //!
1604
+ //! Snippet
1605
+ //! +++++++
1606
+ //!
1607
+ //! The code snippet below illustrates a single thread block that progressively
1608
+ //! computes an inclusive prefix sum over multiple "tiles" of input using a
1609
+ //! prefix functor to maintain a running total between block-wide scans. Each tile consists
1610
+ //! of 512 integer items that are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>`
1611
+ //! across 128 threads where each thread owns 4 consecutive items.
1612
+ //!
1613
+ //! .. literalinclude:: ../../examples/block/example_block_scan.cu
1614
+ //! :language: c++
1615
+ //! :dedent:
1616
+ //! :start-after: example-begin block-prefix-callback-op
1617
+ //! :end-before: example-end block-prefix-callback-op
1618
+ //!
1619
+ //! .. literalinclude:: ../../examples/block/example_block_scan.cu
1620
+ //! :language: c++
1621
+ //! :dedent:
1622
+ //! :start-after: example-begin inclusive-scan-prefix-callback
1623
+ //! :end-before: example-end inclusive-scan-prefix-callback
1624
+ //!
1625
+ //! Suppose the input ``d_data`` is ``1, 1, 1, 1, 1, 1, 1, 1, ...``.
1626
+ //! The corresponding output for the first segment will be
1627
+ //! ``1, 2, 3, 4, ..., 511, 512``. The output for the second segment will be
1628
+ //! ``513, 514, 515, 516, ..., 1023, 1024``.
1629
+ //!
1630
+ //! @endrst
1631
+ //!
1632
+ //! @tparam ITEMS_PER_THREAD
1633
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
1634
+ //!
1635
+ //! @tparam BlockPrefixCallbackOp
1636
+ //! **[inferred]** Call-back functor type having member `T operator()(T block_aggregate)`
1637
+ //!
1638
+ //! @param[in] input
1639
+ //! Calling thread's input items
1640
+ //!
1641
+ //! @param[out] output
1642
+ //! Calling thread's output items (may be aliased to `input`)
1643
+ //!
1644
+ //! @param[in,out] block_prefix_callback_op
1645
+ //! @rst
1646
+ //! *warp*\ :sub:`0` only call-back functor for specifying a block-wide prefix to be applied to the
1647
+ //! logical input sequence.
1648
+ //! @endrst
1649
+ template <int ITEMS_PER_THREAD, typename BlockPrefixCallbackOp>
1650
+ _CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveSum(
1651
+ T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], BlockPrefixCallbackOp& block_prefix_callback_op)
1652
+ {
1653
+ if constexpr (ITEMS_PER_THREAD == 1) // Single item per thread: defer to the scalar overload
1654
+ {
1655
+ InclusiveSum(input[0], output[0], block_prefix_callback_op);
1656
+ }
1657
+ else
1658
+ {
1659
+ // Fold this thread's ITEMS_PER_THREAD inputs into a single partial sum
1660
+ ::cuda::std::plus<> scan_op;
1661
+ T thread_prefix = cub::ThreadReduce(input, scan_op);
1662
+
1663
+ // Exclusive block-wide sum over the per-thread partials (in place); the callback supplies the block-wide prefix
1664
+ ExclusiveSum(thread_prefix, thread_prefix, block_prefix_callback_op);
1665
+
1666
+ // Inclusive scan over this thread's items, seeded with the scanned thread_prefix
1667
+ detail::ThreadScanInclusive(input, output, scan_op, thread_prefix);
1668
+ }
1669
+ }
1670
+
1671
+ //! @} end member group
1672
+ //! @name Inclusive prefix scan operations
1673
+ //! @{
1674
+
1675
+ //! @rst
1676
+ //! Computes an inclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
1677
+ //! Each thread contributes one input element.
1678
+ //!
1679
+ //! - Supports non-commutative scan operators.
1680
+ //! - @rowmajor
1681
+ //! - @smemreuse
1682
+ //!
1683
+ //! Snippet
1684
+ //! +++++++
1685
+ //!
1686
+ //! The code snippet below illustrates an inclusive prefix max scan of 128 integer items that
1687
+ //! are partitioned across 128 threads.
1688
+ //!
1689
+ //! .. code-block:: c++
1690
+ //!
1691
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
1692
+ //!
1693
+ //! __global__ void ExampleKernel(...)
1694
+ //! {
1695
+ //! // Specialize BlockScan for a 1D block of 128 threads of type int
1696
+ //! using BlockScan = cub::BlockScan<int, 128>;
1697
+ //!
1698
+ //! // Allocate shared memory for BlockScan
1699
+ //! __shared__ typename BlockScan::TempStorage temp_storage;
1700
+ //!
1701
+ //! // Obtain input item for each thread
1702
+ //! int thread_data;
1703
+ //! ...
1704
+ //!
1705
+ //! // Collectively compute the block-wide inclusive prefix max scan
1706
+ //! BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cuda::maximum<>{});
1707
+ //!
1708
+ //! Suppose the set of input ``thread_data`` across the block of threads is
1709
+ //! ``0, -1, 2, -3, ..., 126, -127``. The corresponding output ``thread_data``
1710
+ //! in those threads will be ``0, 0, 2, 2, ..., 126, 126``.
1711
+ //!
1712
+ //! @endrst
1713
+ //!
1714
+ //! @tparam ScanOp
1715
+ //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
1716
+ //!
1717
+ //! @param[in] input
1718
+ //! Calling thread's input item
1719
+ //!
1720
+ //! @param[out] output
1721
+ //! Calling thread's output item (may be aliased to `input`)
1722
+ //!
1723
+ //! @param[in] scan_op
1724
+ //! Binary scan functor
1725
+ template <typename ScanOp>
1726
+ _CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveScan(T input, T& output, ScanOp scan_op)
1727
+ {
1728
+ InternalBlockScan(temp_storage).InclusiveScan(input, output, scan_op); // Thin wrapper: forwards unchanged to an InternalBlockScan built from temp_storage
1729
+ }
1730
+
1731
+ //! @rst
1732
+ //! Computes an inclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
1733
+ //! Each thread contributes one input element. Also provides every thread with the block-wide
1734
+ //! ``block_aggregate`` of all inputs.
1735
+ //!
1736
+ //! - Supports non-commutative scan operators.
1737
+ //! - @rowmajor
1738
+ //! - @smemreuse
1739
+ //!
1740
+ //! Snippet
1741
+ //! +++++++
1742
+ //!
1743
+ //! The code snippet below illustrates an inclusive prefix max scan of 128
1744
+ //! integer items that are partitioned across 128 threads.
1745
+ //!
1746
+ //! .. code-block:: c++
1747
+ //!
1748
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
1749
+ //!
1750
+ //! __global__ void ExampleKernel(...)
1751
+ //! {
1752
+ //! // Specialize BlockScan for a 1D block of 128 threads of type int
1753
+ //! using BlockScan = cub::BlockScan<int, 128>;
1754
+ //!
1755
+ //! // Allocate shared memory for BlockScan
1756
+ //! __shared__ typename BlockScan::TempStorage temp_storage;
1757
+ //!
1758
+ //! // Obtain input item for each thread
1759
+ //! int thread_data;
1760
+ //! ...
1761
+ //!
1762
+ //! // Collectively compute the block-wide inclusive prefix max scan
1763
+ //! int block_aggregate;
1764
+ //! BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cuda::maximum<>{}, block_aggregate);
1765
+ //!
1766
+ //! Suppose the set of input ``thread_data`` across the block of threads is
1767
+ //! ``0, -1, 2, -3, ..., 126, -127``. The corresponding output ``thread_data``
1768
+ //! in those threads will be ``0, 0, 2, 2, ..., 126, 126``. Furthermore the value
1769
+ //! ``126`` will be stored in ``block_aggregate`` for all threads.
1770
+ //!
1771
+ //! @endrst
1772
+ //!
1773
+ //! @tparam ScanOp
1774
+ //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
1775
+ //!
1776
+ //! @param[in] input
1777
+ //! Calling thread's input item
1778
+ //!
1779
+ //! @param[out] output
1780
+ //! Calling thread's output item (may be aliased to `input`)
1781
+ //!
1782
+ //! @param[in] scan_op
1783
+ //! Binary scan functor
1784
+ //!
1785
+ //! @param[out] block_aggregate
1786
+ //! Block-wide aggregate reduction of input items
1787
+ template <typename ScanOp>
1788
+ _CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveScan(T input, T& output, ScanOp scan_op, T& block_aggregate)
1789
+ {
1790
+ InternalBlockScan(temp_storage).InclusiveScan(input, output, scan_op, block_aggregate); // Thin wrapper: forwards unchanged (including block_aggregate) to an InternalBlockScan built from temp_storage
1791
+ }
1792
+
1793
+ //! @rst
1794
+ //! Computes an inclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
1795
+ //! Each thread contributes one input element. The call-back functor ``block_prefix_callback_op``
1796
+ //! is invoked by the first warp in the block, and the value returned by *lane*\ :sub:`0` in that warp is used as
1797
+ //! the "seed" value that logically prefixes the thread block's scan inputs.
1798
+ //!
1799
+ //! - The ``block_prefix_callback_op`` functor must implement a member function
1800
+ //! ``T operator()(T block_aggregate)``. The functor's input parameter ``block_aggregate`` is the
1800
+ //! block-wide aggregate of the scan inputs. The functor will be invoked by the first warp of threads in the block,
1802
+ //! however only the return value from *lane*\ :sub:`0` is applied
1803
+ //! as the block-wide prefix. Can be stateful.
1804
+ //! - Supports non-commutative scan operators.
1805
+ //! - @rowmajor
1806
+ //! - @smemreuse
1807
+ //!
1808
+ //! Snippet
1809
+ //! +++++++
1810
+ //!
1811
+ //! The code snippet below illustrates a single thread block that progressively
1812
+ //! computes an inclusive prefix max scan over multiple "tiles" of input using a
1813
+ //! prefix functor to maintain a running total between block-wide scans. Each tile consists
1814
+ //! of 128 integer items that are partitioned across 128 threads.
1815
+ //!
1816
+ //! .. literalinclude:: ../../examples/block/example_block_scan.cu
1817
+ //! :language: c++
1818
+ //! :dedent:
1819
+ //! :start-after: example-begin block-prefix-callback-max-op
1820
+ //! :end-before: example-end block-prefix-callback-max-op
1821
+ //!
1822
+ //! .. literalinclude:: ../../examples/block/example_block_scan.cu
1823
+ //! :language: c++
1824
+ //! :dedent:
1825
+ //! :start-after: example-begin inclusive-scan-prefix-callback-max
1826
+ //! :end-before: example-end inclusive-scan-prefix-callback-max
1827
+ //!
1828
+ //! Suppose the input ``d_data`` is ``0, -1, 2, -3, 4, -5, ...``.
1829
+ //! The corresponding output for the first segment will be
1830
+ //! ``0, 0, 2, 2, ..., 126, 126``. The output for the second segment
1831
+ //! will be ``128, 128, 130, 130, ..., 254, 254``.
1832
+ //!
1833
+ //! @endrst
1834
+ //!
1835
+ //! @tparam ScanOp
1836
+ //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
1837
+ //!
1838
+ //! @tparam BlockPrefixCallbackOp
1839
+ //! **[inferred]** Call-back functor type having member `T operator()(T block_aggregate)`
1840
+ //!
1841
+ //! @param[in] input
1842
+ //! Calling thread's input item
1843
+ //!
1844
+ //! @param[out] output
1845
+ //! Calling thread's output item (may be aliased to `input`)
1846
+ //!
1847
+ //! @param[in] scan_op
1848
+ //! Binary scan functor
1849
+ //!
1850
+ //! @param[in,out] block_prefix_callback_op
1851
+ //! @rst
1852
+ //! *warp*\ :sub:`0` only call-back functor for specifying a block-wide prefix to be applied to
1853
+ //! the logical input sequence.
1854
+ //! @endrst
1855
+ template <typename ScanOp, typename BlockPrefixCallbackOp>
1856
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
1857
+ InclusiveScan(T input, T& output, ScanOp scan_op, BlockPrefixCallbackOp& block_prefix_callback_op)
1858
+ { // Thin wrapper: forwards every argument to an InternalBlockScan constructed from temp_storage
1859
+ InternalBlockScan(temp_storage).InclusiveScan(input, output, scan_op, block_prefix_callback_op);
1860
+ }
1861
+
1862
+ //! @} end member group
1863
+ //! @name Inclusive prefix scan operations (multiple data per thread)
1864
+ //! @{
1865
+
1866
+ //! @rst
1867
+ //! Computes an inclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
1868
+ //! Each thread contributes an array of consecutive input elements.
1869
+ //!
1870
+ //! - Supports non-commutative scan operators.
1871
+ //! - @blocked
1872
+ //! - @granularity
1873
+ //! - @smemreuse
1874
+ //!
1875
+ //! Snippet
1876
+ //! +++++++
1877
+ //!
1878
+ //! The code snippet below illustrates an inclusive prefix max scan of 512 integer items that
1879
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128 threads
1880
+ //! where each thread owns 4 consecutive items.
1881
+ //!
1882
+ //! .. code-block:: c++
1883
+ //!
1884
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
1885
+ //!
1886
+ //! __global__ void ExampleKernel(...)
1887
+ //! {
1888
+ //! // Specialize BlockScan for a 1D block of 128 threads of type int
1889
+ //! using BlockScan = cub::BlockScan<int, 128>;
1890
+ //!
1891
+ //! // Allocate shared memory for BlockScan
1892
+ //! __shared__ typename BlockScan::TempStorage temp_storage;
1893
+ //!
1894
+ //! // Obtain a segment of consecutive items that are blocked across threads
1895
+ //! int thread_data[4];
1896
+ //! ...
1897
+ //!
1898
+ //! // Collectively compute the block-wide inclusive prefix max scan
1899
+ //! BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cuda::maximum<>{});
1900
+ //!
1901
+ //! Suppose the set of input ``thread_data`` across the block of threads is
1902
+ //! ``{ [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }``.
1903
+ //! The corresponding output ``thread_data`` in those threads will be
1904
+ //! ``{ [0,0,2,2], [4,4,6,6], ..., [508,508,510,510] }``.
1905
+ //!
1906
+ //! @endrst
1907
+ //!
1908
+ //! @tparam ITEMS_PER_THREAD
1909
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
1910
+ //!
1911
+ //! @tparam ScanOp
1912
+ //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
1913
+ //!
1914
+ //! @param[in] input
1915
+ //! Calling thread's input items
1916
+ //!
1917
+ //! @param[out] output
1918
+ //! Calling thread's output items (may be aliased to `input`)
1919
+ //!
1920
+ //! @param[in] scan_op
1921
+ //! Binary scan functor
1922
+ template <int ITEMS_PER_THREAD, typename ScanOp>
1923
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
1924
+ InclusiveScan(T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], ScanOp scan_op)
1925
+ {
1926
+ if constexpr (ITEMS_PER_THREAD != 1)
1927
+ {
1928
+ // Collapse this thread's items into a single partial result
1929
+ T partial = cub::ThreadReduce(input, scan_op);
1930
+
1931
+ // Exclusive block-wide scan over the per-thread partials (input and output alias)
1932
+ ExclusiveScan(partial, partial, scan_op);
1933
+
1934
+ // Rescan the items in registers; the prefix is applied by every thread except linear_tid 0
1935
+ detail::ThreadScanInclusive(input, output, scan_op, partial, (linear_tid != 0));
1936
+ }
1937
+ else
1938
+ { // Exactly one item per thread: defer to the single-item overload
1939
+ InclusiveScan(input[0], output[0], scan_op);
1940
+ }
1941
+ }
1942
+
1943
+ //! @rst
1944
+ //! Computes an inclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
1945
+ //! Each thread contributes an array of consecutive input elements.
1946
+ //!
1947
+ //! - Supports non-commutative scan operators.
1948
+ //! - @blocked
1949
+ //! - @granularity
1950
+ //! - @smemreuse
1951
+ //!
1952
+ //! Snippet
1953
+ //! +++++++
1954
+ //!
1955
+ //! The code snippet below illustrates an inclusive prefix max scan of 128 integer items that
1956
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 64 threads
1957
+ //! where each thread owns 2 consecutive items.
1958
+ //!
1959
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_scan_api.cu
1960
+ //! :language: c++
1961
+ //! :dedent:
1962
+ //! :start-after: example-begin inclusive-scan-array-init-value
1963
+ //! :end-before: example-end inclusive-scan-array-init-value
1964
+ //!
1965
+ //!
1966
+ //! @endrst
1967
+ //!
1968
+ //! @tparam ITEMS_PER_THREAD
1969
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
1970
+ //!
1971
+ //! @tparam ScanOp
1972
+ //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
1973
+ //!
1974
+ //! @param[in] input
1975
+ //! Calling thread's input items
1976
+ //!
1977
+ //! @param[out] output
1978
+ //! Calling thread's output items (may be aliased to `input`)
1979
+ //!
1980
+ //! @param[in] initial_value
1981
+ //! Initial value to seed the inclusive scan (uniform across block)
1982
+ //!
1983
+ //! @param[in] scan_op
1984
+ //! Binary scan functor
1985
+ template <int ITEMS_PER_THREAD, typename ScanOp>
1986
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
1987
+ InclusiveScan(T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], T initial_value, ScanOp scan_op)
1988
+ {
1989
+ // Reduce consecutive thread items in registers
1990
+ T thread_prefix = cub::ThreadReduce(input, scan_op);
1991
+
1992
+ // Exclusive thread block-scan of the per-thread reductions, seeded with initial_value
1993
+ ExclusiveScan(thread_prefix, thread_prefix, initial_value, scan_op);
1994
+
1995
+ // Inclusive scan in registers with prefix as seed (no per-thread apply-prefix flag is passed here)
1996
+ detail::ThreadScanInclusive(input, output, scan_op, thread_prefix);
1997
+ }
1998
+
1999
+ //! @rst
2000
+ //! Computes an inclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
2001
+ //! Each thread contributes an array of consecutive input elements. Also provides every thread
2002
+ //! with the block-wide ``block_aggregate`` of all inputs.
2003
+ //!
2004
+ //! - Supports non-commutative scan operators.
2005
+ //! - @blocked
2006
+ //! - @granularity
2007
+ //! - @smemreuse
2008
+ //!
2009
+ //! Snippet
2010
+ //! +++++++
2011
+ //!
2012
+ //! The code snippet below illustrates an inclusive prefix max scan of 512 integer items that
2013
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128 threads
2014
+ //! where each thread owns 4 consecutive items.
2015
+ //!
2016
+ //! .. code-block:: c++
2017
+ //!
2018
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
2019
+ //!
2020
+ //! __global__ void ExampleKernel(...)
2021
+ //! {
2022
+ //! // Specialize BlockScan for a 1D block of 128 threads of type int
2023
+ //! using BlockScan = cub::BlockScan<int, 128>;
2024
+ //!
2025
+ //! // Allocate shared memory for BlockScan
2026
+ //! __shared__ typename BlockScan::TempStorage temp_storage;
2027
+ //!
2028
+ //! // Obtain a segment of consecutive items that are blocked across threads
2029
+ //! int thread_data[4];
2030
+ //! ...
2031
+ //!
2032
+ //! // Collectively compute the block-wide inclusive prefix max scan
2033
+ //! int block_aggregate;
2034
+ //! BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cuda::maximum<>{}, block_aggregate);
2035
+ //!
2036
+ //! Suppose the set of input ``thread_data`` across the block of threads is
2037
+ //! ``{ [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }``.
2038
+ //! The corresponding output ``thread_data`` in those threads will be
2039
+ //! ``{ [0,0,2,2], [4,4,6,6], ..., [508,508,510,510] }``.
2040
+ //! Furthermore the value ``510`` will be stored in ``block_aggregate`` for all threads.
2041
+ //!
2042
+ //! @endrst
2043
+ //!
2044
+ //! @tparam ITEMS_PER_THREAD
2045
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
2046
+ //!
2047
+ //! @tparam ScanOp
2048
+ //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
2049
+ //!
2050
+ //! @param[in] input
2051
+ //! Calling thread's input items
2052
+ //!
2053
+ //! @param[out] output
2054
+ //! Calling thread's output items (may be aliased to `input`)
2055
+ //!
2056
+ //! @param[in] scan_op
2057
+ //! Binary scan functor
2058
+ //!
2059
+ //! @param[out] block_aggregate
2060
+ //! Block-wide aggregate reduction of input items
2061
+ template <int ITEMS_PER_THREAD, typename ScanOp>
2062
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
2063
+ InclusiveScan(T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], ScanOp scan_op, T& block_aggregate)
2064
+ {
2065
+ if constexpr (ITEMS_PER_THREAD == 1) // compile-time dispatch on the template constant
2066
+ {
2067
+ InclusiveScan(input[0], output[0], scan_op, block_aggregate);
2068
+ }
2069
+ else
2070
+ {
2071
+ // Reduce consecutive thread items in registers
2072
+ T thread_prefix = cub::ThreadReduce(input, scan_op);
2073
+
2074
+ // Exclusive thread block-scan (with no initial value); also writes block_aggregate
2075
+ ExclusiveScan(thread_prefix, thread_prefix, scan_op, block_aggregate);
2076
+
2077
+ // Inclusive scan in registers with prefix as seed (first thread does not seed)
2078
+ detail::ThreadScanInclusive(input, output, scan_op, thread_prefix, (linear_tid != 0));
2079
+ }
2080
+ }
2081
+
2082
+ //! @rst
2083
+ //! Computes an inclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
2084
+ //! Each thread contributes an array of consecutive input elements. Also provides every thread
2085
+ //! with the block-wide ``block_aggregate`` of all inputs.
2086
+ //!
2087
+ //! - Supports non-commutative scan operators.
2088
+ //! - @blocked
2089
+ //! - @granularity
2090
+ //! - @smemreuse
2091
+ //!
2092
+ //! Snippet
2093
+ //! +++++++
2094
+ //!
2095
+ //! The code snippet below illustrates an inclusive prefix max scan of 128 integer items that
2096
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 64 threads
2097
+ //! where each thread owns 2 consecutive items.
2098
+ //!
2099
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_scan_api.cu
2100
+ //! :language: c++
2101
+ //! :dedent:
2102
+ //! :start-after: example-begin inclusive-scan-array-aggregate-init-value
2103
+ //! :end-before: example-end inclusive-scan-array-aggregate-init-value
2104
+ //!
2105
+ //! The value ``126`` will be stored in ``block_aggregate`` for all threads.
2106
+ //!
2107
+ //! .. note::
2108
+ //!
2109
+ //! ``initial_value`` is not applied to the block-wide aggregate.
2110
+ //!
2111
+ //! @endrst
2112
+ //!
2113
+ //! @tparam ITEMS_PER_THREAD
2114
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
2115
+ //!
2116
+ //! @tparam ScanOp
2117
+ //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
2118
+ //!
2119
+ //! @param[in] input
2120
+ //! Calling thread's input items
2121
+ //!
2122
+ //! @param[out] output
2123
+ //! Calling thread's output items (may be aliased to `input`)
2124
+ //!
2125
+ //! @param[in] initial_value
2126
+ //! Initial value to seed the inclusive scan (uniform across block). It is not taken
2127
+ //! into account for ``block_aggregate``.
2128
+ //!
2129
+ //! @param[in] scan_op
2130
+ //! Binary scan functor
2131
+ //!
2132
+ //! @param[out] block_aggregate
2133
+ //! Block-wide aggregate reduction of input items
2134
+ template <int ITEMS_PER_THREAD, typename ScanOp>
2135
+ _CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveScan(
2136
+ T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], T initial_value, ScanOp scan_op, T& block_aggregate)
2137
+ {
2138
+ // Reduce consecutive thread items in registers
2139
+ T thread_prefix = cub::ThreadReduce(input, scan_op);
2140
+
2141
+ // Exclusive thread block-scan of the per-thread reductions, seeded with initial_value; also writes block_aggregate
2142
+ ExclusiveScan(thread_prefix, thread_prefix, initial_value, scan_op, block_aggregate);
2143
+
2144
+ // Inclusive scan in registers with prefix as seed (no per-thread apply-prefix flag is passed here)
2145
+ detail::ThreadScanInclusive(input, output, scan_op, thread_prefix);
2146
+ }
2147
+
2148
+ //! @rst
2149
+ //! Computes an inclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
2150
+ //! Each thread contributes an array of consecutive input elements.
2151
+ //! The call-back functor ``block_prefix_callback_op`` is invoked by the first warp in the block,
2152
+ //! and the value returned by *lane*\ :sub:`0` in that warp is used as the "seed" value that logically prefixes the
2153
+ //! thread block's scan inputs.
2154
+ //!
2155
+ //! - The ``block_prefix_callback_op`` functor must implement a member function ``T operator()(T block_aggregate)``.
2156
+ //! The functor will be invoked by the first warp of threads in the block, however only the return value
2157
+ //! from *lane*\ :sub:`0` is applied as the block-wide prefix. Can be stateful.
2158
+ //! - Supports non-commutative scan operators.
2159
+ //! - @blocked
2160
+ //! - @granularity
2161
+ //! - @smemreuse
2162
+ //!
2163
+ //! Snippet
2164
+ //! +++++++
2165
+ //!
2166
+ //! The code snippet below illustrates a single thread block that progressively
2167
+ //! computes an inclusive prefix max scan over multiple "tiles" of input using a
2168
+ //! prefix functor to maintain a running total between block-wide scans. Each tile consists
2169
+ //! of 512 integer items that are partitioned across 128 threads, where each thread owns 4 consecutive items.
2170
+ //!
2171
+ //! .. code-block:: c++
2172
+ //!
2173
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
2174
+ //!
2175
+ //! // A stateful callback functor that maintains a running prefix to be applied
2176
+ //! // during consecutive scan operations.
2177
+ //! struct BlockPrefixCallbackOp
2178
+ //! {
2179
+ //! // Running prefix
2180
+ //! int running_total;
2181
+ //!
2182
+ //! // Constructor
2183
+ //! __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {}
2184
+ //!
2185
+ //! // Callback operator to be entered by the first warp of threads in the block.
2186
+ //! // Thread-0 is responsible for returning a value for seeding the block-wide scan.
2187
+ //! __device__ int operator()(int block_aggregate)
2188
+ //! {
2189
+ //! int old_prefix = running_total;
2190
+ //! running_total = (block_aggregate > old_prefix) ? block_aggregate : old_prefix;
2191
+ //! return old_prefix;
2192
+ //! }
2193
+ //! };
2194
+ //!
2195
+ //! __global__ void ExampleKernel(int *d_data, int num_items, ...)
2196
+ //! {
2197
+ //! // Specialize BlockLoad, BlockStore, and BlockScan for a 1D block of 128 threads, 4 ints per thread
2198
+ //! using BlockLoad = cub::BlockLoad<int, 128, 4, cub::BLOCK_LOAD_TRANSPOSE> ;
2199
+ //! using BlockStore = cub::BlockStore<int, 128, 4, cub::BLOCK_STORE_TRANSPOSE> ;
2200
+ //! using BlockScan = cub::BlockScan<int, 128> ;
2201
+ //!
2202
+ //! // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan
2203
+ //! __shared__ union {
2204
+ //! typename BlockLoad::TempStorage load;
2205
+ //! typename BlockScan::TempStorage scan;
2206
+ //! typename BlockStore::TempStorage store;
2207
+ //! } temp_storage;
2208
+ //!
2209
+ //! // Initialize running total
2210
+ //! BlockPrefixCallbackOp prefix_op(0);
2211
+ //!
2212
+ //! // Have the block iterate over segments of items
2213
+ //! for (int block_offset = 0; block_offset < num_items; block_offset += 128 * 4)
2214
+ //! {
2215
+ //! // Load a segment of consecutive items that are blocked across threads
2216
+ //! int thread_data[4];
2217
+ //! BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data);
2218
+ //! __syncthreads();
2219
+ //!
2220
+ //! // Collectively compute the block-wide inclusive prefix max scan
2221
+ //! BlockScan(temp_storage.scan).InclusiveScan(
2222
+ //! thread_data, thread_data, cuda::maximum<>{}, prefix_op);
2223
+ //! __syncthreads();
2224
+ //!
2225
+ //! // Store scanned items to output segment
2226
+ //! BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data);
2227
+ //! __syncthreads();
2228
+ //! }
2229
+ //!
2230
+ //! Suppose the input ``d_data`` is ``0, -1, 2, -3, 4, -5, ...``.
2231
+ //! The corresponding output for the first segment will be
2232
+ //! ``0, 0, 2, 2, 4, 4, ..., 510, 510``. The output for the second
2233
+ //! segment will be ``512, 512, 514, 514, 516, 516, ..., 1022, 1022``.
2234
+ //!
2235
+ //! @endrst
2236
+ //!
2237
+ //! @tparam ITEMS_PER_THREAD
2238
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
2239
+ //!
2240
+ //! @tparam ScanOp
2241
+ //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
2242
+ //!
2243
+ //! @tparam BlockPrefixCallbackOp
2244
+ //! **[inferred]** Call-back functor type having member `T operator()(T block_aggregate)`
2245
+ //!
2246
+ //! @param[in] input
2247
+ //! Calling thread's input items
2248
+ //!
2249
+ //! @param[out] output
2250
+ //! Calling thread's output items (may be aliased to `input`)
2251
+ //!
2252
+ //! @param[in] scan_op
2253
+ //! Binary scan functor
2254
+ //!
2255
+ //! @param[in,out] block_prefix_callback_op
2256
+ //! @rst
2257
+ //! *warp*\ :sub:`0` only call-back functor for specifying a block-wide prefix to be applied to
2258
+ //! the logical input sequence.
2259
+ //! @endrst
2260
+ template <int ITEMS_PER_THREAD, typename ScanOp, typename BlockPrefixCallbackOp>
2261
+ _CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveScan(
2262
+ T (&input)[ITEMS_PER_THREAD],
2263
+ T (&output)[ITEMS_PER_THREAD],
2264
+ ScanOp scan_op,
2265
+ BlockPrefixCallbackOp& block_prefix_callback_op)
2266
+ {
2267
+ if constexpr (ITEMS_PER_THREAD == 1) // compile-time dispatch on the template constant
2268
+ {
2269
+ InclusiveScan(input[0], output[0], scan_op, block_prefix_callback_op);
2270
+ }
2271
+ else
2272
+ {
2273
+ // Reduce consecutive thread items in registers
2274
+ T thread_prefix = cub::ThreadReduce(input, scan_op);
2275
+
2276
+ // Exclusive thread block-scan; block_prefix_callback_op is passed through to supply the block-wide prefix
2277
+ ExclusiveScan(thread_prefix, thread_prefix, scan_op, block_prefix_callback_op);
2278
+
2279
+ // Inclusive scan in registers with prefix as seed (no per-thread apply-prefix flag is passed here)
2280
+ detail::ThreadScanInclusive(input, output, scan_op, thread_prefix);
2281
+ }
2282
+ }
2283
+
2284
+ //! @} end member group
2285
+ };
2286
+
2287
+ CUB_NAMESPACE_END