cuda-cccl 0.4.3__cp312-cp312-manylinux_2_26_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2024)
  1. cuda/cccl/__init__.py +27 -0
  2. cuda/cccl/_cuda_version_utils.py +24 -0
  3. cuda/cccl/cooperative/__init__.py +9 -0
  4. cuda/cccl/cooperative/experimental/__init__.py +24 -0
  5. cuda/cccl/headers/__init__.py +7 -0
  6. cuda/cccl/headers/include/__init__.py +1 -0
  7. cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +233 -0
  8. cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +1158 -0
  9. cuda/cccl/headers/include/cub/agent/agent_for.cuh +55 -0
  10. cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +699 -0
  11. cuda/cccl/headers/include/cub/agent/agent_merge.cuh +365 -0
  12. cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +721 -0
  13. cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +756 -0
  14. cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +277 -0
  15. cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +715 -0
  16. cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +546 -0
  17. cuda/cccl/headers/include/cub/agent/agent_reduce.cuh +592 -0
  18. cuda/cccl/headers/include/cub/agent/agent_reduce_by_key.cuh +780 -0
  19. cuda/cccl/headers/include/cub/agent/agent_rle.cuh +1092 -0
  20. cuda/cccl/headers/include/cub/agent/agent_scan.cuh +564 -0
  21. cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +448 -0
  22. cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +263 -0
  23. cuda/cccl/headers/include/cub/agent/agent_segmented_scan.cuh +292 -0
  24. cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +1090 -0
  25. cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +346 -0
  26. cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +584 -0
  27. cuda/cccl/headers/include/cub/agent/agent_topk.cuh +762 -0
  28. cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +599 -0
  29. cuda/cccl/headers/include/cub/agent/single_pass_scan_operators.cuh +1384 -0
  30. cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +939 -0
  31. cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +1200 -0
  32. cuda/cccl/headers/include/cub/block/block_exchange.cuh +1279 -0
  33. cuda/cccl/headers/include/cub/block/block_histogram.cuh +396 -0
  34. cuda/cccl/headers/include/cub/block/block_load.cuh +1269 -0
  35. cuda/cccl/headers/include/cub/block/block_load_to_shared.cuh +437 -0
  36. cuda/cccl/headers/include/cub/block/block_merge_sort.cuh +771 -0
  37. cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +1215 -0
  38. cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +2129 -0
  39. cuda/cccl/headers/include/cub/block/block_raking_layout.cuh +124 -0
  40. cuda/cccl/headers/include/cub/block/block_reduce.cuh +661 -0
  41. cuda/cccl/headers/include/cub/block/block_run_length_decode.cuh +406 -0
  42. cuda/cccl/headers/include/cub/block/block_scan.cuh +2168 -0
  43. cuda/cccl/headers/include/cub/block/block_shuffle.cuh +319 -0
  44. cuda/cccl/headers/include/cub/block/block_store.cuh +1238 -0
  45. cuda/cccl/headers/include/cub/block/radix_rank_sort_operations.cuh +597 -0
  46. cuda/cccl/headers/include/cub/block/specializations/block_histogram_atomic.cuh +62 -0
  47. cuda/cccl/headers/include/cub/block/specializations/block_histogram_sort.cuh +209 -0
  48. cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking.cuh +230 -0
  49. cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking_commutative_only.cuh +207 -0
  50. cuda/cccl/headers/include/cub/block/specializations/block_reduce_warp_reductions.cuh +257 -0
  51. cuda/cccl/headers/include/cub/block/specializations/block_scan_raking.cuh +766 -0
  52. cuda/cccl/headers/include/cub/block/specializations/block_scan_warp_scans.cuh +514 -0
  53. cuda/cccl/headers/include/cub/config.cuh +29 -0
  54. cuda/cccl/headers/include/cub/cub.cuh +96 -0
  55. cuda/cccl/headers/include/cub/detail/array_utils.cuh +54 -0
  56. cuda/cccl/headers/include/cub/detail/choose_offset.cuh +135 -0
  57. cuda/cccl/headers/include/cub/detail/detect_cuda_runtime.cuh +50 -0
  58. cuda/cccl/headers/include/cub/detail/device_double_buffer.cuh +94 -0
  59. cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +60 -0
  60. cuda/cccl/headers/include/cub/detail/env_dispatch.cuh +87 -0
  61. cuda/cccl/headers/include/cub/detail/fast_modulo_division.cuh +227 -0
  62. cuda/cccl/headers/include/cub/detail/integer_utils.cuh +87 -0
  63. cuda/cccl/headers/include/cub/detail/launcher/cuda_driver.cuh +149 -0
  64. cuda/cccl/headers/include/cub/detail/launcher/cuda_runtime.cuh +103 -0
  65. cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +112 -0
  66. cuda/cccl/headers/include/cub/detail/ptx-json/array.cuh +41 -0
  67. cuda/cccl/headers/include/cub/detail/ptx-json/json.cuh +39 -0
  68. cuda/cccl/headers/include/cub/detail/ptx-json/object.cuh +71 -0
  69. cuda/cccl/headers/include/cub/detail/ptx-json/string.cuh +79 -0
  70. cuda/cccl/headers/include/cub/detail/ptx-json/value.cuh +95 -0
  71. cuda/cccl/headers/include/cub/detail/ptx-json-parser.cuh +39 -0
  72. cuda/cccl/headers/include/cub/detail/rfa.cuh +706 -0
  73. cuda/cccl/headers/include/cub/detail/strong_load.cuh +163 -0
  74. cuda/cccl/headers/include/cub/detail/strong_store.cuh +194 -0
  75. cuda/cccl/headers/include/cub/detail/temporary_storage.cuh +377 -0
  76. cuda/cccl/headers/include/cub/detail/type_traits.cuh +185 -0
  77. cuda/cccl/headers/include/cub/detail/uninitialized_copy.cuh +48 -0
  78. cuda/cccl/headers/include/cub/detail/unsafe_bitcast.cuh +33 -0
  79. cuda/cccl/headers/include/cub/device/device_adjacent_difference.cuh +572 -0
  80. cuda/cccl/headers/include/cub/device/device_copy.cuh +276 -0
  81. cuda/cccl/headers/include/cub/device/device_for.cuh +1061 -0
  82. cuda/cccl/headers/include/cub/device/device_histogram.cuh +1485 -0
  83. cuda/cccl/headers/include/cub/device/device_memcpy.cuh +171 -0
  84. cuda/cccl/headers/include/cub/device/device_merge.cuh +203 -0
  85. cuda/cccl/headers/include/cub/device/device_merge_sort.cuh +955 -0
  86. cuda/cccl/headers/include/cub/device/device_partition.cuh +644 -0
  87. cuda/cccl/headers/include/cub/device/device_radix_sort.cuh +3413 -0
  88. cuda/cccl/headers/include/cub/device/device_reduce.cuh +2303 -0
  89. cuda/cccl/headers/include/cub/device/device_run_length_encode.cuh +346 -0
  90. cuda/cccl/headers/include/cub/device/device_scan.cuh +2152 -0
  91. cuda/cccl/headers/include/cub/device/device_segmented_radix_sort.cuh +1472 -0
  92. cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +1635 -0
  93. cuda/cccl/headers/include/cub/device/device_segmented_scan.cuh +1398 -0
  94. cuda/cccl/headers/include/cub/device/device_segmented_sort.cuh +2787 -0
  95. cuda/cccl/headers/include/cub/device/device_select.cuh +1204 -0
  96. cuda/cccl/headers/include/cub/device/device_topk.cuh +521 -0
  97. cuda/cccl/headers/include/cub/device/device_transform.cuh +666 -0
  98. cuda/cccl/headers/include/cub/device/dispatch/dispatch_adjacent_difference.cuh +289 -0
  99. cuda/cccl/headers/include/cub/device/dispatch/dispatch_batch_memcpy.cuh +694 -0
  100. cuda/cccl/headers/include/cub/device/dispatch/dispatch_common.cuh +50 -0
  101. cuda/cccl/headers/include/cub/device/dispatch/dispatch_copy_mdspan.cuh +77 -0
  102. cuda/cccl/headers/include/cub/device/dispatch/dispatch_fixed_size_segmented_reduce.cuh +349 -0
  103. cuda/cccl/headers/include/cub/device/dispatch/dispatch_for.cuh +160 -0
  104. cuda/cccl/headers/include/cub/device/dispatch/dispatch_histogram.cuh +1849 -0
  105. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +317 -0
  106. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge_sort.cuh +429 -0
  107. cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +1066 -0
  108. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +830 -0
  109. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_by_key.cuh +629 -0
  110. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +479 -0
  111. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +256 -0
  112. cuda/cccl/headers/include/cub/device/dispatch/dispatch_rle.cuh +603 -0
  113. cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan.cuh +447 -0
  114. cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan_by_key.cuh +545 -0
  115. cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_radix_sort.cuh +638 -0
  116. cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_reduce.cuh +410 -0
  117. cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_scan.cuh +278 -0
  118. cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +899 -0
  119. cuda/cccl/headers/include/cub/device/dispatch/dispatch_select_if.cuh +831 -0
  120. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +321 -0
  121. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +454 -0
  122. cuda/cccl/headers/include/cub/device/dispatch/dispatch_three_way_partition.cuh +364 -0
  123. cuda/cccl/headers/include/cub/device/dispatch/dispatch_topk.cuh +626 -0
  124. cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +527 -0
  125. cuda/cccl/headers/include/cub/device/dispatch/dispatch_unique_by_key.cuh +472 -0
  126. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_for_each.cuh +259 -0
  127. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_histogram.cuh +669 -0
  128. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_merge_sort.cuh +332 -0
  129. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_radix_sort.cuh +553 -0
  130. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_reduce.cuh +584 -0
  131. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_scan.cuh +178 -0
  132. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_segmented_radix_sort.cuh +262 -0
  133. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_segmented_reduce.cuh +295 -0
  134. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_segmented_scan.cuh +77 -0
  135. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_segmented_sort.cuh +521 -0
  136. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_three_way_partition.cuh +200 -0
  137. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_transform.cuh +1049 -0
  138. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_unique_by_key.cuh +175 -0
  139. cuda/cccl/headers/include/cub/device/dispatch/tuning/common.cuh +97 -0
  140. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +43 -0
  141. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +94 -0
  142. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +34 -0
  143. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +268 -0
  144. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +108 -0
  145. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +100 -0
  146. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +1045 -0
  147. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce.cuh +681 -0
  148. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +918 -0
  149. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +647 -0
  150. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +571 -0
  151. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +986 -0
  152. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_scan.cuh +108 -0
  153. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +373 -0
  154. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_select_if.cuh +1563 -0
  155. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +415 -0
  156. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_topk.cuh +84 -0
  157. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +476 -0
  158. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +858 -0
  159. cuda/cccl/headers/include/cub/grid/grid_even_share.cuh +203 -0
  160. cuda/cccl/headers/include/cub/grid/grid_mapping.cuh +82 -0
  161. cuda/cccl/headers/include/cub/grid/grid_queue.cuh +175 -0
  162. cuda/cccl/headers/include/cub/iterator/arg_index_input_iterator.cuh +230 -0
  163. cuda/cccl/headers/include/cub/iterator/cache_modified_input_iterator.cuh +235 -0
  164. cuda/cccl/headers/include/cub/iterator/cache_modified_output_iterator.cuh +226 -0
  165. cuda/cccl/headers/include/cub/iterator/tex_obj_input_iterator.cuh +293 -0
  166. cuda/cccl/headers/include/cub/thread/thread_load.cuh +353 -0
  167. cuda/cccl/headers/include/cub/thread/thread_operators.cuh +664 -0
  168. cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +525 -0
  169. cuda/cccl/headers/include/cub/thread/thread_scan.cuh +472 -0
  170. cuda/cccl/headers/include/cub/thread/thread_search.cuh +214 -0
  171. cuda/cccl/headers/include/cub/thread/thread_simd.cuh +456 -0
  172. cuda/cccl/headers/include/cub/thread/thread_sort.cuh +78 -0
  173. cuda/cccl/headers/include/cub/thread/thread_store.cuh +341 -0
  174. cuda/cccl/headers/include/cub/util_allocator.cuh +897 -0
  175. cuda/cccl/headers/include/cub/util_arch.cuh +176 -0
  176. cuda/cccl/headers/include/cub/util_cpp_dialect.cuh +71 -0
  177. cuda/cccl/headers/include/cub/util_debug.cuh +183 -0
  178. cuda/cccl/headers/include/cub/util_device.cuh +838 -0
  179. cuda/cccl/headers/include/cub/util_macro.cuh +73 -0
  180. cuda/cccl/headers/include/cub/util_math.cuh +92 -0
  181. cuda/cccl/headers/include/cub/util_namespace.cuh +152 -0
  182. cuda/cccl/headers/include/cub/util_policy_wrapper_t.cuh +31 -0
  183. cuda/cccl/headers/include/cub/util_ptx.cuh +483 -0
  184. cuda/cccl/headers/include/cub/util_temporary_storage.cuh +93 -0
  185. cuda/cccl/headers/include/cub/util_type.cuh +1084 -0
  186. cuda/cccl/headers/include/cub/util_vsmem.cuh +227 -0
  187. cuda/cccl/headers/include/cub/version.cuh +65 -0
  188. cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_shfl.cuh +304 -0
  189. cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_smem.cuh +152 -0
  190. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +567 -0
  191. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +378 -0
  192. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +922 -0
  193. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_smem.cuh +691 -0
  194. cuda/cccl/headers/include/cub/warp/warp_exchange.cuh +381 -0
  195. cuda/cccl/headers/include/cub/warp/warp_load.cuh +591 -0
  196. cuda/cccl/headers/include/cub/warp/warp_merge_sort.cuh +145 -0
  197. cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +810 -0
  198. cuda/cccl/headers/include/cub/warp/warp_scan.cuh +1863 -0
  199. cuda/cccl/headers/include/cub/warp/warp_store.cuh +498 -0
  200. cuda/cccl/headers/include/cub/warp/warp_utils.cuh +59 -0
  201. cuda/cccl/headers/include/cuda/__algorithm/common.h +68 -0
  202. cuda/cccl/headers/include/cuda/__algorithm/copy.h +199 -0
  203. cuda/cccl/headers/include/cuda/__algorithm/fill.h +110 -0
  204. cuda/cccl/headers/include/cuda/__annotated_ptr/access_property.h +165 -0
  205. cuda/cccl/headers/include/cuda/__annotated_ptr/access_property_encoding.h +171 -0
  206. cuda/cccl/headers/include/cuda/__annotated_ptr/annotated_ptr.h +216 -0
  207. cuda/cccl/headers/include/cuda/__annotated_ptr/annotated_ptr_base.h +100 -0
  208. cuda/cccl/headers/include/cuda/__annotated_ptr/apply_access_property.h +83 -0
  209. cuda/cccl/headers/include/cuda/__annotated_ptr/associate_access_property.h +128 -0
  210. cuda/cccl/headers/include/cuda/__annotated_ptr/createpolicy.h +210 -0
  211. cuda/cccl/headers/include/cuda/__atomic/atomic.h +145 -0
  212. cuda/cccl/headers/include/cuda/__barrier/async_contract_fulfillment.h +39 -0
  213. cuda/cccl/headers/include/cuda/__barrier/barrier.h +65 -0
  214. cuda/cccl/headers/include/cuda/__barrier/barrier_arrive_tx.h +101 -0
  215. cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +528 -0
  216. cuda/cccl/headers/include/cuda/__barrier/barrier_expect_tx.h +73 -0
  217. cuda/cccl/headers/include/cuda/__barrier/barrier_thread_scope.h +60 -0
  218. cuda/cccl/headers/include/cuda/__bit/bit_reverse.h +198 -0
  219. cuda/cccl/headers/include/cuda/__bit/bitfield.h +121 -0
  220. cuda/cccl/headers/include/cuda/__bit/bitmask.h +89 -0
  221. cuda/cccl/headers/include/cuda/__cccl_config +38 -0
  222. cuda/cccl/headers/include/cuda/__cmath/ceil_div.h +123 -0
  223. cuda/cccl/headers/include/cuda/__cmath/fast_modulo_division.h +178 -0
  224. cuda/cccl/headers/include/cuda/__cmath/ilog.h +194 -0
  225. cuda/cccl/headers/include/cuda/__cmath/ipow.h +111 -0
  226. cuda/cccl/headers/include/cuda/__cmath/isqrt.h +80 -0
  227. cuda/cccl/headers/include/cuda/__cmath/mul_hi.h +145 -0
  228. cuda/cccl/headers/include/cuda/__cmath/neg.h +47 -0
  229. cuda/cccl/headers/include/cuda/__cmath/pow2.h +74 -0
  230. cuda/cccl/headers/include/cuda/__cmath/round_down.h +102 -0
  231. cuda/cccl/headers/include/cuda/__cmath/round_up.h +104 -0
  232. cuda/cccl/headers/include/cuda/__cmath/sincos.h +134 -0
  233. cuda/cccl/headers/include/cuda/__cmath/uabs.h +57 -0
  234. cuda/cccl/headers/include/cuda/__complex/complex.h +238 -0
  235. cuda/cccl/headers/include/cuda/__complex/get_real_imag.h +89 -0
  236. cuda/cccl/headers/include/cuda/__complex/traits.h +64 -0
  237. cuda/cccl/headers/include/cuda/__complex_ +28 -0
  238. cuda/cccl/headers/include/cuda/__container/buffer.h +891 -0
  239. cuda/cccl/headers/include/cuda/__container/heterogeneous_iterator.h +436 -0
  240. cuda/cccl/headers/include/cuda/__container/uninitialized_async_buffer.h +416 -0
  241. cuda/cccl/headers/include/cuda/__device/all_devices.h +140 -0
  242. cuda/cccl/headers/include/cuda/__device/arch_id.h +194 -0
  243. cuda/cccl/headers/include/cuda/__device/arch_traits.h +553 -0
  244. cuda/cccl/headers/include/cuda/__device/attributes.h +772 -0
  245. cuda/cccl/headers/include/cuda/__device/compute_capability.h +172 -0
  246. cuda/cccl/headers/include/cuda/__device/device_ref.h +168 -0
  247. cuda/cccl/headers/include/cuda/__device/physical_device.h +178 -0
  248. cuda/cccl/headers/include/cuda/__driver/driver_api.h +1041 -0
  249. cuda/cccl/headers/include/cuda/__event/event.h +171 -0
  250. cuda/cccl/headers/include/cuda/__event/event_ref.h +157 -0
  251. cuda/cccl/headers/include/cuda/__event/timed_event.h +120 -0
  252. cuda/cccl/headers/include/cuda/__execution/determinism.h +89 -0
  253. cuda/cccl/headers/include/cuda/__execution/output_ordering.h +87 -0
  254. cuda/cccl/headers/include/cuda/__execution/policy.h +53 -0
  255. cuda/cccl/headers/include/cuda/__execution/require.h +75 -0
  256. cuda/cccl/headers/include/cuda/__execution/tune.h +70 -0
  257. cuda/cccl/headers/include/cuda/__functional/address_stability.h +131 -0
  258. cuda/cccl/headers/include/cuda/__functional/for_each_canceled.h +321 -0
  259. cuda/cccl/headers/include/cuda/__functional/maximum.h +77 -0
  260. cuda/cccl/headers/include/cuda/__functional/minimum.h +77 -0
  261. cuda/cccl/headers/include/cuda/__functional/minimum_maximum_common.h +52 -0
  262. cuda/cccl/headers/include/cuda/__functional/proclaim_return_type.h +106 -0
  263. cuda/cccl/headers/include/cuda/__fwd/barrier.h +38 -0
  264. cuda/cccl/headers/include/cuda/__fwd/complex.h +48 -0
  265. cuda/cccl/headers/include/cuda/__fwd/devices.h +44 -0
  266. cuda/cccl/headers/include/cuda/__fwd/execution_policy.h +47 -0
  267. cuda/cccl/headers/include/cuda/__fwd/get_stream.h +38 -0
  268. cuda/cccl/headers/include/cuda/__fwd/pipeline.h +37 -0
  269. cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +58 -0
  270. cuda/cccl/headers/include/cuda/__hierarchy/dimensions.h +162 -0
  271. cuda/cccl/headers/include/cuda/__hierarchy/hierarchy_dimensions.h +986 -0
  272. cuda/cccl/headers/include/cuda/__hierarchy/hierarchy_levels.h +494 -0
  273. cuda/cccl/headers/include/cuda/__hierarchy/level_dimensions.h +225 -0
  274. cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +315 -0
  275. cuda/cccl/headers/include/cuda/__iterator/counting_iterator.h +490 -0
  276. cuda/cccl/headers/include/cuda/__iterator/discard_iterator.h +324 -0
  277. cuda/cccl/headers/include/cuda/__iterator/permutation_iterator.h +456 -0
  278. cuda/cccl/headers/include/cuda/__iterator/shuffle_iterator.h +334 -0
  279. cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +418 -0
  280. cuda/cccl/headers/include/cuda/__iterator/tabulate_output_iterator.h +367 -0
  281. cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +528 -0
  282. cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +527 -0
  283. cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +486 -0
  284. cuda/cccl/headers/include/cuda/__iterator/zip_common.h +147 -0
  285. cuda/cccl/headers/include/cuda/__iterator/zip_function.h +114 -0
  286. cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +555 -0
  287. cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +589 -0
  288. cuda/cccl/headers/include/cuda/__latch/latch.h +44 -0
  289. cuda/cccl/headers/include/cuda/__launch/configuration.h +754 -0
  290. cuda/cccl/headers/include/cuda/__launch/host_launch.h +115 -0
  291. cuda/cccl/headers/include/cuda/__launch/launch.h +334 -0
  292. cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +531 -0
  293. cuda/cccl/headers/include/cuda/__mdspan/host_device_mdspan.h +239 -0
  294. cuda/cccl/headers/include/cuda/__mdspan/restrict_accessor.h +152 -0
  295. cuda/cccl/headers/include/cuda/__mdspan/restrict_mdspan.h +118 -0
  296. cuda/cccl/headers/include/cuda/__mdspan/shared_memory_accessor.h +208 -0
  297. cuda/cccl/headers/include/cuda/__mdspan/shared_memory_mdspan.h +129 -0
  298. cuda/cccl/headers/include/cuda/__memcpy_async/check_preconditions.h +79 -0
  299. cuda/cccl/headers/include/cuda/__memcpy_async/completion_mechanism.h +47 -0
  300. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_bulk_shared_global.h +77 -0
  301. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_fallback.h +72 -0
  302. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_shared_global.h +148 -0
  303. cuda/cccl/headers/include/cuda/__memcpy_async/dispatch_memcpy_async.h +165 -0
  304. cuda/cccl/headers/include/cuda/__memcpy_async/elect_one.h +52 -0
  305. cuda/cccl/headers/include/cuda/__memcpy_async/is_local_smem_barrier.h +53 -0
  306. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async.h +179 -0
  307. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_barrier.h +99 -0
  308. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_tx.h +102 -0
  309. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_completion.h +164 -0
  310. cuda/cccl/headers/include/cuda/__memcpy_async/try_get_barrier_handle.h +57 -0
  311. cuda/cccl/headers/include/cuda/__memory/address_space.h +256 -0
  312. cuda/cccl/headers/include/cuda/__memory/align_down.h +77 -0
  313. cuda/cccl/headers/include/cuda/__memory/align_up.h +77 -0
  314. cuda/cccl/headers/include/cuda/__memory/aligned_size.h +61 -0
  315. cuda/cccl/headers/include/cuda/__memory/check_address.h +111 -0
  316. cuda/cccl/headers/include/cuda/__memory/discard_memory.h +64 -0
  317. cuda/cccl/headers/include/cuda/__memory/get_device_address.h +58 -0
  318. cuda/cccl/headers/include/cuda/__memory/is_aligned.h +60 -0
  319. cuda/cccl/headers/include/cuda/__memory/is_pointer_accessible.h +278 -0
  320. cuda/cccl/headers/include/cuda/__memory/ptr_in_range.h +92 -0
  321. cuda/cccl/headers/include/cuda/__memory/ptr_rebind.h +75 -0
  322. cuda/cccl/headers/include/cuda/__memory/ranges_overlap.h +125 -0
  323. cuda/cccl/headers/include/cuda/__memory_pool/device_memory_pool.h +166 -0
  324. cuda/cccl/headers/include/cuda/__memory_pool/managed_memory_pool.h +161 -0
  325. cuda/cccl/headers/include/cuda/__memory_pool/memory_pool_base.h +644 -0
  326. cuda/cccl/headers/include/cuda/__memory_pool/pinned_memory_pool.h +218 -0
  327. cuda/cccl/headers/include/cuda/__memory_resource/any_resource.h +882 -0
  328. cuda/cccl/headers/include/cuda/__memory_resource/get_memory_resource.h +82 -0
  329. cuda/cccl/headers/include/cuda/__memory_resource/get_property.h +153 -0
  330. cuda/cccl/headers/include/cuda/__memory_resource/legacy_managed_memory_resource.h +148 -0
  331. cuda/cccl/headers/include/cuda/__memory_resource/legacy_pinned_memory_resource.h +141 -0
  332. cuda/cccl/headers/include/cuda/__memory_resource/properties.h +130 -0
  333. cuda/cccl/headers/include/cuda/__memory_resource/resource.h +135 -0
  334. cuda/cccl/headers/include/cuda/__memory_resource/shared_resource.h +261 -0
  335. cuda/cccl/headers/include/cuda/__memory_resource/synchronous_resource_adapter.h +136 -0
  336. cuda/cccl/headers/include/cuda/__numeric/add_overflow.h +331 -0
  337. cuda/cccl/headers/include/cuda/__numeric/div_overflow.h +150 -0
  338. cuda/cccl/headers/include/cuda/__numeric/narrow.h +108 -0
  339. cuda/cccl/headers/include/cuda/__numeric/overflow_cast.h +59 -0
  340. cuda/cccl/headers/include/cuda/__numeric/overflow_result.h +43 -0
  341. cuda/cccl/headers/include/cuda/__numeric/sub_overflow.h +359 -0
  342. cuda/cccl/headers/include/cuda/__nvtx/nvtx.h +120 -0
  343. cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +2977 -0
  344. cuda/cccl/headers/include/cuda/__ptx/instructions/barrier_cluster.h +43 -0
  345. cuda/cccl/headers/include/cuda/__ptx/instructions/bfind.h +41 -0
  346. cuda/cccl/headers/include/cuda/__ptx/instructions/bmsk.h +41 -0
  347. cuda/cccl/headers/include/cuda/__ptx/instructions/clusterlaunchcontrol.h +41 -0
  348. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk.h +44 -0
  349. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_commit_group.h +43 -0
  350. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h +45 -0
  351. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_wait_group.h +43 -0
  352. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_mbarrier_arrive.h +42 -0
  353. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_reduce_async_bulk.h +60 -0
  354. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_reduce_async_bulk_tensor.h +43 -0
  355. cuda/cccl/headers/include/cuda/__ptx/instructions/elect_sync.h +41 -0
  356. cuda/cccl/headers/include/cuda/__ptx/instructions/exit.h +41 -0
  357. cuda/cccl/headers/include/cuda/__ptx/instructions/fence.h +49 -0
  358. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/barrier_cluster.h +115 -0
  359. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/bfind.h +190 -0
  360. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/bmsk.h +54 -0
  361. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/clusterlaunchcontrol.h +240 -0
  362. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk.h +245 -0
  363. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.h +25 -0
  364. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.h +52 -0
  365. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.h +977 -0
  366. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_gather_scatter.h +302 -0
  367. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.h +631 -0
  368. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.h +46 -0
  369. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive.h +26 -0
  370. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive_noinc.h +26 -0
  371. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.h +1470 -0
  372. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.h +132 -0
  373. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.h +132 -0
  374. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.h +601 -0
  375. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/elect_sync.h +36 -0
  376. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/exit.h +25 -0
  377. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence.h +208 -0
  378. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.h +31 -0
  379. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_alias.h +25 -0
  380. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_async.h +58 -0
  381. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_async_generic_sync_restrict.h +64 -0
  382. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.h +102 -0
  383. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_sync_restrict.h +64 -0
  384. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/get_sreg.h +949 -0
  385. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/getctarank.h +32 -0
  386. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/ld.h +5542 -0
  387. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive.h +399 -0
  388. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.h +184 -0
  389. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.h +34 -0
  390. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_expect_tx.h +102 -0
  391. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_init.h +27 -0
  392. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_inval.h +26 -0
  393. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.h +143 -0
  394. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.h +144 -0
  395. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.h +286 -0
  396. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.h +290 -0
  397. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_ld_reduce.h +2202 -0
  398. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_red.h +1362 -0
  399. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_st.h +236 -0
  400. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/prmt.h +230 -0
  401. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/red_async.h +460 -0
  402. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/setmaxnreg.h +58 -0
  403. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/shl.h +96 -0
  404. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/shr.h +168 -0
  405. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st.h +1490 -0
  406. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st_async.h +123 -0
  407. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st_bulk.h +31 -0
  408. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_alloc.h +120 -0
  409. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_commit.h +91 -0
  410. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_cp.h +693 -0
  411. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_fence.h +50 -0
  412. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_ld.h +11437 -0
  413. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_mma.h +6513 -0
  414. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_mma_ws.h +6726 -0
  415. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_shift.h +40 -0
  416. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_st.h +4767 -0
  417. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_wait.h +48 -0
  418. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.h +71 -0
  419. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tensormap_replace.h +886 -0
  420. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/trap.h +25 -0
  421. cuda/cccl/headers/include/cuda/__ptx/instructions/get_sreg.h +43 -0
  422. cuda/cccl/headers/include/cuda/__ptx/instructions/getctarank.h +43 -0
  423. cuda/cccl/headers/include/cuda/__ptx/instructions/ld.h +41 -0
  424. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_arrive.h +45 -0
  425. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_expect_tx.h +41 -0
  426. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_init.h +43 -0
  427. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_inval.h +41 -0
  428. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_wait.h +46 -0
  429. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_ld_reduce.h +41 -0
  430. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_red.h +41 -0
  431. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_st.h +41 -0
  432. cuda/cccl/headers/include/cuda/__ptx/instructions/prmt.h +41 -0
  433. cuda/cccl/headers/include/cuda/__ptx/instructions/red_async.h +43 -0
  434. cuda/cccl/headers/include/cuda/__ptx/instructions/setmaxnreg.h +41 -0
  435. cuda/cccl/headers/include/cuda/__ptx/instructions/shfl_sync.h +244 -0
  436. cuda/cccl/headers/include/cuda/__ptx/instructions/shl.h +41 -0
  437. cuda/cccl/headers/include/cuda/__ptx/instructions/shr.h +41 -0
  438. cuda/cccl/headers/include/cuda/__ptx/instructions/st.h +41 -0
  439. cuda/cccl/headers/include/cuda/__ptx/instructions/st_async.h +43 -0
  440. cuda/cccl/headers/include/cuda/__ptx/instructions/st_bulk.h +41 -0
  441. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_alloc.h +41 -0
  442. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_commit.h +41 -0
  443. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_cp.h +41 -0
  444. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_fence.h +41 -0
  445. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_ld.h +41 -0
  446. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_mma.h +41 -0
  447. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_mma_ws.h +41 -0
  448. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_shift.h +41 -0
  449. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_st.h +41 -0
  450. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_wait.h +41 -0
  451. cuda/cccl/headers/include/cuda/__ptx/instructions/tensormap_cp_fenceproxy.h +43 -0
  452. cuda/cccl/headers/include/cuda/__ptx/instructions/tensormap_replace.h +43 -0
  453. cuda/cccl/headers/include/cuda/__ptx/instructions/trap.h +41 -0
  454. cuda/cccl/headers/include/cuda/__ptx/pragmas/enable_smem_spilling.h +47 -0
  455. cuda/cccl/headers/include/cuda/__ptx/ptx_dot_variants.h +230 -0
  456. cuda/cccl/headers/include/cuda/__ptx/ptx_helper_functions.h +178 -0
  457. cuda/cccl/headers/include/cuda/__random/feistel_bijection.h +105 -0
  458. cuda/cccl/headers/include/cuda/__random/pcg_engine.h +398 -0
  459. cuda/cccl/headers/include/cuda/__random/random_bijection.h +88 -0
  460. cuda/cccl/headers/include/cuda/__runtime/api_wrapper.h +62 -0
  461. cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +99 -0
  462. cuda/cccl/headers/include/cuda/__runtime/types.h +41 -0
  463. cuda/cccl/headers/include/cuda/__semaphore/counting_semaphore.h +53 -0
  464. cuda/cccl/headers/include/cuda/__stream/get_stream.h +109 -0
  465. cuda/cccl/headers/include/cuda/__stream/internal_streams.h +49 -0
  466. cuda/cccl/headers/include/cuda/__stream/invalid_stream.h +47 -0
  467. cuda/cccl/headers/include/cuda/__stream/launch_transform.h +193 -0
  468. cuda/cccl/headers/include/cuda/__stream/stream.h +145 -0
  469. cuda/cccl/headers/include/cuda/__stream/stream_ref.h +356 -0
  470. cuda/cccl/headers/include/cuda/__tma/make_tma_descriptor.h +657 -0
  471. cuda/cccl/headers/include/cuda/__type_traits/is_floating_point.h +47 -0
  472. cuda/cccl/headers/include/cuda/__type_traits/is_instantiable_with.h +47 -0
  473. cuda/cccl/headers/include/cuda/__type_traits/is_specialization_of.h +37 -0
  474. cuda/cccl/headers/include/cuda/__type_traits/vector_type.h +355 -0
  475. cuda/cccl/headers/include/cuda/__utility/__basic_any/access.h +88 -0
  476. cuda/cccl/headers/include/cuda/__utility/__basic_any/any_cast.h +83 -0
  477. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_base.h +148 -0
  478. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_from.h +96 -0
  479. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_fwd.h +128 -0
  480. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_ptr.h +304 -0
  481. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_ref.h +337 -0
  482. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_value.h +611 -0
  483. cuda/cccl/headers/include/cuda/__utility/__basic_any/conversions.h +170 -0
  484. cuda/cccl/headers/include/cuda/__utility/__basic_any/dynamic_any_cast.h +147 -0
  485. cuda/cccl/headers/include/cuda/__utility/__basic_any/interfaces.h +359 -0
  486. cuda/cccl/headers/include/cuda/__utility/__basic_any/iset.h +142 -0
  487. cuda/cccl/headers/include/cuda/__utility/__basic_any/overrides.h +64 -0
  488. cuda/cccl/headers/include/cuda/__utility/__basic_any/rtti.h +256 -0
  489. cuda/cccl/headers/include/cuda/__utility/__basic_any/semiregular.h +323 -0
  490. cuda/cccl/headers/include/cuda/__utility/__basic_any/storage.h +79 -0
  491. cuda/cccl/headers/include/cuda/__utility/__basic_any/tagged_ptr.h +58 -0
  492. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtcall.h +162 -0
  493. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_functions.h +183 -0
  494. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_ptrs.h +80 -0
  495. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_tables.h +155 -0
  496. cuda/cccl/headers/include/cuda/__utility/basic_any.h +507 -0
  497. cuda/cccl/headers/include/cuda/__utility/immovable.h +50 -0
  498. cuda/cccl/headers/include/cuda/__utility/in_range.h +65 -0
  499. cuda/cccl/headers/include/cuda/__utility/inherit.h +36 -0
  500. cuda/cccl/headers/include/cuda/__utility/no_init.h +29 -0
  501. cuda/cccl/headers/include/cuda/__utility/static_for.h +79 -0
  502. cuda/cccl/headers/include/cuda/__warp/lane_mask.h +326 -0
  503. cuda/cccl/headers/include/cuda/__warp/warp_match_all.h +65 -0
  504. cuda/cccl/headers/include/cuda/__warp/warp_shuffle.h +251 -0
  505. cuda/cccl/headers/include/cuda/access_property +26 -0
  506. cuda/cccl/headers/include/cuda/algorithm +28 -0
  507. cuda/cccl/headers/include/cuda/annotated_ptr +29 -0
  508. cuda/cccl/headers/include/cuda/atomic +27 -0
  509. cuda/cccl/headers/include/cuda/barrier +293 -0
  510. cuda/cccl/headers/include/cuda/bit +29 -0
  511. cuda/cccl/headers/include/cuda/buffer +27 -0
  512. cuda/cccl/headers/include/cuda/cmath +38 -0
  513. cuda/cccl/headers/include/cuda/devices +33 -0
  514. cuda/cccl/headers/include/cuda/discard_memory +32 -0
  515. cuda/cccl/headers/include/cuda/functional +32 -0
  516. cuda/cccl/headers/include/cuda/hierarchy +28 -0
  517. cuda/cccl/headers/include/cuda/iterator +39 -0
  518. cuda/cccl/headers/include/cuda/latch +27 -0
  519. cuda/cccl/headers/include/cuda/launch +28 -0
  520. cuda/cccl/headers/include/cuda/mdspan +29 -0
  521. cuda/cccl/headers/include/cuda/memory +37 -0
  522. cuda/cccl/headers/include/cuda/memory_pool +27 -0
  523. cuda/cccl/headers/include/cuda/memory_resource +41 -0
  524. cuda/cccl/headers/include/cuda/numeric +31 -0
  525. cuda/cccl/headers/include/cuda/pipeline +580 -0
  526. cuda/cccl/headers/include/cuda/ptx +131 -0
  527. cuda/cccl/headers/include/cuda/semaphore +31 -0
  528. cuda/cccl/headers/include/cuda/std/__algorithm/adjacent_find.h +59 -0
  529. cuda/cccl/headers/include/cuda/std/__algorithm/all_of.h +45 -0
  530. cuda/cccl/headers/include/cuda/std/__algorithm/any_of.h +45 -0
  531. cuda/cccl/headers/include/cuda/std/__algorithm/binary_search.h +53 -0
  532. cuda/cccl/headers/include/cuda/std/__algorithm/clamp.h +48 -0
  533. cuda/cccl/headers/include/cuda/std/__algorithm/comp.h +58 -0
  534. cuda/cccl/headers/include/cuda/std/__algorithm/comp_ref_type.h +85 -0
  535. cuda/cccl/headers/include/cuda/std/__algorithm/copy.h +143 -0
  536. cuda/cccl/headers/include/cuda/std/__algorithm/copy_backward.h +80 -0
  537. cuda/cccl/headers/include/cuda/std/__algorithm/copy_if.h +47 -0
  538. cuda/cccl/headers/include/cuda/std/__algorithm/copy_n.h +73 -0
  539. cuda/cccl/headers/include/cuda/std/__algorithm/count.h +49 -0
  540. cuda/cccl/headers/include/cuda/std/__algorithm/count_if.h +49 -0
  541. cuda/cccl/headers/include/cuda/std/__algorithm/equal.h +128 -0
  542. cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +101 -0
  543. cuda/cccl/headers/include/cuda/std/__algorithm/fill.h +58 -0
  544. cuda/cccl/headers/include/cuda/std/__algorithm/fill_n.h +51 -0
  545. cuda/cccl/headers/include/cuda/std/__algorithm/find.h +62 -0
  546. cuda/cccl/headers/include/cuda/std/__algorithm/find_end.h +225 -0
  547. cuda/cccl/headers/include/cuda/std/__algorithm/find_first_of.h +73 -0
  548. cuda/cccl/headers/include/cuda/std/__algorithm/find_if.h +46 -0
  549. cuda/cccl/headers/include/cuda/std/__algorithm/find_if_not.h +46 -0
  550. cuda/cccl/headers/include/cuda/std/__algorithm/for_each.h +42 -0
  551. cuda/cccl/headers/include/cuda/std/__algorithm/for_each_n.h +48 -0
  552. cuda/cccl/headers/include/cuda/std/__algorithm/generate.h +41 -0
  553. cuda/cccl/headers/include/cuda/std/__algorithm/generate_n.h +46 -0
  554. cuda/cccl/headers/include/cuda/std/__algorithm/half_positive.h +49 -0
  555. cuda/cccl/headers/include/cuda/std/__algorithm/in_fun_result.h +55 -0
  556. cuda/cccl/headers/include/cuda/std/__algorithm/includes.h +90 -0
  557. cuda/cccl/headers/include/cuda/std/__algorithm/inplace_merge.h +293 -0
  558. cuda/cccl/headers/include/cuda/std/__algorithm/is_heap.h +50 -0
  559. cuda/cccl/headers/include/cuda/std/__algorithm/is_heap_until.h +83 -0
  560. cuda/cccl/headers/include/cuda/std/__algorithm/is_partitioned.h +57 -0
  561. cuda/cccl/headers/include/cuda/std/__algorithm/is_permutation.h +252 -0
  562. cuda/cccl/headers/include/cuda/std/__algorithm/is_sorted.h +49 -0
  563. cuda/cccl/headers/include/cuda/std/__algorithm/is_sorted_until.h +68 -0
  564. cuda/cccl/headers/include/cuda/std/__algorithm/iter_swap.h +82 -0
  565. cuda/cccl/headers/include/cuda/std/__algorithm/iterator_operations.h +185 -0
  566. cuda/cccl/headers/include/cuda/std/__algorithm/lexicographical_compare.h +68 -0
  567. cuda/cccl/headers/include/cuda/std/__algorithm/lower_bound.h +82 -0
  568. cuda/cccl/headers/include/cuda/std/__algorithm/make_heap.h +70 -0
  569. cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +88 -0
  570. cuda/cccl/headers/include/cuda/std/__algorithm/max.h +62 -0
  571. cuda/cccl/headers/include/cuda/std/__algorithm/max_element.h +67 -0
  572. cuda/cccl/headers/include/cuda/std/__algorithm/merge.h +89 -0
  573. cuda/cccl/headers/include/cuda/std/__algorithm/min.h +62 -0
  574. cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +87 -0
  575. cuda/cccl/headers/include/cuda/std/__algorithm/minmax.h +66 -0
  576. cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +139 -0
  577. cuda/cccl/headers/include/cuda/std/__algorithm/mismatch.h +83 -0
  578. cuda/cccl/headers/include/cuda/std/__algorithm/move.h +91 -0
  579. cuda/cccl/headers/include/cuda/std/__algorithm/move_backward.h +84 -0
  580. cuda/cccl/headers/include/cuda/std/__algorithm/next_permutation.h +88 -0
  581. cuda/cccl/headers/include/cuda/std/__algorithm/none_of.h +45 -0
  582. cuda/cccl/headers/include/cuda/std/__algorithm/nth_element.h +309 -0
  583. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort.h +102 -0
  584. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort_copy.h +122 -0
  585. cuda/cccl/headers/include/cuda/std/__algorithm/partition.h +120 -0
  586. cuda/cccl/headers/include/cuda/std/__algorithm/partition_copy.h +59 -0
  587. cuda/cccl/headers/include/cuda/std/__algorithm/partition_point.h +61 -0
  588. cuda/cccl/headers/include/cuda/std/__algorithm/pop_heap.h +93 -0
  589. cuda/cccl/headers/include/cuda/std/__algorithm/prev_permutation.h +88 -0
  590. cuda/cccl/headers/include/cuda/std/__algorithm/push_heap.h +100 -0
  591. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_find_if.h +78 -0
  592. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_find_if_not.h +85 -0
  593. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_for_each.h +84 -0
  594. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_for_each_n.h +68 -0
  595. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_iterator_concept.h +65 -0
  596. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_min.h +97 -0
  597. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_min_element.h +68 -0
  598. cuda/cccl/headers/include/cuda/std/__algorithm/remove.h +55 -0
  599. cuda/cccl/headers/include/cuda/std/__algorithm/remove_copy.h +47 -0
  600. cuda/cccl/headers/include/cuda/std/__algorithm/remove_copy_if.h +47 -0
  601. cuda/cccl/headers/include/cuda/std/__algorithm/remove_if.h +56 -0
  602. cuda/cccl/headers/include/cuda/std/__algorithm/replace.h +45 -0
  603. cuda/cccl/headers/include/cuda/std/__algorithm/replace_copy.h +54 -0
  604. cuda/cccl/headers/include/cuda/std/__algorithm/replace_copy_if.h +50 -0
  605. cuda/cccl/headers/include/cuda/std/__algorithm/replace_if.h +45 -0
  606. cuda/cccl/headers/include/cuda/std/__algorithm/reverse.h +81 -0
  607. cuda/cccl/headers/include/cuda/std/__algorithm/reverse_copy.h +43 -0
  608. cuda/cccl/headers/include/cuda/std/__algorithm/rotate.h +261 -0
  609. cuda/cccl/headers/include/cuda/std/__algorithm/rotate_copy.h +40 -0
  610. cuda/cccl/headers/include/cuda/std/__algorithm/sample.h +116 -0
  611. cuda/cccl/headers/include/cuda/std/__algorithm/search.h +185 -0
  612. cuda/cccl/headers/include/cuda/std/__algorithm/search_n.h +163 -0
  613. cuda/cccl/headers/include/cuda/std/__algorithm/set_difference.h +95 -0
  614. cuda/cccl/headers/include/cuda/std/__algorithm/set_intersection.h +122 -0
  615. cuda/cccl/headers/include/cuda/std/__algorithm/set_symmetric_difference.h +134 -0
  616. cuda/cccl/headers/include/cuda/std/__algorithm/set_union.h +128 -0
  617. cuda/cccl/headers/include/cuda/std/__algorithm/shift_left.h +84 -0
  618. cuda/cccl/headers/include/cuda/std/__algorithm/shift_right.h +144 -0
  619. cuda/cccl/headers/include/cuda/std/__algorithm/shuffle.h +71 -0
  620. cuda/cccl/headers/include/cuda/std/__algorithm/sift_down.h +139 -0
  621. cuda/cccl/headers/include/cuda/std/__algorithm/sort.h +1097 -0
  622. cuda/cccl/headers/include/cuda/std/__algorithm/sort_heap.h +70 -0
  623. cuda/cccl/headers/include/cuda/std/__algorithm/stable_partition.h +359 -0
  624. cuda/cccl/headers/include/cuda/std/__algorithm/stable_sort.h +321 -0
  625. cuda/cccl/headers/include/cuda/std/__algorithm/swap_ranges.h +78 -0
  626. cuda/cccl/headers/include/cuda/std/__algorithm/transform.h +59 -0
  627. cuda/cccl/headers/include/cuda/std/__algorithm/unique.h +76 -0
  628. cuda/cccl/headers/include/cuda/std/__algorithm/unique_copy.h +153 -0
  629. cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_iter.h +95 -0
  630. cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_range.h +126 -0
  631. cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +83 -0
  632. cuda/cccl/headers/include/cuda/std/__atomic/api/common.h +192 -0
  633. cuda/cccl/headers/include/cuda/std/__atomic/api/owned.h +136 -0
  634. cuda/cccl/headers/include/cuda/std/__atomic/api/reference.h +118 -0
  635. cuda/cccl/headers/include/cuda/std/__atomic/functions/common.h +58 -0
  636. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_local.h +208 -0
  637. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_derived.h +458 -0
  638. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated.h +4436 -0
  639. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h +184 -0
  640. cuda/cccl/headers/include/cuda/std/__atomic/functions/host.h +242 -0
  641. cuda/cccl/headers/include/cuda/std/__atomic/functions.h +33 -0
  642. cuda/cccl/headers/include/cuda/std/__atomic/order.h +158 -0
  643. cuda/cccl/headers/include/cuda/std/__atomic/platform/msvc_to_builtins.h +654 -0
  644. cuda/cccl/headers/include/cuda/std/__atomic/platform.h +93 -0
  645. cuda/cccl/headers/include/cuda/std/__atomic/scopes.h +105 -0
  646. cuda/cccl/headers/include/cuda/std/__atomic/types/base.h +242 -0
  647. cuda/cccl/headers/include/cuda/std/__atomic/types/common.h +103 -0
  648. cuda/cccl/headers/include/cuda/std/__atomic/types/locked.h +225 -0
  649. cuda/cccl/headers/include/cuda/std/__atomic/types/reference.h +72 -0
  650. cuda/cccl/headers/include/cuda/std/__atomic/types/small.h +238 -0
  651. cuda/cccl/headers/include/cuda/std/__atomic/types.h +51 -0
  652. cuda/cccl/headers/include/cuda/std/__atomic/wait/notify_wait.h +95 -0
  653. cuda/cccl/headers/include/cuda/std/__atomic/wait/polling.h +65 -0
  654. cuda/cccl/headers/include/cuda/std/__barrier/barrier.h +227 -0
  655. cuda/cccl/headers/include/cuda/std/__barrier/empty_completion.h +37 -0
  656. cuda/cccl/headers/include/cuda/std/__barrier/poll_tester.h +82 -0
  657. cuda/cccl/headers/include/cuda/std/__bit/bit_cast.h +81 -0
  658. cuda/cccl/headers/include/cuda/std/__bit/blsr.h +51 -0
  659. cuda/cccl/headers/include/cuda/std/__bit/byteswap.h +209 -0
  660. cuda/cccl/headers/include/cuda/std/__bit/countl.h +191 -0
  661. cuda/cccl/headers/include/cuda/std/__bit/countr.h +202 -0
  662. cuda/cccl/headers/include/cuda/std/__bit/endian.h +39 -0
  663. cuda/cccl/headers/include/cuda/std/__bit/has_single_bit.h +43 -0
  664. cuda/cccl/headers/include/cuda/std/__bit/integral.h +125 -0
  665. cuda/cccl/headers/include/cuda/std/__bit/popcount.h +172 -0
  666. cuda/cccl/headers/include/cuda/std/__bit/reference.h +1272 -0
  667. cuda/cccl/headers/include/cuda/std/__bit/rotate.h +185 -0
  668. cuda/cccl/headers/include/cuda/std/__cccl/algorithm_wrapper.h +36 -0
  669. cuda/cccl/headers/include/cuda/std/__cccl/architecture.h +78 -0
  670. cuda/cccl/headers/include/cuda/std/__cccl/assert.h +161 -0
  671. cuda/cccl/headers/include/cuda/std/__cccl/attributes.h +206 -0
  672. cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +494 -0
  673. cuda/cccl/headers/include/cuda/std/__cccl/compiler.h +213 -0
  674. cuda/cccl/headers/include/cuda/std/__cccl/cuda_capabilities.h +51 -0
  675. cuda/cccl/headers/include/cuda/std/__cccl/cuda_toolkit.h +56 -0
  676. cuda/cccl/headers/include/cuda/std/__cccl/deprecated.h +87 -0
  677. cuda/cccl/headers/include/cuda/std/__cccl/diagnostic.h +131 -0
  678. cuda/cccl/headers/include/cuda/std/__cccl/dialect.h +197 -0
  679. cuda/cccl/headers/include/cuda/std/__cccl/epilogue.h +355 -0
  680. cuda/cccl/headers/include/cuda/std/__cccl/exceptions.h +36 -0
  681. cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +74 -0
  682. cuda/cccl/headers/include/cuda/std/__cccl/extended_data_types.h +139 -0
  683. cuda/cccl/headers/include/cuda/std/__cccl/host_std_lib.h +59 -0
  684. cuda/cccl/headers/include/cuda/std/__cccl/is_non_narrowing_convertible.h +71 -0
  685. cuda/cccl/headers/include/cuda/std/__cccl/memory_wrapper.h +36 -0
  686. cuda/cccl/headers/include/cuda/std/__cccl/numeric_wrapper.h +36 -0
  687. cuda/cccl/headers/include/cuda/std/__cccl/os.h +54 -0
  688. cuda/cccl/headers/include/cuda/std/__cccl/preprocessor.h +1288 -0
  689. cuda/cccl/headers/include/cuda/std/__cccl/prologue.h +312 -0
  690. cuda/cccl/headers/include/cuda/std/__cccl/ptx_isa.h +363 -0
  691. cuda/cccl/headers/include/cuda/std/__cccl/rtti.h +72 -0
  692. cuda/cccl/headers/include/cuda/std/__cccl/sequence_access.h +87 -0
  693. cuda/cccl/headers/include/cuda/std/__cccl/system_header.h +38 -0
  694. cuda/cccl/headers/include/cuda/std/__cccl/unreachable.h +31 -0
  695. cuda/cccl/headers/include/cuda/std/__cccl/version.h +26 -0
  696. cuda/cccl/headers/include/cuda/std/__cccl/visibility.h +171 -0
  697. cuda/cccl/headers/include/cuda/std/__charconv/chars_format.h +81 -0
  698. cuda/cccl/headers/include/cuda/std/__charconv/from_chars.h +171 -0
  699. cuda/cccl/headers/include/cuda/std/__charconv/from_chars_result.h +56 -0
  700. cuda/cccl/headers/include/cuda/std/__charconv/to_chars.h +192 -0
  701. cuda/cccl/headers/include/cuda/std/__charconv/to_chars_result.h +56 -0
  702. cuda/cccl/headers/include/cuda/std/__chrono/calendar.h +52 -0
  703. cuda/cccl/headers/include/cuda/std/__chrono/day.h +160 -0
  704. cuda/cccl/headers/include/cuda/std/__chrono/duration.h +499 -0
  705. cuda/cccl/headers/include/cuda/std/__chrono/file_clock.h +53 -0
  706. cuda/cccl/headers/include/cuda/std/__chrono/high_resolution_clock.h +44 -0
  707. cuda/cccl/headers/include/cuda/std/__chrono/month.h +185 -0
  708. cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +58 -0
  709. cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +78 -0
  710. cuda/cccl/headers/include/cuda/std/__chrono/time_point.h +255 -0
  711. cuda/cccl/headers/include/cuda/std/__chrono/year.h +184 -0
  712. cuda/cccl/headers/include/cuda/std/__cmath/abs.h +127 -0
  713. cuda/cccl/headers/include/cuda/std/__cmath/copysign.h +88 -0
  714. cuda/cccl/headers/include/cuda/std/__cmath/error_functions.h +204 -0
  715. cuda/cccl/headers/include/cuda/std/__cmath/exponential_functions.h +783 -0
  716. cuda/cccl/headers/include/cuda/std/__cmath/fdim.h +122 -0
  717. cuda/cccl/headers/include/cuda/std/__cmath/fma.h +129 -0
  718. cuda/cccl/headers/include/cuda/std/__cmath/fpclassify.h +230 -0
  719. cuda/cccl/headers/include/cuda/std/__cmath/gamma.h +204 -0
  720. cuda/cccl/headers/include/cuda/std/__cmath/hyperbolic_functions.h +285 -0
  721. cuda/cccl/headers/include/cuda/std/__cmath/hypot.h +220 -0
  722. cuda/cccl/headers/include/cuda/std/__cmath/inverse_hyperbolic_functions.h +285 -0
  723. cuda/cccl/headers/include/cuda/std/__cmath/inverse_trigonometric_functions.h +370 -0
  724. cuda/cccl/headers/include/cuda/std/__cmath/isfinite.h +166 -0
  725. cuda/cccl/headers/include/cuda/std/__cmath/isinf.h +203 -0
  726. cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +184 -0
  727. cuda/cccl/headers/include/cuda/std/__cmath/isnormal.h +138 -0
  728. cuda/cccl/headers/include/cuda/std/__cmath/lerp.h +101 -0
  729. cuda/cccl/headers/include/cuda/std/__cmath/logarithms.h +533 -0
  730. cuda/cccl/headers/include/cuda/std/__cmath/min_max.h +287 -0
  731. cuda/cccl/headers/include/cuda/std/__cmath/modulo.h +208 -0
  732. cuda/cccl/headers/include/cuda/std/__cmath/nan.h +54 -0
  733. cuda/cccl/headers/include/cuda/std/__cmath/remainder.h +210 -0
  734. cuda/cccl/headers/include/cuda/std/__cmath/roots.h +198 -0
  735. cuda/cccl/headers/include/cuda/std/__cmath/rounding_functions.h +983 -0
  736. cuda/cccl/headers/include/cuda/std/__cmath/signbit.h +56 -0
  737. cuda/cccl/headers/include/cuda/std/__cmath/traits.h +242 -0
  738. cuda/cccl/headers/include/cuda/std/__cmath/trigonometric_functions.h +327 -0
  739. cuda/cccl/headers/include/cuda/std/__complex/arg.h +84 -0
  740. cuda/cccl/headers/include/cuda/std/__complex/complex.h +669 -0
  741. cuda/cccl/headers/include/cuda/std/__complex/exponential_functions.h +403 -0
  742. cuda/cccl/headers/include/cuda/std/__complex/hyperbolic_functions.h +119 -0
  743. cuda/cccl/headers/include/cuda/std/__complex/inverse_hyperbolic_functions.h +522 -0
  744. cuda/cccl/headers/include/cuda/std/__complex/inverse_trigonometric_functions.h +131 -0
  745. cuda/cccl/headers/include/cuda/std/__complex/literals.h +86 -0
  746. cuda/cccl/headers/include/cuda/std/__complex/logarithms.h +302 -0
  747. cuda/cccl/headers/include/cuda/std/__complex/math.h +161 -0
  748. cuda/cccl/headers/include/cuda/std/__complex/nvbf16.h +323 -0
  749. cuda/cccl/headers/include/cuda/std/__complex/nvfp16.h +322 -0
  750. cuda/cccl/headers/include/cuda/std/__complex/roots.h +214 -0
  751. cuda/cccl/headers/include/cuda/std/__complex/trigonometric_functions.h +61 -0
  752. cuda/cccl/headers/include/cuda/std/__complex/tuple.h +107 -0
  753. cuda/cccl/headers/include/cuda/std/__complex/vector_support.h +130 -0
  754. cuda/cccl/headers/include/cuda/std/__concepts/arithmetic.h +56 -0
  755. cuda/cccl/headers/include/cuda/std/__concepts/assignable.h +64 -0
  756. cuda/cccl/headers/include/cuda/std/__concepts/boolean_testable.h +63 -0
  757. cuda/cccl/headers/include/cuda/std/__concepts/class_or_enum.h +45 -0
  758. cuda/cccl/headers/include/cuda/std/__concepts/common_reference_with.h +69 -0
  759. cuda/cccl/headers/include/cuda/std/__concepts/common_with.h +82 -0
  760. cuda/cccl/headers/include/cuda/std/__concepts/concept_macros.h +385 -0
  761. cuda/cccl/headers/include/cuda/std/__concepts/constructible.h +174 -0
  762. cuda/cccl/headers/include/cuda/std/__concepts/convertible_to.h +70 -0
  763. cuda/cccl/headers/include/cuda/std/__concepts/copyable.h +60 -0
  764. cuda/cccl/headers/include/cuda/std/__concepts/derived_from.h +56 -0
  765. cuda/cccl/headers/include/cuda/std/__concepts/destructible.h +76 -0
  766. cuda/cccl/headers/include/cuda/std/__concepts/different_from.h +38 -0
  767. cuda/cccl/headers/include/cuda/std/__concepts/equality_comparable.h +98 -0
  768. cuda/cccl/headers/include/cuda/std/__concepts/invocable.h +80 -0
  769. cuda/cccl/headers/include/cuda/std/__concepts/movable.h +58 -0
  770. cuda/cccl/headers/include/cuda/std/__concepts/predicate.h +54 -0
  771. cuda/cccl/headers/include/cuda/std/__concepts/regular.h +54 -0
  772. cuda/cccl/headers/include/cuda/std/__concepts/relation.h +77 -0
  773. cuda/cccl/headers/include/cuda/std/__concepts/same_as.h +39 -0
  774. cuda/cccl/headers/include/cuda/std/__concepts/semiregular.h +54 -0
  775. cuda/cccl/headers/include/cuda/std/__concepts/swappable.h +206 -0
  776. cuda/cccl/headers/include/cuda/std/__concepts/totally_ordered.h +101 -0
  777. cuda/cccl/headers/include/cuda/std/__cstddef/byte.h +113 -0
  778. cuda/cccl/headers/include/cuda/std/__cstddef/types.h +52 -0
  779. cuda/cccl/headers/include/cuda/std/__cstdlib/abs.h +57 -0
  780. cuda/cccl/headers/include/cuda/std/__cstdlib/aligned_alloc.h +66 -0
  781. cuda/cccl/headers/include/cuda/std/__cstdlib/div.h +96 -0
  782. cuda/cccl/headers/include/cuda/std/__cstdlib/malloc.h +70 -0
  783. cuda/cccl/headers/include/cuda/std/__cstring/memcpy.h +61 -0
  784. cuda/cccl/headers/include/cuda/std/__cstring/memset.h +46 -0
  785. cuda/cccl/headers/include/cuda/std/__exception/cuda_error.h +110 -0
  786. cuda/cccl/headers/include/cuda/std/__exception/exception_macros.h +108 -0
  787. cuda/cccl/headers/include/cuda/std/__exception/format_error.h +62 -0
  788. cuda/cccl/headers/include/cuda/std/__exception/msg_storage.h +41 -0
  789. cuda/cccl/headers/include/cuda/std/__exception/terminate.h +74 -0
  790. cuda/cccl/headers/include/cuda/std/__exception/throw_error.h +120 -0
  791. cuda/cccl/headers/include/cuda/std/__execution/env.h +455 -0
  792. cuda/cccl/headers/include/cuda/std/__execution/policy.h +90 -0
  793. cuda/cccl/headers/include/cuda/std/__expected/bad_expected_access.h +127 -0
  794. cuda/cccl/headers/include/cuda/std/__expected/expected.h +1941 -0
  795. cuda/cccl/headers/include/cuda/std/__expected/expected_base.h +1051 -0
  796. cuda/cccl/headers/include/cuda/std/__expected/unexpect.h +37 -0
  797. cuda/cccl/headers/include/cuda/std/__expected/unexpected.h +164 -0
  798. cuda/cccl/headers/include/cuda/std/__floating_point/arithmetic.h +56 -0
  799. cuda/cccl/headers/include/cuda/std/__floating_point/cast.h +812 -0
  800. cuda/cccl/headers/include/cuda/std/__floating_point/cccl_fp.h +125 -0
  801. cuda/cccl/headers/include/cuda/std/__floating_point/common_type.h +48 -0
  802. cuda/cccl/headers/include/cuda/std/__floating_point/constants.h +375 -0
  803. cuda/cccl/headers/include/cuda/std/__floating_point/conversion_rank_order.h +124 -0
  804. cuda/cccl/headers/include/cuda/std/__floating_point/cuda_fp_types.h +116 -0
  805. cuda/cccl/headers/include/cuda/std/__floating_point/decompose.h +126 -0
  806. cuda/cccl/headers/include/cuda/std/__floating_point/format.h +162 -0
  807. cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +40 -0
  808. cuda/cccl/headers/include/cuda/std/__floating_point/mask.h +78 -0
  809. cuda/cccl/headers/include/cuda/std/__floating_point/native_type.h +81 -0
  810. cuda/cccl/headers/include/cuda/std/__floating_point/overflow_handler.h +139 -0
  811. cuda/cccl/headers/include/cuda/std/__floating_point/properties.h +229 -0
  812. cuda/cccl/headers/include/cuda/std/__floating_point/storage.h +248 -0
  813. cuda/cccl/headers/include/cuda/std/__floating_point/traits.h +172 -0
  814. cuda/cccl/headers/include/cuda/std/__format/buffer.h +48 -0
  815. cuda/cccl/headers/include/cuda/std/__format/concepts.h +69 -0
  816. cuda/cccl/headers/include/cuda/std/__format/format_arg.h +282 -0
  817. cuda/cccl/headers/include/cuda/std/__format/format_arg_store.h +279 -0
  818. cuda/cccl/headers/include/cuda/std/__format/format_args.h +122 -0
  819. cuda/cccl/headers/include/cuda/std/__format/format_context.h +93 -0
  820. cuda/cccl/headers/include/cuda/std/__format/format_error.h +76 -0
  821. cuda/cccl/headers/include/cuda/std/__format/format_integral.h +237 -0
  822. cuda/cccl/headers/include/cuda/std/__format/format_parse_context.h +124 -0
  823. cuda/cccl/headers/include/cuda/std/__format/format_spec_parser.h +1265 -0
  824. cuda/cccl/headers/include/cuda/std/__format/formatter.h +59 -0
  825. cuda/cccl/headers/include/cuda/std/__format/formatters/bool.h +101 -0
  826. cuda/cccl/headers/include/cuda/std/__format/formatters/char.h +124 -0
  827. cuda/cccl/headers/include/cuda/std/__format/formatters/fp.h +101 -0
  828. cuda/cccl/headers/include/cuda/std/__format/formatters/int.h +174 -0
  829. cuda/cccl/headers/include/cuda/std/__format/formatters/ptr.h +104 -0
  830. cuda/cccl/headers/include/cuda/std/__format/formatters/str.h +178 -0
  831. cuda/cccl/headers/include/cuda/std/__format/output_utils.h +272 -0
  832. cuda/cccl/headers/include/cuda/std/__format/parse_arg_id.h +138 -0
  833. cuda/cccl/headers/include/cuda/std/__format_ +45 -0
  834. cuda/cccl/headers/include/cuda/std/__functional/binary_function.h +63 -0
  835. cuda/cccl/headers/include/cuda/std/__functional/binary_negate.h +65 -0
  836. cuda/cccl/headers/include/cuda/std/__functional/bind.h +334 -0
  837. cuda/cccl/headers/include/cuda/std/__functional/bind_back.h +81 -0
  838. cuda/cccl/headers/include/cuda/std/__functional/bind_front.h +74 -0
  839. cuda/cccl/headers/include/cuda/std/__functional/binder1st.h +74 -0
  840. cuda/cccl/headers/include/cuda/std/__functional/binder2nd.h +74 -0
  841. cuda/cccl/headers/include/cuda/std/__functional/compose.h +69 -0
  842. cuda/cccl/headers/include/cuda/std/__functional/default_searcher.h +76 -0
  843. cuda/cccl/headers/include/cuda/std/__functional/function.h +1271 -0
  844. cuda/cccl/headers/include/cuda/std/__functional/hash.h +649 -0
  845. cuda/cccl/headers/include/cuda/std/__functional/identity.h +57 -0
  846. cuda/cccl/headers/include/cuda/std/__functional/invoke.h +298 -0
  847. cuda/cccl/headers/include/cuda/std/__functional/is_transparent.h +41 -0
  848. cuda/cccl/headers/include/cuda/std/__functional/mem_fn.h +66 -0
  849. cuda/cccl/headers/include/cuda/std/__functional/mem_fun_ref.h +211 -0
  850. cuda/cccl/headers/include/cuda/std/__functional/not_fn.h +120 -0
  851. cuda/cccl/headers/include/cuda/std/__functional/operations.h +535 -0
  852. cuda/cccl/headers/include/cuda/std/__functional/perfect_forward.h +128 -0
  853. cuda/cccl/headers/include/cuda/std/__functional/pointer_to_binary_function.h +64 -0
  854. cuda/cccl/headers/include/cuda/std/__functional/pointer_to_unary_function.h +63 -0
  855. cuda/cccl/headers/include/cuda/std/__functional/ranges_operations.h +113 -0
  856. cuda/cccl/headers/include/cuda/std/__functional/reference_wrapper.h +114 -0
  857. cuda/cccl/headers/include/cuda/std/__functional/unary_function.h +62 -0
  858. cuda/cccl/headers/include/cuda/std/__functional/unary_negate.h +65 -0
  859. cuda/cccl/headers/include/cuda/std/__functional/unwrap_ref.h +56 -0
  860. cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +262 -0
  861. cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +53 -0
  862. cuda/cccl/headers/include/cuda/std/__fwd/array.h +42 -0
  863. cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +74 -0
  864. cuda/cccl/headers/include/cuda/std/__fwd/complex.h +75 -0
  865. cuda/cccl/headers/include/cuda/std/__fwd/execution_policy.h +73 -0
  866. cuda/cccl/headers/include/cuda/std/__fwd/expected.h +46 -0
  867. cuda/cccl/headers/include/cuda/std/__fwd/format.h +84 -0
  868. cuda/cccl/headers/include/cuda/std/__fwd/fp.h +37 -0
  869. cuda/cccl/headers/include/cuda/std/__fwd/get.h +122 -0
  870. cuda/cccl/headers/include/cuda/std/__fwd/hash.h +34 -0
  871. cuda/cccl/headers/include/cuda/std/__fwd/ios.h +123 -0
  872. cuda/cccl/headers/include/cuda/std/__fwd/iterator.h +43 -0
  873. cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +122 -0
  874. cuda/cccl/headers/include/cuda/std/__fwd/memory_resource.h +37 -0
  875. cuda/cccl/headers/include/cuda/std/__fwd/optional.h +39 -0
  876. cuda/cccl/headers/include/cuda/std/__fwd/pair.h +47 -0
  877. cuda/cccl/headers/include/cuda/std/__fwd/reference_wrapper.h +39 -0
  878. cuda/cccl/headers/include/cuda/std/__fwd/span.h +45 -0
  879. cuda/cccl/headers/include/cuda/std/__fwd/string.h +112 -0
  880. cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +91 -0
  881. cuda/cccl/headers/include/cuda/std/__fwd/subrange.h +55 -0
  882. cuda/cccl/headers/include/cuda/std/__fwd/tuple.h +37 -0
  883. cuda/cccl/headers/include/cuda/std/__fwd/unexpected.h +40 -0
  884. cuda/cccl/headers/include/cuda/std/__fwd/variant.h +51 -0
  885. cuda/cccl/headers/include/cuda/std/__internal/atomic.h +55 -0
  886. cuda/cccl/headers/include/cuda/std/__internal/cpp_dialect.h +44 -0
  887. cuda/cccl/headers/include/cuda/std/__internal/features.h +104 -0
  888. cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +132 -0
  889. cuda/cccl/headers/include/cuda/std/__internal/pstl_config.h +32 -0
  890. cuda/cccl/headers/include/cuda/std/__internal/thread_api.h +58 -0
  891. cuda/cccl/headers/include/cuda/std/__internal/version.h +52 -0
  892. cuda/cccl/headers/include/cuda/std/__iterator/access.h +128 -0
  893. cuda/cccl/headers/include/cuda/std/__iterator/advance.h +227 -0
  894. cuda/cccl/headers/include/cuda/std/__iterator/back_insert_iterator.h +164 -0
  895. cuda/cccl/headers/include/cuda/std/__iterator/bounded_iter.h +253 -0
  896. cuda/cccl/headers/include/cuda/std/__iterator/concepts.h +645 -0
  897. cuda/cccl/headers/include/cuda/std/__iterator/counted_iterator.h +465 -0
  898. cuda/cccl/headers/include/cuda/std/__iterator/data.h +61 -0
  899. cuda/cccl/headers/include/cuda/std/__iterator/default_sentinel.h +36 -0
  900. cuda/cccl/headers/include/cuda/std/__iterator/distance.h +124 -0
  901. cuda/cccl/headers/include/cuda/std/__iterator/empty.h +53 -0
  902. cuda/cccl/headers/include/cuda/std/__iterator/erase_if_container.h +53 -0
  903. cuda/cccl/headers/include/cuda/std/__iterator/front_insert_iterator.h +100 -0
  904. cuda/cccl/headers/include/cuda/std/__iterator/incrementable_traits.h +143 -0
  905. cuda/cccl/headers/include/cuda/std/__iterator/indirectly_comparable.h +55 -0
  906. cuda/cccl/headers/include/cuda/std/__iterator/insert_iterator.h +107 -0
  907. cuda/cccl/headers/include/cuda/std/__iterator/istream_iterator.h +146 -0
  908. cuda/cccl/headers/include/cuda/std/__iterator/istreambuf_iterator.h +161 -0
  909. cuda/cccl/headers/include/cuda/std/__iterator/iter_move.h +161 -0
  910. cuda/cccl/headers/include/cuda/std/__iterator/iter_swap.h +181 -0
  911. cuda/cccl/headers/include/cuda/std/__iterator/iterator.h +44 -0
  912. cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +847 -0
  913. cuda/cccl/headers/include/cuda/std/__iterator/mergeable.h +72 -0
  914. cuda/cccl/headers/include/cuda/std/__iterator/move_iterator.h +432 -0
  915. cuda/cccl/headers/include/cuda/std/__iterator/move_sentinel.h +73 -0
  916. cuda/cccl/headers/include/cuda/std/__iterator/next.h +101 -0
  917. cuda/cccl/headers/include/cuda/std/__iterator/ostream_iterator.h +95 -0
  918. cuda/cccl/headers/include/cuda/std/__iterator/ostreambuf_iterator.h +100 -0
  919. cuda/cccl/headers/include/cuda/std/__iterator/permutable.h +54 -0
  920. cuda/cccl/headers/include/cuda/std/__iterator/prev.h +90 -0
  921. cuda/cccl/headers/include/cuda/std/__iterator/projected.h +61 -0
  922. cuda/cccl/headers/include/cuda/std/__iterator/readable_traits.h +156 -0
  923. cuda/cccl/headers/include/cuda/std/__iterator/reverse_access.h +142 -0
  924. cuda/cccl/headers/include/cuda/std/__iterator/reverse_iterator.h +366 -0
  925. cuda/cccl/headers/include/cuda/std/__iterator/size.h +69 -0
  926. cuda/cccl/headers/include/cuda/std/__iterator/sortable.h +55 -0
  927. cuda/cccl/headers/include/cuda/std/__iterator/unreachable_sentinel.h +84 -0
  928. cuda/cccl/headers/include/cuda/std/__iterator/wrap_iter.h +245 -0
  929. cuda/cccl/headers/include/cuda/std/__latch/latch.h +88 -0
  930. cuda/cccl/headers/include/cuda/std/__limits/numeric_limits.h +603 -0
  931. cuda/cccl/headers/include/cuda/std/__limits/numeric_limits_ext.h +753 -0
  932. cuda/cccl/headers/include/cuda/std/__linalg/conj_if_needed.h +76 -0
  933. cuda/cccl/headers/include/cuda/std/__linalg/conjugate_transposed.h +52 -0
  934. cuda/cccl/headers/include/cuda/std/__linalg/conjugated.h +136 -0
  935. cuda/cccl/headers/include/cuda/std/__linalg/scaled.h +128 -0
  936. cuda/cccl/headers/include/cuda/std/__linalg/transposed.h +315 -0
  937. cuda/cccl/headers/include/cuda/std/__mdspan/aligned_accessor.h +97 -0
  938. cuda/cccl/headers/include/cuda/std/__mdspan/concepts.h +137 -0
  939. cuda/cccl/headers/include/cuda/std/__mdspan/default_accessor.h +73 -0
  940. cuda/cccl/headers/include/cuda/std/__mdspan/empty_base.h +348 -0
  941. cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +749 -0
  942. cuda/cccl/headers/include/cuda/std/__mdspan/layout_left.h +314 -0
  943. cuda/cccl/headers/include/cuda/std/__mdspan/layout_right.h +307 -0
  944. cuda/cccl/headers/include/cuda/std/__mdspan/layout_stride.h +598 -0
  945. cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +515 -0
  946. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_extents.h +190 -0
  947. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_helper.h +187 -0
  948. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_mapping.h +339 -0
  949. cuda/cccl/headers/include/cuda/std/__memory/addressof.h +90 -0
  950. cuda/cccl/headers/include/cuda/std/__memory/align.h +67 -0
  951. cuda/cccl/headers/include/cuda/std/__memory/allocate_at_least.h +82 -0
  952. cuda/cccl/headers/include/cuda/std/__memory/allocation_guard.h +100 -0
  953. cuda/cccl/headers/include/cuda/std/__memory/allocator.h +327 -0
  954. cuda/cccl/headers/include/cuda/std/__memory/allocator_arg_t.h +85 -0
  955. cuda/cccl/headers/include/cuda/std/__memory/allocator_destructor.h +59 -0
  956. cuda/cccl/headers/include/cuda/std/__memory/allocator_traits.h +517 -0
  957. cuda/cccl/headers/include/cuda/std/__memory/assume_aligned.h +59 -0
  958. cuda/cccl/headers/include/cuda/std/__memory/builtin_new_allocator.h +87 -0
  959. cuda/cccl/headers/include/cuda/std/__memory/compressed_pair.h +225 -0
  960. cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +241 -0
  961. cuda/cccl/headers/include/cuda/std/__memory/destruct_n.h +91 -0
  962. cuda/cccl/headers/include/cuda/std/__memory/is_sufficiently_aligned.h +51 -0
  963. cuda/cccl/headers/include/cuda/std/__memory/pointer_traits.h +246 -0
  964. cuda/cccl/headers/include/cuda/std/__memory/runtime_assume_aligned.h +62 -0
  965. cuda/cccl/headers/include/cuda/std/__memory/temporary_buffer.h +99 -0
  966. cuda/cccl/headers/include/cuda/std/__memory/uninitialized_algorithms.h +678 -0
  967. cuda/cccl/headers/include/cuda/std/__memory/unique_ptr.h +765 -0
  968. cuda/cccl/headers/include/cuda/std/__memory/uses_allocator.h +59 -0
  969. cuda/cccl/headers/include/cuda/std/__memory/voidify.h +41 -0
  970. cuda/cccl/headers/include/cuda/std/__memory_ +34 -0
  971. cuda/cccl/headers/include/cuda/std/__new/allocate.h +131 -0
  972. cuda/cccl/headers/include/cuda/std/__new/bad_alloc.h +57 -0
  973. cuda/cccl/headers/include/cuda/std/__new/device_new.h +30 -0
  974. cuda/cccl/headers/include/cuda/std/__new/launder.h +53 -0
  975. cuda/cccl/headers/include/cuda/std/__new_ +30 -0
  976. cuda/cccl/headers/include/cuda/std/__numeric/accumulate.h +56 -0
  977. cuda/cccl/headers/include/cuda/std/__numeric/adjacent_difference.h +72 -0
  978. cuda/cccl/headers/include/cuda/std/__numeric/exclusive_scan.h +66 -0
  979. cuda/cccl/headers/include/cuda/std/__numeric/gcd_lcm.h +78 -0
  980. cuda/cccl/headers/include/cuda/std/__numeric/inclusive_scan.h +73 -0
  981. cuda/cccl/headers/include/cuda/std/__numeric/inner_product.h +62 -0
  982. cuda/cccl/headers/include/cuda/std/__numeric/iota.h +42 -0
  983. cuda/cccl/headers/include/cuda/std/__numeric/midpoint.h +97 -0
  984. cuda/cccl/headers/include/cuda/std/__numeric/partial_sum.h +69 -0
  985. cuda/cccl/headers/include/cuda/std/__numeric/reduce.h +60 -0
  986. cuda/cccl/headers/include/cuda/std/__numeric/transform_exclusive_scan.h +51 -0
  987. cuda/cccl/headers/include/cuda/std/__numeric/transform_inclusive_scan.h +65 -0
  988. cuda/cccl/headers/include/cuda/std/__numeric/transform_reduce.h +72 -0
  989. cuda/cccl/headers/include/cuda/std/__optional/bad_optional_access.h +74 -0
  990. cuda/cccl/headers/include/cuda/std/__optional/hash.h +53 -0
  991. cuda/cccl/headers/include/cuda/std/__optional/make_optional.h +61 -0
  992. cuda/cccl/headers/include/cuda/std/__optional/nullopt.h +43 -0
  993. cuda/cccl/headers/include/cuda/std/__optional/optional.h +861 -0
  994. cuda/cccl/headers/include/cuda/std/__optional/optional_base.h +439 -0
  995. cuda/cccl/headers/include/cuda/std/__optional/optional_ref.h +324 -0
  996. cuda/cccl/headers/include/cuda/std/__pstl/cuda/for_each_n.h +97 -0
  997. cuda/cccl/headers/include/cuda/std/__pstl/dispatch.h +123 -0
  998. cuda/cccl/headers/include/cuda/std/__pstl/for_each.h +71 -0
  999. cuda/cccl/headers/include/cuda/std/__pstl/for_each_n.h +68 -0
  1000. cuda/cccl/headers/include/cuda/std/__random/bernoulli_distribution.h +173 -0
  1001. cuda/cccl/headers/include/cuda/std/__random/binomial_distribution.h +254 -0
  1002. cuda/cccl/headers/include/cuda/std/__random/cauchy_distribution.h +192 -0
  1003. cuda/cccl/headers/include/cuda/std/__random/chi_squared_distribution.h +179 -0
  1004. cuda/cccl/headers/include/cuda/std/__random/exponential_distribution.h +187 -0
  1005. cuda/cccl/headers/include/cuda/std/__random/extreme_value_distribution.h +196 -0
  1006. cuda/cccl/headers/include/cuda/std/__random/fisher_f_distribution.h +196 -0
  1007. cuda/cccl/headers/include/cuda/std/__random/gamma_distribution.h +257 -0
  1008. cuda/cccl/headers/include/cuda/std/__random/generate_canonical.h +56 -0
  1009. cuda/cccl/headers/include/cuda/std/__random/geometric_distribution.h +179 -0
  1010. cuda/cccl/headers/include/cuda/std/__random/is_seed_sequence.h +39 -0
  1011. cuda/cccl/headers/include/cuda/std/__random/is_valid.h +70 -0
  1012. cuda/cccl/headers/include/cuda/std/__random/linear_congruential_engine.h +398 -0
  1013. cuda/cccl/headers/include/cuda/std/__random/lognormal_distribution.h +174 -0
  1014. cuda/cccl/headers/include/cuda/std/__random/negative_binomial_distribution.h +212 -0
  1015. cuda/cccl/headers/include/cuda/std/__random/normal_distribution.h +232 -0
  1016. cuda/cccl/headers/include/cuda/std/__random/philox_engine.h +562 -0
  1017. cuda/cccl/headers/include/cuda/std/__random/poisson_distribution.h +338 -0
  1018. cuda/cccl/headers/include/cuda/std/__random/seed_seq.h +204 -0
  1019. cuda/cccl/headers/include/cuda/std/__random/student_t_distribution.h +186 -0
  1020. cuda/cccl/headers/include/cuda/std/__random/uniform_int_distribution.h +341 -0
  1021. cuda/cccl/headers/include/cuda/std/__random/uniform_real_distribution.h +192 -0
  1022. cuda/cccl/headers/include/cuda/std/__random/weibull_distribution.h +189 -0
  1023. cuda/cccl/headers/include/cuda/std/__random_ +47 -0
  1024. cuda/cccl/headers/include/cuda/std/__ranges/access.h +303 -0
  1025. cuda/cccl/headers/include/cuda/std/__ranges/all.h +98 -0
  1026. cuda/cccl/headers/include/cuda/std/__ranges/compressed_movable_box.h +889 -0
  1027. cuda/cccl/headers/include/cuda/std/__ranges/concepts.h +302 -0
  1028. cuda/cccl/headers/include/cuda/std/__ranges/counted.h +90 -0
  1029. cuda/cccl/headers/include/cuda/std/__ranges/dangling.h +54 -0
  1030. cuda/cccl/headers/include/cuda/std/__ranges/data.h +136 -0
  1031. cuda/cccl/headers/include/cuda/std/__ranges/drop_view.h +389 -0
  1032. cuda/cccl/headers/include/cuda/std/__ranges/empty.h +109 -0
  1033. cuda/cccl/headers/include/cuda/std/__ranges/empty_view.h +77 -0
  1034. cuda/cccl/headers/include/cuda/std/__ranges/enable_borrowed_range.h +41 -0
  1035. cuda/cccl/headers/include/cuda/std/__ranges/enable_view.h +78 -0
  1036. cuda/cccl/headers/include/cuda/std/__ranges/from_range.h +36 -0
  1037. cuda/cccl/headers/include/cuda/std/__ranges/iota_view.h +264 -0
  1038. cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +409 -0
  1039. cuda/cccl/headers/include/cuda/std/__ranges/non_propagating_cache.h +210 -0
  1040. cuda/cccl/headers/include/cuda/std/__ranges/owning_view.h +163 -0
  1041. cuda/cccl/headers/include/cuda/std/__ranges/range_adaptor.h +111 -0
  1042. cuda/cccl/headers/include/cuda/std/__ranges/rbegin.h +175 -0
  1043. cuda/cccl/headers/include/cuda/std/__ranges/ref_view.h +121 -0
  1044. cuda/cccl/headers/include/cuda/std/__ranges/rend.h +182 -0
  1045. cuda/cccl/headers/include/cuda/std/__ranges/repeat_view.h +346 -0
  1046. cuda/cccl/headers/include/cuda/std/__ranges/single_view.h +155 -0
  1047. cuda/cccl/headers/include/cuda/std/__ranges/size.h +201 -0
  1048. cuda/cccl/headers/include/cuda/std/__ranges/subrange.h +510 -0
  1049. cuda/cccl/headers/include/cuda/std/__ranges/take_view.h +472 -0
  1050. cuda/cccl/headers/include/cuda/std/__ranges/take_while_view.h +259 -0
  1051. cuda/cccl/headers/include/cuda/std/__ranges/transform_view.h +522 -0
  1052. cuda/cccl/headers/include/cuda/std/__ranges/unwrap_end.h +52 -0
  1053. cuda/cccl/headers/include/cuda/std/__ranges/view_interface.h +183 -0
  1054. cuda/cccl/headers/include/cuda/std/__ranges/views.h +38 -0
  1055. cuda/cccl/headers/include/cuda/std/__semaphore/atomic_semaphore.h +234 -0
  1056. cuda/cccl/headers/include/cuda/std/__semaphore/counting_semaphore.h +51 -0
  1057. cuda/cccl/headers/include/cuda/std/__string/char_traits.h +190 -0
  1058. cuda/cccl/headers/include/cuda/std/__string/constexpr_c_functions.h +580 -0
  1059. cuda/cccl/headers/include/cuda/std/__string/helper_functions.h +296 -0
  1060. cuda/cccl/headers/include/cuda/std/__string/string_view.h +244 -0
  1061. cuda/cccl/headers/include/cuda/std/__string_ +29 -0
  1062. cuda/cccl/headers/include/cuda/std/__system_error/errc.h +51 -0
  1063. cuda/cccl/headers/include/cuda/std/__system_error_ +26 -0
  1064. cuda/cccl/headers/include/cuda/std/__thread/threading_support.h +106 -0
  1065. cuda/cccl/headers/include/cuda/std/__thread/threading_support_cuda.h +47 -0
  1066. cuda/cccl/headers/include/cuda/std/__thread/threading_support_external.h +41 -0
  1067. cuda/cccl/headers/include/cuda/std/__thread/threading_support_pthread.h +143 -0
  1068. cuda/cccl/headers/include/cuda/std/__thread/threading_support_win32.h +87 -0
  1069. cuda/cccl/headers/include/cuda/std/__tuple_dir/apply.h +82 -0
  1070. cuda/cccl/headers/include/cuda/std/__tuple_dir/get.h +122 -0
  1071. cuda/cccl/headers/include/cuda/std/__tuple_dir/ignore.h +51 -0
  1072. cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +120 -0
  1073. cuda/cccl/headers/include/cuda/std/__tuple_dir/sfinae_helpers.h +100 -0
  1074. cuda/cccl/headers/include/cuda/std/__tuple_dir/structured_bindings.h +212 -0
  1075. cuda/cccl/headers/include/cuda/std/__tuple_dir/tie.h +55 -0
  1076. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple.h +457 -0
  1077. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_cat.h +158 -0
  1078. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_constraints.h +286 -0
  1079. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_element.h +77 -0
  1080. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_indices.h +44 -0
  1081. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_leaf.h +452 -0
  1082. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +83 -0
  1083. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +68 -0
  1084. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_size.h +79 -0
  1085. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_types.h +35 -0
  1086. cuda/cccl/headers/include/cuda/std/__tuple_dir/vector_types.h +290 -0
  1087. cuda/cccl/headers/include/cuda/std/__type_traits/add_const.h +40 -0
  1088. cuda/cccl/headers/include/cuda/std/__type_traits/add_cv.h +40 -0
  1089. cuda/cccl/headers/include/cuda/std/__type_traits/add_lvalue_reference.h +62 -0
  1090. cuda/cccl/headers/include/cuda/std/__type_traits/add_pointer.h +65 -0
  1091. cuda/cccl/headers/include/cuda/std/__type_traits/add_rvalue_reference.h +62 -0
  1092. cuda/cccl/headers/include/cuda/std/__type_traits/add_volatile.h +40 -0
  1093. cuda/cccl/headers/include/cuda/std/__type_traits/aligned_storage.h +155 -0
  1094. cuda/cccl/headers/include/cuda/std/__type_traits/aligned_union.h +62 -0
  1095. cuda/cccl/headers/include/cuda/std/__type_traits/alignment_of.h +41 -0
  1096. cuda/cccl/headers/include/cuda/std/__type_traits/always_false.h +35 -0
  1097. cuda/cccl/headers/include/cuda/std/__type_traits/can_extract_key.h +68 -0
  1098. cuda/cccl/headers/include/cuda/std/__type_traits/common_reference.h +262 -0
  1099. cuda/cccl/headers/include/cuda/std/__type_traits/common_type.h +173 -0
  1100. cuda/cccl/headers/include/cuda/std/__type_traits/conditional.h +65 -0
  1101. cuda/cccl/headers/include/cuda/std/__type_traits/conjunction.h +67 -0
  1102. cuda/cccl/headers/include/cuda/std/__type_traits/copy_cv.h +50 -0
  1103. cuda/cccl/headers/include/cuda/std/__type_traits/copy_cvref.h +148 -0
  1104. cuda/cccl/headers/include/cuda/std/__type_traits/decay.h +83 -0
  1105. cuda/cccl/headers/include/cuda/std/__type_traits/dependent_type.h +35 -0
  1106. cuda/cccl/headers/include/cuda/std/__type_traits/disjunction.h +77 -0
  1107. cuda/cccl/headers/include/cuda/std/__type_traits/enable_if.h +43 -0
  1108. cuda/cccl/headers/include/cuda/std/__type_traits/extent.h +68 -0
  1109. cuda/cccl/headers/include/cuda/std/__type_traits/fold.h +47 -0
  1110. cuda/cccl/headers/include/cuda/std/__type_traits/has_unique_object_representation.h +46 -0
  1111. cuda/cccl/headers/include/cuda/std/__type_traits/has_virtual_destructor.h +42 -0
  1112. cuda/cccl/headers/include/cuda/std/__type_traits/integral_constant.h +62 -0
  1113. cuda/cccl/headers/include/cuda/std/__type_traits/is_abstract.h +42 -0
  1114. cuda/cccl/headers/include/cuda/std/__type_traits/is_aggregate.h +42 -0
  1115. cuda/cccl/headers/include/cuda/std/__type_traits/is_allocator.h +46 -0
  1116. cuda/cccl/headers/include/cuda/std/__type_traits/is_arithmetic.h +42 -0
  1117. cuda/cccl/headers/include/cuda/std/__type_traits/is_array.h +62 -0
  1118. cuda/cccl/headers/include/cuda/std/__type_traits/is_assignable.h +78 -0
  1119. cuda/cccl/headers/include/cuda/std/__type_traits/is_base_of.h +42 -0
  1120. cuda/cccl/headers/include/cuda/std/__type_traits/is_bounded_array.h +44 -0
  1121. cuda/cccl/headers/include/cuda/std/__type_traits/is_callable.h +60 -0
  1122. cuda/cccl/headers/include/cuda/std/__type_traits/is_char_like_type.h +38 -0
  1123. cuda/cccl/headers/include/cuda/std/__type_traits/is_class.h +42 -0
  1124. cuda/cccl/headers/include/cuda/std/__type_traits/is_comparable.h +78 -0
  1125. cuda/cccl/headers/include/cuda/std/__type_traits/is_compound.h +58 -0
  1126. cuda/cccl/headers/include/cuda/std/__type_traits/is_const.h +56 -0
  1127. cuda/cccl/headers/include/cuda/std/__type_traits/is_constant_evaluated.h +49 -0
  1128. cuda/cccl/headers/include/cuda/std/__type_traits/is_constructible.h +174 -0
  1129. cuda/cccl/headers/include/cuda/std/__type_traits/is_convertible.h +211 -0
  1130. cuda/cccl/headers/include/cuda/std/__type_traits/is_copy_assignable.h +43 -0
  1131. cuda/cccl/headers/include/cuda/std/__type_traits/is_copy_constructible.h +43 -0
  1132. cuda/cccl/headers/include/cuda/std/__type_traits/is_core_convertible.h +47 -0
  1133. cuda/cccl/headers/include/cuda/std/__type_traits/is_corresponding_member.h +42 -0
  1134. cuda/cccl/headers/include/cuda/std/__type_traits/is_default_constructible.h +40 -0
  1135. cuda/cccl/headers/include/cuda/std/__type_traits/is_destructible.h +115 -0
  1136. cuda/cccl/headers/include/cuda/std/__type_traits/is_empty.h +42 -0
  1137. cuda/cccl/headers/include/cuda/std/__type_traits/is_enum.h +42 -0
  1138. cuda/cccl/headers/include/cuda/std/__type_traits/is_execution_policy.h +63 -0
  1139. cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_arithmetic.h +38 -0
  1140. cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_floating_point.h +79 -0
  1141. cuda/cccl/headers/include/cuda/std/__type_traits/is_final.h +42 -0
  1142. cuda/cccl/headers/include/cuda/std/__type_traits/is_floating_point.h +53 -0
  1143. cuda/cccl/headers/include/cuda/std/__type_traits/is_fully_bounded_array.h +47 -0
  1144. cuda/cccl/headers/include/cuda/std/__type_traits/is_function.h +61 -0
  1145. cuda/cccl/headers/include/cuda/std/__type_traits/is_fundamental.h +56 -0
  1146. cuda/cccl/headers/include/cuda/std/__type_traits/is_implicitly_default_constructible.h +57 -0
  1147. cuda/cccl/headers/include/cuda/std/__type_traits/is_integer.h +45 -0
  1148. cuda/cccl/headers/include/cuda/std/__type_traits/is_integral.h +123 -0
  1149. cuda/cccl/headers/include/cuda/std/__type_traits/is_layout_compatible.h +45 -0
  1150. cuda/cccl/headers/include/cuda/std/__type_traits/is_literal_type.h +42 -0
  1151. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_function_pointer.h +79 -0
  1152. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_object_pointer.h +57 -0
  1153. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_pointer.h +57 -0
  1154. cuda/cccl/headers/include/cuda/std/__type_traits/is_move_assignable.h +43 -0
  1155. cuda/cccl/headers/include/cuda/std/__type_traits/is_move_constructible.h +42 -0
  1156. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_assignable.h +70 -0
  1157. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_constructible.h +72 -0
  1158. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_convertible.h +59 -0
  1159. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_copy_assignable.h +60 -0
  1160. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_copy_constructible.h +43 -0
  1161. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_default_constructible.h +54 -0
  1162. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_destructible.h +82 -0
  1163. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_move_assignable.h +60 -0
  1164. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_move_constructible.h +42 -0
  1165. cuda/cccl/headers/include/cuda/std/__type_traits/is_null_pointer.h +43 -0
  1166. cuda/cccl/headers/include/cuda/std/__type_traits/is_object.h +57 -0
  1167. cuda/cccl/headers/include/cuda/std/__type_traits/is_one_of.h +37 -0
  1168. cuda/cccl/headers/include/cuda/std/__type_traits/is_pod.h +42 -0
  1169. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer.h +60 -0
  1170. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer_interconvertible_base_of.h +84 -0
  1171. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer_interconvertible_with_class.h +42 -0
  1172. cuda/cccl/headers/include/cuda/std/__type_traits/is_polymorphic.h +42 -0
  1173. cuda/cccl/headers/include/cuda/std/__type_traits/is_primary_template.h +121 -0
  1174. cuda/cccl/headers/include/cuda/std/__type_traits/is_reference.h +95 -0
  1175. cuda/cccl/headers/include/cuda/std/__type_traits/is_referenceable.h +55 -0
  1176. cuda/cccl/headers/include/cuda/std/__type_traits/is_same.h +88 -0
  1177. cuda/cccl/headers/include/cuda/std/__type_traits/is_scalar.h +60 -0
  1178. cuda/cccl/headers/include/cuda/std/__type_traits/is_scoped_enum.h +49 -0
  1179. cuda/cccl/headers/include/cuda/std/__type_traits/is_signed.h +65 -0
  1180. cuda/cccl/headers/include/cuda/std/__type_traits/is_signed_integer.h +59 -0
  1181. cuda/cccl/headers/include/cuda/std/__type_traits/is_standard_layout.h +42 -0
  1182. cuda/cccl/headers/include/cuda/std/__type_traits/is_swappable.h +200 -0
  1183. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivial.h +42 -0
  1184. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_assignable.h +43 -0
  1185. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_constructible.h +43 -0
  1186. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_assignable.h +46 -0
  1187. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_constructible.h +45 -0
  1188. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copyable.h +42 -0
  1189. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_default_constructible.h +42 -0
  1190. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_destructible.h +58 -0
  1191. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_assignable.h +45 -0
  1192. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_constructible.h +44 -0
  1193. cuda/cccl/headers/include/cuda/std/__type_traits/is_unbounded_array.h +43 -0
  1194. cuda/cccl/headers/include/cuda/std/__type_traits/is_union.h +42 -0
  1195. cuda/cccl/headers/include/cuda/std/__type_traits/is_unsigned.h +66 -0
  1196. cuda/cccl/headers/include/cuda/std/__type_traits/is_unsigned_integer.h +59 -0
  1197. cuda/cccl/headers/include/cuda/std/__type_traits/is_valid_expansion.h +41 -0
  1198. cuda/cccl/headers/include/cuda/std/__type_traits/is_void.h +55 -0
  1199. cuda/cccl/headers/include/cuda/std/__type_traits/is_volatile.h +56 -0
  1200. cuda/cccl/headers/include/cuda/std/__type_traits/lazy.h +35 -0
  1201. cuda/cccl/headers/include/cuda/std/__type_traits/make_const_lvalue_ref.h +36 -0
  1202. cuda/cccl/headers/include/cuda/std/__type_traits/make_nbit_int.h +107 -0
  1203. cuda/cccl/headers/include/cuda/std/__type_traits/make_signed.h +140 -0
  1204. cuda/cccl/headers/include/cuda/std/__type_traits/make_unsigned.h +151 -0
  1205. cuda/cccl/headers/include/cuda/std/__type_traits/maybe_const.h +36 -0
  1206. cuda/cccl/headers/include/cuda/std/__type_traits/nat.h +39 -0
  1207. cuda/cccl/headers/include/cuda/std/__type_traits/negation.h +44 -0
  1208. cuda/cccl/headers/include/cuda/std/__type_traits/num_bits.h +122 -0
  1209. cuda/cccl/headers/include/cuda/std/__type_traits/promote.h +163 -0
  1210. cuda/cccl/headers/include/cuda/std/__type_traits/rank.h +60 -0
  1211. cuda/cccl/headers/include/cuda/std/__type_traits/reference_constructs_from_temporary.h +57 -0
  1212. cuda/cccl/headers/include/cuda/std/__type_traits/reference_converts_from_temporary.h +56 -0
  1213. cuda/cccl/headers/include/cuda/std/__type_traits/remove_all_extents.h +66 -0
  1214. cuda/cccl/headers/include/cuda/std/__type_traits/remove_const.h +59 -0
  1215. cuda/cccl/headers/include/cuda/std/__type_traits/remove_const_ref.h +37 -0
  1216. cuda/cccl/headers/include/cuda/std/__type_traits/remove_cv.h +57 -0
  1217. cuda/cccl/headers/include/cuda/std/__type_traits/remove_cvref.h +57 -0
  1218. cuda/cccl/headers/include/cuda/std/__type_traits/remove_extent.h +65 -0
  1219. cuda/cccl/headers/include/cuda/std/__type_traits/remove_pointer.h +73 -0
  1220. cuda/cccl/headers/include/cuda/std/__type_traits/remove_reference.h +72 -0
  1221. cuda/cccl/headers/include/cuda/std/__type_traits/remove_volatile.h +58 -0
  1222. cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +47 -0
  1223. cuda/cccl/headers/include/cuda/std/__type_traits/type_identity.h +40 -0
  1224. cuda/cccl/headers/include/cuda/std/__type_traits/type_list.h +1067 -0
  1225. cuda/cccl/headers/include/cuda/std/__type_traits/type_set.h +131 -0
  1226. cuda/cccl/headers/include/cuda/std/__type_traits/underlying_type.h +52 -0
  1227. cuda/cccl/headers/include/cuda/std/__type_traits/void_t.h +34 -0
  1228. cuda/cccl/headers/include/cuda/std/__utility/as_const.h +73 -0
  1229. cuda/cccl/headers/include/cuda/std/__utility/auto_cast.h +34 -0
  1230. cuda/cccl/headers/include/cuda/std/__utility/cmp.h +114 -0
  1231. cuda/cccl/headers/include/cuda/std/__utility/convert_to_integral.h +101 -0
  1232. cuda/cccl/headers/include/cuda/std/__utility/ctad_support.h +27 -0
  1233. cuda/cccl/headers/include/cuda/std/__utility/declval.h +76 -0
  1234. cuda/cccl/headers/include/cuda/std/__utility/delegate_constructors.h +51 -0
  1235. cuda/cccl/headers/include/cuda/std/__utility/exception_guard.h +162 -0
  1236. cuda/cccl/headers/include/cuda/std/__utility/exchange.h +46 -0
  1237. cuda/cccl/headers/include/cuda/std/__utility/forward.h +82 -0
  1238. cuda/cccl/headers/include/cuda/std/__utility/forward_like.h +82 -0
  1239. cuda/cccl/headers/include/cuda/std/__utility/in_place.h +86 -0
  1240. cuda/cccl/headers/include/cuda/std/__utility/integer_sequence.h +249 -0
  1241. cuda/cccl/headers/include/cuda/std/__utility/monostate.h +99 -0
  1242. cuda/cccl/headers/include/cuda/std/__utility/move.h +126 -0
  1243. cuda/cccl/headers/include/cuda/std/__utility/pair.h +791 -0
  1244. cuda/cccl/headers/include/cuda/std/__utility/piecewise_construct.h +37 -0
  1245. cuda/cccl/headers/include/cuda/std/__utility/pod_tuple.h +425 -0
  1246. cuda/cccl/headers/include/cuda/std/__utility/priority_tag.h +40 -0
  1247. cuda/cccl/headers/include/cuda/std/__utility/rel_ops.h +61 -0
  1248. cuda/cccl/headers/include/cuda/std/__utility/swap.h +64 -0
  1249. cuda/cccl/headers/include/cuda/std/__utility/to_underlying.h +40 -0
  1250. cuda/cccl/headers/include/cuda/std/__utility/typeid.h +421 -0
  1251. cuda/cccl/headers/include/cuda/std/__utility/undefined.h +34 -0
  1252. cuda/cccl/headers/include/cuda/std/__utility/unreachable.h +37 -0
  1253. cuda/cccl/headers/include/cuda/std/__variant/bad_variant_access.h +74 -0
  1254. cuda/cccl/headers/include/cuda/std/__variant/comparison.h +207 -0
  1255. cuda/cccl/headers/include/cuda/std/__variant/get.h +192 -0
  1256. cuda/cccl/headers/include/cuda/std/__variant/hash.h +82 -0
  1257. cuda/cccl/headers/include/cuda/std/__variant/sfinae_helpers.h +89 -0
  1258. cuda/cccl/headers/include/cuda/std/__variant/variant.h +250 -0
  1259. cuda/cccl/headers/include/cuda/std/__variant/variant_access.h +70 -0
  1260. cuda/cccl/headers/include/cuda/std/__variant/variant_base.h +683 -0
  1261. cuda/cccl/headers/include/cuda/std/__variant/variant_constraints.h +135 -0
  1262. cuda/cccl/headers/include/cuda/std/__variant/variant_match.h +126 -0
  1263. cuda/cccl/headers/include/cuda/std/__variant/variant_traits.h +184 -0
  1264. cuda/cccl/headers/include/cuda/std/__variant/variant_visit.h +225 -0
  1265. cuda/cccl/headers/include/cuda/std/__variant/visit.h +148 -0
  1266. cuda/cccl/headers/include/cuda/std/algorithm +138 -0
  1267. cuda/cccl/headers/include/cuda/std/array +519 -0
  1268. cuda/cccl/headers/include/cuda/std/atomic +810 -0
  1269. cuda/cccl/headers/include/cuda/std/barrier +42 -0
  1270. cuda/cccl/headers/include/cuda/std/bit +35 -0
  1271. cuda/cccl/headers/include/cuda/std/bitset +986 -0
  1272. cuda/cccl/headers/include/cuda/std/cassert +28 -0
  1273. cuda/cccl/headers/include/cuda/std/ccomplex +15 -0
  1274. cuda/cccl/headers/include/cuda/std/cfloat +59 -0
  1275. cuda/cccl/headers/include/cuda/std/charconv +31 -0
  1276. cuda/cccl/headers/include/cuda/std/chrono +26 -0
  1277. cuda/cccl/headers/include/cuda/std/climits +61 -0
  1278. cuda/cccl/headers/include/cuda/std/cmath +87 -0
  1279. cuda/cccl/headers/include/cuda/std/complex +50 -0
  1280. cuda/cccl/headers/include/cuda/std/concepts +48 -0
  1281. cuda/cccl/headers/include/cuda/std/cstddef +28 -0
  1282. cuda/cccl/headers/include/cuda/std/cstdint +178 -0
  1283. cuda/cccl/headers/include/cuda/std/cstdlib +31 -0
  1284. cuda/cccl/headers/include/cuda/std/cstring +110 -0
  1285. cuda/cccl/headers/include/cuda/std/ctime +155 -0
  1286. cuda/cccl/headers/include/cuda/std/detail/__config +22 -0
  1287. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/chrono +2506 -0
  1288. cuda/cccl/headers/include/cuda/std/execution +29 -0
  1289. cuda/cccl/headers/include/cuda/std/expected +30 -0
  1290. cuda/cccl/headers/include/cuda/std/functional +56 -0
  1291. cuda/cccl/headers/include/cuda/std/initializer_list +44 -0
  1292. cuda/cccl/headers/include/cuda/std/inplace_vector +2171 -0
  1293. cuda/cccl/headers/include/cuda/std/iterator +70 -0
  1294. cuda/cccl/headers/include/cuda/std/latch +34 -0
  1295. cuda/cccl/headers/include/cuda/std/limits +28 -0
  1296. cuda/cccl/headers/include/cuda/std/linalg +30 -0
  1297. cuda/cccl/headers/include/cuda/std/mdspan +38 -0
  1298. cuda/cccl/headers/include/cuda/std/memory +40 -0
  1299. cuda/cccl/headers/include/cuda/std/numbers +344 -0
  1300. cuda/cccl/headers/include/cuda/std/numeric +41 -0
  1301. cuda/cccl/headers/include/cuda/std/optional +31 -0
  1302. cuda/cccl/headers/include/cuda/std/ranges +70 -0
  1303. cuda/cccl/headers/include/cuda/std/ratio +416 -0
  1304. cuda/cccl/headers/include/cuda/std/semaphore +31 -0
  1305. cuda/cccl/headers/include/cuda/std/source_location +107 -0
  1306. cuda/cccl/headers/include/cuda/std/span +599 -0
  1307. cuda/cccl/headers/include/cuda/std/string_view +924 -0
  1308. cuda/cccl/headers/include/cuda/std/tuple +43 -0
  1309. cuda/cccl/headers/include/cuda/std/type_traits +176 -0
  1310. cuda/cccl/headers/include/cuda/std/utility +70 -0
  1311. cuda/cccl/headers/include/cuda/std/variant +32 -0
  1312. cuda/cccl/headers/include/cuda/std/version +240 -0
  1313. cuda/cccl/headers/include/cuda/stream +32 -0
  1314. cuda/cccl/headers/include/cuda/stream_ref +59 -0
  1315. cuda/cccl/headers/include/cuda/tma +25 -0
  1316. cuda/cccl/headers/include/cuda/type_traits +27 -0
  1317. cuda/cccl/headers/include/cuda/utility +28 -0
  1318. cuda/cccl/headers/include/cuda/version +16 -0
  1319. cuda/cccl/headers/include/cuda/warp +28 -0
  1320. cuda/cccl/headers/include/cuda/work_stealing +26 -0
  1321. cuda/cccl/headers/include/nv/detail/__preprocessor +169 -0
  1322. cuda/cccl/headers/include/nv/detail/__target_macros +739 -0
  1323. cuda/cccl/headers/include/nv/target +241 -0
  1324. cuda/cccl/headers/include/thrust/addressof.h +22 -0
  1325. cuda/cccl/headers/include/thrust/adjacent_difference.h +254 -0
  1326. cuda/cccl/headers/include/thrust/advance.h +60 -0
  1327. cuda/cccl/headers/include/thrust/allocate_unique.h +301 -0
  1328. cuda/cccl/headers/include/thrust/binary_search.h +1911 -0
  1329. cuda/cccl/headers/include/thrust/complex.h +859 -0
  1330. cuda/cccl/headers/include/thrust/copy.h +506 -0
  1331. cuda/cccl/headers/include/thrust/count.h +245 -0
  1332. cuda/cccl/headers/include/thrust/detail/adjacent_difference.inl +108 -0
  1333. cuda/cccl/headers/include/thrust/detail/alignment.h +81 -0
  1334. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +629 -0
  1335. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +193 -0
  1336. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +95 -0
  1337. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +79 -0
  1338. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +76 -0
  1339. cuda/cccl/headers/include/thrust/detail/allocator/no_throw_allocator.h +74 -0
  1340. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +113 -0
  1341. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +114 -0
  1342. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.h +77 -0
  1343. cuda/cccl/headers/include/thrust/detail/allocator_aware_execution_policy.h +96 -0
  1344. cuda/cccl/headers/include/thrust/detail/binary_search.inl +537 -0
  1345. cuda/cccl/headers/include/thrust/detail/caching_allocator.h +47 -0
  1346. cuda/cccl/headers/include/thrust/detail/complex/arithmetic.h +210 -0
  1347. cuda/cccl/headers/include/thrust/detail/complex/catrig.h +877 -0
  1348. cuda/cccl/headers/include/thrust/detail/complex/catrigf.h +591 -0
  1349. cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +234 -0
  1350. cuda/cccl/headers/include/thrust/detail/complex/ccoshf.h +162 -0
  1351. cuda/cccl/headers/include/thrust/detail/complex/cexp.h +194 -0
  1352. cuda/cccl/headers/include/thrust/detail/complex/cexpf.h +170 -0
  1353. cuda/cccl/headers/include/thrust/detail/complex/clog.h +222 -0
  1354. cuda/cccl/headers/include/thrust/detail/complex/clogf.h +209 -0
  1355. cuda/cccl/headers/include/thrust/detail/complex/complex.inl +263 -0
  1356. cuda/cccl/headers/include/thrust/detail/complex/cpow.h +53 -0
  1357. cuda/cccl/headers/include/thrust/detail/complex/cproj.h +75 -0
  1358. cuda/cccl/headers/include/thrust/detail/complex/csinh.h +228 -0
  1359. cuda/cccl/headers/include/thrust/detail/complex/csinhf.h +169 -0
  1360. cuda/cccl/headers/include/thrust/detail/complex/csqrt.h +178 -0
  1361. cuda/cccl/headers/include/thrust/detail/complex/csqrtf.h +174 -0
  1362. cuda/cccl/headers/include/thrust/detail/complex/ctanh.h +212 -0
  1363. cuda/cccl/headers/include/thrust/detail/complex/ctanhf.h +136 -0
  1364. cuda/cccl/headers/include/thrust/detail/complex/math_private.h +130 -0
  1365. cuda/cccl/headers/include/thrust/detail/complex/stream.h +73 -0
  1366. cuda/cccl/headers/include/thrust/detail/config/compiler.h +38 -0
  1367. cuda/cccl/headers/include/thrust/detail/config/config.h +43 -0
  1368. cuda/cccl/headers/include/thrust/detail/config/cpp_dialect.h +78 -0
  1369. cuda/cccl/headers/include/thrust/detail/config/device_system.h +57 -0
  1370. cuda/cccl/headers/include/thrust/detail/config/host_system.h +50 -0
  1371. cuda/cccl/headers/include/thrust/detail/config/memory_resource.h +41 -0
  1372. cuda/cccl/headers/include/thrust/detail/config/namespace.h +164 -0
  1373. cuda/cccl/headers/include/thrust/detail/config/simple_defines.h +48 -0
  1374. cuda/cccl/headers/include/thrust/detail/config.h +36 -0
  1375. cuda/cccl/headers/include/thrust/detail/contiguous_storage.h +227 -0
  1376. cuda/cccl/headers/include/thrust/detail/contiguous_storage.inl +272 -0
  1377. cuda/cccl/headers/include/thrust/detail/copy.h +70 -0
  1378. cuda/cccl/headers/include/thrust/detail/copy.inl +146 -0
  1379. cuda/cccl/headers/include/thrust/detail/copy_if.h +62 -0
  1380. cuda/cccl/headers/include/thrust/detail/copy_if.inl +114 -0
  1381. cuda/cccl/headers/include/thrust/detail/count.h +55 -0
  1382. cuda/cccl/headers/include/thrust/detail/count.inl +101 -0
  1383. cuda/cccl/headers/include/thrust/detail/device_ptr.inl +48 -0
  1384. cuda/cccl/headers/include/thrust/detail/equal.inl +105 -0
  1385. cuda/cccl/headers/include/thrust/detail/event_error.h +160 -0
  1386. cuda/cccl/headers/include/thrust/detail/execute_with_allocator.h +79 -0
  1387. cuda/cccl/headers/include/thrust/detail/execute_with_allocator_fwd.h +61 -0
  1388. cuda/cccl/headers/include/thrust/detail/execution_policy.h +120 -0
  1389. cuda/cccl/headers/include/thrust/detail/extrema.inl +196 -0
  1390. cuda/cccl/headers/include/thrust/detail/fill.inl +97 -0
  1391. cuda/cccl/headers/include/thrust/detail/find.inl +125 -0
  1392. cuda/cccl/headers/include/thrust/detail/for_each.inl +96 -0
  1393. cuda/cccl/headers/include/thrust/detail/function.h +49 -0
  1394. cuda/cccl/headers/include/thrust/detail/functional/actor.h +213 -0
  1395. cuda/cccl/headers/include/thrust/detail/functional/operators.h +384 -0
  1396. cuda/cccl/headers/include/thrust/detail/gather.inl +185 -0
  1397. cuda/cccl/headers/include/thrust/detail/generate.inl +97 -0
  1398. cuda/cccl/headers/include/thrust/detail/get_iterator_value.h +60 -0
  1399. cuda/cccl/headers/include/thrust/detail/inner_product.inl +130 -0
  1400. cuda/cccl/headers/include/thrust/detail/internal_functional.h +335 -0
  1401. cuda/cccl/headers/include/thrust/detail/logical.inl +125 -0
  1402. cuda/cccl/headers/include/thrust/detail/malloc_and_free.h +89 -0
  1403. cuda/cccl/headers/include/thrust/detail/malloc_and_free_fwd.h +45 -0
  1404. cuda/cccl/headers/include/thrust/detail/memory_algorithms.h +208 -0
  1405. cuda/cccl/headers/include/thrust/detail/merge.inl +288 -0
  1406. cuda/cccl/headers/include/thrust/detail/mismatch.inl +107 -0
  1407. cuda/cccl/headers/include/thrust/detail/nvtx_policy.h +41 -0
  1408. cuda/cccl/headers/include/thrust/detail/overlapped_copy.h +120 -0
  1409. cuda/cccl/headers/include/thrust/detail/partition.inl +390 -0
  1410. cuda/cccl/headers/include/thrust/detail/pointer.h +313 -0
  1411. cuda/cccl/headers/include/thrust/detail/preprocessor.h +652 -0
  1412. cuda/cccl/headers/include/thrust/detail/random_bijection.h +177 -0
  1413. cuda/cccl/headers/include/thrust/detail/range/head_flags.h +114 -0
  1414. cuda/cccl/headers/include/thrust/detail/range/tail_flags.h +128 -0
  1415. cuda/cccl/headers/include/thrust/detail/raw_pointer_cast.h +52 -0
  1416. cuda/cccl/headers/include/thrust/detail/raw_reference_cast.h +191 -0
  1417. cuda/cccl/headers/include/thrust/detail/reduce.inl +396 -0
  1418. cuda/cccl/headers/include/thrust/detail/reference.h +521 -0
  1419. cuda/cccl/headers/include/thrust/detail/reference_forward_declaration.h +35 -0
  1420. cuda/cccl/headers/include/thrust/detail/remove.inl +225 -0
  1421. cuda/cccl/headers/include/thrust/detail/replace.inl +243 -0
  1422. cuda/cccl/headers/include/thrust/detail/reverse.inl +100 -0
  1423. cuda/cccl/headers/include/thrust/detail/scan.inl +536 -0
  1424. cuda/cccl/headers/include/thrust/detail/scatter.inl +169 -0
  1425. cuda/cccl/headers/include/thrust/detail/seq.h +66 -0
  1426. cuda/cccl/headers/include/thrust/detail/sequence.inl +121 -0
  1427. cuda/cccl/headers/include/thrust/detail/set_operations.inl +993 -0
  1428. cuda/cccl/headers/include/thrust/detail/shuffle.inl +86 -0
  1429. cuda/cccl/headers/include/thrust/detail/sort.inl +385 -0
  1430. cuda/cccl/headers/include/thrust/detail/static_assert.h +56 -0
  1431. cuda/cccl/headers/include/thrust/detail/static_map.h +164 -0
  1432. cuda/cccl/headers/include/thrust/detail/swap_ranges.inl +77 -0
  1433. cuda/cccl/headers/include/thrust/detail/tabulate.inl +74 -0
  1434. cuda/cccl/headers/include/thrust/detail/temporary_array.h +150 -0
  1435. cuda/cccl/headers/include/thrust/detail/temporary_array.inl +121 -0
  1436. cuda/cccl/headers/include/thrust/detail/temporary_buffer.h +93 -0
  1437. cuda/cccl/headers/include/thrust/detail/transform_reduce.inl +81 -0
  1438. cuda/cccl/headers/include/thrust/detail/transform_scan.inl +173 -0
  1439. cuda/cccl/headers/include/thrust/detail/trivial_sequence.h +128 -0
  1440. cuda/cccl/headers/include/thrust/detail/tuple_meta_transform.h +59 -0
  1441. cuda/cccl/headers/include/thrust/detail/type_deduction.h +61 -0
  1442. cuda/cccl/headers/include/thrust/detail/type_traits/has_member_function.h +50 -0
  1443. cuda/cccl/headers/include/thrust/detail/type_traits/has_nested_type.h +43 -0
  1444. cuda/cccl/headers/include/thrust/detail/type_traits/is_call_possible.h +162 -0
  1445. cuda/cccl/headers/include/thrust/detail/type_traits/is_commutative.h +67 -0
  1446. cuda/cccl/headers/include/thrust/detail/type_traits/is_metafunction_defined.h +39 -0
  1447. cuda/cccl/headers/include/thrust/detail/type_traits/is_thrust_pointer.h +55 -0
  1448. cuda/cccl/headers/include/thrust/detail/type_traits/iterator/is_output_iterator.h +48 -0
  1449. cuda/cccl/headers/include/thrust/detail/type_traits/minimum_type.h +91 -0
  1450. cuda/cccl/headers/include/thrust/detail/type_traits/pointer_traits.h +332 -0
  1451. cuda/cccl/headers/include/thrust/detail/type_traits.h +143 -0
  1452. cuda/cccl/headers/include/thrust/detail/uninitialized_copy.inl +102 -0
  1453. cuda/cccl/headers/include/thrust/detail/uninitialized_fill.inl +97 -0
  1454. cuda/cccl/headers/include/thrust/detail/unique.inl +391 -0
  1455. cuda/cccl/headers/include/thrust/detail/use_default.h +34 -0
  1456. cuda/cccl/headers/include/thrust/detail/vector_base.h +613 -0
  1457. cuda/cccl/headers/include/thrust/detail/vector_base.inl +1216 -0
  1458. cuda/cccl/headers/include/thrust/device_allocator.h +134 -0
  1459. cuda/cccl/headers/include/thrust/device_delete.h +74 -0
  1460. cuda/cccl/headers/include/thrust/device_free.h +85 -0
  1461. cuda/cccl/headers/include/thrust/device_make_unique.h +56 -0
  1462. cuda/cccl/headers/include/thrust/device_malloc.h +84 -0
  1463. cuda/cccl/headers/include/thrust/device_malloc_allocator.h +190 -0
  1464. cuda/cccl/headers/include/thrust/device_new.h +112 -0
  1465. cuda/cccl/headers/include/thrust/device_new_allocator.h +179 -0
  1466. cuda/cccl/headers/include/thrust/device_ptr.h +196 -0
  1467. cuda/cccl/headers/include/thrust/device_reference.h +983 -0
  1468. cuda/cccl/headers/include/thrust/device_vector.h +576 -0
  1469. cuda/cccl/headers/include/thrust/distance.h +44 -0
  1470. cuda/cccl/headers/include/thrust/equal.h +247 -0
  1471. cuda/cccl/headers/include/thrust/execution_policy.h +252 -0
  1472. cuda/cccl/headers/include/thrust/extrema.h +657 -0
  1473. cuda/cccl/headers/include/thrust/fill.h +200 -0
  1474. cuda/cccl/headers/include/thrust/find.h +382 -0
  1475. cuda/cccl/headers/include/thrust/for_each.h +261 -0
  1476. cuda/cccl/headers/include/thrust/functional.h +399 -0
  1477. cuda/cccl/headers/include/thrust/gather.h +464 -0
  1478. cuda/cccl/headers/include/thrust/generate.h +193 -0
  1479. cuda/cccl/headers/include/thrust/host_vector.h +576 -0
  1480. cuda/cccl/headers/include/thrust/inner_product.h +264 -0
  1481. cuda/cccl/headers/include/thrust/iterator/constant_iterator.h +221 -0
  1482. cuda/cccl/headers/include/thrust/iterator/counting_iterator.h +338 -0
  1483. cuda/cccl/headers/include/thrust/iterator/detail/any_assign.h +48 -0
  1484. cuda/cccl/headers/include/thrust/iterator/detail/any_system_tag.h +43 -0
  1485. cuda/cccl/headers/include/thrust/iterator/detail/device_system_tag.h +43 -0
  1486. cuda/cccl/headers/include/thrust/iterator/detail/host_system_tag.h +42 -0
  1487. cuda/cccl/headers/include/thrust/iterator/detail/iterator_adaptor_base.h +83 -0
  1488. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_system.h +60 -0
  1489. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_traversal.h +65 -0
  1490. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h +56 -0
  1491. cuda/cccl/headers/include/thrust/iterator/detail/iterator_facade_category.h +184 -0
  1492. cuda/cccl/headers/include/thrust/iterator/detail/minimum_system.h +55 -0
  1493. cuda/cccl/headers/include/thrust/iterator/detail/normal_iterator.h +69 -0
  1494. cuda/cccl/headers/include/thrust/iterator/detail/retag.h +104 -0
  1495. cuda/cccl/headers/include/thrust/iterator/detail/tagged_iterator.h +80 -0
  1496. cuda/cccl/headers/include/thrust/iterator/detail/tuple_of_iterator_references.h +171 -0
  1497. cuda/cccl/headers/include/thrust/iterator/discard_iterator.h +163 -0
  1498. cuda/cccl/headers/include/thrust/iterator/iterator_adaptor.h +251 -0
  1499. cuda/cccl/headers/include/thrust/iterator/iterator_categories.h +211 -0
  1500. cuda/cccl/headers/include/thrust/iterator/iterator_facade.h +659 -0
  1501. cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +334 -0
  1502. cuda/cccl/headers/include/thrust/iterator/iterator_traversal_tags.h +64 -0
  1503. cuda/cccl/headers/include/thrust/iterator/offset_iterator.h +194 -0
  1504. cuda/cccl/headers/include/thrust/iterator/permutation_iterator.h +204 -0
  1505. cuda/cccl/headers/include/thrust/iterator/retag.h +72 -0
  1506. cuda/cccl/headers/include/thrust/iterator/reverse_iterator.h +51 -0
  1507. cuda/cccl/headers/include/thrust/iterator/shuffle_iterator.h +190 -0
  1508. cuda/cccl/headers/include/thrust/iterator/strided_iterator.h +152 -0
  1509. cuda/cccl/headers/include/thrust/iterator/tabulate_output_iterator.h +152 -0
  1510. cuda/cccl/headers/include/thrust/iterator/transform_input_output_iterator.h +225 -0
  1511. cuda/cccl/headers/include/thrust/iterator/transform_iterator.h +353 -0
  1512. cuda/cccl/headers/include/thrust/iterator/transform_output_iterator.h +190 -0
  1513. cuda/cccl/headers/include/thrust/iterator/zip_iterator.h +365 -0
  1514. cuda/cccl/headers/include/thrust/logical.h +290 -0
  1515. cuda/cccl/headers/include/thrust/memory.h +299 -0
  1516. cuda/cccl/headers/include/thrust/merge.h +726 -0
  1517. cuda/cccl/headers/include/thrust/mismatch.h +262 -0
  1518. cuda/cccl/headers/include/thrust/mr/allocator.h +227 -0
  1519. cuda/cccl/headers/include/thrust/mr/device_memory_resource.h +46 -0
  1520. cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +526 -0
  1521. cuda/cccl/headers/include/thrust/mr/disjoint_sync_pool.h +116 -0
  1522. cuda/cccl/headers/include/thrust/mr/disjoint_tls_pool.h +65 -0
  1523. cuda/cccl/headers/include/thrust/mr/fancy_pointer_resource.h +65 -0
  1524. cuda/cccl/headers/include/thrust/mr/host_memory_resource.h +42 -0
  1525. cuda/cccl/headers/include/thrust/mr/memory_resource.h +215 -0
  1526. cuda/cccl/headers/include/thrust/mr/new.h +98 -0
  1527. cuda/cccl/headers/include/thrust/mr/polymorphic_adaptor.h +61 -0
  1528. cuda/cccl/headers/include/thrust/mr/pool.h +526 -0
  1529. cuda/cccl/headers/include/thrust/mr/pool_options.h +172 -0
  1530. cuda/cccl/headers/include/thrust/mr/sync_pool.h +112 -0
  1531. cuda/cccl/headers/include/thrust/mr/tls_pool.h +62 -0
  1532. cuda/cccl/headers/include/thrust/mr/universal_memory_resource.h +29 -0
  1533. cuda/cccl/headers/include/thrust/mr/validator.h +54 -0
  1534. cuda/cccl/headers/include/thrust/pair.h +102 -0
  1535. cuda/cccl/headers/include/thrust/partition.h +1392 -0
  1536. cuda/cccl/headers/include/thrust/per_device_resource.h +110 -0
  1537. cuda/cccl/headers/include/thrust/random/detail/discard_block_engine.inl +182 -0
  1538. cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine.inl +153 -0
  1539. cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine_discard.h +97 -0
  1540. cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine.inl +149 -0
  1541. cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h +46 -0
  1542. cuda/cccl/headers/include/thrust/random/detail/mod.h +94 -0
  1543. cuda/cccl/headers/include/thrust/random/detail/normal_distribution.inl +185 -0
  1544. cuda/cccl/headers/include/thrust/random/detail/normal_distribution_base.h +157 -0
  1545. cuda/cccl/headers/include/thrust/random/detail/random_core_access.h +56 -0
  1546. cuda/cccl/headers/include/thrust/random/detail/subtract_with_carry_engine.inl +199 -0
  1547. cuda/cccl/headers/include/thrust/random/detail/uniform_int_distribution.inl +196 -0
  1548. cuda/cccl/headers/include/thrust/random/detail/uniform_real_distribution.inl +198 -0
  1549. cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine.inl +181 -0
  1550. cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine_max.h +186 -0
  1551. cuda/cccl/headers/include/thrust/random/discard_block_engine.h +238 -0
  1552. cuda/cccl/headers/include/thrust/random/linear_congruential_engine.h +287 -0
  1553. cuda/cccl/headers/include/thrust/random/linear_feedback_shift_engine.h +215 -0
  1554. cuda/cccl/headers/include/thrust/random/normal_distribution.h +256 -0
  1555. cuda/cccl/headers/include/thrust/random/subtract_with_carry_engine.h +245 -0
  1556. cuda/cccl/headers/include/thrust/random/uniform_int_distribution.h +259 -0
  1557. cuda/cccl/headers/include/thrust/random/uniform_real_distribution.h +257 -0
  1558. cuda/cccl/headers/include/thrust/random/xor_combine_engine.h +253 -0
  1559. cuda/cccl/headers/include/thrust/random.h +118 -0
  1560. cuda/cccl/headers/include/thrust/reduce.h +1114 -0
  1561. cuda/cccl/headers/include/thrust/remove.h +768 -0
  1562. cuda/cccl/headers/include/thrust/replace.h +826 -0
  1563. cuda/cccl/headers/include/thrust/reverse.h +215 -0
  1564. cuda/cccl/headers/include/thrust/scan.h +1671 -0
  1565. cuda/cccl/headers/include/thrust/scatter.h +446 -0
  1566. cuda/cccl/headers/include/thrust/sequence.h +277 -0
  1567. cuda/cccl/headers/include/thrust/set_operations.h +3027 -0
  1568. cuda/cccl/headers/include/thrust/shuffle.h +182 -0
  1569. cuda/cccl/headers/include/thrust/sort.h +1320 -0
  1570. cuda/cccl/headers/include/thrust/swap.h +147 -0
  1571. cuda/cccl/headers/include/thrust/system/cpp/detail/adjacent_difference.h +30 -0
  1572. cuda/cccl/headers/include/thrust/system/cpp/detail/assign_value.h +30 -0
  1573. cuda/cccl/headers/include/thrust/system/cpp/detail/binary_search.h +32 -0
  1574. cuda/cccl/headers/include/thrust/system/cpp/detail/copy.h +30 -0
  1575. cuda/cccl/headers/include/thrust/system/cpp/detail/copy_if.h +30 -0
  1576. cuda/cccl/headers/include/thrust/system/cpp/detail/count.h +29 -0
  1577. cuda/cccl/headers/include/thrust/system/cpp/detail/equal.h +29 -0
  1578. cuda/cccl/headers/include/thrust/system/cpp/detail/execution_policy.h +113 -0
  1579. cuda/cccl/headers/include/thrust/system/cpp/detail/extrema.h +30 -0
  1580. cuda/cccl/headers/include/thrust/system/cpp/detail/fill.h +29 -0
  1581. cuda/cccl/headers/include/thrust/system/cpp/detail/find.h +30 -0
  1582. cuda/cccl/headers/include/thrust/system/cpp/detail/for_each.h +30 -0
  1583. cuda/cccl/headers/include/thrust/system/cpp/detail/gather.h +29 -0
  1584. cuda/cccl/headers/include/thrust/system/cpp/detail/generate.h +29 -0
  1585. cuda/cccl/headers/include/thrust/system/cpp/detail/get_value.h +30 -0
  1586. cuda/cccl/headers/include/thrust/system/cpp/detail/inner_product.h +29 -0
  1587. cuda/cccl/headers/include/thrust/system/cpp/detail/iter_swap.h +30 -0
  1588. cuda/cccl/headers/include/thrust/system/cpp/detail/logical.h +29 -0
  1589. cuda/cccl/headers/include/thrust/system/cpp/detail/malloc_and_free.h +30 -0
  1590. cuda/cccl/headers/include/thrust/system/cpp/detail/memory.inl +55 -0
  1591. cuda/cccl/headers/include/thrust/system/cpp/detail/merge.h +30 -0
  1592. cuda/cccl/headers/include/thrust/system/cpp/detail/mismatch.h +29 -0
  1593. cuda/cccl/headers/include/thrust/system/cpp/detail/partition.h +30 -0
  1594. cuda/cccl/headers/include/thrust/system/cpp/detail/per_device_resource.h +29 -0
  1595. cuda/cccl/headers/include/thrust/system/cpp/detail/reduce.h +30 -0
  1596. cuda/cccl/headers/include/thrust/system/cpp/detail/reduce_by_key.h +30 -0
  1597. cuda/cccl/headers/include/thrust/system/cpp/detail/remove.h +30 -0
  1598. cuda/cccl/headers/include/thrust/system/cpp/detail/replace.h +29 -0
  1599. cuda/cccl/headers/include/thrust/system/cpp/detail/reverse.h +29 -0
  1600. cuda/cccl/headers/include/thrust/system/cpp/detail/scan.h +30 -0
  1601. cuda/cccl/headers/include/thrust/system/cpp/detail/scan_by_key.h +30 -0
  1602. cuda/cccl/headers/include/thrust/system/cpp/detail/scatter.h +29 -0
  1603. cuda/cccl/headers/include/thrust/system/cpp/detail/sequence.h +29 -0
  1604. cuda/cccl/headers/include/thrust/system/cpp/detail/set_operations.h +30 -0
  1605. cuda/cccl/headers/include/thrust/system/cpp/detail/sort.h +30 -0
  1606. cuda/cccl/headers/include/thrust/system/cpp/detail/swap_ranges.h +29 -0
  1607. cuda/cccl/headers/include/thrust/system/cpp/detail/tabulate.h +29 -0
  1608. cuda/cccl/headers/include/thrust/system/cpp/detail/temporary_buffer.h +29 -0
  1609. cuda/cccl/headers/include/thrust/system/cpp/detail/transform.h +29 -0
  1610. cuda/cccl/headers/include/thrust/system/cpp/detail/transform_reduce.h +29 -0
  1611. cuda/cccl/headers/include/thrust/system/cpp/detail/transform_scan.h +29 -0
  1612. cuda/cccl/headers/include/thrust/system/cpp/detail/uninitialized_copy.h +29 -0
  1613. cuda/cccl/headers/include/thrust/system/cpp/detail/uninitialized_fill.h +29 -0
  1614. cuda/cccl/headers/include/thrust/system/cpp/detail/unique.h +30 -0
  1615. cuda/cccl/headers/include/thrust/system/cpp/detail/unique_by_key.h +30 -0
  1616. cuda/cccl/headers/include/thrust/system/cpp/execution_policy.h +63 -0
  1617. cuda/cccl/headers/include/thrust/system/cpp/memory.h +105 -0
  1618. cuda/cccl/headers/include/thrust/system/cpp/memory_resource.h +70 -0
  1619. cuda/cccl/headers/include/thrust/system/cpp/pointer.h +118 -0
  1620. cuda/cccl/headers/include/thrust/system/cpp/vector.h +95 -0
  1621. cuda/cccl/headers/include/thrust/system/cuda/config.h +126 -0
  1622. cuda/cccl/headers/include/thrust/system/cuda/detail/adjacent_difference.h +218 -0
  1623. cuda/cccl/headers/include/thrust/system/cuda/detail/assign_value.h +124 -0
  1624. cuda/cccl/headers/include/thrust/system/cuda/detail/binary_search.h +29 -0
  1625. cuda/cccl/headers/include/thrust/system/cuda/detail/cdp_dispatch.h +72 -0
  1626. cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +272 -0
  1627. cuda/cccl/headers/include/thrust/system/cuda/detail/copy_if.h +251 -0
  1628. cuda/cccl/headers/include/thrust/system/cuda/detail/core/agent_launcher.h +280 -0
  1629. cuda/cccl/headers/include/thrust/system/cuda/detail/core/triple_chevron_launch.h +162 -0
  1630. cuda/cccl/headers/include/thrust/system/cuda/detail/core/util.h +578 -0
  1631. cuda/cccl/headers/include/thrust/system/cuda/detail/count.h +75 -0
  1632. cuda/cccl/headers/include/thrust/system/cuda/detail/cross_system.h +241 -0
  1633. cuda/cccl/headers/include/thrust/system/cuda/detail/dispatch.h +230 -0
  1634. cuda/cccl/headers/include/thrust/system/cuda/detail/equal.h +62 -0
  1635. cuda/cccl/headers/include/thrust/system/cuda/detail/error.inl +87 -0
  1636. cuda/cccl/headers/include/thrust/system/cuda/detail/execution_policy.h +266 -0
  1637. cuda/cccl/headers/include/thrust/system/cuda/detail/extrema.h +473 -0
  1638. cuda/cccl/headers/include/thrust/system/cuda/detail/fill.h +99 -0
  1639. cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +165 -0
  1640. cuda/cccl/headers/include/thrust/system/cuda/detail/for_each.h +83 -0
  1641. cuda/cccl/headers/include/thrust/system/cuda/detail/gather.h +89 -0
  1642. cuda/cccl/headers/include/thrust/system/cuda/detail/generate.h +59 -0
  1643. cuda/cccl/headers/include/thrust/system/cuda/detail/get_value.h +65 -0
  1644. cuda/cccl/headers/include/thrust/system/cuda/detail/inner_product.h +77 -0
  1645. cuda/cccl/headers/include/thrust/system/cuda/detail/iter_swap.h +79 -0
  1646. cuda/cccl/headers/include/thrust/system/cuda/detail/logical.h +29 -0
  1647. cuda/cccl/headers/include/thrust/system/cuda/detail/make_unsigned_special.h +55 -0
  1648. cuda/cccl/headers/include/thrust/system/cuda/detail/malloc_and_free.h +119 -0
  1649. cuda/cccl/headers/include/thrust/system/cuda/detail/memory.inl +55 -0
  1650. cuda/cccl/headers/include/thrust/system/cuda/detail/merge.h +205 -0
  1651. cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +88 -0
  1652. cuda/cccl/headers/include/thrust/system/cuda/detail/parallel_for.h +79 -0
  1653. cuda/cccl/headers/include/thrust/system/cuda/detail/partition.h +401 -0
  1654. cuda/cccl/headers/include/thrust/system/cuda/detail/per_device_resource.h +70 -0
  1655. cuda/cccl/headers/include/thrust/system/cuda/detail/reduce.h +774 -0
  1656. cuda/cccl/headers/include/thrust/system/cuda/detail/reduce_by_key.h +994 -0
  1657. cuda/cccl/headers/include/thrust/system/cuda/detail/remove.h +105 -0
  1658. cuda/cccl/headers/include/thrust/system/cuda/detail/replace.h +121 -0
  1659. cuda/cccl/headers/include/thrust/system/cuda/detail/reverse.h +83 -0
  1660. cuda/cccl/headers/include/thrust/system/cuda/detail/scan.h +340 -0
  1661. cuda/cccl/headers/include/thrust/system/cuda/detail/scan_by_key.h +412 -0
  1662. cuda/cccl/headers/include/thrust/system/cuda/detail/scatter.h +90 -0
  1663. cuda/cccl/headers/include/thrust/system/cuda/detail/sequence.h +29 -0
  1664. cuda/cccl/headers/include/thrust/system/cuda/detail/set_operations.h +1722 -0
  1665. cuda/cccl/headers/include/thrust/system/cuda/detail/sort.h +473 -0
  1666. cuda/cccl/headers/include/thrust/system/cuda/detail/swap_ranges.h +99 -0
  1667. cuda/cccl/headers/include/thrust/system/cuda/detail/tabulate.h +62 -0
  1668. cuda/cccl/headers/include/thrust/system/cuda/detail/temporary_buffer.h +132 -0
  1669. cuda/cccl/headers/include/thrust/system/cuda/detail/terminate.h +53 -0
  1670. cuda/cccl/headers/include/thrust/system/cuda/detail/transform.h +428 -0
  1671. cuda/cccl/headers/include/thrust/system/cuda/detail/transform_reduce.h +139 -0
  1672. cuda/cccl/headers/include/thrust/system/cuda/detail/transform_scan.h +117 -0
  1673. cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_copy.h +114 -0
  1674. cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_fill.h +102 -0
  1675. cuda/cccl/headers/include/thrust/system/cuda/detail/unique.h +288 -0
  1676. cuda/cccl/headers/include/thrust/system/cuda/detail/unique_by_key.h +307 -0
  1677. cuda/cccl/headers/include/thrust/system/cuda/detail/util.h +253 -0
  1678. cuda/cccl/headers/include/thrust/system/cuda/error.h +159 -0
  1679. cuda/cccl/headers/include/thrust/system/cuda/execution_policy.h +15 -0
  1680. cuda/cccl/headers/include/thrust/system/cuda/memory.h +118 -0
  1681. cuda/cccl/headers/include/thrust/system/cuda/memory_resource.h +115 -0
  1682. cuda/cccl/headers/include/thrust/system/cuda/pointer.h +155 -0
  1683. cuda/cccl/headers/include/thrust/system/cuda/vector.h +104 -0
  1684. cuda/cccl/headers/include/thrust/system/detail/bad_alloc.h +59 -0
  1685. cuda/cccl/headers/include/thrust/system/detail/errno.h +118 -0
  1686. cuda/cccl/headers/include/thrust/system/detail/error_category.inl +298 -0
  1687. cuda/cccl/headers/include/thrust/system/detail/error_code.inl +171 -0
  1688. cuda/cccl/headers/include/thrust/system/detail/error_condition.inl +119 -0
  1689. cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.h +51 -0
  1690. cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.inl +77 -0
  1691. cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.h +159 -0
  1692. cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.inl +370 -0
  1693. cuda/cccl/headers/include/thrust/system/detail/generic/copy.h +43 -0
  1694. cuda/cccl/headers/include/thrust/system/detail/generic/copy.inl +62 -0
  1695. cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.h +56 -0
  1696. cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.inl +145 -0
  1697. cuda/cccl/headers/include/thrust/system/detail/generic/count.h +46 -0
  1698. cuda/cccl/headers/include/thrust/system/detail/generic/count.inl +82 -0
  1699. cuda/cccl/headers/include/thrust/system/detail/generic/equal.h +47 -0
  1700. cuda/cccl/headers/include/thrust/system/detail/generic/equal.inl +58 -0
  1701. cuda/cccl/headers/include/thrust/system/detail/generic/extrema.h +65 -0
  1702. cuda/cccl/headers/include/thrust/system/detail/generic/extrema.inl +246 -0
  1703. cuda/cccl/headers/include/thrust/system/detail/generic/fill.h +52 -0
  1704. cuda/cccl/headers/include/thrust/system/detail/generic/find.h +47 -0
  1705. cuda/cccl/headers/include/thrust/system/detail/generic/find.inl +135 -0
  1706. cuda/cccl/headers/include/thrust/system/detail/generic/for_each.h +56 -0
  1707. cuda/cccl/headers/include/thrust/system/detail/generic/gather.h +71 -0
  1708. cuda/cccl/headers/include/thrust/system/detail/generic/gather.inl +94 -0
  1709. cuda/cccl/headers/include/thrust/system/detail/generic/generate.h +43 -0
  1710. cuda/cccl/headers/include/thrust/system/detail/generic/generate.inl +64 -0
  1711. cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.h +58 -0
  1712. cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.inl +70 -0
  1713. cuda/cccl/headers/include/thrust/system/detail/generic/logical.h +57 -0
  1714. cuda/cccl/headers/include/thrust/system/detail/generic/memory.h +61 -0
  1715. cuda/cccl/headers/include/thrust/system/detail/generic/memory.inl +83 -0
  1716. cuda/cccl/headers/include/thrust/system/detail/generic/merge.h +97 -0
  1717. cuda/cccl/headers/include/thrust/system/detail/generic/merge.inl +146 -0
  1718. cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.h +47 -0
  1719. cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.inl +67 -0
  1720. cuda/cccl/headers/include/thrust/system/detail/generic/partition.h +127 -0
  1721. cuda/cccl/headers/include/thrust/system/detail/generic/partition.inl +208 -0
  1722. cuda/cccl/headers/include/thrust/system/detail/generic/per_device_resource.h +41 -0
  1723. cuda/cccl/headers/include/thrust/system/detail/generic/reduce.h +69 -0
  1724. cuda/cccl/headers/include/thrust/system/detail/generic/reduce.inl +98 -0
  1725. cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.h +81 -0
  1726. cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.inl +183 -0
  1727. cuda/cccl/headers/include/thrust/system/detail/generic/remove.h +84 -0
  1728. cuda/cccl/headers/include/thrust/system/detail/generic/remove.inl +119 -0
  1729. cuda/cccl/headers/include/thrust/system/detail/generic/replace.h +93 -0
  1730. cuda/cccl/headers/include/thrust/system/detail/generic/replace.inl +172 -0
  1731. cuda/cccl/headers/include/thrust/system/detail/generic/reverse.h +46 -0
  1732. cuda/cccl/headers/include/thrust/system/detail/generic/reverse.inl +65 -0
  1733. cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.h +61 -0
  1734. cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.inl +125 -0
  1735. cuda/cccl/headers/include/thrust/system/detail/generic/scan.h +86 -0
  1736. cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.h +124 -0
  1737. cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.inl +229 -0
  1738. cuda/cccl/headers/include/thrust/system/detail/generic/scatter.h +71 -0
  1739. cuda/cccl/headers/include/thrust/system/detail/generic/scatter.inl +83 -0
  1740. cuda/cccl/headers/include/thrust/system/detail/generic/select_system.h +105 -0
  1741. cuda/cccl/headers/include/thrust/system/detail/generic/sequence.h +70 -0
  1742. cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.h +281 -0
  1743. cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.inl +474 -0
  1744. cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.h +52 -0
  1745. cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.inl +123 -0
  1746. cuda/cccl/headers/include/thrust/system/detail/generic/sort.h +111 -0
  1747. cuda/cccl/headers/include/thrust/system/detail/generic/sort.inl +176 -0
  1748. cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.h +42 -0
  1749. cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.inl +73 -0
  1750. cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.h +39 -0
  1751. cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.inl +54 -0
  1752. cuda/cccl/headers/include/thrust/system/detail/generic/tag.h +45 -0
  1753. cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.h +53 -0
  1754. cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.inl +81 -0
  1755. cuda/cccl/headers/include/thrust/system/detail/generic/transform.h +393 -0
  1756. cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.h +48 -0
  1757. cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.inl +54 -0
  1758. cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.h +78 -0
  1759. cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.inl +111 -0
  1760. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.h +43 -0
  1761. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.inl +164 -0
  1762. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.h +43 -0
  1763. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.inl +112 -0
  1764. cuda/cccl/headers/include/thrust/system/detail/generic/unique.h +69 -0
  1765. cuda/cccl/headers/include/thrust/system/detail/generic/unique.inl +112 -0
  1766. cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.h +80 -0
  1767. cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.inl +124 -0
  1768. cuda/cccl/headers/include/thrust/system/detail/internal/decompose.h +115 -0
  1769. cuda/cccl/headers/include/thrust/system/detail/sequential/adjacent_difference.h +68 -0
  1770. cuda/cccl/headers/include/thrust/system/detail/sequential/assign_value.h +40 -0
  1771. cuda/cccl/headers/include/thrust/system/detail/sequential/binary_search.h +136 -0
  1772. cuda/cccl/headers/include/thrust/system/detail/sequential/copy.h +120 -0
  1773. cuda/cccl/headers/include/thrust/system/detail/sequential/copy_backward.h +47 -0
  1774. cuda/cccl/headers/include/thrust/system/detail/sequential/copy_if.h +69 -0
  1775. cuda/cccl/headers/include/thrust/system/detail/sequential/count.h +29 -0
  1776. cuda/cccl/headers/include/thrust/system/detail/sequential/equal.h +29 -0
  1777. cuda/cccl/headers/include/thrust/system/detail/sequential/execution_policy.h +52 -0
  1778. cuda/cccl/headers/include/thrust/system/detail/sequential/extrema.h +109 -0
  1779. cuda/cccl/headers/include/thrust/system/detail/sequential/fill.h +29 -0
  1780. cuda/cccl/headers/include/thrust/system/detail/sequential/find.h +60 -0
  1781. cuda/cccl/headers/include/thrust/system/detail/sequential/for_each.h +72 -0
  1782. cuda/cccl/headers/include/thrust/system/detail/sequential/gather.h +29 -0
  1783. cuda/cccl/headers/include/thrust/system/detail/sequential/general_copy.h +120 -0
  1784. cuda/cccl/headers/include/thrust/system/detail/sequential/generate.h +29 -0
  1785. cuda/cccl/headers/include/thrust/system/detail/sequential/get_value.h +41 -0
  1786. cuda/cccl/headers/include/thrust/system/detail/sequential/inner_product.h +29 -0
  1787. cuda/cccl/headers/include/thrust/system/detail/sequential/insertion_sort.h +139 -0
  1788. cuda/cccl/headers/include/thrust/system/detail/sequential/iter_swap.h +43 -0
  1789. cuda/cccl/headers/include/thrust/system/detail/sequential/logical.h +29 -0
  1790. cuda/cccl/headers/include/thrust/system/detail/sequential/malloc_and_free.h +48 -0
  1791. cuda/cccl/headers/include/thrust/system/detail/sequential/merge.h +147 -0
  1792. cuda/cccl/headers/include/thrust/system/detail/sequential/mismatch.h +29 -0
  1793. cuda/cccl/headers/include/thrust/system/detail/sequential/partition.h +298 -0
  1794. cuda/cccl/headers/include/thrust/system/detail/sequential/per_device_resource.h +29 -0
  1795. cuda/cccl/headers/include/thrust/system/detail/sequential/reduce.h +62 -0
  1796. cuda/cccl/headers/include/thrust/system/detail/sequential/reduce_by_key.h +97 -0
  1797. cuda/cccl/headers/include/thrust/system/detail/sequential/remove.h +177 -0
  1798. cuda/cccl/headers/include/thrust/system/detail/sequential/replace.h +29 -0
  1799. cuda/cccl/headers/include/thrust/system/detail/sequential/reverse.h +29 -0
  1800. cuda/cccl/headers/include/thrust/system/detail/sequential/scan.h +152 -0
  1801. cuda/cccl/headers/include/thrust/system/detail/sequential/scan_by_key.h +143 -0
  1802. cuda/cccl/headers/include/thrust/system/detail/sequential/scatter.h +29 -0
  1803. cuda/cccl/headers/include/thrust/system/detail/sequential/sequence.h +29 -0
  1804. cuda/cccl/headers/include/thrust/system/detail/sequential/set_operations.h +204 -0
  1805. cuda/cccl/headers/include/thrust/system/detail/sequential/sort.h +120 -0
  1806. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.h +353 -0
  1807. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.h +121 -0
  1808. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.h +584 -0
  1809. cuda/cccl/headers/include/thrust/system/detail/sequential/swap_ranges.h +29 -0
  1810. cuda/cccl/headers/include/thrust/system/detail/sequential/tabulate.h +29 -0
  1811. cuda/cccl/headers/include/thrust/system/detail/sequential/temporary_buffer.h +29 -0
  1812. cuda/cccl/headers/include/thrust/system/detail/sequential/transform.h +29 -0
  1813. cuda/cccl/headers/include/thrust/system/detail/sequential/transform_reduce.h +29 -0
  1814. cuda/cccl/headers/include/thrust/system/detail/sequential/transform_scan.h +29 -0
  1815. cuda/cccl/headers/include/thrust/system/detail/sequential/trivial_copy.h +56 -0
  1816. cuda/cccl/headers/include/thrust/system/detail/sequential/uninitialized_copy.h +29 -0
  1817. cuda/cccl/headers/include/thrust/system/detail/sequential/uninitialized_fill.h +29 -0
  1818. cuda/cccl/headers/include/thrust/system/detail/sequential/unique.h +112 -0
  1819. cuda/cccl/headers/include/thrust/system/detail/sequential/unique_by_key.h +105 -0
  1820. cuda/cccl/headers/include/thrust/system/detail/system_error.inl +106 -0
  1821. cuda/cccl/headers/include/thrust/system/error_code.h +508 -0
  1822. cuda/cccl/headers/include/thrust/system/omp/detail/adjacent_difference.h +34 -0
  1823. cuda/cccl/headers/include/thrust/system/omp/detail/assign_value.h +17 -0
  1824. cuda/cccl/headers/include/thrust/system/omp/detail/binary_search.h +57 -0
  1825. cuda/cccl/headers/include/thrust/system/omp/detail/copy.h +61 -0
  1826. cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.h +39 -0
  1827. cuda/cccl/headers/include/thrust/system/omp/detail/count.h +17 -0
  1828. cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.h +48 -0
  1829. cuda/cccl/headers/include/thrust/system/omp/detail/equal.h +17 -0
  1830. cuda/cccl/headers/include/thrust/system/omp/detail/execution_policy.h +132 -0
  1831. cuda/cccl/headers/include/thrust/system/omp/detail/extrema.h +46 -0
  1832. cuda/cccl/headers/include/thrust/system/omp/detail/fill.h +17 -0
  1833. cuda/cccl/headers/include/thrust/system/omp/detail/find.h +33 -0
  1834. cuda/cccl/headers/include/thrust/system/omp/detail/for_each.h +74 -0
  1835. cuda/cccl/headers/include/thrust/system/omp/detail/gather.h +17 -0
  1836. cuda/cccl/headers/include/thrust/system/omp/detail/generate.h +17 -0
  1837. cuda/cccl/headers/include/thrust/system/omp/detail/get_value.h +17 -0
  1838. cuda/cccl/headers/include/thrust/system/omp/detail/inner_product.h +17 -0
  1839. cuda/cccl/headers/include/thrust/system/omp/detail/iter_swap.h +17 -0
  1840. cuda/cccl/headers/include/thrust/system/omp/detail/logical.h +17 -0
  1841. cuda/cccl/headers/include/thrust/system/omp/detail/malloc_and_free.h +17 -0
  1842. cuda/cccl/headers/include/thrust/system/omp/detail/merge.h +17 -0
  1843. cuda/cccl/headers/include/thrust/system/omp/detail/mismatch.h +17 -0
  1844. cuda/cccl/headers/include/thrust/system/omp/detail/partition.h +84 -0
  1845. cuda/cccl/headers/include/thrust/system/omp/detail/per_device_resource.h +16 -0
  1846. cuda/cccl/headers/include/thrust/system/omp/detail/pragma_omp.h +30 -0
  1847. cuda/cccl/headers/include/thrust/system/omp/detail/reduce.h +63 -0
  1848. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.h +48 -0
  1849. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.h +87 -0
  1850. cuda/cccl/headers/include/thrust/system/omp/detail/remove.h +67 -0
  1851. cuda/cccl/headers/include/thrust/system/omp/detail/replace.h +17 -0
  1852. cuda/cccl/headers/include/thrust/system/omp/detail/reverse.h +17 -0
  1853. cuda/cccl/headers/include/thrust/system/omp/detail/scan.h +216 -0
  1854. cuda/cccl/headers/include/thrust/system/omp/detail/scan_by_key.h +23 -0
  1855. cuda/cccl/headers/include/thrust/system/omp/detail/scatter.h +17 -0
  1856. cuda/cccl/headers/include/thrust/system/omp/detail/sequence.h +17 -0
  1857. cuda/cccl/headers/include/thrust/system/omp/detail/set_operations.h +17 -0
  1858. cuda/cccl/headers/include/thrust/system/omp/detail/sort.h +245 -0
  1859. cuda/cccl/headers/include/thrust/system/omp/detail/swap_ranges.h +17 -0
  1860. cuda/cccl/headers/include/thrust/system/omp/detail/tabulate.h +17 -0
  1861. cuda/cccl/headers/include/thrust/system/omp/detail/temporary_buffer.h +16 -0
  1862. cuda/cccl/headers/include/thrust/system/omp/detail/transform.h +17 -0
  1863. cuda/cccl/headers/include/thrust/system/omp/detail/transform_reduce.h +17 -0
  1864. cuda/cccl/headers/include/thrust/system/omp/detail/transform_scan.h +17 -0
  1865. cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_copy.h +17 -0
  1866. cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_fill.h +17 -0
  1867. cuda/cccl/headers/include/thrust/system/omp/detail/unique.h +52 -0
  1868. cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.h +56 -0
  1869. cuda/cccl/headers/include/thrust/system/omp/execution_policy.h +62 -0
  1870. cuda/cccl/headers/include/thrust/system/omp/memory.h +153 -0
  1871. cuda/cccl/headers/include/thrust/system/omp/memory_resource.h +71 -0
  1872. cuda/cccl/headers/include/thrust/system/omp/pointer.h +120 -0
  1873. cuda/cccl/headers/include/thrust/system/omp/vector.h +96 -0
  1874. cuda/cccl/headers/include/thrust/system/system_error.h +183 -0
  1875. cuda/cccl/headers/include/thrust/system/tbb/detail/adjacent_difference.h +33 -0
  1876. cuda/cccl/headers/include/thrust/system/tbb/detail/assign_value.h +17 -0
  1877. cuda/cccl/headers/include/thrust/system/tbb/detail/binary_search.h +17 -0
  1878. cuda/cccl/headers/include/thrust/system/tbb/detail/copy.h +59 -0
  1879. cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.h +117 -0
  1880. cuda/cccl/headers/include/thrust/system/tbb/detail/count.h +17 -0
  1881. cuda/cccl/headers/include/thrust/system/tbb/detail/equal.h +17 -0
  1882. cuda/cccl/headers/include/thrust/system/tbb/detail/execution_policy.h +113 -0
  1883. cuda/cccl/headers/include/thrust/system/tbb/detail/extrema.h +45 -0
  1884. cuda/cccl/headers/include/thrust/system/tbb/detail/fill.h +17 -0
  1885. cuda/cccl/headers/include/thrust/system/tbb/detail/find.h +28 -0
  1886. cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.h +72 -0
  1887. cuda/cccl/headers/include/thrust/system/tbb/detail/gather.h +17 -0
  1888. cuda/cccl/headers/include/thrust/system/tbb/detail/generate.h +17 -0
  1889. cuda/cccl/headers/include/thrust/system/tbb/detail/get_value.h +17 -0
  1890. cuda/cccl/headers/include/thrust/system/tbb/detail/inner_product.h +17 -0
  1891. cuda/cccl/headers/include/thrust/system/tbb/detail/iter_swap.h +17 -0
  1892. cuda/cccl/headers/include/thrust/system/tbb/detail/logical.h +17 -0
  1893. cuda/cccl/headers/include/thrust/system/tbb/detail/malloc_and_free.h +17 -0
  1894. cuda/cccl/headers/include/thrust/system/tbb/detail/merge.h +302 -0
  1895. cuda/cccl/headers/include/thrust/system/tbb/detail/mismatch.h +17 -0
  1896. cuda/cccl/headers/include/thrust/system/tbb/detail/partition.h +79 -0
  1897. cuda/cccl/headers/include/thrust/system/tbb/detail/per_device_resource.h +16 -0
  1898. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.h +121 -0
  1899. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.h +378 -0
  1900. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_intervals.h +118 -0
  1901. cuda/cccl/headers/include/thrust/system/tbb/detail/remove.h +66 -0
  1902. cuda/cccl/headers/include/thrust/system/tbb/detail/replace.h +17 -0
  1903. cuda/cccl/headers/include/thrust/system/tbb/detail/reverse.h +17 -0
  1904. cuda/cccl/headers/include/thrust/system/tbb/detail/scan.h +294 -0
  1905. cuda/cccl/headers/include/thrust/system/tbb/detail/scan_by_key.h +20 -0
  1906. cuda/cccl/headers/include/thrust/system/tbb/detail/scatter.h +17 -0
  1907. cuda/cccl/headers/include/thrust/system/tbb/detail/sequence.h +17 -0
  1908. cuda/cccl/headers/include/thrust/system/tbb/detail/set_operations.h +17 -0
  1909. cuda/cccl/headers/include/thrust/system/tbb/detail/sort.h +273 -0
  1910. cuda/cccl/headers/include/thrust/system/tbb/detail/swap_ranges.h +17 -0
  1911. cuda/cccl/headers/include/thrust/system/tbb/detail/tabulate.h +17 -0
  1912. cuda/cccl/headers/include/thrust/system/tbb/detail/temporary_buffer.h +16 -0
  1913. cuda/cccl/headers/include/thrust/system/tbb/detail/transform.h +17 -0
  1914. cuda/cccl/headers/include/thrust/system/tbb/detail/transform_reduce.h +17 -0
  1915. cuda/cccl/headers/include/thrust/system/tbb/detail/transform_scan.h +17 -0
  1916. cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_copy.h +17 -0
  1917. cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_fill.h +17 -0
  1918. cuda/cccl/headers/include/thrust/system/tbb/detail/unique.h +49 -0
  1919. cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.h +55 -0
  1920. cuda/cccl/headers/include/thrust/system/tbb/execution_policy.h +62 -0
  1921. cuda/cccl/headers/include/thrust/system/tbb/memory.h +139 -0
  1922. cuda/cccl/headers/include/thrust/system/tbb/memory_resource.h +57 -0
  1923. cuda/cccl/headers/include/thrust/system/tbb/pointer.h +106 -0
  1924. cuda/cccl/headers/include/thrust/system/tbb/vector.h +82 -0
  1925. cuda/cccl/headers/include/thrust/system_error.h +57 -0
  1926. cuda/cccl/headers/include/thrust/tabulate.h +125 -0
  1927. cuda/cccl/headers/include/thrust/transform.h +1056 -0
  1928. cuda/cccl/headers/include/thrust/transform_reduce.h +190 -0
  1929. cuda/cccl/headers/include/thrust/transform_scan.h +442 -0
  1930. cuda/cccl/headers/include/thrust/tuple.h +139 -0
  1931. cuda/cccl/headers/include/thrust/type_traits/integer_sequence.h +256 -0
  1932. cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +156 -0
  1933. cuda/cccl/headers/include/thrust/type_traits/is_execution_policy.h +65 -0
  1934. cuda/cccl/headers/include/thrust/type_traits/is_operator_less_or_greater_function_object.h +180 -0
  1935. cuda/cccl/headers/include/thrust/type_traits/is_operator_plus_function_object.h +112 -0
  1936. cuda/cccl/headers/include/thrust/type_traits/is_trivially_relocatable.h +333 -0
  1937. cuda/cccl/headers/include/thrust/type_traits/logical_metafunctions.h +42 -0
  1938. cuda/cccl/headers/include/thrust/type_traits/unwrap_contiguous_iterator.h +63 -0
  1939. cuda/cccl/headers/include/thrust/uninitialized_copy.h +300 -0
  1940. cuda/cccl/headers/include/thrust/uninitialized_fill.h +268 -0
  1941. cuda/cccl/headers/include/thrust/unique.h +1089 -0
  1942. cuda/cccl/headers/include/thrust/universal_allocator.h +101 -0
  1943. cuda/cccl/headers/include/thrust/universal_ptr.h +34 -0
  1944. cuda/cccl/headers/include/thrust/universal_vector.h +80 -0
  1945. cuda/cccl/headers/include/thrust/version.h +93 -0
  1946. cuda/cccl/headers/include/thrust/zip_function.h +149 -0
  1947. cuda/cccl/headers/include_paths.py +51 -0
  1948. cuda/cccl/headers/lib/cmake/cccl/cccl-config-version.cmake +25 -0
  1949. cuda/cccl/headers/lib/cmake/cccl/cccl-config.cmake +143 -0
  1950. cuda/cccl/headers/lib/cmake/cub/cub-config-version.cmake +29 -0
  1951. cuda/cccl/headers/lib/cmake/cub/cub-config.cmake +172 -0
  1952. cuda/cccl/headers/lib/cmake/cub/cub-header-search.cmake +15 -0
  1953. cuda/cccl/headers/lib/cmake/libcudacxx/libcudacxx-config-version.cmake +37 -0
  1954. cuda/cccl/headers/lib/cmake/libcudacxx/libcudacxx-config.cmake +297 -0
  1955. cuda/cccl/headers/lib/cmake/libcudacxx/libcudacxx-header-search.cmake +15 -0
  1956. cuda/cccl/headers/lib/cmake/thrust/FindTBB.cmake +498 -0
  1957. cuda/cccl/headers/lib/cmake/thrust/README.md +258 -0
  1958. cuda/cccl/headers/lib/cmake/thrust/thrust-config-version.cmake +37 -0
  1959. cuda/cccl/headers/lib/cmake/thrust/thrust-config.cmake +983 -0
  1960. cuda/cccl/headers/lib/cmake/thrust/thrust-header-search.cmake +15 -0
  1961. cuda/cccl/parallel/__init__.py +9 -0
  1962. cuda/cccl/parallel/experimental/__init__.py +24 -0
  1963. cuda/cccl/py.typed +0 -0
  1964. cuda/compute/__init__.py +91 -0
  1965. cuda/compute/_bindings.py +79 -0
  1966. cuda/compute/_bindings.pyi +516 -0
  1967. cuda/compute/_bindings_impl.pyx +2470 -0
  1968. cuda/compute/_caching.py +83 -0
  1969. cuda/compute/_cccl_interop.py +354 -0
  1970. cuda/compute/_odr_helpers.py +238 -0
  1971. cuda/compute/_utils/__init__.py +0 -0
  1972. cuda/compute/_utils/protocols.py +145 -0
  1973. cuda/compute/_utils/temp_storage_buffer.py +87 -0
  1974. cuda/compute/algorithms/__init__.py +62 -0
  1975. cuda/compute/algorithms/_histogram.py +243 -0
  1976. cuda/compute/algorithms/_reduce.py +205 -0
  1977. cuda/compute/algorithms/_scan.py +344 -0
  1978. cuda/compute/algorithms/_segmented_reduce.py +265 -0
  1979. cuda/compute/algorithms/_select.py +196 -0
  1980. cuda/compute/algorithms/_sort/__init__.py +23 -0
  1981. cuda/compute/algorithms/_sort/_merge_sort.py +235 -0
  1982. cuda/compute/algorithms/_sort/_radix_sort.py +263 -0
  1983. cuda/compute/algorithms/_sort/_segmented_sort.py +288 -0
  1984. cuda/compute/algorithms/_sort/_sort_common.py +52 -0
  1985. cuda/compute/algorithms/_three_way_partition.py +292 -0
  1986. cuda/compute/algorithms/_transform.py +317 -0
  1987. cuda/compute/algorithms/_unique_by_key.py +259 -0
  1988. cuda/compute/cccl/.gitkeep +0 -0
  1989. cuda/compute/cu12/_bindings_impl.cpython-312-x86_64-linux-gnu.so +0 -0
  1990. cuda/compute/cu12/cccl/libcccl.c.parallel.so +0 -0
  1991. cuda/compute/cu13/_bindings_impl.cpython-312-x86_64-linux-gnu.so +0 -0
  1992. cuda/compute/cu13/cccl/libcccl.c.parallel.so +0 -0
  1993. cuda/compute/determinism.py +3 -0
  1994. cuda/compute/iterators/__init__.py +23 -0
  1995. cuda/compute/iterators/_factories.py +251 -0
  1996. cuda/compute/iterators/_iterators.py +680 -0
  1997. cuda/compute/iterators/_permutation_iterator.py +266 -0
  1998. cuda/compute/iterators/_zip_iterator.py +268 -0
  1999. cuda/compute/numba_utils.py +54 -0
  2000. cuda/compute/op.py +140 -0
  2001. cuda/compute/struct.py +520 -0
  2002. cuda/compute/typing.py +36 -0
  2003. cuda/coop/__init__.py +8 -0
  2004. cuda/coop/_caching.py +48 -0
  2005. cuda/coop/_common.py +275 -0
  2006. cuda/coop/_nvrtc.py +92 -0
  2007. cuda/coop/_scan_op.py +181 -0
  2008. cuda/coop/_types.py +937 -0
  2009. cuda/coop/_typing.py +107 -0
  2010. cuda/coop/block/__init__.py +39 -0
  2011. cuda/coop/block/_block_exchange.py +251 -0
  2012. cuda/coop/block/_block_load_store.py +215 -0
  2013. cuda/coop/block/_block_merge_sort.py +125 -0
  2014. cuda/coop/block/_block_radix_sort.py +214 -0
  2015. cuda/coop/block/_block_reduce.py +294 -0
  2016. cuda/coop/block/_block_scan.py +983 -0
  2017. cuda/coop/warp/__init__.py +9 -0
  2018. cuda/coop/warp/_warp_merge_sort.py +92 -0
  2019. cuda/coop/warp/_warp_reduce.py +153 -0
  2020. cuda/coop/warp/_warp_scan.py +78 -0
  2021. cuda_cccl-0.4.3.dist-info/METADATA +84 -0
  2022. cuda_cccl-0.4.3.dist-info/RECORD +2024 -0
  2023. cuda_cccl-0.4.3.dist-info/WHEEL +5 -0
  2024. cuda_cccl-0.4.3.dist-info/licenses/LICENSE +1 -0
@@ -0,0 +1,2168 @@
1
+ // SPDX-FileCopyrightText: Copyright (c) 2011, Duane Merrill. All rights reserved.
2
+ // SPDX-FileCopyrightText: Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
3
+ // SPDX-License-Identifier: BSD-3
4
+
5
+ //! @file
6
+ //! The cub::BlockScan class provides :ref:`collective <collective-primitives>` methods for computing a parallel prefix
7
+ //! sum/scan of items partitioned across a CUDA thread block.
8
+
9
+ #pragma once
10
+
11
+ #include <cub/config.cuh>
12
+
13
+ #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
14
+ # pragma GCC system_header
15
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
16
+ # pragma clang system_header
17
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
18
+ # pragma system_header
19
+ #endif // no system header
20
+
21
+ #include <cub/block/specializations/block_scan_raking.cuh>
22
+ #include <cub/block/specializations/block_scan_warp_scans.cuh>
23
+ #include <cub/util_ptx.cuh>
24
+ #include <cub/util_type.cuh>
25
+
26
+ #include <cuda/std/__functional/operations.h>
27
+ #include <cuda/std/__type_traits/conditional.h>
28
+
29
+ #if !_CCCL_COMPILER(NVRTC)
30
+ # include <ostream>
31
+ #endif // !_CCCL_COMPILER(NVRTC)
32
+
33
+ CUB_NAMESPACE_BEGIN
34
+
35
+ /******************************************************************************
36
+ * Algorithmic variants
37
+ ******************************************************************************/
38
+
39
+ //! @brief BlockScanAlgorithm enumerates alternative algorithms for cub::BlockScan to compute a
40
+ //! parallel prefix scan across a CUDA thread block.
41
+ enum BlockScanAlgorithm
42
+ {
43
+
44
+ //! @rst
45
+ //! Overview
46
+ //! ++++++++++++++++++++++++++
47
+ //!
48
+ //! An efficient "raking reduce-then-scan" prefix scan algorithm. Execution is comprised of five phases:
49
+ //!
50
+ //! #. Upsweep sequential reduction in registers (if threads contribute more than one input each).
51
+ //! Each thread then places the partial reduction of its item(s) into shared memory.
52
+ //! #. Upsweep sequential reduction in shared memory.
53
+ //! Threads within a single warp rake across segments of shared partial reductions.
54
+ //! #. A warp-synchronous Kogge-Stone style exclusive scan within the raking warp.
55
+ //! #. Downsweep sequential exclusive scan in shared memory.
56
+ //! Threads within a single warp rake across segments of shared partial reductions,
57
+ //! seeded with the warp-scan output.
58
+ //! #. Downsweep sequential scan in registers (if threads contribute more than one input),
59
+ //! seeded with the raking scan output.
60
+ //!
61
+ //! Performance Considerations
62
+ //! ++++++++++++++++++++++++++
63
+ //!
64
+ //! - Although this variant may suffer longer turnaround latencies when the
65
+ //! GPU is under-occupied, it can often provide higher overall throughput
66
+ //! across the GPU when suitably occupied.
67
+ //!
68
+ //! @endrst
69
+ BLOCK_SCAN_RAKING,
70
+
71
+ //! @rst
72
+ //! Overview
73
+ //! ++++++++++++++++++++++++++
74
+ //!
75
+ //! Similar to cub::BLOCK_SCAN_RAKING, but with fewer shared memory reads at the expense of higher
76
+ //! register pressure. Raking threads preserve their "upsweep" segment of values in registers while performing
77
+ //! warp-synchronous scan, allowing the "downsweep" not to re-read them from shared memory.
78
+ //!
79
+ //! @endrst
80
+ BLOCK_SCAN_RAKING_MEMOIZE,
81
+
82
+ //! @rst
83
+ //! Overview
84
+ //! ++++++++++++++++++++++++++
85
+ //!
86
+ //! A quick "tiled warpscans" prefix scan algorithm. Execution is comprised of four phases:
87
+ //! #. Upsweep sequential reduction in registers (if threads contribute more than one input each).
88
+ //! Each thread then places the partial reduction of its item(s) into shared memory.
89
+ //! #. Compute a shallow, but inefficient warp-synchronous Kogge-Stone style scan within each warp.
90
+ //! #. A propagation phase where the warp scan outputs in each warp are updated with the aggregate
91
+ //! from each preceding warp.
92
+ //! #. Downsweep sequential scan in registers (if threads contribute more than one input),
93
+ //! seeded with the updated warp scan output.
94
+ //!
95
+ //! Performance Considerations
96
+ //! ++++++++++++++++++++++++++
97
+ //!
98
+ //! - Although this variant may suffer lower overall throughput across the
99
+ //! GPU due to a heavy reliance on inefficient warpscans, it can
100
+ //! often provide lower turnaround latencies when the GPU is under-occupied.
101
+ //!
102
+ //! @endrst
103
+ BLOCK_SCAN_WARP_SCANS,
104
+ };
105
+
106
+ #if !_CCCL_COMPILER(NVRTC)
107
+ inline ::std::ostream& operator<<(::std::ostream& os, BlockScanAlgorithm algo) // Writes the name of `algo` to `os` and returns the stream.
108
+ {
109
+ switch (algo)
110
+ {
111
+ case BLOCK_SCAN_RAKING:
112
+ return os << "BLOCK_SCAN_RAKING";
113
+ case BLOCK_SCAN_RAKING_MEMOIZE:
114
+ return os << "BLOCK_SCAN_RAKING_MEMOIZE";
115
+ case BLOCK_SCAN_WARP_SCANS:
116
+ return os << "BLOCK_SCAN_WARP_SCANS";
117
+ default: // Value matches none of the enumerators above: print its integer value instead of a name.
118
+ return os << "<unknown BlockScanAlgorithm: " << static_cast<int>(algo) << ">";
119
+ }
120
+ }
121
+ #endif // !_CCCL_COMPILER(NVRTC)
122
+
123
+ //! @rst
124
+ //! The BlockScan class provides :ref:`collective <collective-primitives>` methods for computing a parallel prefix
125
+ //! sum/scan of items partitioned across a CUDA thread block.
126
+ //!
127
+ //! Overview
128
+ //! +++++++++++++++++++++++++++++++++++++++++++++
129
+ //!
130
+ //! - Given a list of input elements and a binary reduction operator, a
131
+ //! `prefix scan <http://en.wikipedia.org/wiki/Prefix_sum>`_ produces an output list where each element is computed
132
+ //! to be the reduction of the elements occurring earlier in the input list. *Prefix sum* connotes a prefix scan with
133
+ //! the addition operator. The term *inclusive* indicates that the *i*\ :sup:`th` output reduction incorporates
134
+ //! the *i*\ :sup:`th` input. The term *exclusive* indicates the *i*\ :sup:`th` input is not incorporated into
135
+ //! the *i*\ :sup:`th` output reduction.
136
+ //! - @rowmajor
137
+ //! - BlockScan can be optionally specialized by algorithm to accommodate different workload profiles:
138
+ //!
139
+ //! #. :cpp:enumerator:`cub::BLOCK_SCAN_RAKING`:
140
+ //! An efficient (high throughput) "raking reduce-then-scan" prefix scan algorithm.
141
+ //! #. :cpp:enumerator:`cub::BLOCK_SCAN_RAKING_MEMOIZE`:
142
+ //! Similar to cub::BLOCK_SCAN_RAKING, but having higher throughput at the expense of additional
143
+ //! register pressure for intermediate storage.
144
+ //! #. :cpp:enumerator:`cub::BLOCK_SCAN_WARP_SCANS`:
145
+ //! A quick (low latency) "tiled warpscans" prefix scan algorithm.
146
+ //!
147
+ //! Performance Considerations
148
+ //! +++++++++++++++++++++++++++++++++++++++++++++
149
+ //!
150
+ //! - @granularity
151
+ //! - Uses special instructions when applicable (e.g., warp ``SHFL``)
152
+ //! - Uses synchronization-free communication between warp lanes when applicable
153
+ //! - Invokes a minimal number of block-wide synchronization barriers (only
154
+ //! one or two depending on algorithm selection)
155
+ //! - Incurs zero bank conflicts for most types
156
+ //! - Computation is slightly more efficient (i.e., having lower instruction overhead) for:
157
+ //!
158
+ //! - Prefix sum variants (vs. generic scan)
159
+ //! - @blocksize
160
+ //!
161
+ //! - See cub::BlockScanAlgorithm for performance details regarding algorithmic alternatives
162
+ //!
163
+ //! A Simple Example
164
+ //! +++++++++++++++++++++++++++++++++++++++++++++
165
+ //!
166
+ //! @blockcollective{BlockScan}
167
+ //!
168
+ //! The code snippet below illustrates an exclusive prefix sum of 512 integer items that
169
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128 threads
170
+ //! where each thread owns 4 consecutive items.
171
+ //!
172
+ //! .. literalinclude:: ../../examples/block/example_block_scan.cu
173
+ //! :language: c++
174
+ //! :dedent:
175
+ //! :start-after: example-begin exclusive-sum-array
176
+ //! :end-before: example-end exclusive-sum-array
177
+ //!
178
+ //! Suppose the set of input ``thread_data`` across the block of threads is
179
+ //! ``{[1,1,1,1], [1,1,1,1], ..., [1,1,1,1]}``.
180
+ //! The corresponding output ``thread_data`` in those threads will be
181
+ //! ``{[0,1,2,3], [4,5,6,7], ..., [508,509,510,511]}``.
182
+ //!
183
+ //! Re-using dynamically allocated shared memory
184
+ //! +++++++++++++++++++++++++++++++++++++++++++++
185
+ //!
186
+ //! The ``block/example_block_reduce_dyn_smem.cu`` example illustrates usage of dynamic shared memory with
187
+ //! BlockReduce and how to re-purpose the same memory region.
188
+ //! This example can be easily adapted to the storage required by BlockScan.
189
+ //!
190
+ //! @endrst
191
+ //!
192
+ //! @tparam T
193
+ //! Data type being scanned
194
+ //!
195
+ //! @tparam BlockDimX
196
+ //! The thread block length in threads along the X dimension
197
+ //!
198
+ //! @tparam Algorithm
199
+ //! **[optional]** cub::BlockScanAlgorithm enumerator specifying the underlying algorithm to use
200
+ //! (default: cub::BLOCK_SCAN_RAKING)
201
+ //!
202
+ //! @tparam BlockDimY
203
+ //! **[optional]** The thread block length in threads along the Y dimension
204
+ //! (default: 1)
205
+ //!
206
+ //! @tparam BlockDimZ
207
+ //! **[optional]** The thread block length in threads along the Z dimension (default: 1)
208
+ //!
209
+ template <typename T, int BlockDimX, BlockScanAlgorithm Algorithm = BLOCK_SCAN_RAKING, int BlockDimY = 1, int BlockDimZ = 1>
210
+ class BlockScan
211
+ {
212
+ private:
213
+ /// The thread block size in threads
214
+ static constexpr int BLOCK_THREADS = BlockDimX * BlockDimY * BlockDimZ;
215
+
216
+ /**
217
+ * Ensure the template parameterization meets the requirements of the
218
+ * specified algorithm. Currently, the BLOCK_SCAN_WARP_SCANS policy
219
+ * cannot be used with thread block sizes not a multiple of the
220
+ * architectural warp size.
221
+ */
222
+ static constexpr BlockScanAlgorithm SAFE_ALGORITHM =
223
+ ((Algorithm == BLOCK_SCAN_WARP_SCANS) && (BLOCK_THREADS % detail::warp_threads != 0))
224
+ ? BLOCK_SCAN_RAKING // warp-scans requested but block size is not a multiple of the warp size: fall back to raking
225
+ : Algorithm; // otherwise honor the requested algorithm unchanged
226
+
227
+ using WarpScans = detail::BlockScanWarpScans<T, BlockDimX, BlockDimY, BlockDimZ>; // delegate used for BLOCK_SCAN_WARP_SCANS
228
+ using Raking =
229
+ detail::BlockScanRaking<T, BlockDimX, BlockDimY, BlockDimZ, (SAFE_ALGORITHM == BLOCK_SCAN_RAKING_MEMOIZE)>; // last argument is true only for BLOCK_SCAN_RAKING_MEMOIZE
230
+
231
+ /// Delegate type for the effective algorithm: WarpScans for BLOCK_SCAN_WARP_SCANS, Raking for both raking variants
232
+ using InternalBlockScan = ::cuda::std::_If<SAFE_ALGORITHM == BLOCK_SCAN_WARP_SCANS, WarpScans, Raking>;
233
+
234
+ /// Shared memory storage layout type for BlockScan
235
+ using _TempStorage = typename InternalBlockScan::TempStorage;
236
+
237
+ /// Shared storage reference
238
+ _TempStorage& temp_storage;
239
+
240
+ /// Linear thread-id
241
+ unsigned int linear_tid;
242
+
243
+ /// Internal storage allocator
244
+ _CCCL_DEVICE _CCCL_FORCEINLINE _TempStorage& PrivateStorage()
245
+ {
246
+ __shared__ _TempStorage private_storage; // function-scope __shared__ object backing the default constructor
247
+ return private_storage; // handed out by reference; the default constructor binds temp_storage to it
248
+ }
249
+
250
+ public:
251
+ /// @smemstorage{BlockScan}
252
+ struct TempStorage : Uninitialized<_TempStorage> // public wrapper over _TempStorage; the constructor obtains the inner layout via Alias()
253
+ {};
254
+
255
+ //! @name Collective constructors
256
+ //! @{
257
+
258
+ //! @brief Collective constructor using a private static allocation of shared memory as temporary storage.
259
+ _CCCL_DEVICE _CCCL_FORCEINLINE BlockScan()
260
+ : temp_storage(PrivateStorage()) // bind to the __shared__ object declared inside PrivateStorage()
261
+ , linear_tid(RowMajorTid(BlockDimX, BlockDimY, BlockDimZ)) // cache the linear thread-id computed from the block dimensions
262
+ {}
263
+
264
+ /**
265
+ * @brief Collective constructor using the specified memory allocation as temporary storage.
266
+ *
267
+ * @param[in] temp_storage
268
+ * Reference to memory allocation having layout type TempStorage
269
+ */
270
+ _CCCL_DEVICE _CCCL_FORCEINLINE BlockScan(TempStorage& temp_storage)
271
+ : temp_storage(temp_storage.Alias()) // the parameter shadows the member; Alias() yields the _TempStorage& the member binds to
272
+ , linear_tid(RowMajorTid(BlockDimX, BlockDimY, BlockDimZ)) // cache the linear thread-id computed from the block dimensions
273
+ {}
274
+
275
+ //! @} end member group
276
+ //! @name Exclusive prefix sum operations
277
+ //! @{
278
+
279
+ //! @rst
280
+ //! Computes an exclusive block-wide prefix scan using addition (+) as the scan operator.
281
+ //! Each thread contributes one input element. The value of 0 is applied as the initial value, and is assigned
282
+ //! to ``output`` in *thread*\ :sub:`0`.
283
+ //!
284
+ //! - @identityzero
285
+ //! - @rowmajor
286
+ //! - @smemreuse
287
+ //!
288
+ //! Snippet
289
+ //! +++++++
290
+ //!
291
+ //! The code snippet below illustrates an exclusive prefix sum of 128 integer items that
292
+ //! are partitioned across 128 threads.
293
+ //!
294
+ //! .. literalinclude:: ../../examples/block/example_block_scan.cu
295
+ //! :language: c++
296
+ //! :dedent:
297
+ //! :start-after: example-begin exclusive-sum-single
298
+ //! :end-before: example-end exclusive-sum-single
299
+ //!
300
+ //! Suppose the set of input ``thread_data`` across the block of threads is ``1, 1, ..., 1``.
301
+ //! The corresponding output ``thread_data`` in those threads will be ``0, 1, ..., 127``.
302
+ //!
303
+ //! @endrst
304
+ //!
305
+ //! @param[in] input
306
+ //! Calling thread's input item
307
+ //!
308
+ //! @param[out] output
309
+ //! Calling thread's output item (may be aliased to `input`)
310
+ _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveSum(T input, T& output)
311
+ {
312
+ T initial_value{}; // value-initialized seed (the documented "0" initial value)
313
+
314
+ ExclusiveScan(input, output, initial_value, ::cuda::std::plus<>{}); // forward to the generic single-item exclusive scan with addition
315
+ }
316
+
317
+ //! @rst
318
+ //! Computes an exclusive block-wide prefix scan using addition (+) as the scan operator.
319
+ //! Each thread contributes one input element.
320
+ //! The value of 0 is applied as the initial value, and is assigned to ``output`` in *thread*\ :sub:`0`.
321
+ //! Also provides every thread with the block-wide ``block_aggregate`` of all inputs.
322
+ //!
323
+ //! - @identityzero
324
+ //! - @rowmajor
325
+ //! - @smemreuse
326
+ //!
327
+ //! Snippet
328
+ //! +++++++
329
+ //!
330
+ //! The code snippet below illustrates an exclusive prefix sum of 128 integer items that
331
+ //! are partitioned across 128 threads.
332
+ //!
333
+ //! .. literalinclude:: ../../examples/block/example_block_scan.cu
334
+ //! :language: c++
335
+ //! :dedent:
336
+ //! :start-after: example-begin exclusive-sum-aggregate
337
+ //! :end-before: example-end exclusive-sum-aggregate
338
+ //!
339
+ //! Suppose the set of input ``thread_data`` across the block of threads is ``1, 1, ..., 1``.
340
+ //! The corresponding output ``thread_data`` in those threads will be ``0, 1, ..., 127``.
341
+ //! Furthermore the value ``128`` will be stored in ``block_aggregate`` for all threads.
342
+ //!
343
+ //! @endrst
344
+ //!
345
+ //! @param[in] input
346
+ //! Calling thread's input item
347
+ //!
348
+ //! @param[out] output
349
+ //! Calling thread's output item (may be aliased to `input`)
350
+ //!
351
+ //! @param[out] block_aggregate
352
+ //! block-wide aggregate reduction of input items
353
+ _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveSum(T input, T& output, T& block_aggregate)
354
+ {
355
+ T initial_value{}; // value-initialized seed (the documented "0" initial value)
356
+
357
+ ExclusiveScan(input, output, initial_value, ::cuda::std::plus<>{}, block_aggregate); // same scan as above; block_aggregate is filled by the callee
358
+ }
359
+
360
+ //! @rst
361
+ //! Computes an exclusive block-wide prefix scan using addition (+) as the scan operator.
362
+ //! Each thread contributes one input element. Instead of using 0 as the block-wide prefix, the call-back functor
363
+ //! ``block_prefix_callback_op`` is invoked by the first warp in the block, and the value returned by
364
+ //! *lane*\ :sub:`0` in that warp is used as the "seed" value that logically prefixes the thread block's
365
+ //! scan inputs.
366
+ //!
367
+ //! - @identityzero
368
+ //! - The ``block_prefix_callback_op`` functor must implement a member function
369
+ //! ``T operator()(T block_aggregate)``. The functor will be invoked by the first warp of threads in the block,
370
+ //! however only the return value from *lane*\ :sub:`0` is applied as the block-wide prefix. Can be stateful.
371
+ //! - @rowmajor
372
+ //! - @smemreuse
373
+ //!
374
+ //! Snippet
375
+ //! +++++++
376
+ //!
377
+ //! The code snippet below illustrates a single thread block that progressively
378
+ //! computes an exclusive prefix sum over multiple "tiles" of input using a
379
+ //! prefix functor to maintain a running total between block-wide scans. Each tile consists
380
+ //! of 128 integer items that are partitioned across 128 threads.
381
+ //!
382
+ //! .. literalinclude:: ../../examples/block/example_block_scan.cu
383
+ //! :language: c++
384
+ //! :dedent:
385
+ //! :start-after: example-begin block-prefix-callback-op
386
+ //! :end-before: example-end block-prefix-callback-op
387
+ //!
388
+ //! .. literalinclude:: ../../examples/block/example_block_scan.cu
389
+ //! :language: c++
390
+ //! :dedent:
391
+ //! :start-after: example-begin exclusive-sum-single-prefix-callback
392
+ //! :end-before: example-end exclusive-sum-single-prefix-callback
393
+ //!
394
+ //! Suppose the input ``d_data`` is ``1, 1, 1, 1, 1, 1, 1, 1, ...``.
395
+ //! The corresponding output for the first segment will be ``0, 1, ..., 127``.
396
+ //! The output for the second segment will be ``128, 129, ..., 255``.
397
+ //!
398
+ //! @endrst
399
+ //!
400
+ //! @tparam BlockPrefixCallbackOp
401
+ //! **[inferred]** Call-back functor type having member `T operator()(T block_aggregate)`
402
+ //!
403
+ //! @param[in] input
404
+ //! Calling thread's input item
405
+ //!
406
+ //! @param[out] output
407
+ //! Calling thread's output item (may be aliased to `input`)
408
+ //!
409
+ //! @param[in,out] block_prefix_callback_op
410
+ //! @rst
411
+ //! *warp*\ :sub:`0` only call-back functor for specifying a block-wide prefix to be applied to
412
+ //! the logical input sequence.
413
+ //! @endrst
414
+ template <typename BlockPrefixCallbackOp>
415
+ _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveSum(T input, T& output, BlockPrefixCallbackOp& block_prefix_callback_op)
416
+ {
417
+ ExclusiveScan(input, output, ::cuda::std::plus<>{}, block_prefix_callback_op); // no explicit seed: the block-wide prefix comes from the callback
418
+ }
419
+
420
+ //! @} end member group
421
+ //! @name Exclusive prefix sum operations (multiple data per thread)
422
+ //! @{
423
+
424
+ //! @rst
425
+ //! Computes an exclusive block-wide prefix scan using addition (+) as the scan operator.
426
+ //! Each thread contributes an array of consecutive input elements.
427
+ //! The value of 0 is applied as the initial value, and is assigned to ``output[0]`` in *thread*\ :sub:`0`.
428
+ //!
429
+ //! - @identityzero
430
+ //! - @blocked
431
+ //! - @granularity
432
+ //! - @smemreuse
433
+ //!
434
+ //! Snippet
435
+ //! +++++++
436
+ //!
437
+ //! The code snippet below illustrates an exclusive prefix sum of 512 integer items that
438
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128 threads
439
+ //! where each thread owns 4 consecutive items.
440
+ //!
441
+ //! .. literalinclude:: ../../examples/block/example_block_scan.cu
442
+ //! :language: c++
443
+ //! :dedent:
444
+ //! :start-after: example-begin exclusive-sum-array
445
+ //! :end-before: example-end exclusive-sum-array
446
+ //!
447
+ //! Suppose the set of input ``thread_data`` across the block of threads is
448
+ //! ``{ [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }``.
449
+ //! The corresponding output ``thread_data`` in those threads will be
450
+ //! ``{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }``.
451
+ //!
452
+ //! @endrst
453
+ //!
454
+ //! @tparam ITEMS_PER_THREAD
455
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
456
+ //!
457
+ //! @param[in] input
458
+ //! Calling thread's input items
459
+ //!
460
+ //! @param[out] output
461
+ //! Calling thread's output items (may be aliased to `input`)
462
+ template <int ITEMS_PER_THREAD>
463
+ _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveSum(T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD])
464
+ {
465
+ T initial_value{}; // value-initialized seed (the documented "0" initial value)
466
+
467
+ ExclusiveScan(input, output, initial_value, ::cuda::std::plus<>{}); // forward to the array overload of the generic exclusive scan
468
+ }
469
+
470
+ //! @rst
471
+ //! Computes an exclusive block-wide prefix scan using addition (+) as the scan operator.
472
+ //! Each thread contributes an array of consecutive input elements.
473
+ //! The value of 0 is applied as the initial value, and is assigned to ``output[0]`` in *thread*\ :sub:`0`.
474
+ //! Also provides every thread with the block-wide ``block_aggregate`` of all inputs.
475
+ //!
476
+ //! - @identityzero
477
+ //! - @blocked
478
+ //! - @granularity
479
+ //! - @smemreuse
480
+ //!
481
+ //! Snippet
482
+ //! +++++++
483
+ //!
484
+ //! The code snippet below illustrates an exclusive prefix sum of 512 integer items that are partitioned in
485
+ //! a :ref:`blocked arrangement <flexible-data-arrangement>` across 128 threads where each thread owns
486
+ //! 4 consecutive items.
487
+ //!
488
+ //! .. literalinclude:: ../../examples/block/example_block_scan.cu
489
+ //! :language: c++
490
+ //! :dedent:
491
+ //! :start-after: example-begin exclusive-sum-array-aggregate
492
+ //! :end-before: example-end exclusive-sum-array-aggregate
493
+ //!
494
+ //! Suppose the set of input ``thread_data`` across the block of threads is
495
+ //! ``{ [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }``.
496
+ //! The corresponding output ``thread_data`` in those threads will be
497
+ //! ``{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }``.
498
+ //! Furthermore the value ``512`` will be stored in ``block_aggregate`` for all threads.
499
+ //!
500
+ //! @endrst
501
+ //!
502
+ //! @tparam ITEMS_PER_THREAD
503
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
504
+ //!
505
+ //! @param[in] input
506
+ //! Calling thread's input items
507
+ //!
508
+ //! @param[out] output
509
+ //! Calling thread's output items (may be aliased to `input`)
510
+ //!
511
+ //! @param[out] block_aggregate
512
+ //! block-wide aggregate reduction of input items
513
+ template <int ITEMS_PER_THREAD>
514
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
515
+ ExclusiveSum(T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], T& block_aggregate)
516
+ {
517
+ // Value-initialized seed (the documented "0" initial value); the per-thread reduction happens inside ExclusiveScan
518
+ T initial_value{};
519
+
520
+ ExclusiveScan(input, output, initial_value, ::cuda::std::plus<>{}, block_aggregate); // array overload; block_aggregate is filled by the callee
521
+ }
522
+
523
+ //! @rst
524
+ //! Computes an exclusive block-wide prefix scan using addition (+) as the scan operator.
525
+ //! Each thread contributes an array of consecutive input elements.
526
+ //! Instead of using 0 as the block-wide prefix, the call-back functor ``block_prefix_callback_op`` is invoked by
527
+ //! the first warp in the block, and the value returned by *lane*\ :sub:`0` in that warp is used as the "seed"
528
+ //! value that logically prefixes the thread block's scan inputs.
529
+ //!
530
+ //! - @identityzero
531
+ //! - The ``block_prefix_callback_op`` functor must implement a member function ``T operator()(T block_aggregate)``.
532
+ //! The functor will be invoked by the first warp of threads in the block, however only the return value from
533
+ //! *lane*\ :sub:`0` is applied as the block-wide prefix. Can be stateful.
534
+ //! - @blocked
535
+ //! - @granularity
536
+ //! - @smemreuse
537
+ //!
538
+ //!
539
+ //! Snippet
540
+ //! +++++++
541
+ //!
542
+ //! The code snippet below illustrates a single thread block that progressively
543
+ //! computes an exclusive prefix sum over multiple "tiles" of input using a
544
+ //! prefix functor to maintain a running total between block-wide scans. Each tile consists
545
+ //! of 512 integer items that are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>`
546
+ //! across 128 threads where each thread owns 4 consecutive items.
547
+ //!
548
+ //! .. literalinclude:: ../../examples/block/example_block_scan.cu
549
+ //! :language: c++
550
+ //! :dedent:
551
+ //! :start-after: example-begin block-prefix-callback-op
552
+ //! :end-before: example-end block-prefix-callback-op
553
+ //!
554
+ //! .. literalinclude:: ../../examples/block/example_block_scan.cu
555
+ //! :language: c++
556
+ //! :dedent:
557
+ //! :start-after: example-begin exclusive-sum-prefix-callback
558
+ //! :end-before: example-end exclusive-sum-prefix-callback
559
+ //!
560
+ //! Suppose the input ``d_data`` is ``1, 1, 1, 1, 1, 1, 1, 1, ...``.
561
+ //! The corresponding output for the first segment will be ``0, 1, 2, 3, ..., 510, 511``.
562
+ //! The output for the second segment will be ``512, 513, 514, 515, ..., 1022, 1023``.
563
+ //!
564
+ //! @endrst
565
+ //!
566
+ //! @tparam ITEMS_PER_THREAD
567
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
568
+ //!
569
+ //! @tparam BlockPrefixCallbackOp
570
+ //! **[inferred]** Call-back functor type having member
571
+ //! `T operator()(T block_aggregate)`
572
+ //!
573
+ //! @param[in] input
574
+ //! Calling thread's input items
575
+ //!
576
+ //! @param[out] output
577
+ //! Calling thread's output items (may be aliased to `input`)
578
+ //!
579
+ //! @param[in,out] block_prefix_callback_op
580
+ //! @rst
581
+ //! *warp*\ :sub:`0` only call-back functor for specifying a block-wide prefix to be applied to
582
+ //! the logical input sequence.
583
+ //! @endrst
584
+ template <int ITEMS_PER_THREAD, typename BlockPrefixCallbackOp>
585
+ _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveSum(
586
+ T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], BlockPrefixCallbackOp& block_prefix_callback_op)
587
+ {
588
+ ExclusiveScan(input, output, ::cuda::std::plus<>{}, block_prefix_callback_op); // no explicit seed: the block-wide prefix comes from the callback
589
+ }
590
+
591
+ //! @} end member group // Exclusive prefix sums (multiple data per thread)
592
+ //! @name Exclusive prefix scan operations
593
+ //! @{
594
+
595
+ //! @rst
596
+ //! Computes an exclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
597
+ //! Each thread contributes one input element.
598
+ //!
599
+ //! - Supports non-commutative scan operators.
600
+ //! - @rowmajor
601
+ //! - @smemreuse
602
+ //!
603
+ //! Snippet
604
+ //! +++++++
605
+ //!
606
+ //! The code snippet below illustrates an exclusive prefix max scan of 128 integer items that
607
+ //! are partitioned across 128 threads.
608
+ //!
609
+ //! .. literalinclude:: ../../examples/block/example_block_scan.cu
610
+ //! :language: c++
611
+ //! :dedent:
612
+ //! :start-after: example-begin exclusive-scan-single
613
+ //! :end-before: example-end exclusive-scan-single
614
+ //!
615
+ //! Suppose the set of input ``thread_data`` across the block of threads is ``0, -1, 2, -3, ..., 126, -127``.
616
+ //! The corresponding output ``thread_data`` in those threads will be ``INT_MIN, 0, 0, 2, ..., 124, 126``.
617
+ //!
618
+ //! @endrst
619
+ //!
620
+ //! @tparam ScanOp
621
+ //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
622
+ //!
623
+ //! @param[in] input
624
+ //! Calling thread's input item
625
+ //!
626
+ //! @param[out] output
627
+ //! Calling thread's output item (may be aliased to `input`)
628
+ //!
629
+ //! @param[in] initial_value
630
+ //! @rst
631
+ //! Initial value to seed the exclusive scan (and is assigned to ``output`` in *thread*\ :sub:`0`)
632
+ //! @endrst
633
+ //!
634
+ //! @param[in] scan_op
635
+ //! Binary scan functor
636
+ template <typename ScanOp>
637
+ _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveScan(T input, T& output, T initial_value, ScanOp scan_op)
638
+ {
639
+ InternalBlockScan(temp_storage).ExclusiveScan(input, output, initial_value, scan_op); // build the selected delegate over temp_storage and forward the call
640
+ }
641
+
642
+ //! @rst
643
+ //! Computes an exclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
644
+ //! Each thread contributes one input element.
645
+ //! Also provides every thread with the block-wide ``block_aggregate`` of all inputs.
646
+ //!
647
+ //! - Supports non-commutative scan operators.
648
+ //! - @rowmajor
649
+ //! - @smemreuse
650
+ //!
651
+ //! Snippet
652
+ //! +++++++
653
+ //!
654
+ //! The code snippet below illustrates an exclusive prefix max scan of 128 integer items that
655
+ //! are partitioned across 128 threads.
656
+ //!
657
+ //! .. literalinclude:: ../../examples/block/example_block_scan.cu
658
+ //! :language: c++
659
+ //! :dedent:
660
+ //! :start-after: example-begin exclusive-scan-aggregate
661
+ //! :end-before: example-end exclusive-scan-aggregate
662
+ //!
663
+ //! Suppose the set of input ``thread_data`` across the block of threads is ``0, -1, 2, -3, ..., 126, -127``.
664
+ //! The corresponding output ``thread_data`` in those threads will be ``INT_MIN, 0, 0, 2, ..., 124, 126``.
665
+ //! Furthermore the value ``126`` will be stored in ``block_aggregate`` for all threads.
666
+ //!
667
+ //! .. note::
668
+ //!
669
+ //! ``initial_value`` is not applied to the block-wide aggregate.
670
+ //!
671
+ //! @endrst
672
+ //!
673
+ //! @tparam ScanOp
674
+ //! **[inferred]** Binary scan functor type having member ``T operator()(const T &a, const T &b)``
675
+ //!
676
+ //! @param[in] input
677
+ //! Calling thread's input item
678
+ //!
679
+ //! @param[out] output
680
+ //! Calling thread's output item (may be aliased to ``input``)
681
+ //!
682
+ //! @param[in] initial_value
683
+ //! @rst
684
+ //! Initial value to seed the exclusive scan (and is assigned to ``output`` in *thread*\ :sub:`0`). It is not
685
+ //! taken into account for ``block_aggregate``.
686
+ //!
687
+ //! @endrst
688
+ //!
689
+ //! @param[in] scan_op
690
+ //! Binary scan functor
691
+ //!
692
+ //! @param[out] block_aggregate
693
+ //! block-wide aggregate reduction of input items
694
+ template <typename ScanOp>
695
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
696
+ ExclusiveScan(T input, T& output, T initial_value, ScanOp scan_op, T& block_aggregate)
697
+ {
698
+ InternalBlockScan(temp_storage).ExclusiveScan(input, output, initial_value, scan_op, block_aggregate); // delegate over temp_storage; block_aggregate is filled by the callee
699
+ }
700
+
701
+ //! @rst
702
+ //! Computes an exclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
703
+ //! Each thread contributes one input element. The call-back functor ``block_prefix_callback_op`` is invoked by
704
+ //! the first warp in the block, and the value returned by *lane*\ :sub:`0` in that warp is used as
705
+ //! the "seed" value that logically prefixes the thread block's scan inputs.
706
+ //!
707
+ //! - The ``block_prefix_callback_op`` functor must implement a member function ``T operator()(T block_aggregate)``.
708
+ //! The functor will be invoked by the first warp of threads in the block, however only the return value from
709
+ //! *lane*\ :sub:`0` is applied as the block-wide prefix. Can be stateful.
710
+ //! - Supports non-commutative scan operators.
711
+ //! - @rowmajor
712
+ //! - @smemreuse
713
+ //!
714
+ //! Snippet
715
+ //! +++++++
716
+ //!
717
+ //! The code snippet below illustrates a single thread block that progressively
718
+ //! computes an exclusive prefix max scan over multiple "tiles" of input using a
719
+ //! prefix functor to maintain a running total between block-wide scans.
720
+ //! Each tile consists of 128 integer items that are partitioned across 128 threads.
721
+ //!
722
+ //! .. code-block:: c++
723
+ //!
724
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
725
+ //!
726
+ //! // A stateful callback functor that maintains a running prefix to be applied
727
+ //! // during consecutive scan operations.
728
+ //! struct BlockPrefixCallbackOp
729
+ //! {
730
+ //! // Running prefix
731
+ //! int running_total;
732
+ //!
733
+ //! // Constructor
734
+ //! __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {}
735
+ //!
736
+ //! // Callback operator to be entered by the first warp of threads in the block.
737
+ //! // Thread-0 is responsible for returning a value for seeding the block-wide scan.
738
+ //! __device__ int operator()(int block_aggregate)
739
+ //! {
740
+ //! int old_prefix = running_total;
741
+ //! running_total = (block_aggregate > old_prefix) ? block_aggregate : old_prefix;
742
+ //! return old_prefix;
743
+ //! }
744
+ //! };
745
+ //!
746
+ //! __global__ void ExampleKernel(int *d_data, int num_items, ...)
747
+ //! {
748
+ //! // Specialize BlockScan for a 1D block of 128 threads
749
+ //! using BlockScan = cub::BlockScan<int, 128>;
750
+ //!
751
+ //! // Allocate shared memory for BlockScan
752
+ //! __shared__ typename BlockScan::TempStorage temp_storage;
753
+ //!
754
+ //! // Initialize running total
755
+ //! BlockPrefixCallbackOp prefix_op(INT_MIN);
756
+ //!
757
+ //! // Have the block iterate over segments of items
758
+ //! for (int block_offset = 0; block_offset < num_items; block_offset += 128)
759
+ //! {
760
+ //! // Load a segment of consecutive items that are blocked across threads
761
+ //! int thread_data = d_data[block_offset + threadIdx.x];
762
+ //!
763
+ //! // Collectively compute the block-wide exclusive prefix max scan
764
+ //! BlockScan(temp_storage).ExclusiveScan(
765
+ //! thread_data, thread_data, cuda::maximum<>{}, prefix_op);
766
+ //! __syncthreads();
767
+ //!
768
+ //! // Store scanned items to output segment
769
+ //! d_data[block_offset + threadIdx.x] = thread_data;
770
+ //! }
771
+ //! }
772
+ //!
773
+ //! Suppose the input ``d_data`` is ``0, -1, 2, -3, 4, -5, ...``.
774
+ //! The corresponding output for the first segment will be ``INT_MIN, 0, 0, 2, ..., 124, 126``.
775
+ //! The output for the second segment will be ``126, 128, 128, 130, ..., 252, 254``.
776
+ //!
777
+ //! @endrst
778
+ //!
779
+ //! @tparam ScanOp
780
+ //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
781
+ //!
782
+ //! @tparam BlockPrefixCallbackOp
783
+ //! **[inferred]** Call-back functor type having member `T operator()(T block_aggregate)`
784
+ //!
785
+ //! @param[in] input
786
+ //! Calling thread's input item
787
+ //!
788
+ //! @param[out] output
789
+ //! Calling thread's output item (may be aliased to `input`)
790
+ //!
791
+ //! @param[in] scan_op
792
+ //! Binary scan functor
793
+ //!
794
+ //! @param[in,out] block_prefix_callback_op
795
+ //! @rst
796
+ //! *warp*\ :sub:`0` only call-back functor for specifying a block-wide prefix to be applied to
797
+ //! the logical input sequence.
798
+ //! @endrst
799
+ template <typename ScanOp, typename BlockPrefixCallbackOp>
800
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
801
+ ExclusiveScan(T input, T& output, ScanOp scan_op, BlockPrefixCallbackOp& block_prefix_callback_op)
802
+ {
803
+ InternalBlockScan(temp_storage).ExclusiveScan(input, output, scan_op, block_prefix_callback_op); // no initial_value parameter: the seed comes from the callback
804
+ }
805
+
806
+ //! @} end member group // Exclusive prefix scans
807
+ //! @name Exclusive prefix scan operations (multiple data per thread)
808
+ //! @{
809
+
810
+ //! @rst
811
+ //! Computes an exclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
812
+ //! Each thread contributes an array of consecutive input elements.
813
+ //!
814
+ //! - Supports non-commutative scan operators.
815
+ //! - @blocked
816
+ //! - @granularity
817
+ //! - @smemreuse
818
+ //!
819
+ //! Snippet
820
+ //! +++++++
821
+ //!
822
+ //! The code snippet below illustrates an exclusive prefix max scan of 512 integer
823
+ //! items that are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>`
824
+ //! across 128 threads where each thread owns 4 consecutive items.
825
+ //!
826
+ //! .. literalinclude:: ../../examples/block/example_block_scan.cu
827
+ //! :language: c++
828
+ //! :dedent:
829
+ //! :start-after: example-begin exclusive-scan-array
830
+ //! :end-before: example-end exclusive-scan-array
831
+ //!
832
+ //! Suppose the set of input ``thread_data`` across the block of threads is
833
+ //! ``{ [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }``.
834
+ //! The corresponding output ``thread_data`` in those threads will be
835
+ //! ``{ [INT_MIN,0,0,2], [2,4,4,6], ..., [506,508,508,510] }``.
836
+ //!
837
+ //! @endrst
838
+ //!
839
+ //! @tparam ITEMS_PER_THREAD
840
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
841
+ //!
842
+ //! @tparam ScanOp
843
+ //! **[inferred]** Binary scan functor type having member
844
+ //! `T operator()(const T &a, const T &b)`
845
+ //!
846
+ //! @param[in] input
847
+ //! Calling thread's input items
848
+ //!
849
+ //! @param[out] output
850
+ //! Calling thread's output items (may be aliased to `input`)
851
+ //!
852
+ //! @param[in] initial_value
853
+ //! @rst
854
+ //! Initial value to seed the exclusive scan (and is assigned to `output[0]` in *thread*\ :sub:`0`)
855
+ //! @endrst
856
+ //!
857
+ //! @param[in] scan_op
858
+ //! Binary scan functor
859
+ template <int ITEMS_PER_THREAD, typename ScanOp>
860
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
861
+ ExclusiveScan(T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], T initial_value, ScanOp scan_op)
862
+ {
863
+ // Step 1: reduce this thread's consecutive items to a single partial using scan_op
864
+ T thread_prefix = cub::ThreadReduce(input, scan_op);
865
+
866
+ // Step 2: exclusive block-wide scan of the per-thread partials, seeded with initial_value; thread_prefix is overwritten in place
867
+ ExclusiveScan(thread_prefix, thread_prefix, initial_value, scan_op);
868
+
869
+ // Step 3: exclusive scan over this thread's items, seeded with the prefix obtained in step 2
870
+ detail::ThreadScanExclusive(input, output, scan_op, thread_prefix);
871
+ }
872
+
873
+ //! @rst
874
+ //! Computes an exclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
875
+ //! Each thread contributes an array of consecutive input elements.
876
+ //! Also provides every thread with the block-wide ``block_aggregate`` of all inputs.
877
+ //!
878
+ //! - Supports non-commutative scan operators.
879
+ //! - @blocked
880
+ //! - @granularity
881
+ //! - @smemreuse
882
+ //!
883
+ //! Snippet
884
+ //! +++++++
885
+ //!
886
+ //! The code snippet below illustrates an exclusive prefix max scan of 512 integer items that are partitioned in
887
+ //! a :ref:`blocked arrangement <flexible-data-arrangement>` across 128 threads where each thread owns
888
+ //! 4 consecutive items.
889
+ //!
890
+ //! .. code-block:: c++
891
+ //!
892
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
893
+ //!
894
+ //! __global__ void ExampleKernel(...)
895
+ //! {
896
+ //! // Specialize BlockScan for a 1D block of 128 threads of type int
897
+ //! using BlockScan = cub::BlockScan<int, 128>;
898
+ //!
899
+ //! // Allocate shared memory for BlockScan
900
+ //! __shared__ typename BlockScan::TempStorage temp_storage;
901
+ //!
902
+ //! // Obtain a segment of consecutive items that are blocked across threads
903
+ //! int thread_data[4];
904
+ //! ...
905
+ //!
906
+ //! // Collectively compute the block-wide exclusive prefix max scan
907
+ //! int block_aggregate;
908
+ //! BlockScan(temp_storage).ExclusiveScan(
909
+ //! thread_data, thread_data, INT_MIN, cuda::maximum<>{}, block_aggregate);
910
+ //!
911
+ //! Suppose the set of input ``thread_data`` across the block of threads is
912
+ //! ``{ [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }``.
913
+ //! The corresponding output ``thread_data`` in those threads will be
914
+ //! ``{ [INT_MIN,0,0,2], [2,4,4,6], ..., [506,508,508,510] }``.
915
+ //! Furthermore the value ``510`` will be stored in ``block_aggregate`` for all threads.
916
+ //!
917
+ //! .. note::
918
+ //!
919
+ //! ``initial_value`` is not applied to the block-wide aggregate.
920
+ //!
921
+ //! @endrst
922
+ //!
923
+ //! @tparam ITEMS_PER_THREAD
924
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
925
+ //!
926
+ //! @tparam ScanOp
927
+ //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
928
+ //!
929
+ //! @param[in] input
930
+ //! Calling thread's input items
931
+ //!
932
+ //! @param[out] output
933
+ //! Calling thread's output items (may be aliased to `input`)
934
+ //!
935
+ //! @param[in] initial_value
936
+ //! @rst
937
+ //! Initial value to seed the exclusive scan (and is assigned to `output[0]` in *thread*\ :sub:`0`). It is not taken
938
+ //! into account for ``block_aggregate``.
939
+ //! @endrst
940
+ //!
941
+ //! @param[in] scan_op
942
+ //! Binary scan functor
943
+ //!
944
+ //! @param[out] block_aggregate
945
+ //! block-wide aggregate reduction of input items
946
+ template <int ITEMS_PER_THREAD, typename ScanOp>
947
+ _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveScan(
948
+ T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], T initial_value, ScanOp scan_op, T& block_aggregate)
949
+ {
950
+ // Reduce consecutive thread items in registers
951
+ T thread_prefix = cub::ThreadReduce(input, scan_op);
952
+
953
+ // Exclusive thread block-scan
954
+ ExclusiveScan(thread_prefix, thread_prefix, initial_value, scan_op, block_aggregate);
955
+
956
+ // Exclusive scan in registers with prefix as seed
957
+ detail::ThreadScanExclusive(input, output, scan_op, thread_prefix);
958
+ }
959
+
960
+ //! @rst
961
+ //! Computes an exclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
962
+ //! Each thread contributes an array of consecutive input elements.
963
+ //! The call-back functor ``block_prefix_callback_op`` is invoked by the first warp in the block, and the value
964
+ //! returned by *lane*\ :sub:`0` in that warp is used as the "seed" value that logically prefixes the thread
965
+ //! block's scan inputs.
966
+ //!
967
+ //! - The ``block_prefix_callback_op`` functor must implement a member function
968
+ //! ``T operator()(T block_aggregate)``. The functor will be invoked by the
969
+ //! first warp of threads in the block, however only the return value from
970
+ //! *lane*\ :sub:`0` is applied as the block-wide prefix. Can be stateful.
971
+ //! - Supports non-commutative scan operators.
972
+ //! - @blocked
973
+ //! - @granularity
974
+ //! - @smemreuse
975
+ //!
976
+ //! Snippet
977
+ //! +++++++
978
+ //!
979
+ //! The code snippet below illustrates a single thread block that progressively
980
+ //! computes an exclusive prefix max scan over multiple "tiles" of input using a
981
+ //! prefix functor to maintain a running total between block-wide scans. Each tile consists
982
+ //! of 512 integer items that are partitioned in a blocked arrangement across 128 threads, 4 consecutive items per thread.
983
+ //!
984
+ //! .. literalinclude:: ../../examples/block/example_block_scan.cu
985
+ //! :language: c++
986
+ //! :dedent:
987
+ //! :start-after: example-begin block-prefix-callback-max-op
988
+ //! :end-before: example-end block-prefix-callback-max-op
989
+ //!
990
+ //! .. literalinclude:: ../../examples/block/example_block_scan.cu
991
+ //! :language: c++
992
+ //! :dedent:
993
+ //! :start-after: example-begin exclusive-scan-prefix-callback
994
+ //! :end-before: example-end exclusive-scan-prefix-callback
995
+ //!
996
+ //! Suppose the input ``d_data`` is ``0, -1, 2, -3, 4, -5, ...``.
997
+ //! The corresponding output for the first segment will be
998
+ //! ``INT_MIN, 0, 0, 2, 2, 4, ..., 508, 510``.
999
+ //! The output for the second segment will be
1000
+ //! ``510, 512, 512, 514, 514, 516, ..., 1020, 1022``.
1001
+ //!
1002
+ //! @endrst
1003
+ //!
1004
+ //! @tparam ITEMS_PER_THREAD
1005
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
1006
+ //!
1007
+ //! @tparam ScanOp
1008
+ //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
1009
+ //!
1010
+ //! @tparam BlockPrefixCallbackOp
1011
+ //! **[inferred]** Call-back functor type having member `T operator()(T block_aggregate)`
1012
+ //!
1013
+ //! @param[in] input
1014
+ //! Calling thread's input items
1015
+ //!
1016
+ //! @param[out] output
1017
+ //! Calling thread's output items (may be aliased to `input`)
1018
+ //!
1019
+ //! @param[in] scan_op
1020
+ //! Binary scan functor
1021
+ //!
1022
+ //! @param[in,out] block_prefix_callback_op
1023
+ //! @rst
1024
+ //! *warp*\ :sub:`0` only call-back functor for specifying a block-wide prefix to be applied to
1025
+ //! the logical input sequence.
1026
+ //! @endrst
1027
+ template <int ITEMS_PER_THREAD, typename ScanOp, typename BlockPrefixCallbackOp>
1028
+ _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveScan(
1029
+ T (&input)[ITEMS_PER_THREAD],
1030
+ T (&output)[ITEMS_PER_THREAD],
1031
+ ScanOp scan_op,
1032
+ BlockPrefixCallbackOp& block_prefix_callback_op)
1033
+ {
1034
+ // Reduce consecutive thread items in registers
1035
+ T thread_prefix = cub::ThreadReduce(input, scan_op);
1036
+
1037
+ // Exclusive thread block-scan
1038
+ ExclusiveScan(thread_prefix, thread_prefix, scan_op, block_prefix_callback_op);
1039
+
1040
+ // Exclusive scan in registers with prefix as seed
1041
+ detail::ThreadScanExclusive(input, output, scan_op, thread_prefix);
1042
+ }
1043
+
1044
+ //! @} end member group
1045
+ #ifndef _CCCL_DOXYGEN_INVOKED // Do not document no-initial-value scans
1046
+
1047
+ //! @name Exclusive prefix scan operations (no initial value, single datum per thread)
1048
+ //! @{
1049
+
1050
+ //! @rst
1051
+ //! Computes an exclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
1052
+ //! Each thread contributes one input element.
1053
+ //! With no initial value, the output computed for *thread*\ :sub:`0` is undefined.
1054
+ //!
1055
+ //! - Supports non-commutative scan operators.
1056
+ //! - @rowmajor
1057
+ //! - @smemreuse
1058
+ //!
1059
+ //! @endrst
1060
+ //!
1061
+ //! @tparam ScanOp
1062
+ //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
1063
+ //!
1064
+ //! @param[in] input
1065
+ //! Calling thread's input item
1066
+ //!
1067
+ //! @param[out] output
1068
+ //! Calling thread's output item (may be aliased to `input`)
1069
+ //!
1070
+ //! @param[in] scan_op
1071
+ //! Binary scan functor
1072
+ template <typename ScanOp>
1073
+ _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveScan(T input, T& output, ScanOp scan_op)
1074
+ {
1075
+ InternalBlockScan(temp_storage).ExclusiveScan(input, output, scan_op);
1076
+ }
1077
+
1078
+ //! @rst
1079
+ //! Computes an exclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
1080
+ //! Each thread contributes one input element. Also provides every thread with the block-wide
1081
+ //! ``block_aggregate`` of all inputs. With no initial value, the output computed for
1082
+ //! *thread*\ :sub:`0` is undefined.
1083
+ //!
1084
+ //! - Supports non-commutative scan operators.
1085
+ //! - @rowmajor
1086
+ //! - @smemreuse
1087
+ //!
1088
+ //! @endrst
1089
+ //!
1090
+ //! @tparam ScanOp
1091
+ //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
1092
+ //!
1093
+ //! @param[in] input
1094
+ //! Calling thread's input item
1095
+ //!
1096
+ //! @param[out] output
1097
+ //! Calling thread's output item (may be aliased to `input`)
1098
+ //!
1099
+ //! @param[in] scan_op
1100
+ //! Binary scan functor
1101
+ //!
1102
+ //! @param[out] block_aggregate
1103
+ //! block-wide aggregate reduction of input items
1104
+ template <typename ScanOp>
1105
+ _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveScan(T input, T& output, ScanOp scan_op, T& block_aggregate)
1106
+ {
1107
+ InternalBlockScan(temp_storage).ExclusiveScan(input, output, scan_op, block_aggregate);
1108
+ }
1109
+
1110
+ //! @} end member group // Exclusive prefix scans (no initial value, single datum per thread)
1111
+ //! @name Exclusive prefix scan operations (no initial value, multiple data per thread)
1112
+ //! @{
1113
+
1114
+ //! @rst
1115
+ //! Computes an exclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
1116
+ //! Each thread contributes an array of consecutive input elements. With no initial value, the
1117
+ //! output computed for *thread*\ :sub:`0` is undefined.
1118
+ //!
1119
+ //! - Supports non-commutative scan operators.
1120
+ //! - @blocked
1121
+ //! - @granularity
1122
+ //! - @smemreuse
1123
+ //!
1124
+ //! @endrst
1125
+ //!
1126
+ //! @tparam ITEMS_PER_THREAD
1127
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
1128
+ //!
1129
+ //! @tparam ScanOp
1130
+ //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
1131
+ //!
1132
+ //! @param[in] input
1133
+ //! Calling thread's input items
1134
+ //!
1135
+ //! @param[out] output
1136
+ //! Calling thread's output items (may be aliased to `input`)
1137
+ //!
1138
+ //! @param[in] scan_op
1139
+ //! Binary scan functor
1140
+ template <int ITEMS_PER_THREAD, typename ScanOp>
1141
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
1142
+ ExclusiveScan(T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], ScanOp scan_op)
1143
+ {
1144
+ // Reduce consecutive thread items in registers
1145
+ T thread_partial = cub::ThreadReduce(input, scan_op);
1146
+
1147
+ // Exclusive thread block-scan
1148
+ ExclusiveScan(thread_partial, thread_partial, scan_op);
1149
+
1150
+ // Exclusive scan in registers with prefix
1151
+ detail::ThreadScanExclusive(input, output, scan_op, thread_partial, (linear_tid != 0));
1152
+ }
1153
+
1154
+ //! @rst
1155
+ //! Computes an exclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
1156
+ //! Each thread contributes an array of consecutive input elements. Also provides every thread
1157
+ //! with the block-wide ``block_aggregate`` of all inputs.
1158
+ //! With no initial value, the output computed for *thread*\ :sub:`0` is undefined.
1159
+ //!
1160
+ //! - Supports non-commutative scan operators.
1161
+ //! - @blocked
1162
+ //! - @granularity
1163
+ //! - @smemreuse
1164
+ //!
1165
+ //! @endrst
1166
+ //!
1167
+ //! @tparam ITEMS_PER_THREAD
1168
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
1169
+ //!
1170
+ //! @tparam ScanOp
1171
+ //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
1172
+ //!
1173
+ //! @param[in] input
1174
+ //! Calling thread's input items
1175
+ //!
1176
+ //! @param[out] output
1177
+ //! Calling thread's output items (may be aliased to `input`)
1178
+ //!
1179
+ //! @param[in] scan_op
1180
+ //! Binary scan functor
1181
+ //!
1182
+ //! @param[out] block_aggregate
1183
+ //! block-wide aggregate reduction of input items
1184
+ template <int ITEMS_PER_THREAD, typename ScanOp>
1185
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
1186
+ ExclusiveScan(T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], ScanOp scan_op, T& block_aggregate)
1187
+ {
1188
+ // Reduce consecutive thread items in registers
1189
+ T thread_partial = cub::ThreadReduce(input, scan_op);
1190
+
1191
+ // Exclusive thread block-scan
1192
+ ExclusiveScan(thread_partial, thread_partial, scan_op, block_aggregate);
1193
+
1194
+ // Exclusive scan in registers with prefix
1195
+ detail::ThreadScanExclusive(input, output, scan_op, thread_partial, (linear_tid != 0));
1196
+ }
1197
+
1198
+ //! @} end member group // Exclusive prefix scans (no initial value, multiple data per thread)
1199
+ #endif // _CCCL_DOXYGEN_INVOKED // Do not document no-initial-value scans
1200
+
1201
+ //! @name Inclusive prefix sum operations
1202
+ //! @{
1203
+
1204
+ //! @rst
1205
+ //! Computes an inclusive block-wide prefix scan using addition (+)
1206
+ //! as the scan operator. Each thread contributes one input element.
1207
+ //!
1208
+ //! - @rowmajor
1209
+ //! - @smemreuse
1210
+ //!
1211
+ //! Snippet
1212
+ //! +++++++
1213
+ //!
1214
+ //! The code snippet below illustrates an inclusive prefix sum of 128 integer items that
1215
+ //! are partitioned across 128 threads.
1216
+ //!
1217
+ //! .. literalinclude:: ../../examples/block/example_block_scan.cu
1218
+ //! :language: c++
1219
+ //! :dedent:
1220
+ //! :start-after: example-begin inclusive-sum-single
1221
+ //! :end-before: example-end inclusive-sum-single
1222
+ //!
1223
+ //! Suppose the set of input ``thread_data`` across the block of threads is ``1, 1, ..., 1``.
1224
+ //! The corresponding output ``thread_data`` in those threads will be ``1, 2, ..., 128``.
1225
+ //!
1226
+ //! @endrst
1227
+ //!
1228
+ //! @param[in] input
1229
+ //! Calling thread's input item
1230
+ //!
1231
+ //! @param[out] output
1232
+ //! Calling thread's output item (may be aliased to `input`)
1233
+ _CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveSum(T input, T& output)
1234
+ {
1235
+ InclusiveScan(input, output, ::cuda::std::plus<>{});
1236
+ }
1237
+
1238
+ //! @rst
1239
+ //! Computes an inclusive block-wide prefix scan using addition (+) as the scan operator.
1240
+ //! Each thread contributes one input element.
1241
+ //! Also provides every thread with the block-wide ``block_aggregate`` of all inputs.
1242
+ //!
1243
+ //! - @rowmajor
1244
+ //! - @smemreuse
1245
+ //!
1246
+ //! Snippet
1247
+ //! +++++++
1248
+ //!
1249
+ //! The code snippet below illustrates an inclusive prefix sum of 128 integer items that
1250
+ //! are partitioned across 128 threads.
1251
+ //!
1252
+ //! .. literalinclude:: ../../examples/block/example_block_scan.cu
1253
+ //! :language: c++
1254
+ //! :dedent:
1255
+ //! :start-after: example-begin inclusive-sum-single-aggregate
1256
+ //! :end-before: example-end inclusive-sum-single-aggregate
1257
+ //!
1258
+ //! Suppose the set of input ``thread_data`` across the block of threads is ``1, 1, ..., 1``.
1259
+ //! The corresponding output ``thread_data`` in those threads will be ``1, 2, ..., 128``.
1260
+ //! Furthermore the value ``128`` will be stored in ``block_aggregate`` for all threads.
1261
+ //!
1262
+ //! @endrst
1263
+ //!
1264
+ //! @param[in] input
1265
+ //! Calling thread's input item
1266
+ //!
1267
+ //! @param[out] output
1268
+ //! Calling thread's output item (may be aliased to `input`)
1269
+ //!
1270
+ //! @param[out] block_aggregate
1271
+ //! block-wide aggregate reduction of input items
1272
+ _CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveSum(T input, T& output, T& block_aggregate)
1273
+ {
1274
+ InclusiveScan(input, output, ::cuda::std::plus<>{}, block_aggregate);
1275
+ }
1276
+
1277
+ //! @rst
1278
+ //! Computes an inclusive block-wide prefix scan using addition (+) as the scan operator.
1279
+ //! Each thread contributes one input element. Instead of using 0 as the block-wide prefix, the call-back functor
1280
+ //! ``block_prefix_callback_op`` is invoked by the first warp in the block, and the value returned by
1281
+ //! *lane*\ :sub:`0` in that warp is used as the "seed" value that logically prefixes the thread block's
1282
+ //! scan inputs.
1283
+ //!
1284
+ //! - The ``block_prefix_callback_op`` functor must implement a member function
1285
+ //! ``T operator()(T block_aggregate)``. The functor will be invoked by the first warp of threads in the block,
1286
+ //! however only the return value from *lane*\ :sub:`0` is applied as the block-wide prefix. Can be stateful.
1287
+ //! - @rowmajor
1288
+ //! - @smemreuse
1289
+ //!
1290
+ //! Snippet
1291
+ //! +++++++
1292
+ //!
1293
+ //! The code snippet below illustrates a single thread block that progressively
1294
+ //! computes an inclusive prefix sum over multiple "tiles" of input using a
1295
+ //! prefix functor to maintain a running total between block-wide scans.
1296
+ //! Each tile consists of 128 integer items that are partitioned across 128 threads.
1297
+ //!
1298
+ //! .. code-block:: c++
1299
+ //!
1300
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
1301
+ //!
1302
+ //! // A stateful callback functor that maintains a running prefix to be applied
1303
+ //! // during consecutive scan operations.
1304
+ //! struct BlockPrefixCallbackOp
1305
+ //! {
1306
+ //! // Running prefix
1307
+ //! int running_total;
1308
+ //!
1309
+ //! // Constructor
1310
+ //! __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {}
1311
+ //!
1312
+ //! // Callback operator to be entered by the first warp of threads in the block.
1313
+ //! // Thread-0 is responsible for returning a value for seeding the block-wide scan.
1314
+ //! __device__ int operator()(int block_aggregate)
1315
+ //! {
1316
+ //! int old_prefix = running_total;
1317
+ //! running_total += block_aggregate;
1318
+ //! return old_prefix;
1319
+ //! }
1320
+ //! };
1321
+ //!
1322
+ //! __global__ void ExampleKernel(int *d_data, int num_items, ...)
1323
+ //! {
1324
+ //! // Specialize BlockScan for a 1D block of 128 threads
1325
+ //! using BlockScan = cub::BlockScan<int, 128>;
1326
+ //!
1327
+ //! // Allocate shared memory for BlockScan
1328
+ //! __shared__ typename BlockScan::TempStorage temp_storage;
1329
+ //!
1330
+ //! // Initialize running total
1331
+ //! BlockPrefixCallbackOp prefix_op(0);
1332
+ //!
1333
+ //! // Have the block iterate over segments of items
1334
+ //! for (int block_offset = 0; block_offset < num_items; block_offset += 128)
1335
+ //! {
1336
+ //! // Load a segment of consecutive items that are blocked across threads
1337
+ //! int thread_data = d_data[block_offset + threadIdx.x];
1338
+ //!
1339
+ //! // Collectively compute the block-wide inclusive prefix sum
1340
+ //! BlockScan(temp_storage).InclusiveSum(
1341
+ //! thread_data, thread_data, prefix_op);
1342
+ //! __syncthreads();
1343
+ //!
1344
+ //! // Store scanned items to output segment
1345
+ //! d_data[block_offset + threadIdx.x] = thread_data;
1346
+ //! }
1347
+ //!
1348
+ //! Suppose the input ``d_data`` is ``1, 1, 1, 1, 1, 1, 1, 1, ...``.
1349
+ //! The corresponding output for the first segment will be ``1, 2, ..., 128``.
1350
+ //! The output for the second segment will be ``129, 130, ..., 256``.
1351
+ //!
1352
+ //! @endrst
1353
+ //!
1354
+ //! @tparam BlockPrefixCallbackOp
1355
+ //! **[inferred]** Call-back functor type having member `T operator()(T block_aggregate)`
1356
+ //!
1357
+ //! @param[in] input
1358
+ //! Calling thread's input item
1359
+ //!
1360
+ //! @param[out] output
1361
+ //! Calling thread's output item (may be aliased to `input`)
1362
+ //!
1363
+ //! @param[in,out] block_prefix_callback_op
1364
+ //! @rst
1365
+ //! *warp*\ :sub:`0` only call-back functor for specifying a block-wide prefix to be applied
1366
+ //! to the logical input sequence.
1367
+ //! @endrst
1368
+ template <typename BlockPrefixCallbackOp>
1369
+ _CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveSum(T input, T& output, BlockPrefixCallbackOp& block_prefix_callback_op)
1370
+ {
1371
+ InclusiveScan(input, output, ::cuda::std::plus<>{}, block_prefix_callback_op);
1372
+ }
1373
+
1374
+ //! @} end member group
1375
+ //! @name Inclusive prefix sum operations (multiple data per thread)
1376
+ //! @{
1377
+
1378
+ //! @rst
1379
+ //! Computes an inclusive block-wide prefix scan using addition (+) as the scan operator.
1380
+ //! Each thread contributes an array of consecutive input elements.
1381
+ //!
1382
+ //! - @blocked
1383
+ //! - @granularity
1384
+ //! - @smemreuse
1385
+ //!
1386
+ //! Snippet
1387
+ //! +++++++
1388
+ //!
1389
+ //! The code snippet below illustrates an inclusive prefix sum of 512 integer items that
1390
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128 threads
1391
+ //! where each thread owns 4 consecutive items.
1392
+ //!
1393
+ //! .. literalinclude:: ../../examples/block/example_block_scan.cu
1394
+ //! :language: c++
1395
+ //! :dedent:
1396
+ //! :start-after: example-begin inclusive-sum-array
1397
+ //! :end-before: example-end inclusive-sum-array
1398
+ //!
1399
+ //! Suppose the set of input ``thread_data`` across the block of threads is
1400
+ //! ``{ [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }``. The corresponding output
1401
+ //! ``thread_data`` in those threads will be ``{ [1,2,3,4], [5,6,7,8], ..., [509,510,511,512] }``.
1402
+ //!
1403
+ //! @endrst
1404
+ //!
1405
+ //! @tparam ITEMS_PER_THREAD
1406
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
1407
+ //!
1408
+ //! @param[in] input
1409
+ //! Calling thread's input items
1410
+ //!
1411
+ //! @param[out] output
1412
+ //! Calling thread's output items (may be aliased to `input`)
1413
+ template <int ITEMS_PER_THREAD>
1414
+ _CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveSum(T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD])
1415
+ {
1416
+ if constexpr (ITEMS_PER_THREAD == 1)
1417
+ {
1418
+ InclusiveSum(input[0], output[0]);
1419
+ }
1420
+ else
1421
+ {
1422
+ // Reduce consecutive thread items in registers
1423
+ ::cuda::std::plus<> scan_op;
1424
+ T thread_prefix = cub::ThreadReduce(input, scan_op);
1425
+
1426
+ // Exclusive thread block-scan
1427
+ ExclusiveSum(thread_prefix, thread_prefix);
1428
+
1429
+ // Inclusive scan in registers with prefix as seed
1430
+ detail::ThreadScanInclusive(input, output, scan_op, thread_prefix, (linear_tid != 0));
1431
+ }
1432
+ }
1433
+
1434
+ //! @rst
1435
+ //! Computes an inclusive block-wide prefix scan using addition (+) as the scan operator.
1436
+ //! Each thread contributes an array of consecutive input elements.
1437
+ //! Also provides every thread with the block-wide ``block_aggregate`` of all inputs.
1438
+ //!
1439
+ //! - @blocked
1440
+ //! - @granularity
1441
+ //! - @smemreuse
1442
+ //!
1443
+ //! Snippet
1444
+ //! +++++++
1445
+ //!
1446
+ //! The code snippet below illustrates an inclusive prefix sum of 512 integer items that
1447
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128 threads
1448
+ //! where each thread owns 4 consecutive items.
1449
+ //!
1450
+ //! .. literalinclude:: ../../examples/block/example_block_scan.cu
1451
+ //! :language: c++
1452
+ //! :dedent:
1453
+ //! :start-after: example-begin inclusive-sum-array-aggregate
1454
+ //! :end-before: example-end inclusive-sum-array-aggregate
1455
+ //!
1456
+ //! Suppose the set of input ``thread_data`` across the block of threads is
1457
+ //! ``{ [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }``. The
1458
+ //! corresponding output ``thread_data`` in those threads will be
1459
+ //! ``{ [1,2,3,4], [5,6,7,8], ..., [509,510,511,512] }``.
1460
+ //! Furthermore the value ``512`` will be stored in ``block_aggregate`` for all threads.
1461
+ //!
1462
+ //! @endrst
1463
+ //!
1464
+ //! @tparam ITEMS_PER_THREAD
1465
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
1466
+ //!
1467
+ //! @param[in] input
1468
+ //! Calling thread's input items
1469
+ //!
1470
+ //! @param[out] output
1471
+ //! Calling thread's output items (may be aliased to `input`)
1472
+ //!
1473
+ //! @param[out] block_aggregate
1474
+ //! block-wide aggregate reduction of input items
1475
+ template <int ITEMS_PER_THREAD>
1476
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
1477
+ InclusiveSum(T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], T& block_aggregate)
1478
+ {
1479
+ if constexpr (ITEMS_PER_THREAD == 1)
1480
+ {
1481
+ InclusiveSum(input[0], output[0], block_aggregate);
1482
+ }
1483
+ else
1484
+ {
1485
+ // Reduce consecutive thread items in registers
1486
+ ::cuda::std::plus<> scan_op;
1487
+ T thread_prefix = cub::ThreadReduce(input, scan_op);
1488
+
1489
+ // Exclusive thread block-scan
1490
+ ExclusiveSum(thread_prefix, thread_prefix, block_aggregate);
1491
+
1492
+ // Inclusive scan in registers with prefix as seed
1493
+ detail::ThreadScanInclusive(input, output, scan_op, thread_prefix, (linear_tid != 0));
1494
+ }
1495
+ }
1496
+
1497
+ //! @rst
1498
+ //! Computes an inclusive block-wide prefix scan using addition (+) as the scan operator.
1499
+ //! Each thread contributes an array of consecutive input elements.
1500
+ //! Instead of using 0 as the block-wide prefix, the call-back functor ``block_prefix_callback_op`` is invoked by
1501
+ //! the first warp in the block, and the value returned by *lane*\ :sub:`0` in that warp is used as the "seed"
1502
+ //! value that logically prefixes the thread block's scan inputs.
1503
+ //!
1504
+ //! - The ``block_prefix_callback_op`` functor must implement a member function
1505
+ //! ``T operator()(T block_aggregate)``. The functor will be invoked by the first warp of threads in the block,
1506
+ //! however only the return value from *lane*\ :sub:`0` is applied as the block-wide prefix. Can be stateful.
1507
+ //! - @blocked
1508
+ //! - @granularity
1509
+ //! - @smemreuse
1510
+ //!
1511
+ //! Snippet
1512
+ //! +++++++
1513
+ //!
1514
+ //! The code snippet below illustrates a single thread block that progressively
1515
+ //! computes an inclusive prefix sum over multiple "tiles" of input using a
1516
+ //! prefix functor to maintain a running total between block-wide scans. Each tile consists
1517
+ //! of 512 integer items that are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>`
1518
+ //! across 128 threads where each thread owns 4 consecutive items.
1519
+ //!
1520
+ //! .. literalinclude:: ../../examples/block/example_block_scan.cu
1521
+ //! :language: c++
1522
+ //! :dedent:
1523
+ //! :start-after: example-begin block-prefix-callback-op
1524
+ //! :end-before: example-end block-prefix-callback-op
1525
+ //!
1526
+ //! .. literalinclude:: ../../examples/block/example_block_scan.cu
1527
+ //! :language: c++
1528
+ //! :dedent:
1529
+ //! :start-after: example-begin inclusive-scan-prefix-callback
1530
+ //! :end-before: example-end inclusive-scan-prefix-callback
1531
+ //!
1532
+ //! Suppose the input ``d_data`` is ``1, 1, 1, 1, 1, 1, 1, 1, ...``.
1533
+ //! The corresponding output for the first segment will be
1534
+ //! ``1, 2, 3, 4, ..., 511, 512``. The output for the second segment will be
1535
+ //! ``513, 514, 515, 516, ..., 1023, 1024``.
1536
+ //!
1537
+ //! @endrst
1538
+ //!
1539
+ //! @tparam ITEMS_PER_THREAD
1540
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
1541
+ //!
1542
+ //! @tparam BlockPrefixCallbackOp
1543
+ //! **[inferred]** Call-back functor type having member `T operator()(T block_aggregate)`
1544
+ //!
1545
+ //! @param[in] input
1546
+ //! Calling thread's input items
1547
+ //!
1548
+ //! @param[out] output
1549
+ //! Calling thread's output items (may be aliased to `input`)
1550
+ //!
1551
+ //! @param[in,out] block_prefix_callback_op
1552
+ //! @rst
1553
+ //! *warp*\ :sub:`0` only call-back functor for specifying a block-wide prefix to be applied to the
1554
+ //! logical input sequence.
1555
+ //! @endrst
1556
+ template <int ITEMS_PER_THREAD, typename BlockPrefixCallbackOp>
1557
+ _CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveSum(
1558
+ T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], BlockPrefixCallbackOp& block_prefix_callback_op)
1559
+ {
1560
+ if constexpr (ITEMS_PER_THREAD == 1)
1561
+ {
1562
+ InclusiveSum(input[0], output[0], block_prefix_callback_op);
1563
+ }
1564
+ else
1565
+ {
1566
+ // Reduce consecutive thread items in registers
1567
+ ::cuda::std::plus<> scan_op;
1568
+ T thread_prefix = cub::ThreadReduce(input, scan_op);
1569
+
1570
+ // Exclusive thread block-scan
1571
+ ExclusiveSum(thread_prefix, thread_prefix, block_prefix_callback_op);
1572
+
1573
+ // Inclusive scan in registers with prefix as seed
1574
+ detail::ThreadScanInclusive(input, output, scan_op, thread_prefix);
1575
+ }
1576
+ }
1577
+
1578
+ //! @} end member group
1579
+ //! @name Inclusive prefix scan operations
1580
+ //! @{
1581
+
1582
+ //! @rst
1583
+ //! Computes an inclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
1584
+ //! Each thread contributes one input element.
1585
+ //!
1586
+ //! - Supports non-commutative scan operators.
1587
+ //! - @rowmajor
1588
+ //! - @smemreuse
1589
+ //!
1590
+ //! Snippet
1591
+ //! +++++++
1592
+ //!
1593
+ //! The code snippet below illustrates an inclusive prefix max scan of 128 integer items that
1594
+ //! are partitioned across 128 threads.
1595
+ //!
1596
+ //! .. literalinclude:: ../../examples/block/example_block_scan.cu
1597
+ //! :language: c++
1598
+ //! :dedent:
1599
+ //! :start-after: example-begin inclusive-scan-single
1600
+ //! :end-before: example-end inclusive-scan-single
1601
+ //!
1602
+ //! Suppose the set of input ``thread_data`` across the block of threads is
1603
+ //! ``0, -1, 2, -3, ..., 126, -127``. The corresponding output ``thread_data``
1604
+ //! in those threads will be ``0, 0, 2, 2, ..., 126, 126``.
1605
+ //!
1606
+ //! @endrst
1607
+ //!
1608
+ //! @tparam ScanOp
1609
+ //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
1610
+ //!
1611
+ //! @param[in] input
1612
+ //! Calling thread's input item
1613
+ //!
1614
+ //! @param[out] output
1615
+ //! Calling thread's output item (may be aliased to `input`)
1616
+ //!
1617
+ //! @param[in] scan_op
1618
+ //! Binary scan functor
1619
+ template <typename ScanOp>
1620
+ _CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveScan(T input, T& output, ScanOp scan_op)
1621
+ {
1622
+ InternalBlockScan(temp_storage).InclusiveScan(input, output, scan_op);
1623
+ }
1624
+
1625
+ //! @rst
1626
+ //! Computes an inclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
1627
+ //! Each thread contributes one input element. Also provides every thread with the block-wide
1628
+ //! ``block_aggregate`` of all inputs.
1629
+ //!
1630
+ //! - Supports non-commutative scan operators.
1631
+ //! - @rowmajor
1632
+ //! - @smemreuse
1633
+ //!
1634
+ //! Snippet
1635
+ //! +++++++
1636
+ //!
1637
+ //! The code snippet below illustrates an inclusive prefix max scan of 128
1638
+ //! integer items that are partitioned across 128 threads.
1639
+ //!
1640
+ //! .. code-block:: c++
1641
+ //!
1642
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
1643
+ //!
1644
+ //! __global__ void ExampleKernel(...)
1645
+ //! {
1646
+ //! // Specialize BlockScan for a 1D block of 128 threads of type int
1647
+ //! using BlockScan = cub::BlockScan<int, 128>;
1648
+ //!
1649
+ //! // Allocate shared memory for BlockScan
1650
+ //! __shared__ typename BlockScan::TempStorage temp_storage;
1651
+ //!
1652
+ //! // Obtain input item for each thread
1653
+ //! int thread_data;
1654
+ //! ...
1655
+ //!
1656
+ //! // Collectively compute the block-wide inclusive prefix max scan
1657
+ //! int block_aggregate;
1658
+ //! BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cuda::maximum<>{}, block_aggregate);
1659
+ //!
1660
+ //! Suppose the set of input ``thread_data`` across the block of threads is
1661
+ //! ``0, -1, 2, -3, ..., 126, -127``. The corresponding output ``thread_data``
1662
+ //! in those threads will be ``0, 0, 2, 2, ..., 126, 126``. Furthermore the value
1663
+ //! ``126`` will be stored in ``block_aggregate`` for all threads.
1664
+ //!
1665
+ //! @endrst
1666
+ //!
1667
+ //! @tparam ScanOp
1668
+ //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
1669
+ //!
1670
+ //! @param[in] input
1671
+ //! Calling thread's input item
1672
+ //!
1673
+ //! @param[out] output
1674
+ //! Calling thread's output item (may be aliased to `input`)
1675
+ //!
1676
+ //! @param[in] scan_op
1677
+ //! Binary scan functor
1678
+ //!
1679
+ //! @param[out] block_aggregate
1680
+ //! Block-wide aggregate reduction of input items
1681
+ template <typename ScanOp>
1682
+ _CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveScan(T input, T& output, ScanOp scan_op, T& block_aggregate)
1683
+ {
1684
+ InternalBlockScan(temp_storage).InclusiveScan(input, output, scan_op, block_aggregate);
1685
+ }
1686
+
1687
+ //! @rst
1688
+ //! Computes an inclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
1689
+ //! Each thread contributes one input element. The call-back functor ``block_prefix_callback_op``
1690
+ //! is invoked by the first warp in the block, and the value returned by *lane*\ :sub:`0` in that warp is used as
1691
+ //! the "seed" value that logically prefixes the thread block's scan inputs.
1692
+ //!
1693
+ //! - The ``block_prefix_callback_op`` functor must implement a member function
1694
+ //! ``T operator()(T block_aggregate)``. The functor's input parameter is the block-wide aggregate.
1695
+ //! The functor will be invoked by the first warp of threads in the block,
1696
+ //! however only the return value from *lane*\ :sub:`0` is applied
1697
+ //! as the block-wide prefix. Can be stateful.
1698
+ //! - Supports non-commutative scan operators.
1699
+ //! - @rowmajor
1700
+ //! - @smemreuse
1701
+ //!
1702
+ //! Snippet
1703
+ //! +++++++
1704
+ //!
1705
+ //! The code snippet below illustrates a single thread block that progressively
1706
+ //! computes an inclusive prefix max scan over multiple "tiles" of input using a
1707
+ //! prefix functor to maintain a running total between block-wide scans. Each tile consists
1708
+ //! of 128 integer items that are partitioned across 128 threads.
1709
+ //!
1710
+ //! .. literalinclude:: ../../examples/block/example_block_scan.cu
1711
+ //! :language: c++
1712
+ //! :dedent:
1713
+ //! :start-after: example-begin block-prefix-callback-max-op
1714
+ //! :end-before: example-end block-prefix-callback-max-op
1715
+ //!
1716
+ //! .. literalinclude:: ../../examples/block/example_block_scan.cu
1717
+ //! :language: c++
1718
+ //! :dedent:
1719
+ //! :start-after: example-begin inclusive-scan-prefix-callback-max
1720
+ //! :end-before: example-end inclusive-scan-prefix-callback-max
1721
+ //!
1722
+ //! Suppose the input ``d_data`` is ``0, -1, 2, -3, 4, -5, ...``.
1723
+ //! The corresponding output for the first segment will be
1724
+ //! ``0, 0, 2, 2, ..., 126, 126``. The output for the second segment
1725
+ //! will be ``128, 128, 130, 130, ..., 254, 254``.
1726
+ //!
1727
+ //! @endrst
1728
+ //!
1729
+ //! @tparam ScanOp
1730
+ //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
1731
+ //!
1732
+ //! @tparam BlockPrefixCallbackOp
1733
+ //! **[inferred]** Call-back functor type having member `T operator()(T block_aggregate)`
1734
+ //!
1735
+ //! @param[in] input
1736
+ //! Calling thread's input item
1737
+ //!
1738
+ //! @param[out] output
1739
+ //! Calling thread's output item (may be aliased to `input`)
1740
+ //!
1741
+ //! @param[in] scan_op
1742
+ //! Binary scan functor
1743
+ //!
1744
+ //! @param[in,out] block_prefix_callback_op
1745
+ //! @rst
1746
+ //! *warp*\ :sub:`0` only call-back functor for specifying a block-wide prefix to be applied to
1747
+ //! the logical input sequence.
1748
+ //! @endrst
1749
+ template <typename ScanOp, typename BlockPrefixCallbackOp>
1750
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
1751
+ InclusiveScan(T input, T& output, ScanOp scan_op, BlockPrefixCallbackOp& block_prefix_callback_op)
1752
+ {
1753
+ InternalBlockScan(temp_storage).InclusiveScan(input, output, scan_op, block_prefix_callback_op);
1754
+ }
1755
+
1756
+ //! @} end member group
1757
+ //! @name Inclusive prefix scan operations (multiple data per thread)
1758
+ //! @{
1759
+
1760
+ //! @rst
1761
+ //! Computes an inclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
1762
+ //! Each thread contributes an array of consecutive input elements.
1763
+ //!
1764
+ //! - Supports non-commutative scan operators.
1765
+ //! - @blocked
1766
+ //! - @granularity
1767
+ //! - @smemreuse
1768
+ //!
1769
+ //! Snippet
1770
+ //! +++++++
1771
+ //!
1772
+ //! The code snippet below illustrates an inclusive prefix max scan of 512 integer items that
1773
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128 threads
1774
+ //! where each thread owns 4 consecutive items.
1775
+ //!
1776
+ //! .. literalinclude:: ../../examples/block/example_block_scan.cu
1777
+ //! :language: c++
1778
+ //! :dedent:
1779
+ //! :start-after: example-begin inclusive-scan-array
1780
+ //! :end-before: example-end inclusive-scan-array
1781
+ //!
1782
+ //! Suppose the set of input ``thread_data`` across the block of threads is
1783
+ //! ``{ [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }``.
1784
+ //! The corresponding output ``thread_data`` in those threads will be
1785
+ //! ``{ [0,0,2,2], [4,4,6,6], ..., [508,508,510,510] }``.
1786
+ //!
1787
+ //! @endrst
1788
+ //!
1789
+ //! @tparam ITEMS_PER_THREAD
1790
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
1791
+ //!
1792
+ //! @tparam ScanOp
1793
+ //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
1794
+ //!
1795
+ //! @param[in] input
1796
+ //! Calling thread's input items
1797
+ //!
1798
+ //! @param[out] output
1799
+ //! Calling thread's output items (may be aliased to `input`)
1800
+ //!
1801
+ //! @param[in] scan_op
1802
+ //! Binary scan functor
1803
+ template <int ITEMS_PER_THREAD, typename ScanOp>
1804
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
1805
+ InclusiveScan(T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], ScanOp scan_op)
1806
+ {
1807
+ if constexpr (ITEMS_PER_THREAD == 1)
1808
+ {
1809
+ InclusiveScan(input[0], output[0], scan_op);
1810
+ }
1811
+ else
1812
+ {
1813
+ // Reduce consecutive thread items in registers
1814
+ T thread_prefix = cub::ThreadReduce(input, scan_op);
1815
+
1816
+ // Exclusive thread block-scan
1817
+ ExclusiveScan(thread_prefix, thread_prefix, scan_op);
1818
+
1819
+ // Inclusive scan in registers with prefix as seed (first thread does not seed)
1820
+ detail::ThreadScanInclusive(input, output, scan_op, thread_prefix, (linear_tid != 0));
1821
+ }
1822
+ }
1823
+
1824
+ //! @rst
1825
+ //! Computes an inclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
1826
+ //! Each thread contributes an array of consecutive input elements.
1827
+ //!
1828
+ //! - Supports non-commutative scan operators.
1829
+ //! - @blocked
1830
+ //! - @granularity
1831
+ //! - @smemreuse
1832
+ //!
1833
+ //! Snippet
1834
+ //! +++++++
1835
+ //!
1836
+ //! The code snippet below illustrates an inclusive prefix max scan of 128 integer items that
1837
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 64 threads
1838
+ //! where each thread owns 2 consecutive items.
1839
+ //!
1840
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_scan_api.cu
1841
+ //! :language: c++
1842
+ //! :dedent:
1843
+ //! :start-after: example-begin inclusive-scan-array-init-value
1844
+ //! :end-before: example-end inclusive-scan-array-init-value
1845
+ //!
1846
+ //!
1847
+ //! @endrst
1848
+ //!
1849
+ //! @tparam ITEMS_PER_THREAD
1850
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
1851
+ //!
1852
+ //! @tparam ScanOp
1853
+ //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
1854
+ //!
1855
+ //! @param[in] input
1856
+ //! Calling thread's input items
1857
+ //!
1858
+ //! @param[out] output
1859
+ //! Calling thread's output items (may be aliased to `input`)
1860
+ //!
1861
+ //! @param[in] initial_value
1862
+ //! Initial value to seed the inclusive scan (uniform across block)
1863
+ //!
1864
+ //! @param[in] scan_op
1865
+ //! Binary scan functor
1866
+ template <int ITEMS_PER_THREAD, typename ScanOp>
1867
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
1868
+ InclusiveScan(T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], T initial_value, ScanOp scan_op)
1869
+ {
1870
+ // Reduce consecutive thread items in registers
1871
+ T thread_prefix = cub::ThreadReduce(input, scan_op);
1872
+
1873
+ // Exclusive thread block-scan
1874
+ ExclusiveScan(thread_prefix, thread_prefix, initial_value, scan_op);
1875
+
1876
+ // Exclusive scan in registers with prefix as seed
1877
+ detail::ThreadScanInclusive(input, output, scan_op, thread_prefix);
1878
+ }
1879
+
1880
+ //! @rst
1881
+ //! Computes an inclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
1882
+ //! Each thread contributes an array of consecutive input elements. Also provides every thread
1883
+ //! with the block-wide ``block_aggregate`` of all inputs.
1884
+ //!
1885
+ //! - Supports non-commutative scan operators.
1886
+ //! - @blocked
1887
+ //! - @granularity
1888
+ //! - @smemreuse
1889
+ //!
1890
+ //! Snippet
1891
+ //! +++++++
1892
+ //!
1893
+ //! The code snippet below illustrates an inclusive prefix max scan of 512 integer items that
1894
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128 threads
1895
+ //! where each thread owns 4 consecutive items.
1896
+ //!
1897
+ //! .. code-block:: c++
1898
+ //!
1899
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
1900
+ //!
1901
+ //! __global__ void ExampleKernel(...)
1902
+ //! {
1903
+ //! // Specialize BlockScan for a 1D block of 128 threads of type int
1904
+ //! using BlockScan = cub::BlockScan<int, 128>;
1905
+ //!
1906
+ //! // Allocate shared memory for BlockScan
1907
+ //! __shared__ typename BlockScan::TempStorage temp_storage;
1908
+ //!
1909
+ //! // Obtain a segment of consecutive items that are blocked across threads
1910
+ //! int thread_data[4];
1911
+ //! ...
1912
+ //!
1913
+ //! // Collectively compute the block-wide inclusive prefix max scan
1914
+ //! int block_aggregate;
1915
+ //! BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cuda::maximum<>{}, block_aggregate);
1916
+ //!
1917
+ //! Suppose the set of input ``thread_data`` across the block of threads is
1918
+ //! ``{ [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }``.
1919
+ //! The corresponding output ``thread_data`` in those threads will be
1920
+ //! ``{ [0,0,2,2], [4,4,6,6], ..., [508,508,510,510] }``.
1921
+ //! Furthermore the value ``510`` will be stored in ``block_aggregate`` for all threads.
1922
+ //!
1923
+ //! @endrst
1924
+ //!
1925
+ //! @tparam ITEMS_PER_THREAD
1926
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
1927
+ //!
1928
+ //! @tparam ScanOp
1929
+ //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
1930
+ //!
1931
+ //! @param[in] input
1932
+ //! Calling thread's input items
1933
+ //!
1934
+ //! @param[out] output
1935
+ //! Calling thread's output items (may be aliased to `input`)
1936
+ //!
1937
+ //! @param[in] scan_op
1938
+ //! Binary scan functor
1939
+ //!
1940
+ //! @param[out] block_aggregate
1941
+ //! Block-wide aggregate reduction of input items
1942
+ template <int ITEMS_PER_THREAD, typename ScanOp>
1943
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
1944
+ InclusiveScan(T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], ScanOp scan_op, T& block_aggregate)
1945
+ {
1946
+ if (ITEMS_PER_THREAD == 1)
1947
+ {
1948
+ InclusiveScan(input[0], output[0], scan_op, block_aggregate);
1949
+ }
1950
+ else
1951
+ {
1952
+ // Reduce consecutive thread items in registers
1953
+ T thread_prefix = cub::ThreadReduce(input, scan_op);
1954
+
1955
+ // Exclusive thread block-scan (with no initial value)
1956
+ ExclusiveScan(thread_prefix, thread_prefix, scan_op, block_aggregate);
1957
+
1958
+ // Inclusive scan in registers with prefix as seed (first thread does not seed)
1959
+ detail::ThreadScanInclusive(input, output, scan_op, thread_prefix, (linear_tid != 0));
1960
+ }
1961
+ }
1962
+
1963
+ //! @rst
1964
+ //! Computes an inclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
1965
+ //! Each thread contributes an array of consecutive input elements. Also provides every thread
1966
+ //! with the block-wide ``block_aggregate`` of all inputs.
1967
+ //!
1968
+ //! - Supports non-commutative scan operators.
1969
+ //! - @blocked
1970
+ //! - @granularity
1971
+ //! - @smemreuse
1972
+ //!
1973
+ //! Snippet
1974
+ //! +++++++
1975
+ //!
1976
+ //! The code snippet below illustrates an inclusive prefix max scan of 128 integer items that
1977
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 64 threads
1978
+ //! where each thread owns 2 consecutive items.
1979
+ //!
1980
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_scan_api.cu
1981
+ //! :language: c++
1982
+ //! :dedent:
1983
+ //! :start-after: example-begin inclusive-scan-array-aggregate-init-value
1984
+ //! :end-before: example-end inclusive-scan-array-aggregate-init-value
1985
+ //!
1986
+ //! The value ``126`` will be stored in ``block_aggregate`` for all threads.
1987
+ //!
1988
+ //! .. note::
1989
+ //!
1990
+ //! ``initial_value`` is not applied to the block-wide aggregate.
1991
+ //!
1992
+ //! @endrst
1993
+ //!
1994
+ //! @tparam ITEMS_PER_THREAD
1995
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
1996
+ //!
1997
+ //! @tparam ScanOp
1998
+ //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
1999
+ //!
2000
+ //! @param[in] input
2001
+ //! Calling thread's input items
2002
+ //!
2003
+ //! @param[out] output
2004
+ //! Calling thread's output items (may be aliased to `input`)
2005
+ //!
2006
+ //! @param[in] initial_value
2007
+ //! Initial value to seed the inclusive scan (uniform across block). It is not taken
2008
+ //! into account for ``block_aggregate``.
2009
+ //!
2010
+ //! @param[in] scan_op
2011
+ //! Binary scan functor
2012
+ //!
2013
+ //! @param[out] block_aggregate
2014
+ //! Block-wide aggregate reduction of input items
2015
+ template <int ITEMS_PER_THREAD, typename ScanOp>
2016
+ _CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveScan(
2017
+ T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], T initial_value, ScanOp scan_op, T& block_aggregate)
2018
+ {
2019
+ // Reduce consecutive thread items in registers
2020
+ T thread_prefix = cub::ThreadReduce(input, scan_op);
2021
+
2022
+ // Exclusive thread block-scan
2023
+ ExclusiveScan(thread_prefix, thread_prefix, initial_value, scan_op, block_aggregate);
2024
+
2025
+ // Exclusive scan in registers with prefix as seed
2026
+ detail::ThreadScanInclusive(input, output, scan_op, thread_prefix);
2027
+ }
2028
+
2029
+ //! @rst
2030
+ //! Computes an inclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
2031
+ //! Each thread contributes an array of consecutive input elements.
2032
+ //! The call-back functor ``block_prefix_callback_op`` is invoked by the first warp in the block,
2033
+ //! and the value returned by *lane*\ :sub:`0` in that warp is used as the "seed" value that logically prefixes the
2034
+ //! thread block's scan inputs.
2035
+ //!
2036
+ //! - The ``block_prefix_callback_op`` functor must implement a member function ``T operator()(T block_aggregate)``.
2037
+ //! The functor will be invoked by the first warp of threads in the block, however only the return value
2038
+ //! from *lane*\ :sub:`0` is applied as the block-wide prefix. Can be stateful.
2039
+ //! - Supports non-commutative scan operators.
2040
+ //! - @blocked
2041
+ //! - @granularity
2042
+ //! - @smemreuse
2043
+ //!
2044
+ //! Snippet
2045
+ //! +++++++
2046
+ //!
2047
+ //! The code snippet below illustrates a single thread block that progressively
2048
+ //! computes an inclusive prefix max scan over multiple "tiles" of input using a
2049
+ //! prefix functor to maintain a running total between block-wide scans. Each tile consists
2050
+ //! of 512 integer items that are partitioned in a blocked arrangement across 128 threads (4 items per thread).
2051
+ //!
2052
+ //! .. code-block:: c++
2053
+ //!
2054
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
2055
+ //!
2056
+ //! // A stateful callback functor that maintains a running prefix to be applied
2057
+ //! // during consecutive scan operations.
2058
+ //! struct BlockPrefixCallbackOp
2059
+ //! {
2060
+ //! // Running prefix
2061
+ //! int running_total;
2062
+ //!
2063
+ //! // Constructor
2064
+ //! __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {}
2065
+ //!
2066
+ //! // Callback operator to be entered by the first warp of threads in the block.
2067
+ //! // Thread-0 is responsible for returning a value for seeding the block-wide scan.
2068
+ //! __device__ int operator()(int block_aggregate)
2069
+ //! {
2070
+ //! int old_prefix = running_total;
2071
+ //! running_total = (block_aggregate > old_prefix) ? block_aggregate : old_prefix;
2072
+ //! return old_prefix;
2073
+ //! }
2074
+ //! };
2075
+ //!
2076
+ //! __global__ void ExampleKernel(int *d_data, int num_items, ...)
2077
+ //! {
2078
+ //! // Specialize BlockLoad, BlockStore, and BlockScan for a 1D block of 128 threads, 4 ints per thread
2079
+ //! using BlockLoad = cub::BlockLoad<int, 128, 4, BLOCK_LOAD_TRANSPOSE> ;
2080
+ //! using BlockStore = cub::BlockStore<int, 128, 4, BLOCK_STORE_TRANSPOSE> ;
2081
+ //! using BlockScan = cub::BlockScan<int, 128> ;
2082
+ //!
2083
+ //! // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan
2084
+ //! __shared__ union {
2085
+ //! typename BlockLoad::TempStorage load;
2086
+ //! typename BlockScan::TempStorage scan;
2087
+ //! typename BlockStore::TempStorage store;
2088
+ //! } temp_storage;
2089
+ //!
2090
+ //! // Initialize running total
2091
+ //! BlockPrefixCallbackOp prefix_op(0);
2092
+ //!
2093
+ //! // Have the block iterate over segments of items
2094
+ //! for (int block_offset = 0; block_offset < num_items; block_offset += 128 * 4)
2095
+ //! {
2096
+ //! // Load a segment of consecutive items that are blocked across threads
2097
+ //! int thread_data[4];
2098
+ //! BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data);
2099
+ //! __syncthreads();
2100
+ //!
2101
+ //! // Collectively compute the block-wide inclusive prefix max scan
2102
+ //! BlockScan(temp_storage.scan).InclusiveScan(
2103
+ //! thread_data, thread_data, cuda::maximum<>{}, prefix_op);
2104
+ //! __syncthreads();
2105
+ //!
2106
+ //! // Store scanned items to output segment
2107
+ //! BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data);
2108
+ //! __syncthreads();
2109
+ //! }
2110
+ //!
2111
+ //! Suppose the input ``d_data`` is ``0, -1, 2, -3, 4, -5, ...``.
2112
+ //! The corresponding output for the first segment will be
2113
+ //! ``0, 0, 2, 2, 4, 4, ..., 510, 510``. The output for the second
2114
+ //! segment will be ``512, 512, 514, 514, 516, 516, ..., 1022, 1022``.
2115
+ //!
2116
+ //! @endrst
2117
+ //!
2118
+ //! @tparam ITEMS_PER_THREAD
2119
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
2120
+ //!
2121
+ //! @tparam ScanOp
2122
+ //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
2123
+ //!
2124
+ //! @tparam BlockPrefixCallbackOp
2125
+ //! **[inferred]** Call-back functor type having member `T operator()(T block_aggregate)`
2126
+ //!
2127
+ //! @param[in] input
2128
+ //! Calling thread's input items
2129
+ //!
2130
+ //! @param[out] output
2131
+ //! Calling thread's output items (may be aliased to `input`)
2132
+ //!
2133
+ //! @param[in] scan_op
2134
+ //! Binary scan functor
2135
+ //!
2136
+ //! @param[in,out] block_prefix_callback_op
2137
+ //! @rst
2138
+ //! *warp*\ :sub:`0` only call-back functor for specifying a block-wide prefix to be applied to
2139
+ //! the logical input sequence.
2140
+ //! @endrst
2141
+ template <int ITEMS_PER_THREAD, typename ScanOp, typename BlockPrefixCallbackOp>
2142
+ _CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveScan(
2143
+ T (&input)[ITEMS_PER_THREAD],
2144
+ T (&output)[ITEMS_PER_THREAD],
2145
+ ScanOp scan_op,
2146
+ BlockPrefixCallbackOp& block_prefix_callback_op)
2147
+ {
2148
+ if (ITEMS_PER_THREAD == 1)
2149
+ {
2150
+ InclusiveScan(input[0], output[0], scan_op, block_prefix_callback_op);
2151
+ }
2152
+ else
2153
+ {
2154
+ // Reduce consecutive thread items in registers
2155
+ T thread_prefix = cub::ThreadReduce(input, scan_op);
2156
+
2157
+ // Exclusive thread block-scan
2158
+ ExclusiveScan(thread_prefix, thread_prefix, scan_op, block_prefix_callback_op);
2159
+
2160
+ // Inclusive scan in registers with prefix as seed
2161
+ detail::ThreadScanInclusive(input, output, scan_op, thread_prefix);
2162
+ }
2163
+ }
2164
+
2165
+ //! @} end member group
2166
+ };
2167
+
2168
+ CUB_NAMESPACE_END