cuda-cccl 0.1.3.2.0.dev438__cp313-cp313-manylinux_2_24_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cuda-cccl might be problematic. Click here for more details.

Files changed (1962):
  1. cuda/cccl/__init__.py +27 -0
  2. cuda/cccl/_cuda_version_utils.py +24 -0
  3. cuda/cccl/cooperative/__init__.py +3 -0
  4. cuda/cccl/cooperative/experimental/__init__.py +8 -0
  5. cuda/cccl/cooperative/experimental/_caching.py +48 -0
  6. cuda/cccl/cooperative/experimental/_common.py +275 -0
  7. cuda/cccl/cooperative/experimental/_nvrtc.py +91 -0
  8. cuda/cccl/cooperative/experimental/_scan_op.py +181 -0
  9. cuda/cccl/cooperative/experimental/_types.py +937 -0
  10. cuda/cccl/cooperative/experimental/_typing.py +107 -0
  11. cuda/cccl/cooperative/experimental/block/__init__.py +39 -0
  12. cuda/cccl/cooperative/experimental/block/_block_exchange.py +251 -0
  13. cuda/cccl/cooperative/experimental/block/_block_load_store.py +215 -0
  14. cuda/cccl/cooperative/experimental/block/_block_merge_sort.py +125 -0
  15. cuda/cccl/cooperative/experimental/block/_block_radix_sort.py +214 -0
  16. cuda/cccl/cooperative/experimental/block/_block_reduce.py +294 -0
  17. cuda/cccl/cooperative/experimental/block/_block_scan.py +983 -0
  18. cuda/cccl/cooperative/experimental/warp/__init__.py +9 -0
  19. cuda/cccl/cooperative/experimental/warp/_warp_merge_sort.py +92 -0
  20. cuda/cccl/cooperative/experimental/warp/_warp_reduce.py +153 -0
  21. cuda/cccl/cooperative/experimental/warp/_warp_scan.py +78 -0
  22. cuda/cccl/headers/__init__.py +7 -0
  23. cuda/cccl/headers/include/__init__.py +1 -0
  24. cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +262 -0
  25. cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +1185 -0
  26. cuda/cccl/headers/include/cub/agent/agent_for.cuh +84 -0
  27. cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +927 -0
  28. cuda/cccl/headers/include/cub/agent/agent_merge.cuh +232 -0
  29. cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +730 -0
  30. cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +766 -0
  31. cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +289 -0
  32. cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +706 -0
  33. cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +558 -0
  34. cuda/cccl/headers/include/cub/agent/agent_reduce.cuh +619 -0
  35. cuda/cccl/headers/include/cub/agent/agent_reduce_by_key.cuh +806 -0
  36. cuda/cccl/headers/include/cub/agent/agent_rle.cuh +1127 -0
  37. cuda/cccl/headers/include/cub/agent/agent_scan.cuh +585 -0
  38. cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +477 -0
  39. cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +292 -0
  40. cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +1120 -0
  41. cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +341 -0
  42. cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +609 -0
  43. cuda/cccl/headers/include/cub/agent/agent_topk.cuh +764 -0
  44. cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +614 -0
  45. cuda/cccl/headers/include/cub/agent/single_pass_scan_operators.cuh +1424 -0
  46. cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +965 -0
  47. cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +1217 -0
  48. cuda/cccl/headers/include/cub/block/block_exchange.cuh +1308 -0
  49. cuda/cccl/headers/include/cub/block/block_histogram.cuh +420 -0
  50. cuda/cccl/headers/include/cub/block/block_load.cuh +1260 -0
  51. cuda/cccl/headers/include/cub/block/block_merge_sort.cuh +800 -0
  52. cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +1220 -0
  53. cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +2194 -0
  54. cuda/cccl/headers/include/cub/block/block_raking_layout.cuh +150 -0
  55. cuda/cccl/headers/include/cub/block/block_reduce.cuh +666 -0
  56. cuda/cccl/headers/include/cub/block/block_run_length_decode.cuh +434 -0
  57. cuda/cccl/headers/include/cub/block/block_scan.cuh +2584 -0
  58. cuda/cccl/headers/include/cub/block/block_shuffle.cuh +346 -0
  59. cuda/cccl/headers/include/cub/block/block_store.cuh +1246 -0
  60. cuda/cccl/headers/include/cub/block/radix_rank_sort_operations.cuh +624 -0
  61. cuda/cccl/headers/include/cub/block/specializations/block_histogram_atomic.cuh +86 -0
  62. cuda/cccl/headers/include/cub/block/specializations/block_histogram_sort.cuh +240 -0
  63. cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking.cuh +252 -0
  64. cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking_commutative_only.cuh +238 -0
  65. cuda/cccl/headers/include/cub/block/specializations/block_reduce_warp_reductions.cuh +281 -0
  66. cuda/cccl/headers/include/cub/block/specializations/block_scan_raking.cuh +790 -0
  67. cuda/cccl/headers/include/cub/block/specializations/block_scan_warp_scans.cuh +538 -0
  68. cuda/cccl/headers/include/cub/config.cuh +53 -0
  69. cuda/cccl/headers/include/cub/cub.cuh +120 -0
  70. cuda/cccl/headers/include/cub/detail/array_utils.cuh +78 -0
  71. cuda/cccl/headers/include/cub/detail/choose_offset.cuh +161 -0
  72. cuda/cccl/headers/include/cub/detail/detect_cuda_runtime.cuh +74 -0
  73. cuda/cccl/headers/include/cub/detail/device_double_buffer.cuh +96 -0
  74. cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +61 -0
  75. cuda/cccl/headers/include/cub/detail/fast_modulo_division.cuh +253 -0
  76. cuda/cccl/headers/include/cub/detail/integer_utils.cuh +88 -0
  77. cuda/cccl/headers/include/cub/detail/launcher/cuda_driver.cuh +142 -0
  78. cuda/cccl/headers/include/cub/detail/launcher/cuda_runtime.cuh +100 -0
  79. cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +118 -0
  80. cuda/cccl/headers/include/cub/detail/ptx-json/README.md +71 -0
  81. cuda/cccl/headers/include/cub/detail/ptx-json/array.h +68 -0
  82. cuda/cccl/headers/include/cub/detail/ptx-json/json.h +62 -0
  83. cuda/cccl/headers/include/cub/detail/ptx-json/object.h +100 -0
  84. cuda/cccl/headers/include/cub/detail/ptx-json/string.h +53 -0
  85. cuda/cccl/headers/include/cub/detail/ptx-json/value.h +95 -0
  86. cuda/cccl/headers/include/cub/detail/ptx-json-parser.h +63 -0
  87. cuda/cccl/headers/include/cub/detail/rfa.cuh +731 -0
  88. cuda/cccl/headers/include/cub/detail/strong_load.cuh +189 -0
  89. cuda/cccl/headers/include/cub/detail/strong_store.cuh +220 -0
  90. cuda/cccl/headers/include/cub/detail/temporary_storage.cuh +384 -0
  91. cuda/cccl/headers/include/cub/detail/type_traits.cuh +187 -0
  92. cuda/cccl/headers/include/cub/detail/uninitialized_copy.cuh +73 -0
  93. cuda/cccl/headers/include/cub/detail/unsafe_bitcast.cuh +56 -0
  94. cuda/cccl/headers/include/cub/device/device_adjacent_difference.cuh +596 -0
  95. cuda/cccl/headers/include/cub/device/device_copy.cuh +187 -0
  96. cuda/cccl/headers/include/cub/device/device_for.cuh +985 -0
  97. cuda/cccl/headers/include/cub/device/device_histogram.cuh +1509 -0
  98. cuda/cccl/headers/include/cub/device/device_memcpy.cuh +195 -0
  99. cuda/cccl/headers/include/cub/device/device_merge.cuh +203 -0
  100. cuda/cccl/headers/include/cub/device/device_merge_sort.cuh +979 -0
  101. cuda/cccl/headers/include/cub/device/device_partition.cuh +664 -0
  102. cuda/cccl/headers/include/cub/device/device_radix_sort.cuh +3437 -0
  103. cuda/cccl/headers/include/cub/device/device_reduce.cuh +2519 -0
  104. cuda/cccl/headers/include/cub/device/device_run_length_encode.cuh +370 -0
  105. cuda/cccl/headers/include/cub/device/device_scan.cuh +2205 -0
  106. cuda/cccl/headers/include/cub/device/device_segmented_radix_sort.cuh +1496 -0
  107. cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +1520 -0
  108. cuda/cccl/headers/include/cub/device/device_segmented_sort.cuh +2811 -0
  109. cuda/cccl/headers/include/cub/device/device_select.cuh +1228 -0
  110. cuda/cccl/headers/include/cub/device/device_topk.cuh +511 -0
  111. cuda/cccl/headers/include/cub/device/device_transform.cuh +637 -0
  112. cuda/cccl/headers/include/cub/device/dispatch/dispatch_adjacent_difference.cuh +315 -0
  113. cuda/cccl/headers/include/cub/device/dispatch/dispatch_advance_iterators.cuh +111 -0
  114. cuda/cccl/headers/include/cub/device/dispatch/dispatch_batch_memcpy.cuh +719 -0
  115. cuda/cccl/headers/include/cub/device/dispatch/dispatch_common.cuh +43 -0
  116. cuda/cccl/headers/include/cub/device/dispatch/dispatch_for.cuh +198 -0
  117. cuda/cccl/headers/include/cub/device/dispatch/dispatch_histogram.cuh +1046 -0
  118. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +304 -0
  119. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge_sort.cuh +474 -0
  120. cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +1753 -0
  121. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +1327 -0
  122. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_by_key.cuh +655 -0
  123. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +536 -0
  124. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +314 -0
  125. cuda/cccl/headers/include/cub/device/dispatch/dispatch_rle.cuh +615 -0
  126. cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan.cuh +500 -0
  127. cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan_by_key.cuh +602 -0
  128. cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +917 -0
  129. cuda/cccl/headers/include/cub/device/dispatch/dispatch_select_if.cuh +842 -0
  130. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +342 -0
  131. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +441 -0
  132. cuda/cccl/headers/include/cub/device/dispatch/dispatch_three_way_partition.cuh +389 -0
  133. cuda/cccl/headers/include/cub/device/dispatch/dispatch_topk.cuh +629 -0
  134. cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +561 -0
  135. cuda/cccl/headers/include/cub/device/dispatch/dispatch_unique_by_key.cuh +545 -0
  136. cuda/cccl/headers/include/cub/device/dispatch/kernels/for_each.cuh +226 -0
  137. cuda/cccl/headers/include/cub/device/dispatch/kernels/histogram.cuh +505 -0
  138. cuda/cccl/headers/include/cub/device/dispatch/kernels/merge_sort.cuh +334 -0
  139. cuda/cccl/headers/include/cub/device/dispatch/kernels/radix_sort.cuh +803 -0
  140. cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +578 -0
  141. cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +192 -0
  142. cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +324 -0
  143. cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_sort.cuh +475 -0
  144. cuda/cccl/headers/include/cub/device/dispatch/kernels/three_way_partition.cuh +201 -0
  145. cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +1009 -0
  146. cuda/cccl/headers/include/cub/device/dispatch/kernels/unique_by_key.cuh +176 -0
  147. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +70 -0
  148. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +121 -0
  149. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +63 -0
  150. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +278 -0
  151. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +79 -0
  152. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +118 -0
  153. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +1068 -0
  154. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce.cuh +493 -0
  155. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +945 -0
  156. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +676 -0
  157. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +621 -0
  158. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +1013 -0
  159. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +249 -0
  160. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_select_if.cuh +1588 -0
  161. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +443 -0
  162. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_topk.cuh +85 -0
  163. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +454 -0
  164. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +874 -0
  165. cuda/cccl/headers/include/cub/grid/grid_even_share.cuh +227 -0
  166. cuda/cccl/headers/include/cub/grid/grid_mapping.cuh +106 -0
  167. cuda/cccl/headers/include/cub/grid/grid_queue.cuh +202 -0
  168. cuda/cccl/headers/include/cub/iterator/arg_index_input_iterator.cuh +254 -0
  169. cuda/cccl/headers/include/cub/iterator/cache_modified_input_iterator.cuh +259 -0
  170. cuda/cccl/headers/include/cub/iterator/cache_modified_output_iterator.cuh +250 -0
  171. cuda/cccl/headers/include/cub/iterator/tex_obj_input_iterator.cuh +320 -0
  172. cuda/cccl/headers/include/cub/thread/thread_load.cuh +349 -0
  173. cuda/cccl/headers/include/cub/thread/thread_operators.cuh +688 -0
  174. cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +541 -0
  175. cuda/cccl/headers/include/cub/thread/thread_scan.cuh +498 -0
  176. cuda/cccl/headers/include/cub/thread/thread_search.cuh +199 -0
  177. cuda/cccl/headers/include/cub/thread/thread_simd.cuh +458 -0
  178. cuda/cccl/headers/include/cub/thread/thread_sort.cuh +102 -0
  179. cuda/cccl/headers/include/cub/thread/thread_store.cuh +365 -0
  180. cuda/cccl/headers/include/cub/util_allocator.cuh +921 -0
  181. cuda/cccl/headers/include/cub/util_arch.cuh +167 -0
  182. cuda/cccl/headers/include/cub/util_cpp_dialect.cuh +95 -0
  183. cuda/cccl/headers/include/cub/util_debug.cuh +207 -0
  184. cuda/cccl/headers/include/cub/util_device.cuh +784 -0
  185. cuda/cccl/headers/include/cub/util_macro.cuh +97 -0
  186. cuda/cccl/headers/include/cub/util_math.cuh +118 -0
  187. cuda/cccl/headers/include/cub/util_namespace.cuh +176 -0
  188. cuda/cccl/headers/include/cub/util_policy_wrapper_t.cuh +55 -0
  189. cuda/cccl/headers/include/cub/util_ptx.cuh +513 -0
  190. cuda/cccl/headers/include/cub/util_temporary_storage.cuh +122 -0
  191. cuda/cccl/headers/include/cub/util_type.cuh +1120 -0
  192. cuda/cccl/headers/include/cub/util_vsmem.cuh +253 -0
  193. cuda/cccl/headers/include/cub/version.cuh +89 -0
  194. cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_shfl.cuh +329 -0
  195. cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_smem.cuh +177 -0
  196. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +736 -0
  197. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +407 -0
  198. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +952 -0
  199. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_smem.cuh +715 -0
  200. cuda/cccl/headers/include/cub/warp/warp_exchange.cuh +405 -0
  201. cuda/cccl/headers/include/cub/warp/warp_load.cuh +614 -0
  202. cuda/cccl/headers/include/cub/warp/warp_merge_sort.cuh +169 -0
  203. cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +824 -0
  204. cuda/cccl/headers/include/cub/warp/warp_scan.cuh +1886 -0
  205. cuda/cccl/headers/include/cub/warp/warp_store.cuh +520 -0
  206. cuda/cccl/headers/include/cub/warp/warp_utils.cuh +61 -0
  207. cuda/cccl/headers/include/cuda/__algorithm/common.h +68 -0
  208. cuda/cccl/headers/include/cuda/__algorithm/copy.h +196 -0
  209. cuda/cccl/headers/include/cuda/__algorithm/fill.h +107 -0
  210. cuda/cccl/headers/include/cuda/__annotated_ptr/access_property.h +165 -0
  211. cuda/cccl/headers/include/cuda/__annotated_ptr/access_property_encoding.h +172 -0
  212. cuda/cccl/headers/include/cuda/__annotated_ptr/annotated_ptr.h +217 -0
  213. cuda/cccl/headers/include/cuda/__annotated_ptr/annotated_ptr_base.h +100 -0
  214. cuda/cccl/headers/include/cuda/__annotated_ptr/apply_access_property.h +83 -0
  215. cuda/cccl/headers/include/cuda/__annotated_ptr/associate_access_property.h +128 -0
  216. cuda/cccl/headers/include/cuda/__annotated_ptr/createpolicy.h +210 -0
  217. cuda/cccl/headers/include/cuda/__atomic/atomic.h +145 -0
  218. cuda/cccl/headers/include/cuda/__barrier/async_contract_fulfillment.h +39 -0
  219. cuda/cccl/headers/include/cuda/__barrier/barrier.h +65 -0
  220. cuda/cccl/headers/include/cuda/__barrier/barrier_arrive_tx.h +102 -0
  221. cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +468 -0
  222. cuda/cccl/headers/include/cuda/__barrier/barrier_expect_tx.h +74 -0
  223. cuda/cccl/headers/include/cuda/__barrier/barrier_native_handle.h +45 -0
  224. cuda/cccl/headers/include/cuda/__barrier/barrier_thread_scope.h +60 -0
  225. cuda/cccl/headers/include/cuda/__bit/bit_reverse.h +171 -0
  226. cuda/cccl/headers/include/cuda/__bit/bitfield.h +122 -0
  227. cuda/cccl/headers/include/cuda/__bit/bitmask.h +90 -0
  228. cuda/cccl/headers/include/cuda/__cccl_config +36 -0
  229. cuda/cccl/headers/include/cuda/__cmath/ceil_div.h +124 -0
  230. cuda/cccl/headers/include/cuda/__cmath/fast_modulo_division.h +249 -0
  231. cuda/cccl/headers/include/cuda/__cmath/ilog.h +195 -0
  232. cuda/cccl/headers/include/cuda/__cmath/ipow.h +107 -0
  233. cuda/cccl/headers/include/cuda/__cmath/isqrt.h +80 -0
  234. cuda/cccl/headers/include/cuda/__cmath/neg.h +47 -0
  235. cuda/cccl/headers/include/cuda/__cmath/pow2.h +74 -0
  236. cuda/cccl/headers/include/cuda/__cmath/round_down.h +102 -0
  237. cuda/cccl/headers/include/cuda/__cmath/round_up.h +104 -0
  238. cuda/cccl/headers/include/cuda/__cmath/uabs.h +57 -0
  239. cuda/cccl/headers/include/cuda/__complex/complex.h +238 -0
  240. cuda/cccl/headers/include/cuda/__complex/get_real_imag.h +93 -0
  241. cuda/cccl/headers/include/cuda/__complex/traits.h +64 -0
  242. cuda/cccl/headers/include/cuda/__complex_ +28 -0
  243. cuda/cccl/headers/include/cuda/__device/all_devices.h +240 -0
  244. cuda/cccl/headers/include/cuda/__device/arch_traits.h +613 -0
  245. cuda/cccl/headers/include/cuda/__device/attributes.h +721 -0
  246. cuda/cccl/headers/include/cuda/__device/device_ref.h +185 -0
  247. cuda/cccl/headers/include/cuda/__device/physical_device.h +168 -0
  248. cuda/cccl/headers/include/cuda/__driver/driver_api.h +541 -0
  249. cuda/cccl/headers/include/cuda/__event/event.h +171 -0
  250. cuda/cccl/headers/include/cuda/__event/event_ref.h +158 -0
  251. cuda/cccl/headers/include/cuda/__event/timed_event.h +118 -0
  252. cuda/cccl/headers/include/cuda/__execution/determinism.h +91 -0
  253. cuda/cccl/headers/include/cuda/__execution/output_ordering.h +89 -0
  254. cuda/cccl/headers/include/cuda/__execution/require.h +75 -0
  255. cuda/cccl/headers/include/cuda/__execution/tune.h +70 -0
  256. cuda/cccl/headers/include/cuda/__functional/address_stability.h +131 -0
  257. cuda/cccl/headers/include/cuda/__functional/for_each_canceled.h +321 -0
  258. cuda/cccl/headers/include/cuda/__functional/maximum.h +58 -0
  259. cuda/cccl/headers/include/cuda/__functional/minimum.h +58 -0
  260. cuda/cccl/headers/include/cuda/__functional/proclaim_return_type.h +108 -0
  261. cuda/cccl/headers/include/cuda/__fwd/barrier.h +38 -0
  262. cuda/cccl/headers/include/cuda/__fwd/barrier_native_handle.h +42 -0
  263. cuda/cccl/headers/include/cuda/__fwd/complex.h +48 -0
  264. cuda/cccl/headers/include/cuda/__fwd/get_stream.h +38 -0
  265. cuda/cccl/headers/include/cuda/__fwd/pipeline.h +37 -0
  266. cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +49 -0
  267. cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +300 -0
  268. cuda/cccl/headers/include/cuda/__iterator/counting_iterator.h +483 -0
  269. cuda/cccl/headers/include/cuda/__iterator/discard_iterator.h +324 -0
  270. cuda/cccl/headers/include/cuda/__iterator/permutation_iterator.h +456 -0
  271. cuda/cccl/headers/include/cuda/__iterator/shuffle_iterator.h +334 -0
  272. cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +386 -0
  273. cuda/cccl/headers/include/cuda/__iterator/tabulate_output_iterator.h +344 -0
  274. cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +498 -0
  275. cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +501 -0
  276. cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +461 -0
  277. cuda/cccl/headers/include/cuda/__iterator/zip_function.h +112 -0
  278. cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +673 -0
  279. cuda/cccl/headers/include/cuda/__latch/latch.h +44 -0
  280. cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +462 -0
  281. cuda/cccl/headers/include/cuda/__mdspan/host_device_mdspan.h +63 -0
  282. cuda/cccl/headers/include/cuda/__mdspan/restrict_accessor.h +122 -0
  283. cuda/cccl/headers/include/cuda/__mdspan/restrict_mdspan.h +51 -0
  284. cuda/cccl/headers/include/cuda/__memcpy_async/check_preconditions.h +79 -0
  285. cuda/cccl/headers/include/cuda/__memcpy_async/completion_mechanism.h +47 -0
  286. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_bulk_shared_global.h +60 -0
  287. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_fallback.h +72 -0
  288. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_shared_global.h +148 -0
  289. cuda/cccl/headers/include/cuda/__memcpy_async/dispatch_memcpy_async.h +165 -0
  290. cuda/cccl/headers/include/cuda/__memcpy_async/is_local_smem_barrier.h +53 -0
  291. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async.h +179 -0
  292. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_barrier.h +99 -0
  293. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_tx.h +104 -0
  294. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_completion.h +170 -0
  295. cuda/cccl/headers/include/cuda/__memcpy_async/try_get_barrier_handle.h +59 -0
  296. cuda/cccl/headers/include/cuda/__memory/address_space.h +227 -0
  297. cuda/cccl/headers/include/cuda/__memory/align_down.h +56 -0
  298. cuda/cccl/headers/include/cuda/__memory/align_up.h +56 -0
  299. cuda/cccl/headers/include/cuda/__memory/aligned_size.h +61 -0
  300. cuda/cccl/headers/include/cuda/__memory/check_address.h +111 -0
  301. cuda/cccl/headers/include/cuda/__memory/discard_memory.h +64 -0
  302. cuda/cccl/headers/include/cuda/__memory/get_device_address.h +58 -0
  303. cuda/cccl/headers/include/cuda/__memory/is_aligned.h +47 -0
  304. cuda/cccl/headers/include/cuda/__memory/ptr_rebind.h +75 -0
  305. cuda/cccl/headers/include/cuda/__memory_resource/get_memory_resource.h +82 -0
  306. cuda/cccl/headers/include/cuda/__memory_resource/get_property.h +153 -0
  307. cuda/cccl/headers/include/cuda/__memory_resource/properties.h +69 -0
  308. cuda/cccl/headers/include/cuda/__memory_resource/resource.h +125 -0
  309. cuda/cccl/headers/include/cuda/__memory_resource/resource_ref.h +654 -0
  310. cuda/cccl/headers/include/cuda/__numeric/add_overflow.h +306 -0
  311. cuda/cccl/headers/include/cuda/__numeric/narrow.h +108 -0
  312. cuda/cccl/headers/include/cuda/__numeric/overflow_cast.h +59 -0
  313. cuda/cccl/headers/include/cuda/__numeric/overflow_result.h +43 -0
  314. cuda/cccl/headers/include/cuda/__nvtx/nvtx.h +120 -0
  315. cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +2982 -0
  316. cuda/cccl/headers/include/cuda/__ptx/instructions/barrier_cluster.h +43 -0
  317. cuda/cccl/headers/include/cuda/__ptx/instructions/bfind.h +41 -0
  318. cuda/cccl/headers/include/cuda/__ptx/instructions/bmsk.h +41 -0
  319. cuda/cccl/headers/include/cuda/__ptx/instructions/clusterlaunchcontrol.h +41 -0
  320. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk.h +44 -0
  321. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_commit_group.h +43 -0
  322. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h +45 -0
  323. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_wait_group.h +43 -0
  324. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_mbarrier_arrive.h +42 -0
  325. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_reduce_async_bulk.h +60 -0
  326. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_reduce_async_bulk_tensor.h +43 -0
  327. cuda/cccl/headers/include/cuda/__ptx/instructions/elect_sync.h +41 -0
  328. cuda/cccl/headers/include/cuda/__ptx/instructions/exit.h +41 -0
  329. cuda/cccl/headers/include/cuda/__ptx/instructions/fence.h +49 -0
  330. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/barrier_cluster.h +115 -0
  331. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/bfind.h +190 -0
  332. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/bmsk.h +54 -0
  333. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/clusterlaunchcontrol.h +242 -0
  334. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk.h +197 -0
  335. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.h +25 -0
  336. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.h +54 -0
  337. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.h +997 -0
  338. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_gather_scatter.h +318 -0
  339. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.h +671 -0
  340. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.h +46 -0
  341. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive.h +26 -0
  342. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive_noinc.h +26 -0
  343. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.h +1470 -0
  344. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.h +132 -0
  345. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.h +132 -0
  346. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.h +601 -0
  347. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/elect_sync.h +36 -0
  348. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/exit.h +25 -0
  349. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence.h +208 -0
  350. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.h +31 -0
  351. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_alias.h +25 -0
  352. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_async.h +58 -0
  353. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_async_generic_sync_restrict.h +64 -0
  354. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.h +102 -0
  355. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_sync_restrict.h +64 -0
  356. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/get_sreg.h +949 -0
  357. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/getctarank.h +32 -0
  358. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/ld.h +5542 -0
  359. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive.h +399 -0
  360. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.h +184 -0
  361. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.h +34 -0
  362. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_expect_tx.h +102 -0
  363. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_init.h +27 -0
  364. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.h +143 -0
  365. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.h +144 -0
  366. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.h +286 -0
  367. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.h +290 -0
  368. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_ld_reduce.h +2202 -0
  369. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_red.h +1362 -0
  370. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_st.h +236 -0
  371. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/prmt.h +230 -0
  372. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/red_async.h +460 -0
  373. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/shl.h +96 -0
  374. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/shr.h +168 -0
  375. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st.h +1490 -0
  376. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st_async.h +123 -0
  377. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st_bulk.h +31 -0
  378. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_alloc.h +132 -0
  379. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_commit.h +99 -0
  380. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_cp.h +765 -0
  381. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_fence.h +58 -0
  382. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_ld.h +4927 -0
  383. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_mma.h +4291 -0
  384. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_mma_ws.h +7110 -0
  385. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_shift.h +42 -0
  386. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_st.h +5063 -0
  387. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_wait.h +56 -0
  388. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.h +71 -0
  389. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tensormap_replace.h +1030 -0
  390. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/trap.h +25 -0
  391. cuda/cccl/headers/include/cuda/__ptx/instructions/get_sreg.h +43 -0
  392. cuda/cccl/headers/include/cuda/__ptx/instructions/getctarank.h +43 -0
  393. cuda/cccl/headers/include/cuda/__ptx/instructions/ld.h +41 -0
  394. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_arrive.h +45 -0
  395. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_expect_tx.h +41 -0
  396. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_init.h +43 -0
  397. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_wait.h +46 -0
  398. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_ld_reduce.h +41 -0
  399. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_red.h +41 -0
  400. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_st.h +41 -0
  401. cuda/cccl/headers/include/cuda/__ptx/instructions/prmt.h +41 -0
  402. cuda/cccl/headers/include/cuda/__ptx/instructions/red_async.h +43 -0
  403. cuda/cccl/headers/include/cuda/__ptx/instructions/shfl_sync.h +244 -0
  404. cuda/cccl/headers/include/cuda/__ptx/instructions/shl.h +41 -0
  405. cuda/cccl/headers/include/cuda/__ptx/instructions/shr.h +41 -0
  406. cuda/cccl/headers/include/cuda/__ptx/instructions/st.h +41 -0
  407. cuda/cccl/headers/include/cuda/__ptx/instructions/st_async.h +43 -0
  408. cuda/cccl/headers/include/cuda/__ptx/instructions/st_bulk.h +41 -0
  409. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_alloc.h +41 -0
  410. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_commit.h +41 -0
  411. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_cp.h +41 -0
  412. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_fence.h +41 -0
  413. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_ld.h +41 -0
  414. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_mma.h +41 -0
  415. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_mma_ws.h +41 -0
  416. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_shift.h +41 -0
  417. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_st.h +41 -0
  418. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_wait.h +41 -0
  419. cuda/cccl/headers/include/cuda/__ptx/instructions/tensormap_cp_fenceproxy.h +43 -0
  420. cuda/cccl/headers/include/cuda/__ptx/instructions/tensormap_replace.h +43 -0
  421. cuda/cccl/headers/include/cuda/__ptx/instructions/trap.h +41 -0
  422. cuda/cccl/headers/include/cuda/__ptx/ptx_dot_variants.h +230 -0
  423. cuda/cccl/headers/include/cuda/__ptx/ptx_helper_functions.h +176 -0
  424. cuda/cccl/headers/include/cuda/__random/feistel_bijection.h +105 -0
  425. cuda/cccl/headers/include/cuda/__random/random_bijection.h +88 -0
  426. cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +97 -0
  427. cuda/cccl/headers/include/cuda/__runtime/types.h +41 -0
  428. cuda/cccl/headers/include/cuda/__semaphore/counting_semaphore.h +53 -0
  429. cuda/cccl/headers/include/cuda/__stream/get_stream.h +110 -0
  430. cuda/cccl/headers/include/cuda/__stream/stream.h +142 -0
  431. cuda/cccl/headers/include/cuda/__stream/stream_ref.h +296 -0
  432. cuda/cccl/headers/include/cuda/__type_traits/is_floating_point.h +47 -0
  433. cuda/cccl/headers/include/cuda/__type_traits/is_specialization_of.h +37 -0
  434. cuda/cccl/headers/include/cuda/__utility/__basic_any/access.h +88 -0
  435. cuda/cccl/headers/include/cuda/__utility/__basic_any/any_cast.h +83 -0
  436. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_base.h +148 -0
  437. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_from.h +96 -0
  438. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_fwd.h +128 -0
  439. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_ptr.h +304 -0
  440. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_ref.h +337 -0
  441. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_value.h +590 -0
  442. cuda/cccl/headers/include/cuda/__utility/__basic_any/conversions.h +169 -0
  443. cuda/cccl/headers/include/cuda/__utility/__basic_any/dynamic_any_cast.h +107 -0
  444. cuda/cccl/headers/include/cuda/__utility/__basic_any/interfaces.h +359 -0
  445. cuda/cccl/headers/include/cuda/__utility/__basic_any/iset.h +142 -0
  446. cuda/cccl/headers/include/cuda/__utility/__basic_any/overrides.h +64 -0
  447. cuda/cccl/headers/include/cuda/__utility/__basic_any/rtti.h +257 -0
  448. cuda/cccl/headers/include/cuda/__utility/__basic_any/semiregular.h +322 -0
  449. cuda/cccl/headers/include/cuda/__utility/__basic_any/storage.h +79 -0
  450. cuda/cccl/headers/include/cuda/__utility/__basic_any/tagged_ptr.h +58 -0
  451. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtcall.h +162 -0
  452. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_functions.h +184 -0
  453. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_ptrs.h +80 -0
  454. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_tables.h +155 -0
  455. cuda/cccl/headers/include/cuda/__utility/basic_any.h +507 -0
  456. cuda/cccl/headers/include/cuda/__utility/immovable.h +50 -0
  457. cuda/cccl/headers/include/cuda/__utility/inherit.h +36 -0
  458. cuda/cccl/headers/include/cuda/__utility/no_init.h +29 -0
  459. cuda/cccl/headers/include/cuda/__utility/static_for.h +79 -0
  460. cuda/cccl/headers/include/cuda/__warp/lane_mask.h +326 -0
  461. cuda/cccl/headers/include/cuda/__warp/warp_match_all.h +65 -0
  462. cuda/cccl/headers/include/cuda/__warp/warp_shuffle.h +251 -0
  463. cuda/cccl/headers/include/cuda/access_property +26 -0
  464. cuda/cccl/headers/include/cuda/algorithm +27 -0
  465. cuda/cccl/headers/include/cuda/annotated_ptr +29 -0
  466. cuda/cccl/headers/include/cuda/atomic +27 -0
  467. cuda/cccl/headers/include/cuda/barrier +267 -0
  468. cuda/cccl/headers/include/cuda/bit +29 -0
  469. cuda/cccl/headers/include/cuda/cmath +36 -0
  470. cuda/cccl/headers/include/cuda/devices +20 -0
  471. cuda/cccl/headers/include/cuda/discard_memory +32 -0
  472. cuda/cccl/headers/include/cuda/functional +32 -0
  473. cuda/cccl/headers/include/cuda/iterator +38 -0
  474. cuda/cccl/headers/include/cuda/latch +27 -0
  475. cuda/cccl/headers/include/cuda/mdspan +28 -0
  476. cuda/cccl/headers/include/cuda/memory +34 -0
  477. cuda/cccl/headers/include/cuda/memory_resource +35 -0
  478. cuda/cccl/headers/include/cuda/numeric +29 -0
  479. cuda/cccl/headers/include/cuda/pipeline +579 -0
  480. cuda/cccl/headers/include/cuda/ptx +128 -0
  481. cuda/cccl/headers/include/cuda/semaphore +31 -0
  482. cuda/cccl/headers/include/cuda/std/__algorithm/adjacent_find.h +59 -0
  483. cuda/cccl/headers/include/cuda/std/__algorithm/all_of.h +45 -0
  484. cuda/cccl/headers/include/cuda/std/__algorithm/any_of.h +45 -0
  485. cuda/cccl/headers/include/cuda/std/__algorithm/binary_search.h +53 -0
  486. cuda/cccl/headers/include/cuda/std/__algorithm/clamp.h +48 -0
  487. cuda/cccl/headers/include/cuda/std/__algorithm/comp.h +58 -0
  488. cuda/cccl/headers/include/cuda/std/__algorithm/comp_ref_type.h +85 -0
  489. cuda/cccl/headers/include/cuda/std/__algorithm/copy.h +142 -0
  490. cuda/cccl/headers/include/cuda/std/__algorithm/copy_backward.h +80 -0
  491. cuda/cccl/headers/include/cuda/std/__algorithm/copy_if.h +47 -0
  492. cuda/cccl/headers/include/cuda/std/__algorithm/copy_n.h +73 -0
  493. cuda/cccl/headers/include/cuda/std/__algorithm/count.h +49 -0
  494. cuda/cccl/headers/include/cuda/std/__algorithm/count_if.h +49 -0
  495. cuda/cccl/headers/include/cuda/std/__algorithm/equal.h +128 -0
  496. cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +101 -0
  497. cuda/cccl/headers/include/cuda/std/__algorithm/fill.h +58 -0
  498. cuda/cccl/headers/include/cuda/std/__algorithm/fill_n.h +51 -0
  499. cuda/cccl/headers/include/cuda/std/__algorithm/find.h +62 -0
  500. cuda/cccl/headers/include/cuda/std/__algorithm/find_end.h +225 -0
  501. cuda/cccl/headers/include/cuda/std/__algorithm/find_first_of.h +73 -0
  502. cuda/cccl/headers/include/cuda/std/__algorithm/find_if.h +46 -0
  503. cuda/cccl/headers/include/cuda/std/__algorithm/find_if_not.h +46 -0
  504. cuda/cccl/headers/include/cuda/std/__algorithm/for_each.h +42 -0
  505. cuda/cccl/headers/include/cuda/std/__algorithm/for_each_n.h +48 -0
  506. cuda/cccl/headers/include/cuda/std/__algorithm/generate.h +41 -0
  507. cuda/cccl/headers/include/cuda/std/__algorithm/generate_n.h +46 -0
  508. cuda/cccl/headers/include/cuda/std/__algorithm/half_positive.h +49 -0
  509. cuda/cccl/headers/include/cuda/std/__algorithm/in_fun_result.h +55 -0
  510. cuda/cccl/headers/include/cuda/std/__algorithm/includes.h +92 -0
  511. cuda/cccl/headers/include/cuda/std/__algorithm/is_heap.h +50 -0
  512. cuda/cccl/headers/include/cuda/std/__algorithm/is_heap_until.h +83 -0
  513. cuda/cccl/headers/include/cuda/std/__algorithm/is_partitioned.h +57 -0
  514. cuda/cccl/headers/include/cuda/std/__algorithm/is_permutation.h +252 -0
  515. cuda/cccl/headers/include/cuda/std/__algorithm/is_sorted.h +49 -0
  516. cuda/cccl/headers/include/cuda/std/__algorithm/is_sorted_until.h +68 -0
  517. cuda/cccl/headers/include/cuda/std/__algorithm/iter_swap.h +82 -0
  518. cuda/cccl/headers/include/cuda/std/__algorithm/iterator_operations.h +185 -0
  519. cuda/cccl/headers/include/cuda/std/__algorithm/lexicographical_compare.h +68 -0
  520. cuda/cccl/headers/include/cuda/std/__algorithm/lower_bound.h +82 -0
  521. cuda/cccl/headers/include/cuda/std/__algorithm/make_heap.h +70 -0
  522. cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +96 -0
  523. cuda/cccl/headers/include/cuda/std/__algorithm/max.h +62 -0
  524. cuda/cccl/headers/include/cuda/std/__algorithm/max_element.h +67 -0
  525. cuda/cccl/headers/include/cuda/std/__algorithm/merge.h +89 -0
  526. cuda/cccl/headers/include/cuda/std/__algorithm/min.h +62 -0
  527. cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +87 -0
  528. cuda/cccl/headers/include/cuda/std/__algorithm/minmax.h +66 -0
  529. cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +140 -0
  530. cuda/cccl/headers/include/cuda/std/__algorithm/mismatch.h +83 -0
  531. cuda/cccl/headers/include/cuda/std/__algorithm/move.h +86 -0
  532. cuda/cccl/headers/include/cuda/std/__algorithm/move_backward.h +84 -0
  533. cuda/cccl/headers/include/cuda/std/__algorithm/next_permutation.h +88 -0
  534. cuda/cccl/headers/include/cuda/std/__algorithm/none_of.h +45 -0
  535. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort.h +102 -0
  536. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort_copy.h +122 -0
  537. cuda/cccl/headers/include/cuda/std/__algorithm/partition.h +120 -0
  538. cuda/cccl/headers/include/cuda/std/__algorithm/partition_copy.h +59 -0
  539. cuda/cccl/headers/include/cuda/std/__algorithm/partition_point.h +61 -0
  540. cuda/cccl/headers/include/cuda/std/__algorithm/pop_heap.h +93 -0
  541. cuda/cccl/headers/include/cuda/std/__algorithm/prev_permutation.h +88 -0
  542. cuda/cccl/headers/include/cuda/std/__algorithm/push_heap.h +100 -0
  543. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_for_each.h +84 -0
  544. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_for_each_n.h +68 -0
  545. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_iterator_concept.h +65 -0
  546. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_min.h +98 -0
  547. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_min_element.h +68 -0
  548. cuda/cccl/headers/include/cuda/std/__algorithm/remove.h +55 -0
  549. cuda/cccl/headers/include/cuda/std/__algorithm/remove_copy.h +47 -0
  550. cuda/cccl/headers/include/cuda/std/__algorithm/remove_copy_if.h +47 -0
  551. cuda/cccl/headers/include/cuda/std/__algorithm/remove_if.h +56 -0
  552. cuda/cccl/headers/include/cuda/std/__algorithm/replace.h +45 -0
  553. cuda/cccl/headers/include/cuda/std/__algorithm/replace_copy.h +54 -0
  554. cuda/cccl/headers/include/cuda/std/__algorithm/replace_copy_if.h +50 -0
  555. cuda/cccl/headers/include/cuda/std/__algorithm/replace_if.h +45 -0
  556. cuda/cccl/headers/include/cuda/std/__algorithm/reverse.h +81 -0
  557. cuda/cccl/headers/include/cuda/std/__algorithm/reverse_copy.h +43 -0
  558. cuda/cccl/headers/include/cuda/std/__algorithm/rotate.h +261 -0
  559. cuda/cccl/headers/include/cuda/std/__algorithm/rotate_copy.h +40 -0
  560. cuda/cccl/headers/include/cuda/std/__algorithm/search.h +185 -0
  561. cuda/cccl/headers/include/cuda/std/__algorithm/search_n.h +163 -0
  562. cuda/cccl/headers/include/cuda/std/__algorithm/set_difference.h +95 -0
  563. cuda/cccl/headers/include/cuda/std/__algorithm/set_intersection.h +122 -0
  564. cuda/cccl/headers/include/cuda/std/__algorithm/set_symmetric_difference.h +134 -0
  565. cuda/cccl/headers/include/cuda/std/__algorithm/set_union.h +128 -0
  566. cuda/cccl/headers/include/cuda/std/__algorithm/shift_left.h +84 -0
  567. cuda/cccl/headers/include/cuda/std/__algorithm/shift_right.h +144 -0
  568. cuda/cccl/headers/include/cuda/std/__algorithm/sift_down.h +139 -0
  569. cuda/cccl/headers/include/cuda/std/__algorithm/sort_heap.h +70 -0
  570. cuda/cccl/headers/include/cuda/std/__algorithm/swap_ranges.h +78 -0
  571. cuda/cccl/headers/include/cuda/std/__algorithm/transform.h +59 -0
  572. cuda/cccl/headers/include/cuda/std/__algorithm/unique.h +76 -0
  573. cuda/cccl/headers/include/cuda/std/__algorithm/unique_copy.h +155 -0
  574. cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_iter.h +95 -0
  575. cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_range.h +126 -0
  576. cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +83 -0
  577. cuda/cccl/headers/include/cuda/std/__algorithm_ +26 -0
  578. cuda/cccl/headers/include/cuda/std/__atomic/api/common.h +192 -0
  579. cuda/cccl/headers/include/cuda/std/__atomic/api/owned.h +136 -0
  580. cuda/cccl/headers/include/cuda/std/__atomic/api/reference.h +118 -0
  581. cuda/cccl/headers/include/cuda/std/__atomic/functions/common.h +58 -0
  582. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_local.h +208 -0
  583. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_derived.h +401 -0
  584. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated.h +3971 -0
  585. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h +177 -0
  586. cuda/cccl/headers/include/cuda/std/__atomic/functions/host.h +211 -0
  587. cuda/cccl/headers/include/cuda/std/__atomic/functions.h +33 -0
  588. cuda/cccl/headers/include/cuda/std/__atomic/order.h +159 -0
  589. cuda/cccl/headers/include/cuda/std/__atomic/platform/msvc_to_builtins.h +654 -0
  590. cuda/cccl/headers/include/cuda/std/__atomic/platform.h +93 -0
  591. cuda/cccl/headers/include/cuda/std/__atomic/scopes.h +105 -0
  592. cuda/cccl/headers/include/cuda/std/__atomic/types/base.h +249 -0
  593. cuda/cccl/headers/include/cuda/std/__atomic/types/common.h +104 -0
  594. cuda/cccl/headers/include/cuda/std/__atomic/types/locked.h +225 -0
  595. cuda/cccl/headers/include/cuda/std/__atomic/types/reference.h +72 -0
  596. cuda/cccl/headers/include/cuda/std/__atomic/types/small.h +228 -0
  597. cuda/cccl/headers/include/cuda/std/__atomic/types.h +52 -0
  598. cuda/cccl/headers/include/cuda/std/__atomic/wait/notify_wait.h +95 -0
  599. cuda/cccl/headers/include/cuda/std/__atomic/wait/polling.h +65 -0
  600. cuda/cccl/headers/include/cuda/std/__barrier/barrier.h +227 -0
  601. cuda/cccl/headers/include/cuda/std/__barrier/empty_completion.h +37 -0
  602. cuda/cccl/headers/include/cuda/std/__barrier/poll_tester.h +82 -0
  603. cuda/cccl/headers/include/cuda/std/__bit/bit_cast.h +76 -0
  604. cuda/cccl/headers/include/cuda/std/__bit/byteswap.h +185 -0
  605. cuda/cccl/headers/include/cuda/std/__bit/countl.h +167 -0
  606. cuda/cccl/headers/include/cuda/std/__bit/countr.h +185 -0
  607. cuda/cccl/headers/include/cuda/std/__bit/endian.h +39 -0
  608. cuda/cccl/headers/include/cuda/std/__bit/has_single_bit.h +43 -0
  609. cuda/cccl/headers/include/cuda/std/__bit/integral.h +126 -0
  610. cuda/cccl/headers/include/cuda/std/__bit/popcount.h +154 -0
  611. cuda/cccl/headers/include/cuda/std/__bit/reference.h +1272 -0
  612. cuda/cccl/headers/include/cuda/std/__bit/rotate.h +94 -0
  613. cuda/cccl/headers/include/cuda/std/__cccl/architecture.h +78 -0
  614. cuda/cccl/headers/include/cuda/std/__cccl/assert.h +161 -0
  615. cuda/cccl/headers/include/cuda/std/__cccl/attributes.h +206 -0
  616. cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +676 -0
  617. cuda/cccl/headers/include/cuda/std/__cccl/compiler.h +217 -0
  618. cuda/cccl/headers/include/cuda/std/__cccl/cuda_capabilities.h +51 -0
  619. cuda/cccl/headers/include/cuda/std/__cccl/cuda_toolkit.h +56 -0
  620. cuda/cccl/headers/include/cuda/std/__cccl/deprecated.h +88 -0
  621. cuda/cccl/headers/include/cuda/std/__cccl/diagnostic.h +131 -0
  622. cuda/cccl/headers/include/cuda/std/__cccl/dialect.h +123 -0
  623. cuda/cccl/headers/include/cuda/std/__cccl/epilogue.h +344 -0
  624. cuda/cccl/headers/include/cuda/std/__cccl/exceptions.h +79 -0
  625. cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +68 -0
  626. cuda/cccl/headers/include/cuda/std/__cccl/extended_data_types.h +160 -0
  627. cuda/cccl/headers/include/cuda/std/__cccl/is_non_narrowing_convertible.h +73 -0
  628. cuda/cccl/headers/include/cuda/std/__cccl/os.h +54 -0
  629. cuda/cccl/headers/include/cuda/std/__cccl/preprocessor.h +1284 -0
  630. cuda/cccl/headers/include/cuda/std/__cccl/prologue.h +281 -0
  631. cuda/cccl/headers/include/cuda/std/__cccl/ptx_isa.h +253 -0
  632. cuda/cccl/headers/include/cuda/std/__cccl/rtti.h +72 -0
  633. cuda/cccl/headers/include/cuda/std/__cccl/sequence_access.h +87 -0
  634. cuda/cccl/headers/include/cuda/std/__cccl/system_header.h +38 -0
  635. cuda/cccl/headers/include/cuda/std/__cccl/unreachable.h +31 -0
  636. cuda/cccl/headers/include/cuda/std/__cccl/version.h +26 -0
  637. cuda/cccl/headers/include/cuda/std/__cccl/visibility.h +171 -0
  638. cuda/cccl/headers/include/cuda/std/__charconv/chars_format.h +81 -0
  639. cuda/cccl/headers/include/cuda/std/__charconv/from_chars.h +154 -0
  640. cuda/cccl/headers/include/cuda/std/__charconv/from_chars_result.h +56 -0
  641. cuda/cccl/headers/include/cuda/std/__charconv/to_chars.h +148 -0
  642. cuda/cccl/headers/include/cuda/std/__charconv/to_chars_result.h +56 -0
  643. cuda/cccl/headers/include/cuda/std/__charconv_ +31 -0
  644. cuda/cccl/headers/include/cuda/std/__chrono/calendar.h +54 -0
  645. cuda/cccl/headers/include/cuda/std/__chrono/day.h +162 -0
  646. cuda/cccl/headers/include/cuda/std/__chrono/duration.h +503 -0
  647. cuda/cccl/headers/include/cuda/std/__chrono/file_clock.h +55 -0
  648. cuda/cccl/headers/include/cuda/std/__chrono/high_resolution_clock.h +46 -0
  649. cuda/cccl/headers/include/cuda/std/__chrono/month.h +187 -0
  650. cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +60 -0
  651. cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +80 -0
  652. cuda/cccl/headers/include/cuda/std/__chrono/time_point.h +258 -0
  653. cuda/cccl/headers/include/cuda/std/__chrono/year.h +186 -0
  654. cuda/cccl/headers/include/cuda/std/__cmath/abs.h +127 -0
  655. cuda/cccl/headers/include/cuda/std/__cmath/copysign.h +88 -0
  656. cuda/cccl/headers/include/cuda/std/__cmath/error_functions.h +200 -0
  657. cuda/cccl/headers/include/cuda/std/__cmath/exponential_functions.h +784 -0
  658. cuda/cccl/headers/include/cuda/std/__cmath/fdim.h +118 -0
  659. cuda/cccl/headers/include/cuda/std/__cmath/fma.h +125 -0
  660. cuda/cccl/headers/include/cuda/std/__cmath/fpclassify.h +231 -0
  661. cuda/cccl/headers/include/cuda/std/__cmath/gamma.h +205 -0
  662. cuda/cccl/headers/include/cuda/std/__cmath/hyperbolic_functions.h +286 -0
  663. cuda/cccl/headers/include/cuda/std/__cmath/hypot.h +221 -0
  664. cuda/cccl/headers/include/cuda/std/__cmath/inverse_hyperbolic_functions.h +286 -0
  665. cuda/cccl/headers/include/cuda/std/__cmath/inverse_trigonometric_functions.h +371 -0
  666. cuda/cccl/headers/include/cuda/std/__cmath/isfinite.h +167 -0
  667. cuda/cccl/headers/include/cuda/std/__cmath/isinf.h +205 -0
  668. cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +180 -0
  669. cuda/cccl/headers/include/cuda/std/__cmath/isnormal.h +138 -0
  670. cuda/cccl/headers/include/cuda/std/__cmath/lerp.h +101 -0
  671. cuda/cccl/headers/include/cuda/std/__cmath/logarithms.h +534 -0
  672. cuda/cccl/headers/include/cuda/std/__cmath/min_max.h +260 -0
  673. cuda/cccl/headers/include/cuda/std/__cmath/modulo.h +208 -0
  674. cuda/cccl/headers/include/cuda/std/__cmath/nan.h +54 -0
  675. cuda/cccl/headers/include/cuda/std/__cmath/remainder.h +206 -0
  676. cuda/cccl/headers/include/cuda/std/__cmath/roots.h +199 -0
  677. cuda/cccl/headers/include/cuda/std/__cmath/rounding_functions.h +984 -0
  678. cuda/cccl/headers/include/cuda/std/__cmath/signbit.h +56 -0
  679. cuda/cccl/headers/include/cuda/std/__cmath/traits.h +238 -0
  680. cuda/cccl/headers/include/cuda/std/__cmath/trigonometric_functions.h +328 -0
  681. cuda/cccl/headers/include/cuda/std/__complex/arg.h +84 -0
  682. cuda/cccl/headers/include/cuda/std/__complex/complex.h +674 -0
  683. cuda/cccl/headers/include/cuda/std/__complex/exponential_functions.h +411 -0
  684. cuda/cccl/headers/include/cuda/std/__complex/hyperbolic_functions.h +117 -0
  685. cuda/cccl/headers/include/cuda/std/__complex/inverse_hyperbolic_functions.h +216 -0
  686. cuda/cccl/headers/include/cuda/std/__complex/inverse_trigonometric_functions.h +131 -0
  687. cuda/cccl/headers/include/cuda/std/__complex/literals.h +106 -0
  688. cuda/cccl/headers/include/cuda/std/__complex/logarithms.h +303 -0
  689. cuda/cccl/headers/include/cuda/std/__complex/math.h +159 -0
  690. cuda/cccl/headers/include/cuda/std/__complex/nvbf16.h +322 -0
  691. cuda/cccl/headers/include/cuda/std/__complex/nvfp16.h +321 -0
  692. cuda/cccl/headers/include/cuda/std/__complex/roots.h +214 -0
  693. cuda/cccl/headers/include/cuda/std/__complex/trigonometric_functions.h +61 -0
  694. cuda/cccl/headers/include/cuda/std/__complex/tuple.h +107 -0
  695. cuda/cccl/headers/include/cuda/std/__complex/vector_support.h +130 -0
  696. cuda/cccl/headers/include/cuda/std/__concepts/arithmetic.h +56 -0
  697. cuda/cccl/headers/include/cuda/std/__concepts/assignable.h +64 -0
  698. cuda/cccl/headers/include/cuda/std/__concepts/boolean_testable.h +63 -0
  699. cuda/cccl/headers/include/cuda/std/__concepts/class_or_enum.h +45 -0
  700. cuda/cccl/headers/include/cuda/std/__concepts/common_reference_with.h +69 -0
  701. cuda/cccl/headers/include/cuda/std/__concepts/common_with.h +82 -0
  702. cuda/cccl/headers/include/cuda/std/__concepts/concept_macros.h +341 -0
  703. cuda/cccl/headers/include/cuda/std/__concepts/constructible.h +174 -0
  704. cuda/cccl/headers/include/cuda/std/__concepts/convertible_to.h +70 -0
  705. cuda/cccl/headers/include/cuda/std/__concepts/copyable.h +60 -0
  706. cuda/cccl/headers/include/cuda/std/__concepts/derived_from.h +56 -0
  707. cuda/cccl/headers/include/cuda/std/__concepts/destructible.h +76 -0
  708. cuda/cccl/headers/include/cuda/std/__concepts/different_from.h +38 -0
  709. cuda/cccl/headers/include/cuda/std/__concepts/equality_comparable.h +100 -0
  710. cuda/cccl/headers/include/cuda/std/__concepts/invocable.h +80 -0
  711. cuda/cccl/headers/include/cuda/std/__concepts/movable.h +58 -0
  712. cuda/cccl/headers/include/cuda/std/__concepts/predicate.h +54 -0
  713. cuda/cccl/headers/include/cuda/std/__concepts/regular.h +54 -0
  714. cuda/cccl/headers/include/cuda/std/__concepts/relation.h +77 -0
  715. cuda/cccl/headers/include/cuda/std/__concepts/same_as.h +39 -0
  716. cuda/cccl/headers/include/cuda/std/__concepts/semiregular.h +54 -0
  717. cuda/cccl/headers/include/cuda/std/__concepts/swappable.h +206 -0
  718. cuda/cccl/headers/include/cuda/std/__concepts/totally_ordered.h +101 -0
  719. cuda/cccl/headers/include/cuda/std/__cstddef/byte.h +113 -0
  720. cuda/cccl/headers/include/cuda/std/__cstddef/types.h +52 -0
  721. cuda/cccl/headers/include/cuda/std/__cstdlib/abs.h +57 -0
  722. cuda/cccl/headers/include/cuda/std/__cstdlib/aligned_alloc.h +66 -0
  723. cuda/cccl/headers/include/cuda/std/__cstdlib/div.h +96 -0
  724. cuda/cccl/headers/include/cuda/std/__cstdlib/malloc.h +69 -0
  725. cuda/cccl/headers/include/cuda/std/__cstring/memcpy.h +61 -0
  726. cuda/cccl/headers/include/cuda/std/__cstring/memset.h +46 -0
  727. cuda/cccl/headers/include/cuda/std/__cuda/api_wrapper.h +62 -0
  728. cuda/cccl/headers/include/cuda/std/__cuda/ensure_current_device.h +72 -0
  729. cuda/cccl/headers/include/cuda/std/__exception/cuda_error.h +146 -0
  730. cuda/cccl/headers/include/cuda/std/__exception/terminate.h +73 -0
  731. cuda/cccl/headers/include/cuda/std/__execution/env.h +455 -0
  732. cuda/cccl/headers/include/cuda/std/__execution/policy.h +88 -0
  733. cuda/cccl/headers/include/cuda/std/__expected/bad_expected_access.h +127 -0
  734. cuda/cccl/headers/include/cuda/std/__expected/expected.h +1956 -0
  735. cuda/cccl/headers/include/cuda/std/__expected/expected_base.h +1050 -0
  736. cuda/cccl/headers/include/cuda/std/__expected/unexpect.h +37 -0
  737. cuda/cccl/headers/include/cuda/std/__expected/unexpected.h +172 -0
  738. cuda/cccl/headers/include/cuda/std/__floating_point/arithmetic.h +56 -0
  739. cuda/cccl/headers/include/cuda/std/__floating_point/cast.h +809 -0
  740. cuda/cccl/headers/include/cuda/std/__floating_point/cccl_fp.h +125 -0
  741. cuda/cccl/headers/include/cuda/std/__floating_point/common_type.h +48 -0
  742. cuda/cccl/headers/include/cuda/std/__floating_point/constants.h +376 -0
  743. cuda/cccl/headers/include/cuda/std/__floating_point/conversion_rank_order.h +124 -0
  744. cuda/cccl/headers/include/cuda/std/__floating_point/cuda_fp_types.h +113 -0
  745. cuda/cccl/headers/include/cuda/std/__floating_point/decompose.h +69 -0
  746. cuda/cccl/headers/include/cuda/std/__floating_point/format.h +162 -0
  747. cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +40 -0
  748. cuda/cccl/headers/include/cuda/std/__floating_point/mask.h +78 -0
  749. cuda/cccl/headers/include/cuda/std/__floating_point/native_type.h +81 -0
  750. cuda/cccl/headers/include/cuda/std/__floating_point/overflow_handler.h +139 -0
  751. cuda/cccl/headers/include/cuda/std/__floating_point/properties.h +229 -0
  752. cuda/cccl/headers/include/cuda/std/__floating_point/storage.h +248 -0
  753. cuda/cccl/headers/include/cuda/std/__floating_point/traits.h +172 -0
  754. cuda/cccl/headers/include/cuda/std/__format/buffer.h +48 -0
  755. cuda/cccl/headers/include/cuda/std/__format/concepts.h +69 -0
  756. cuda/cccl/headers/include/cuda/std/__format/format_arg.h +282 -0
  757. cuda/cccl/headers/include/cuda/std/__format/format_arg_store.h +279 -0
  758. cuda/cccl/headers/include/cuda/std/__format/format_args.h +122 -0
  759. cuda/cccl/headers/include/cuda/std/__format/format_context.h +92 -0
  760. cuda/cccl/headers/include/cuda/std/__format/format_error.h +76 -0
  761. cuda/cccl/headers/include/cuda/std/__format/format_integral.h +237 -0
  762. cuda/cccl/headers/include/cuda/std/__format/format_parse_context.h +124 -0
  763. cuda/cccl/headers/include/cuda/std/__format/format_spec_parser.h +1230 -0
  764. cuda/cccl/headers/include/cuda/std/__format/formatter.h +59 -0
  765. cuda/cccl/headers/include/cuda/std/__format/formatters/bool.h +101 -0
  766. cuda/cccl/headers/include/cuda/std/__format/formatters/char.h +124 -0
  767. cuda/cccl/headers/include/cuda/std/__format/formatters/fp.h +101 -0
  768. cuda/cccl/headers/include/cuda/std/__format/formatters/int.h +174 -0
  769. cuda/cccl/headers/include/cuda/std/__format/formatters/ptr.h +104 -0
  770. cuda/cccl/headers/include/cuda/std/__format/formatters/str.h +178 -0
  771. cuda/cccl/headers/include/cuda/std/__format/output_utils.h +272 -0
  772. cuda/cccl/headers/include/cuda/std/__format/parse_arg_id.h +138 -0
  773. cuda/cccl/headers/include/cuda/std/__format_ +45 -0
  774. cuda/cccl/headers/include/cuda/std/__functional/binary_function.h +63 -0
  775. cuda/cccl/headers/include/cuda/std/__functional/binary_negate.h +65 -0
  776. cuda/cccl/headers/include/cuda/std/__functional/bind.h +337 -0
  777. cuda/cccl/headers/include/cuda/std/__functional/bind_back.h +80 -0
  778. cuda/cccl/headers/include/cuda/std/__functional/bind_front.h +73 -0
  779. cuda/cccl/headers/include/cuda/std/__functional/binder1st.h +74 -0
  780. cuda/cccl/headers/include/cuda/std/__functional/binder2nd.h +74 -0
  781. cuda/cccl/headers/include/cuda/std/__functional/compose.h +68 -0
  782. cuda/cccl/headers/include/cuda/std/__functional/default_searcher.h +75 -0
  783. cuda/cccl/headers/include/cuda/std/__functional/function.h +1278 -0
  784. cuda/cccl/headers/include/cuda/std/__functional/hash.h +649 -0
  785. cuda/cccl/headers/include/cuda/std/__functional/identity.h +57 -0
  786. cuda/cccl/headers/include/cuda/std/__functional/invoke.h +560 -0
  787. cuda/cccl/headers/include/cuda/std/__functional/is_transparent.h +41 -0
  788. cuda/cccl/headers/include/cuda/std/__functional/mem_fn.h +67 -0
  789. cuda/cccl/headers/include/cuda/std/__functional/mem_fun_ref.h +211 -0
  790. cuda/cccl/headers/include/cuda/std/__functional/not_fn.h +120 -0
  791. cuda/cccl/headers/include/cuda/std/__functional/operations.h +534 -0
  792. cuda/cccl/headers/include/cuda/std/__functional/perfect_forward.h +128 -0
  793. cuda/cccl/headers/include/cuda/std/__functional/pointer_to_binary_function.h +64 -0
  794. cuda/cccl/headers/include/cuda/std/__functional/pointer_to_unary_function.h +63 -0
  795. cuda/cccl/headers/include/cuda/std/__functional/ranges_operations.h +113 -0
  796. cuda/cccl/headers/include/cuda/std/__functional/reference_wrapper.h +113 -0
  797. cuda/cccl/headers/include/cuda/std/__functional/unary_function.h +62 -0
  798. cuda/cccl/headers/include/cuda/std/__functional/unary_negate.h +65 -0
  799. cuda/cccl/headers/include/cuda/std/__functional/unwrap_ref.h +56 -0
  800. cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +268 -0
  801. cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +35 -0
  802. cuda/cccl/headers/include/cuda/std/__fwd/array.h +42 -0
  803. cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +49 -0
  804. cuda/cccl/headers/include/cuda/std/__fwd/complex.h +66 -0
  805. cuda/cccl/headers/include/cuda/std/__fwd/format.h +84 -0
  806. cuda/cccl/headers/include/cuda/std/__fwd/fp.h +37 -0
  807. cuda/cccl/headers/include/cuda/std/__fwd/get.h +123 -0
  808. cuda/cccl/headers/include/cuda/std/__fwd/hash.h +34 -0
  809. cuda/cccl/headers/include/cuda/std/__fwd/iterator.h +43 -0
  810. cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +90 -0
  811. cuda/cccl/headers/include/cuda/std/__fwd/memory_resource.h +37 -0
  812. cuda/cccl/headers/include/cuda/std/__fwd/optional.h +39 -0
  813. cuda/cccl/headers/include/cuda/std/__fwd/pair.h +34 -0
  814. cuda/cccl/headers/include/cuda/std/__fwd/reference_wrapper.h +34 -0
  815. cuda/cccl/headers/include/cuda/std/__fwd/span.h +45 -0
  816. cuda/cccl/headers/include/cuda/std/__fwd/string.h +83 -0
  817. cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +59 -0
  818. cuda/cccl/headers/include/cuda/std/__fwd/subrange.h +55 -0
  819. cuda/cccl/headers/include/cuda/std/__fwd/tuple.h +34 -0
  820. cuda/cccl/headers/include/cuda/std/__internal/cpp_dialect.h +44 -0
  821. cuda/cccl/headers/include/cuda/std/__internal/features.h +77 -0
  822. cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +122 -0
  823. cuda/cccl/headers/include/cuda/std/__iterator/access.h +128 -0
  824. cuda/cccl/headers/include/cuda/std/__iterator/advance.h +228 -0
  825. cuda/cccl/headers/include/cuda/std/__iterator/back_insert_iterator.h +163 -0
  826. cuda/cccl/headers/include/cuda/std/__iterator/bounded_iter.h +253 -0
  827. cuda/cccl/headers/include/cuda/std/__iterator/concepts.h +645 -0
  828. cuda/cccl/headers/include/cuda/std/__iterator/counted_iterator.h +464 -0
  829. cuda/cccl/headers/include/cuda/std/__iterator/data.h +61 -0
  830. cuda/cccl/headers/include/cuda/std/__iterator/default_sentinel.h +36 -0
  831. cuda/cccl/headers/include/cuda/std/__iterator/distance.h +126 -0
  832. cuda/cccl/headers/include/cuda/std/__iterator/empty.h +53 -0
  833. cuda/cccl/headers/include/cuda/std/__iterator/erase_if_container.h +53 -0
  834. cuda/cccl/headers/include/cuda/std/__iterator/front_insert_iterator.h +99 -0
  835. cuda/cccl/headers/include/cuda/std/__iterator/incrementable_traits.h +143 -0
  836. cuda/cccl/headers/include/cuda/std/__iterator/indirectly_comparable.h +55 -0
  837. cuda/cccl/headers/include/cuda/std/__iterator/insert_iterator.h +107 -0
  838. cuda/cccl/headers/include/cuda/std/__iterator/istream_iterator.h +146 -0
  839. cuda/cccl/headers/include/cuda/std/__iterator/istreambuf_iterator.h +161 -0
  840. cuda/cccl/headers/include/cuda/std/__iterator/iter_move.h +161 -0
  841. cuda/cccl/headers/include/cuda/std/__iterator/iter_swap.h +163 -0
  842. cuda/cccl/headers/include/cuda/std/__iterator/iterator.h +44 -0
  843. cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +847 -0
  844. cuda/cccl/headers/include/cuda/std/__iterator/mergeable.h +72 -0
  845. cuda/cccl/headers/include/cuda/std/__iterator/move_iterator.h +432 -0
  846. cuda/cccl/headers/include/cuda/std/__iterator/move_sentinel.h +73 -0
  847. cuda/cccl/headers/include/cuda/std/__iterator/next.h +101 -0
  848. cuda/cccl/headers/include/cuda/std/__iterator/ostream_iterator.h +95 -0
  849. cuda/cccl/headers/include/cuda/std/__iterator/ostreambuf_iterator.h +100 -0
  850. cuda/cccl/headers/include/cuda/std/__iterator/permutable.h +54 -0
  851. cuda/cccl/headers/include/cuda/std/__iterator/prev.h +90 -0
  852. cuda/cccl/headers/include/cuda/std/__iterator/projected.h +61 -0
  853. cuda/cccl/headers/include/cuda/std/__iterator/readable_traits.h +156 -0
  854. cuda/cccl/headers/include/cuda/std/__iterator/reverse_access.h +142 -0
  855. cuda/cccl/headers/include/cuda/std/__iterator/reverse_iterator.h +371 -0
  856. cuda/cccl/headers/include/cuda/std/__iterator/size.h +69 -0
  857. cuda/cccl/headers/include/cuda/std/__iterator/sortable.h +55 -0
  858. cuda/cccl/headers/include/cuda/std/__iterator/unreachable_sentinel.h +84 -0
  859. cuda/cccl/headers/include/cuda/std/__iterator/wrap_iter.h +245 -0
  860. cuda/cccl/headers/include/cuda/std/__latch/latch.h +88 -0
  861. cuda/cccl/headers/include/cuda/std/__limits/numeric_limits.h +617 -0
  862. cuda/cccl/headers/include/cuda/std/__limits/numeric_limits_ext.h +753 -0
  863. cuda/cccl/headers/include/cuda/std/__linalg/conj_if_needed.h +78 -0
  864. cuda/cccl/headers/include/cuda/std/__linalg/conjugate_transposed.h +54 -0
  865. cuda/cccl/headers/include/cuda/std/__linalg/conjugated.h +139 -0
  866. cuda/cccl/headers/include/cuda/std/__linalg/scaled.h +132 -0
  867. cuda/cccl/headers/include/cuda/std/__linalg/transposed.h +321 -0
  868. cuda/cccl/headers/include/cuda/std/__mdspan/aligned_accessor.h +97 -0
  869. cuda/cccl/headers/include/cuda/std/__mdspan/concepts.h +144 -0
  870. cuda/cccl/headers/include/cuda/std/__mdspan/default_accessor.h +73 -0
  871. cuda/cccl/headers/include/cuda/std/__mdspan/empty_base.h +352 -0
  872. cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +758 -0
  873. cuda/cccl/headers/include/cuda/std/__mdspan/layout_left.h +314 -0
  874. cuda/cccl/headers/include/cuda/std/__mdspan/layout_right.h +307 -0
  875. cuda/cccl/headers/include/cuda/std/__mdspan/layout_stride.h +605 -0
  876. cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +497 -0
  877. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_extents.h +193 -0
  878. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_helper.h +189 -0
  879. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_mapping.h +344 -0
  880. cuda/cccl/headers/include/cuda/std/__memory/addressof.h +67 -0
  881. cuda/cccl/headers/include/cuda/std/__memory/align.h +67 -0
  882. cuda/cccl/headers/include/cuda/std/__memory/allocate_at_least.h +81 -0
  883. cuda/cccl/headers/include/cuda/std/__memory/allocation_guard.h +100 -0
  884. cuda/cccl/headers/include/cuda/std/__memory/allocator.h +320 -0
  885. cuda/cccl/headers/include/cuda/std/__memory/allocator_arg_t.h +84 -0
  886. cuda/cccl/headers/include/cuda/std/__memory/allocator_destructor.h +59 -0
  887. cuda/cccl/headers/include/cuda/std/__memory/allocator_traits.h +532 -0
  888. cuda/cccl/headers/include/cuda/std/__memory/assume_aligned.h +60 -0
  889. cuda/cccl/headers/include/cuda/std/__memory/builtin_new_allocator.h +87 -0
  890. cuda/cccl/headers/include/cuda/std/__memory/compressed_pair.h +225 -0
  891. cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +248 -0
  892. cuda/cccl/headers/include/cuda/std/__memory/destruct_n.h +91 -0
  893. cuda/cccl/headers/include/cuda/std/__memory/is_sufficiently_aligned.h +46 -0
  894. cuda/cccl/headers/include/cuda/std/__memory/pointer_traits.h +246 -0
  895. cuda/cccl/headers/include/cuda/std/__memory/runtime_assume_aligned.h +62 -0
  896. cuda/cccl/headers/include/cuda/std/__memory/temporary_buffer.h +92 -0
  897. cuda/cccl/headers/include/cuda/std/__memory/uninitialized_algorithms.h +678 -0
  898. cuda/cccl/headers/include/cuda/std/__memory/unique_ptr.h +765 -0
  899. cuda/cccl/headers/include/cuda/std/__memory/uses_allocator.h +54 -0
  900. cuda/cccl/headers/include/cuda/std/__memory/voidify.h +41 -0
  901. cuda/cccl/headers/include/cuda/std/__memory_ +34 -0
  902. cuda/cccl/headers/include/cuda/std/__new/allocate.h +126 -0
  903. cuda/cccl/headers/include/cuda/std/__new/bad_alloc.h +57 -0
  904. cuda/cccl/headers/include/cuda/std/__new/launder.h +53 -0
  905. cuda/cccl/headers/include/cuda/std/__new_ +29 -0
  906. cuda/cccl/headers/include/cuda/std/__numeric/accumulate.h +56 -0
  907. cuda/cccl/headers/include/cuda/std/__numeric/adjacent_difference.h +72 -0
  908. cuda/cccl/headers/include/cuda/std/__numeric/exclusive_scan.h +66 -0
  909. cuda/cccl/headers/include/cuda/std/__numeric/gcd_lcm.h +78 -0
  910. cuda/cccl/headers/include/cuda/std/__numeric/inclusive_scan.h +73 -0
  911. cuda/cccl/headers/include/cuda/std/__numeric/inner_product.h +62 -0
  912. cuda/cccl/headers/include/cuda/std/__numeric/iota.h +42 -0
  913. cuda/cccl/headers/include/cuda/std/__numeric/midpoint.h +97 -0
  914. cuda/cccl/headers/include/cuda/std/__numeric/partial_sum.h +69 -0
  915. cuda/cccl/headers/include/cuda/std/__numeric/reduce.h +60 -0
  916. cuda/cccl/headers/include/cuda/std/__numeric/transform_exclusive_scan.h +51 -0
  917. cuda/cccl/headers/include/cuda/std/__numeric/transform_inclusive_scan.h +65 -0
  918. cuda/cccl/headers/include/cuda/std/__numeric/transform_reduce.h +72 -0
  919. cuda/cccl/headers/include/cuda/std/__optional/bad_optional_access.h +74 -0
  920. cuda/cccl/headers/include/cuda/std/__optional/hash.h +53 -0
  921. cuda/cccl/headers/include/cuda/std/__optional/make_optional.h +61 -0
  922. cuda/cccl/headers/include/cuda/std/__optional/nullopt.h +43 -0
  923. cuda/cccl/headers/include/cuda/std/__optional/optional.h +859 -0
  924. cuda/cccl/headers/include/cuda/std/__optional/optional_base.h +432 -0
  925. cuda/cccl/headers/include/cuda/std/__optional/optional_ref.h +324 -0
  926. cuda/cccl/headers/include/cuda/std/__random/generate_canonical.h +56 -0
  927. cuda/cccl/headers/include/cuda/std/__random/is_seed_sequence.h +39 -0
  928. cuda/cccl/headers/include/cuda/std/__random/is_valid.h +106 -0
  929. cuda/cccl/headers/include/cuda/std/__random/linear_congruential_engine.h +398 -0
  930. cuda/cccl/headers/include/cuda/std/__random/uniform_int_distribution.h +335 -0
  931. cuda/cccl/headers/include/cuda/std/__random/uniform_real_distribution.h +183 -0
  932. cuda/cccl/headers/include/cuda/std/__random_ +29 -0
  933. cuda/cccl/headers/include/cuda/std/__ranges/access.h +303 -0
  934. cuda/cccl/headers/include/cuda/std/__ranges/all.h +98 -0
  935. cuda/cccl/headers/include/cuda/std/__ranges/concepts.h +314 -0
  936. cuda/cccl/headers/include/cuda/std/__ranges/counted.h +90 -0
  937. cuda/cccl/headers/include/cuda/std/__ranges/dangling.h +54 -0
  938. cuda/cccl/headers/include/cuda/std/__ranges/data.h +136 -0
  939. cuda/cccl/headers/include/cuda/std/__ranges/empty.h +109 -0
  940. cuda/cccl/headers/include/cuda/std/__ranges/empty_view.h +77 -0
  941. cuda/cccl/headers/include/cuda/std/__ranges/enable_borrowed_range.h +41 -0
  942. cuda/cccl/headers/include/cuda/std/__ranges/enable_view.h +78 -0
  943. cuda/cccl/headers/include/cuda/std/__ranges/from_range.h +36 -0
  944. cuda/cccl/headers/include/cuda/std/__ranges/iota_view.h +266 -0
  945. cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +410 -0
  946. cuda/cccl/headers/include/cuda/std/__ranges/owning_view.h +161 -0
  947. cuda/cccl/headers/include/cuda/std/__ranges/range_adaptor.h +110 -0
  948. cuda/cccl/headers/include/cuda/std/__ranges/rbegin.h +175 -0
  949. cuda/cccl/headers/include/cuda/std/__ranges/ref_view.h +121 -0
  950. cuda/cccl/headers/include/cuda/std/__ranges/rend.h +182 -0
  951. cuda/cccl/headers/include/cuda/std/__ranges/repeat_view.h +345 -0
  952. cuda/cccl/headers/include/cuda/std/__ranges/single_view.h +155 -0
  953. cuda/cccl/headers/include/cuda/std/__ranges/size.h +201 -0
  954. cuda/cccl/headers/include/cuda/std/__ranges/subrange.h +513 -0
  955. cuda/cccl/headers/include/cuda/std/__ranges/take_view.h +476 -0
  956. cuda/cccl/headers/include/cuda/std/__ranges/take_while_view.h +259 -0
  957. cuda/cccl/headers/include/cuda/std/__ranges/transform_view.h +522 -0
  958. cuda/cccl/headers/include/cuda/std/__ranges/unwrap_end.h +53 -0
  959. cuda/cccl/headers/include/cuda/std/__ranges/view_interface.h +183 -0
  960. cuda/cccl/headers/include/cuda/std/__ranges/views.h +38 -0
  961. cuda/cccl/headers/include/cuda/std/__semaphore/atomic_semaphore.h +234 -0
  962. cuda/cccl/headers/include/cuda/std/__semaphore/counting_semaphore.h +51 -0
  963. cuda/cccl/headers/include/cuda/std/__string/char_traits.h +191 -0
  964. cuda/cccl/headers/include/cuda/std/__string/constexpr_c_functions.h +581 -0
  965. cuda/cccl/headers/include/cuda/std/__string/helper_functions.h +296 -0
  966. cuda/cccl/headers/include/cuda/std/__string/string_view.h +244 -0
  967. cuda/cccl/headers/include/cuda/std/__string_ +29 -0
  968. cuda/cccl/headers/include/cuda/std/__system_error/errc.h +51 -0
  969. cuda/cccl/headers/include/cuda/std/__system_error_ +26 -0
  970. cuda/cccl/headers/include/cuda/std/__thread/threading_support.h +106 -0
  971. cuda/cccl/headers/include/cuda/std/__thread/threading_support_cuda.h +47 -0
  972. cuda/cccl/headers/include/cuda/std/__thread/threading_support_external.h +41 -0
  973. cuda/cccl/headers/include/cuda/std/__thread/threading_support_pthread.h +143 -0
  974. cuda/cccl/headers/include/cuda/std/__thread/threading_support_win32.h +87 -0
  975. cuda/cccl/headers/include/cuda/std/__tuple_dir/ignore.h +51 -0
  976. cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +98 -0
  977. cuda/cccl/headers/include/cuda/std/__tuple_dir/sfinae_helpers.h +260 -0
  978. cuda/cccl/headers/include/cuda/std/__tuple_dir/structured_bindings.h +218 -0
  979. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_element.h +70 -0
  980. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_indices.h +44 -0
  981. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +80 -0
  982. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +64 -0
  983. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_size.h +79 -0
  984. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_types.h +35 -0
  985. cuda/cccl/headers/include/cuda/std/__tuple_dir/vector_types.h +290 -0
  986. cuda/cccl/headers/include/cuda/std/__type_traits/add_const.h +40 -0
  987. cuda/cccl/headers/include/cuda/std/__type_traits/add_cv.h +40 -0
  988. cuda/cccl/headers/include/cuda/std/__type_traits/add_lvalue_reference.h +62 -0
  989. cuda/cccl/headers/include/cuda/std/__type_traits/add_pointer.h +65 -0
  990. cuda/cccl/headers/include/cuda/std/__type_traits/add_rvalue_reference.h +62 -0
  991. cuda/cccl/headers/include/cuda/std/__type_traits/add_volatile.h +40 -0
  992. cuda/cccl/headers/include/cuda/std/__type_traits/aligned_storage.h +149 -0
  993. cuda/cccl/headers/include/cuda/std/__type_traits/aligned_union.h +62 -0
  994. cuda/cccl/headers/include/cuda/std/__type_traits/alignment_of.h +41 -0
  995. cuda/cccl/headers/include/cuda/std/__type_traits/always_false.h +35 -0
  996. cuda/cccl/headers/include/cuda/std/__type_traits/can_extract_key.h +68 -0
  997. cuda/cccl/headers/include/cuda/std/__type_traits/common_reference.h +262 -0
  998. cuda/cccl/headers/include/cuda/std/__type_traits/common_type.h +173 -0
  999. cuda/cccl/headers/include/cuda/std/__type_traits/conditional.h +65 -0
  1000. cuda/cccl/headers/include/cuda/std/__type_traits/conjunction.h +67 -0
  1001. cuda/cccl/headers/include/cuda/std/__type_traits/copy_cv.h +50 -0
  1002. cuda/cccl/headers/include/cuda/std/__type_traits/copy_cvref.h +148 -0
  1003. cuda/cccl/headers/include/cuda/std/__type_traits/decay.h +83 -0
  1004. cuda/cccl/headers/include/cuda/std/__type_traits/dependent_type.h +35 -0
  1005. cuda/cccl/headers/include/cuda/std/__type_traits/disjunction.h +77 -0
  1006. cuda/cccl/headers/include/cuda/std/__type_traits/enable_if.h +43 -0
  1007. cuda/cccl/headers/include/cuda/std/__type_traits/extent.h +68 -0
  1008. cuda/cccl/headers/include/cuda/std/__type_traits/fold.h +47 -0
  1009. cuda/cccl/headers/include/cuda/std/__type_traits/has_unique_object_representation.h +46 -0
  1010. cuda/cccl/headers/include/cuda/std/__type_traits/has_virtual_destructor.h +42 -0
  1011. cuda/cccl/headers/include/cuda/std/__type_traits/integral_constant.h +62 -0
  1012. cuda/cccl/headers/include/cuda/std/__type_traits/is_abstract.h +42 -0
  1013. cuda/cccl/headers/include/cuda/std/__type_traits/is_aggregate.h +42 -0
  1014. cuda/cccl/headers/include/cuda/std/__type_traits/is_allocator.h +46 -0
  1015. cuda/cccl/headers/include/cuda/std/__type_traits/is_arithmetic.h +42 -0
  1016. cuda/cccl/headers/include/cuda/std/__type_traits/is_array.h +62 -0
  1017. cuda/cccl/headers/include/cuda/std/__type_traits/is_assignable.h +78 -0
  1018. cuda/cccl/headers/include/cuda/std/__type_traits/is_base_of.h +42 -0
  1019. cuda/cccl/headers/include/cuda/std/__type_traits/is_bounded_array.h +44 -0
  1020. cuda/cccl/headers/include/cuda/std/__type_traits/is_callable.h +60 -0
  1021. cuda/cccl/headers/include/cuda/std/__type_traits/is_char_like_type.h +38 -0
  1022. cuda/cccl/headers/include/cuda/std/__type_traits/is_class.h +42 -0
  1023. cuda/cccl/headers/include/cuda/std/__type_traits/is_compound.h +58 -0
  1024. cuda/cccl/headers/include/cuda/std/__type_traits/is_const.h +56 -0
  1025. cuda/cccl/headers/include/cuda/std/__type_traits/is_constant_evaluated.h +51 -0
  1026. cuda/cccl/headers/include/cuda/std/__type_traits/is_constructible.h +174 -0
  1027. cuda/cccl/headers/include/cuda/std/__type_traits/is_convertible.h +211 -0
  1028. cuda/cccl/headers/include/cuda/std/__type_traits/is_copy_assignable.h +43 -0
  1029. cuda/cccl/headers/include/cuda/std/__type_traits/is_copy_constructible.h +43 -0
  1030. cuda/cccl/headers/include/cuda/std/__type_traits/is_core_convertible.h +47 -0
  1031. cuda/cccl/headers/include/cuda/std/__type_traits/is_corresponding_member.h +42 -0
  1032. cuda/cccl/headers/include/cuda/std/__type_traits/is_default_constructible.h +40 -0
  1033. cuda/cccl/headers/include/cuda/std/__type_traits/is_destructible.h +115 -0
  1034. cuda/cccl/headers/include/cuda/std/__type_traits/is_empty.h +42 -0
  1035. cuda/cccl/headers/include/cuda/std/__type_traits/is_enum.h +42 -0
  1036. cuda/cccl/headers/include/cuda/std/__type_traits/is_execution_policy.h +81 -0
  1037. cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_arithmetic.h +38 -0
  1038. cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_floating_point.h +79 -0
  1039. cuda/cccl/headers/include/cuda/std/__type_traits/is_final.h +42 -0
  1040. cuda/cccl/headers/include/cuda/std/__type_traits/is_floating_point.h +53 -0
  1041. cuda/cccl/headers/include/cuda/std/__type_traits/is_function.h +61 -0
  1042. cuda/cccl/headers/include/cuda/std/__type_traits/is_fundamental.h +56 -0
  1043. cuda/cccl/headers/include/cuda/std/__type_traits/is_implicitly_default_constructible.h +57 -0
  1044. cuda/cccl/headers/include/cuda/std/__type_traits/is_integer.h +45 -0
  1045. cuda/cccl/headers/include/cuda/std/__type_traits/is_integral.h +123 -0
  1046. cuda/cccl/headers/include/cuda/std/__type_traits/is_layout_compatible.h +45 -0
  1047. cuda/cccl/headers/include/cuda/std/__type_traits/is_literal_type.h +42 -0
  1048. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_function_pointer.h +79 -0
  1049. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_object_pointer.h +57 -0
  1050. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_pointer.h +57 -0
  1051. cuda/cccl/headers/include/cuda/std/__type_traits/is_move_assignable.h +43 -0
  1052. cuda/cccl/headers/include/cuda/std/__type_traits/is_move_constructible.h +42 -0
  1053. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_assignable.h +70 -0
  1054. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_constructible.h +84 -0
  1055. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_convertible.h +59 -0
  1056. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_copy_assignable.h +60 -0
  1057. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_copy_constructible.h +43 -0
  1058. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_default_constructible.h +54 -0
  1059. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_destructible.h +82 -0
  1060. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_move_assignable.h +60 -0
  1061. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_move_constructible.h +42 -0
  1062. cuda/cccl/headers/include/cuda/std/__type_traits/is_null_pointer.h +43 -0
  1063. cuda/cccl/headers/include/cuda/std/__type_traits/is_object.h +57 -0
  1064. cuda/cccl/headers/include/cuda/std/__type_traits/is_one_of.h +37 -0
  1065. cuda/cccl/headers/include/cuda/std/__type_traits/is_pod.h +42 -0
  1066. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer.h +60 -0
  1067. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer_interconvertible_base_of.h +84 -0
  1068. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer_interconvertible_with_class.h +42 -0
  1069. cuda/cccl/headers/include/cuda/std/__type_traits/is_polymorphic.h +42 -0
  1070. cuda/cccl/headers/include/cuda/std/__type_traits/is_primary_template.h +119 -0
  1071. cuda/cccl/headers/include/cuda/std/__type_traits/is_reference.h +95 -0
  1072. cuda/cccl/headers/include/cuda/std/__type_traits/is_reference_wrapper.h +50 -0
  1073. cuda/cccl/headers/include/cuda/std/__type_traits/is_referenceable.h +55 -0
  1074. cuda/cccl/headers/include/cuda/std/__type_traits/is_same.h +88 -0
  1075. cuda/cccl/headers/include/cuda/std/__type_traits/is_scalar.h +60 -0
  1076. cuda/cccl/headers/include/cuda/std/__type_traits/is_scoped_enum.h +49 -0
  1077. cuda/cccl/headers/include/cuda/std/__type_traits/is_signed.h +65 -0
  1078. cuda/cccl/headers/include/cuda/std/__type_traits/is_signed_integer.h +59 -0
  1079. cuda/cccl/headers/include/cuda/std/__type_traits/is_standard_layout.h +42 -0
  1080. cuda/cccl/headers/include/cuda/std/__type_traits/is_swappable.h +202 -0
  1081. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivial.h +42 -0
  1082. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_assignable.h +43 -0
  1083. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_constructible.h +43 -0
  1084. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_assignable.h +46 -0
  1085. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_constructible.h +45 -0
  1086. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copyable.h +42 -0
  1087. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_default_constructible.h +42 -0
  1088. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_destructible.h +58 -0
  1089. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_assignable.h +45 -0
  1090. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_constructible.h +44 -0
  1091. cuda/cccl/headers/include/cuda/std/__type_traits/is_unbounded_array.h +43 -0
  1092. cuda/cccl/headers/include/cuda/std/__type_traits/is_union.h +42 -0
  1093. cuda/cccl/headers/include/cuda/std/__type_traits/is_unsigned.h +66 -0
  1094. cuda/cccl/headers/include/cuda/std/__type_traits/is_unsigned_integer.h +59 -0
  1095. cuda/cccl/headers/include/cuda/std/__type_traits/is_valid_expansion.h +41 -0
  1096. cuda/cccl/headers/include/cuda/std/__type_traits/is_void.h +55 -0
  1097. cuda/cccl/headers/include/cuda/std/__type_traits/is_volatile.h +56 -0
  1098. cuda/cccl/headers/include/cuda/std/__type_traits/lazy.h +35 -0
  1099. cuda/cccl/headers/include/cuda/std/__type_traits/make_const_lvalue_ref.h +36 -0
  1100. cuda/cccl/headers/include/cuda/std/__type_traits/make_nbit_int.h +107 -0
  1101. cuda/cccl/headers/include/cuda/std/__type_traits/make_signed.h +140 -0
  1102. cuda/cccl/headers/include/cuda/std/__type_traits/make_unsigned.h +151 -0
  1103. cuda/cccl/headers/include/cuda/std/__type_traits/maybe_const.h +36 -0
  1104. cuda/cccl/headers/include/cuda/std/__type_traits/nat.h +39 -0
  1105. cuda/cccl/headers/include/cuda/std/__type_traits/negation.h +44 -0
  1106. cuda/cccl/headers/include/cuda/std/__type_traits/num_bits.h +122 -0
  1107. cuda/cccl/headers/include/cuda/std/__type_traits/promote.h +162 -0
  1108. cuda/cccl/headers/include/cuda/std/__type_traits/rank.h +60 -0
  1109. cuda/cccl/headers/include/cuda/std/__type_traits/reference_constructs_from_temporary.h +57 -0
  1110. cuda/cccl/headers/include/cuda/std/__type_traits/reference_converts_from_temporary.h +56 -0
  1111. cuda/cccl/headers/include/cuda/std/__type_traits/remove_all_extents.h +66 -0
  1112. cuda/cccl/headers/include/cuda/std/__type_traits/remove_const.h +59 -0
  1113. cuda/cccl/headers/include/cuda/std/__type_traits/remove_const_ref.h +37 -0
  1114. cuda/cccl/headers/include/cuda/std/__type_traits/remove_cv.h +57 -0
  1115. cuda/cccl/headers/include/cuda/std/__type_traits/remove_cvref.h +57 -0
  1116. cuda/cccl/headers/include/cuda/std/__type_traits/remove_extent.h +65 -0
  1117. cuda/cccl/headers/include/cuda/std/__type_traits/remove_pointer.h +73 -0
  1118. cuda/cccl/headers/include/cuda/std/__type_traits/remove_reference.h +72 -0
  1119. cuda/cccl/headers/include/cuda/std/__type_traits/remove_volatile.h +58 -0
  1120. cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +47 -0
  1121. cuda/cccl/headers/include/cuda/std/__type_traits/type_identity.h +40 -0
  1122. cuda/cccl/headers/include/cuda/std/__type_traits/type_list.h +1067 -0
  1123. cuda/cccl/headers/include/cuda/std/__type_traits/type_set.h +131 -0
  1124. cuda/cccl/headers/include/cuda/std/__type_traits/underlying_type.h +52 -0
  1125. cuda/cccl/headers/include/cuda/std/__type_traits/void_t.h +34 -0
  1126. cuda/cccl/headers/include/cuda/std/__utility/as_const.h +52 -0
  1127. cuda/cccl/headers/include/cuda/std/__utility/auto_cast.h +34 -0
  1128. cuda/cccl/headers/include/cuda/std/__utility/cmp.h +116 -0
  1129. cuda/cccl/headers/include/cuda/std/__utility/convert_to_integral.h +101 -0
  1130. cuda/cccl/headers/include/cuda/std/__utility/declval.h +76 -0
  1131. cuda/cccl/headers/include/cuda/std/__utility/exception_guard.h +161 -0
  1132. cuda/cccl/headers/include/cuda/std/__utility/exchange.h +46 -0
  1133. cuda/cccl/headers/include/cuda/std/__utility/forward.h +59 -0
  1134. cuda/cccl/headers/include/cuda/std/__utility/forward_like.h +55 -0
  1135. cuda/cccl/headers/include/cuda/std/__utility/in_place.h +106 -0
  1136. cuda/cccl/headers/include/cuda/std/__utility/integer_sequence.h +251 -0
  1137. cuda/cccl/headers/include/cuda/std/__utility/monostate.h +99 -0
  1138. cuda/cccl/headers/include/cuda/std/__utility/move.h +74 -0
  1139. cuda/cccl/headers/include/cuda/std/__utility/pair.h +796 -0
  1140. cuda/cccl/headers/include/cuda/std/__utility/piecewise_construct.h +37 -0
  1141. cuda/cccl/headers/include/cuda/std/__utility/pod_tuple.h +527 -0
  1142. cuda/cccl/headers/include/cuda/std/__utility/priority_tag.h +40 -0
  1143. cuda/cccl/headers/include/cuda/std/__utility/rel_ops.h +63 -0
  1144. cuda/cccl/headers/include/cuda/std/__utility/swap.h +64 -0
  1145. cuda/cccl/headers/include/cuda/std/__utility/to_underlying.h +40 -0
  1146. cuda/cccl/headers/include/cuda/std/__utility/typeid.h +421 -0
  1147. cuda/cccl/headers/include/cuda/std/__utility/undefined.h +34 -0
  1148. cuda/cccl/headers/include/cuda/std/__utility/unreachable.h +37 -0
  1149. cuda/cccl/headers/include/cuda/std/array +518 -0
  1150. cuda/cccl/headers/include/cuda/std/atomic +810 -0
  1151. cuda/cccl/headers/include/cuda/std/barrier +42 -0
  1152. cuda/cccl/headers/include/cuda/std/bit +35 -0
  1153. cuda/cccl/headers/include/cuda/std/bitset +994 -0
  1154. cuda/cccl/headers/include/cuda/std/cassert +28 -0
  1155. cuda/cccl/headers/include/cuda/std/ccomplex +15 -0
  1156. cuda/cccl/headers/include/cuda/std/cfloat +59 -0
  1157. cuda/cccl/headers/include/cuda/std/chrono +26 -0
  1158. cuda/cccl/headers/include/cuda/std/climits +61 -0
  1159. cuda/cccl/headers/include/cuda/std/cmath +87 -0
  1160. cuda/cccl/headers/include/cuda/std/complex +50 -0
  1161. cuda/cccl/headers/include/cuda/std/concepts +48 -0
  1162. cuda/cccl/headers/include/cuda/std/cstddef +28 -0
  1163. cuda/cccl/headers/include/cuda/std/cstdint +178 -0
  1164. cuda/cccl/headers/include/cuda/std/cstdlib +30 -0
  1165. cuda/cccl/headers/include/cuda/std/cstring +110 -0
  1166. cuda/cccl/headers/include/cuda/std/ctime +154 -0
  1167. cuda/cccl/headers/include/cuda/std/detail/__config +45 -0
  1168. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +204 -0
  1169. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/algorithm +1721 -0
  1170. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/chrono +2509 -0
  1171. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/iosfwd +128 -0
  1172. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/stdexcept +120 -0
  1173. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/tuple +1365 -0
  1174. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/variant +2142 -0
  1175. cuda/cccl/headers/include/cuda/std/execution +29 -0
  1176. cuda/cccl/headers/include/cuda/std/expected +30 -0
  1177. cuda/cccl/headers/include/cuda/std/functional +56 -0
  1178. cuda/cccl/headers/include/cuda/std/initializer_list +36 -0
  1179. cuda/cccl/headers/include/cuda/std/inplace_vector +2170 -0
  1180. cuda/cccl/headers/include/cuda/std/iterator +70 -0
  1181. cuda/cccl/headers/include/cuda/std/latch +34 -0
  1182. cuda/cccl/headers/include/cuda/std/limits +28 -0
  1183. cuda/cccl/headers/include/cuda/std/linalg +30 -0
  1184. cuda/cccl/headers/include/cuda/std/mdspan +38 -0
  1185. cuda/cccl/headers/include/cuda/std/memory +39 -0
  1186. cuda/cccl/headers/include/cuda/std/numbers +341 -0
  1187. cuda/cccl/headers/include/cuda/std/numeric +41 -0
  1188. cuda/cccl/headers/include/cuda/std/optional +31 -0
  1189. cuda/cccl/headers/include/cuda/std/ranges +69 -0
  1190. cuda/cccl/headers/include/cuda/std/ratio +416 -0
  1191. cuda/cccl/headers/include/cuda/std/semaphore +31 -0
  1192. cuda/cccl/headers/include/cuda/std/source_location +83 -0
  1193. cuda/cccl/headers/include/cuda/std/span +628 -0
  1194. cuda/cccl/headers/include/cuda/std/string_view +799 -0
  1195. cuda/cccl/headers/include/cuda/std/tuple +26 -0
  1196. cuda/cccl/headers/include/cuda/std/type_traits +177 -0
  1197. cuda/cccl/headers/include/cuda/std/utility +70 -0
  1198. cuda/cccl/headers/include/cuda/std/variant +25 -0
  1199. cuda/cccl/headers/include/cuda/std/version +243 -0
  1200. cuda/cccl/headers/include/cuda/stream +31 -0
  1201. cuda/cccl/headers/include/cuda/stream_ref +54 -0
  1202. cuda/cccl/headers/include/cuda/type_traits +27 -0
  1203. cuda/cccl/headers/include/cuda/utility +27 -0
  1204. cuda/cccl/headers/include/cuda/version +16 -0
  1205. cuda/cccl/headers/include/cuda/warp +28 -0
  1206. cuda/cccl/headers/include/cuda/work_stealing +26 -0
  1207. cuda/cccl/headers/include/nv/detail/__preprocessor +169 -0
  1208. cuda/cccl/headers/include/nv/detail/__target_macros +718 -0
  1209. cuda/cccl/headers/include/nv/target +235 -0
  1210. cuda/cccl/headers/include/thrust/addressof.h +22 -0
  1211. cuda/cccl/headers/include/thrust/adjacent_difference.h +254 -0
  1212. cuda/cccl/headers/include/thrust/advance.h +57 -0
  1213. cuda/cccl/headers/include/thrust/allocate_unique.h +299 -0
  1214. cuda/cccl/headers/include/thrust/binary_search.h +1910 -0
  1215. cuda/cccl/headers/include/thrust/complex.h +858 -0
  1216. cuda/cccl/headers/include/thrust/copy.h +506 -0
  1217. cuda/cccl/headers/include/thrust/count.h +245 -0
  1218. cuda/cccl/headers/include/thrust/detail/adjacent_difference.inl +95 -0
  1219. cuda/cccl/headers/include/thrust/detail/algorithm_wrapper.h +37 -0
  1220. cuda/cccl/headers/include/thrust/detail/alignment.h +81 -0
  1221. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +350 -0
  1222. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.inl +371 -0
  1223. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +45 -0
  1224. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.inl +242 -0
  1225. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +39 -0
  1226. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.inl +137 -0
  1227. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +39 -0
  1228. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.inl +99 -0
  1229. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +53 -0
  1230. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.inl +68 -0
  1231. cuda/cccl/headers/include/thrust/detail/allocator/no_throw_allocator.h +76 -0
  1232. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +102 -0
  1233. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.inl +86 -0
  1234. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +79 -0
  1235. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.inl +79 -0
  1236. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.h +39 -0
  1237. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.inl +98 -0
  1238. cuda/cccl/headers/include/thrust/detail/allocator_aware_execution_policy.h +99 -0
  1239. cuda/cccl/headers/include/thrust/detail/binary_search.inl +525 -0
  1240. cuda/cccl/headers/include/thrust/detail/caching_allocator.h +47 -0
  1241. cuda/cccl/headers/include/thrust/detail/complex/arithmetic.h +255 -0
  1242. cuda/cccl/headers/include/thrust/detail/complex/c99math.h +64 -0
  1243. cuda/cccl/headers/include/thrust/detail/complex/catrig.h +875 -0
  1244. cuda/cccl/headers/include/thrust/detail/complex/catrigf.h +589 -0
  1245. cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +233 -0
  1246. cuda/cccl/headers/include/thrust/detail/complex/ccoshf.h +161 -0
  1247. cuda/cccl/headers/include/thrust/detail/complex/cexp.h +195 -0
  1248. cuda/cccl/headers/include/thrust/detail/complex/cexpf.h +173 -0
  1249. cuda/cccl/headers/include/thrust/detail/complex/clog.h +223 -0
  1250. cuda/cccl/headers/include/thrust/detail/complex/clogf.h +210 -0
  1251. cuda/cccl/headers/include/thrust/detail/complex/complex.inl +263 -0
  1252. cuda/cccl/headers/include/thrust/detail/complex/cpow.h +50 -0
  1253. cuda/cccl/headers/include/thrust/detail/complex/cproj.h +81 -0
  1254. cuda/cccl/headers/include/thrust/detail/complex/csinh.h +228 -0
  1255. cuda/cccl/headers/include/thrust/detail/complex/csinhf.h +168 -0
  1256. cuda/cccl/headers/include/thrust/detail/complex/csqrt.h +178 -0
  1257. cuda/cccl/headers/include/thrust/detail/complex/csqrtf.h +174 -0
  1258. cuda/cccl/headers/include/thrust/detail/complex/ctanh.h +208 -0
  1259. cuda/cccl/headers/include/thrust/detail/complex/ctanhf.h +133 -0
  1260. cuda/cccl/headers/include/thrust/detail/complex/math_private.h +138 -0
  1261. cuda/cccl/headers/include/thrust/detail/complex/stream.h +73 -0
  1262. cuda/cccl/headers/include/thrust/detail/config/compiler.h +38 -0
  1263. cuda/cccl/headers/include/thrust/detail/config/config.h +43 -0
  1264. cuda/cccl/headers/include/thrust/detail/config/cpp_dialect.h +78 -0
  1265. cuda/cccl/headers/include/thrust/detail/config/device_system.h +55 -0
  1266. cuda/cccl/headers/include/thrust/detail/config/host_system.h +48 -0
  1267. cuda/cccl/headers/include/thrust/detail/config/memory_resource.h +41 -0
  1268. cuda/cccl/headers/include/thrust/detail/config/namespace.h +162 -0
  1269. cuda/cccl/headers/include/thrust/detail/config/simple_defines.h +48 -0
  1270. cuda/cccl/headers/include/thrust/detail/config.h +36 -0
  1271. cuda/cccl/headers/include/thrust/detail/contiguous_storage.h +228 -0
  1272. cuda/cccl/headers/include/thrust/detail/contiguous_storage.inl +273 -0
  1273. cuda/cccl/headers/include/thrust/detail/copy.h +72 -0
  1274. cuda/cccl/headers/include/thrust/detail/copy.inl +129 -0
  1275. cuda/cccl/headers/include/thrust/detail/copy_if.h +62 -0
  1276. cuda/cccl/headers/include/thrust/detail/copy_if.inl +102 -0
  1277. cuda/cccl/headers/include/thrust/detail/count.h +55 -0
  1278. cuda/cccl/headers/include/thrust/detail/count.inl +89 -0
  1279. cuda/cccl/headers/include/thrust/detail/device_delete.inl +52 -0
  1280. cuda/cccl/headers/include/thrust/detail/device_free.inl +47 -0
  1281. cuda/cccl/headers/include/thrust/detail/device_new.inl +61 -0
  1282. cuda/cccl/headers/include/thrust/detail/device_ptr.inl +48 -0
  1283. cuda/cccl/headers/include/thrust/detail/equal.inl +93 -0
  1284. cuda/cccl/headers/include/thrust/detail/event_error.h +160 -0
  1285. cuda/cccl/headers/include/thrust/detail/execute_with_allocator.h +80 -0
  1286. cuda/cccl/headers/include/thrust/detail/execute_with_allocator_fwd.h +61 -0
  1287. cuda/cccl/headers/include/thrust/detail/execution_policy.h +120 -0
  1288. cuda/cccl/headers/include/thrust/detail/extrema.inl +184 -0
  1289. cuda/cccl/headers/include/thrust/detail/fill.inl +86 -0
  1290. cuda/cccl/headers/include/thrust/detail/find.inl +113 -0
  1291. cuda/cccl/headers/include/thrust/detail/for_each.inl +84 -0
  1292. cuda/cccl/headers/include/thrust/detail/function.h +49 -0
  1293. cuda/cccl/headers/include/thrust/detail/functional/actor.h +214 -0
  1294. cuda/cccl/headers/include/thrust/detail/functional/operators.h +386 -0
  1295. cuda/cccl/headers/include/thrust/detail/gather.inl +173 -0
  1296. cuda/cccl/headers/include/thrust/detail/generate.inl +86 -0
  1297. cuda/cccl/headers/include/thrust/detail/get_iterator_value.h +62 -0
  1298. cuda/cccl/headers/include/thrust/detail/inner_product.inl +118 -0
  1299. cuda/cccl/headers/include/thrust/detail/integer_math.h +130 -0
  1300. cuda/cccl/headers/include/thrust/detail/internal_functional.h +328 -0
  1301. cuda/cccl/headers/include/thrust/detail/logical.inl +113 -0
  1302. cuda/cccl/headers/include/thrust/detail/malloc_and_free.h +77 -0
  1303. cuda/cccl/headers/include/thrust/detail/malloc_and_free_fwd.h +45 -0
  1304. cuda/cccl/headers/include/thrust/detail/memory_algorithms.h +209 -0
  1305. cuda/cccl/headers/include/thrust/detail/memory_wrapper.h +40 -0
  1306. cuda/cccl/headers/include/thrust/detail/merge.inl +276 -0
  1307. cuda/cccl/headers/include/thrust/detail/mismatch.inl +94 -0
  1308. cuda/cccl/headers/include/thrust/detail/numeric_wrapper.h +37 -0
  1309. cuda/cccl/headers/include/thrust/detail/overlapped_copy.h +124 -0
  1310. cuda/cccl/headers/include/thrust/detail/partition.inl +378 -0
  1311. cuda/cccl/headers/include/thrust/detail/pointer.h +309 -0
  1312. cuda/cccl/headers/include/thrust/detail/preprocessor.h +652 -0
  1313. cuda/cccl/headers/include/thrust/detail/random_bijection.h +177 -0
  1314. cuda/cccl/headers/include/thrust/detail/range/head_flags.h +116 -0
  1315. cuda/cccl/headers/include/thrust/detail/range/tail_flags.h +130 -0
  1316. cuda/cccl/headers/include/thrust/detail/raw_pointer_cast.h +52 -0
  1317. cuda/cccl/headers/include/thrust/detail/raw_reference_cast.h +192 -0
  1318. cuda/cccl/headers/include/thrust/detail/reduce.inl +377 -0
  1319. cuda/cccl/headers/include/thrust/detail/reference.h +494 -0
  1320. cuda/cccl/headers/include/thrust/detail/reference_forward_declaration.h +35 -0
  1321. cuda/cccl/headers/include/thrust/detail/remove.inl +213 -0
  1322. cuda/cccl/headers/include/thrust/detail/replace.inl +231 -0
  1323. cuda/cccl/headers/include/thrust/detail/reverse.inl +88 -0
  1324. cuda/cccl/headers/include/thrust/detail/scan.inl +518 -0
  1325. cuda/cccl/headers/include/thrust/detail/scatter.inl +157 -0
  1326. cuda/cccl/headers/include/thrust/detail/seq.h +66 -0
  1327. cuda/cccl/headers/include/thrust/detail/sequence.inl +109 -0
  1328. cuda/cccl/headers/include/thrust/detail/set_operations.inl +981 -0
  1329. cuda/cccl/headers/include/thrust/detail/shuffle.inl +86 -0
  1330. cuda/cccl/headers/include/thrust/detail/sort.inl +373 -0
  1331. cuda/cccl/headers/include/thrust/detail/static_assert.h +58 -0
  1332. cuda/cccl/headers/include/thrust/detail/static_map.h +167 -0
  1333. cuda/cccl/headers/include/thrust/detail/swap_ranges.inl +65 -0
  1334. cuda/cccl/headers/include/thrust/detail/tabulate.inl +62 -0
  1335. cuda/cccl/headers/include/thrust/detail/temporary_array.h +153 -0
  1336. cuda/cccl/headers/include/thrust/detail/temporary_array.inl +120 -0
  1337. cuda/cccl/headers/include/thrust/detail/temporary_buffer.h +81 -0
  1338. cuda/cccl/headers/include/thrust/detail/transform_reduce.inl +69 -0
  1339. cuda/cccl/headers/include/thrust/detail/transform_scan.inl +161 -0
  1340. cuda/cccl/headers/include/thrust/detail/trivial_sequence.h +130 -0
  1341. cuda/cccl/headers/include/thrust/detail/tuple_meta_transform.h +61 -0
  1342. cuda/cccl/headers/include/thrust/detail/type_deduction.h +62 -0
  1343. cuda/cccl/headers/include/thrust/detail/type_traits/has_member_function.h +47 -0
  1344. cuda/cccl/headers/include/thrust/detail/type_traits/has_nested_type.h +43 -0
  1345. cuda/cccl/headers/include/thrust/detail/type_traits/is_call_possible.h +167 -0
  1346. cuda/cccl/headers/include/thrust/detail/type_traits/is_commutative.h +69 -0
  1347. cuda/cccl/headers/include/thrust/detail/type_traits/is_metafunction_defined.h +39 -0
  1348. cuda/cccl/headers/include/thrust/detail/type_traits/is_thrust_pointer.h +59 -0
  1349. cuda/cccl/headers/include/thrust/detail/type_traits/iterator/is_output_iterator.h +46 -0
  1350. cuda/cccl/headers/include/thrust/detail/type_traits/minimum_type.h +89 -0
  1351. cuda/cccl/headers/include/thrust/detail/type_traits/pointer_traits.h +332 -0
  1352. cuda/cccl/headers/include/thrust/detail/type_traits.h +136 -0
  1353. cuda/cccl/headers/include/thrust/detail/uninitialized_copy.inl +90 -0
  1354. cuda/cccl/headers/include/thrust/detail/uninitialized_fill.inl +86 -0
  1355. cuda/cccl/headers/include/thrust/detail/unique.inl +373 -0
  1356. cuda/cccl/headers/include/thrust/detail/use_default.h +34 -0
  1357. cuda/cccl/headers/include/thrust/detail/vector_base.h +613 -0
  1358. cuda/cccl/headers/include/thrust/detail/vector_base.inl +1210 -0
  1359. cuda/cccl/headers/include/thrust/device_allocator.h +134 -0
  1360. cuda/cccl/headers/include/thrust/device_delete.h +59 -0
  1361. cuda/cccl/headers/include/thrust/device_free.h +72 -0
  1362. cuda/cccl/headers/include/thrust/device_make_unique.h +56 -0
  1363. cuda/cccl/headers/include/thrust/device_malloc.h +84 -0
  1364. cuda/cccl/headers/include/thrust/device_malloc_allocator.h +190 -0
  1365. cuda/cccl/headers/include/thrust/device_new.h +91 -0
  1366. cuda/cccl/headers/include/thrust/device_new_allocator.h +179 -0
  1367. cuda/cccl/headers/include/thrust/device_ptr.h +196 -0
  1368. cuda/cccl/headers/include/thrust/device_reference.h +983 -0
  1369. cuda/cccl/headers/include/thrust/device_vector.h +576 -0
  1370. cuda/cccl/headers/include/thrust/distance.h +43 -0
  1371. cuda/cccl/headers/include/thrust/equal.h +247 -0
  1372. cuda/cccl/headers/include/thrust/execution_policy.h +251 -0
  1373. cuda/cccl/headers/include/thrust/extrema.h +657 -0
  1374. cuda/cccl/headers/include/thrust/fill.h +200 -0
  1375. cuda/cccl/headers/include/thrust/find.h +382 -0
  1376. cuda/cccl/headers/include/thrust/for_each.h +261 -0
  1377. cuda/cccl/headers/include/thrust/functional.h +395 -0
  1378. cuda/cccl/headers/include/thrust/gather.h +464 -0
  1379. cuda/cccl/headers/include/thrust/generate.h +193 -0
  1380. cuda/cccl/headers/include/thrust/host_vector.h +576 -0
  1381. cuda/cccl/headers/include/thrust/inner_product.h +264 -0
  1382. cuda/cccl/headers/include/thrust/iterator/constant_iterator.h +221 -0
  1383. cuda/cccl/headers/include/thrust/iterator/counting_iterator.h +335 -0
  1384. cuda/cccl/headers/include/thrust/iterator/detail/any_assign.h +48 -0
  1385. cuda/cccl/headers/include/thrust/iterator/detail/any_system_tag.h +43 -0
  1386. cuda/cccl/headers/include/thrust/iterator/detail/device_system_tag.h +38 -0
  1387. cuda/cccl/headers/include/thrust/iterator/detail/host_system_tag.h +38 -0
  1388. cuda/cccl/headers/include/thrust/iterator/detail/iterator_adaptor_base.h +81 -0
  1389. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_system.h +60 -0
  1390. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_traversal.h +65 -0
  1391. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h +57 -0
  1392. cuda/cccl/headers/include/thrust/iterator/detail/iterator_facade_category.h +182 -0
  1393. cuda/cccl/headers/include/thrust/iterator/detail/minimum_system.h +58 -0
  1394. cuda/cccl/headers/include/thrust/iterator/detail/normal_iterator.h +69 -0
  1395. cuda/cccl/headers/include/thrust/iterator/detail/retag.h +104 -0
  1396. cuda/cccl/headers/include/thrust/iterator/detail/tagged_iterator.h +81 -0
  1397. cuda/cccl/headers/include/thrust/iterator/detail/tuple_of_iterator_references.h +174 -0
  1398. cuda/cccl/headers/include/thrust/iterator/discard_iterator.h +163 -0
  1399. cuda/cccl/headers/include/thrust/iterator/iterator_adaptor.h +251 -0
  1400. cuda/cccl/headers/include/thrust/iterator/iterator_categories.h +211 -0
  1401. cuda/cccl/headers/include/thrust/iterator/iterator_facade.h +659 -0
  1402. cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +323 -0
  1403. cuda/cccl/headers/include/thrust/iterator/iterator_traversal_tags.h +64 -0
  1404. cuda/cccl/headers/include/thrust/iterator/offset_iterator.h +194 -0
  1405. cuda/cccl/headers/include/thrust/iterator/permutation_iterator.h +204 -0
  1406. cuda/cccl/headers/include/thrust/iterator/retag.h +72 -0
  1407. cuda/cccl/headers/include/thrust/iterator/reverse_iterator.h +51 -0
  1408. cuda/cccl/headers/include/thrust/iterator/shuffle_iterator.h +185 -0
  1409. cuda/cccl/headers/include/thrust/iterator/strided_iterator.h +152 -0
  1410. cuda/cccl/headers/include/thrust/iterator/tabulate_output_iterator.h +149 -0
  1411. cuda/cccl/headers/include/thrust/iterator/transform_input_output_iterator.h +226 -0
  1412. cuda/cccl/headers/include/thrust/iterator/transform_iterator.h +351 -0
  1413. cuda/cccl/headers/include/thrust/iterator/transform_output_iterator.h +190 -0
  1414. cuda/cccl/headers/include/thrust/iterator/zip_iterator.h +359 -0
  1415. cuda/cccl/headers/include/thrust/logical.h +290 -0
  1416. cuda/cccl/headers/include/thrust/memory.h +299 -0
  1417. cuda/cccl/headers/include/thrust/merge.h +725 -0
  1418. cuda/cccl/headers/include/thrust/mismatch.h +261 -0
  1419. cuda/cccl/headers/include/thrust/mr/allocator.h +229 -0
  1420. cuda/cccl/headers/include/thrust/mr/device_memory_resource.h +41 -0
  1421. cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +526 -0
  1422. cuda/cccl/headers/include/thrust/mr/disjoint_sync_pool.h +118 -0
  1423. cuda/cccl/headers/include/thrust/mr/disjoint_tls_pool.h +67 -0
  1424. cuda/cccl/headers/include/thrust/mr/fancy_pointer_resource.h +67 -0
  1425. cuda/cccl/headers/include/thrust/mr/host_memory_resource.h +38 -0
  1426. cuda/cccl/headers/include/thrust/mr/memory_resource.h +217 -0
  1427. cuda/cccl/headers/include/thrust/mr/new.h +100 -0
  1428. cuda/cccl/headers/include/thrust/mr/polymorphic_adaptor.h +63 -0
  1429. cuda/cccl/headers/include/thrust/mr/pool.h +526 -0
  1430. cuda/cccl/headers/include/thrust/mr/pool_options.h +174 -0
  1431. cuda/cccl/headers/include/thrust/mr/sync_pool.h +114 -0
  1432. cuda/cccl/headers/include/thrust/mr/tls_pool.h +64 -0
  1433. cuda/cccl/headers/include/thrust/mr/universal_memory_resource.h +29 -0
  1434. cuda/cccl/headers/include/thrust/mr/validator.h +56 -0
  1435. cuda/cccl/headers/include/thrust/pair.h +99 -0
  1436. cuda/cccl/headers/include/thrust/partition.h +1391 -0
  1437. cuda/cccl/headers/include/thrust/per_device_resource.h +98 -0
  1438. cuda/cccl/headers/include/thrust/random/detail/discard_block_engine.inl +184 -0
  1439. cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine.inl +155 -0
  1440. cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine_discard.h +104 -0
  1441. cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine.inl +151 -0
  1442. cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h +53 -0
  1443. cuda/cccl/headers/include/thrust/random/detail/mod.h +101 -0
  1444. cuda/cccl/headers/include/thrust/random/detail/normal_distribution.inl +187 -0
  1445. cuda/cccl/headers/include/thrust/random/detail/normal_distribution_base.h +160 -0
  1446. cuda/cccl/headers/include/thrust/random/detail/random_core_access.h +63 -0
  1447. cuda/cccl/headers/include/thrust/random/detail/subtract_with_carry_engine.inl +201 -0
  1448. cuda/cccl/headers/include/thrust/random/detail/uniform_int_distribution.inl +198 -0
  1449. cuda/cccl/headers/include/thrust/random/detail/uniform_real_distribution.inl +200 -0
  1450. cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine.inl +183 -0
  1451. cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine_max.h +187 -0
  1452. cuda/cccl/headers/include/thrust/random/discard_block_engine.h +240 -0
  1453. cuda/cccl/headers/include/thrust/random/linear_congruential_engine.h +289 -0
  1454. cuda/cccl/headers/include/thrust/random/linear_feedback_shift_engine.h +217 -0
  1455. cuda/cccl/headers/include/thrust/random/normal_distribution.h +257 -0
  1456. cuda/cccl/headers/include/thrust/random/subtract_with_carry_engine.h +247 -0
  1457. cuda/cccl/headers/include/thrust/random/uniform_int_distribution.h +261 -0
  1458. cuda/cccl/headers/include/thrust/random/uniform_real_distribution.h +258 -0
  1459. cuda/cccl/headers/include/thrust/random/xor_combine_engine.h +255 -0
  1460. cuda/cccl/headers/include/thrust/random.h +120 -0
  1461. cuda/cccl/headers/include/thrust/reduce.h +1113 -0
  1462. cuda/cccl/headers/include/thrust/remove.h +768 -0
  1463. cuda/cccl/headers/include/thrust/replace.h +826 -0
  1464. cuda/cccl/headers/include/thrust/reverse.h +215 -0
  1465. cuda/cccl/headers/include/thrust/scan.h +1671 -0
  1466. cuda/cccl/headers/include/thrust/scatter.h +446 -0
  1467. cuda/cccl/headers/include/thrust/sequence.h +277 -0
  1468. cuda/cccl/headers/include/thrust/set_operations.h +3026 -0
  1469. cuda/cccl/headers/include/thrust/shuffle.h +182 -0
  1470. cuda/cccl/headers/include/thrust/sort.h +1320 -0
  1471. cuda/cccl/headers/include/thrust/swap.h +147 -0
  1472. cuda/cccl/headers/include/thrust/system/cpp/detail/adjacent_difference.h +30 -0
  1473. cuda/cccl/headers/include/thrust/system/cpp/detail/assign_value.h +30 -0
  1474. cuda/cccl/headers/include/thrust/system/cpp/detail/binary_search.h +32 -0
  1475. cuda/cccl/headers/include/thrust/system/cpp/detail/copy.h +30 -0
  1476. cuda/cccl/headers/include/thrust/system/cpp/detail/copy_if.h +30 -0
  1477. cuda/cccl/headers/include/thrust/system/cpp/detail/count.h +29 -0
  1478. cuda/cccl/headers/include/thrust/system/cpp/detail/equal.h +29 -0
  1479. cuda/cccl/headers/include/thrust/system/cpp/detail/execution_policy.h +109 -0
  1480. cuda/cccl/headers/include/thrust/system/cpp/detail/extrema.h +30 -0
  1481. cuda/cccl/headers/include/thrust/system/cpp/detail/fill.h +29 -0
  1482. cuda/cccl/headers/include/thrust/system/cpp/detail/find.h +30 -0
  1483. cuda/cccl/headers/include/thrust/system/cpp/detail/for_each.h +30 -0
  1484. cuda/cccl/headers/include/thrust/system/cpp/detail/gather.h +29 -0
  1485. cuda/cccl/headers/include/thrust/system/cpp/detail/generate.h +29 -0
  1486. cuda/cccl/headers/include/thrust/system/cpp/detail/get_value.h +30 -0
  1487. cuda/cccl/headers/include/thrust/system/cpp/detail/inner_product.h +29 -0
  1488. cuda/cccl/headers/include/thrust/system/cpp/detail/iter_swap.h +30 -0
  1489. cuda/cccl/headers/include/thrust/system/cpp/detail/logical.h +29 -0
  1490. cuda/cccl/headers/include/thrust/system/cpp/detail/malloc_and_free.h +30 -0
  1491. cuda/cccl/headers/include/thrust/system/cpp/detail/memory.inl +60 -0
  1492. cuda/cccl/headers/include/thrust/system/cpp/detail/merge.h +30 -0
  1493. cuda/cccl/headers/include/thrust/system/cpp/detail/mismatch.h +29 -0
  1494. cuda/cccl/headers/include/thrust/system/cpp/detail/partition.h +30 -0
  1495. cuda/cccl/headers/include/thrust/system/cpp/detail/per_device_resource.h +29 -0
  1496. cuda/cccl/headers/include/thrust/system/cpp/detail/reduce.h +30 -0
  1497. cuda/cccl/headers/include/thrust/system/cpp/detail/reduce_by_key.h +30 -0
  1498. cuda/cccl/headers/include/thrust/system/cpp/detail/remove.h +30 -0
  1499. cuda/cccl/headers/include/thrust/system/cpp/detail/replace.h +29 -0
  1500. cuda/cccl/headers/include/thrust/system/cpp/detail/reverse.h +29 -0
  1501. cuda/cccl/headers/include/thrust/system/cpp/detail/scan.h +30 -0
  1502. cuda/cccl/headers/include/thrust/system/cpp/detail/scan_by_key.h +30 -0
  1503. cuda/cccl/headers/include/thrust/system/cpp/detail/scatter.h +29 -0
  1504. cuda/cccl/headers/include/thrust/system/cpp/detail/sequence.h +29 -0
  1505. cuda/cccl/headers/include/thrust/system/cpp/detail/set_operations.h +30 -0
  1506. cuda/cccl/headers/include/thrust/system/cpp/detail/sort.h +30 -0
  1507. cuda/cccl/headers/include/thrust/system/cpp/detail/swap_ranges.h +29 -0
  1508. cuda/cccl/headers/include/thrust/system/cpp/detail/tabulate.h +29 -0
  1509. cuda/cccl/headers/include/thrust/system/cpp/detail/temporary_buffer.h +29 -0
  1510. cuda/cccl/headers/include/thrust/system/cpp/detail/transform.h +29 -0
  1511. cuda/cccl/headers/include/thrust/system/cpp/detail/transform_reduce.h +29 -0
  1512. cuda/cccl/headers/include/thrust/system/cpp/detail/transform_scan.h +29 -0
  1513. cuda/cccl/headers/include/thrust/system/cpp/detail/uninitialized_copy.h +29 -0
  1514. cuda/cccl/headers/include/thrust/system/cpp/detail/uninitialized_fill.h +29 -0
  1515. cuda/cccl/headers/include/thrust/system/cpp/detail/unique.h +30 -0
  1516. cuda/cccl/headers/include/thrust/system/cpp/detail/unique_by_key.h +30 -0
  1517. cuda/cccl/headers/include/thrust/system/cpp/execution_policy.h +63 -0
  1518. cuda/cccl/headers/include/thrust/system/cpp/memory.h +106 -0
  1519. cuda/cccl/headers/include/thrust/system/cpp/memory_resource.h +72 -0
  1520. cuda/cccl/headers/include/thrust/system/cpp/pointer.h +120 -0
  1521. cuda/cccl/headers/include/thrust/system/cpp/vector.h +96 -0
  1522. cuda/cccl/headers/include/thrust/system/cuda/config.h +126 -0
  1523. cuda/cccl/headers/include/thrust/system/cuda/detail/adjacent_difference.h +219 -0
  1524. cuda/cccl/headers/include/thrust/system/cuda/detail/assign_value.h +124 -0
  1525. cuda/cccl/headers/include/thrust/system/cuda/detail/binary_search.h +29 -0
  1526. cuda/cccl/headers/include/thrust/system/cuda/detail/cdp_dispatch.h +72 -0
  1527. cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +240 -0
  1528. cuda/cccl/headers/include/thrust/system/cuda/detail/copy_if.h +255 -0
  1529. cuda/cccl/headers/include/thrust/system/cuda/detail/core/agent_launcher.h +289 -0
  1530. cuda/cccl/headers/include/thrust/system/cuda/detail/core/triple_chevron_launch.h +191 -0
  1531. cuda/cccl/headers/include/thrust/system/cuda/detail/core/util.h +593 -0
  1532. cuda/cccl/headers/include/thrust/system/cuda/detail/count.h +75 -0
  1533. cuda/cccl/headers/include/thrust/system/cuda/detail/cross_system.h +243 -0
  1534. cuda/cccl/headers/include/thrust/system/cuda/detail/dispatch.h +210 -0
  1535. cuda/cccl/headers/include/thrust/system/cuda/detail/equal.h +64 -0
  1536. cuda/cccl/headers/include/thrust/system/cuda/detail/error.inl +96 -0
  1537. cuda/cccl/headers/include/thrust/system/cuda/detail/execution_policy.h +264 -0
  1538. cuda/cccl/headers/include/thrust/system/cuda/detail/extrema.h +476 -0
  1539. cuda/cccl/headers/include/thrust/system/cuda/detail/fill.h +100 -0
  1540. cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +272 -0
  1541. cuda/cccl/headers/include/thrust/system/cuda/detail/for_each.h +83 -0
  1542. cuda/cccl/headers/include/thrust/system/cuda/detail/gather.h +91 -0
  1543. cuda/cccl/headers/include/thrust/system/cuda/detail/generate.h +60 -0
  1544. cuda/cccl/headers/include/thrust/system/cuda/detail/get_value.h +65 -0
  1545. cuda/cccl/headers/include/thrust/system/cuda/detail/inner_product.h +75 -0
  1546. cuda/cccl/headers/include/thrust/system/cuda/detail/iter_swap.h +80 -0
  1547. cuda/cccl/headers/include/thrust/system/cuda/detail/logical.h +29 -0
  1548. cuda/cccl/headers/include/thrust/system/cuda/detail/make_unsigned_special.h +61 -0
  1549. cuda/cccl/headers/include/thrust/system/cuda/detail/malloc_and_free.h +121 -0
  1550. cuda/cccl/headers/include/thrust/system/cuda/detail/memory.inl +57 -0
  1551. cuda/cccl/headers/include/thrust/system/cuda/detail/merge.h +228 -0
  1552. cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +217 -0
  1553. cuda/cccl/headers/include/thrust/system/cuda/detail/parallel_for.h +81 -0
  1554. cuda/cccl/headers/include/thrust/system/cuda/detail/partition.h +405 -0
  1555. cuda/cccl/headers/include/thrust/system/cuda/detail/per_device_resource.h +72 -0
  1556. cuda/cccl/headers/include/thrust/system/cuda/detail/reduce.h +785 -0
  1557. cuda/cccl/headers/include/thrust/system/cuda/detail/reduce_by_key.h +1001 -0
  1558. cuda/cccl/headers/include/thrust/system/cuda/detail/remove.h +107 -0
  1559. cuda/cccl/headers/include/thrust/system/cuda/detail/replace.h +122 -0
  1560. cuda/cccl/headers/include/thrust/system/cuda/detail/reverse.h +87 -0
  1561. cuda/cccl/headers/include/thrust/system/cuda/detail/scan.h +342 -0
  1562. cuda/cccl/headers/include/thrust/system/cuda/detail/scan_by_key.h +414 -0
  1563. cuda/cccl/headers/include/thrust/system/cuda/detail/scatter.h +91 -0
  1564. cuda/cccl/headers/include/thrust/system/cuda/detail/sequence.h +29 -0
  1565. cuda/cccl/headers/include/thrust/system/cuda/detail/set_operations.h +1734 -0
  1566. cuda/cccl/headers/include/thrust/system/cuda/detail/sort.h +470 -0
  1567. cuda/cccl/headers/include/thrust/system/cuda/detail/swap_ranges.h +98 -0
  1568. cuda/cccl/headers/include/thrust/system/cuda/detail/tabulate.h +75 -0
  1569. cuda/cccl/headers/include/thrust/system/cuda/detail/temporary_buffer.h +132 -0
  1570. cuda/cccl/headers/include/thrust/system/cuda/detail/terminate.h +53 -0
  1571. cuda/cccl/headers/include/thrust/system/cuda/detail/transform.h +429 -0
  1572. cuda/cccl/headers/include/thrust/system/cuda/detail/transform_reduce.h +143 -0
  1573. cuda/cccl/headers/include/thrust/system/cuda/detail/transform_scan.h +119 -0
  1574. cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_copy.h +117 -0
  1575. cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_fill.h +105 -0
  1576. cuda/cccl/headers/include/thrust/system/cuda/detail/unique.h +289 -0
  1577. cuda/cccl/headers/include/thrust/system/cuda/detail/unique_by_key.h +310 -0
  1578. cuda/cccl/headers/include/thrust/system/cuda/detail/util.h +253 -0
  1579. cuda/cccl/headers/include/thrust/system/cuda/error.h +168 -0
  1580. cuda/cccl/headers/include/thrust/system/cuda/execution_policy.h +15 -0
  1581. cuda/cccl/headers/include/thrust/system/cuda/memory.h +122 -0
  1582. cuda/cccl/headers/include/thrust/system/cuda/memory_resource.h +122 -0
  1583. cuda/cccl/headers/include/thrust/system/cuda/pointer.h +160 -0
  1584. cuda/cccl/headers/include/thrust/system/cuda/vector.h +108 -0
  1585. cuda/cccl/headers/include/thrust/system/detail/adl/adjacent_difference.h +51 -0
  1586. cuda/cccl/headers/include/thrust/system/detail/adl/assign_value.h +51 -0
  1587. cuda/cccl/headers/include/thrust/system/detail/adl/binary_search.h +51 -0
  1588. cuda/cccl/headers/include/thrust/system/detail/adl/copy.h +51 -0
  1589. cuda/cccl/headers/include/thrust/system/detail/adl/copy_if.h +52 -0
  1590. cuda/cccl/headers/include/thrust/system/detail/adl/count.h +51 -0
  1591. cuda/cccl/headers/include/thrust/system/detail/adl/equal.h +51 -0
  1592. cuda/cccl/headers/include/thrust/system/detail/adl/extrema.h +51 -0
  1593. cuda/cccl/headers/include/thrust/system/detail/adl/fill.h +51 -0
  1594. cuda/cccl/headers/include/thrust/system/detail/adl/find.h +51 -0
  1595. cuda/cccl/headers/include/thrust/system/detail/adl/for_each.h +51 -0
  1596. cuda/cccl/headers/include/thrust/system/detail/adl/gather.h +51 -0
  1597. cuda/cccl/headers/include/thrust/system/detail/adl/generate.h +51 -0
  1598. cuda/cccl/headers/include/thrust/system/detail/adl/get_value.h +51 -0
  1599. cuda/cccl/headers/include/thrust/system/detail/adl/inner_product.h +51 -0
  1600. cuda/cccl/headers/include/thrust/system/detail/adl/iter_swap.h +51 -0
  1601. cuda/cccl/headers/include/thrust/system/detail/adl/logical.h +51 -0
  1602. cuda/cccl/headers/include/thrust/system/detail/adl/malloc_and_free.h +51 -0
  1603. cuda/cccl/headers/include/thrust/system/detail/adl/merge.h +51 -0
  1604. cuda/cccl/headers/include/thrust/system/detail/adl/mismatch.h +51 -0
  1605. cuda/cccl/headers/include/thrust/system/detail/adl/partition.h +51 -0
  1606. cuda/cccl/headers/include/thrust/system/detail/adl/per_device_resource.h +51 -0
  1607. cuda/cccl/headers/include/thrust/system/detail/adl/reduce.h +51 -0
  1608. cuda/cccl/headers/include/thrust/system/detail/adl/reduce_by_key.h +51 -0
  1609. cuda/cccl/headers/include/thrust/system/detail/adl/remove.h +51 -0
  1610. cuda/cccl/headers/include/thrust/system/detail/adl/replace.h +51 -0
  1611. cuda/cccl/headers/include/thrust/system/detail/adl/reverse.h +51 -0
  1612. cuda/cccl/headers/include/thrust/system/detail/adl/scan.h +51 -0
  1613. cuda/cccl/headers/include/thrust/system/detail/adl/scan_by_key.h +51 -0
  1614. cuda/cccl/headers/include/thrust/system/detail/adl/scatter.h +51 -0
  1615. cuda/cccl/headers/include/thrust/system/detail/adl/sequence.h +51 -0
  1616. cuda/cccl/headers/include/thrust/system/detail/adl/set_operations.h +51 -0
  1617. cuda/cccl/headers/include/thrust/system/detail/adl/sort.h +51 -0
  1618. cuda/cccl/headers/include/thrust/system/detail/adl/swap_ranges.h +51 -0
  1619. cuda/cccl/headers/include/thrust/system/detail/adl/tabulate.h +51 -0
  1620. cuda/cccl/headers/include/thrust/system/detail/adl/temporary_buffer.h +51 -0
  1621. cuda/cccl/headers/include/thrust/system/detail/adl/transform.h +51 -0
  1622. cuda/cccl/headers/include/thrust/system/detail/adl/transform_reduce.h +51 -0
  1623. cuda/cccl/headers/include/thrust/system/detail/adl/transform_scan.h +51 -0
  1624. cuda/cccl/headers/include/thrust/system/detail/adl/uninitialized_copy.h +51 -0
  1625. cuda/cccl/headers/include/thrust/system/detail/adl/uninitialized_fill.h +51 -0
  1626. cuda/cccl/headers/include/thrust/system/detail/adl/unique.h +51 -0
  1627. cuda/cccl/headers/include/thrust/system/detail/adl/unique_by_key.h +51 -0
  1628. cuda/cccl/headers/include/thrust/system/detail/bad_alloc.h +61 -0
  1629. cuda/cccl/headers/include/thrust/system/detail/errno.h +120 -0
  1630. cuda/cccl/headers/include/thrust/system/detail/error_category.inl +302 -0
  1631. cuda/cccl/headers/include/thrust/system/detail/error_code.inl +173 -0
  1632. cuda/cccl/headers/include/thrust/system/detail/error_condition.inl +121 -0
  1633. cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.h +53 -0
  1634. cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.inl +79 -0
  1635. cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.h +161 -0
  1636. cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.inl +384 -0
  1637. cuda/cccl/headers/include/thrust/system/detail/generic/copy.h +45 -0
  1638. cuda/cccl/headers/include/thrust/system/detail/generic/copy.inl +64 -0
  1639. cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.h +58 -0
  1640. cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.inl +146 -0
  1641. cuda/cccl/headers/include/thrust/system/detail/generic/count.h +48 -0
  1642. cuda/cccl/headers/include/thrust/system/detail/generic/count.inl +84 -0
  1643. cuda/cccl/headers/include/thrust/system/detail/generic/equal.h +49 -0
  1644. cuda/cccl/headers/include/thrust/system/detail/generic/equal.inl +60 -0
  1645. cuda/cccl/headers/include/thrust/system/detail/generic/extrema.h +66 -0
  1646. cuda/cccl/headers/include/thrust/system/detail/generic/extrema.inl +252 -0
  1647. cuda/cccl/headers/include/thrust/system/detail/generic/fill.h +54 -0
  1648. cuda/cccl/headers/include/thrust/system/detail/generic/find.h +49 -0
  1649. cuda/cccl/headers/include/thrust/system/detail/generic/find.inl +137 -0
  1650. cuda/cccl/headers/include/thrust/system/detail/generic/for_each.h +58 -0
  1651. cuda/cccl/headers/include/thrust/system/detail/generic/gather.h +73 -0
  1652. cuda/cccl/headers/include/thrust/system/detail/generic/gather.inl +96 -0
  1653. cuda/cccl/headers/include/thrust/system/detail/generic/generate.h +45 -0
  1654. cuda/cccl/headers/include/thrust/system/detail/generic/generate.inl +63 -0
  1655. cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.h +60 -0
  1656. cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.inl +72 -0
  1657. cuda/cccl/headers/include/thrust/system/detail/generic/logical.h +59 -0
  1658. cuda/cccl/headers/include/thrust/system/detail/generic/memory.h +64 -0
  1659. cuda/cccl/headers/include/thrust/system/detail/generic/memory.inl +86 -0
  1660. cuda/cccl/headers/include/thrust/system/detail/generic/merge.h +99 -0
  1661. cuda/cccl/headers/include/thrust/system/detail/generic/merge.inl +148 -0
  1662. cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.h +49 -0
  1663. cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.inl +68 -0
  1664. cuda/cccl/headers/include/thrust/system/detail/generic/partition.h +129 -0
  1665. cuda/cccl/headers/include/thrust/system/detail/generic/partition.inl +207 -0
  1666. cuda/cccl/headers/include/thrust/system/detail/generic/per_device_resource.h +43 -0
  1667. cuda/cccl/headers/include/thrust/system/detail/generic/reduce.h +71 -0
  1668. cuda/cccl/headers/include/thrust/system/detail/generic/reduce.inl +100 -0
  1669. cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.h +83 -0
  1670. cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.inl +186 -0
  1671. cuda/cccl/headers/include/thrust/system/detail/generic/remove.h +86 -0
  1672. cuda/cccl/headers/include/thrust/system/detail/generic/remove.inl +121 -0
  1673. cuda/cccl/headers/include/thrust/system/detail/generic/replace.h +95 -0
  1674. cuda/cccl/headers/include/thrust/system/detail/generic/replace.inl +175 -0
  1675. cuda/cccl/headers/include/thrust/system/detail/generic/reverse.h +48 -0
  1676. cuda/cccl/headers/include/thrust/system/detail/generic/reverse.inl +67 -0
  1677. cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.h +63 -0
  1678. cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.inl +126 -0
  1679. cuda/cccl/headers/include/thrust/system/detail/generic/scan.h +72 -0
  1680. cuda/cccl/headers/include/thrust/system/detail/generic/scan.inl +85 -0
  1681. cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.h +126 -0
  1682. cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.inl +232 -0
  1683. cuda/cccl/headers/include/thrust/system/detail/generic/scatter.h +73 -0
  1684. cuda/cccl/headers/include/thrust/system/detail/generic/scatter.inl +85 -0
  1685. cuda/cccl/headers/include/thrust/system/detail/generic/select_system.h +104 -0
  1686. cuda/cccl/headers/include/thrust/system/detail/generic/sequence.h +70 -0
  1687. cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.h +282 -0
  1688. cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.inl +476 -0
  1689. cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.h +54 -0
  1690. cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.inl +125 -0
  1691. cuda/cccl/headers/include/thrust/system/detail/generic/sort.h +113 -0
  1692. cuda/cccl/headers/include/thrust/system/detail/generic/sort.inl +175 -0
  1693. cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.h +44 -0
  1694. cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.inl +76 -0
  1695. cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.h +41 -0
  1696. cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.inl +54 -0
  1697. cuda/cccl/headers/include/thrust/system/detail/generic/tag.h +47 -0
  1698. cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.h +54 -0
  1699. cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.inl +82 -0
  1700. cuda/cccl/headers/include/thrust/system/detail/generic/transform.h +395 -0
  1701. cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.h +50 -0
  1702. cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.inl +56 -0
  1703. cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.h +80 -0
  1704. cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.inl +113 -0
  1705. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.h +45 -0
  1706. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.inl +166 -0
  1707. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.h +45 -0
  1708. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.inl +115 -0
  1709. cuda/cccl/headers/include/thrust/system/detail/generic/unique.h +71 -0
  1710. cuda/cccl/headers/include/thrust/system/detail/generic/unique.inl +113 -0
  1711. cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.h +81 -0
  1712. cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.inl +126 -0
  1713. cuda/cccl/headers/include/thrust/system/detail/internal/decompose.h +117 -0
  1714. cuda/cccl/headers/include/thrust/system/detail/sequential/adjacent_difference.h +70 -0
  1715. cuda/cccl/headers/include/thrust/system/detail/sequential/assign_value.h +42 -0
  1716. cuda/cccl/headers/include/thrust/system/detail/sequential/binary_search.h +136 -0
  1717. cuda/cccl/headers/include/thrust/system/detail/sequential/copy.h +49 -0
  1718. cuda/cccl/headers/include/thrust/system/detail/sequential/copy.inl +119 -0
  1719. cuda/cccl/headers/include/thrust/system/detail/sequential/copy_backward.h +49 -0
  1720. cuda/cccl/headers/include/thrust/system/detail/sequential/copy_if.h +71 -0
  1721. cuda/cccl/headers/include/thrust/system/detail/sequential/count.h +29 -0
  1722. cuda/cccl/headers/include/thrust/system/detail/sequential/equal.h +29 -0
  1723. cuda/cccl/headers/include/thrust/system/detail/sequential/execution_policy.h +52 -0
  1724. cuda/cccl/headers/include/thrust/system/detail/sequential/extrema.h +110 -0
  1725. cuda/cccl/headers/include/thrust/system/detail/sequential/fill.h +29 -0
  1726. cuda/cccl/headers/include/thrust/system/detail/sequential/find.h +62 -0
  1727. cuda/cccl/headers/include/thrust/system/detail/sequential/for_each.h +74 -0
  1728. cuda/cccl/headers/include/thrust/system/detail/sequential/gather.h +29 -0
  1729. cuda/cccl/headers/include/thrust/system/detail/sequential/general_copy.h +123 -0
  1730. cuda/cccl/headers/include/thrust/system/detail/sequential/generate.h +29 -0
  1731. cuda/cccl/headers/include/thrust/system/detail/sequential/get_value.h +43 -0
  1732. cuda/cccl/headers/include/thrust/system/detail/sequential/inner_product.h +29 -0
  1733. cuda/cccl/headers/include/thrust/system/detail/sequential/insertion_sort.h +141 -0
  1734. cuda/cccl/headers/include/thrust/system/detail/sequential/iter_swap.h +45 -0
  1735. cuda/cccl/headers/include/thrust/system/detail/sequential/logical.h +29 -0
  1736. cuda/cccl/headers/include/thrust/system/detail/sequential/malloc_and_free.h +50 -0
  1737. cuda/cccl/headers/include/thrust/system/detail/sequential/merge.h +75 -0
  1738. cuda/cccl/headers/include/thrust/system/detail/sequential/merge.inl +145 -0
  1739. cuda/cccl/headers/include/thrust/system/detail/sequential/mismatch.h +29 -0
  1740. cuda/cccl/headers/include/thrust/system/detail/sequential/partition.h +301 -0
  1741. cuda/cccl/headers/include/thrust/system/detail/sequential/per_device_resource.h +29 -0
  1742. cuda/cccl/headers/include/thrust/system/detail/sequential/reduce.h +64 -0
  1743. cuda/cccl/headers/include/thrust/system/detail/sequential/reduce_by_key.h +98 -0
  1744. cuda/cccl/headers/include/thrust/system/detail/sequential/remove.h +179 -0
  1745. cuda/cccl/headers/include/thrust/system/detail/sequential/replace.h +29 -0
  1746. cuda/cccl/headers/include/thrust/system/detail/sequential/reverse.h +29 -0
  1747. cuda/cccl/headers/include/thrust/system/detail/sequential/scan.h +154 -0
  1748. cuda/cccl/headers/include/thrust/system/detail/sequential/scan_by_key.h +145 -0
  1749. cuda/cccl/headers/include/thrust/system/detail/sequential/scatter.h +29 -0
  1750. cuda/cccl/headers/include/thrust/system/detail/sequential/sequence.h +29 -0
  1751. cuda/cccl/headers/include/thrust/system/detail/sequential/set_operations.h +206 -0
  1752. cuda/cccl/headers/include/thrust/system/detail/sequential/sort.h +59 -0
  1753. cuda/cccl/headers/include/thrust/system/detail/sequential/sort.inl +116 -0
  1754. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.h +55 -0
  1755. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.inl +356 -0
  1756. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.h +48 -0
  1757. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.inl +124 -0
  1758. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.h +48 -0
  1759. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.inl +586 -0
  1760. cuda/cccl/headers/include/thrust/system/detail/sequential/swap_ranges.h +29 -0
  1761. cuda/cccl/headers/include/thrust/system/detail/sequential/tabulate.h +29 -0
  1762. cuda/cccl/headers/include/thrust/system/detail/sequential/temporary_buffer.h +29 -0
  1763. cuda/cccl/headers/include/thrust/system/detail/sequential/transform.h +29 -0
  1764. cuda/cccl/headers/include/thrust/system/detail/sequential/transform_reduce.h +29 -0
  1765. cuda/cccl/headers/include/thrust/system/detail/sequential/transform_scan.h +29 -0
  1766. cuda/cccl/headers/include/thrust/system/detail/sequential/trivial_copy.h +58 -0
  1767. cuda/cccl/headers/include/thrust/system/detail/sequential/uninitialized_copy.h +29 -0
  1768. cuda/cccl/headers/include/thrust/system/detail/sequential/uninitialized_fill.h +29 -0
  1769. cuda/cccl/headers/include/thrust/system/detail/sequential/unique.h +115 -0
  1770. cuda/cccl/headers/include/thrust/system/detail/sequential/unique_by_key.h +106 -0
  1771. cuda/cccl/headers/include/thrust/system/detail/system_error.inl +108 -0
  1772. cuda/cccl/headers/include/thrust/system/error_code.h +512 -0
  1773. cuda/cccl/headers/include/thrust/system/omp/detail/adjacent_difference.h +54 -0
  1774. cuda/cccl/headers/include/thrust/system/omp/detail/assign_value.h +30 -0
  1775. cuda/cccl/headers/include/thrust/system/omp/detail/binary_search.h +77 -0
  1776. cuda/cccl/headers/include/thrust/system/omp/detail/copy.h +50 -0
  1777. cuda/cccl/headers/include/thrust/system/omp/detail/copy.inl +74 -0
  1778. cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.h +56 -0
  1779. cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.inl +59 -0
  1780. cuda/cccl/headers/include/thrust/system/omp/detail/count.h +30 -0
  1781. cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.h +50 -0
  1782. cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.inl +65 -0
  1783. cuda/cccl/headers/include/thrust/system/omp/detail/equal.h +30 -0
  1784. cuda/cccl/headers/include/thrust/system/omp/detail/execution_policy.h +127 -0
  1785. cuda/cccl/headers/include/thrust/system/omp/detail/extrema.h +66 -0
  1786. cuda/cccl/headers/include/thrust/system/omp/detail/fill.h +30 -0
  1787. cuda/cccl/headers/include/thrust/system/omp/detail/find.h +53 -0
  1788. cuda/cccl/headers/include/thrust/system/omp/detail/for_each.h +56 -0
  1789. cuda/cccl/headers/include/thrust/system/omp/detail/for_each.inl +87 -0
  1790. cuda/cccl/headers/include/thrust/system/omp/detail/gather.h +30 -0
  1791. cuda/cccl/headers/include/thrust/system/omp/detail/generate.h +30 -0
  1792. cuda/cccl/headers/include/thrust/system/omp/detail/get_value.h +30 -0
  1793. cuda/cccl/headers/include/thrust/system/omp/detail/inner_product.h +30 -0
  1794. cuda/cccl/headers/include/thrust/system/omp/detail/iter_swap.h +30 -0
  1795. cuda/cccl/headers/include/thrust/system/omp/detail/logical.h +30 -0
  1796. cuda/cccl/headers/include/thrust/system/omp/detail/malloc_and_free.h +30 -0
  1797. cuda/cccl/headers/include/thrust/system/omp/detail/memory.inl +93 -0
  1798. cuda/cccl/headers/include/thrust/system/omp/detail/merge.h +30 -0
  1799. cuda/cccl/headers/include/thrust/system/omp/detail/mismatch.h +30 -0
  1800. cuda/cccl/headers/include/thrust/system/omp/detail/partition.h +88 -0
  1801. cuda/cccl/headers/include/thrust/system/omp/detail/partition.inl +102 -0
  1802. cuda/cccl/headers/include/thrust/system/omp/detail/per_device_resource.h +29 -0
  1803. cuda/cccl/headers/include/thrust/system/omp/detail/pragma_omp.h +54 -0
  1804. cuda/cccl/headers/include/thrust/system/omp/detail/reduce.h +54 -0
  1805. cuda/cccl/headers/include/thrust/system/omp/detail/reduce.inl +78 -0
  1806. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.h +64 -0
  1807. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.inl +65 -0
  1808. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.h +59 -0
  1809. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.inl +103 -0
  1810. cuda/cccl/headers/include/thrust/system/omp/detail/remove.h +72 -0
  1811. cuda/cccl/headers/include/thrust/system/omp/detail/remove.inl +87 -0
  1812. cuda/cccl/headers/include/thrust/system/omp/detail/replace.h +30 -0
  1813. cuda/cccl/headers/include/thrust/system/omp/detail/reverse.h +30 -0
  1814. cuda/cccl/headers/include/thrust/system/omp/detail/scan.h +30 -0
  1815. cuda/cccl/headers/include/thrust/system/omp/detail/scan_by_key.h +30 -0
  1816. cuda/cccl/headers/include/thrust/system/omp/detail/scatter.h +30 -0
  1817. cuda/cccl/headers/include/thrust/system/omp/detail/sequence.h +30 -0
  1818. cuda/cccl/headers/include/thrust/system/omp/detail/set_operations.h +30 -0
  1819. cuda/cccl/headers/include/thrust/system/omp/detail/sort.h +60 -0
  1820. cuda/cccl/headers/include/thrust/system/omp/detail/sort.inl +265 -0
  1821. cuda/cccl/headers/include/thrust/system/omp/detail/swap_ranges.h +30 -0
  1822. cuda/cccl/headers/include/thrust/system/omp/detail/tabulate.h +30 -0
  1823. cuda/cccl/headers/include/thrust/system/omp/detail/temporary_buffer.h +29 -0
  1824. cuda/cccl/headers/include/thrust/system/omp/detail/transform.h +30 -0
  1825. cuda/cccl/headers/include/thrust/system/omp/detail/transform_reduce.h +30 -0
  1826. cuda/cccl/headers/include/thrust/system/omp/detail/transform_scan.h +30 -0
  1827. cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_copy.h +30 -0
  1828. cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_fill.h +30 -0
  1829. cuda/cccl/headers/include/thrust/system/omp/detail/unique.h +60 -0
  1830. cuda/cccl/headers/include/thrust/system/omp/detail/unique.inl +71 -0
  1831. cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.h +67 -0
  1832. cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.inl +75 -0
  1833. cuda/cccl/headers/include/thrust/system/omp/execution_policy.h +62 -0
  1834. cuda/cccl/headers/include/thrust/system/omp/memory.h +111 -0
  1835. cuda/cccl/headers/include/thrust/system/omp/memory_resource.h +75 -0
  1836. cuda/cccl/headers/include/thrust/system/omp/pointer.h +124 -0
  1837. cuda/cccl/headers/include/thrust/system/omp/vector.h +99 -0
  1838. cuda/cccl/headers/include/thrust/system/system_error.h +185 -0
  1839. cuda/cccl/headers/include/thrust/system/tbb/detail/adjacent_difference.h +54 -0
  1840. cuda/cccl/headers/include/thrust/system/tbb/detail/assign_value.h +30 -0
  1841. cuda/cccl/headers/include/thrust/system/tbb/detail/binary_search.h +30 -0
  1842. cuda/cccl/headers/include/thrust/system/tbb/detail/copy.h +50 -0
  1843. cuda/cccl/headers/include/thrust/system/tbb/detail/copy.inl +73 -0
  1844. cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.h +47 -0
  1845. cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.inl +136 -0
  1846. cuda/cccl/headers/include/thrust/system/tbb/detail/count.h +30 -0
  1847. cuda/cccl/headers/include/thrust/system/tbb/detail/equal.h +30 -0
  1848. cuda/cccl/headers/include/thrust/system/tbb/detail/execution_policy.h +109 -0
  1849. cuda/cccl/headers/include/thrust/system/tbb/detail/extrema.h +66 -0
  1850. cuda/cccl/headers/include/thrust/system/tbb/detail/fill.h +30 -0
  1851. cuda/cccl/headers/include/thrust/system/tbb/detail/find.h +49 -0
  1852. cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.h +51 -0
  1853. cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.inl +91 -0
  1854. cuda/cccl/headers/include/thrust/system/tbb/detail/gather.h +30 -0
  1855. cuda/cccl/headers/include/thrust/system/tbb/detail/generate.h +30 -0
  1856. cuda/cccl/headers/include/thrust/system/tbb/detail/get_value.h +30 -0
  1857. cuda/cccl/headers/include/thrust/system/tbb/detail/inner_product.h +30 -0
  1858. cuda/cccl/headers/include/thrust/system/tbb/detail/iter_swap.h +30 -0
  1859. cuda/cccl/headers/include/thrust/system/tbb/detail/logical.h +30 -0
  1860. cuda/cccl/headers/include/thrust/system/tbb/detail/malloc_and_free.h +30 -0
  1861. cuda/cccl/headers/include/thrust/system/tbb/detail/memory.inl +94 -0
  1862. cuda/cccl/headers/include/thrust/system/tbb/detail/merge.h +77 -0
  1863. cuda/cccl/headers/include/thrust/system/tbb/detail/merge.inl +327 -0
  1864. cuda/cccl/headers/include/thrust/system/tbb/detail/mismatch.h +30 -0
  1865. cuda/cccl/headers/include/thrust/system/tbb/detail/partition.h +84 -0
  1866. cuda/cccl/headers/include/thrust/system/tbb/detail/partition.inl +98 -0
  1867. cuda/cccl/headers/include/thrust/system/tbb/detail/per_device_resource.h +29 -0
  1868. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.h +54 -0
  1869. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.inl +137 -0
  1870. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.h +61 -0
  1871. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.inl +400 -0
  1872. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_intervals.h +140 -0
  1873. cuda/cccl/headers/include/thrust/system/tbb/detail/remove.h +76 -0
  1874. cuda/cccl/headers/include/thrust/system/tbb/detail/remove.inl +87 -0
  1875. cuda/cccl/headers/include/thrust/system/tbb/detail/replace.h +30 -0
  1876. cuda/cccl/headers/include/thrust/system/tbb/detail/reverse.h +30 -0
  1877. cuda/cccl/headers/include/thrust/system/tbb/detail/scan.h +59 -0
  1878. cuda/cccl/headers/include/thrust/system/tbb/detail/scan.inl +312 -0
  1879. cuda/cccl/headers/include/thrust/system/tbb/detail/scan_by_key.h +30 -0
  1880. cuda/cccl/headers/include/thrust/system/tbb/detail/scatter.h +30 -0
  1881. cuda/cccl/headers/include/thrust/system/tbb/detail/sequence.h +30 -0
  1882. cuda/cccl/headers/include/thrust/system/tbb/detail/set_operations.h +30 -0
  1883. cuda/cccl/headers/include/thrust/system/tbb/detail/sort.h +60 -0
  1884. cuda/cccl/headers/include/thrust/system/tbb/detail/sort.inl +295 -0
  1885. cuda/cccl/headers/include/thrust/system/tbb/detail/swap_ranges.h +30 -0
  1886. cuda/cccl/headers/include/thrust/system/tbb/detail/tabulate.h +30 -0
  1887. cuda/cccl/headers/include/thrust/system/tbb/detail/temporary_buffer.h +29 -0
  1888. cuda/cccl/headers/include/thrust/system/tbb/detail/transform.h +30 -0
  1889. cuda/cccl/headers/include/thrust/system/tbb/detail/transform_reduce.h +30 -0
  1890. cuda/cccl/headers/include/thrust/system/tbb/detail/transform_scan.h +30 -0
  1891. cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_copy.h +30 -0
  1892. cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_fill.h +30 -0
  1893. cuda/cccl/headers/include/thrust/system/tbb/detail/unique.h +60 -0
  1894. cuda/cccl/headers/include/thrust/system/tbb/detail/unique.inl +71 -0
  1895. cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.h +67 -0
  1896. cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.inl +75 -0
  1897. cuda/cccl/headers/include/thrust/system/tbb/execution_policy.h +62 -0
  1898. cuda/cccl/headers/include/thrust/system/tbb/memory.h +111 -0
  1899. cuda/cccl/headers/include/thrust/system/tbb/memory_resource.h +75 -0
  1900. cuda/cccl/headers/include/thrust/system/tbb/pointer.h +124 -0
  1901. cuda/cccl/headers/include/thrust/system/tbb/vector.h +99 -0
  1902. cuda/cccl/headers/include/thrust/system_error.h +57 -0
  1903. cuda/cccl/headers/include/thrust/tabulate.h +125 -0
  1904. cuda/cccl/headers/include/thrust/transform.h +1045 -0
  1905. cuda/cccl/headers/include/thrust/transform_reduce.h +190 -0
  1906. cuda/cccl/headers/include/thrust/transform_scan.h +442 -0
  1907. cuda/cccl/headers/include/thrust/tuple.h +139 -0
  1908. cuda/cccl/headers/include/thrust/type_traits/integer_sequence.h +261 -0
  1909. cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +154 -0
  1910. cuda/cccl/headers/include/thrust/type_traits/is_execution_policy.h +65 -0
  1911. cuda/cccl/headers/include/thrust/type_traits/is_operator_less_or_greater_function_object.h +184 -0
  1912. cuda/cccl/headers/include/thrust/type_traits/is_operator_plus_function_object.h +116 -0
  1913. cuda/cccl/headers/include/thrust/type_traits/is_trivially_relocatable.h +336 -0
  1914. cuda/cccl/headers/include/thrust/type_traits/logical_metafunctions.h +42 -0
  1915. cuda/cccl/headers/include/thrust/type_traits/unwrap_contiguous_iterator.h +96 -0
  1916. cuda/cccl/headers/include/thrust/uninitialized_copy.h +300 -0
  1917. cuda/cccl/headers/include/thrust/uninitialized_fill.h +268 -0
  1918. cuda/cccl/headers/include/thrust/unique.h +1088 -0
  1919. cuda/cccl/headers/include/thrust/universal_allocator.h +93 -0
  1920. cuda/cccl/headers/include/thrust/universal_ptr.h +34 -0
  1921. cuda/cccl/headers/include/thrust/universal_vector.h +71 -0
  1922. cuda/cccl/headers/include/thrust/version.h +93 -0
  1923. cuda/cccl/headers/include/thrust/zip_function.h +176 -0
  1924. cuda/cccl/headers/include_paths.py +51 -0
  1925. cuda/cccl/parallel/__init__.py +9 -0
  1926. cuda/cccl/parallel/experimental/.gitignore +4 -0
  1927. cuda/cccl/parallel/experimental/__init__.py +73 -0
  1928. cuda/cccl/parallel/experimental/_bindings.py +79 -0
  1929. cuda/cccl/parallel/experimental/_bindings.pyi +405 -0
  1930. cuda/cccl/parallel/experimental/_bindings_impl.pyx +1984 -0
  1931. cuda/cccl/parallel/experimental/_caching.py +71 -0
  1932. cuda/cccl/parallel/experimental/_cccl_interop.py +422 -0
  1933. cuda/cccl/parallel/experimental/_utils/__init__.py +0 -0
  1934. cuda/cccl/parallel/experimental/_utils/protocols.py +132 -0
  1935. cuda/cccl/parallel/experimental/_utils/temp_storage_buffer.py +86 -0
  1936. cuda/cccl/parallel/experimental/algorithms/__init__.py +50 -0
  1937. cuda/cccl/parallel/experimental/algorithms/_histogram.py +243 -0
  1938. cuda/cccl/parallel/experimental/algorithms/_merge_sort.py +225 -0
  1939. cuda/cccl/parallel/experimental/algorithms/_radix_sort.py +312 -0
  1940. cuda/cccl/parallel/experimental/algorithms/_reduce.py +184 -0
  1941. cuda/cccl/parallel/experimental/algorithms/_scan.py +261 -0
  1942. cuda/cccl/parallel/experimental/algorithms/_segmented_reduce.py +257 -0
  1943. cuda/cccl/parallel/experimental/algorithms/_transform.py +308 -0
  1944. cuda/cccl/parallel/experimental/algorithms/_unique_by_key.py +252 -0
  1945. cuda/cccl/parallel/experimental/cccl/.gitkeep +0 -0
  1946. cuda/cccl/parallel/experimental/cu12/_bindings_impl.cpython-313-aarch64-linux-gnu.so +0 -0
  1947. cuda/cccl/parallel/experimental/cu12/cccl/libcccl.c.parallel.so +0 -0
  1948. cuda/cccl/parallel/experimental/cu13/_bindings_impl.cpython-313-aarch64-linux-gnu.so +0 -0
  1949. cuda/cccl/parallel/experimental/cu13/cccl/libcccl.c.parallel.so +0 -0
  1950. cuda/cccl/parallel/experimental/iterators/__init__.py +19 -0
  1951. cuda/cccl/parallel/experimental/iterators/_factories.py +191 -0
  1952. cuda/cccl/parallel/experimental/iterators/_iterators.py +612 -0
  1953. cuda/cccl/parallel/experimental/iterators/_zip_iterator.py +199 -0
  1954. cuda/cccl/parallel/experimental/numba_utils.py +53 -0
  1955. cuda/cccl/parallel/experimental/op.py +3 -0
  1956. cuda/cccl/parallel/experimental/struct.py +272 -0
  1957. cuda/cccl/parallel/experimental/typing.py +35 -0
  1958. cuda/cccl/py.typed +0 -0
  1959. cuda_cccl-0.1.3.2.0.dev438.dist-info/METADATA +42 -0
  1960. cuda_cccl-0.1.3.2.0.dev438.dist-info/RECORD +1962 -0
  1961. cuda_cccl-0.1.3.2.0.dev438.dist-info/WHEEL +5 -0
  1962. cuda_cccl-0.1.3.2.0.dev438.dist-info/licenses/LICENSE +1 -0
@@ -0,0 +1,2584 @@
1
+ /******************************************************************************
2
+ * Copyright (c) 2011, Duane Merrill. All rights reserved.
3
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ * * Redistributions of source code must retain the above copyright
8
+ * notice, this list of conditions and the following disclaimer.
9
+ * * Redistributions in binary form must reproduce the above copyright
10
+ * notice, this list of conditions and the following disclaimer in the
11
+ * documentation and/or other materials provided with the distribution.
12
+ * * Neither the name of the NVIDIA CORPORATION nor the
13
+ * names of its contributors may be used to endorse or promote products
14
+ * derived from this software without specific prior written permission.
15
+ *
16
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
+ *
27
+ ******************************************************************************/
28
+
29
+ //! @file
30
+ //! The cub::BlockScan class provides :ref:`collective <collective-primitives>` methods for computing a parallel prefix
31
+ //! sum/scan of items partitioned across a CUDA thread block.
32
+
33
+ #pragma once
34
+
35
+ #include <cub/config.cuh>
36
+
37
+ #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
38
+ # pragma GCC system_header
39
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
40
+ # pragma clang system_header
41
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
42
+ # pragma system_header
43
+ #endif // no system header
44
+
45
+ #include <cub/block/specializations/block_scan_raking.cuh>
46
+ #include <cub/block/specializations/block_scan_warp_scans.cuh>
47
+ #include <cub/util_ptx.cuh>
48
+ #include <cub/util_type.cuh>
49
+
50
+ #include <cuda/std/__functional/operations.h>
51
+ #include <cuda/std/__type_traits/conditional.h>
52
+
53
+ CUB_NAMESPACE_BEGIN
54
+
55
+ /******************************************************************************
56
+ * Algorithmic variants
57
+ ******************************************************************************/
58
+
59
+ //! @brief BlockScanAlgorithm enumerates alternative algorithms for cub::BlockScan to compute a
60
+ //! parallel prefix scan across a CUDA thread block.
61
+ enum BlockScanAlgorithm
62
+ {
63
+
64
+ //! @rst
65
+ //! Overview
66
+ //! ++++++++++++++++++++++++++
67
+ //!
68
+ //! An efficient "raking reduce-then-scan" prefix scan algorithm. Execution is comprised of five phases:
69
+ //!
70
+ //! #. Upsweep sequential reduction in registers (if threads contribute more than one input each).
71
+ //! Each thread then places the partial reduction of its item(s) into shared memory.
72
+ //! #. Upsweep sequential reduction in shared memory.
73
+ //! Threads within a single warp rake across segments of shared partial reductions.
74
+ //! #. A warp-synchronous Kogge-Stone style exclusive scan within the raking warp.
75
+ //! #. Downsweep sequential exclusive scan in shared memory.
76
+ //! Threads within a single warp rake across segments of shared partial reductions,
77
+ //! seeded with the warp-scan output.
78
+ //! #. Downsweep sequential scan in registers (if threads contribute more than one input),
79
+ //! seeded with the raking scan output.
80
+ //!
81
+ //! Performance Considerations
82
+ //! ++++++++++++++++++++++++++
83
+ //!
84
+ //! - Although this variant may suffer longer turnaround latencies when the
85
+ //! GPU is under-occupied, it can often provide higher overall throughput
86
+ //! across the GPU when suitably occupied.
87
+ //!
88
+ //! @endrst
89
+ BLOCK_SCAN_RAKING,
90
+
91
+ //! @rst
92
+ //! Overview
93
+ //! ++++++++++++++++++++++++++
94
+ //!
95
+ //! Similar to cub::BLOCK_SCAN_RAKING, but with fewer shared memory reads at the expense of higher
96
+ //! register pressure. Raking threads preserve their "upsweep" segment of values in registers while performing
97
+ //! warp-synchronous scan, allowing the "downsweep" not to re-read them from shared memory.
98
+ //!
99
+ //! @endrst
100
+ BLOCK_SCAN_RAKING_MEMOIZE,
101
+
102
+ //! @rst
103
+ //! Overview
104
+ //! ++++++++++++++++++++++++++
105
+ //!
106
+ //! A quick "tiled warpscans" prefix scan algorithm. Execution is comprised of four phases:
107
+ //! #. Upsweep sequential reduction in registers (if threads contribute more than one input each).
108
+ //! Each thread then places the partial reduction of its item(s) into shared memory.
109
+ //! #. Compute a shallow, but inefficient warp-synchronous Kogge-Stone style scan within each warp.
110
+ //! #. A propagation phase where the warp scan outputs in each warp are updated with the aggregate
111
+ //! from each preceding warp.
112
+ //! #. Downsweep sequential scan in registers (if threads contribute more than one input),
113
+ //! seeded with the warp scan output.
114
+ //!
115
+ //! Performance Considerations
116
+ //! ++++++++++++++++++++++++++
117
+ //!
118
+ //! - Although this variant may suffer lower overall throughput across the
119
+ //! GPU due to a heavy reliance on inefficient warpscans, it can
120
+ //! often provide lower turnaround latencies when the GPU is under-occupied.
121
+ //!
122
+ //! @endrst
123
+ BLOCK_SCAN_WARP_SCANS,
124
+ };
125
+
126
+ //! @rst
127
+ //! The BlockScan class provides :ref:`collective <collective-primitives>` methods for computing a parallel prefix
128
+ //! sum/scan of items partitioned across a CUDA thread block.
129
+ //!
130
+ //! Overview
131
+ //! +++++++++++++++++++++++++++++++++++++++++++++
132
+ //!
133
+ //! - Given a list of input elements and a binary reduction operator, a
134
+ //! `prefix scan <http://en.wikipedia.org/wiki/Prefix_sum>`_ produces an output list where each element is computed
135
+ //! to be the reduction of the elements occurring earlier in the input list. *Prefix sum* connotes a prefix scan with
136
+ //! the addition operator. The term *inclusive* indicates that the *i*\ :sup:`th` output reduction incorporates
137
+ //! the *i*\ :sup:`th` input. The term *exclusive* indicates the *i*\ :sup:`th` input is not incorporated into
138
+ //! the *i*\ :sup:`th` output reduction.
139
+ //! - @rowmajor
140
+ //! - BlockScan can be optionally specialized by algorithm to accommodate different workload profiles:
141
+ //!
142
+ //! #. :cpp:enumerator:`cub::BLOCK_SCAN_RAKING`:
143
+ //! An efficient (high throughput) "raking reduce-then-scan" prefix scan algorithm.
144
+ //! #. :cpp:enumerator:`cub::BLOCK_SCAN_RAKING_MEMOIZE`:
145
+ //! Similar to cub::BLOCK_SCAN_RAKING, but having higher throughput at the expense of additional
146
+ //! register pressure for intermediate storage.
147
+ //! #. :cpp:enumerator:`cub::BLOCK_SCAN_WARP_SCANS`:
148
+ //! A quick (low latency) "tiled warpscans" prefix scan algorithm.
149
+ //!
150
+ //! Performance Considerations
151
+ //! +++++++++++++++++++++++++++++++++++++++++++++
152
+ //!
153
+ //! - @granularity
154
+ //! - Uses special instructions when applicable (e.g., warp ``SHFL``)
155
+ //! - Uses synchronization-free communication between warp lanes when applicable
156
+ //! - Invokes a minimal number of block-wide synchronization barriers (only
157
+ //! one or two depending on algorithm selection)
158
+ //! - Incurs zero bank conflicts for most types
159
+ //! - Computation is slightly more efficient (i.e., having lower instruction overhead) for:
160
+ //!
161
+ //! - Prefix sum variants (vs. generic scan)
162
+ //! - @blocksize
163
+ //!
164
+ //! - See cub::BlockScanAlgorithm for performance details regarding algorithmic alternatives
165
+ //!
166
+ //! A Simple Example
167
+ //! +++++++++++++++++++++++++++++++++++++++++++++
168
+ //!
169
+ //! @blockcollective{BlockScan}
170
+ //!
171
+ //! The code snippet below illustrates an exclusive prefix sum of 512 integer items that
172
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128 threads
173
+ //! where each thread owns 4 consecutive items.
174
+ //!
175
+ //! .. code-block:: c++
176
+ //!
177
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
178
+ //!
179
+ //! __global__ void ExampleKernel(...)
180
+ //! {
181
+ //! // Specialize BlockScan for a 1D block of 128 threads of type int
182
+ //! using BlockScan = cub::BlockScan<int, 128>;
183
+ //!
184
+ //! // Allocate shared memory for BlockScan
185
+ //! __shared__ typename BlockScan::TempStorage temp_storage;
186
+ //!
187
+ //! // Obtain a segment of consecutive items that are blocked across threads
188
+ //! int thread_data[4];
189
+ //! ...
190
+ //!
191
+ //! // Collectively compute the block-wide exclusive prefix sum
192
+ //! BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data);
193
+ //!
194
+ //! Suppose the set of input ``thread_data`` across the block of threads is
195
+ //! ``{[1,1,1,1], [1,1,1,1], ..., [1,1,1,1]}``.
196
+ //! The corresponding output ``thread_data`` in those threads will be
197
+ //! ``{[0,1,2,3], [4,5,6,7], ..., [508,509,510,511]}``.
198
+ //!
199
+ //! Re-using dynamically allocated shared memory
200
+ //! +++++++++++++++++++++++++++++++++++++++++++++
201
+ //!
202
+ //! The ``block/example_block_reduce_dyn_smem.cu`` example illustrates usage of dynamically allocated shared memory with
203
+ //! BlockReduce and how to re-purpose the same memory region.
204
+ //! This example can be easily adapted to the storage required by BlockScan.
205
+ //!
206
+ //! @endrst
207
+ //!
208
+ //! @tparam T
209
+ //! Data type being scanned
210
+ //!
211
+ //! @tparam BLOCK_DIM_X
212
+ //! The thread block length in threads along the X dimension
213
+ //!
214
+ //! @tparam ALGORITHM
215
+ //! **[optional]** cub::BlockScanAlgorithm enumerator specifying the underlying algorithm to use
216
+ //! (default: cub::BLOCK_SCAN_RAKING)
217
+ //!
218
+ //! @tparam BLOCK_DIM_Y
219
+ //! **[optional]** The thread block length in threads along the Y dimension
220
+ //! (default: 1)
221
+ //!
222
+ //! @tparam BLOCK_DIM_Z
223
+ //! **[optional]** The thread block length in threads along the Z dimension (default: 1)
224
+ //!
225
+ template <typename T,
226
+ int BLOCK_DIM_X,
227
+ BlockScanAlgorithm ALGORITHM = BLOCK_SCAN_RAKING,
228
+ int BLOCK_DIM_Y = 1,
229
+ int BLOCK_DIM_Z = 1>
230
+ class BlockScan
231
+ {
232
+ private:
233
+ /// Compile-time constants derived from the block dimensions
234
+ enum
235
+ {
236
+ /// Total number of threads in the block (product of the X, Y and Z extents)
237
+ BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
238
+ };
239
+
240
+ /**
241
+ * Algorithm actually used by this specialization. BLOCK_SCAN_WARP_SCANS
242
+ * is replaced by BLOCK_SCAN_RAKING whenever BLOCK_THREADS is not a
243
+ * multiple of detail::warp_threads; any other requested ALGORITHM is
244
+ * kept unchanged.
245
+ */
246
+ static constexpr BlockScanAlgorithm SAFE_ALGORITHM =
247
+ ((ALGORITHM == BLOCK_SCAN_WARP_SCANS) && (BLOCK_THREADS % detail::warp_threads != 0))
248
+ ? BLOCK_SCAN_RAKING
249
+ : ALGORITHM;
250
+
251
+ using WarpScans = detail::BlockScanWarpScans<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z>; // candidate delegate for BLOCK_SCAN_WARP_SCANS
252
+ using Raking = // candidate delegate for every other algorithm value
253
+ detail::BlockScanRaking<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, (SAFE_ALGORITHM == BLOCK_SCAN_RAKING_MEMOIZE)>; // last argument is true only when BLOCK_SCAN_RAKING_MEMOIZE is selected
254
+
255
+ /// Delegate type that performs the scan: WarpScans if SAFE_ALGORITHM is BLOCK_SCAN_WARP_SCANS, Raking otherwise
256
+ using InternalBlockScan = ::cuda::std::_If<SAFE_ALGORITHM == BLOCK_SCAN_WARP_SCANS, WarpScans, Raking>;
257
+
258
+ /// Shared memory storage layout type for BlockScan (taken from the selected delegate)
259
+ using _TempStorage = typename InternalBlockScan::TempStorage;
260
+
261
+ /// Shared storage reference (bound by the constructors, handed to the delegate on every scan call)
262
+ _TempStorage& temp_storage;
263
+
264
+ /// Linear thread-id (initialized from RowMajorTid in the constructors)
265
+ unsigned int linear_tid;
266
+
267
+ /// Internal storage allocator
268
+ _CCCL_DEVICE _CCCL_FORCEINLINE _TempStorage& PrivateStorage() // supplies the storage used by the default constructor
269
+ {
270
+ __shared__ _TempStorage private_storage; // statically declared __shared__ object owned by this function
271
+ return private_storage; // handed out by reference; no copy is made
272
+ }
273
+
274
+ public:
275
+ /// @smemstorage{BlockScan}
276
+ struct TempStorage : Uninitialized<_TempStorage> // public storage type callers allocate; wraps the private _TempStorage layout
277
+ {}; // adds no members of its own; the constructor below reaches the layout through Alias()
278
+
279
+ //! @name Collective constructors
280
+ //! @{
281
+
282
+ //! @brief Collective constructor using a private static allocation of shared memory as temporary storage.
283
+ _CCCL_DEVICE _CCCL_FORCEINLINE BlockScan() // default constructor: no caller-provided storage
284
+ : temp_storage(PrivateStorage()) // bind to the __shared__ object declared inside PrivateStorage()
285
+ , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) // thread rank computed from the three block extents
286
+ {}
287
+
288
+ /**
289
+ * @brief Collective constructor using the specified memory allocation as temporary storage.
290
+ *
291
+ * @param[in] temp_storage
292
+ * Reference to memory allocation having layout type TempStorage
293
+ */
294
+ _CCCL_DEVICE _CCCL_FORCEINLINE BlockScan(TempStorage& temp_storage) // parameter shadows the member of the same name
294
+ : temp_storage(temp_storage.Alias()) // inside the parentheses the name is the parameter; Alias() yields the _TempStorage reference stored in the member
295
+ , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) // thread rank computed from the three block extents
296
+ {}
298
+
299
+ //! @} end member group
300
+ //! @name Exclusive prefix sum operations
301
+ //! @{
302
+
303
+ //! @rst
304
+ //! Computes an exclusive block-wide prefix scan using addition (+) as the scan operator.
305
+ //! Each thread contributes one input element. The value of 0 is applied as the initial value, and is assigned
306
+ //! to ``output`` in *thread*\ :sub:`0`.
307
+ //!
308
+ //! - @identityzero
309
+ //! - @rowmajor
310
+ //! - @smemreuse
311
+ //!
312
+ //! Snippet
313
+ //! +++++++
314
+ //!
315
+ //! The code snippet below illustrates an exclusive prefix sum of 128 integer items that
316
+ //! are partitioned across 128 threads.
317
+ //!
318
+ //! .. code-block:: c++
319
+ //!
320
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
321
+ //!
322
+ //! __global__ void ExampleKernel(...)
323
+ //! {
324
+ //! // Specialize BlockScan for a 1D block of 128 threads of type int
325
+ //! using BlockScan = cub::BlockScan<int, 128>;
326
+ //!
327
+ //! // Allocate shared memory for BlockScan
328
+ //! __shared__ typename BlockScan::TempStorage temp_storage;
329
+ //!
330
+ //! // Obtain input item for each thread
331
+ //! int thread_data;
332
+ //! ...
333
+ //!
334
+ //! // Collectively compute the block-wide exclusive prefix sum
335
+ //! BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data);
336
+ //!
337
+ //! Suppose the set of input ``thread_data`` across the block of threads is ``1, 1, ..., 1``.
338
+ //! The corresponding output ``thread_data`` in those threads will be ``0, 1, ..., 127``.
339
+ //!
340
+ //! @endrst
341
+ //!
342
+ //! @param[in] input
343
+ //! Calling thread's input item
344
+ //!
345
+ //! @param[out] output
346
+ //! Calling thread's output item (may be aliased to `input`)
347
+ _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveSum(T input, T& output) // one item per thread, no aggregate
348
+ {
349
+ T initial_value{}; // value-initialized T (zero for arithmetic types) is the scan seed
350
+
351
+ ExclusiveScan(input, output, initial_value, ::cuda::std::plus<>{}); // the sum is an exclusive scan with plus as the operator
352
+ }
353
+
354
+ //! @rst
355
+ //! Computes an exclusive block-wide prefix scan using addition (+) as the scan operator.
356
+ //! Each thread contributes one input element.
357
+ //! The value of 0 is applied as the initial value, and is assigned to ``output`` in *thread*\ :sub:`0`.
358
+ //! Also provides every thread with the block-wide ``block_aggregate`` of all inputs.
359
+ //!
360
+ //! - @identityzero
361
+ //! - @rowmajor
362
+ //! - @smemreuse
363
+ //!
364
+ //! Snippet
365
+ //! +++++++
366
+ //!
367
+ //! The code snippet below illustrates an exclusive prefix sum of 128 integer items that
368
+ //! are partitioned across 128 threads.
369
+ //!
370
+ //! .. code-block:: c++
371
+ //!
372
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
373
+ //!
374
+ //! __global__ void ExampleKernel(...)
375
+ //! {
376
+ //! // Specialize BlockScan for a 1D block of 128 threads of type int
377
+ //! using BlockScan = cub::BlockScan<int, 128>;
378
+ //!
379
+ //! // Allocate shared memory for BlockScan
380
+ //! __shared__ typename BlockScan::TempStorage temp_storage;
381
+ //!
382
+ //! // Obtain input item for each thread
383
+ //! int thread_data;
384
+ //! ...
385
+ //!
386
+ //! // Collectively compute the block-wide exclusive prefix sum
387
+ //! int block_aggregate;
388
+ //! BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data, block_aggregate);
389
+ //!
390
+ //! Suppose the set of input ``thread_data`` across the block of threads is ``1, 1, ..., 1``.
391
+ //! The corresponding output ``thread_data`` in those threads will be ``0, 1, ..., 127``.
392
+ //! Furthermore the value ``128`` will be stored in ``block_aggregate`` for all threads.
393
+ //!
394
+ //! @endrst
395
+ //!
396
+ //! @param[in] input
397
+ //! Calling thread's input item
398
+ //!
399
+ //! @param[out] output
400
+ //! Calling thread's output item (may be aliased to `input`)
401
+ //!
402
+ //! @param[out] block_aggregate
403
+ //! block-wide aggregate reduction of input items
404
+ _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveSum(T input, T& output, T& block_aggregate) // one item per thread, also reports the aggregate
405
+ {
406
+ T initial_value{}; // value-initialized T (zero for arithmetic types) is the scan seed
407
+
408
+ ExclusiveScan(input, output, initial_value, ::cuda::std::plus<>{}, block_aggregate); // block_aggregate is written by the forwarded call
409
+ }
410
+
411
+ //! @rst
412
+ //! Computes an exclusive block-wide prefix scan using addition (+) as the scan operator.
413
+ //! Each thread contributes one input element. Instead of using 0 as the block-wide prefix, the call-back functor
414
+ //! ``block_prefix_callback_op`` is invoked by the first warp in the block, and the value returned by
415
+ //! *lane*\ :sub:`0` in that warp is used as the "seed" value that logically prefixes the thread block's
416
+ //! scan inputs.
417
+ //!
418
+ //! - @identityzero
419
+ //! - The ``block_prefix_callback_op`` functor must implement a member function
420
+ //! ``T operator()(T block_aggregate)``. The functor will be invoked by the first warp of threads in the block,
421
+ //! however only the return value from *lane*\ :sub:`0` is applied as the block-wide prefix. Can be stateful.
422
+ //! - @rowmajor
423
+ //! - @smemreuse
424
+ //!
425
+ //! Snippet
426
+ //! +++++++
427
+ //!
428
+ //! The code snippet below illustrates a single thread block that progressively
429
+ //! computes an exclusive prefix sum over multiple "tiles" of input using a
430
+ //! prefix functor to maintain a running total between block-wide scans. Each tile consists
431
+ //! of 128 integer items that are partitioned across 128 threads.
432
+ //!
433
+ //! .. code-block:: c++
434
+ //!
435
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
436
+ //!
437
+ //! // A stateful callback functor that maintains a running prefix to be applied
438
+ //! // during consecutive scan operations.
439
+ //! struct BlockPrefixCallbackOp
440
+ //! {
441
+ //! // Running prefix
442
+ //! int running_total;
443
+ //!
444
+ //! // Constructor
445
+ //! __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {}
446
+ //!
447
+ //! // Callback operator to be entered by the first warp of threads in the block.
448
+ //! // Thread-0 is responsible for returning a value for seeding the block-wide scan.
449
+ //! __device__ int operator()(int block_aggregate)
450
+ //! {
451
+ //! int old_prefix = running_total;
452
+ //! running_total += block_aggregate;
453
+ //! return old_prefix;
454
+ //! }
455
+ //! };
456
+ //!
457
+ //! __global__ void ExampleKernel(int *d_data, int num_items, ...)
458
+ //! {
459
+ //! // Specialize BlockScan for a 1D block of 128 threads
460
+ //! using BlockScan = cub::BlockScan<int, 128>;
461
+ //!
462
+ //! // Allocate shared memory for BlockScan
463
+ //! __shared__ typename BlockScan::TempStorage temp_storage;
464
+ //!
465
+ //! // Initialize running total
466
+ //! BlockPrefixCallbackOp prefix_op(0);
467
+ //!
468
+ //! // Have the block iterate over segments of items
469
+ //! for (int block_offset = 0; block_offset < num_items; block_offset += 128)
470
+ //! {
471
+ //! // Load a segment of consecutive items that are blocked across threads
472
+ //! int thread_data = d_data[block_offset + threadIdx.x];
473
+ //!
474
+ //! // Collectively compute the block-wide exclusive prefix sum
475
+ //! BlockScan(temp_storage).ExclusiveSum(
476
+ //! thread_data, thread_data, prefix_op);
477
+ //! __syncthreads();
478
+ //!
479
+ //! // Store scanned items to output segment
480
+ //! d_data[block_offset + threadIdx.x] = thread_data;
481
+ //! }
482
+ //!
483
+ //! Suppose the input ``d_data`` is ``1, 1, 1, 1, 1, 1, 1, 1, ...``.
484
+ //! The corresponding output for the first segment will be ``0, 1, ..., 127``.
485
+ //! The output for the second segment will be ``128, 129, ..., 255``.
486
+ //!
487
+ //! @endrst
488
+ //!
489
+ //! @tparam BlockPrefixCallbackOp
490
+ //! **[inferred]** Call-back functor type having member `T operator()(T block_aggregate)`
491
+ //!
492
+ //! @param[in] input
493
+ //! Calling thread's input item
494
+ //!
495
+ //! @param[out] output
496
+ //! Calling thread's output item (may be aliased to `input`)
497
+ //!
498
+ //! @param[in,out] block_prefix_callback_op
499
+ //! @rst
500
+ //! *warp*\ :sub:`0` only call-back functor for specifying a block-wide prefix to be applied to
501
+ //! the logical input sequence.
502
+ //! @endrst
503
+ template <typename BlockPrefixCallbackOp>
504
+ _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveSum(T input, T& output, BlockPrefixCallbackOp& block_prefix_callback_op) // one item per thread, seed supplied by the callback
505
+ {
506
+ ExclusiveScan(input, output, ::cuda::std::plus<>{}, block_prefix_callback_op); // no explicit initial value: the callback overload of ExclusiveScan is used
507
+ }
508
+
509
+ //! @} end member group
510
+ //! @name Exclusive prefix sum operations (multiple data per thread)
511
+ //! @{
512
+
513
+ //! @rst
514
+ //! Computes an exclusive block-wide prefix scan using addition (+) as the scan operator.
515
+ //! Each thread contributes an array of consecutive input elements.
516
+ //! The value of 0 is applied as the initial value, and is assigned to ``output[0]`` in *thread*\ :sub:`0`.
517
+ //!
518
+ //! - @identityzero
519
+ //! - @blocked
520
+ //! - @granularity
521
+ //! - @smemreuse
522
+ //!
523
+ //! Snippet
524
+ //! +++++++
525
+ //!
526
+ //! The code snippet below illustrates an exclusive prefix sum of 512 integer items that
527
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128 threads
528
+ //! where each thread owns 4 consecutive items.
529
+ //!
530
+ //! .. code-block:: c++
531
+ //!
532
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
533
+ //!
534
+ //! __global__ void ExampleKernel(...)
535
+ //! {
536
+ //! // Specialize BlockScan for a 1D block of 128 threads of type int
537
+ //! using BlockScan = cub::BlockScan<int, 128>;
538
+ //!
539
+ //! // Allocate shared memory for BlockScan
540
+ //! __shared__ typename BlockScan::TempStorage temp_storage;
541
+ //!
542
+ //! // Obtain a segment of consecutive items that are blocked across threads
543
+ //! int thread_data[4];
544
+ //! ...
545
+ //!
546
+ //! // Collectively compute the block-wide exclusive prefix sum
547
+ //! BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data);
548
+ //!
549
+ //! Suppose the set of input ``thread_data`` across the block of threads is
550
+ //! ``{ [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }``.
551
+ //! The corresponding output ``thread_data`` in those threads will be
552
+ //! ``{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }``.
553
+ //!
554
+ //! @endrst
555
+ //!
556
+ //! @tparam ITEMS_PER_THREAD
557
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
558
+ //!
559
+ //! @param[in] input
560
+ //! Calling thread's input items
561
+ //!
562
+ //! @param[out] output
563
+ //! Calling thread's output items (may be aliased to `input`)
564
+ template <int ITEMS_PER_THREAD>
565
+ _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveSum(T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD]) // ITEMS_PER_THREAD is deduced from the array arguments
566
+ {
567
+ T initial_value{}; // value-initialized T (zero for arithmetic types) is the scan seed
568
+
569
+ ExclusiveScan(input, output, initial_value, ::cuda::std::plus<>{}); // both arrays are forwarded unchanged with plus as the operator
570
+ }
571
+
572
+ //! @rst
573
+ //! Computes an exclusive block-wide prefix scan using addition (+) as the scan operator.
574
+ //! Each thread contributes an array of consecutive input elements.
575
+ //! The value of 0 is applied as the initial value, and is assigned to ``output[0]`` in *thread*\ :sub:`0`.
576
+ //! Also provides every thread with the block-wide ``block_aggregate`` of all inputs.
577
+ //!
578
+ //! - @identityzero
579
+ //! - @blocked
580
+ //! - @granularity
581
+ //! - @smemreuse
582
+ //!
583
+ //! Snippet
584
+ //! +++++++
585
+ //!
586
+ //! The code snippet below illustrates an exclusive prefix sum of 512 integer items that are partitioned in
587
+ //! a :ref:`blocked arrangement <flexible-data-arrangement>` across 128 threads where each thread owns
588
+ //! 4 consecutive items.
589
+ //!
590
+ //! .. code-block:: c++
591
+ //!
592
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
593
+ //!
594
+ //! __global__ void ExampleKernel(...)
595
+ //! {
596
+ //! // Specialize BlockScan for a 1D block of 128 threads of type int
597
+ //! using BlockScan = cub::BlockScan<int, 128>;
598
+ //!
599
+ //! // Allocate shared memory for BlockScan
600
+ //! __shared__ typename BlockScan::TempStorage temp_storage;
601
+ //!
602
+ //! // Obtain a segment of consecutive items that are blocked across threads
603
+ //! int thread_data[4];
604
+ //! ...
605
+ //!
606
+ //! // Collectively compute the block-wide exclusive prefix sum
607
+ //! int block_aggregate;
608
+ //! BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data, block_aggregate);
609
+ //!
610
+ //! Suppose the set of input ``thread_data`` across the block of threads is
611
+ //! ``{ [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }``.
612
+ //! The corresponding output ``thread_data`` in those threads will be
613
+ //! ``{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }``.
614
+ //! Furthermore the value ``512`` will be stored in ``block_aggregate`` for all threads.
615
+ //!
616
+ //! @endrst
617
+ //!
618
+ //! @tparam ITEMS_PER_THREAD
619
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
620
+ //!
621
+ //! @param[in] input
622
+ //! Calling thread's input items
623
+ //!
624
+ //! @param[out] output
625
+ //! Calling thread's output items (may be aliased to `input`)
626
+ //!
627
+ //! @param[out] block_aggregate
628
+ //! block-wide aggregate reduction of input items
629
+ template <int ITEMS_PER_THREAD>
630
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
631
+ ExclusiveSum(T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], T& block_aggregate)
632
+ {
633
+ // Value-initialized seed (zero for arithmetic T); this wrapper does no reduction itself
634
+ T initial_value{};
635
+
636
+ ExclusiveScan(input, output, initial_value, ::cuda::std::plus<>{}, block_aggregate); // arrays and block_aggregate are forwarded unchanged
637
+ }
638
+
639
+ //! @rst
640
+ //! Computes an exclusive block-wide prefix scan using addition (+) as the scan operator.
641
+ //! Each thread contributes an array of consecutive input elements.
642
+ //! Instead of using 0 as the block-wide prefix, the call-back functor ``block_prefix_callback_op`` is invoked by
643
+ //! the first warp in the block, and the value returned by *lane*\ :sub:`0` in that warp is used as the "seed"
644
+ //! value that logically prefixes the thread block's scan inputs.
645
+ //!
646
+ //! - @identityzero
647
+ //! - The ``block_prefix_callback_op`` functor must implement a member function ``T operator()(T block_aggregate)``.
648
+ //! The functor will be invoked by the first warp of threads in the block, however only the return value from
649
+ //! *lane*\ :sub:`0` is applied as the block-wide prefix. Can be stateful.
650
+ //! - @blocked
651
+ //! - @granularity
652
+ //! - @smemreuse
653
+ //!
654
+ //!
655
+ //! Snippet
656
+ //! +++++++
657
+ //!
658
+ //! The code snippet below illustrates a single thread block that progressively
659
+ //! computes an exclusive prefix sum over multiple "tiles" of input using a
660
+ //! prefix functor to maintain a running total between block-wide scans. Each tile consists
661
+ //! of 512 integer items that are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>`
662
+ //! across 128 threads where each thread owns 4 consecutive items.
663
+ //!
664
+ //! .. code-block:: c++
665
+ //!
666
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
667
+ //!
668
+ //! // A stateful callback functor that maintains a running prefix to be applied
669
+ //! // during consecutive scan operations.
670
+ //! struct BlockPrefixCallbackOp
671
+ //! {
672
+ //! // Running prefix
673
+ //! int running_total;
674
+ //!
675
+ //! // Constructor
676
+ //! __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {}
677
+ //!
678
+ //! // Callback operator to be entered by the first warp of threads in the block.
679
+ //! // Thread-0 is responsible for returning a value for seeding the block-wide scan.
680
+ //! __device__ int operator()(int block_aggregate)
681
+ //! {
682
+ //! int old_prefix = running_total;
683
+ //! running_total += block_aggregate;
684
+ //! return old_prefix;
685
+ //! }
686
+ //! };
687
+ //!
688
+ //! __global__ void ExampleKernel(int *d_data, int num_items, ...)
689
+ //! {
690
+ //! // Specialize BlockLoad, BlockStore, and BlockScan for a 1D block of 128 threads, 4 ints per thread
691
+ //! using BlockLoad = cub::BlockLoad<int, 128, 4, cub::BLOCK_LOAD_TRANSPOSE>;
692
+ //! using BlockStore = cub::BlockStore<int, 128, 4, cub::BLOCK_STORE_TRANSPOSE>;
693
+ //! using BlockScan = cub::BlockScan<int, 128>;
694
+ //!
695
+ //! // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan
696
+ //! __shared__ union {
697
+ //! typename BlockLoad::TempStorage load;
698
+ //! typename BlockScan::TempStorage scan;
699
+ //! typename BlockStore::TempStorage store;
700
+ //! } temp_storage;
701
+ //!
702
+ //! // Initialize running total
703
+ //! BlockPrefixCallbackOp prefix_op(0);
704
+ //!
705
+ //! // Have the block iterate over segments of items
706
+ //! for (int block_offset = 0; block_offset < num_items; block_offset += 128 * 4)
707
+ //! {
708
+ //! // Load a segment of consecutive items that are blocked across threads
709
+ //! int thread_data[4];
710
+ //! BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data);
711
+ //! __syncthreads();
712
+ //!
713
+ //! // Collectively compute the block-wide exclusive prefix sum
714
+ //! int block_aggregate;
715
+ //! BlockScan(temp_storage.scan).ExclusiveSum(
716
+ //! thread_data, thread_data, prefix_op);
717
+ //! __syncthreads();
718
+ //!
719
+ //! // Store scanned items to output segment
720
+ //! BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data);
721
+ //! __syncthreads();
722
+ //! }
723
+ //!
724
+ //! Suppose the input ``d_data`` is ``1, 1, 1, 1, 1, 1, 1, 1, ...``.
725
+ //! The corresponding output for the first segment will be ``0, 1, 2, 3, ..., 510, 511``.
726
+ //! The output for the second segment will be ``512, 513, 514, 515, ..., 1022, 1023``.
727
+ //!
728
+ //! @endrst
729
+ //!
730
+ //! @tparam ITEMS_PER_THREAD
731
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
732
+ //!
733
+ //! @tparam BlockPrefixCallbackOp
734
+ //! **[inferred]** Call-back functor type having member
735
+ //! `T operator()(T block_aggregate)`
736
+ //!
737
+ //! @param[in] input
738
+ //! Calling thread's input items
739
+ //!
740
+ //! @param[out] output
741
+ //! Calling thread's output items (may be aliased to `input`)
742
+ //!
743
+ //! @param[in,out] block_prefix_callback_op
744
+ //! @rst
745
+ //! *warp*\ :sub:`0` only call-back functor for specifying a block-wide prefix to be applied to
746
+ //! the logical input sequence.
747
+ //! @endrst
748
+ template <int ITEMS_PER_THREAD, typename BlockPrefixCallbackOp>
749
+ _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveSum(
750
+ T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], BlockPrefixCallbackOp& block_prefix_callback_op)
751
+ {
752
+ ExclusiveScan(input, output, ::cuda::std::plus<>{}, block_prefix_callback_op); // no explicit initial value: the seed comes from the callback
753
+ }
754
+
755
+ //! @} end member group // Exclusive prefix sums (multiple data per thread)
756
+ //! @name Exclusive prefix scan operations
757
+ //! @{
758
+
759
+ //! @rst
760
+ //! Computes an exclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
761
+ //! Each thread contributes one input element.
762
+ //!
763
+ //! - Supports non-commutative scan operators.
764
+ //! - @rowmajor
765
+ //! - @smemreuse
766
+ //!
767
+ //! Snippet
768
+ //! +++++++
769
+ //!
770
+ //! The code snippet below illustrates an exclusive prefix max scan of 128 integer items that
771
+ //! are partitioned across 128 threads.
772
+ //!
773
+ //! .. code-block:: c++
774
+ //!
775
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
776
+ //!
777
+ //! __global__ void ExampleKernel(...)
778
+ //! {
779
+ //! // Specialize BlockScan for a 1D block of 128 threads of type int
780
+ //! using BlockScan = cub::BlockScan<int, 128>;
781
+ //!
782
+ //! // Allocate shared memory for BlockScan
783
+ //! __shared__ typename BlockScan::TempStorage temp_storage;
784
+ //!
785
+ //! // Obtain input item for each thread
786
+ //! int thread_data;
787
+ //! ...
788
+ //!
789
+ //! // Collectively compute the block-wide exclusive prefix max scan
790
+ //! BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cuda::maximum<>{});
791
+ //!
792
+ //! Suppose the set of input ``thread_data`` across the block of threads is ``0, -1, 2, -3, ..., 126, -127``.
793
+ //! The corresponding output ``thread_data`` in those threads will be ``INT_MIN, 0, 0, 2, ..., 124, 126``.
794
+ //!
795
+ //! @endrst
796
+ //!
797
+ //! @tparam ScanOp
798
+ //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
799
+ //!
800
+ //! @param[in] input
801
+ //! Calling thread's input item
802
+ //!
803
+ //! @param[out] output
804
+ //! Calling thread's output item (may be aliased to `input`)
805
+ //!
806
+ //! @param[in] initial_value
807
+ //! @rst
808
+ //! Initial value to seed the exclusive scan (and is assigned to ``output`` in *thread*\ :sub:`0`)
809
+ //! @endrst
810
+ //!
811
+ //! @param[in] scan_op
812
+ //! Binary scan functor
813
+ template <typename ScanOp>
814
+ _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveScan(T input, T& output, T initial_value, ScanOp scan_op) // one item per thread, explicit seed
815
+ {
816
+ InternalBlockScan(temp_storage).ExclusiveScan(input, output, initial_value, scan_op); // temporary delegate built over temp_storage does the work
817
+ }
818
+
819
+ //! @rst
820
+ //! Computes an exclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
821
+ //! Each thread contributes one input element.
822
+ //! Also provides every thread with the block-wide ``block_aggregate`` of all inputs.
823
+ //!
824
+ //! - Supports non-commutative scan operators.
825
+ //! - @rowmajor
826
+ //! - @smemreuse
827
+ //!
828
+ //! Snippet
829
+ //! +++++++
830
+ //!
831
+ //! The code snippet below illustrates an exclusive prefix max scan of 128 integer items that
832
+ //! are partitioned across 128 threads.
833
+ //!
834
+ //! .. code-block:: c++
835
+ //!
836
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
837
+ //!
838
+ //! __global__ void ExampleKernel(...)
839
+ //! {
840
+ //! // Specialize BlockScan for a 1D block of 128 threads of type int
841
+ //! using BlockScan = cub::BlockScan<int, 128>;
842
+ //!
843
+ //! // Allocate shared memory for BlockScan
844
+ //! __shared__ typename BlockScan::TempStorage temp_storage;
845
+ //!
846
+ //! // Obtain input item for each thread
847
+ //! int thread_data;
848
+ //! ...
849
+ //!
850
+ //! // Collectively compute the block-wide exclusive prefix max scan
851
+ //! int block_aggregate;
852
+ //! BlockScan(temp_storage).ExclusiveScan(
853
+ //! thread_data, thread_data, INT_MIN, cuda::maximum<>{}, block_aggregate);
854
+ //!
855
+ //! Suppose the set of input ``thread_data`` across the block of threads is ``0, -1, 2, -3, ..., 126, -127``.
856
+ //! The corresponding output ``thread_data`` in those threads will be ``INT_MIN, 0, 0, 2, ..., 124, 126``.
857
+ //! Furthermore the value ``126`` will be stored in ``block_aggregate`` for all threads.
858
+ //!
859
+ //! .. note::
860
+ //!
861
+ //! ``initial_value`` is not applied to the block-wide aggregate.
862
+ //!
863
+ //! @endrst
864
+ //!
865
+ //! @tparam ScanOp
866
+ //! **[inferred]** Binary scan functor type having member ``T operator()(const T &a, const T &b)``
867
+ //!
868
+ //! @param[in] input
869
+ //! Calling thread's input item
870
+ //!
871
+ //! @param[out] output
872
+ //! Calling thread's output item (may be aliased to ``input``)
873
+ //!
874
+ //! @param[in] initial_value
875
+ //! @rst
876
+ //! Initial value to seed the exclusive scan (and is assigned to ``output`` in *thread*\ :sub:`0`). It is not
877
+ //! taken into account for ``block_aggregate``.
878
+ //!
879
+ //! @endrst
880
+ //!
881
+ //! @param[in] scan_op
882
+ //! Binary scan functor
883
+ //!
884
+ //! @param[out] block_aggregate
885
+ //! block-wide aggregate reduction of input items
886
+ template <typename ScanOp>
887
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
888
+ ExclusiveScan(T input, T& output, T initial_value, ScanOp scan_op, T& block_aggregate) // one item per thread, explicit seed, also reports the aggregate
889
+ {
890
+ InternalBlockScan(temp_storage).ExclusiveScan(input, output, initial_value, scan_op, block_aggregate); // temporary delegate built over temp_storage; it writes block_aggregate
891
+ }
892
+
893
+ //! @rst
894
+ //! Computes an exclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
895
+ //! Each thread contributes one input element. The call-back functor ``block_prefix_callback_op`` is invoked by
896
+ //! the first warp in the block, and the value returned by *lane*\ :sub:`0` in that warp is used as
897
+ //! the "seed" value that logically prefixes the thread block's scan inputs.
898
+ //!
899
+ //! - The ``block_prefix_callback_op`` functor must implement a member function ``T operator()(T block_aggregate)``.
900
+ //! The functor will be invoked by the first warp of threads in the block, however only the return value from
901
+ //! *lane*\ :sub:`0` is applied as the block-wide prefix. Can be stateful.
902
+ //! - Supports non-commutative scan operators.
903
+ //! - @rowmajor
904
+ //! - @smemreuse
905
+ //!
906
+ //! Snippet
907
+ //! +++++++
908
+ //!
909
+ //! The code snippet below illustrates a single thread block that progressively
910
+ //! computes an exclusive prefix max scan over multiple "tiles" of input using a
911
+ //! prefix functor to maintain a running total between block-wide scans.
912
+ //! Each tile consists of 128 integer items that are partitioned across 128 threads.
913
+ //!
914
+ //! .. code-block:: c++
915
+ //!
916
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
917
+ //!
918
+ //! // A stateful callback functor that maintains a running prefix to be applied
919
+ //! // during consecutive scan operations.
920
+ //! struct BlockPrefixCallbackOp
921
+ //! {
922
+ //! // Running prefix
923
+ //! int running_total;
924
+ //!
925
+ //! // Constructor
926
+ //! __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {}
927
+ //!
928
+ //! // Callback operator to be entered by the first warp of threads in the block.
929
+ //! // Thread-0 is responsible for returning a value for seeding the block-wide scan.
930
+ //! __device__ int operator()(int block_aggregate)
931
+ //! {
932
+ //! int old_prefix = running_total;
933
+ //! running_total = (block_aggregate > old_prefix) ? block_aggregate : old_prefix;
934
+ //! return old_prefix;
935
+ //! }
936
+ //! };
937
+ //!
938
+ //! __global__ void ExampleKernel(int *d_data, int num_items, ...)
939
+ //! {
940
+ //! // Specialize BlockScan for a 1D block of 128 threads
941
+ //! using BlockScan = cub::BlockScan<int, 128>;
942
+ //!
943
+ //! // Allocate shared memory for BlockScan
944
+ //! __shared__ typename BlockScan::TempStorage temp_storage;
945
+ //!
946
+ //! // Initialize running total
947
+ //! BlockPrefixCallbackOp prefix_op(INT_MIN);
948
+ //!
949
+ //! // Have the block iterate over segments of items
950
+ //! for (int block_offset = 0; block_offset < num_items; block_offset += 128)
951
+ //! {
952
+ //! // Load a segment of consecutive items that are blocked across threads
953
+ //! int thread_data = d_data[block_offset + threadIdx.x];
954
+ //!
955
+ //! // Collectively compute the block-wide exclusive prefix max scan
956
+ //! BlockScan(temp_storage).ExclusiveScan(
957
+ //! thread_data, thread_data, cuda::maximum<>{}, prefix_op);
958
+ //! __syncthreads();
959
+ //!
960
+ //! // Store scanned items to output segment
961
+ //! d_data[block_offset + threadIdx.x] = thread_data;
962
+ //! }
963
+ //!
964
+ //! Suppose the input ``d_data`` is ``0, -1, 2, -3, 4, -5, ...``.
965
+ //! The corresponding output for the first segment will be ``INT_MIN, 0, 0, 2, ..., 124, 126``.
966
+ //! The output for the second segment will be ``126, 128, 128, 130, ..., 252, 254``.
967
+ //!
968
+ //! @endrst
969
+ //!
970
+ //! @tparam ScanOp
971
+ //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
972
+ //!
973
+ //! @tparam BlockPrefixCallbackOp
974
+ //! **[inferred]** Call-back functor type having member `T operator()(T block_aggregate)`
975
+ //!
976
+ //! @param[in] input
977
+ //! Calling thread's input item
978
+ //!
979
+ //! @param[out] output
980
+ //! Calling thread's output item (may be aliased to `input`)
981
+ //!
982
+ //! @param[in] scan_op
983
+ //! Binary scan functor
984
+ //!
985
+ //! @param[in,out] block_prefix_callback_op
986
+ //! @rst
987
+ //! *warp*\ :sub:`0` only call-back functor for specifying a block-wide prefix to be applied to
988
+ //! the logical input sequence.
989
+ //! @endrst
990
+ template <typename ScanOp, typename BlockPrefixCallbackOp>
991
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
992
+ ExclusiveScan(T input, T& output, ScanOp scan_op, BlockPrefixCallbackOp& block_prefix_callback_op)
993
+ { // Scalar (one item per thread) overload: no per-thread pre/post-processing, the scan is delegated in one call
994
+ InternalBlockScan(temp_storage).ExclusiveScan(input, output, scan_op, block_prefix_callback_op); // InternalBlockScan is built over temp_storage; the callback functor is forwarded by reference
995
+ }
996
+
997
+ //! @} end member group // Exclusive prefix scans (single datum per thread)
998
+ //! @name Exclusive prefix scan operations (multiple data per thread)
999
+ //! @{
1000
+
1001
+ //! @rst
1002
+ //! Computes an exclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
1003
+ //! Each thread contributes an array of consecutive input elements.
1004
+ //!
1005
+ //! - Supports non-commutative scan operators.
1006
+ //! - @blocked
1007
+ //! - @granularity
1008
+ //! - @smemreuse
1009
+ //!
1010
+ //! Snippet
1011
+ //! +++++++
1012
+ //!
1013
+ //! The code snippet below illustrates an exclusive prefix max scan of 512 integer
1014
+ //! items that are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>`
1015
+ //! across 128 threads where each thread owns 4 consecutive items.
1016
+ //!
1017
+ //! .. code-block:: c++
1018
+ //!
1019
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
1020
+ //!
1021
+ //! __global__ void ExampleKernel(...)
1022
+ //! {
1023
+ //! // Specialize BlockScan for a 1D block of 128 threads of type int
1024
+ //! using BlockScan = cub::BlockScan<int, 128>;
1025
+ //!
1026
+ //! // Allocate shared memory for BlockScan
1027
+ //! __shared__ typename BlockScan::TempStorage temp_storage;
1028
+ //!
1029
+ //! // Obtain a segment of consecutive items that are blocked across threads
1030
+ //! int thread_data[4];
1031
+ //! ...
1032
+ //!
1033
+ //! // Collectively compute the block-wide exclusive prefix max scan
1034
+ //! BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cuda::maximum<>{});
1035
+ //!
1036
+ //! Suppose the set of input ``thread_data`` across the block of threads is
1037
+ //! ``{ [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }``.
1038
+ //! The corresponding output ``thread_data`` in those threads will be
1039
+ //! ``{ [INT_MIN,0,0,2], [2,4,4,6], ..., [506,508,508,510] }``.
1040
+ //!
1041
+ //! @endrst
1042
+ //!
1043
+ //! @tparam ITEMS_PER_THREAD
1044
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
1045
+ //!
1046
+ //! @tparam ScanOp
1047
+ //! **[inferred]** Binary scan functor type having member
1048
+ //! `T operator()(const T &a, const T &b)`
1049
+ //!
1050
+ //! @param[in] input
1051
+ //! Calling thread's input items
1052
+ //!
1053
+ //! @param[out] output
1054
+ //! Calling thread's output items (may be aliased to `input`)
1055
+ //!
1056
+ //! @param[in] initial_value
1057
+ //! @rst
1058
+ //! Initial value to seed the exclusive scan (and is assigned to `output[0]` in *thread*\ :sub:`0`)
1059
+ //! @endrst
1060
+ //!
1061
+ //! @param[in] scan_op
1062
+ //! Binary scan functor
1063
+ template <int ITEMS_PER_THREAD, typename ScanOp>
1064
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
1065
+ ExclusiveScan(T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], T initial_value, ScanOp scan_op)
1066
+ {
1067
+ // Step 1: reduce this thread's ITEMS_PER_THREAD consecutive items (in registers) into a single partial using scan_op
1068
+ T thread_prefix = cub::ThreadReduce(input, scan_op);
1069
+
1070
+ // Step 2: exclusive block-wide scan over the per-thread partials via the scalar overload; thread_prefix is both input and output, and initial_value is passed along as the seed
1071
+ ExclusiveScan(thread_prefix, thread_prefix, initial_value, scan_op);
1072
+
1073
+ // Step 3: exclusive scan over this thread's own items (in registers), seeded with the scanned thread_prefix; results go to output
1074
+ detail::ThreadScanExclusive(input, output, scan_op, thread_prefix);
1075
+ }
1076
+
1077
+ //! @rst
1078
+ //! Computes an exclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
1079
+ //! Each thread contributes an array of consecutive input elements.
1080
+ //! Also provides every thread with the block-wide ``block_aggregate`` of all inputs.
1081
+ //!
1082
+ //! - Supports non-commutative scan operators.
1083
+ //! - @blocked
1084
+ //! - @granularity
1085
+ //! - @smemreuse
1086
+ //!
1087
+ //! Snippet
1088
+ //! +++++++
1089
+ //!
1090
+ //! The code snippet below illustrates an exclusive prefix max scan of 512 integer items that are partitioned in
1091
+ //! a :ref:`blocked arrangement <flexible-data-arrangement>` across 128 threads where each thread owns
1092
+ //! 4 consecutive items.
1093
+ //!
1094
+ //! .. code-block:: c++
1095
+ //!
1096
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
1097
+ //!
1098
+ //! __global__ void ExampleKernel(...)
1099
+ //! {
1100
+ //! // Specialize BlockScan for a 1D block of 128 threads of type int
1101
+ //! using BlockScan = cub::BlockScan<int, 128>;
1102
+ //!
1103
+ //! // Allocate shared memory for BlockScan
1104
+ //! __shared__ typename BlockScan::TempStorage temp_storage;
1105
+ //!
1106
+ //! // Obtain a segment of consecutive items that are blocked across threads
1107
+ //! int thread_data[4];
1108
+ //! ...
1109
+ //!
1110
+ //! // Collectively compute the block-wide exclusive prefix max scan
1111
+ //! int block_aggregate;
1112
+ //! BlockScan(temp_storage).ExclusiveScan(
1113
+ //! thread_data, thread_data, INT_MIN, cuda::maximum<>{}, block_aggregate);
1114
+ //!
1115
+ //! Suppose the set of input ``thread_data`` across the block of threads is
1116
+ //! ``{ [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }``.
1117
+ //! The corresponding output ``thread_data`` in those threads will be
1118
+ //! ``{ [INT_MIN,0,0,2], [2,4,4,6], ..., [506,508,508,510] }``.
1119
+ //! Furthermore the value ``510`` will be stored in ``block_aggregate`` for all threads.
1120
+ //!
1121
+ //! .. note::
1122
+ //!
1123
+ //! ``initial_value`` is not applied to the block-wide aggregate.
1124
+ //!
1125
+ //! @endrst
1126
+ //!
1127
+ //! @tparam ITEMS_PER_THREAD
1128
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
1129
+ //!
1130
+ //! @tparam ScanOp
1131
+ //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
1132
+ //!
1133
+ //! @param[in] input
1134
+ //! Calling thread's input items
1135
+ //!
1136
+ //! @param[out] output
1137
+ //! Calling thread's output items (may be aliased to `input`)
1138
+ //!
1139
+ //! @param[in] initial_value
1140
+ //! @rst
1141
+ //! Initial value to seed the exclusive scan (and is assigned to `output[0]` in *thread*\ :sub:`0`). It is not taken
1142
+ //! into account for ``block_aggregate``.
1143
+ //! @endrst
1144
+ //!
1145
+ //! @param[in] scan_op
1146
+ //! Binary scan functor
1147
+ //!
1148
+ //! @param[out] block_aggregate
1149
+ //! block-wide aggregate reduction of input items
1150
+ template <int ITEMS_PER_THREAD, typename ScanOp>
1151
+ _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveScan(
1152
+ T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], T initial_value, ScanOp scan_op, T& block_aggregate)
1153
+ {
1154
+ // Step 1: reduce this thread's ITEMS_PER_THREAD consecutive items (in registers) into a single partial using scan_op
1155
+ T thread_prefix = cub::ThreadReduce(input, scan_op);
1156
+
1157
+ // Step 2: exclusive block-wide scan over the per-thread partials via the scalar overload (in place on thread_prefix); block_aggregate is handed through by reference to that call
1158
+ ExclusiveScan(thread_prefix, thread_prefix, initial_value, scan_op, block_aggregate);
1159
+
1160
+ // Step 3: exclusive scan over this thread's own items (in registers), seeded with the scanned thread_prefix; results go to output
1161
+ detail::ThreadScanExclusive(input, output, scan_op, thread_prefix);
1162
+ }
1163
+
1164
+ //! @rst
1165
+ //! Computes an exclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
1166
+ //! Each thread contributes an array of consecutive input elements.
1167
+ //! The call-back functor ``block_prefix_callback_op`` is invoked by the first warp in the block, and the value
1168
+ //! returned by *lane*\ :sub:`0` in that warp is used as the "seed" value that logically prefixes the thread
1169
+ //! block's scan inputs.
1170
+ //!
1171
+ //! - The ``block_prefix_callback_op`` functor must implement a member function
1172
+ //! ``T operator()(T block_aggregate)``. The functor will be invoked by the
1173
+ //! first warp of threads in the block, however only the return value from
1174
+ //! *lane*\ :sub:`0` is applied as the block-wide prefix. Can be stateful.
1175
+ //! - Supports non-commutative scan operators.
1176
+ //! - @blocked
1177
+ //! - @granularity
1178
+ //! - @smemreuse
1179
+ //!
1180
+ //! Snippet
1181
+ //! +++++++
1182
+ //!
1183
+ //! The code snippet below illustrates a single thread block that progressively
1184
+ //! computes an exclusive prefix max scan over multiple "tiles" of input using a
1185
+ //! prefix functor to maintain a running total between block-wide scans. Each tile consists
1186
+ //! of 512 integer items that are partitioned across 128 threads, 4 consecutive items per thread.
1187
+ //!
1188
+ //! .. code-block:: c++
1189
+ //!
1190
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
1191
+ //!
1192
+ //! // A stateful callback functor that maintains a running prefix to be applied
1193
+ //! // during consecutive scan operations.
1194
+ //! struct BlockPrefixCallbackOp
1195
+ //! {
1196
+ //! // Running prefix
1197
+ //! int running_total;
1198
+ //!
1199
+ //! // Constructor
1200
+ //! __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {}
1201
+ //!
1202
+ //! // Callback operator to be entered by the first warp of threads in the block.
1203
+ //! // Thread-0 is responsible for returning a value for seeding the block-wide scan.
1204
+ //! __device__ int operator()(int block_aggregate)
1205
+ //! {
1206
+ //! int old_prefix = running_total;
1207
+ //! running_total = (block_aggregate > old_prefix) ? block_aggregate : old_prefix;
1208
+ //! return old_prefix;
1209
+ //! }
1210
+ //! };
1211
+ //!
1212
+ //! __global__ void ExampleKernel(int *d_data, int num_items, ...)
1213
+ //! {
1214
+ //! // Specialize BlockLoad, BlockStore, and BlockScan for a 1D block of 128 threads, 4 ints per thread
1215
+ //! using BlockLoad = cub::BlockLoad<int, 128, 4, cub::BLOCK_LOAD_TRANSPOSE> ;
1216
+ //! using BlockStore = cub::BlockStore<int, 128, 4, cub::BLOCK_STORE_TRANSPOSE> ;
1217
+ //! using BlockScan = cub::BlockScan<int, 128> ;
1218
+ //!
1219
+ //! // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan
1220
+ //! __shared__ union {
1221
+ //! typename BlockLoad::TempStorage load;
1222
+ //! typename BlockScan::TempStorage scan;
1223
+ //! typename BlockStore::TempStorage store;
1224
+ //! } temp_storage;
1225
+ //!
1226
+ //! // Initialize running total
1227
+ //! BlockPrefixCallbackOp prefix_op(INT_MIN);
1228
+ //!
1229
+ //! // Have the block iterate over segments of items
1230
+ //! for (int block_offset = 0; block_offset < num_items; block_offset += 128 * 4)
1231
+ //! {
1232
+ //! // Load a segment of consecutive items that are blocked across threads
1233
+ //! int thread_data[4];
1234
+ //! BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data);
1235
+ //! __syncthreads();
1236
+ //!
1237
+ //! // Collectively compute the block-wide exclusive prefix max scan
1238
+ //! BlockScan(temp_storage.scan).ExclusiveScan(
1239
+ //! thread_data, thread_data, cuda::maximum<>{}, prefix_op);
1240
+ //! __syncthreads();
1241
+ //!
1242
+ //! // Store scanned items to output segment
1243
+ //! BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data);
1244
+ //! __syncthreads();
1245
+ //! }
1246
+ //!
1247
+ //! Suppose the input ``d_data`` is ``0, -1, 2, -3, 4, -5, ...``.
1248
+ //! The corresponding output for the first segment will be
1249
+ //! ``INT_MIN, 0, 0, 2, 2, 4, ..., 508, 510``.
1250
+ //! The output for the second segment will be
1251
+ //! ``510, 512, 512, 514, 514, 516, ..., 1020, 1022``.
1252
+ //!
1253
+ //! @endrst
1254
+ //!
1255
+ //! @tparam ITEMS_PER_THREAD
1256
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
1257
+ //!
1258
+ //! @tparam ScanOp
1259
+ //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
1260
+ //!
1261
+ //! @tparam BlockPrefixCallbackOp
1262
+ //! **[inferred]** Call-back functor type having member `T operator()(T block_aggregate)`
1263
+ //!
1264
+ //! @param[in] input
1265
+ //! Calling thread's input items
1266
+ //!
1267
+ //! @param[out] output
1268
+ //! Calling thread's output items (may be aliased to `input`)
1269
+ //!
1270
+ //! @param[in] scan_op
1271
+ //! Binary scan functor
1272
+ //!
1273
+ //! @param[in,out] block_prefix_callback_op
1274
+ //! @rst
1275
+ //! *warp*\ :sub:`0` only call-back functor for specifying a block-wide prefix to be applied to
1276
+ //! the logical input sequence.
1277
+ //! @endrst
1278
+ template <int ITEMS_PER_THREAD, typename ScanOp, typename BlockPrefixCallbackOp>
1279
+ _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveScan(
1280
+ T (&input)[ITEMS_PER_THREAD],
1281
+ T (&output)[ITEMS_PER_THREAD],
1282
+ ScanOp scan_op,
1283
+ BlockPrefixCallbackOp& block_prefix_callback_op)
1284
+ {
1285
+ // Step 1: reduce this thread's ITEMS_PER_THREAD consecutive items (in registers) into a single partial using scan_op
1286
+ T thread_prefix = cub::ThreadReduce(input, scan_op);
1287
+
1288
+ // Step 2: exclusive block-wide scan over the per-thread partials via the scalar callback overload (in place on thread_prefix); no initial value is passed, the callback functor is forwarded instead
1289
+ ExclusiveScan(thread_prefix, thread_prefix, scan_op, block_prefix_callback_op);
1290
+
1291
+ // Step 3: exclusive scan over this thread's own items (in registers), seeded with the scanned thread_prefix; results go to output
1292
+ detail::ThreadScanExclusive(input, output, scan_op, thread_prefix);
1293
+ }
1294
+
1295
+ //! @} end member group
1296
+ #ifndef _CCCL_DOXYGEN_INVOKED // Do not document no-initial-value scans
1297
+
1298
+ //! @name Exclusive prefix scan operations (no initial value, single datum per thread)
1299
+ //! @{
1300
+
1301
+ //! @rst
1302
+ //! Computes an exclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
1303
+ //! Each thread contributes one input element.
1304
+ //! With no initial value, the output computed for *thread*\ :sub:`0` is undefined.
1305
+ //!
1306
+ //! - Supports non-commutative scan operators.
1307
+ //! - @rowmajor
1308
+ //! - @smemreuse
1309
+ //!
1310
+ //! @endrst
1311
+ //!
1312
+ //! @tparam ScanOp
1313
+ //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
1314
+ //!
1315
+ //! @param[in] input
1316
+ //! Calling thread's input item
1317
+ //!
1318
+ //! @param[out] output
1319
+ //! Calling thread's output item (may be aliased to `input`)
1320
+ //!
1321
+ //! @param[in] scan_op
1322
+ //! Binary scan functor
1323
+ template <typename ScanOp>
1324
+ _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveScan(T input, T& output, ScanOp scan_op)
1325
+ { // Scalar overload without an initial value: pure delegation
1326
+ InternalBlockScan(temp_storage).ExclusiveScan(input, output, scan_op); // InternalBlockScan is built over temp_storage; only input, output and scan_op are forwarded
1327
+ }
1328
+
1329
+ //! @rst
1330
+ //! Computes an exclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
1331
+ //! Each thread contributes one input element. Also provides every thread with the block-wide
1332
+ //! ``block_aggregate`` of all inputs. With no initial value, the output computed for
1333
+ //! *thread*\ :sub:`0` is undefined.
1334
+ //!
1335
+ //! - Supports non-commutative scan operators.
1336
+ //! - @rowmajor
1337
+ //! - @smemreuse
1338
+ //!
1339
+ //! @endrst
1340
+ //!
1341
+ //! @tparam ScanOp
1342
+ //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
1343
+ //!
1344
+ //! @param[in] input
1345
+ //! Calling thread's input item
1346
+ //!
1347
+ //! @param[out] output
1348
+ //! Calling thread's output item (may be aliased to `input`)
1349
+ //!
1350
+ //! @param[in] scan_op
1351
+ //! Binary scan functor
1352
+ //!
1353
+ //! @param[out] block_aggregate
1354
+ //! block-wide aggregate reduction of input items
1355
+ template <typename ScanOp>
1356
+ _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveScan(T input, T& output, ScanOp scan_op, T& block_aggregate)
1357
+ { // Scalar overload without an initial value, with aggregate: pure delegation
1358
+ InternalBlockScan(temp_storage).ExclusiveScan(input, output, scan_op, block_aggregate); // block_aggregate is passed through by reference to InternalBlockScan
1359
+ }
1360
+
1361
+ //! @} end member group // Exclusive prefix scans (no initial value, single datum per thread)
1362
+ //! @name Exclusive prefix scan operations (no initial value, multiple data per thread)
1363
+ //! @{
1364
+
1365
+ //! @rst
1366
+ //! Computes an exclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
1367
+ //! Each thread contributes an array of consecutive input elements. With no initial value, the
1368
+ //! output computed for *thread*\ :sub:`0` is undefined.
1369
+ //!
1370
+ //! - Supports non-commutative scan operators.
1371
+ //! - @blocked
1372
+ //! - @granularity
1373
+ //! - @smemreuse
1374
+ //!
1375
+ //! @endrst
1376
+ //!
1377
+ //! @tparam ITEMS_PER_THREAD
1378
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
1379
+ //!
1380
+ //! @tparam ScanOp
1381
+ //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
1382
+ //!
1383
+ //! @param[in] input
1384
+ //! Calling thread's input items
1385
+ //!
1386
+ //! @param[out] output
1387
+ //! Calling thread's output items (may be aliased to `input`)
1388
+ //!
1389
+ //! @param[in] scan_op
1390
+ //! Binary scan functor
1391
+ template <int ITEMS_PER_THREAD, typename ScanOp>
1392
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
1393
+ ExclusiveScan(T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], ScanOp scan_op)
1394
+ {
1395
+ // Step 1: reduce this thread's ITEMS_PER_THREAD consecutive items (in registers) into a single partial using scan_op
1396
+ T thread_partial = cub::ThreadReduce(input, scan_op);
1397
+
1398
+ // Step 2: exclusive block-wide scan over the per-thread partials via the scalar no-initial-value overload (in place on thread_partial)
1399
+ ExclusiveScan(thread_partial, thread_partial, scan_op);
1400
+
1401
+ // Step 3: exclusive scan over this thread's own items (in registers) using thread_partial as prefix. The last argument is false only when linear_tid == 0. NOTE(review): presumably it tells ThreadScanExclusive not to apply the prefix in that thread, whose scanned partial is undefined without an initial value; confirm against detail::ThreadScanExclusive
1402
+ detail::ThreadScanExclusive(input, output, scan_op, thread_partial, (linear_tid != 0));
1403
+ }
1404
+
1405
+ //! @rst
1406
+ //! Computes an exclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
1407
+ //! Each thread contributes an array of consecutive input elements. Also provides every thread
1408
+ //! with the block-wide ``block_aggregate`` of all inputs.
1409
+ //! With no initial value, the output computed for *thread*\ :sub:`0` is undefined.
1410
+ //!
1411
+ //! - Supports non-commutative scan operators.
1412
+ //! - @blocked
1413
+ //! - @granularity
1414
+ //! - @smemreuse
1415
+ //!
1416
+ //! @endrst
1417
+ //!
1418
+ //! @tparam ITEMS_PER_THREAD
1419
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
1420
+ //!
1421
+ //! @tparam ScanOp
1422
+ //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
1423
+ //!
1424
+ //! @param[in] input
1425
+ //! Calling thread's input items
1426
+ //!
1427
+ //! @param[out] output
1428
+ //! Calling thread's output items (may be aliased to `input`)
1429
+ //!
1430
+ //! @param[in] scan_op
1431
+ //! Binary scan functor
1432
+ //!
1433
+ //! @param[out] block_aggregate
1434
+ //! block-wide aggregate reduction of input items
1435
+ template <int ITEMS_PER_THREAD, typename ScanOp>
1436
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
1437
+ ExclusiveScan(T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], ScanOp scan_op, T& block_aggregate)
1438
+ {
1439
+ // Step 1: reduce this thread's ITEMS_PER_THREAD consecutive items (in registers) into a single partial using scan_op
1440
+ T thread_partial = cub::ThreadReduce(input, scan_op);
1441
+
1442
+ // Step 2: exclusive block-wide scan over the per-thread partials via the scalar no-initial-value overload (in place on thread_partial); block_aggregate is handed through by reference to that call
1443
+ ExclusiveScan(thread_partial, thread_partial, scan_op, block_aggregate);
1444
+
1445
+ // Step 3: exclusive scan over this thread's own items (in registers) using thread_partial as prefix. The last argument is false only when linear_tid == 0. NOTE(review): presumably it tells ThreadScanExclusive not to apply the prefix in that thread, whose scanned partial is undefined without an initial value; confirm against detail::ThreadScanExclusive
1446
+ detail::ThreadScanExclusive(input, output, scan_op, thread_partial, (linear_tid != 0));
1447
+ }
1448
+
1449
+ //! @} end member group // Exclusive prefix scans (no initial value, multiple data per thread)
1450
+ #endif // _CCCL_DOXYGEN_INVOKED // Do not document no-initial-value scans
1451
+
1452
+ //! @name Inclusive prefix sum operations
1453
+ //! @{
1454
+
1455
+ //! @rst
1456
+ //! Computes an inclusive block-wide prefix scan using addition (+)
1457
+ //! as the scan operator. Each thread contributes one input element.
1458
+ //!
1459
+ //! - @rowmajor
1460
+ //! - @smemreuse
1461
+ //!
1462
+ //! Snippet
1463
+ //! +++++++
1464
+ //!
1465
+ //! The code snippet below illustrates an inclusive prefix sum of 128 integer items that
1466
+ //! are partitioned across 128 threads.
1467
+ //!
1468
+ //! .. code-block:: c++
1469
+ //!
1470
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
1471
+ //!
1472
+ //! __global__ void ExampleKernel(...)
1473
+ //! {
1474
+ //! // Specialize BlockScan for a 1D block of 128 threads of type int
1475
+ //! using BlockScan = cub::BlockScan<int, 128>;
1476
+ //!
1477
+ //! // Allocate shared memory for BlockScan
1478
+ //! __shared__ typename BlockScan::TempStorage temp_storage;
1479
+ //!
1480
+ //! // Obtain input item for each thread
1481
+ //! int thread_data;
1482
+ //! ...
1483
+ //!
1484
+ //! // Collectively compute the block-wide inclusive prefix sum
1485
+ //! BlockScan(temp_storage).InclusiveSum(thread_data, thread_data);
1486
+ //!
1487
+ //! Suppose the set of input ``thread_data`` across the block of threads is ``1, 1, ..., 1``.
1488
+ //! The corresponding output ``thread_data`` in those threads will be ``1, 2, ..., 128``.
1489
+ //!
1490
+ //! @endrst
1491
+ //!
1492
+ //! @param[in] input
1493
+ //! Calling thread's input item
1494
+ //!
1495
+ //! @param[out] output
1496
+ //! Calling thread's output item (may be aliased to `input`)
1497
+ _CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveSum(T input, T& output)
1498
+ { // Prefix sum is expressed as the generic inclusive scan with an addition functor
1499
+ InclusiveScan(input, output, ::cuda::std::plus<>{}); // scan operator is a default-constructed ::cuda::std::plus<>
1500
+ }
1501
+
1502
+ //! @rst
1503
+ //! Computes an inclusive block-wide prefix scan using addition (+) as the scan operator.
1504
+ //! Each thread contributes one input element.
1505
+ //! Also provides every thread with the block-wide ``block_aggregate`` of all inputs.
1506
+ //!
1507
+ //! - @rowmajor
1508
+ //! - @smemreuse
1509
+ //!
1510
+ //! Snippet
1511
+ //! +++++++
1512
+ //!
1513
+ //! The code snippet below illustrates an inclusive prefix sum of 128 integer items that
1514
+ //! are partitioned across 128 threads.
1515
+ //!
1516
+ //! .. code-block:: c++
1517
+ //!
1518
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
1519
+ //!
1520
+ //! __global__ void ExampleKernel(...)
1521
+ //! {
1522
+ //! // Specialize BlockScan for a 1D block of 128 threads of type int
1523
+ //! using BlockScan = cub::BlockScan<int, 128>;
1524
+ //!
1525
+ //! // Allocate shared memory for BlockScan
1526
+ //! __shared__ typename BlockScan::TempStorage temp_storage;
1527
+ //!
1528
+ //! // Obtain input item for each thread
1529
+ //! int thread_data;
1530
+ //! ...
1531
+ //!
1532
+ //! // Collectively compute the block-wide inclusive prefix sum
1533
+ //! int block_aggregate;
1534
+ //! BlockScan(temp_storage).InclusiveSum(thread_data, thread_data, block_aggregate);
1535
+ //!
1536
+ //! Suppose the set of input ``thread_data`` across the block of threads is ``1, 1, ..., 1``.
1537
+ //! The corresponding output ``thread_data`` in those threads will be ``1, 2, ..., 128``.
1538
+ //! Furthermore the value ``128`` will be stored in ``block_aggregate`` for all threads.
1539
+ //!
1540
+ //! @endrst
1541
+ //!
1542
+ //! @param[in] input
1543
+ //! Calling thread's input item
1544
+ //!
1545
+ //! @param[out] output
1546
+ //! Calling thread's output item (may be aliased to `input`)
1547
+ //!
1548
+ //! @param[out] block_aggregate
1549
+ //! block-wide aggregate reduction of input items
1550
+ _CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveSum(T input, T& output, T& block_aggregate)
1551
+ { // Prefix sum with aggregate is expressed as the generic inclusive scan with an addition functor
1552
+ InclusiveScan(input, output, ::cuda::std::plus<>{}, block_aggregate); // block_aggregate is passed through by reference to InclusiveScan
1553
+ }
1554
+
1555
+ //! @rst
1556
+ //! Computes an inclusive block-wide prefix scan using addition (+) as the scan operator.
1557
+ //! Each thread contributes one input element. Instead of using 0 as the block-wide prefix, the call-back functor
1558
+ //! ``block_prefix_callback_op`` is invoked by the first warp in the block, and the value returned by
1559
+ //! *lane*\ :sub:`0` in that warp is used as the "seed" value that logically prefixes the thread block's
1560
+ //! scan inputs.
1561
+ //!
1562
+ //! - The ``block_prefix_callback_op`` functor must implement a member function
1563
+ //! ``T operator()(T block_aggregate)``. The functor will be invoked by the first warp of threads in the block,
1564
+ //! however only the return value from *lane*\ :sub:`0` is applied as the block-wide prefix. Can be stateful.
1565
+ //! - @rowmajor
1566
+ //! - @smemreuse
1567
+ //!
1568
+ //! Snippet
1569
+ //! +++++++
1570
+ //!
1571
+ //! The code snippet below illustrates a single thread block that progressively
1572
+ //! computes an inclusive prefix sum over multiple "tiles" of input using a
1573
+ //! prefix functor to maintain a running total between block-wide scans.
1574
+ //! Each tile consists of 128 integer items that are partitioned across 128 threads.
1575
+ //!
1576
+ //! .. code-block:: c++
1577
+ //!
1578
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
1579
+ //!
1580
+ //! // A stateful callback functor that maintains a running prefix to be applied
1581
+ //! // during consecutive scan operations.
1582
+ //! struct BlockPrefixCallbackOp
1583
+ //! {
1584
+ //! // Running prefix
1585
+ //! int running_total;
1586
+ //!
1587
+ //! // Constructor
1588
+ //! __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {}
1589
+ //!
1590
+ //! // Callback operator to be entered by the first warp of threads in the block.
1591
+ //! // Thread-0 is responsible for returning a value for seeding the block-wide scan.
1592
+ //! __device__ int operator()(int block_aggregate)
1593
+ //! {
1594
+ //! int old_prefix = running_total;
1595
+ //! running_total += block_aggregate;
1596
+ //! return old_prefix;
1597
+ //! }
1598
+ //! };
1599
+ //!
1600
+ //! __global__ void ExampleKernel(int *d_data, int num_items, ...)
1601
+ //! {
1602
+ //! // Specialize BlockScan for a 1D block of 128 threads
1603
+ //! using BlockScan = cub::BlockScan<int, 128>;
1604
+ //!
1605
+ //! // Allocate shared memory for BlockScan
1606
+ //! __shared__ typename BlockScan::TempStorage temp_storage;
1607
+ //!
1608
+ //! // Initialize running total
1609
+ //! BlockPrefixCallbackOp prefix_op(0);
1610
+ //!
1611
+ //! // Have the block iterate over segments of items
1612
+ //! for (int block_offset = 0; block_offset < num_items; block_offset += 128)
1613
+ //! {
1614
+ //! // Load a segment of consecutive items that are blocked across threads
1615
+ //! int thread_data = d_data[block_offset + threadIdx.x];
1616
+ //!
1617
+ //! // Collectively compute the block-wide inclusive prefix sum
1618
+ //! BlockScan(temp_storage).InclusiveSum(
1619
+ //! thread_data, thread_data, prefix_op);
1620
+ //! __syncthreads();
1621
+ //!
1622
+ //! // Store scanned items to output segment
1623
+ //! d_data[block_offset + threadIdx.x] = thread_data;
1624
+ //! }
1625
+ //!
1626
+ //! Suppose the input ``d_data`` is ``1, 1, 1, 1, 1, 1, 1, 1, ...``.
1627
+ //! The corresponding output for the first segment will be ``1, 2, ..., 128``.
1628
+ //! The output for the second segment will be ``129, 130, ..., 256``.
1629
+ //!
1630
+ //! @endrst
1631
+ //!
1632
+ //! @tparam BlockPrefixCallbackOp
1633
+ //! **[inferred]** Call-back functor type having member `T operator()(T block_aggregate)`
1634
+ //!
1635
+ //! @param[in] input
1636
+ //! Calling thread's input item
1637
+ //!
1638
+ //! @param[out] output
1639
+ //! Calling thread's output item (may be aliased to `input`)
1640
+ //!
1641
+ //! @param[in,out] block_prefix_callback_op
1642
+ //! @rst
1643
+ //! *warp*\ :sub:`0` only call-back functor for specifying a block-wide prefix to be applied
1644
+ //! to the logical input sequence.
1645
+ //! @endrst
1646
+ template <typename BlockPrefixCallbackOp>
1647
+ _CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveSum(T input, T& output, BlockPrefixCallbackOp& block_prefix_callback_op)
1648
+ { // Prefix sum with a block-prefix callback is expressed as the generic inclusive scan with an addition functor
1649
+ InclusiveScan(input, output, ::cuda::std::plus<>{}, block_prefix_callback_op); // the callback functor is forwarded by reference, unchanged
1650
+ }
1651
+
1652
+ //! @} end member group
1653
+ //! @name Inclusive prefix sum operations (multiple data per thread)
1654
+ //! @{
1655
+
1656
+ //! @rst
1657
+ //! Computes an inclusive block-wide prefix scan using addition (+) as the scan operator.
1658
+ //! Each thread contributes an array of consecutive input elements.
1659
+ //!
1660
+ //! - @blocked
1661
+ //! - @granularity
1662
+ //! - @smemreuse
1663
+ //!
1664
+ //! Snippet
1665
+ //! +++++++
1666
+ //!
1667
+ //! The code snippet below illustrates an inclusive prefix sum of 512 integer items that
1668
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128 threads
1669
+ //! where each thread owns 4 consecutive items.
1670
+ //!
1671
+ //! .. code-block:: c++
1672
+ //!
1673
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
1674
+ //!
1675
+ //! __global__ void ExampleKernel(...)
1676
+ //! {
1677
+ //! // Specialize BlockScan for a 1D block of 128 threads of type int
1678
+ //! using BlockScan = cub::BlockScan<int, 128>;
1679
+ //!
1680
+ //! // Allocate shared memory for BlockScan
1681
+ //! __shared__ typename BlockScan::TempStorage temp_storage;
1682
+ //!
1683
+ //! // Obtain a segment of consecutive items that are blocked across threads
1684
+ //! int thread_data[4];
1685
+ //! ...
1686
+ //!
1687
+ //! // Collectively compute the block-wide inclusive prefix sum
1688
+ //! BlockScan(temp_storage).InclusiveSum(thread_data, thread_data);
1689
+ //!
1690
+ //! Suppose the set of input ``thread_data`` across the block of threads is
1691
+ //! ``{ [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }``. The corresponding output
1692
+ //! ``thread_data`` in those threads will be ``{ [1,2,3,4], [5,6,7,8], ..., [509,510,511,512] }``.
1693
+ //!
1694
+ //! @endrst
1695
+ //!
1696
+ //! @tparam ITEMS_PER_THREAD
1697
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
1698
+ //!
1699
+ //! @param[in] input
1700
+ //! Calling thread's input items
1701
+ //!
1702
+ //! @param[out] output
1703
+ //! Calling thread's output items (may be aliased to `input`)
1704
+ template <int ITEMS_PER_THREAD>
1705
+ _CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveSum(T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD])
1706
+ {
1707
+ if (ITEMS_PER_THREAD == 1) // one item per thread: skip the reduce/scan/rescan sequence and use the scalar overload directly
1708
+ {
1709
+ InclusiveSum(input[0], output[0]);
1710
+ }
1711
+ else
1712
+ {
1713
+ // Step 1: reduce this thread's ITEMS_PER_THREAD consecutive items (in registers) into a single partial sum
1714
+ ::cuda::std::plus<> scan_op;
1715
+ T thread_prefix = cub::ThreadReduce(input, scan_op);
1716
+
1717
+ // Step 2: exclusive block-wide sum over the per-thread partials (in place on thread_prefix)
1718
+ ExclusiveSum(thread_prefix, thread_prefix);
1719
+
1720
+ // Step 3: inclusive scan over this thread's own items (in registers) seeded with thread_prefix. The last argument is false only when linear_tid == 0. NOTE(review): presumably it makes ThreadScanInclusive skip applying the prefix in that thread; confirm against detail::ThreadScanInclusive
1721
+ detail::ThreadScanInclusive(input, output, scan_op, thread_prefix, (linear_tid != 0));
1722
+ }
1723
+ }
1724
+
1725
+ //! @rst
1726
+ //! Computes an inclusive block-wide prefix scan using addition (+) as the scan operator.
1727
+ //! Each thread contributes an array of consecutive input elements.
1728
+ //! Also provides every thread with the block-wide ``block_aggregate`` of all inputs.
1729
+ //!
1730
+ //! - @blocked
1731
+ //! - @granularity
1732
+ //! - @smemreuse
1733
+ //!
1734
+ //! Snippet
1735
+ //! +++++++
1736
+ //!
1737
+ //! The code snippet below illustrates an inclusive prefix sum of 512 integer items that
1738
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128 threads
1739
+ //! where each thread owns 4 consecutive items.
1740
+ //!
1741
+ //! .. code-block:: c++
1742
+ //!
1743
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
1744
+ //!
1745
+ //! __global__ void ExampleKernel(...)
1746
+ //! {
1747
+ //! // Specialize BlockScan for a 1D block of 128 threads of type int
1748
+ //! using BlockScan = cub::BlockScan<int, 128>;
1749
+ //!
1750
+ //! // Allocate shared memory for BlockScan
1751
+ //! __shared__ typename BlockScan::TempStorage temp_storage;
1752
+ //!
1753
+ //! // Obtain a segment of consecutive items that are blocked across threads
1754
+ //! int thread_data[4];
1755
+ //! ...
1756
+ //!
1757
+ //! // Collectively compute the block-wide inclusive prefix sum
1758
+ //! int block_aggregate;
1759
+ //! BlockScan(temp_storage).InclusiveSum(thread_data, thread_data, block_aggregate);
1760
+ //!
1761
+ //! Suppose the set of input ``thread_data`` across the block of threads is
1762
+ //! ``{ [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }``. The
1763
+ //! corresponding output ``thread_data`` in those threads will be
1764
+ //! ``{ [1,2,3,4], [5,6,7,8], ..., [509,510,511,512] }``.
1765
+ //! Furthermore the value ``512`` will be stored in ``block_aggregate`` for all threads.
1766
+ //!
1767
+ //! @endrst
1768
+ //!
1769
+ //! @tparam ITEMS_PER_THREAD
1770
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
1771
+ //!
1772
+ //! @param[in] input
1773
+ //! Calling thread's input items
1774
+ //!
1775
+ //! @param[out] output
1776
+ //! Calling thread's output items (may be aliased to `input`)
1777
+ //!
1778
+ //! @param[out] block_aggregate
1779
+ //! block-wide aggregate reduction of input items
1780
template <int ITEMS_PER_THREAD>
_CCCL_DEVICE _CCCL_FORCEINLINE void
InclusiveSum(T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], T& block_aggregate)
{
  if (ITEMS_PER_THREAD != 1)
  {
    // Fold this thread's consecutive items into a single partial sum.
    ::cuda::std::plus<> sum_op;
    T running = cub::ThreadReduce(input, sum_op);

    // Exclusive block-wide sum across the per-thread partials. The prefix
    // overwrites the partial in place; block_aggregate is filled by this call.
    ExclusiveSum(running, running, block_aggregate);

    // Inclusive scan over this thread's own items, seeded with the prefix.
    // The flag is false only for linear_tid == 0, so that thread is not seeded.
    const bool seed_with_prefix = (linear_tid != 0);
    detail::ThreadScanInclusive(input, output, sum_op, running, seed_with_prefix);
  }
  else
  {
    // Exactly one item per thread: forward to the single-item overload.
    InclusiveSum(input[0], output[0], block_aggregate);
  }
}
1801
+
1802
+ //! @rst
1803
+ //! Computes an inclusive block-wide prefix scan using addition (+) as the scan operator.
1804
+ //! Each thread contributes an array of consecutive input elements.
1805
+ //! Instead of using 0 as the block-wide prefix, the call-back functor ``block_prefix_callback_op`` is invoked by
1806
+ //! the first warp in the block, and the value returned by *lane*\ :sub:`0` in that warp is used as the "seed"
1807
+ //! value that logically prefixes the thread block's scan inputs.
1808
+ //!
1809
+ //! - The ``block_prefix_callback_op`` functor must implement a member function
1810
+ //! ``T operator()(T block_aggregate)``. The functor will be invoked by the first warp of threads in the block,
1811
+ //! however only the return value from *lane*\ :sub:`0` is applied as the block-wide prefix. Can be stateful.
1812
+ //! - @blocked
1813
+ //! - @granularity
1814
+ //! - @smemreuse
1815
+ //!
1816
+ //! Snippet
1817
+ //! +++++++
1818
+ //!
1819
+ //! The code snippet below illustrates a single thread block that progressively
1820
+ //! computes an inclusive prefix sum over multiple "tiles" of input using a
1821
+ //! prefix functor to maintain a running total between block-wide scans. Each tile consists
1822
+ //! of 512 integer items that are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>`
1823
+ //! across 128 threads where each thread owns 4 consecutive items.
1824
+ //!
1825
+ //! .. code-block:: c++
1826
+ //!
1827
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
1828
+ //!
1829
+ //! // A stateful callback functor that maintains a running prefix to be applied
1830
+ //! // during consecutive scan operations.
1831
+ //! struct BlockPrefixCallbackOp
1832
+ //! {
1833
+ //! // Running prefix
1834
+ //! int running_total;
1835
+ //!
1836
+ //! // Constructor
1837
+ //! __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {}
1838
+ //!
1839
+ //! // Callback operator to be entered by the first warp of threads in the block.
1840
+ //! // Thread-0 is responsible for returning a value for seeding the block-wide scan.
1841
+ //! __device__ int operator()(int block_aggregate)
1842
+ //! {
1843
+ //! int old_prefix = running_total;
1844
+ //! running_total += block_aggregate;
1845
+ //! return old_prefix;
1846
+ //! }
1847
+ //! };
1848
+ //!
1849
+ //! __global__ void ExampleKernel(int *d_data, int num_items, ...)
1850
+ //! {
1851
+ //! // Specialize BlockLoad, BlockStore, and BlockScan for a 1D block of 128 threads, 4 ints per thread
1852
+ //! using BlockLoad = cub::BlockLoad<int, 128, 4, cub::BLOCK_LOAD_TRANSPOSE>;
1853
+ //! using BlockStore = cub::BlockStore<int, 128, 4, cub::BLOCK_STORE_TRANSPOSE>;
1854
+ //! using BlockScan = cub::BlockScan<int, 128> ;
1855
+ //!
1856
+ //! // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan
1857
+ //! __shared__ union {
1858
+ //! typename BlockLoad::TempStorage load;
1859
+ //! typename BlockScan::TempStorage scan;
1860
+ //! typename BlockStore::TempStorage store;
1861
+ //! } temp_storage;
1862
+ //!
1863
+ //! // Initialize running total
1864
+ //! BlockPrefixCallbackOp prefix_op(0);
1865
+ //!
1866
+ //! // Have the block iterate over segments of items
1867
+ //! for (int block_offset = 0; block_offset < num_items; block_offset += 128 * 4)
1868
+ //! {
1869
+ //! // Load a segment of consecutive items that are blocked across threads
1870
+ //! int thread_data[4];
1871
+ //! BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data);
1872
+ //! __syncthreads();
1873
+ //!
1874
+ //! // Collectively compute the block-wide inclusive prefix sum
1875
+ //! BlockScan(temp_storage.scan).InclusiveSum(
1876
+ //! thread_data, thread_data, prefix_op);
1877
+ //! __syncthreads();
1878
+ //!
1879
+ //! // Store scanned items to output segment
1880
+ //! BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data);
1881
+ //! __syncthreads();
1882
+ //! }
1883
+ //!
1884
+ //! Suppose the input ``d_data`` is ``1, 1, 1, 1, 1, 1, 1, 1, ...``.
1885
+ //! The corresponding output for the first segment will be
1886
+ //! ``1, 2, 3, 4, ..., 511, 512``. The output for the second segment will be
1887
+ //! ``513, 514, 515, 516, ..., 1023, 1024``.
1888
+ //!
1889
+ //! @endrst
1890
+ //!
1891
+ //! @tparam ITEMS_PER_THREAD
1892
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
1893
+ //!
1894
+ //! @tparam BlockPrefixCallbackOp
1895
+ //! **[inferred]** Call-back functor type having member `T operator()(T block_aggregate)`
1896
+ //!
1897
+ //! @param[in] input
1898
+ //! Calling thread's input items
1899
+ //!
1900
+ //! @param[out] output
1901
+ //! Calling thread's output items (may be aliased to `input`)
1902
+ //!
1903
+ //! @param[in,out] block_prefix_callback_op
1904
+ //! @rst
1905
+ //! *warp*\ :sub:`0` only call-back functor for specifying a block-wide prefix to be applied to the
1906
+ //! logical input sequence.
1907
+ //! @endrst
1908
template <int ITEMS_PER_THREAD, typename BlockPrefixCallbackOp>
_CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveSum(
  T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], BlockPrefixCallbackOp& block_prefix_callback_op)
{
  if (ITEMS_PER_THREAD != 1)
  {
    // Fold this thread's consecutive items into a single partial sum.
    ::cuda::std::plus<> sum_op;
    T running = cub::ThreadReduce(input, sum_op);

    // Exclusive block-wide sum across the per-thread partials, with the
    // callback functor forwarded to supply the block-wide prefix.
    ExclusiveSum(running, running, block_prefix_callback_op);

    // Inclusive scan over this thread's own items, seeded with the prefix.
    // No per-thread flag is passed here, unlike the overloads without a callback.
    detail::ThreadScanInclusive(input, output, sum_op, running);
  }
  else
  {
    // Exactly one item per thread: forward to the single-item overload.
    InclusiveSum(input[0], output[0], block_prefix_callback_op);
  }
}
1929
+
1930
+ //! @} end member group
1931
+ //! @name Inclusive prefix scan operations
1932
+ //! @{
1933
+
1934
+ //! @rst
1935
+ //! Computes an inclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
1936
+ //! Each thread contributes one input element.
1937
+ //!
1938
+ //! - Supports non-commutative scan operators.
1939
+ //! - @rowmajor
1940
+ //! - @smemreuse
1941
+ //!
1942
+ //! Snippet
1943
+ //! +++++++
1944
+ //!
1945
+ //! The code snippet below illustrates an inclusive prefix max scan of 128 integer items that
1946
+ //! are partitioned across 128 threads.
1947
+ //!
1948
+ //! .. code-block:: c++
1949
+ //!
1950
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
1951
+ //!
1952
+ //! __global__ void ExampleKernel(...)
1953
+ //! {
1954
+ //! // Specialize BlockScan for a 1D block of 128 threads of type int
1955
+ //! using BlockScan = cub::BlockScan<int, 128>;
1956
+ //!
1957
+ //! // Allocate shared memory for BlockScan
1958
+ //! __shared__ typename BlockScan::TempStorage temp_storage;
1959
+ //!
1960
+ //! // Obtain input item for each thread
1961
+ //! int thread_data;
1962
+ //! ...
1963
+ //!
1964
+ //! // Collectively compute the block-wide inclusive prefix max scan
1965
+ //! BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cuda::maximum<>{});
1966
+ //!
1967
+ //! Suppose the set of input ``thread_data`` across the block of threads is
1968
+ //! ``0, -1, 2, -3, ..., 126, -127``. The corresponding output ``thread_data``
1969
+ //! in those threads will be ``0, 0, 2, 2, ..., 126, 126``.
1970
+ //!
1971
+ //! @endrst
1972
+ //!
1973
+ //! @tparam ScanOp
1974
+ //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
1975
+ //!
1976
+ //! @param[in] input
1977
+ //! Calling thread's input item
1978
+ //!
1979
+ //! @param[out] output
1980
+ //! Calling thread's output item (may be aliased to `input`)
1981
+ //!
1982
+ //! @param[in] scan_op
1983
+ //! Binary scan functor
1984
template <typename ScanOp>
_CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveScan(T input, T& output, ScanOp scan_op)
{
  // Bind the internal scan implementation to this instance's temporary
  // storage, then delegate the single-item inclusive scan to it.
  InternalBlockScan internal_scan(temp_storage);
  internal_scan.InclusiveScan(input, output, scan_op);
}
1989
+
1990
+ //! @rst
1991
+ //! Computes an inclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
1992
+ //! Each thread contributes one input element. Also provides every thread with the block-wide
1993
+ //! ``block_aggregate`` of all inputs.
1994
+ //!
1995
+ //! - Supports non-commutative scan operators.
1996
+ //! - @rowmajor
1997
+ //! - @smemreuse
1998
+ //!
1999
+ //! Snippet
2000
+ //! +++++++
2001
+ //!
2002
+ //! The code snippet below illustrates an inclusive prefix max scan of 128
2003
+ //! integer items that are partitioned across 128 threads.
2004
+ //!
2005
+ //! .. code-block:: c++
2006
+ //!
2007
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
2008
+ //!
2009
+ //! __global__ void ExampleKernel(...)
2010
+ //! {
2011
+ //! // Specialize BlockScan for a 1D block of 128 threads of type int
2012
+ //! using BlockScan = cub::BlockScan<int, 128>;
2013
+ //!
2014
+ //! // Allocate shared memory for BlockScan
2015
+ //! __shared__ typename BlockScan::TempStorage temp_storage;
2016
+ //!
2017
+ //! // Obtain input item for each thread
2018
+ //! int thread_data;
2019
+ //! ...
2020
+ //!
2021
+ //! // Collectively compute the block-wide inclusive prefix max scan
2022
+ //! int block_aggregate;
2023
+ //! BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cuda::maximum<>{}, block_aggregate);
2024
+ //!
2025
+ //! Suppose the set of input ``thread_data`` across the block of threads is
2026
+ //! ``0, -1, 2, -3, ..., 126, -127``. The corresponding output ``thread_data``
2027
+ //! in those threads will be ``0, 0, 2, 2, ..., 126, 126``. Furthermore the value
2028
+ //! ``126`` will be stored in ``block_aggregate`` for all threads.
2029
+ //!
2030
+ //! @endrst
2031
+ //!
2032
+ //! @tparam ScanOp
2033
+ //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
2034
+ //!
2035
+ //! @param[in] input
2036
+ //! Calling thread's input item
2037
+ //!
2038
+ //! @param[out] output
2039
+ //! Calling thread's output item (may be aliased to `input`)
2040
+ //!
2041
+ //! @param[in] scan_op
2042
+ //! Binary scan functor
2043
+ //!
2044
+ //! @param[out] block_aggregate
2045
+ //! Block-wide aggregate reduction of input items
2046
template <typename ScanOp>
_CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveScan(T input, T& output, ScanOp scan_op, T& block_aggregate)
{
  // Bind the internal scan implementation to this instance's temporary
  // storage, then delegate; block_aggregate is written by the delegate.
  InternalBlockScan internal_scan(temp_storage);
  internal_scan.InclusiveScan(input, output, scan_op, block_aggregate);
}
2051
+
2052
+ //! @rst
2053
+ //! Computes an inclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
2054
+ //! Each thread contributes one input element. The call-back functor ``block_prefix_callback_op``
2055
+ //! is invoked by the first warp in the block, and the value returned by *lane*\ :sub:`0` in that warp is used as
2056
+ //! the "seed" value that logically prefixes the thread block's scan inputs.
2057
+ //!
2058
+ //! - The ``block_prefix_callback_op`` functor must implement a member function
2059
+ //! ``T operator()(T block_aggregate)``. The functor's input parameter ``block_aggregate`` is the
+ //! block-wide aggregate of the thread block's scan inputs.
2060
+ //! The functor will be invoked by the first warp of threads in the block,
2061
+ //! however only the return value from *lane*\ :sub:`0` is applied
2062
+ //! as the block-wide prefix. Can be stateful.
2063
+ //! - Supports non-commutative scan operators.
2064
+ //! - @rowmajor
2065
+ //! - @smemreuse
2066
+ //!
2067
+ //! Snippet
2068
+ //! +++++++
2069
+ //!
2070
+ //! The code snippet below illustrates a single thread block that progressively
2071
+ //! computes an inclusive prefix max scan over multiple "tiles" of input using a
2072
+ //! prefix functor to maintain a running total between block-wide scans. Each tile consists
2073
+ //! of 128 integer items that are partitioned across 128 threads.
2074
+ //!
2075
+ //! .. code-block:: c++
2076
+ //!
2077
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
2078
+ //!
2079
+ //! // A stateful callback functor that maintains a running prefix to be applied
2080
+ //! // during consecutive scan operations.
2081
+ //! struct BlockPrefixCallbackOp
2082
+ //! {
2083
+ //! // Running prefix
2084
+ //! int running_total;
2085
+ //!
2086
+ //! // Constructor
2087
+ //! __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {}
2088
+ //!
2089
+ //! // Callback operator to be entered by the first warp of threads in the block.
2090
+ //! // Thread-0 is responsible for returning a value for seeding the block-wide scan.
2091
+ //! __device__ int operator()(int block_aggregate)
2092
+ //! {
2093
+ //! int old_prefix = running_total;
2094
+ //! running_total = (block_aggregate > old_prefix) ? block_aggregate : old_prefix;
2095
+ //! return old_prefix;
2096
+ //! }
2097
+ //! };
2098
+ //!
2099
+ //! __global__ void ExampleKernel(int *d_data, int num_items, ...)
2100
+ //! {
2101
+ //! // Specialize BlockScan for a 1D block of 128 threads
2102
+ //! using BlockScan = cub::BlockScan<int, 128>;
2103
+ //!
2104
+ //! // Allocate shared memory for BlockScan
2105
+ //! __shared__ typename BlockScan::TempStorage temp_storage;
2106
+ //!
2107
+ //! // Initialize running total
2108
+ //! BlockPrefixCallbackOp prefix_op(INT_MIN);
2109
+ //!
2110
+ //! // Have the block iterate over segments of items
2111
+ //! for (int block_offset = 0; block_offset < num_items; block_offset += 128)
2112
+ //! {
2113
+ //! // Load a segment of consecutive items that are blocked across threads
2114
+ //! int thread_data = d_data[block_offset + threadIdx.x];
2115
+ //!
2116
+ //! // Collectively compute the block-wide inclusive prefix max scan
2117
+ //! BlockScan(temp_storage).InclusiveScan(
2118
+ //! thread_data, thread_data, cuda::maximum<>{}, prefix_op);
2119
+ //! __syncthreads();
2120
+ //!
2121
+ //! // Store scanned items to output segment
2122
+ //! d_data[block_offset + threadIdx.x] = thread_data;
2123
+ //! }
2124
+ //!
2125
+ //! Suppose the input ``d_data`` is ``0, -1, 2, -3, 4, -5, ...``.
2126
+ //! The corresponding output for the first segment will be
2127
+ //! ``0, 0, 2, 2, ..., 126, 126``. The output for the second segment
2128
+ //! will be ``128, 128, 130, 130, ..., 254, 254``.
2129
+ //!
2130
+ //! @endrst
2131
+ //!
2132
+ //! @tparam ScanOp
2133
+ //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
2134
+ //!
2135
+ //! @tparam BlockPrefixCallbackOp
2136
+ //! **[inferred]** Call-back functor type having member `T operator()(T block_aggregate)`
2137
+ //!
2138
+ //! @param[in] input
2139
+ //! Calling thread's input item
2140
+ //!
2141
+ //! @param[out] output
2142
+ //! Calling thread's output item (may be aliased to `input`)
2143
+ //!
2144
+ //! @param[in] scan_op
2145
+ //! Binary scan functor
2146
+ //!
2147
+ //! @param[in,out] block_prefix_callback_op
2148
+ //! @rst
2149
+ //! *warp*\ :sub:`0` only call-back functor for specifying a block-wide prefix to be applied to
2150
+ //! the logical input sequence.
2151
+ //! @endrst
2152
template <typename ScanOp, typename BlockPrefixCallbackOp>
_CCCL_DEVICE _CCCL_FORCEINLINE void
InclusiveScan(T input, T& output, ScanOp scan_op, BlockPrefixCallbackOp& block_prefix_callback_op)
{
  // Bind the internal scan implementation to this instance's temporary
  // storage, then delegate, forwarding the prefix callback functor by reference.
  InternalBlockScan internal_scan(temp_storage);
  internal_scan.InclusiveScan(input, output, scan_op, block_prefix_callback_op);
}
2158
+
2159
+ //! @} end member group
2160
+ //! @name Inclusive prefix scan operations (multiple data per thread)
2161
+ //! @{
2162
+
2163
+ //! @rst
2164
+ //! Computes an inclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
2165
+ //! Each thread contributes an array of consecutive input elements.
2166
+ //!
2167
+ //! - Supports non-commutative scan operators.
2168
+ //! - @blocked
2169
+ //! - @granularity
2170
+ //! - @smemreuse
2171
+ //!
2172
+ //! Snippet
2173
+ //! +++++++
2174
+ //!
2175
+ //! The code snippet below illustrates an inclusive prefix max scan of 512 integer items that
2176
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128 threads
2177
+ //! where each thread owns 4 consecutive items.
2178
+ //!
2179
+ //! .. code-block:: c++
2180
+ //!
2181
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
2182
+ //!
2183
+ //! __global__ void ExampleKernel(...)
2184
+ //! {
2185
+ //! // Specialize BlockScan for a 1D block of 128 threads of type int
2186
+ //! using BlockScan = cub::BlockScan<int, 128>;
2187
+ //!
2188
+ //! // Allocate shared memory for BlockScan
2189
+ //! __shared__ typename BlockScan::TempStorage temp_storage;
2190
+ //!
2191
+ //! // Obtain a segment of consecutive items that are blocked across threads
2192
+ //! int thread_data[4];
2193
+ //! ...
2194
+ //!
2195
+ //! // Collectively compute the block-wide inclusive prefix max scan
2196
+ //! BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cuda::maximum<>{});
2197
+ //!
2198
+ //! Suppose the set of input ``thread_data`` across the block of threads is
2199
+ //! ``{ [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }``.
2200
+ //! The corresponding output ``thread_data`` in those threads will be
2201
+ //! ``{ [0,0,2,2], [4,4,6,6], ..., [508,508,510,510] }``.
2202
+ //!
2203
+ //! @endrst
2204
+ //!
2205
+ //! @tparam ITEMS_PER_THREAD
2206
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
2207
+ //!
2208
+ //! @tparam ScanOp
2209
+ //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
2210
+ //!
2211
+ //! @param[in] input
2212
+ //! Calling thread's input items
2213
+ //!
2214
+ //! @param[out] output
2215
+ //! Calling thread's output items (may be aliased to `input`)
2216
+ //!
2217
+ //! @param[in] scan_op
2218
+ //! Binary scan functor
2219
template <int ITEMS_PER_THREAD, typename ScanOp>
_CCCL_DEVICE _CCCL_FORCEINLINE void
InclusiveScan(T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], ScanOp scan_op)
{
  if (ITEMS_PER_THREAD != 1)
  {
    // Fold this thread's consecutive items into a single partial with scan_op.
    T running = cub::ThreadReduce(input, scan_op);

    // Exclusive block-wide scan across the per-thread partials; the result
    // overwrites the partial in place and becomes this thread's prefix.
    ExclusiveScan(running, running, scan_op);

    // Inclusive scan over this thread's own items, seeded with the prefix.
    // The first thread (linear_tid == 0) does not seed.
    const bool seed_with_prefix = (linear_tid != 0);
    detail::ThreadScanInclusive(input, output, scan_op, running, seed_with_prefix);
  }
  else
  {
    // Exactly one item per thread: forward to the single-item overload.
    InclusiveScan(input[0], output[0], scan_op);
  }
}
2239
+
2240
+ //! @rst
2241
+ //! Computes an inclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
2242
+ //! Each thread contributes an array of consecutive input elements.
2243
+ //!
2244
+ //! - Supports non-commutative scan operators.
2245
+ //! - @blocked
2246
+ //! - @granularity
2247
+ //! - @smemreuse
2248
+ //!
2249
+ //! Snippet
2250
+ //! +++++++
2251
+ //!
2252
+ //! The code snippet below illustrates an inclusive prefix max scan of 128 integer items that
2253
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 64 threads
2254
+ //! where each thread owns 2 consecutive items.
2255
+ //!
2256
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_scan_api.cu
2257
+ //! :language: c++
2258
+ //! :dedent:
2259
+ //! :start-after: example-begin inclusive-scan-array-init-value
2260
+ //! :end-before: example-end inclusive-scan-array-init-value
2261
+ //!
2262
+ //!
2263
+ //! @endrst
2264
+ //!
2265
+ //! @tparam ITEMS_PER_THREAD
2266
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
2267
+ //!
2268
+ //! @tparam ScanOp
2269
+ //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
2270
+ //!
2271
+ //! @param[in] input
2272
+ //! Calling thread's input items
2273
+ //!
2274
+ //! @param[out] output
2275
+ //! Calling thread's output items (may be aliased to `input`)
2276
+ //!
2277
+ //! @param[in] initial_value
2278
+ //! Initial value to seed the inclusive scan (uniform across block)
2279
+ //!
2280
+ //! @param[in] scan_op
2281
+ //! Binary scan functor
2282
template <int ITEMS_PER_THREAD, typename ScanOp>
_CCCL_DEVICE _CCCL_FORCEINLINE void
InclusiveScan(T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], T initial_value, ScanOp scan_op)
{
  // Fold this thread's consecutive items into a single partial with scan_op.
  T running = cub::ThreadReduce(input, scan_op);

  // Exclusive block-wide scan across the per-thread partials, seeded with
  // initial_value; the result overwrites the partial in place.
  ExclusiveScan(running, running, initial_value, scan_op);

  // Inclusive scan over this thread's own items, seeded with the prefix.
  // No per-thread flag is passed here, unlike the overload without initial_value.
  detail::ThreadScanInclusive(input, output, scan_op, running);
}
2295
+
2296
+ //! @rst
2297
+ //! Computes an inclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
2298
+ //! Each thread contributes an array of consecutive input elements. Also provides every thread
2299
+ //! with the block-wide ``block_aggregate`` of all inputs.
2300
+ //!
2301
+ //! - Supports non-commutative scan operators.
2302
+ //! - @blocked
2303
+ //! - @granularity
2304
+ //! - @smemreuse
2305
+ //!
2306
+ //! Snippet
2307
+ //! +++++++
2308
+ //!
2309
+ //! The code snippet below illustrates an inclusive prefix max scan of 512 integer items that
2310
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128 threads
2311
+ //! where each thread owns 4 consecutive items.
2312
+ //!
2313
+ //! .. code-block:: c++
2314
+ //!
2315
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
2316
+ //!
2317
+ //! __global__ void ExampleKernel(...)
2318
+ //! {
2319
+ //! // Specialize BlockScan for a 1D block of 128 threads of type int
2320
+ //! using BlockScan = cub::BlockScan<int, 128>;
2321
+ //!
2322
+ //! // Allocate shared memory for BlockScan
2323
+ //! __shared__ typename BlockScan::TempStorage temp_storage;
2324
+ //!
2325
+ //! // Obtain a segment of consecutive items that are blocked across threads
2326
+ //! int thread_data[4];
2327
+ //! ...
2328
+ //!
2329
+ //! // Collectively compute the block-wide inclusive prefix max scan
2330
+ //! int block_aggregate;
2331
+ //! BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cuda::maximum<>{}, block_aggregate);
2332
+ //!
2333
+ //! Suppose the set of input ``thread_data`` across the block of threads is
2334
+ //! ``{ [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }``.
2335
+ //! The corresponding output ``thread_data`` in those threads will be
2336
+ //! ``{ [0,0,2,2], [4,4,6,6], ..., [508,508,510,510] }``.
2337
+ //! Furthermore the value ``510`` will be stored in ``block_aggregate`` for all threads.
2338
+ //!
2339
+ //! @endrst
2340
+ //!
2341
+ //! @tparam ITEMS_PER_THREAD
2342
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
2343
+ //!
2344
+ //! @tparam ScanOp
2345
+ //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
2346
+ //!
2347
+ //! @param[in] input
2348
+ //! Calling thread's input items
2349
+ //!
2350
+ //! @param[out] output
2351
+ //! Calling thread's output items (may be aliased to `input`)
2352
+ //!
2353
+ //! @param[in] scan_op
2354
+ //! Binary scan functor
2355
+ //!
2356
+ //! @param[out] block_aggregate
2357
+ //! Block-wide aggregate reduction of input items
2358
template <int ITEMS_PER_THREAD, typename ScanOp>
_CCCL_DEVICE _CCCL_FORCEINLINE void
InclusiveScan(T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], ScanOp scan_op, T& block_aggregate)
{
  if (ITEMS_PER_THREAD != 1)
  {
    // Fold this thread's consecutive items into a single partial with scan_op.
    T running = cub::ThreadReduce(input, scan_op);

    // Exclusive block-wide scan across the per-thread partials (no initial
    // value). The prefix overwrites the partial in place; block_aggregate is
    // filled by this call.
    ExclusiveScan(running, running, scan_op, block_aggregate);

    // Inclusive scan over this thread's own items, seeded with the prefix.
    // The first thread (linear_tid == 0) does not seed.
    const bool seed_with_prefix = (linear_tid != 0);
    detail::ThreadScanInclusive(input, output, scan_op, running, seed_with_prefix);
  }
  else
  {
    // Exactly one item per thread: forward to the single-item overload.
    InclusiveScan(input[0], output[0], scan_op, block_aggregate);
  }
}
2378
+
2379
+ //! @rst
2380
+ //! Computes an inclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
2381
+ //! Each thread contributes an array of consecutive input elements. Also provides every thread
2382
+ //! with the block-wide ``block_aggregate`` of all inputs.
2383
+ //!
2384
+ //! - Supports non-commutative scan operators.
2385
+ //! - @blocked
2386
+ //! - @granularity
2387
+ //! - @smemreuse
2388
+ //!
2389
+ //! Snippet
2390
+ //! +++++++
2391
+ //!
2392
+ //! The code snippet below illustrates an inclusive prefix max scan of 128 integer items that
2393
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 64 threads
2394
+ //! where each thread owns 2 consecutive items.
2395
+ //!
2396
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_scan_api.cu
2397
+ //! :language: c++
2398
+ //! :dedent:
2399
+ //! :start-after: example-begin inclusive-scan-array-aggregate-init-value
2400
+ //! :end-before: example-end inclusive-scan-array-aggregate-init-value
2401
+ //!
2402
+ //! The value ``126`` will be stored in ``block_aggregate`` for all threads.
2403
+ //!
2404
+ //! .. note::
2405
+ //!
2406
+ //! ``initial_value`` is not applied to the block-wide aggregate.
2407
+ //!
2408
+ //! @endrst
2409
+ //!
2410
+ //! @tparam ITEMS_PER_THREAD
2411
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
2412
+ //!
2413
+ //! @tparam ScanOp
2414
+ //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
2415
+ //!
2416
+ //! @param[in] input
2417
+ //! Calling thread's input items
2418
+ //!
2419
+ //! @param[out] output
2420
+ //! Calling thread's output items (may be aliased to `input`)
2421
+ //!
2422
+ //! @param[in] initial_value
2423
+ //! Initial value to seed the inclusive scan (uniform across block). It is not taken
2424
+ //! into account for ``block_aggregate``.
2425
+ //!
2426
+ //! @param[in] scan_op
2427
+ //! Binary scan functor
2428
+ //!
2429
+ //! @param[out] block_aggregate
2430
+ //! Block-wide aggregate reduction of input items
2431
template <int ITEMS_PER_THREAD, typename ScanOp>
_CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveScan(
  T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], T initial_value, ScanOp scan_op, T& block_aggregate)
{
  // Fold this thread's consecutive items into a single partial with scan_op.
  T running = cub::ThreadReduce(input, scan_op);

  // Exclusive block-wide scan across the per-thread partials, seeded with
  // initial_value. The prefix overwrites the partial in place; block_aggregate
  // is filled by this call.
  ExclusiveScan(running, running, initial_value, scan_op, block_aggregate);

  // Inclusive scan over this thread's own items, seeded with the prefix.
  // No per-thread flag is passed here, unlike the overload without initial_value.
  detail::ThreadScanInclusive(input, output, scan_op, running);
}
2444
+
2445
+ //! @rst
2446
+ //! Computes an inclusive block-wide prefix scan using the specified binary ``scan_op`` functor.
2447
+ //! Each thread contributes an array of consecutive input elements.
2448
+ //! The call-back functor ``block_prefix_callback_op`` is invoked by the first warp in the block,
2449
+ //! and the value returned by *lane*\ :sub:`0` in that warp is used as the "seed" value that logically prefixes the
2450
+ //! thread block's scan inputs.
2451
+ //!
2452
+ //! - The ``block_prefix_callback_op`` functor must implement a member function ``T operator()(T block_aggregate)``.
2453
+ //! The functor will be invoked by the first warp of threads in the block, however only the return value
2454
+ //! from *lane*\ :sub:`0` is applied as the block-wide prefix. Can be stateful.
2455
+ //! - Supports non-commutative scan operators.
2456
+ //! - @blocked
2457
+ //! - @granularity
2458
+ //! - @smemreuse
2459
+ //!
2460
+ //! Snippet
2461
+ //! +++++++
2462
+ //!
2463
+ //! The code snippet below illustrates a single thread block that progressively
2464
+ //! computes an inclusive prefix max scan over multiple "tiles" of input using a
2465
+ //! prefix functor to maintain a running total between block-wide scans. Each tile consists
2466
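+ //! of 512 integer items that are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>`
+ //! across 128 threads where each thread owns 4 consecutive items.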
2467
+ //!
2468
+ //! .. code-block:: c++
2469
+ //!
2470
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
2471
+ //!
2472
+ //! // A stateful callback functor that maintains a running prefix to be applied
2473
+ //! // during consecutive scan operations.
2474
+ //! struct BlockPrefixCallbackOp
2475
+ //! {
2476
+ //! // Running prefix
2477
+ //! int running_total;
2478
+ //!
2479
+ //! // Constructor
2480
+ //! __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {}
2481
+ //!
2482
+ //! // Callback operator to be entered by the first warp of threads in the block.
2483
+ //! // Thread-0 is responsible for returning a value for seeding the block-wide scan.
2484
+ //! __device__ int operator()(int block_aggregate)
2485
+ //! {
2486
+ //! int old_prefix = running_total;
2487
+ //! running_total = (block_aggregate > old_prefix) ? block_aggregate : old_prefix;
2488
+ //! return old_prefix;
2489
+ //! }
2490
+ //! };
2491
+ //!
2492
+ //! __global__ void ExampleKernel(int *d_data, int num_items, ...)
2493
+ //! {
2494
+ //! // Specialize BlockLoad, BlockStore, and BlockScan for a 1D block of 128 threads, 4 ints per thread
2495
+ //! using BlockLoad = cub::BlockLoad<int, 128, 4, cub::BLOCK_LOAD_TRANSPOSE>;
2496
+ //! using BlockStore = cub::BlockStore<int, 128, 4, cub::BLOCK_STORE_TRANSPOSE>;
2497
+ //! using BlockScan = cub::BlockScan<int, 128> ;
2498
+ //!
2499
+ //! // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan
2500
+ //! __shared__ union {
2501
+ //! typename BlockLoad::TempStorage load;
2502
+ //! typename BlockScan::TempStorage scan;
2503
+ //! typename BlockStore::TempStorage store;
2504
+ //! } temp_storage;
2505
+ //!
2506
+ //! // Initialize running total
2507
+ //! BlockPrefixCallbackOp prefix_op(0);
2508
+ //!
2509
+ //! // Have the block iterate over segments of items
2510
+ //! for (int block_offset = 0; block_offset < num_items; block_offset += 128 * 4)
2511
+ //! {
2512
+ //! // Load a segment of consecutive items that are blocked across threads
2513
+ //! int thread_data[4];
2514
+ //! BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data);
2515
+ //! __syncthreads();
2516
+ //!
2517
+ //! // Collectively compute the block-wide inclusive prefix max scan
2518
+ //! BlockScan(temp_storage.scan).InclusiveScan(
2519
+ //! thread_data, thread_data, cuda::maximum<>{}, prefix_op);
2520
+ //! __syncthreads();
2521
+ //!
2522
+ //! // Store scanned items to output segment
2523
+ //! BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data);
2524
+ //! __syncthreads();
2525
+ //! }
2526
+ //!
2527
+ //! Suppose the input ``d_data`` is ``0, -1, 2, -3, 4, -5, ...``.
2528
+ //! The corresponding output for the first segment will be
2529
+ //! ``0, 0, 2, 2, 4, 4, ..., 510, 510``. The output for the second
2530
+ //! segment will be ``512, 512, 514, 514, 516, 516, ..., 1022, 1022``.
2531
+ //!
2532
+ //! @endrst
2533
+ //!
2534
+ //! @tparam ITEMS_PER_THREAD
2535
+ //! **[inferred]** The number of consecutive items partitioned onto each thread.
2536
+ //!
2537
+ //! @tparam ScanOp
2538
+ //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)`
2539
+ //!
2540
+ //! @tparam BlockPrefixCallbackOp
2541
+ //! **[inferred]** Call-back functor type having member `T operator()(T block_aggregate)`
2542
+ //!
2543
+ //! @param[in] input
2544
+ //! Calling thread's input items
2545
+ //!
2546
+ //! @param[out] output
2547
+ //! Calling thread's output items (may be aliased to `input`)
2548
+ //!
2549
+ //! @param[in] scan_op
2550
+ //! Binary scan functor
2551
+ //!
2552
+ //! @param[in,out] block_prefix_callback_op
2553
+ //! @rst
2554
+ //! *warp*\ :sub:`0` only call-back functor for specifying a block-wide prefix to be applied to
2555
+ //! the logical input sequence.
2556
+ //! @endrst
2557
template <int ITEMS_PER_THREAD, typename ScanOp, typename BlockPrefixCallbackOp>
_CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveScan(
  T (&input)[ITEMS_PER_THREAD],
  T (&output)[ITEMS_PER_THREAD],
  ScanOp scan_op,
  BlockPrefixCallbackOp& block_prefix_callback_op)
{
  // One item per thread: hand off to the single-item overload and finish.
  if (ITEMS_PER_THREAD == 1)
  {
    InclusiveScan(input[0], output[0], scan_op, block_prefix_callback_op);
    return;
  }

  // Step 1: fold this thread's consecutive items into a single partial with scan_op.
  T thread_partial = cub::ThreadReduce(input, scan_op);

  // Step 2: exclusive scan of the per-thread partials across the block, with the
  // prefix callback taking part; the same variable is passed as input and output.
  ExclusiveScan(thread_partial, thread_partial, scan_op, block_prefix_callback_op);

  // Step 3: inclusive scan over this thread's items, seeded with the value from step 2.
  detail::ThreadScanInclusive(input, output, scan_op, thread_partial);
}
2580
+
2581
+ //! @} end member group
2582
+ };
2583
+
2584
+ CUB_NAMESPACE_END