cuda-cccl 0.4.3__cp312-cp312-manylinux_2_26_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2024) hide show
  1. cuda/cccl/__init__.py +27 -0
  2. cuda/cccl/_cuda_version_utils.py +24 -0
  3. cuda/cccl/cooperative/__init__.py +9 -0
  4. cuda/cccl/cooperative/experimental/__init__.py +24 -0
  5. cuda/cccl/headers/__init__.py +7 -0
  6. cuda/cccl/headers/include/__init__.py +1 -0
  7. cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +233 -0
  8. cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +1158 -0
  9. cuda/cccl/headers/include/cub/agent/agent_for.cuh +55 -0
  10. cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +699 -0
  11. cuda/cccl/headers/include/cub/agent/agent_merge.cuh +365 -0
  12. cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +721 -0
  13. cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +756 -0
  14. cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +277 -0
  15. cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +715 -0
  16. cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +546 -0
  17. cuda/cccl/headers/include/cub/agent/agent_reduce.cuh +592 -0
  18. cuda/cccl/headers/include/cub/agent/agent_reduce_by_key.cuh +780 -0
  19. cuda/cccl/headers/include/cub/agent/agent_rle.cuh +1092 -0
  20. cuda/cccl/headers/include/cub/agent/agent_scan.cuh +564 -0
  21. cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +448 -0
  22. cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +263 -0
  23. cuda/cccl/headers/include/cub/agent/agent_segmented_scan.cuh +292 -0
  24. cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +1090 -0
  25. cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +346 -0
  26. cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +584 -0
  27. cuda/cccl/headers/include/cub/agent/agent_topk.cuh +762 -0
  28. cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +599 -0
  29. cuda/cccl/headers/include/cub/agent/single_pass_scan_operators.cuh +1384 -0
  30. cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +939 -0
  31. cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +1200 -0
  32. cuda/cccl/headers/include/cub/block/block_exchange.cuh +1279 -0
  33. cuda/cccl/headers/include/cub/block/block_histogram.cuh +396 -0
  34. cuda/cccl/headers/include/cub/block/block_load.cuh +1269 -0
  35. cuda/cccl/headers/include/cub/block/block_load_to_shared.cuh +437 -0
  36. cuda/cccl/headers/include/cub/block/block_merge_sort.cuh +771 -0
  37. cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +1215 -0
  38. cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +2129 -0
  39. cuda/cccl/headers/include/cub/block/block_raking_layout.cuh +124 -0
  40. cuda/cccl/headers/include/cub/block/block_reduce.cuh +661 -0
  41. cuda/cccl/headers/include/cub/block/block_run_length_decode.cuh +406 -0
  42. cuda/cccl/headers/include/cub/block/block_scan.cuh +2168 -0
  43. cuda/cccl/headers/include/cub/block/block_shuffle.cuh +319 -0
  44. cuda/cccl/headers/include/cub/block/block_store.cuh +1238 -0
  45. cuda/cccl/headers/include/cub/block/radix_rank_sort_operations.cuh +597 -0
  46. cuda/cccl/headers/include/cub/block/specializations/block_histogram_atomic.cuh +62 -0
  47. cuda/cccl/headers/include/cub/block/specializations/block_histogram_sort.cuh +209 -0
  48. cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking.cuh +230 -0
  49. cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking_commutative_only.cuh +207 -0
  50. cuda/cccl/headers/include/cub/block/specializations/block_reduce_warp_reductions.cuh +257 -0
  51. cuda/cccl/headers/include/cub/block/specializations/block_scan_raking.cuh +766 -0
  52. cuda/cccl/headers/include/cub/block/specializations/block_scan_warp_scans.cuh +514 -0
  53. cuda/cccl/headers/include/cub/config.cuh +29 -0
  54. cuda/cccl/headers/include/cub/cub.cuh +96 -0
  55. cuda/cccl/headers/include/cub/detail/array_utils.cuh +54 -0
  56. cuda/cccl/headers/include/cub/detail/choose_offset.cuh +135 -0
  57. cuda/cccl/headers/include/cub/detail/detect_cuda_runtime.cuh +50 -0
  58. cuda/cccl/headers/include/cub/detail/device_double_buffer.cuh +94 -0
  59. cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +60 -0
  60. cuda/cccl/headers/include/cub/detail/env_dispatch.cuh +87 -0
  61. cuda/cccl/headers/include/cub/detail/fast_modulo_division.cuh +227 -0
  62. cuda/cccl/headers/include/cub/detail/integer_utils.cuh +87 -0
  63. cuda/cccl/headers/include/cub/detail/launcher/cuda_driver.cuh +149 -0
  64. cuda/cccl/headers/include/cub/detail/launcher/cuda_runtime.cuh +103 -0
  65. cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +112 -0
  66. cuda/cccl/headers/include/cub/detail/ptx-json/array.cuh +41 -0
  67. cuda/cccl/headers/include/cub/detail/ptx-json/json.cuh +39 -0
  68. cuda/cccl/headers/include/cub/detail/ptx-json/object.cuh +71 -0
  69. cuda/cccl/headers/include/cub/detail/ptx-json/string.cuh +79 -0
  70. cuda/cccl/headers/include/cub/detail/ptx-json/value.cuh +95 -0
  71. cuda/cccl/headers/include/cub/detail/ptx-json-parser.cuh +39 -0
  72. cuda/cccl/headers/include/cub/detail/rfa.cuh +706 -0
  73. cuda/cccl/headers/include/cub/detail/strong_load.cuh +163 -0
  74. cuda/cccl/headers/include/cub/detail/strong_store.cuh +194 -0
  75. cuda/cccl/headers/include/cub/detail/temporary_storage.cuh +377 -0
  76. cuda/cccl/headers/include/cub/detail/type_traits.cuh +185 -0
  77. cuda/cccl/headers/include/cub/detail/uninitialized_copy.cuh +48 -0
  78. cuda/cccl/headers/include/cub/detail/unsafe_bitcast.cuh +33 -0
  79. cuda/cccl/headers/include/cub/device/device_adjacent_difference.cuh +572 -0
  80. cuda/cccl/headers/include/cub/device/device_copy.cuh +276 -0
  81. cuda/cccl/headers/include/cub/device/device_for.cuh +1061 -0
  82. cuda/cccl/headers/include/cub/device/device_histogram.cuh +1485 -0
  83. cuda/cccl/headers/include/cub/device/device_memcpy.cuh +171 -0
  84. cuda/cccl/headers/include/cub/device/device_merge.cuh +203 -0
  85. cuda/cccl/headers/include/cub/device/device_merge_sort.cuh +955 -0
  86. cuda/cccl/headers/include/cub/device/device_partition.cuh +644 -0
  87. cuda/cccl/headers/include/cub/device/device_radix_sort.cuh +3413 -0
  88. cuda/cccl/headers/include/cub/device/device_reduce.cuh +2303 -0
  89. cuda/cccl/headers/include/cub/device/device_run_length_encode.cuh +346 -0
  90. cuda/cccl/headers/include/cub/device/device_scan.cuh +2152 -0
  91. cuda/cccl/headers/include/cub/device/device_segmented_radix_sort.cuh +1472 -0
  92. cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +1635 -0
  93. cuda/cccl/headers/include/cub/device/device_segmented_scan.cuh +1398 -0
  94. cuda/cccl/headers/include/cub/device/device_segmented_sort.cuh +2787 -0
  95. cuda/cccl/headers/include/cub/device/device_select.cuh +1204 -0
  96. cuda/cccl/headers/include/cub/device/device_topk.cuh +521 -0
  97. cuda/cccl/headers/include/cub/device/device_transform.cuh +666 -0
  98. cuda/cccl/headers/include/cub/device/dispatch/dispatch_adjacent_difference.cuh +289 -0
  99. cuda/cccl/headers/include/cub/device/dispatch/dispatch_batch_memcpy.cuh +694 -0
  100. cuda/cccl/headers/include/cub/device/dispatch/dispatch_common.cuh +50 -0
  101. cuda/cccl/headers/include/cub/device/dispatch/dispatch_copy_mdspan.cuh +77 -0
  102. cuda/cccl/headers/include/cub/device/dispatch/dispatch_fixed_size_segmented_reduce.cuh +349 -0
  103. cuda/cccl/headers/include/cub/device/dispatch/dispatch_for.cuh +160 -0
  104. cuda/cccl/headers/include/cub/device/dispatch/dispatch_histogram.cuh +1849 -0
  105. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +317 -0
  106. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge_sort.cuh +429 -0
  107. cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +1066 -0
  108. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +830 -0
  109. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_by_key.cuh +629 -0
  110. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +479 -0
  111. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +256 -0
  112. cuda/cccl/headers/include/cub/device/dispatch/dispatch_rle.cuh +603 -0
  113. cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan.cuh +447 -0
  114. cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan_by_key.cuh +545 -0
  115. cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_radix_sort.cuh +638 -0
  116. cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_reduce.cuh +410 -0
  117. cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_scan.cuh +278 -0
  118. cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +899 -0
  119. cuda/cccl/headers/include/cub/device/dispatch/dispatch_select_if.cuh +831 -0
  120. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +321 -0
  121. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +454 -0
  122. cuda/cccl/headers/include/cub/device/dispatch/dispatch_three_way_partition.cuh +364 -0
  123. cuda/cccl/headers/include/cub/device/dispatch/dispatch_topk.cuh +626 -0
  124. cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +527 -0
  125. cuda/cccl/headers/include/cub/device/dispatch/dispatch_unique_by_key.cuh +472 -0
  126. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_for_each.cuh +259 -0
  127. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_histogram.cuh +669 -0
  128. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_merge_sort.cuh +332 -0
  129. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_radix_sort.cuh +553 -0
  130. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_reduce.cuh +584 -0
  131. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_scan.cuh +178 -0
  132. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_segmented_radix_sort.cuh +262 -0
  133. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_segmented_reduce.cuh +295 -0
  134. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_segmented_scan.cuh +77 -0
  135. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_segmented_sort.cuh +521 -0
  136. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_three_way_partition.cuh +200 -0
  137. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_transform.cuh +1049 -0
  138. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_unique_by_key.cuh +175 -0
  139. cuda/cccl/headers/include/cub/device/dispatch/tuning/common.cuh +97 -0
  140. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +43 -0
  141. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +94 -0
  142. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +34 -0
  143. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +268 -0
  144. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +108 -0
  145. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +100 -0
  146. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +1045 -0
  147. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce.cuh +681 -0
  148. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +918 -0
  149. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +647 -0
  150. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +571 -0
  151. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +986 -0
  152. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_scan.cuh +108 -0
  153. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +373 -0
  154. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_select_if.cuh +1563 -0
  155. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +415 -0
  156. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_topk.cuh +84 -0
  157. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +476 -0
  158. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +858 -0
  159. cuda/cccl/headers/include/cub/grid/grid_even_share.cuh +203 -0
  160. cuda/cccl/headers/include/cub/grid/grid_mapping.cuh +82 -0
  161. cuda/cccl/headers/include/cub/grid/grid_queue.cuh +175 -0
  162. cuda/cccl/headers/include/cub/iterator/arg_index_input_iterator.cuh +230 -0
  163. cuda/cccl/headers/include/cub/iterator/cache_modified_input_iterator.cuh +235 -0
  164. cuda/cccl/headers/include/cub/iterator/cache_modified_output_iterator.cuh +226 -0
  165. cuda/cccl/headers/include/cub/iterator/tex_obj_input_iterator.cuh +293 -0
  166. cuda/cccl/headers/include/cub/thread/thread_load.cuh +353 -0
  167. cuda/cccl/headers/include/cub/thread/thread_operators.cuh +664 -0
  168. cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +525 -0
  169. cuda/cccl/headers/include/cub/thread/thread_scan.cuh +472 -0
  170. cuda/cccl/headers/include/cub/thread/thread_search.cuh +214 -0
  171. cuda/cccl/headers/include/cub/thread/thread_simd.cuh +456 -0
  172. cuda/cccl/headers/include/cub/thread/thread_sort.cuh +78 -0
  173. cuda/cccl/headers/include/cub/thread/thread_store.cuh +341 -0
  174. cuda/cccl/headers/include/cub/util_allocator.cuh +897 -0
  175. cuda/cccl/headers/include/cub/util_arch.cuh +176 -0
  176. cuda/cccl/headers/include/cub/util_cpp_dialect.cuh +71 -0
  177. cuda/cccl/headers/include/cub/util_debug.cuh +183 -0
  178. cuda/cccl/headers/include/cub/util_device.cuh +838 -0
  179. cuda/cccl/headers/include/cub/util_macro.cuh +73 -0
  180. cuda/cccl/headers/include/cub/util_math.cuh +92 -0
  181. cuda/cccl/headers/include/cub/util_namespace.cuh +152 -0
  182. cuda/cccl/headers/include/cub/util_policy_wrapper_t.cuh +31 -0
  183. cuda/cccl/headers/include/cub/util_ptx.cuh +483 -0
  184. cuda/cccl/headers/include/cub/util_temporary_storage.cuh +93 -0
  185. cuda/cccl/headers/include/cub/util_type.cuh +1084 -0
  186. cuda/cccl/headers/include/cub/util_vsmem.cuh +227 -0
  187. cuda/cccl/headers/include/cub/version.cuh +65 -0
  188. cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_shfl.cuh +304 -0
  189. cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_smem.cuh +152 -0
  190. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +567 -0
  191. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +378 -0
  192. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +922 -0
  193. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_smem.cuh +691 -0
  194. cuda/cccl/headers/include/cub/warp/warp_exchange.cuh +381 -0
  195. cuda/cccl/headers/include/cub/warp/warp_load.cuh +591 -0
  196. cuda/cccl/headers/include/cub/warp/warp_merge_sort.cuh +145 -0
  197. cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +810 -0
  198. cuda/cccl/headers/include/cub/warp/warp_scan.cuh +1863 -0
  199. cuda/cccl/headers/include/cub/warp/warp_store.cuh +498 -0
  200. cuda/cccl/headers/include/cub/warp/warp_utils.cuh +59 -0
  201. cuda/cccl/headers/include/cuda/__algorithm/common.h +68 -0
  202. cuda/cccl/headers/include/cuda/__algorithm/copy.h +199 -0
  203. cuda/cccl/headers/include/cuda/__algorithm/fill.h +110 -0
  204. cuda/cccl/headers/include/cuda/__annotated_ptr/access_property.h +165 -0
  205. cuda/cccl/headers/include/cuda/__annotated_ptr/access_property_encoding.h +171 -0
  206. cuda/cccl/headers/include/cuda/__annotated_ptr/annotated_ptr.h +216 -0
  207. cuda/cccl/headers/include/cuda/__annotated_ptr/annotated_ptr_base.h +100 -0
  208. cuda/cccl/headers/include/cuda/__annotated_ptr/apply_access_property.h +83 -0
  209. cuda/cccl/headers/include/cuda/__annotated_ptr/associate_access_property.h +128 -0
  210. cuda/cccl/headers/include/cuda/__annotated_ptr/createpolicy.h +210 -0
  211. cuda/cccl/headers/include/cuda/__atomic/atomic.h +145 -0
  212. cuda/cccl/headers/include/cuda/__barrier/async_contract_fulfillment.h +39 -0
  213. cuda/cccl/headers/include/cuda/__barrier/barrier.h +65 -0
  214. cuda/cccl/headers/include/cuda/__barrier/barrier_arrive_tx.h +101 -0
  215. cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +528 -0
  216. cuda/cccl/headers/include/cuda/__barrier/barrier_expect_tx.h +73 -0
  217. cuda/cccl/headers/include/cuda/__barrier/barrier_thread_scope.h +60 -0
  218. cuda/cccl/headers/include/cuda/__bit/bit_reverse.h +198 -0
  219. cuda/cccl/headers/include/cuda/__bit/bitfield.h +121 -0
  220. cuda/cccl/headers/include/cuda/__bit/bitmask.h +89 -0
  221. cuda/cccl/headers/include/cuda/__cccl_config +38 -0
  222. cuda/cccl/headers/include/cuda/__cmath/ceil_div.h +123 -0
  223. cuda/cccl/headers/include/cuda/__cmath/fast_modulo_division.h +178 -0
  224. cuda/cccl/headers/include/cuda/__cmath/ilog.h +194 -0
  225. cuda/cccl/headers/include/cuda/__cmath/ipow.h +111 -0
  226. cuda/cccl/headers/include/cuda/__cmath/isqrt.h +80 -0
  227. cuda/cccl/headers/include/cuda/__cmath/mul_hi.h +145 -0
  228. cuda/cccl/headers/include/cuda/__cmath/neg.h +47 -0
  229. cuda/cccl/headers/include/cuda/__cmath/pow2.h +74 -0
  230. cuda/cccl/headers/include/cuda/__cmath/round_down.h +102 -0
  231. cuda/cccl/headers/include/cuda/__cmath/round_up.h +104 -0
  232. cuda/cccl/headers/include/cuda/__cmath/sincos.h +134 -0
  233. cuda/cccl/headers/include/cuda/__cmath/uabs.h +57 -0
  234. cuda/cccl/headers/include/cuda/__complex/complex.h +238 -0
  235. cuda/cccl/headers/include/cuda/__complex/get_real_imag.h +89 -0
  236. cuda/cccl/headers/include/cuda/__complex/traits.h +64 -0
  237. cuda/cccl/headers/include/cuda/__complex_ +28 -0
  238. cuda/cccl/headers/include/cuda/__container/buffer.h +891 -0
  239. cuda/cccl/headers/include/cuda/__container/heterogeneous_iterator.h +436 -0
  240. cuda/cccl/headers/include/cuda/__container/uninitialized_async_buffer.h +416 -0
  241. cuda/cccl/headers/include/cuda/__device/all_devices.h +140 -0
  242. cuda/cccl/headers/include/cuda/__device/arch_id.h +194 -0
  243. cuda/cccl/headers/include/cuda/__device/arch_traits.h +553 -0
  244. cuda/cccl/headers/include/cuda/__device/attributes.h +772 -0
  245. cuda/cccl/headers/include/cuda/__device/compute_capability.h +172 -0
  246. cuda/cccl/headers/include/cuda/__device/device_ref.h +168 -0
  247. cuda/cccl/headers/include/cuda/__device/physical_device.h +178 -0
  248. cuda/cccl/headers/include/cuda/__driver/driver_api.h +1041 -0
  249. cuda/cccl/headers/include/cuda/__event/event.h +171 -0
  250. cuda/cccl/headers/include/cuda/__event/event_ref.h +157 -0
  251. cuda/cccl/headers/include/cuda/__event/timed_event.h +120 -0
  252. cuda/cccl/headers/include/cuda/__execution/determinism.h +89 -0
  253. cuda/cccl/headers/include/cuda/__execution/output_ordering.h +87 -0
  254. cuda/cccl/headers/include/cuda/__execution/policy.h +53 -0
  255. cuda/cccl/headers/include/cuda/__execution/require.h +75 -0
  256. cuda/cccl/headers/include/cuda/__execution/tune.h +70 -0
  257. cuda/cccl/headers/include/cuda/__functional/address_stability.h +131 -0
  258. cuda/cccl/headers/include/cuda/__functional/for_each_canceled.h +321 -0
  259. cuda/cccl/headers/include/cuda/__functional/maximum.h +77 -0
  260. cuda/cccl/headers/include/cuda/__functional/minimum.h +77 -0
  261. cuda/cccl/headers/include/cuda/__functional/minimum_maximum_common.h +52 -0
  262. cuda/cccl/headers/include/cuda/__functional/proclaim_return_type.h +106 -0
  263. cuda/cccl/headers/include/cuda/__fwd/barrier.h +38 -0
  264. cuda/cccl/headers/include/cuda/__fwd/complex.h +48 -0
  265. cuda/cccl/headers/include/cuda/__fwd/devices.h +44 -0
  266. cuda/cccl/headers/include/cuda/__fwd/execution_policy.h +47 -0
  267. cuda/cccl/headers/include/cuda/__fwd/get_stream.h +38 -0
  268. cuda/cccl/headers/include/cuda/__fwd/pipeline.h +37 -0
  269. cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +58 -0
  270. cuda/cccl/headers/include/cuda/__hierarchy/dimensions.h +162 -0
  271. cuda/cccl/headers/include/cuda/__hierarchy/hierarchy_dimensions.h +986 -0
  272. cuda/cccl/headers/include/cuda/__hierarchy/hierarchy_levels.h +494 -0
  273. cuda/cccl/headers/include/cuda/__hierarchy/level_dimensions.h +225 -0
  274. cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +315 -0
  275. cuda/cccl/headers/include/cuda/__iterator/counting_iterator.h +490 -0
  276. cuda/cccl/headers/include/cuda/__iterator/discard_iterator.h +324 -0
  277. cuda/cccl/headers/include/cuda/__iterator/permutation_iterator.h +456 -0
  278. cuda/cccl/headers/include/cuda/__iterator/shuffle_iterator.h +334 -0
  279. cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +418 -0
  280. cuda/cccl/headers/include/cuda/__iterator/tabulate_output_iterator.h +367 -0
  281. cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +528 -0
  282. cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +527 -0
  283. cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +486 -0
  284. cuda/cccl/headers/include/cuda/__iterator/zip_common.h +147 -0
  285. cuda/cccl/headers/include/cuda/__iterator/zip_function.h +114 -0
  286. cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +555 -0
  287. cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +589 -0
  288. cuda/cccl/headers/include/cuda/__latch/latch.h +44 -0
  289. cuda/cccl/headers/include/cuda/__launch/configuration.h +754 -0
  290. cuda/cccl/headers/include/cuda/__launch/host_launch.h +115 -0
  291. cuda/cccl/headers/include/cuda/__launch/launch.h +334 -0
  292. cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +531 -0
  293. cuda/cccl/headers/include/cuda/__mdspan/host_device_mdspan.h +239 -0
  294. cuda/cccl/headers/include/cuda/__mdspan/restrict_accessor.h +152 -0
  295. cuda/cccl/headers/include/cuda/__mdspan/restrict_mdspan.h +118 -0
  296. cuda/cccl/headers/include/cuda/__mdspan/shared_memory_accessor.h +208 -0
  297. cuda/cccl/headers/include/cuda/__mdspan/shared_memory_mdspan.h +129 -0
  298. cuda/cccl/headers/include/cuda/__memcpy_async/check_preconditions.h +79 -0
  299. cuda/cccl/headers/include/cuda/__memcpy_async/completion_mechanism.h +47 -0
  300. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_bulk_shared_global.h +77 -0
  301. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_fallback.h +72 -0
  302. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_shared_global.h +148 -0
  303. cuda/cccl/headers/include/cuda/__memcpy_async/dispatch_memcpy_async.h +165 -0
  304. cuda/cccl/headers/include/cuda/__memcpy_async/elect_one.h +52 -0
  305. cuda/cccl/headers/include/cuda/__memcpy_async/is_local_smem_barrier.h +53 -0
  306. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async.h +179 -0
  307. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_barrier.h +99 -0
  308. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_tx.h +102 -0
  309. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_completion.h +164 -0
  310. cuda/cccl/headers/include/cuda/__memcpy_async/try_get_barrier_handle.h +57 -0
  311. cuda/cccl/headers/include/cuda/__memory/address_space.h +256 -0
  312. cuda/cccl/headers/include/cuda/__memory/align_down.h +77 -0
  313. cuda/cccl/headers/include/cuda/__memory/align_up.h +77 -0
  314. cuda/cccl/headers/include/cuda/__memory/aligned_size.h +61 -0
  315. cuda/cccl/headers/include/cuda/__memory/check_address.h +111 -0
  316. cuda/cccl/headers/include/cuda/__memory/discard_memory.h +64 -0
  317. cuda/cccl/headers/include/cuda/__memory/get_device_address.h +58 -0
  318. cuda/cccl/headers/include/cuda/__memory/is_aligned.h +60 -0
  319. cuda/cccl/headers/include/cuda/__memory/is_pointer_accessible.h +278 -0
  320. cuda/cccl/headers/include/cuda/__memory/ptr_in_range.h +92 -0
  321. cuda/cccl/headers/include/cuda/__memory/ptr_rebind.h +75 -0
  322. cuda/cccl/headers/include/cuda/__memory/ranges_overlap.h +125 -0
  323. cuda/cccl/headers/include/cuda/__memory_pool/device_memory_pool.h +166 -0
  324. cuda/cccl/headers/include/cuda/__memory_pool/managed_memory_pool.h +161 -0
  325. cuda/cccl/headers/include/cuda/__memory_pool/memory_pool_base.h +644 -0
  326. cuda/cccl/headers/include/cuda/__memory_pool/pinned_memory_pool.h +218 -0
  327. cuda/cccl/headers/include/cuda/__memory_resource/any_resource.h +882 -0
  328. cuda/cccl/headers/include/cuda/__memory_resource/get_memory_resource.h +82 -0
  329. cuda/cccl/headers/include/cuda/__memory_resource/get_property.h +153 -0
  330. cuda/cccl/headers/include/cuda/__memory_resource/legacy_managed_memory_resource.h +148 -0
  331. cuda/cccl/headers/include/cuda/__memory_resource/legacy_pinned_memory_resource.h +141 -0
  332. cuda/cccl/headers/include/cuda/__memory_resource/properties.h +130 -0
  333. cuda/cccl/headers/include/cuda/__memory_resource/resource.h +135 -0
  334. cuda/cccl/headers/include/cuda/__memory_resource/shared_resource.h +261 -0
  335. cuda/cccl/headers/include/cuda/__memory_resource/synchronous_resource_adapter.h +136 -0
  336. cuda/cccl/headers/include/cuda/__numeric/add_overflow.h +331 -0
  337. cuda/cccl/headers/include/cuda/__numeric/div_overflow.h +150 -0
  338. cuda/cccl/headers/include/cuda/__numeric/narrow.h +108 -0
  339. cuda/cccl/headers/include/cuda/__numeric/overflow_cast.h +59 -0
  340. cuda/cccl/headers/include/cuda/__numeric/overflow_result.h +43 -0
  341. cuda/cccl/headers/include/cuda/__numeric/sub_overflow.h +359 -0
  342. cuda/cccl/headers/include/cuda/__nvtx/nvtx.h +120 -0
  343. cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +2977 -0
  344. cuda/cccl/headers/include/cuda/__ptx/instructions/barrier_cluster.h +43 -0
  345. cuda/cccl/headers/include/cuda/__ptx/instructions/bfind.h +41 -0
  346. cuda/cccl/headers/include/cuda/__ptx/instructions/bmsk.h +41 -0
  347. cuda/cccl/headers/include/cuda/__ptx/instructions/clusterlaunchcontrol.h +41 -0
  348. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk.h +44 -0
  349. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_commit_group.h +43 -0
  350. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h +45 -0
  351. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_wait_group.h +43 -0
  352. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_mbarrier_arrive.h +42 -0
  353. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_reduce_async_bulk.h +60 -0
  354. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_reduce_async_bulk_tensor.h +43 -0
  355. cuda/cccl/headers/include/cuda/__ptx/instructions/elect_sync.h +41 -0
  356. cuda/cccl/headers/include/cuda/__ptx/instructions/exit.h +41 -0
  357. cuda/cccl/headers/include/cuda/__ptx/instructions/fence.h +49 -0
  358. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/barrier_cluster.h +115 -0
  359. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/bfind.h +190 -0
  360. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/bmsk.h +54 -0
  361. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/clusterlaunchcontrol.h +240 -0
  362. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk.h +245 -0
  363. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.h +25 -0
  364. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.h +52 -0
  365. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.h +977 -0
  366. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_gather_scatter.h +302 -0
  367. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.h +631 -0
  368. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.h +46 -0
  369. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive.h +26 -0
  370. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive_noinc.h +26 -0
  371. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.h +1470 -0
  372. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.h +132 -0
  373. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.h +132 -0
  374. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.h +601 -0
  375. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/elect_sync.h +36 -0
  376. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/exit.h +25 -0
  377. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence.h +208 -0
  378. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.h +31 -0
  379. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_alias.h +25 -0
  380. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_async.h +58 -0
  381. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_async_generic_sync_restrict.h +64 -0
  382. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.h +102 -0
  383. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_sync_restrict.h +64 -0
  384. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/get_sreg.h +949 -0
  385. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/getctarank.h +32 -0
  386. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/ld.h +5542 -0
  387. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive.h +399 -0
  388. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.h +184 -0
  389. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.h +34 -0
  390. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_expect_tx.h +102 -0
  391. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_init.h +27 -0
  392. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_inval.h +26 -0
  393. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.h +143 -0
  394. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.h +144 -0
  395. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.h +286 -0
  396. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.h +290 -0
  397. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_ld_reduce.h +2202 -0
  398. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_red.h +1362 -0
  399. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_st.h +236 -0
  400. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/prmt.h +230 -0
  401. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/red_async.h +460 -0
  402. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/setmaxnreg.h +58 -0
  403. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/shl.h +96 -0
  404. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/shr.h +168 -0
  405. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st.h +1490 -0
  406. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st_async.h +123 -0
  407. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st_bulk.h +31 -0
  408. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_alloc.h +120 -0
  409. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_commit.h +91 -0
  410. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_cp.h +693 -0
  411. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_fence.h +50 -0
  412. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_ld.h +11437 -0
  413. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_mma.h +6513 -0
  414. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_mma_ws.h +6726 -0
  415. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_shift.h +40 -0
  416. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_st.h +4767 -0
  417. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_wait.h +48 -0
  418. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.h +71 -0
  419. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tensormap_replace.h +886 -0
  420. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/trap.h +25 -0
  421. cuda/cccl/headers/include/cuda/__ptx/instructions/get_sreg.h +43 -0
  422. cuda/cccl/headers/include/cuda/__ptx/instructions/getctarank.h +43 -0
  423. cuda/cccl/headers/include/cuda/__ptx/instructions/ld.h +41 -0
  424. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_arrive.h +45 -0
  425. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_expect_tx.h +41 -0
  426. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_init.h +43 -0
  427. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_inval.h +41 -0
  428. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_wait.h +46 -0
  429. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_ld_reduce.h +41 -0
  430. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_red.h +41 -0
  431. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_st.h +41 -0
  432. cuda/cccl/headers/include/cuda/__ptx/instructions/prmt.h +41 -0
  433. cuda/cccl/headers/include/cuda/__ptx/instructions/red_async.h +43 -0
  434. cuda/cccl/headers/include/cuda/__ptx/instructions/setmaxnreg.h +41 -0
  435. cuda/cccl/headers/include/cuda/__ptx/instructions/shfl_sync.h +244 -0
  436. cuda/cccl/headers/include/cuda/__ptx/instructions/shl.h +41 -0
  437. cuda/cccl/headers/include/cuda/__ptx/instructions/shr.h +41 -0
  438. cuda/cccl/headers/include/cuda/__ptx/instructions/st.h +41 -0
  439. cuda/cccl/headers/include/cuda/__ptx/instructions/st_async.h +43 -0
  440. cuda/cccl/headers/include/cuda/__ptx/instructions/st_bulk.h +41 -0
  441. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_alloc.h +41 -0
  442. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_commit.h +41 -0
  443. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_cp.h +41 -0
  444. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_fence.h +41 -0
  445. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_ld.h +41 -0
  446. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_mma.h +41 -0
  447. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_mma_ws.h +41 -0
  448. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_shift.h +41 -0
  449. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_st.h +41 -0
  450. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_wait.h +41 -0
  451. cuda/cccl/headers/include/cuda/__ptx/instructions/tensormap_cp_fenceproxy.h +43 -0
  452. cuda/cccl/headers/include/cuda/__ptx/instructions/tensormap_replace.h +43 -0
  453. cuda/cccl/headers/include/cuda/__ptx/instructions/trap.h +41 -0
  454. cuda/cccl/headers/include/cuda/__ptx/pragmas/enable_smem_spilling.h +47 -0
  455. cuda/cccl/headers/include/cuda/__ptx/ptx_dot_variants.h +230 -0
  456. cuda/cccl/headers/include/cuda/__ptx/ptx_helper_functions.h +178 -0
  457. cuda/cccl/headers/include/cuda/__random/feistel_bijection.h +105 -0
  458. cuda/cccl/headers/include/cuda/__random/pcg_engine.h +398 -0
  459. cuda/cccl/headers/include/cuda/__random/random_bijection.h +88 -0
  460. cuda/cccl/headers/include/cuda/__runtime/api_wrapper.h +62 -0
  461. cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +99 -0
  462. cuda/cccl/headers/include/cuda/__runtime/types.h +41 -0
  463. cuda/cccl/headers/include/cuda/__semaphore/counting_semaphore.h +53 -0
  464. cuda/cccl/headers/include/cuda/__stream/get_stream.h +109 -0
  465. cuda/cccl/headers/include/cuda/__stream/internal_streams.h +49 -0
  466. cuda/cccl/headers/include/cuda/__stream/invalid_stream.h +47 -0
  467. cuda/cccl/headers/include/cuda/__stream/launch_transform.h +193 -0
  468. cuda/cccl/headers/include/cuda/__stream/stream.h +145 -0
  469. cuda/cccl/headers/include/cuda/__stream/stream_ref.h +356 -0
  470. cuda/cccl/headers/include/cuda/__tma/make_tma_descriptor.h +657 -0
  471. cuda/cccl/headers/include/cuda/__type_traits/is_floating_point.h +47 -0
  472. cuda/cccl/headers/include/cuda/__type_traits/is_instantiable_with.h +47 -0
  473. cuda/cccl/headers/include/cuda/__type_traits/is_specialization_of.h +37 -0
  474. cuda/cccl/headers/include/cuda/__type_traits/vector_type.h +355 -0
  475. cuda/cccl/headers/include/cuda/__utility/__basic_any/access.h +88 -0
  476. cuda/cccl/headers/include/cuda/__utility/__basic_any/any_cast.h +83 -0
  477. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_base.h +148 -0
  478. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_from.h +96 -0
  479. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_fwd.h +128 -0
  480. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_ptr.h +304 -0
  481. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_ref.h +337 -0
  482. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_value.h +611 -0
  483. cuda/cccl/headers/include/cuda/__utility/__basic_any/conversions.h +170 -0
  484. cuda/cccl/headers/include/cuda/__utility/__basic_any/dynamic_any_cast.h +147 -0
  485. cuda/cccl/headers/include/cuda/__utility/__basic_any/interfaces.h +359 -0
  486. cuda/cccl/headers/include/cuda/__utility/__basic_any/iset.h +142 -0
  487. cuda/cccl/headers/include/cuda/__utility/__basic_any/overrides.h +64 -0
  488. cuda/cccl/headers/include/cuda/__utility/__basic_any/rtti.h +256 -0
  489. cuda/cccl/headers/include/cuda/__utility/__basic_any/semiregular.h +323 -0
  490. cuda/cccl/headers/include/cuda/__utility/__basic_any/storage.h +79 -0
  491. cuda/cccl/headers/include/cuda/__utility/__basic_any/tagged_ptr.h +58 -0
  492. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtcall.h +162 -0
  493. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_functions.h +183 -0
  494. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_ptrs.h +80 -0
  495. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_tables.h +155 -0
  496. cuda/cccl/headers/include/cuda/__utility/basic_any.h +507 -0
  497. cuda/cccl/headers/include/cuda/__utility/immovable.h +50 -0
  498. cuda/cccl/headers/include/cuda/__utility/in_range.h +65 -0
  499. cuda/cccl/headers/include/cuda/__utility/inherit.h +36 -0
  500. cuda/cccl/headers/include/cuda/__utility/no_init.h +29 -0
  501. cuda/cccl/headers/include/cuda/__utility/static_for.h +79 -0
  502. cuda/cccl/headers/include/cuda/__warp/lane_mask.h +326 -0
  503. cuda/cccl/headers/include/cuda/__warp/warp_match_all.h +65 -0
  504. cuda/cccl/headers/include/cuda/__warp/warp_shuffle.h +251 -0
  505. cuda/cccl/headers/include/cuda/access_property +26 -0
  506. cuda/cccl/headers/include/cuda/algorithm +28 -0
  507. cuda/cccl/headers/include/cuda/annotated_ptr +29 -0
  508. cuda/cccl/headers/include/cuda/atomic +27 -0
  509. cuda/cccl/headers/include/cuda/barrier +293 -0
  510. cuda/cccl/headers/include/cuda/bit +29 -0
  511. cuda/cccl/headers/include/cuda/buffer +27 -0
  512. cuda/cccl/headers/include/cuda/cmath +38 -0
  513. cuda/cccl/headers/include/cuda/devices +33 -0
  514. cuda/cccl/headers/include/cuda/discard_memory +32 -0
  515. cuda/cccl/headers/include/cuda/functional +32 -0
  516. cuda/cccl/headers/include/cuda/hierarchy +28 -0
  517. cuda/cccl/headers/include/cuda/iterator +39 -0
  518. cuda/cccl/headers/include/cuda/latch +27 -0
  519. cuda/cccl/headers/include/cuda/launch +28 -0
  520. cuda/cccl/headers/include/cuda/mdspan +29 -0
  521. cuda/cccl/headers/include/cuda/memory +37 -0
  522. cuda/cccl/headers/include/cuda/memory_pool +27 -0
  523. cuda/cccl/headers/include/cuda/memory_resource +41 -0
  524. cuda/cccl/headers/include/cuda/numeric +31 -0
  525. cuda/cccl/headers/include/cuda/pipeline +580 -0
  526. cuda/cccl/headers/include/cuda/ptx +131 -0
  527. cuda/cccl/headers/include/cuda/semaphore +31 -0
  528. cuda/cccl/headers/include/cuda/std/__algorithm/adjacent_find.h +59 -0
  529. cuda/cccl/headers/include/cuda/std/__algorithm/all_of.h +45 -0
  530. cuda/cccl/headers/include/cuda/std/__algorithm/any_of.h +45 -0
  531. cuda/cccl/headers/include/cuda/std/__algorithm/binary_search.h +53 -0
  532. cuda/cccl/headers/include/cuda/std/__algorithm/clamp.h +48 -0
  533. cuda/cccl/headers/include/cuda/std/__algorithm/comp.h +58 -0
  534. cuda/cccl/headers/include/cuda/std/__algorithm/comp_ref_type.h +85 -0
  535. cuda/cccl/headers/include/cuda/std/__algorithm/copy.h +143 -0
  536. cuda/cccl/headers/include/cuda/std/__algorithm/copy_backward.h +80 -0
  537. cuda/cccl/headers/include/cuda/std/__algorithm/copy_if.h +47 -0
  538. cuda/cccl/headers/include/cuda/std/__algorithm/copy_n.h +73 -0
  539. cuda/cccl/headers/include/cuda/std/__algorithm/count.h +49 -0
  540. cuda/cccl/headers/include/cuda/std/__algorithm/count_if.h +49 -0
  541. cuda/cccl/headers/include/cuda/std/__algorithm/equal.h +128 -0
  542. cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +101 -0
  543. cuda/cccl/headers/include/cuda/std/__algorithm/fill.h +58 -0
  544. cuda/cccl/headers/include/cuda/std/__algorithm/fill_n.h +51 -0
  545. cuda/cccl/headers/include/cuda/std/__algorithm/find.h +62 -0
  546. cuda/cccl/headers/include/cuda/std/__algorithm/find_end.h +225 -0
  547. cuda/cccl/headers/include/cuda/std/__algorithm/find_first_of.h +73 -0
  548. cuda/cccl/headers/include/cuda/std/__algorithm/find_if.h +46 -0
  549. cuda/cccl/headers/include/cuda/std/__algorithm/find_if_not.h +46 -0
  550. cuda/cccl/headers/include/cuda/std/__algorithm/for_each.h +42 -0
  551. cuda/cccl/headers/include/cuda/std/__algorithm/for_each_n.h +48 -0
  552. cuda/cccl/headers/include/cuda/std/__algorithm/generate.h +41 -0
  553. cuda/cccl/headers/include/cuda/std/__algorithm/generate_n.h +46 -0
  554. cuda/cccl/headers/include/cuda/std/__algorithm/half_positive.h +49 -0
  555. cuda/cccl/headers/include/cuda/std/__algorithm/in_fun_result.h +55 -0
  556. cuda/cccl/headers/include/cuda/std/__algorithm/includes.h +90 -0
  557. cuda/cccl/headers/include/cuda/std/__algorithm/inplace_merge.h +293 -0
  558. cuda/cccl/headers/include/cuda/std/__algorithm/is_heap.h +50 -0
  559. cuda/cccl/headers/include/cuda/std/__algorithm/is_heap_until.h +83 -0
  560. cuda/cccl/headers/include/cuda/std/__algorithm/is_partitioned.h +57 -0
  561. cuda/cccl/headers/include/cuda/std/__algorithm/is_permutation.h +252 -0
  562. cuda/cccl/headers/include/cuda/std/__algorithm/is_sorted.h +49 -0
  563. cuda/cccl/headers/include/cuda/std/__algorithm/is_sorted_until.h +68 -0
  564. cuda/cccl/headers/include/cuda/std/__algorithm/iter_swap.h +82 -0
  565. cuda/cccl/headers/include/cuda/std/__algorithm/iterator_operations.h +185 -0
  566. cuda/cccl/headers/include/cuda/std/__algorithm/lexicographical_compare.h +68 -0
  567. cuda/cccl/headers/include/cuda/std/__algorithm/lower_bound.h +82 -0
  568. cuda/cccl/headers/include/cuda/std/__algorithm/make_heap.h +70 -0
  569. cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +88 -0
  570. cuda/cccl/headers/include/cuda/std/__algorithm/max.h +62 -0
  571. cuda/cccl/headers/include/cuda/std/__algorithm/max_element.h +67 -0
  572. cuda/cccl/headers/include/cuda/std/__algorithm/merge.h +89 -0
  573. cuda/cccl/headers/include/cuda/std/__algorithm/min.h +62 -0
  574. cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +87 -0
  575. cuda/cccl/headers/include/cuda/std/__algorithm/minmax.h +66 -0
  576. cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +139 -0
  577. cuda/cccl/headers/include/cuda/std/__algorithm/mismatch.h +83 -0
  578. cuda/cccl/headers/include/cuda/std/__algorithm/move.h +91 -0
  579. cuda/cccl/headers/include/cuda/std/__algorithm/move_backward.h +84 -0
  580. cuda/cccl/headers/include/cuda/std/__algorithm/next_permutation.h +88 -0
  581. cuda/cccl/headers/include/cuda/std/__algorithm/none_of.h +45 -0
  582. cuda/cccl/headers/include/cuda/std/__algorithm/nth_element.h +309 -0
  583. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort.h +102 -0
  584. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort_copy.h +122 -0
  585. cuda/cccl/headers/include/cuda/std/__algorithm/partition.h +120 -0
  586. cuda/cccl/headers/include/cuda/std/__algorithm/partition_copy.h +59 -0
  587. cuda/cccl/headers/include/cuda/std/__algorithm/partition_point.h +61 -0
  588. cuda/cccl/headers/include/cuda/std/__algorithm/pop_heap.h +93 -0
  589. cuda/cccl/headers/include/cuda/std/__algorithm/prev_permutation.h +88 -0
  590. cuda/cccl/headers/include/cuda/std/__algorithm/push_heap.h +100 -0
  591. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_find_if.h +78 -0
  592. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_find_if_not.h +85 -0
  593. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_for_each.h +84 -0
  594. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_for_each_n.h +68 -0
  595. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_iterator_concept.h +65 -0
  596. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_min.h +97 -0
  597. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_min_element.h +68 -0
  598. cuda/cccl/headers/include/cuda/std/__algorithm/remove.h +55 -0
  599. cuda/cccl/headers/include/cuda/std/__algorithm/remove_copy.h +47 -0
  600. cuda/cccl/headers/include/cuda/std/__algorithm/remove_copy_if.h +47 -0
  601. cuda/cccl/headers/include/cuda/std/__algorithm/remove_if.h +56 -0
  602. cuda/cccl/headers/include/cuda/std/__algorithm/replace.h +45 -0
  603. cuda/cccl/headers/include/cuda/std/__algorithm/replace_copy.h +54 -0
  604. cuda/cccl/headers/include/cuda/std/__algorithm/replace_copy_if.h +50 -0
  605. cuda/cccl/headers/include/cuda/std/__algorithm/replace_if.h +45 -0
  606. cuda/cccl/headers/include/cuda/std/__algorithm/reverse.h +81 -0
  607. cuda/cccl/headers/include/cuda/std/__algorithm/reverse_copy.h +43 -0
  608. cuda/cccl/headers/include/cuda/std/__algorithm/rotate.h +261 -0
  609. cuda/cccl/headers/include/cuda/std/__algorithm/rotate_copy.h +40 -0
  610. cuda/cccl/headers/include/cuda/std/__algorithm/sample.h +116 -0
  611. cuda/cccl/headers/include/cuda/std/__algorithm/search.h +185 -0
  612. cuda/cccl/headers/include/cuda/std/__algorithm/search_n.h +163 -0
  613. cuda/cccl/headers/include/cuda/std/__algorithm/set_difference.h +95 -0
  614. cuda/cccl/headers/include/cuda/std/__algorithm/set_intersection.h +122 -0
  615. cuda/cccl/headers/include/cuda/std/__algorithm/set_symmetric_difference.h +134 -0
  616. cuda/cccl/headers/include/cuda/std/__algorithm/set_union.h +128 -0
  617. cuda/cccl/headers/include/cuda/std/__algorithm/shift_left.h +84 -0
  618. cuda/cccl/headers/include/cuda/std/__algorithm/shift_right.h +144 -0
  619. cuda/cccl/headers/include/cuda/std/__algorithm/shuffle.h +71 -0
  620. cuda/cccl/headers/include/cuda/std/__algorithm/sift_down.h +139 -0
  621. cuda/cccl/headers/include/cuda/std/__algorithm/sort.h +1097 -0
  622. cuda/cccl/headers/include/cuda/std/__algorithm/sort_heap.h +70 -0
  623. cuda/cccl/headers/include/cuda/std/__algorithm/stable_partition.h +359 -0
  624. cuda/cccl/headers/include/cuda/std/__algorithm/stable_sort.h +321 -0
  625. cuda/cccl/headers/include/cuda/std/__algorithm/swap_ranges.h +78 -0
  626. cuda/cccl/headers/include/cuda/std/__algorithm/transform.h +59 -0
  627. cuda/cccl/headers/include/cuda/std/__algorithm/unique.h +76 -0
  628. cuda/cccl/headers/include/cuda/std/__algorithm/unique_copy.h +153 -0
  629. cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_iter.h +95 -0
  630. cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_range.h +126 -0
  631. cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +83 -0
  632. cuda/cccl/headers/include/cuda/std/__atomic/api/common.h +192 -0
  633. cuda/cccl/headers/include/cuda/std/__atomic/api/owned.h +136 -0
  634. cuda/cccl/headers/include/cuda/std/__atomic/api/reference.h +118 -0
  635. cuda/cccl/headers/include/cuda/std/__atomic/functions/common.h +58 -0
  636. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_local.h +208 -0
  637. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_derived.h +458 -0
  638. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated.h +4436 -0
  639. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h +184 -0
  640. cuda/cccl/headers/include/cuda/std/__atomic/functions/host.h +242 -0
  641. cuda/cccl/headers/include/cuda/std/__atomic/functions.h +33 -0
  642. cuda/cccl/headers/include/cuda/std/__atomic/order.h +158 -0
  643. cuda/cccl/headers/include/cuda/std/__atomic/platform/msvc_to_builtins.h +654 -0
  644. cuda/cccl/headers/include/cuda/std/__atomic/platform.h +93 -0
  645. cuda/cccl/headers/include/cuda/std/__atomic/scopes.h +105 -0
  646. cuda/cccl/headers/include/cuda/std/__atomic/types/base.h +242 -0
  647. cuda/cccl/headers/include/cuda/std/__atomic/types/common.h +103 -0
  648. cuda/cccl/headers/include/cuda/std/__atomic/types/locked.h +225 -0
  649. cuda/cccl/headers/include/cuda/std/__atomic/types/reference.h +72 -0
  650. cuda/cccl/headers/include/cuda/std/__atomic/types/small.h +238 -0
  651. cuda/cccl/headers/include/cuda/std/__atomic/types.h +51 -0
  652. cuda/cccl/headers/include/cuda/std/__atomic/wait/notify_wait.h +95 -0
  653. cuda/cccl/headers/include/cuda/std/__atomic/wait/polling.h +65 -0
  654. cuda/cccl/headers/include/cuda/std/__barrier/barrier.h +227 -0
  655. cuda/cccl/headers/include/cuda/std/__barrier/empty_completion.h +37 -0
  656. cuda/cccl/headers/include/cuda/std/__barrier/poll_tester.h +82 -0
  657. cuda/cccl/headers/include/cuda/std/__bit/bit_cast.h +81 -0
  658. cuda/cccl/headers/include/cuda/std/__bit/blsr.h +51 -0
  659. cuda/cccl/headers/include/cuda/std/__bit/byteswap.h +209 -0
  660. cuda/cccl/headers/include/cuda/std/__bit/countl.h +191 -0
  661. cuda/cccl/headers/include/cuda/std/__bit/countr.h +202 -0
  662. cuda/cccl/headers/include/cuda/std/__bit/endian.h +39 -0
  663. cuda/cccl/headers/include/cuda/std/__bit/has_single_bit.h +43 -0
  664. cuda/cccl/headers/include/cuda/std/__bit/integral.h +125 -0
  665. cuda/cccl/headers/include/cuda/std/__bit/popcount.h +172 -0
  666. cuda/cccl/headers/include/cuda/std/__bit/reference.h +1272 -0
  667. cuda/cccl/headers/include/cuda/std/__bit/rotate.h +185 -0
  668. cuda/cccl/headers/include/cuda/std/__cccl/algorithm_wrapper.h +36 -0
  669. cuda/cccl/headers/include/cuda/std/__cccl/architecture.h +78 -0
  670. cuda/cccl/headers/include/cuda/std/__cccl/assert.h +161 -0
  671. cuda/cccl/headers/include/cuda/std/__cccl/attributes.h +206 -0
  672. cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +494 -0
  673. cuda/cccl/headers/include/cuda/std/__cccl/compiler.h +213 -0
  674. cuda/cccl/headers/include/cuda/std/__cccl/cuda_capabilities.h +51 -0
  675. cuda/cccl/headers/include/cuda/std/__cccl/cuda_toolkit.h +56 -0
  676. cuda/cccl/headers/include/cuda/std/__cccl/deprecated.h +87 -0
  677. cuda/cccl/headers/include/cuda/std/__cccl/diagnostic.h +131 -0
  678. cuda/cccl/headers/include/cuda/std/__cccl/dialect.h +197 -0
  679. cuda/cccl/headers/include/cuda/std/__cccl/epilogue.h +355 -0
  680. cuda/cccl/headers/include/cuda/std/__cccl/exceptions.h +36 -0
  681. cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +74 -0
  682. cuda/cccl/headers/include/cuda/std/__cccl/extended_data_types.h +139 -0
  683. cuda/cccl/headers/include/cuda/std/__cccl/host_std_lib.h +59 -0
  684. cuda/cccl/headers/include/cuda/std/__cccl/is_non_narrowing_convertible.h +71 -0
  685. cuda/cccl/headers/include/cuda/std/__cccl/memory_wrapper.h +36 -0
  686. cuda/cccl/headers/include/cuda/std/__cccl/numeric_wrapper.h +36 -0
  687. cuda/cccl/headers/include/cuda/std/__cccl/os.h +54 -0
  688. cuda/cccl/headers/include/cuda/std/__cccl/preprocessor.h +1288 -0
  689. cuda/cccl/headers/include/cuda/std/__cccl/prologue.h +312 -0
  690. cuda/cccl/headers/include/cuda/std/__cccl/ptx_isa.h +363 -0
  691. cuda/cccl/headers/include/cuda/std/__cccl/rtti.h +72 -0
  692. cuda/cccl/headers/include/cuda/std/__cccl/sequence_access.h +87 -0
  693. cuda/cccl/headers/include/cuda/std/__cccl/system_header.h +38 -0
  694. cuda/cccl/headers/include/cuda/std/__cccl/unreachable.h +31 -0
  695. cuda/cccl/headers/include/cuda/std/__cccl/version.h +26 -0
  696. cuda/cccl/headers/include/cuda/std/__cccl/visibility.h +171 -0
  697. cuda/cccl/headers/include/cuda/std/__charconv/chars_format.h +81 -0
  698. cuda/cccl/headers/include/cuda/std/__charconv/from_chars.h +171 -0
  699. cuda/cccl/headers/include/cuda/std/__charconv/from_chars_result.h +56 -0
  700. cuda/cccl/headers/include/cuda/std/__charconv/to_chars.h +192 -0
  701. cuda/cccl/headers/include/cuda/std/__charconv/to_chars_result.h +56 -0
  702. cuda/cccl/headers/include/cuda/std/__chrono/calendar.h +52 -0
  703. cuda/cccl/headers/include/cuda/std/__chrono/day.h +160 -0
  704. cuda/cccl/headers/include/cuda/std/__chrono/duration.h +499 -0
  705. cuda/cccl/headers/include/cuda/std/__chrono/file_clock.h +53 -0
  706. cuda/cccl/headers/include/cuda/std/__chrono/high_resolution_clock.h +44 -0
  707. cuda/cccl/headers/include/cuda/std/__chrono/month.h +185 -0
  708. cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +58 -0
  709. cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +78 -0
  710. cuda/cccl/headers/include/cuda/std/__chrono/time_point.h +255 -0
  711. cuda/cccl/headers/include/cuda/std/__chrono/year.h +184 -0
  712. cuda/cccl/headers/include/cuda/std/__cmath/abs.h +127 -0
  713. cuda/cccl/headers/include/cuda/std/__cmath/copysign.h +88 -0
  714. cuda/cccl/headers/include/cuda/std/__cmath/error_functions.h +204 -0
  715. cuda/cccl/headers/include/cuda/std/__cmath/exponential_functions.h +783 -0
  716. cuda/cccl/headers/include/cuda/std/__cmath/fdim.h +122 -0
  717. cuda/cccl/headers/include/cuda/std/__cmath/fma.h +129 -0
  718. cuda/cccl/headers/include/cuda/std/__cmath/fpclassify.h +230 -0
  719. cuda/cccl/headers/include/cuda/std/__cmath/gamma.h +204 -0
  720. cuda/cccl/headers/include/cuda/std/__cmath/hyperbolic_functions.h +285 -0
  721. cuda/cccl/headers/include/cuda/std/__cmath/hypot.h +220 -0
  722. cuda/cccl/headers/include/cuda/std/__cmath/inverse_hyperbolic_functions.h +285 -0
  723. cuda/cccl/headers/include/cuda/std/__cmath/inverse_trigonometric_functions.h +370 -0
  724. cuda/cccl/headers/include/cuda/std/__cmath/isfinite.h +166 -0
  725. cuda/cccl/headers/include/cuda/std/__cmath/isinf.h +203 -0
  726. cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +184 -0
  727. cuda/cccl/headers/include/cuda/std/__cmath/isnormal.h +138 -0
  728. cuda/cccl/headers/include/cuda/std/__cmath/lerp.h +101 -0
  729. cuda/cccl/headers/include/cuda/std/__cmath/logarithms.h +533 -0
  730. cuda/cccl/headers/include/cuda/std/__cmath/min_max.h +287 -0
  731. cuda/cccl/headers/include/cuda/std/__cmath/modulo.h +208 -0
  732. cuda/cccl/headers/include/cuda/std/__cmath/nan.h +54 -0
  733. cuda/cccl/headers/include/cuda/std/__cmath/remainder.h +210 -0
  734. cuda/cccl/headers/include/cuda/std/__cmath/roots.h +198 -0
  735. cuda/cccl/headers/include/cuda/std/__cmath/rounding_functions.h +983 -0
  736. cuda/cccl/headers/include/cuda/std/__cmath/signbit.h +56 -0
  737. cuda/cccl/headers/include/cuda/std/__cmath/traits.h +242 -0
  738. cuda/cccl/headers/include/cuda/std/__cmath/trigonometric_functions.h +327 -0
  739. cuda/cccl/headers/include/cuda/std/__complex/arg.h +84 -0
  740. cuda/cccl/headers/include/cuda/std/__complex/complex.h +669 -0
  741. cuda/cccl/headers/include/cuda/std/__complex/exponential_functions.h +403 -0
  742. cuda/cccl/headers/include/cuda/std/__complex/hyperbolic_functions.h +119 -0
  743. cuda/cccl/headers/include/cuda/std/__complex/inverse_hyperbolic_functions.h +522 -0
  744. cuda/cccl/headers/include/cuda/std/__complex/inverse_trigonometric_functions.h +131 -0
  745. cuda/cccl/headers/include/cuda/std/__complex/literals.h +86 -0
  746. cuda/cccl/headers/include/cuda/std/__complex/logarithms.h +302 -0
  747. cuda/cccl/headers/include/cuda/std/__complex/math.h +161 -0
  748. cuda/cccl/headers/include/cuda/std/__complex/nvbf16.h +323 -0
  749. cuda/cccl/headers/include/cuda/std/__complex/nvfp16.h +322 -0
  750. cuda/cccl/headers/include/cuda/std/__complex/roots.h +214 -0
  751. cuda/cccl/headers/include/cuda/std/__complex/trigonometric_functions.h +61 -0
  752. cuda/cccl/headers/include/cuda/std/__complex/tuple.h +107 -0
  753. cuda/cccl/headers/include/cuda/std/__complex/vector_support.h +130 -0
  754. cuda/cccl/headers/include/cuda/std/__concepts/arithmetic.h +56 -0
  755. cuda/cccl/headers/include/cuda/std/__concepts/assignable.h +64 -0
  756. cuda/cccl/headers/include/cuda/std/__concepts/boolean_testable.h +63 -0
  757. cuda/cccl/headers/include/cuda/std/__concepts/class_or_enum.h +45 -0
  758. cuda/cccl/headers/include/cuda/std/__concepts/common_reference_with.h +69 -0
  759. cuda/cccl/headers/include/cuda/std/__concepts/common_with.h +82 -0
  760. cuda/cccl/headers/include/cuda/std/__concepts/concept_macros.h +385 -0
  761. cuda/cccl/headers/include/cuda/std/__concepts/constructible.h +174 -0
  762. cuda/cccl/headers/include/cuda/std/__concepts/convertible_to.h +70 -0
  763. cuda/cccl/headers/include/cuda/std/__concepts/copyable.h +60 -0
  764. cuda/cccl/headers/include/cuda/std/__concepts/derived_from.h +56 -0
  765. cuda/cccl/headers/include/cuda/std/__concepts/destructible.h +76 -0
  766. cuda/cccl/headers/include/cuda/std/__concepts/different_from.h +38 -0
  767. cuda/cccl/headers/include/cuda/std/__concepts/equality_comparable.h +98 -0
  768. cuda/cccl/headers/include/cuda/std/__concepts/invocable.h +80 -0
  769. cuda/cccl/headers/include/cuda/std/__concepts/movable.h +58 -0
  770. cuda/cccl/headers/include/cuda/std/__concepts/predicate.h +54 -0
  771. cuda/cccl/headers/include/cuda/std/__concepts/regular.h +54 -0
  772. cuda/cccl/headers/include/cuda/std/__concepts/relation.h +77 -0
  773. cuda/cccl/headers/include/cuda/std/__concepts/same_as.h +39 -0
  774. cuda/cccl/headers/include/cuda/std/__concepts/semiregular.h +54 -0
  775. cuda/cccl/headers/include/cuda/std/__concepts/swappable.h +206 -0
  776. cuda/cccl/headers/include/cuda/std/__concepts/totally_ordered.h +101 -0
  777. cuda/cccl/headers/include/cuda/std/__cstddef/byte.h +113 -0
  778. cuda/cccl/headers/include/cuda/std/__cstddef/types.h +52 -0
  779. cuda/cccl/headers/include/cuda/std/__cstdlib/abs.h +57 -0
  780. cuda/cccl/headers/include/cuda/std/__cstdlib/aligned_alloc.h +66 -0
  781. cuda/cccl/headers/include/cuda/std/__cstdlib/div.h +96 -0
  782. cuda/cccl/headers/include/cuda/std/__cstdlib/malloc.h +70 -0
  783. cuda/cccl/headers/include/cuda/std/__cstring/memcpy.h +61 -0
  784. cuda/cccl/headers/include/cuda/std/__cstring/memset.h +46 -0
  785. cuda/cccl/headers/include/cuda/std/__exception/cuda_error.h +110 -0
  786. cuda/cccl/headers/include/cuda/std/__exception/exception_macros.h +108 -0
  787. cuda/cccl/headers/include/cuda/std/__exception/format_error.h +62 -0
  788. cuda/cccl/headers/include/cuda/std/__exception/msg_storage.h +41 -0
  789. cuda/cccl/headers/include/cuda/std/__exception/terminate.h +74 -0
  790. cuda/cccl/headers/include/cuda/std/__exception/throw_error.h +120 -0
  791. cuda/cccl/headers/include/cuda/std/__execution/env.h +455 -0
  792. cuda/cccl/headers/include/cuda/std/__execution/policy.h +90 -0
  793. cuda/cccl/headers/include/cuda/std/__expected/bad_expected_access.h +127 -0
  794. cuda/cccl/headers/include/cuda/std/__expected/expected.h +1941 -0
  795. cuda/cccl/headers/include/cuda/std/__expected/expected_base.h +1051 -0
  796. cuda/cccl/headers/include/cuda/std/__expected/unexpect.h +37 -0
  797. cuda/cccl/headers/include/cuda/std/__expected/unexpected.h +164 -0
  798. cuda/cccl/headers/include/cuda/std/__floating_point/arithmetic.h +56 -0
  799. cuda/cccl/headers/include/cuda/std/__floating_point/cast.h +812 -0
  800. cuda/cccl/headers/include/cuda/std/__floating_point/cccl_fp.h +125 -0
  801. cuda/cccl/headers/include/cuda/std/__floating_point/common_type.h +48 -0
  802. cuda/cccl/headers/include/cuda/std/__floating_point/constants.h +375 -0
  803. cuda/cccl/headers/include/cuda/std/__floating_point/conversion_rank_order.h +124 -0
  804. cuda/cccl/headers/include/cuda/std/__floating_point/cuda_fp_types.h +116 -0
  805. cuda/cccl/headers/include/cuda/std/__floating_point/decompose.h +126 -0
  806. cuda/cccl/headers/include/cuda/std/__floating_point/format.h +162 -0
  807. cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +40 -0
  808. cuda/cccl/headers/include/cuda/std/__floating_point/mask.h +78 -0
  809. cuda/cccl/headers/include/cuda/std/__floating_point/native_type.h +81 -0
  810. cuda/cccl/headers/include/cuda/std/__floating_point/overflow_handler.h +139 -0
  811. cuda/cccl/headers/include/cuda/std/__floating_point/properties.h +229 -0
  812. cuda/cccl/headers/include/cuda/std/__floating_point/storage.h +248 -0
  813. cuda/cccl/headers/include/cuda/std/__floating_point/traits.h +172 -0
  814. cuda/cccl/headers/include/cuda/std/__format/buffer.h +48 -0
  815. cuda/cccl/headers/include/cuda/std/__format/concepts.h +69 -0
  816. cuda/cccl/headers/include/cuda/std/__format/format_arg.h +282 -0
  817. cuda/cccl/headers/include/cuda/std/__format/format_arg_store.h +279 -0
  818. cuda/cccl/headers/include/cuda/std/__format/format_args.h +122 -0
  819. cuda/cccl/headers/include/cuda/std/__format/format_context.h +93 -0
  820. cuda/cccl/headers/include/cuda/std/__format/format_error.h +76 -0
  821. cuda/cccl/headers/include/cuda/std/__format/format_integral.h +237 -0
  822. cuda/cccl/headers/include/cuda/std/__format/format_parse_context.h +124 -0
  823. cuda/cccl/headers/include/cuda/std/__format/format_spec_parser.h +1265 -0
  824. cuda/cccl/headers/include/cuda/std/__format/formatter.h +59 -0
  825. cuda/cccl/headers/include/cuda/std/__format/formatters/bool.h +101 -0
  826. cuda/cccl/headers/include/cuda/std/__format/formatters/char.h +124 -0
  827. cuda/cccl/headers/include/cuda/std/__format/formatters/fp.h +101 -0
  828. cuda/cccl/headers/include/cuda/std/__format/formatters/int.h +174 -0
  829. cuda/cccl/headers/include/cuda/std/__format/formatters/ptr.h +104 -0
  830. cuda/cccl/headers/include/cuda/std/__format/formatters/str.h +178 -0
  831. cuda/cccl/headers/include/cuda/std/__format/output_utils.h +272 -0
  832. cuda/cccl/headers/include/cuda/std/__format/parse_arg_id.h +138 -0
  833. cuda/cccl/headers/include/cuda/std/__format_ +45 -0
  834. cuda/cccl/headers/include/cuda/std/__functional/binary_function.h +63 -0
  835. cuda/cccl/headers/include/cuda/std/__functional/binary_negate.h +65 -0
  836. cuda/cccl/headers/include/cuda/std/__functional/bind.h +334 -0
  837. cuda/cccl/headers/include/cuda/std/__functional/bind_back.h +81 -0
  838. cuda/cccl/headers/include/cuda/std/__functional/bind_front.h +74 -0
  839. cuda/cccl/headers/include/cuda/std/__functional/binder1st.h +74 -0
  840. cuda/cccl/headers/include/cuda/std/__functional/binder2nd.h +74 -0
  841. cuda/cccl/headers/include/cuda/std/__functional/compose.h +69 -0
  842. cuda/cccl/headers/include/cuda/std/__functional/default_searcher.h +76 -0
  843. cuda/cccl/headers/include/cuda/std/__functional/function.h +1271 -0
  844. cuda/cccl/headers/include/cuda/std/__functional/hash.h +649 -0
  845. cuda/cccl/headers/include/cuda/std/__functional/identity.h +57 -0
  846. cuda/cccl/headers/include/cuda/std/__functional/invoke.h +298 -0
  847. cuda/cccl/headers/include/cuda/std/__functional/is_transparent.h +41 -0
  848. cuda/cccl/headers/include/cuda/std/__functional/mem_fn.h +66 -0
  849. cuda/cccl/headers/include/cuda/std/__functional/mem_fun_ref.h +211 -0
  850. cuda/cccl/headers/include/cuda/std/__functional/not_fn.h +120 -0
  851. cuda/cccl/headers/include/cuda/std/__functional/operations.h +535 -0
  852. cuda/cccl/headers/include/cuda/std/__functional/perfect_forward.h +128 -0
  853. cuda/cccl/headers/include/cuda/std/__functional/pointer_to_binary_function.h +64 -0
  854. cuda/cccl/headers/include/cuda/std/__functional/pointer_to_unary_function.h +63 -0
  855. cuda/cccl/headers/include/cuda/std/__functional/ranges_operations.h +113 -0
  856. cuda/cccl/headers/include/cuda/std/__functional/reference_wrapper.h +114 -0
  857. cuda/cccl/headers/include/cuda/std/__functional/unary_function.h +62 -0
  858. cuda/cccl/headers/include/cuda/std/__functional/unary_negate.h +65 -0
  859. cuda/cccl/headers/include/cuda/std/__functional/unwrap_ref.h +56 -0
  860. cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +262 -0
  861. cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +53 -0
  862. cuda/cccl/headers/include/cuda/std/__fwd/array.h +42 -0
  863. cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +74 -0
  864. cuda/cccl/headers/include/cuda/std/__fwd/complex.h +75 -0
  865. cuda/cccl/headers/include/cuda/std/__fwd/execution_policy.h +73 -0
  866. cuda/cccl/headers/include/cuda/std/__fwd/expected.h +46 -0
  867. cuda/cccl/headers/include/cuda/std/__fwd/format.h +84 -0
  868. cuda/cccl/headers/include/cuda/std/__fwd/fp.h +37 -0
  869. cuda/cccl/headers/include/cuda/std/__fwd/get.h +122 -0
  870. cuda/cccl/headers/include/cuda/std/__fwd/hash.h +34 -0
  871. cuda/cccl/headers/include/cuda/std/__fwd/ios.h +123 -0
  872. cuda/cccl/headers/include/cuda/std/__fwd/iterator.h +43 -0
  873. cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +122 -0
  874. cuda/cccl/headers/include/cuda/std/__fwd/memory_resource.h +37 -0
  875. cuda/cccl/headers/include/cuda/std/__fwd/optional.h +39 -0
  876. cuda/cccl/headers/include/cuda/std/__fwd/pair.h +47 -0
  877. cuda/cccl/headers/include/cuda/std/__fwd/reference_wrapper.h +39 -0
  878. cuda/cccl/headers/include/cuda/std/__fwd/span.h +45 -0
  879. cuda/cccl/headers/include/cuda/std/__fwd/string.h +112 -0
  880. cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +91 -0
  881. cuda/cccl/headers/include/cuda/std/__fwd/subrange.h +55 -0
  882. cuda/cccl/headers/include/cuda/std/__fwd/tuple.h +37 -0
  883. cuda/cccl/headers/include/cuda/std/__fwd/unexpected.h +40 -0
  884. cuda/cccl/headers/include/cuda/std/__fwd/variant.h +51 -0
  885. cuda/cccl/headers/include/cuda/std/__internal/atomic.h +55 -0
  886. cuda/cccl/headers/include/cuda/std/__internal/cpp_dialect.h +44 -0
  887. cuda/cccl/headers/include/cuda/std/__internal/features.h +104 -0
  888. cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +132 -0
  889. cuda/cccl/headers/include/cuda/std/__internal/pstl_config.h +32 -0
  890. cuda/cccl/headers/include/cuda/std/__internal/thread_api.h +58 -0
  891. cuda/cccl/headers/include/cuda/std/__internal/version.h +52 -0
  892. cuda/cccl/headers/include/cuda/std/__iterator/access.h +128 -0
  893. cuda/cccl/headers/include/cuda/std/__iterator/advance.h +227 -0
  894. cuda/cccl/headers/include/cuda/std/__iterator/back_insert_iterator.h +164 -0
  895. cuda/cccl/headers/include/cuda/std/__iterator/bounded_iter.h +253 -0
  896. cuda/cccl/headers/include/cuda/std/__iterator/concepts.h +645 -0
  897. cuda/cccl/headers/include/cuda/std/__iterator/counted_iterator.h +465 -0
  898. cuda/cccl/headers/include/cuda/std/__iterator/data.h +61 -0
  899. cuda/cccl/headers/include/cuda/std/__iterator/default_sentinel.h +36 -0
  900. cuda/cccl/headers/include/cuda/std/__iterator/distance.h +124 -0
  901. cuda/cccl/headers/include/cuda/std/__iterator/empty.h +53 -0
  902. cuda/cccl/headers/include/cuda/std/__iterator/erase_if_container.h +53 -0
  903. cuda/cccl/headers/include/cuda/std/__iterator/front_insert_iterator.h +100 -0
  904. cuda/cccl/headers/include/cuda/std/__iterator/incrementable_traits.h +143 -0
  905. cuda/cccl/headers/include/cuda/std/__iterator/indirectly_comparable.h +55 -0
  906. cuda/cccl/headers/include/cuda/std/__iterator/insert_iterator.h +107 -0
  907. cuda/cccl/headers/include/cuda/std/__iterator/istream_iterator.h +146 -0
  908. cuda/cccl/headers/include/cuda/std/__iterator/istreambuf_iterator.h +161 -0
  909. cuda/cccl/headers/include/cuda/std/__iterator/iter_move.h +161 -0
  910. cuda/cccl/headers/include/cuda/std/__iterator/iter_swap.h +181 -0
  911. cuda/cccl/headers/include/cuda/std/__iterator/iterator.h +44 -0
  912. cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +847 -0
  913. cuda/cccl/headers/include/cuda/std/__iterator/mergeable.h +72 -0
  914. cuda/cccl/headers/include/cuda/std/__iterator/move_iterator.h +432 -0
  915. cuda/cccl/headers/include/cuda/std/__iterator/move_sentinel.h +73 -0
  916. cuda/cccl/headers/include/cuda/std/__iterator/next.h +101 -0
  917. cuda/cccl/headers/include/cuda/std/__iterator/ostream_iterator.h +95 -0
  918. cuda/cccl/headers/include/cuda/std/__iterator/ostreambuf_iterator.h +100 -0
  919. cuda/cccl/headers/include/cuda/std/__iterator/permutable.h +54 -0
  920. cuda/cccl/headers/include/cuda/std/__iterator/prev.h +90 -0
  921. cuda/cccl/headers/include/cuda/std/__iterator/projected.h +61 -0
  922. cuda/cccl/headers/include/cuda/std/__iterator/readable_traits.h +156 -0
  923. cuda/cccl/headers/include/cuda/std/__iterator/reverse_access.h +142 -0
  924. cuda/cccl/headers/include/cuda/std/__iterator/reverse_iterator.h +366 -0
  925. cuda/cccl/headers/include/cuda/std/__iterator/size.h +69 -0
  926. cuda/cccl/headers/include/cuda/std/__iterator/sortable.h +55 -0
  927. cuda/cccl/headers/include/cuda/std/__iterator/unreachable_sentinel.h +84 -0
  928. cuda/cccl/headers/include/cuda/std/__iterator/wrap_iter.h +245 -0
  929. cuda/cccl/headers/include/cuda/std/__latch/latch.h +88 -0
  930. cuda/cccl/headers/include/cuda/std/__limits/numeric_limits.h +603 -0
  931. cuda/cccl/headers/include/cuda/std/__limits/numeric_limits_ext.h +753 -0
  932. cuda/cccl/headers/include/cuda/std/__linalg/conj_if_needed.h +76 -0
  933. cuda/cccl/headers/include/cuda/std/__linalg/conjugate_transposed.h +52 -0
  934. cuda/cccl/headers/include/cuda/std/__linalg/conjugated.h +136 -0
  935. cuda/cccl/headers/include/cuda/std/__linalg/scaled.h +128 -0
  936. cuda/cccl/headers/include/cuda/std/__linalg/transposed.h +315 -0
  937. cuda/cccl/headers/include/cuda/std/__mdspan/aligned_accessor.h +97 -0
  938. cuda/cccl/headers/include/cuda/std/__mdspan/concepts.h +137 -0
  939. cuda/cccl/headers/include/cuda/std/__mdspan/default_accessor.h +73 -0
  940. cuda/cccl/headers/include/cuda/std/__mdspan/empty_base.h +348 -0
  941. cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +749 -0
  942. cuda/cccl/headers/include/cuda/std/__mdspan/layout_left.h +314 -0
  943. cuda/cccl/headers/include/cuda/std/__mdspan/layout_right.h +307 -0
  944. cuda/cccl/headers/include/cuda/std/__mdspan/layout_stride.h +598 -0
  945. cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +515 -0
  946. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_extents.h +190 -0
  947. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_helper.h +187 -0
  948. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_mapping.h +339 -0
  949. cuda/cccl/headers/include/cuda/std/__memory/addressof.h +90 -0
  950. cuda/cccl/headers/include/cuda/std/__memory/align.h +67 -0
  951. cuda/cccl/headers/include/cuda/std/__memory/allocate_at_least.h +82 -0
  952. cuda/cccl/headers/include/cuda/std/__memory/allocation_guard.h +100 -0
  953. cuda/cccl/headers/include/cuda/std/__memory/allocator.h +327 -0
  954. cuda/cccl/headers/include/cuda/std/__memory/allocator_arg_t.h +85 -0
  955. cuda/cccl/headers/include/cuda/std/__memory/allocator_destructor.h +59 -0
  956. cuda/cccl/headers/include/cuda/std/__memory/allocator_traits.h +517 -0
  957. cuda/cccl/headers/include/cuda/std/__memory/assume_aligned.h +59 -0
  958. cuda/cccl/headers/include/cuda/std/__memory/builtin_new_allocator.h +87 -0
  959. cuda/cccl/headers/include/cuda/std/__memory/compressed_pair.h +225 -0
  960. cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +241 -0
  961. cuda/cccl/headers/include/cuda/std/__memory/destruct_n.h +91 -0
  962. cuda/cccl/headers/include/cuda/std/__memory/is_sufficiently_aligned.h +51 -0
  963. cuda/cccl/headers/include/cuda/std/__memory/pointer_traits.h +246 -0
  964. cuda/cccl/headers/include/cuda/std/__memory/runtime_assume_aligned.h +62 -0
  965. cuda/cccl/headers/include/cuda/std/__memory/temporary_buffer.h +99 -0
  966. cuda/cccl/headers/include/cuda/std/__memory/uninitialized_algorithms.h +678 -0
  967. cuda/cccl/headers/include/cuda/std/__memory/unique_ptr.h +765 -0
  968. cuda/cccl/headers/include/cuda/std/__memory/uses_allocator.h +59 -0
  969. cuda/cccl/headers/include/cuda/std/__memory/voidify.h +41 -0
  970. cuda/cccl/headers/include/cuda/std/__memory_ +34 -0
  971. cuda/cccl/headers/include/cuda/std/__new/allocate.h +131 -0
  972. cuda/cccl/headers/include/cuda/std/__new/bad_alloc.h +57 -0
  973. cuda/cccl/headers/include/cuda/std/__new/device_new.h +30 -0
  974. cuda/cccl/headers/include/cuda/std/__new/launder.h +53 -0
  975. cuda/cccl/headers/include/cuda/std/__new_ +30 -0
  976. cuda/cccl/headers/include/cuda/std/__numeric/accumulate.h +56 -0
  977. cuda/cccl/headers/include/cuda/std/__numeric/adjacent_difference.h +72 -0
  978. cuda/cccl/headers/include/cuda/std/__numeric/exclusive_scan.h +66 -0
  979. cuda/cccl/headers/include/cuda/std/__numeric/gcd_lcm.h +78 -0
  980. cuda/cccl/headers/include/cuda/std/__numeric/inclusive_scan.h +73 -0
  981. cuda/cccl/headers/include/cuda/std/__numeric/inner_product.h +62 -0
  982. cuda/cccl/headers/include/cuda/std/__numeric/iota.h +42 -0
  983. cuda/cccl/headers/include/cuda/std/__numeric/midpoint.h +97 -0
  984. cuda/cccl/headers/include/cuda/std/__numeric/partial_sum.h +69 -0
  985. cuda/cccl/headers/include/cuda/std/__numeric/reduce.h +60 -0
  986. cuda/cccl/headers/include/cuda/std/__numeric/transform_exclusive_scan.h +51 -0
  987. cuda/cccl/headers/include/cuda/std/__numeric/transform_inclusive_scan.h +65 -0
  988. cuda/cccl/headers/include/cuda/std/__numeric/transform_reduce.h +72 -0
  989. cuda/cccl/headers/include/cuda/std/__optional/bad_optional_access.h +74 -0
  990. cuda/cccl/headers/include/cuda/std/__optional/hash.h +53 -0
  991. cuda/cccl/headers/include/cuda/std/__optional/make_optional.h +61 -0
  992. cuda/cccl/headers/include/cuda/std/__optional/nullopt.h +43 -0
  993. cuda/cccl/headers/include/cuda/std/__optional/optional.h +861 -0
  994. cuda/cccl/headers/include/cuda/std/__optional/optional_base.h +439 -0
  995. cuda/cccl/headers/include/cuda/std/__optional/optional_ref.h +324 -0
  996. cuda/cccl/headers/include/cuda/std/__pstl/cuda/for_each_n.h +97 -0
  997. cuda/cccl/headers/include/cuda/std/__pstl/dispatch.h +123 -0
  998. cuda/cccl/headers/include/cuda/std/__pstl/for_each.h +71 -0
  999. cuda/cccl/headers/include/cuda/std/__pstl/for_each_n.h +68 -0
  1000. cuda/cccl/headers/include/cuda/std/__random/bernoulli_distribution.h +173 -0
  1001. cuda/cccl/headers/include/cuda/std/__random/binomial_distribution.h +254 -0
  1002. cuda/cccl/headers/include/cuda/std/__random/cauchy_distribution.h +192 -0
  1003. cuda/cccl/headers/include/cuda/std/__random/chi_squared_distribution.h +179 -0
  1004. cuda/cccl/headers/include/cuda/std/__random/exponential_distribution.h +187 -0
  1005. cuda/cccl/headers/include/cuda/std/__random/extreme_value_distribution.h +196 -0
  1006. cuda/cccl/headers/include/cuda/std/__random/fisher_f_distribution.h +196 -0
  1007. cuda/cccl/headers/include/cuda/std/__random/gamma_distribution.h +257 -0
  1008. cuda/cccl/headers/include/cuda/std/__random/generate_canonical.h +56 -0
  1009. cuda/cccl/headers/include/cuda/std/__random/geometric_distribution.h +179 -0
  1010. cuda/cccl/headers/include/cuda/std/__random/is_seed_sequence.h +39 -0
  1011. cuda/cccl/headers/include/cuda/std/__random/is_valid.h +70 -0
  1012. cuda/cccl/headers/include/cuda/std/__random/linear_congruential_engine.h +398 -0
  1013. cuda/cccl/headers/include/cuda/std/__random/lognormal_distribution.h +174 -0
  1014. cuda/cccl/headers/include/cuda/std/__random/negative_binomial_distribution.h +212 -0
  1015. cuda/cccl/headers/include/cuda/std/__random/normal_distribution.h +232 -0
  1016. cuda/cccl/headers/include/cuda/std/__random/philox_engine.h +562 -0
  1017. cuda/cccl/headers/include/cuda/std/__random/poisson_distribution.h +338 -0
  1018. cuda/cccl/headers/include/cuda/std/__random/seed_seq.h +204 -0
  1019. cuda/cccl/headers/include/cuda/std/__random/student_t_distribution.h +186 -0
  1020. cuda/cccl/headers/include/cuda/std/__random/uniform_int_distribution.h +341 -0
  1021. cuda/cccl/headers/include/cuda/std/__random/uniform_real_distribution.h +192 -0
  1022. cuda/cccl/headers/include/cuda/std/__random/weibull_distribution.h +189 -0
  1023. cuda/cccl/headers/include/cuda/std/__random_ +47 -0
  1024. cuda/cccl/headers/include/cuda/std/__ranges/access.h +303 -0
  1025. cuda/cccl/headers/include/cuda/std/__ranges/all.h +98 -0
  1026. cuda/cccl/headers/include/cuda/std/__ranges/compressed_movable_box.h +889 -0
  1027. cuda/cccl/headers/include/cuda/std/__ranges/concepts.h +302 -0
  1028. cuda/cccl/headers/include/cuda/std/__ranges/counted.h +90 -0
  1029. cuda/cccl/headers/include/cuda/std/__ranges/dangling.h +54 -0
  1030. cuda/cccl/headers/include/cuda/std/__ranges/data.h +136 -0
  1031. cuda/cccl/headers/include/cuda/std/__ranges/drop_view.h +389 -0
  1032. cuda/cccl/headers/include/cuda/std/__ranges/empty.h +109 -0
  1033. cuda/cccl/headers/include/cuda/std/__ranges/empty_view.h +77 -0
  1034. cuda/cccl/headers/include/cuda/std/__ranges/enable_borrowed_range.h +41 -0
  1035. cuda/cccl/headers/include/cuda/std/__ranges/enable_view.h +78 -0
  1036. cuda/cccl/headers/include/cuda/std/__ranges/from_range.h +36 -0
  1037. cuda/cccl/headers/include/cuda/std/__ranges/iota_view.h +264 -0
  1038. cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +409 -0
  1039. cuda/cccl/headers/include/cuda/std/__ranges/non_propagating_cache.h +210 -0
  1040. cuda/cccl/headers/include/cuda/std/__ranges/owning_view.h +163 -0
  1041. cuda/cccl/headers/include/cuda/std/__ranges/range_adaptor.h +111 -0
  1042. cuda/cccl/headers/include/cuda/std/__ranges/rbegin.h +175 -0
  1043. cuda/cccl/headers/include/cuda/std/__ranges/ref_view.h +121 -0
  1044. cuda/cccl/headers/include/cuda/std/__ranges/rend.h +182 -0
  1045. cuda/cccl/headers/include/cuda/std/__ranges/repeat_view.h +346 -0
  1046. cuda/cccl/headers/include/cuda/std/__ranges/single_view.h +155 -0
  1047. cuda/cccl/headers/include/cuda/std/__ranges/size.h +201 -0
  1048. cuda/cccl/headers/include/cuda/std/__ranges/subrange.h +510 -0
  1049. cuda/cccl/headers/include/cuda/std/__ranges/take_view.h +472 -0
  1050. cuda/cccl/headers/include/cuda/std/__ranges/take_while_view.h +259 -0
  1051. cuda/cccl/headers/include/cuda/std/__ranges/transform_view.h +522 -0
  1052. cuda/cccl/headers/include/cuda/std/__ranges/unwrap_end.h +52 -0
  1053. cuda/cccl/headers/include/cuda/std/__ranges/view_interface.h +183 -0
  1054. cuda/cccl/headers/include/cuda/std/__ranges/views.h +38 -0
  1055. cuda/cccl/headers/include/cuda/std/__semaphore/atomic_semaphore.h +234 -0
  1056. cuda/cccl/headers/include/cuda/std/__semaphore/counting_semaphore.h +51 -0
  1057. cuda/cccl/headers/include/cuda/std/__string/char_traits.h +190 -0
  1058. cuda/cccl/headers/include/cuda/std/__string/constexpr_c_functions.h +580 -0
  1059. cuda/cccl/headers/include/cuda/std/__string/helper_functions.h +296 -0
  1060. cuda/cccl/headers/include/cuda/std/__string/string_view.h +244 -0
  1061. cuda/cccl/headers/include/cuda/std/__string_ +29 -0
  1062. cuda/cccl/headers/include/cuda/std/__system_error/errc.h +51 -0
  1063. cuda/cccl/headers/include/cuda/std/__system_error_ +26 -0
  1064. cuda/cccl/headers/include/cuda/std/__thread/threading_support.h +106 -0
  1065. cuda/cccl/headers/include/cuda/std/__thread/threading_support_cuda.h +47 -0
  1066. cuda/cccl/headers/include/cuda/std/__thread/threading_support_external.h +41 -0
  1067. cuda/cccl/headers/include/cuda/std/__thread/threading_support_pthread.h +143 -0
  1068. cuda/cccl/headers/include/cuda/std/__thread/threading_support_win32.h +87 -0
  1069. cuda/cccl/headers/include/cuda/std/__tuple_dir/apply.h +82 -0
  1070. cuda/cccl/headers/include/cuda/std/__tuple_dir/get.h +122 -0
  1071. cuda/cccl/headers/include/cuda/std/__tuple_dir/ignore.h +51 -0
  1072. cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +120 -0
  1073. cuda/cccl/headers/include/cuda/std/__tuple_dir/sfinae_helpers.h +100 -0
  1074. cuda/cccl/headers/include/cuda/std/__tuple_dir/structured_bindings.h +212 -0
  1075. cuda/cccl/headers/include/cuda/std/__tuple_dir/tie.h +55 -0
  1076. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple.h +457 -0
  1077. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_cat.h +158 -0
  1078. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_constraints.h +286 -0
  1079. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_element.h +77 -0
  1080. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_indices.h +44 -0
  1081. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_leaf.h +452 -0
  1082. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +83 -0
  1083. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +68 -0
  1084. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_size.h +79 -0
  1085. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_types.h +35 -0
  1086. cuda/cccl/headers/include/cuda/std/__tuple_dir/vector_types.h +290 -0
  1087. cuda/cccl/headers/include/cuda/std/__type_traits/add_const.h +40 -0
  1088. cuda/cccl/headers/include/cuda/std/__type_traits/add_cv.h +40 -0
  1089. cuda/cccl/headers/include/cuda/std/__type_traits/add_lvalue_reference.h +62 -0
  1090. cuda/cccl/headers/include/cuda/std/__type_traits/add_pointer.h +65 -0
  1091. cuda/cccl/headers/include/cuda/std/__type_traits/add_rvalue_reference.h +62 -0
  1092. cuda/cccl/headers/include/cuda/std/__type_traits/add_volatile.h +40 -0
  1093. cuda/cccl/headers/include/cuda/std/__type_traits/aligned_storage.h +155 -0
  1094. cuda/cccl/headers/include/cuda/std/__type_traits/aligned_union.h +62 -0
  1095. cuda/cccl/headers/include/cuda/std/__type_traits/alignment_of.h +41 -0
  1096. cuda/cccl/headers/include/cuda/std/__type_traits/always_false.h +35 -0
  1097. cuda/cccl/headers/include/cuda/std/__type_traits/can_extract_key.h +68 -0
  1098. cuda/cccl/headers/include/cuda/std/__type_traits/common_reference.h +262 -0
  1099. cuda/cccl/headers/include/cuda/std/__type_traits/common_type.h +173 -0
  1100. cuda/cccl/headers/include/cuda/std/__type_traits/conditional.h +65 -0
  1101. cuda/cccl/headers/include/cuda/std/__type_traits/conjunction.h +67 -0
  1102. cuda/cccl/headers/include/cuda/std/__type_traits/copy_cv.h +50 -0
  1103. cuda/cccl/headers/include/cuda/std/__type_traits/copy_cvref.h +148 -0
  1104. cuda/cccl/headers/include/cuda/std/__type_traits/decay.h +83 -0
  1105. cuda/cccl/headers/include/cuda/std/__type_traits/dependent_type.h +35 -0
  1106. cuda/cccl/headers/include/cuda/std/__type_traits/disjunction.h +77 -0
  1107. cuda/cccl/headers/include/cuda/std/__type_traits/enable_if.h +43 -0
  1108. cuda/cccl/headers/include/cuda/std/__type_traits/extent.h +68 -0
  1109. cuda/cccl/headers/include/cuda/std/__type_traits/fold.h +47 -0
  1110. cuda/cccl/headers/include/cuda/std/__type_traits/has_unique_object_representation.h +46 -0
  1111. cuda/cccl/headers/include/cuda/std/__type_traits/has_virtual_destructor.h +42 -0
  1112. cuda/cccl/headers/include/cuda/std/__type_traits/integral_constant.h +62 -0
  1113. cuda/cccl/headers/include/cuda/std/__type_traits/is_abstract.h +42 -0
  1114. cuda/cccl/headers/include/cuda/std/__type_traits/is_aggregate.h +42 -0
  1115. cuda/cccl/headers/include/cuda/std/__type_traits/is_allocator.h +46 -0
  1116. cuda/cccl/headers/include/cuda/std/__type_traits/is_arithmetic.h +42 -0
  1117. cuda/cccl/headers/include/cuda/std/__type_traits/is_array.h +62 -0
  1118. cuda/cccl/headers/include/cuda/std/__type_traits/is_assignable.h +78 -0
  1119. cuda/cccl/headers/include/cuda/std/__type_traits/is_base_of.h +42 -0
  1120. cuda/cccl/headers/include/cuda/std/__type_traits/is_bounded_array.h +44 -0
  1121. cuda/cccl/headers/include/cuda/std/__type_traits/is_callable.h +60 -0
  1122. cuda/cccl/headers/include/cuda/std/__type_traits/is_char_like_type.h +38 -0
  1123. cuda/cccl/headers/include/cuda/std/__type_traits/is_class.h +42 -0
  1124. cuda/cccl/headers/include/cuda/std/__type_traits/is_comparable.h +78 -0
  1125. cuda/cccl/headers/include/cuda/std/__type_traits/is_compound.h +58 -0
  1126. cuda/cccl/headers/include/cuda/std/__type_traits/is_const.h +56 -0
  1127. cuda/cccl/headers/include/cuda/std/__type_traits/is_constant_evaluated.h +49 -0
  1128. cuda/cccl/headers/include/cuda/std/__type_traits/is_constructible.h +174 -0
  1129. cuda/cccl/headers/include/cuda/std/__type_traits/is_convertible.h +211 -0
  1130. cuda/cccl/headers/include/cuda/std/__type_traits/is_copy_assignable.h +43 -0
  1131. cuda/cccl/headers/include/cuda/std/__type_traits/is_copy_constructible.h +43 -0
  1132. cuda/cccl/headers/include/cuda/std/__type_traits/is_core_convertible.h +47 -0
  1133. cuda/cccl/headers/include/cuda/std/__type_traits/is_corresponding_member.h +42 -0
  1134. cuda/cccl/headers/include/cuda/std/__type_traits/is_default_constructible.h +40 -0
  1135. cuda/cccl/headers/include/cuda/std/__type_traits/is_destructible.h +115 -0
  1136. cuda/cccl/headers/include/cuda/std/__type_traits/is_empty.h +42 -0
  1137. cuda/cccl/headers/include/cuda/std/__type_traits/is_enum.h +42 -0
  1138. cuda/cccl/headers/include/cuda/std/__type_traits/is_execution_policy.h +63 -0
  1139. cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_arithmetic.h +38 -0
  1140. cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_floating_point.h +79 -0
  1141. cuda/cccl/headers/include/cuda/std/__type_traits/is_final.h +42 -0
  1142. cuda/cccl/headers/include/cuda/std/__type_traits/is_floating_point.h +53 -0
  1143. cuda/cccl/headers/include/cuda/std/__type_traits/is_fully_bounded_array.h +47 -0
  1144. cuda/cccl/headers/include/cuda/std/__type_traits/is_function.h +61 -0
  1145. cuda/cccl/headers/include/cuda/std/__type_traits/is_fundamental.h +56 -0
  1146. cuda/cccl/headers/include/cuda/std/__type_traits/is_implicitly_default_constructible.h +57 -0
  1147. cuda/cccl/headers/include/cuda/std/__type_traits/is_integer.h +45 -0
  1148. cuda/cccl/headers/include/cuda/std/__type_traits/is_integral.h +123 -0
  1149. cuda/cccl/headers/include/cuda/std/__type_traits/is_layout_compatible.h +45 -0
  1150. cuda/cccl/headers/include/cuda/std/__type_traits/is_literal_type.h +42 -0
  1151. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_function_pointer.h +79 -0
  1152. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_object_pointer.h +57 -0
  1153. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_pointer.h +57 -0
  1154. cuda/cccl/headers/include/cuda/std/__type_traits/is_move_assignable.h +43 -0
  1155. cuda/cccl/headers/include/cuda/std/__type_traits/is_move_constructible.h +42 -0
  1156. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_assignable.h +70 -0
  1157. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_constructible.h +72 -0
  1158. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_convertible.h +59 -0
  1159. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_copy_assignable.h +60 -0
  1160. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_copy_constructible.h +43 -0
  1161. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_default_constructible.h +54 -0
  1162. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_destructible.h +82 -0
  1163. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_move_assignable.h +60 -0
  1164. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_move_constructible.h +42 -0
  1165. cuda/cccl/headers/include/cuda/std/__type_traits/is_null_pointer.h +43 -0
  1166. cuda/cccl/headers/include/cuda/std/__type_traits/is_object.h +57 -0
  1167. cuda/cccl/headers/include/cuda/std/__type_traits/is_one_of.h +37 -0
  1168. cuda/cccl/headers/include/cuda/std/__type_traits/is_pod.h +42 -0
  1169. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer.h +60 -0
  1170. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer_interconvertible_base_of.h +84 -0
  1171. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer_interconvertible_with_class.h +42 -0
  1172. cuda/cccl/headers/include/cuda/std/__type_traits/is_polymorphic.h +42 -0
  1173. cuda/cccl/headers/include/cuda/std/__type_traits/is_primary_template.h +121 -0
  1174. cuda/cccl/headers/include/cuda/std/__type_traits/is_reference.h +95 -0
  1175. cuda/cccl/headers/include/cuda/std/__type_traits/is_referenceable.h +55 -0
  1176. cuda/cccl/headers/include/cuda/std/__type_traits/is_same.h +88 -0
  1177. cuda/cccl/headers/include/cuda/std/__type_traits/is_scalar.h +60 -0
  1178. cuda/cccl/headers/include/cuda/std/__type_traits/is_scoped_enum.h +49 -0
  1179. cuda/cccl/headers/include/cuda/std/__type_traits/is_signed.h +65 -0
  1180. cuda/cccl/headers/include/cuda/std/__type_traits/is_signed_integer.h +59 -0
  1181. cuda/cccl/headers/include/cuda/std/__type_traits/is_standard_layout.h +42 -0
  1182. cuda/cccl/headers/include/cuda/std/__type_traits/is_swappable.h +200 -0
  1183. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivial.h +42 -0
  1184. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_assignable.h +43 -0
  1185. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_constructible.h +43 -0
  1186. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_assignable.h +46 -0
  1187. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_constructible.h +45 -0
  1188. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copyable.h +42 -0
  1189. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_default_constructible.h +42 -0
  1190. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_destructible.h +58 -0
  1191. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_assignable.h +45 -0
  1192. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_constructible.h +44 -0
  1193. cuda/cccl/headers/include/cuda/std/__type_traits/is_unbounded_array.h +43 -0
  1194. cuda/cccl/headers/include/cuda/std/__type_traits/is_union.h +42 -0
  1195. cuda/cccl/headers/include/cuda/std/__type_traits/is_unsigned.h +66 -0
  1196. cuda/cccl/headers/include/cuda/std/__type_traits/is_unsigned_integer.h +59 -0
  1197. cuda/cccl/headers/include/cuda/std/__type_traits/is_valid_expansion.h +41 -0
  1198. cuda/cccl/headers/include/cuda/std/__type_traits/is_void.h +55 -0
  1199. cuda/cccl/headers/include/cuda/std/__type_traits/is_volatile.h +56 -0
  1200. cuda/cccl/headers/include/cuda/std/__type_traits/lazy.h +35 -0
  1201. cuda/cccl/headers/include/cuda/std/__type_traits/make_const_lvalue_ref.h +36 -0
  1202. cuda/cccl/headers/include/cuda/std/__type_traits/make_nbit_int.h +107 -0
  1203. cuda/cccl/headers/include/cuda/std/__type_traits/make_signed.h +140 -0
  1204. cuda/cccl/headers/include/cuda/std/__type_traits/make_unsigned.h +151 -0
  1205. cuda/cccl/headers/include/cuda/std/__type_traits/maybe_const.h +36 -0
  1206. cuda/cccl/headers/include/cuda/std/__type_traits/nat.h +39 -0
  1207. cuda/cccl/headers/include/cuda/std/__type_traits/negation.h +44 -0
  1208. cuda/cccl/headers/include/cuda/std/__type_traits/num_bits.h +122 -0
  1209. cuda/cccl/headers/include/cuda/std/__type_traits/promote.h +163 -0
  1210. cuda/cccl/headers/include/cuda/std/__type_traits/rank.h +60 -0
  1211. cuda/cccl/headers/include/cuda/std/__type_traits/reference_constructs_from_temporary.h +57 -0
  1212. cuda/cccl/headers/include/cuda/std/__type_traits/reference_converts_from_temporary.h +56 -0
  1213. cuda/cccl/headers/include/cuda/std/__type_traits/remove_all_extents.h +66 -0
  1214. cuda/cccl/headers/include/cuda/std/__type_traits/remove_const.h +59 -0
  1215. cuda/cccl/headers/include/cuda/std/__type_traits/remove_const_ref.h +37 -0
  1216. cuda/cccl/headers/include/cuda/std/__type_traits/remove_cv.h +57 -0
  1217. cuda/cccl/headers/include/cuda/std/__type_traits/remove_cvref.h +57 -0
  1218. cuda/cccl/headers/include/cuda/std/__type_traits/remove_extent.h +65 -0
  1219. cuda/cccl/headers/include/cuda/std/__type_traits/remove_pointer.h +73 -0
  1220. cuda/cccl/headers/include/cuda/std/__type_traits/remove_reference.h +72 -0
  1221. cuda/cccl/headers/include/cuda/std/__type_traits/remove_volatile.h +58 -0
  1222. cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +47 -0
  1223. cuda/cccl/headers/include/cuda/std/__type_traits/type_identity.h +40 -0
  1224. cuda/cccl/headers/include/cuda/std/__type_traits/type_list.h +1067 -0
  1225. cuda/cccl/headers/include/cuda/std/__type_traits/type_set.h +131 -0
  1226. cuda/cccl/headers/include/cuda/std/__type_traits/underlying_type.h +52 -0
  1227. cuda/cccl/headers/include/cuda/std/__type_traits/void_t.h +34 -0
  1228. cuda/cccl/headers/include/cuda/std/__utility/as_const.h +73 -0
  1229. cuda/cccl/headers/include/cuda/std/__utility/auto_cast.h +34 -0
  1230. cuda/cccl/headers/include/cuda/std/__utility/cmp.h +114 -0
  1231. cuda/cccl/headers/include/cuda/std/__utility/convert_to_integral.h +101 -0
  1232. cuda/cccl/headers/include/cuda/std/__utility/ctad_support.h +27 -0
  1233. cuda/cccl/headers/include/cuda/std/__utility/declval.h +76 -0
  1234. cuda/cccl/headers/include/cuda/std/__utility/delegate_constructors.h +51 -0
  1235. cuda/cccl/headers/include/cuda/std/__utility/exception_guard.h +162 -0
  1236. cuda/cccl/headers/include/cuda/std/__utility/exchange.h +46 -0
  1237. cuda/cccl/headers/include/cuda/std/__utility/forward.h +82 -0
  1238. cuda/cccl/headers/include/cuda/std/__utility/forward_like.h +82 -0
  1239. cuda/cccl/headers/include/cuda/std/__utility/in_place.h +86 -0
  1240. cuda/cccl/headers/include/cuda/std/__utility/integer_sequence.h +249 -0
  1241. cuda/cccl/headers/include/cuda/std/__utility/monostate.h +99 -0
  1242. cuda/cccl/headers/include/cuda/std/__utility/move.h +126 -0
  1243. cuda/cccl/headers/include/cuda/std/__utility/pair.h +791 -0
  1244. cuda/cccl/headers/include/cuda/std/__utility/piecewise_construct.h +37 -0
  1245. cuda/cccl/headers/include/cuda/std/__utility/pod_tuple.h +425 -0
  1246. cuda/cccl/headers/include/cuda/std/__utility/priority_tag.h +40 -0
  1247. cuda/cccl/headers/include/cuda/std/__utility/rel_ops.h +61 -0
  1248. cuda/cccl/headers/include/cuda/std/__utility/swap.h +64 -0
  1249. cuda/cccl/headers/include/cuda/std/__utility/to_underlying.h +40 -0
  1250. cuda/cccl/headers/include/cuda/std/__utility/typeid.h +421 -0
  1251. cuda/cccl/headers/include/cuda/std/__utility/undefined.h +34 -0
  1252. cuda/cccl/headers/include/cuda/std/__utility/unreachable.h +37 -0
  1253. cuda/cccl/headers/include/cuda/std/__variant/bad_variant_access.h +74 -0
  1254. cuda/cccl/headers/include/cuda/std/__variant/comparison.h +207 -0
  1255. cuda/cccl/headers/include/cuda/std/__variant/get.h +192 -0
  1256. cuda/cccl/headers/include/cuda/std/__variant/hash.h +82 -0
  1257. cuda/cccl/headers/include/cuda/std/__variant/sfinae_helpers.h +89 -0
  1258. cuda/cccl/headers/include/cuda/std/__variant/variant.h +250 -0
  1259. cuda/cccl/headers/include/cuda/std/__variant/variant_access.h +70 -0
  1260. cuda/cccl/headers/include/cuda/std/__variant/variant_base.h +683 -0
  1261. cuda/cccl/headers/include/cuda/std/__variant/variant_constraints.h +135 -0
  1262. cuda/cccl/headers/include/cuda/std/__variant/variant_match.h +126 -0
  1263. cuda/cccl/headers/include/cuda/std/__variant/variant_traits.h +184 -0
  1264. cuda/cccl/headers/include/cuda/std/__variant/variant_visit.h +225 -0
  1265. cuda/cccl/headers/include/cuda/std/__variant/visit.h +148 -0
  1266. cuda/cccl/headers/include/cuda/std/algorithm +138 -0
  1267. cuda/cccl/headers/include/cuda/std/array +519 -0
  1268. cuda/cccl/headers/include/cuda/std/atomic +810 -0
  1269. cuda/cccl/headers/include/cuda/std/barrier +42 -0
  1270. cuda/cccl/headers/include/cuda/std/bit +35 -0
  1271. cuda/cccl/headers/include/cuda/std/bitset +986 -0
  1272. cuda/cccl/headers/include/cuda/std/cassert +28 -0
  1273. cuda/cccl/headers/include/cuda/std/ccomplex +15 -0
  1274. cuda/cccl/headers/include/cuda/std/cfloat +59 -0
  1275. cuda/cccl/headers/include/cuda/std/charconv +31 -0
  1276. cuda/cccl/headers/include/cuda/std/chrono +26 -0
  1277. cuda/cccl/headers/include/cuda/std/climits +61 -0
  1278. cuda/cccl/headers/include/cuda/std/cmath +87 -0
  1279. cuda/cccl/headers/include/cuda/std/complex +50 -0
  1280. cuda/cccl/headers/include/cuda/std/concepts +48 -0
  1281. cuda/cccl/headers/include/cuda/std/cstddef +28 -0
  1282. cuda/cccl/headers/include/cuda/std/cstdint +178 -0
  1283. cuda/cccl/headers/include/cuda/std/cstdlib +31 -0
  1284. cuda/cccl/headers/include/cuda/std/cstring +110 -0
  1285. cuda/cccl/headers/include/cuda/std/ctime +155 -0
  1286. cuda/cccl/headers/include/cuda/std/detail/__config +22 -0
  1287. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/chrono +2506 -0
  1288. cuda/cccl/headers/include/cuda/std/execution +29 -0
  1289. cuda/cccl/headers/include/cuda/std/expected +30 -0
  1290. cuda/cccl/headers/include/cuda/std/functional +56 -0
  1291. cuda/cccl/headers/include/cuda/std/initializer_list +44 -0
  1292. cuda/cccl/headers/include/cuda/std/inplace_vector +2171 -0
  1293. cuda/cccl/headers/include/cuda/std/iterator +70 -0
  1294. cuda/cccl/headers/include/cuda/std/latch +34 -0
  1295. cuda/cccl/headers/include/cuda/std/limits +28 -0
  1296. cuda/cccl/headers/include/cuda/std/linalg +30 -0
  1297. cuda/cccl/headers/include/cuda/std/mdspan +38 -0
  1298. cuda/cccl/headers/include/cuda/std/memory +40 -0
  1299. cuda/cccl/headers/include/cuda/std/numbers +344 -0
  1300. cuda/cccl/headers/include/cuda/std/numeric +41 -0
  1301. cuda/cccl/headers/include/cuda/std/optional +31 -0
  1302. cuda/cccl/headers/include/cuda/std/ranges +70 -0
  1303. cuda/cccl/headers/include/cuda/std/ratio +416 -0
  1304. cuda/cccl/headers/include/cuda/std/semaphore +31 -0
  1305. cuda/cccl/headers/include/cuda/std/source_location +107 -0
  1306. cuda/cccl/headers/include/cuda/std/span +599 -0
  1307. cuda/cccl/headers/include/cuda/std/string_view +924 -0
  1308. cuda/cccl/headers/include/cuda/std/tuple +43 -0
  1309. cuda/cccl/headers/include/cuda/std/type_traits +176 -0
  1310. cuda/cccl/headers/include/cuda/std/utility +70 -0
  1311. cuda/cccl/headers/include/cuda/std/variant +32 -0
  1312. cuda/cccl/headers/include/cuda/std/version +240 -0
  1313. cuda/cccl/headers/include/cuda/stream +32 -0
  1314. cuda/cccl/headers/include/cuda/stream_ref +59 -0
  1315. cuda/cccl/headers/include/cuda/tma +25 -0
  1316. cuda/cccl/headers/include/cuda/type_traits +27 -0
  1317. cuda/cccl/headers/include/cuda/utility +28 -0
  1318. cuda/cccl/headers/include/cuda/version +16 -0
  1319. cuda/cccl/headers/include/cuda/warp +28 -0
  1320. cuda/cccl/headers/include/cuda/work_stealing +26 -0
  1321. cuda/cccl/headers/include/nv/detail/__preprocessor +169 -0
  1322. cuda/cccl/headers/include/nv/detail/__target_macros +739 -0
  1323. cuda/cccl/headers/include/nv/target +241 -0
  1324. cuda/cccl/headers/include/thrust/addressof.h +22 -0
  1325. cuda/cccl/headers/include/thrust/adjacent_difference.h +254 -0
  1326. cuda/cccl/headers/include/thrust/advance.h +60 -0
  1327. cuda/cccl/headers/include/thrust/allocate_unique.h +301 -0
  1328. cuda/cccl/headers/include/thrust/binary_search.h +1911 -0
  1329. cuda/cccl/headers/include/thrust/complex.h +859 -0
  1330. cuda/cccl/headers/include/thrust/copy.h +506 -0
  1331. cuda/cccl/headers/include/thrust/count.h +245 -0
  1332. cuda/cccl/headers/include/thrust/detail/adjacent_difference.inl +108 -0
  1333. cuda/cccl/headers/include/thrust/detail/alignment.h +81 -0
  1334. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +629 -0
  1335. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +193 -0
  1336. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +95 -0
  1337. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +79 -0
  1338. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +76 -0
  1339. cuda/cccl/headers/include/thrust/detail/allocator/no_throw_allocator.h +74 -0
  1340. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +113 -0
  1341. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +114 -0
  1342. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.h +77 -0
  1343. cuda/cccl/headers/include/thrust/detail/allocator_aware_execution_policy.h +96 -0
  1344. cuda/cccl/headers/include/thrust/detail/binary_search.inl +537 -0
  1345. cuda/cccl/headers/include/thrust/detail/caching_allocator.h +47 -0
  1346. cuda/cccl/headers/include/thrust/detail/complex/arithmetic.h +210 -0
  1347. cuda/cccl/headers/include/thrust/detail/complex/catrig.h +877 -0
  1348. cuda/cccl/headers/include/thrust/detail/complex/catrigf.h +591 -0
  1349. cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +234 -0
  1350. cuda/cccl/headers/include/thrust/detail/complex/ccoshf.h +162 -0
  1351. cuda/cccl/headers/include/thrust/detail/complex/cexp.h +194 -0
  1352. cuda/cccl/headers/include/thrust/detail/complex/cexpf.h +170 -0
  1353. cuda/cccl/headers/include/thrust/detail/complex/clog.h +222 -0
  1354. cuda/cccl/headers/include/thrust/detail/complex/clogf.h +209 -0
  1355. cuda/cccl/headers/include/thrust/detail/complex/complex.inl +263 -0
  1356. cuda/cccl/headers/include/thrust/detail/complex/cpow.h +53 -0
  1357. cuda/cccl/headers/include/thrust/detail/complex/cproj.h +75 -0
  1358. cuda/cccl/headers/include/thrust/detail/complex/csinh.h +228 -0
  1359. cuda/cccl/headers/include/thrust/detail/complex/csinhf.h +169 -0
  1360. cuda/cccl/headers/include/thrust/detail/complex/csqrt.h +178 -0
  1361. cuda/cccl/headers/include/thrust/detail/complex/csqrtf.h +174 -0
  1362. cuda/cccl/headers/include/thrust/detail/complex/ctanh.h +212 -0
  1363. cuda/cccl/headers/include/thrust/detail/complex/ctanhf.h +136 -0
  1364. cuda/cccl/headers/include/thrust/detail/complex/math_private.h +130 -0
  1365. cuda/cccl/headers/include/thrust/detail/complex/stream.h +73 -0
  1366. cuda/cccl/headers/include/thrust/detail/config/compiler.h +38 -0
  1367. cuda/cccl/headers/include/thrust/detail/config/config.h +43 -0
  1368. cuda/cccl/headers/include/thrust/detail/config/cpp_dialect.h +78 -0
  1369. cuda/cccl/headers/include/thrust/detail/config/device_system.h +57 -0
  1370. cuda/cccl/headers/include/thrust/detail/config/host_system.h +50 -0
  1371. cuda/cccl/headers/include/thrust/detail/config/memory_resource.h +41 -0
  1372. cuda/cccl/headers/include/thrust/detail/config/namespace.h +164 -0
  1373. cuda/cccl/headers/include/thrust/detail/config/simple_defines.h +48 -0
  1374. cuda/cccl/headers/include/thrust/detail/config.h +36 -0
  1375. cuda/cccl/headers/include/thrust/detail/contiguous_storage.h +227 -0
  1376. cuda/cccl/headers/include/thrust/detail/contiguous_storage.inl +272 -0
  1377. cuda/cccl/headers/include/thrust/detail/copy.h +70 -0
  1378. cuda/cccl/headers/include/thrust/detail/copy.inl +146 -0
  1379. cuda/cccl/headers/include/thrust/detail/copy_if.h +62 -0
  1380. cuda/cccl/headers/include/thrust/detail/copy_if.inl +114 -0
  1381. cuda/cccl/headers/include/thrust/detail/count.h +55 -0
  1382. cuda/cccl/headers/include/thrust/detail/count.inl +101 -0
  1383. cuda/cccl/headers/include/thrust/detail/device_ptr.inl +48 -0
  1384. cuda/cccl/headers/include/thrust/detail/equal.inl +105 -0
  1385. cuda/cccl/headers/include/thrust/detail/event_error.h +160 -0
  1386. cuda/cccl/headers/include/thrust/detail/execute_with_allocator.h +79 -0
  1387. cuda/cccl/headers/include/thrust/detail/execute_with_allocator_fwd.h +61 -0
  1388. cuda/cccl/headers/include/thrust/detail/execution_policy.h +120 -0
  1389. cuda/cccl/headers/include/thrust/detail/extrema.inl +196 -0
  1390. cuda/cccl/headers/include/thrust/detail/fill.inl +97 -0
  1391. cuda/cccl/headers/include/thrust/detail/find.inl +125 -0
  1392. cuda/cccl/headers/include/thrust/detail/for_each.inl +96 -0
  1393. cuda/cccl/headers/include/thrust/detail/function.h +49 -0
  1394. cuda/cccl/headers/include/thrust/detail/functional/actor.h +213 -0
  1395. cuda/cccl/headers/include/thrust/detail/functional/operators.h +384 -0
  1396. cuda/cccl/headers/include/thrust/detail/gather.inl +185 -0
  1397. cuda/cccl/headers/include/thrust/detail/generate.inl +97 -0
  1398. cuda/cccl/headers/include/thrust/detail/get_iterator_value.h +60 -0
  1399. cuda/cccl/headers/include/thrust/detail/inner_product.inl +130 -0
  1400. cuda/cccl/headers/include/thrust/detail/internal_functional.h +335 -0
  1401. cuda/cccl/headers/include/thrust/detail/logical.inl +125 -0
  1402. cuda/cccl/headers/include/thrust/detail/malloc_and_free.h +89 -0
  1403. cuda/cccl/headers/include/thrust/detail/malloc_and_free_fwd.h +45 -0
  1404. cuda/cccl/headers/include/thrust/detail/memory_algorithms.h +208 -0
  1405. cuda/cccl/headers/include/thrust/detail/merge.inl +288 -0
  1406. cuda/cccl/headers/include/thrust/detail/mismatch.inl +107 -0
  1407. cuda/cccl/headers/include/thrust/detail/nvtx_policy.h +41 -0
  1408. cuda/cccl/headers/include/thrust/detail/overlapped_copy.h +120 -0
  1409. cuda/cccl/headers/include/thrust/detail/partition.inl +390 -0
  1410. cuda/cccl/headers/include/thrust/detail/pointer.h +313 -0
  1411. cuda/cccl/headers/include/thrust/detail/preprocessor.h +652 -0
  1412. cuda/cccl/headers/include/thrust/detail/random_bijection.h +177 -0
  1413. cuda/cccl/headers/include/thrust/detail/range/head_flags.h +114 -0
  1414. cuda/cccl/headers/include/thrust/detail/range/tail_flags.h +128 -0
  1415. cuda/cccl/headers/include/thrust/detail/raw_pointer_cast.h +52 -0
  1416. cuda/cccl/headers/include/thrust/detail/raw_reference_cast.h +191 -0
  1417. cuda/cccl/headers/include/thrust/detail/reduce.inl +396 -0
  1418. cuda/cccl/headers/include/thrust/detail/reference.h +521 -0
  1419. cuda/cccl/headers/include/thrust/detail/reference_forward_declaration.h +35 -0
  1420. cuda/cccl/headers/include/thrust/detail/remove.inl +225 -0
  1421. cuda/cccl/headers/include/thrust/detail/replace.inl +243 -0
  1422. cuda/cccl/headers/include/thrust/detail/reverse.inl +100 -0
  1423. cuda/cccl/headers/include/thrust/detail/scan.inl +536 -0
  1424. cuda/cccl/headers/include/thrust/detail/scatter.inl +169 -0
  1425. cuda/cccl/headers/include/thrust/detail/seq.h +66 -0
  1426. cuda/cccl/headers/include/thrust/detail/sequence.inl +121 -0
  1427. cuda/cccl/headers/include/thrust/detail/set_operations.inl +993 -0
  1428. cuda/cccl/headers/include/thrust/detail/shuffle.inl +86 -0
  1429. cuda/cccl/headers/include/thrust/detail/sort.inl +385 -0
  1430. cuda/cccl/headers/include/thrust/detail/static_assert.h +56 -0
  1431. cuda/cccl/headers/include/thrust/detail/static_map.h +164 -0
  1432. cuda/cccl/headers/include/thrust/detail/swap_ranges.inl +77 -0
  1433. cuda/cccl/headers/include/thrust/detail/tabulate.inl +74 -0
  1434. cuda/cccl/headers/include/thrust/detail/temporary_array.h +150 -0
  1435. cuda/cccl/headers/include/thrust/detail/temporary_array.inl +121 -0
  1436. cuda/cccl/headers/include/thrust/detail/temporary_buffer.h +93 -0
  1437. cuda/cccl/headers/include/thrust/detail/transform_reduce.inl +81 -0
  1438. cuda/cccl/headers/include/thrust/detail/transform_scan.inl +173 -0
  1439. cuda/cccl/headers/include/thrust/detail/trivial_sequence.h +128 -0
  1440. cuda/cccl/headers/include/thrust/detail/tuple_meta_transform.h +59 -0
  1441. cuda/cccl/headers/include/thrust/detail/type_deduction.h +61 -0
  1442. cuda/cccl/headers/include/thrust/detail/type_traits/has_member_function.h +50 -0
  1443. cuda/cccl/headers/include/thrust/detail/type_traits/has_nested_type.h +43 -0
  1444. cuda/cccl/headers/include/thrust/detail/type_traits/is_call_possible.h +162 -0
  1445. cuda/cccl/headers/include/thrust/detail/type_traits/is_commutative.h +67 -0
  1446. cuda/cccl/headers/include/thrust/detail/type_traits/is_metafunction_defined.h +39 -0
  1447. cuda/cccl/headers/include/thrust/detail/type_traits/is_thrust_pointer.h +55 -0
  1448. cuda/cccl/headers/include/thrust/detail/type_traits/iterator/is_output_iterator.h +48 -0
  1449. cuda/cccl/headers/include/thrust/detail/type_traits/minimum_type.h +91 -0
  1450. cuda/cccl/headers/include/thrust/detail/type_traits/pointer_traits.h +332 -0
  1451. cuda/cccl/headers/include/thrust/detail/type_traits.h +143 -0
  1452. cuda/cccl/headers/include/thrust/detail/uninitialized_copy.inl +102 -0
  1453. cuda/cccl/headers/include/thrust/detail/uninitialized_fill.inl +97 -0
  1454. cuda/cccl/headers/include/thrust/detail/unique.inl +391 -0
  1455. cuda/cccl/headers/include/thrust/detail/use_default.h +34 -0
  1456. cuda/cccl/headers/include/thrust/detail/vector_base.h +613 -0
  1457. cuda/cccl/headers/include/thrust/detail/vector_base.inl +1216 -0
  1458. cuda/cccl/headers/include/thrust/device_allocator.h +134 -0
  1459. cuda/cccl/headers/include/thrust/device_delete.h +74 -0
  1460. cuda/cccl/headers/include/thrust/device_free.h +85 -0
  1461. cuda/cccl/headers/include/thrust/device_make_unique.h +56 -0
  1462. cuda/cccl/headers/include/thrust/device_malloc.h +84 -0
  1463. cuda/cccl/headers/include/thrust/device_malloc_allocator.h +190 -0
  1464. cuda/cccl/headers/include/thrust/device_new.h +112 -0
  1465. cuda/cccl/headers/include/thrust/device_new_allocator.h +179 -0
  1466. cuda/cccl/headers/include/thrust/device_ptr.h +196 -0
  1467. cuda/cccl/headers/include/thrust/device_reference.h +983 -0
  1468. cuda/cccl/headers/include/thrust/device_vector.h +576 -0
  1469. cuda/cccl/headers/include/thrust/distance.h +44 -0
  1470. cuda/cccl/headers/include/thrust/equal.h +247 -0
  1471. cuda/cccl/headers/include/thrust/execution_policy.h +252 -0
  1472. cuda/cccl/headers/include/thrust/extrema.h +657 -0
  1473. cuda/cccl/headers/include/thrust/fill.h +200 -0
  1474. cuda/cccl/headers/include/thrust/find.h +382 -0
  1475. cuda/cccl/headers/include/thrust/for_each.h +261 -0
  1476. cuda/cccl/headers/include/thrust/functional.h +399 -0
  1477. cuda/cccl/headers/include/thrust/gather.h +464 -0
  1478. cuda/cccl/headers/include/thrust/generate.h +193 -0
  1479. cuda/cccl/headers/include/thrust/host_vector.h +576 -0
  1480. cuda/cccl/headers/include/thrust/inner_product.h +264 -0
  1481. cuda/cccl/headers/include/thrust/iterator/constant_iterator.h +221 -0
  1482. cuda/cccl/headers/include/thrust/iterator/counting_iterator.h +338 -0
  1483. cuda/cccl/headers/include/thrust/iterator/detail/any_assign.h +48 -0
  1484. cuda/cccl/headers/include/thrust/iterator/detail/any_system_tag.h +43 -0
  1485. cuda/cccl/headers/include/thrust/iterator/detail/device_system_tag.h +43 -0
  1486. cuda/cccl/headers/include/thrust/iterator/detail/host_system_tag.h +42 -0
  1487. cuda/cccl/headers/include/thrust/iterator/detail/iterator_adaptor_base.h +83 -0
  1488. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_system.h +60 -0
  1489. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_traversal.h +65 -0
  1490. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h +56 -0
  1491. cuda/cccl/headers/include/thrust/iterator/detail/iterator_facade_category.h +184 -0
  1492. cuda/cccl/headers/include/thrust/iterator/detail/minimum_system.h +55 -0
  1493. cuda/cccl/headers/include/thrust/iterator/detail/normal_iterator.h +69 -0
  1494. cuda/cccl/headers/include/thrust/iterator/detail/retag.h +104 -0
  1495. cuda/cccl/headers/include/thrust/iterator/detail/tagged_iterator.h +80 -0
  1496. cuda/cccl/headers/include/thrust/iterator/detail/tuple_of_iterator_references.h +171 -0
  1497. cuda/cccl/headers/include/thrust/iterator/discard_iterator.h +163 -0
  1498. cuda/cccl/headers/include/thrust/iterator/iterator_adaptor.h +251 -0
  1499. cuda/cccl/headers/include/thrust/iterator/iterator_categories.h +211 -0
  1500. cuda/cccl/headers/include/thrust/iterator/iterator_facade.h +659 -0
  1501. cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +334 -0
  1502. cuda/cccl/headers/include/thrust/iterator/iterator_traversal_tags.h +64 -0
  1503. cuda/cccl/headers/include/thrust/iterator/offset_iterator.h +194 -0
  1504. cuda/cccl/headers/include/thrust/iterator/permutation_iterator.h +204 -0
  1505. cuda/cccl/headers/include/thrust/iterator/retag.h +72 -0
  1506. cuda/cccl/headers/include/thrust/iterator/reverse_iterator.h +51 -0
  1507. cuda/cccl/headers/include/thrust/iterator/shuffle_iterator.h +190 -0
  1508. cuda/cccl/headers/include/thrust/iterator/strided_iterator.h +152 -0
  1509. cuda/cccl/headers/include/thrust/iterator/tabulate_output_iterator.h +152 -0
  1510. cuda/cccl/headers/include/thrust/iterator/transform_input_output_iterator.h +225 -0
  1511. cuda/cccl/headers/include/thrust/iterator/transform_iterator.h +353 -0
  1512. cuda/cccl/headers/include/thrust/iterator/transform_output_iterator.h +190 -0
  1513. cuda/cccl/headers/include/thrust/iterator/zip_iterator.h +365 -0
  1514. cuda/cccl/headers/include/thrust/logical.h +290 -0
  1515. cuda/cccl/headers/include/thrust/memory.h +299 -0
  1516. cuda/cccl/headers/include/thrust/merge.h +726 -0
  1517. cuda/cccl/headers/include/thrust/mismatch.h +262 -0
  1518. cuda/cccl/headers/include/thrust/mr/allocator.h +227 -0
  1519. cuda/cccl/headers/include/thrust/mr/device_memory_resource.h +46 -0
  1520. cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +526 -0
  1521. cuda/cccl/headers/include/thrust/mr/disjoint_sync_pool.h +116 -0
  1522. cuda/cccl/headers/include/thrust/mr/disjoint_tls_pool.h +65 -0
  1523. cuda/cccl/headers/include/thrust/mr/fancy_pointer_resource.h +65 -0
  1524. cuda/cccl/headers/include/thrust/mr/host_memory_resource.h +42 -0
  1525. cuda/cccl/headers/include/thrust/mr/memory_resource.h +215 -0
  1526. cuda/cccl/headers/include/thrust/mr/new.h +98 -0
  1527. cuda/cccl/headers/include/thrust/mr/polymorphic_adaptor.h +61 -0
  1528. cuda/cccl/headers/include/thrust/mr/pool.h +526 -0
  1529. cuda/cccl/headers/include/thrust/mr/pool_options.h +172 -0
  1530. cuda/cccl/headers/include/thrust/mr/sync_pool.h +112 -0
  1531. cuda/cccl/headers/include/thrust/mr/tls_pool.h +62 -0
  1532. cuda/cccl/headers/include/thrust/mr/universal_memory_resource.h +29 -0
  1533. cuda/cccl/headers/include/thrust/mr/validator.h +54 -0
  1534. cuda/cccl/headers/include/thrust/pair.h +102 -0
  1535. cuda/cccl/headers/include/thrust/partition.h +1392 -0
  1536. cuda/cccl/headers/include/thrust/per_device_resource.h +110 -0
  1537. cuda/cccl/headers/include/thrust/random/detail/discard_block_engine.inl +182 -0
  1538. cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine.inl +153 -0
  1539. cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine_discard.h +97 -0
  1540. cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine.inl +149 -0
  1541. cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h +46 -0
  1542. cuda/cccl/headers/include/thrust/random/detail/mod.h +94 -0
  1543. cuda/cccl/headers/include/thrust/random/detail/normal_distribution.inl +185 -0
  1544. cuda/cccl/headers/include/thrust/random/detail/normal_distribution_base.h +157 -0
  1545. cuda/cccl/headers/include/thrust/random/detail/random_core_access.h +56 -0
  1546. cuda/cccl/headers/include/thrust/random/detail/subtract_with_carry_engine.inl +199 -0
  1547. cuda/cccl/headers/include/thrust/random/detail/uniform_int_distribution.inl +196 -0
  1548. cuda/cccl/headers/include/thrust/random/detail/uniform_real_distribution.inl +198 -0
  1549. cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine.inl +181 -0
  1550. cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine_max.h +186 -0
  1551. cuda/cccl/headers/include/thrust/random/discard_block_engine.h +238 -0
  1552. cuda/cccl/headers/include/thrust/random/linear_congruential_engine.h +287 -0
  1553. cuda/cccl/headers/include/thrust/random/linear_feedback_shift_engine.h +215 -0
  1554. cuda/cccl/headers/include/thrust/random/normal_distribution.h +256 -0
  1555. cuda/cccl/headers/include/thrust/random/subtract_with_carry_engine.h +245 -0
  1556. cuda/cccl/headers/include/thrust/random/uniform_int_distribution.h +259 -0
  1557. cuda/cccl/headers/include/thrust/random/uniform_real_distribution.h +257 -0
  1558. cuda/cccl/headers/include/thrust/random/xor_combine_engine.h +253 -0
  1559. cuda/cccl/headers/include/thrust/random.h +118 -0
  1560. cuda/cccl/headers/include/thrust/reduce.h +1114 -0
  1561. cuda/cccl/headers/include/thrust/remove.h +768 -0
  1562. cuda/cccl/headers/include/thrust/replace.h +826 -0
  1563. cuda/cccl/headers/include/thrust/reverse.h +215 -0
  1564. cuda/cccl/headers/include/thrust/scan.h +1671 -0
  1565. cuda/cccl/headers/include/thrust/scatter.h +446 -0
  1566. cuda/cccl/headers/include/thrust/sequence.h +277 -0
  1567. cuda/cccl/headers/include/thrust/set_operations.h +3027 -0
  1568. cuda/cccl/headers/include/thrust/shuffle.h +182 -0
  1569. cuda/cccl/headers/include/thrust/sort.h +1320 -0
  1570. cuda/cccl/headers/include/thrust/swap.h +147 -0
  1571. cuda/cccl/headers/include/thrust/system/cpp/detail/adjacent_difference.h +30 -0
  1572. cuda/cccl/headers/include/thrust/system/cpp/detail/assign_value.h +30 -0
  1573. cuda/cccl/headers/include/thrust/system/cpp/detail/binary_search.h +32 -0
  1574. cuda/cccl/headers/include/thrust/system/cpp/detail/copy.h +30 -0
  1575. cuda/cccl/headers/include/thrust/system/cpp/detail/copy_if.h +30 -0
  1576. cuda/cccl/headers/include/thrust/system/cpp/detail/count.h +29 -0
  1577. cuda/cccl/headers/include/thrust/system/cpp/detail/equal.h +29 -0
  1578. cuda/cccl/headers/include/thrust/system/cpp/detail/execution_policy.h +113 -0
  1579. cuda/cccl/headers/include/thrust/system/cpp/detail/extrema.h +30 -0
  1580. cuda/cccl/headers/include/thrust/system/cpp/detail/fill.h +29 -0
  1581. cuda/cccl/headers/include/thrust/system/cpp/detail/find.h +30 -0
  1582. cuda/cccl/headers/include/thrust/system/cpp/detail/for_each.h +30 -0
  1583. cuda/cccl/headers/include/thrust/system/cpp/detail/gather.h +29 -0
  1584. cuda/cccl/headers/include/thrust/system/cpp/detail/generate.h +29 -0
  1585. cuda/cccl/headers/include/thrust/system/cpp/detail/get_value.h +30 -0
  1586. cuda/cccl/headers/include/thrust/system/cpp/detail/inner_product.h +29 -0
  1587. cuda/cccl/headers/include/thrust/system/cpp/detail/iter_swap.h +30 -0
  1588. cuda/cccl/headers/include/thrust/system/cpp/detail/logical.h +29 -0
  1589. cuda/cccl/headers/include/thrust/system/cpp/detail/malloc_and_free.h +30 -0
  1590. cuda/cccl/headers/include/thrust/system/cpp/detail/memory.inl +55 -0
  1591. cuda/cccl/headers/include/thrust/system/cpp/detail/merge.h +30 -0
  1592. cuda/cccl/headers/include/thrust/system/cpp/detail/mismatch.h +29 -0
  1593. cuda/cccl/headers/include/thrust/system/cpp/detail/partition.h +30 -0
  1594. cuda/cccl/headers/include/thrust/system/cpp/detail/per_device_resource.h +29 -0
  1595. cuda/cccl/headers/include/thrust/system/cpp/detail/reduce.h +30 -0
  1596. cuda/cccl/headers/include/thrust/system/cpp/detail/reduce_by_key.h +30 -0
  1597. cuda/cccl/headers/include/thrust/system/cpp/detail/remove.h +30 -0
  1598. cuda/cccl/headers/include/thrust/system/cpp/detail/replace.h +29 -0
  1599. cuda/cccl/headers/include/thrust/system/cpp/detail/reverse.h +29 -0
  1600. cuda/cccl/headers/include/thrust/system/cpp/detail/scan.h +30 -0
  1601. cuda/cccl/headers/include/thrust/system/cpp/detail/scan_by_key.h +30 -0
  1602. cuda/cccl/headers/include/thrust/system/cpp/detail/scatter.h +29 -0
  1603. cuda/cccl/headers/include/thrust/system/cpp/detail/sequence.h +29 -0
  1604. cuda/cccl/headers/include/thrust/system/cpp/detail/set_operations.h +30 -0
  1605. cuda/cccl/headers/include/thrust/system/cpp/detail/sort.h +30 -0
  1606. cuda/cccl/headers/include/thrust/system/cpp/detail/swap_ranges.h +29 -0
  1607. cuda/cccl/headers/include/thrust/system/cpp/detail/tabulate.h +29 -0
  1608. cuda/cccl/headers/include/thrust/system/cpp/detail/temporary_buffer.h +29 -0
  1609. cuda/cccl/headers/include/thrust/system/cpp/detail/transform.h +29 -0
  1610. cuda/cccl/headers/include/thrust/system/cpp/detail/transform_reduce.h +29 -0
  1611. cuda/cccl/headers/include/thrust/system/cpp/detail/transform_scan.h +29 -0
  1612. cuda/cccl/headers/include/thrust/system/cpp/detail/uninitialized_copy.h +29 -0
  1613. cuda/cccl/headers/include/thrust/system/cpp/detail/uninitialized_fill.h +29 -0
  1614. cuda/cccl/headers/include/thrust/system/cpp/detail/unique.h +30 -0
  1615. cuda/cccl/headers/include/thrust/system/cpp/detail/unique_by_key.h +30 -0
  1616. cuda/cccl/headers/include/thrust/system/cpp/execution_policy.h +63 -0
  1617. cuda/cccl/headers/include/thrust/system/cpp/memory.h +105 -0
  1618. cuda/cccl/headers/include/thrust/system/cpp/memory_resource.h +70 -0
  1619. cuda/cccl/headers/include/thrust/system/cpp/pointer.h +118 -0
  1620. cuda/cccl/headers/include/thrust/system/cpp/vector.h +95 -0
  1621. cuda/cccl/headers/include/thrust/system/cuda/config.h +126 -0
  1622. cuda/cccl/headers/include/thrust/system/cuda/detail/adjacent_difference.h +218 -0
  1623. cuda/cccl/headers/include/thrust/system/cuda/detail/assign_value.h +124 -0
  1624. cuda/cccl/headers/include/thrust/system/cuda/detail/binary_search.h +29 -0
  1625. cuda/cccl/headers/include/thrust/system/cuda/detail/cdp_dispatch.h +72 -0
  1626. cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +272 -0
  1627. cuda/cccl/headers/include/thrust/system/cuda/detail/copy_if.h +251 -0
  1628. cuda/cccl/headers/include/thrust/system/cuda/detail/core/agent_launcher.h +280 -0
  1629. cuda/cccl/headers/include/thrust/system/cuda/detail/core/triple_chevron_launch.h +162 -0
  1630. cuda/cccl/headers/include/thrust/system/cuda/detail/core/util.h +578 -0
  1631. cuda/cccl/headers/include/thrust/system/cuda/detail/count.h +75 -0
  1632. cuda/cccl/headers/include/thrust/system/cuda/detail/cross_system.h +241 -0
  1633. cuda/cccl/headers/include/thrust/system/cuda/detail/dispatch.h +230 -0
  1634. cuda/cccl/headers/include/thrust/system/cuda/detail/equal.h +62 -0
  1635. cuda/cccl/headers/include/thrust/system/cuda/detail/error.inl +87 -0
  1636. cuda/cccl/headers/include/thrust/system/cuda/detail/execution_policy.h +266 -0
  1637. cuda/cccl/headers/include/thrust/system/cuda/detail/extrema.h +473 -0
  1638. cuda/cccl/headers/include/thrust/system/cuda/detail/fill.h +99 -0
  1639. cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +165 -0
  1640. cuda/cccl/headers/include/thrust/system/cuda/detail/for_each.h +83 -0
  1641. cuda/cccl/headers/include/thrust/system/cuda/detail/gather.h +89 -0
  1642. cuda/cccl/headers/include/thrust/system/cuda/detail/generate.h +59 -0
  1643. cuda/cccl/headers/include/thrust/system/cuda/detail/get_value.h +65 -0
  1644. cuda/cccl/headers/include/thrust/system/cuda/detail/inner_product.h +77 -0
  1645. cuda/cccl/headers/include/thrust/system/cuda/detail/iter_swap.h +79 -0
  1646. cuda/cccl/headers/include/thrust/system/cuda/detail/logical.h +29 -0
  1647. cuda/cccl/headers/include/thrust/system/cuda/detail/make_unsigned_special.h +55 -0
  1648. cuda/cccl/headers/include/thrust/system/cuda/detail/malloc_and_free.h +119 -0
  1649. cuda/cccl/headers/include/thrust/system/cuda/detail/memory.inl +55 -0
  1650. cuda/cccl/headers/include/thrust/system/cuda/detail/merge.h +205 -0
  1651. cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +88 -0
  1652. cuda/cccl/headers/include/thrust/system/cuda/detail/parallel_for.h +79 -0
  1653. cuda/cccl/headers/include/thrust/system/cuda/detail/partition.h +401 -0
  1654. cuda/cccl/headers/include/thrust/system/cuda/detail/per_device_resource.h +70 -0
  1655. cuda/cccl/headers/include/thrust/system/cuda/detail/reduce.h +774 -0
  1656. cuda/cccl/headers/include/thrust/system/cuda/detail/reduce_by_key.h +994 -0
  1657. cuda/cccl/headers/include/thrust/system/cuda/detail/remove.h +105 -0
  1658. cuda/cccl/headers/include/thrust/system/cuda/detail/replace.h +121 -0
  1659. cuda/cccl/headers/include/thrust/system/cuda/detail/reverse.h +83 -0
  1660. cuda/cccl/headers/include/thrust/system/cuda/detail/scan.h +340 -0
  1661. cuda/cccl/headers/include/thrust/system/cuda/detail/scan_by_key.h +412 -0
  1662. cuda/cccl/headers/include/thrust/system/cuda/detail/scatter.h +90 -0
  1663. cuda/cccl/headers/include/thrust/system/cuda/detail/sequence.h +29 -0
  1664. cuda/cccl/headers/include/thrust/system/cuda/detail/set_operations.h +1722 -0
  1665. cuda/cccl/headers/include/thrust/system/cuda/detail/sort.h +473 -0
  1666. cuda/cccl/headers/include/thrust/system/cuda/detail/swap_ranges.h +99 -0
  1667. cuda/cccl/headers/include/thrust/system/cuda/detail/tabulate.h +62 -0
  1668. cuda/cccl/headers/include/thrust/system/cuda/detail/temporary_buffer.h +132 -0
  1669. cuda/cccl/headers/include/thrust/system/cuda/detail/terminate.h +53 -0
  1670. cuda/cccl/headers/include/thrust/system/cuda/detail/transform.h +428 -0
  1671. cuda/cccl/headers/include/thrust/system/cuda/detail/transform_reduce.h +139 -0
  1672. cuda/cccl/headers/include/thrust/system/cuda/detail/transform_scan.h +117 -0
  1673. cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_copy.h +114 -0
  1674. cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_fill.h +102 -0
  1675. cuda/cccl/headers/include/thrust/system/cuda/detail/unique.h +288 -0
  1676. cuda/cccl/headers/include/thrust/system/cuda/detail/unique_by_key.h +307 -0
  1677. cuda/cccl/headers/include/thrust/system/cuda/detail/util.h +253 -0
  1678. cuda/cccl/headers/include/thrust/system/cuda/error.h +159 -0
  1679. cuda/cccl/headers/include/thrust/system/cuda/execution_policy.h +15 -0
  1680. cuda/cccl/headers/include/thrust/system/cuda/memory.h +118 -0
  1681. cuda/cccl/headers/include/thrust/system/cuda/memory_resource.h +115 -0
  1682. cuda/cccl/headers/include/thrust/system/cuda/pointer.h +155 -0
  1683. cuda/cccl/headers/include/thrust/system/cuda/vector.h +104 -0
  1684. cuda/cccl/headers/include/thrust/system/detail/bad_alloc.h +59 -0
  1685. cuda/cccl/headers/include/thrust/system/detail/errno.h +118 -0
  1686. cuda/cccl/headers/include/thrust/system/detail/error_category.inl +298 -0
  1687. cuda/cccl/headers/include/thrust/system/detail/error_code.inl +171 -0
  1688. cuda/cccl/headers/include/thrust/system/detail/error_condition.inl +119 -0
  1689. cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.h +51 -0
  1690. cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.inl +77 -0
  1691. cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.h +159 -0
  1692. cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.inl +370 -0
  1693. cuda/cccl/headers/include/thrust/system/detail/generic/copy.h +43 -0
  1694. cuda/cccl/headers/include/thrust/system/detail/generic/copy.inl +62 -0
  1695. cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.h +56 -0
  1696. cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.inl +145 -0
  1697. cuda/cccl/headers/include/thrust/system/detail/generic/count.h +46 -0
  1698. cuda/cccl/headers/include/thrust/system/detail/generic/count.inl +82 -0
  1699. cuda/cccl/headers/include/thrust/system/detail/generic/equal.h +47 -0
  1700. cuda/cccl/headers/include/thrust/system/detail/generic/equal.inl +58 -0
  1701. cuda/cccl/headers/include/thrust/system/detail/generic/extrema.h +65 -0
  1702. cuda/cccl/headers/include/thrust/system/detail/generic/extrema.inl +246 -0
  1703. cuda/cccl/headers/include/thrust/system/detail/generic/fill.h +52 -0
  1704. cuda/cccl/headers/include/thrust/system/detail/generic/find.h +47 -0
  1705. cuda/cccl/headers/include/thrust/system/detail/generic/find.inl +135 -0
  1706. cuda/cccl/headers/include/thrust/system/detail/generic/for_each.h +56 -0
  1707. cuda/cccl/headers/include/thrust/system/detail/generic/gather.h +71 -0
  1708. cuda/cccl/headers/include/thrust/system/detail/generic/gather.inl +94 -0
  1709. cuda/cccl/headers/include/thrust/system/detail/generic/generate.h +43 -0
  1710. cuda/cccl/headers/include/thrust/system/detail/generic/generate.inl +64 -0
  1711. cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.h +58 -0
  1712. cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.inl +70 -0
  1713. cuda/cccl/headers/include/thrust/system/detail/generic/logical.h +57 -0
  1714. cuda/cccl/headers/include/thrust/system/detail/generic/memory.h +61 -0
  1715. cuda/cccl/headers/include/thrust/system/detail/generic/memory.inl +83 -0
  1716. cuda/cccl/headers/include/thrust/system/detail/generic/merge.h +97 -0
  1717. cuda/cccl/headers/include/thrust/system/detail/generic/merge.inl +146 -0
  1718. cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.h +47 -0
  1719. cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.inl +67 -0
  1720. cuda/cccl/headers/include/thrust/system/detail/generic/partition.h +127 -0
  1721. cuda/cccl/headers/include/thrust/system/detail/generic/partition.inl +208 -0
  1722. cuda/cccl/headers/include/thrust/system/detail/generic/per_device_resource.h +41 -0
  1723. cuda/cccl/headers/include/thrust/system/detail/generic/reduce.h +69 -0
  1724. cuda/cccl/headers/include/thrust/system/detail/generic/reduce.inl +98 -0
  1725. cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.h +81 -0
  1726. cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.inl +183 -0
  1727. cuda/cccl/headers/include/thrust/system/detail/generic/remove.h +84 -0
  1728. cuda/cccl/headers/include/thrust/system/detail/generic/remove.inl +119 -0
  1729. cuda/cccl/headers/include/thrust/system/detail/generic/replace.h +93 -0
  1730. cuda/cccl/headers/include/thrust/system/detail/generic/replace.inl +172 -0
  1731. cuda/cccl/headers/include/thrust/system/detail/generic/reverse.h +46 -0
  1732. cuda/cccl/headers/include/thrust/system/detail/generic/reverse.inl +65 -0
  1733. cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.h +61 -0
  1734. cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.inl +125 -0
  1735. cuda/cccl/headers/include/thrust/system/detail/generic/scan.h +86 -0
  1736. cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.h +124 -0
  1737. cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.inl +229 -0
  1738. cuda/cccl/headers/include/thrust/system/detail/generic/scatter.h +71 -0
  1739. cuda/cccl/headers/include/thrust/system/detail/generic/scatter.inl +83 -0
  1740. cuda/cccl/headers/include/thrust/system/detail/generic/select_system.h +105 -0
  1741. cuda/cccl/headers/include/thrust/system/detail/generic/sequence.h +70 -0
  1742. cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.h +281 -0
  1743. cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.inl +474 -0
  1744. cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.h +52 -0
  1745. cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.inl +123 -0
  1746. cuda/cccl/headers/include/thrust/system/detail/generic/sort.h +111 -0
  1747. cuda/cccl/headers/include/thrust/system/detail/generic/sort.inl +176 -0
  1748. cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.h +42 -0
  1749. cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.inl +73 -0
  1750. cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.h +39 -0
  1751. cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.inl +54 -0
  1752. cuda/cccl/headers/include/thrust/system/detail/generic/tag.h +45 -0
  1753. cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.h +53 -0
  1754. cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.inl +81 -0
  1755. cuda/cccl/headers/include/thrust/system/detail/generic/transform.h +393 -0
  1756. cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.h +48 -0
  1757. cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.inl +54 -0
  1758. cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.h +78 -0
  1759. cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.inl +111 -0
  1760. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.h +43 -0
  1761. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.inl +164 -0
  1762. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.h +43 -0
  1763. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.inl +112 -0
  1764. cuda/cccl/headers/include/thrust/system/detail/generic/unique.h +69 -0
  1765. cuda/cccl/headers/include/thrust/system/detail/generic/unique.inl +112 -0
  1766. cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.h +80 -0
  1767. cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.inl +124 -0
  1768. cuda/cccl/headers/include/thrust/system/detail/internal/decompose.h +115 -0
  1769. cuda/cccl/headers/include/thrust/system/detail/sequential/adjacent_difference.h +68 -0
  1770. cuda/cccl/headers/include/thrust/system/detail/sequential/assign_value.h +40 -0
  1771. cuda/cccl/headers/include/thrust/system/detail/sequential/binary_search.h +136 -0
  1772. cuda/cccl/headers/include/thrust/system/detail/sequential/copy.h +120 -0
  1773. cuda/cccl/headers/include/thrust/system/detail/sequential/copy_backward.h +47 -0
  1774. cuda/cccl/headers/include/thrust/system/detail/sequential/copy_if.h +69 -0
  1775. cuda/cccl/headers/include/thrust/system/detail/sequential/count.h +29 -0
  1776. cuda/cccl/headers/include/thrust/system/detail/sequential/equal.h +29 -0
  1777. cuda/cccl/headers/include/thrust/system/detail/sequential/execution_policy.h +52 -0
  1778. cuda/cccl/headers/include/thrust/system/detail/sequential/extrema.h +109 -0
  1779. cuda/cccl/headers/include/thrust/system/detail/sequential/fill.h +29 -0
  1780. cuda/cccl/headers/include/thrust/system/detail/sequential/find.h +60 -0
  1781. cuda/cccl/headers/include/thrust/system/detail/sequential/for_each.h +72 -0
  1782. cuda/cccl/headers/include/thrust/system/detail/sequential/gather.h +29 -0
  1783. cuda/cccl/headers/include/thrust/system/detail/sequential/general_copy.h +120 -0
  1784. cuda/cccl/headers/include/thrust/system/detail/sequential/generate.h +29 -0
  1785. cuda/cccl/headers/include/thrust/system/detail/sequential/get_value.h +41 -0
  1786. cuda/cccl/headers/include/thrust/system/detail/sequential/inner_product.h +29 -0
  1787. cuda/cccl/headers/include/thrust/system/detail/sequential/insertion_sort.h +139 -0
  1788. cuda/cccl/headers/include/thrust/system/detail/sequential/iter_swap.h +43 -0
  1789. cuda/cccl/headers/include/thrust/system/detail/sequential/logical.h +29 -0
  1790. cuda/cccl/headers/include/thrust/system/detail/sequential/malloc_and_free.h +48 -0
  1791. cuda/cccl/headers/include/thrust/system/detail/sequential/merge.h +147 -0
  1792. cuda/cccl/headers/include/thrust/system/detail/sequential/mismatch.h +29 -0
  1793. cuda/cccl/headers/include/thrust/system/detail/sequential/partition.h +298 -0
  1794. cuda/cccl/headers/include/thrust/system/detail/sequential/per_device_resource.h +29 -0
  1795. cuda/cccl/headers/include/thrust/system/detail/sequential/reduce.h +62 -0
  1796. cuda/cccl/headers/include/thrust/system/detail/sequential/reduce_by_key.h +97 -0
  1797. cuda/cccl/headers/include/thrust/system/detail/sequential/remove.h +177 -0
  1798. cuda/cccl/headers/include/thrust/system/detail/sequential/replace.h +29 -0
  1799. cuda/cccl/headers/include/thrust/system/detail/sequential/reverse.h +29 -0
  1800. cuda/cccl/headers/include/thrust/system/detail/sequential/scan.h +152 -0
  1801. cuda/cccl/headers/include/thrust/system/detail/sequential/scan_by_key.h +143 -0
  1802. cuda/cccl/headers/include/thrust/system/detail/sequential/scatter.h +29 -0
  1803. cuda/cccl/headers/include/thrust/system/detail/sequential/sequence.h +29 -0
  1804. cuda/cccl/headers/include/thrust/system/detail/sequential/set_operations.h +204 -0
  1805. cuda/cccl/headers/include/thrust/system/detail/sequential/sort.h +120 -0
  1806. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.h +353 -0
  1807. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.h +121 -0
  1808. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.h +584 -0
  1809. cuda/cccl/headers/include/thrust/system/detail/sequential/swap_ranges.h +29 -0
  1810. cuda/cccl/headers/include/thrust/system/detail/sequential/tabulate.h +29 -0
  1811. cuda/cccl/headers/include/thrust/system/detail/sequential/temporary_buffer.h +29 -0
  1812. cuda/cccl/headers/include/thrust/system/detail/sequential/transform.h +29 -0
  1813. cuda/cccl/headers/include/thrust/system/detail/sequential/transform_reduce.h +29 -0
  1814. cuda/cccl/headers/include/thrust/system/detail/sequential/transform_scan.h +29 -0
  1815. cuda/cccl/headers/include/thrust/system/detail/sequential/trivial_copy.h +56 -0
  1816. cuda/cccl/headers/include/thrust/system/detail/sequential/uninitialized_copy.h +29 -0
  1817. cuda/cccl/headers/include/thrust/system/detail/sequential/uninitialized_fill.h +29 -0
  1818. cuda/cccl/headers/include/thrust/system/detail/sequential/unique.h +112 -0
  1819. cuda/cccl/headers/include/thrust/system/detail/sequential/unique_by_key.h +105 -0
  1820. cuda/cccl/headers/include/thrust/system/detail/system_error.inl +106 -0
  1821. cuda/cccl/headers/include/thrust/system/error_code.h +508 -0
  1822. cuda/cccl/headers/include/thrust/system/omp/detail/adjacent_difference.h +34 -0
  1823. cuda/cccl/headers/include/thrust/system/omp/detail/assign_value.h +17 -0
  1824. cuda/cccl/headers/include/thrust/system/omp/detail/binary_search.h +57 -0
  1825. cuda/cccl/headers/include/thrust/system/omp/detail/copy.h +61 -0
  1826. cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.h +39 -0
  1827. cuda/cccl/headers/include/thrust/system/omp/detail/count.h +17 -0
  1828. cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.h +48 -0
  1829. cuda/cccl/headers/include/thrust/system/omp/detail/equal.h +17 -0
  1830. cuda/cccl/headers/include/thrust/system/omp/detail/execution_policy.h +132 -0
  1831. cuda/cccl/headers/include/thrust/system/omp/detail/extrema.h +46 -0
  1832. cuda/cccl/headers/include/thrust/system/omp/detail/fill.h +17 -0
  1833. cuda/cccl/headers/include/thrust/system/omp/detail/find.h +33 -0
  1834. cuda/cccl/headers/include/thrust/system/omp/detail/for_each.h +74 -0
  1835. cuda/cccl/headers/include/thrust/system/omp/detail/gather.h +17 -0
  1836. cuda/cccl/headers/include/thrust/system/omp/detail/generate.h +17 -0
  1837. cuda/cccl/headers/include/thrust/system/omp/detail/get_value.h +17 -0
  1838. cuda/cccl/headers/include/thrust/system/omp/detail/inner_product.h +17 -0
  1839. cuda/cccl/headers/include/thrust/system/omp/detail/iter_swap.h +17 -0
  1840. cuda/cccl/headers/include/thrust/system/omp/detail/logical.h +17 -0
  1841. cuda/cccl/headers/include/thrust/system/omp/detail/malloc_and_free.h +17 -0
  1842. cuda/cccl/headers/include/thrust/system/omp/detail/merge.h +17 -0
  1843. cuda/cccl/headers/include/thrust/system/omp/detail/mismatch.h +17 -0
  1844. cuda/cccl/headers/include/thrust/system/omp/detail/partition.h +84 -0
  1845. cuda/cccl/headers/include/thrust/system/omp/detail/per_device_resource.h +16 -0
  1846. cuda/cccl/headers/include/thrust/system/omp/detail/pragma_omp.h +30 -0
  1847. cuda/cccl/headers/include/thrust/system/omp/detail/reduce.h +63 -0
  1848. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.h +48 -0
  1849. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.h +87 -0
  1850. cuda/cccl/headers/include/thrust/system/omp/detail/remove.h +67 -0
  1851. cuda/cccl/headers/include/thrust/system/omp/detail/replace.h +17 -0
  1852. cuda/cccl/headers/include/thrust/system/omp/detail/reverse.h +17 -0
  1853. cuda/cccl/headers/include/thrust/system/omp/detail/scan.h +216 -0
  1854. cuda/cccl/headers/include/thrust/system/omp/detail/scan_by_key.h +23 -0
  1855. cuda/cccl/headers/include/thrust/system/omp/detail/scatter.h +17 -0
  1856. cuda/cccl/headers/include/thrust/system/omp/detail/sequence.h +17 -0
  1857. cuda/cccl/headers/include/thrust/system/omp/detail/set_operations.h +17 -0
  1858. cuda/cccl/headers/include/thrust/system/omp/detail/sort.h +245 -0
  1859. cuda/cccl/headers/include/thrust/system/omp/detail/swap_ranges.h +17 -0
  1860. cuda/cccl/headers/include/thrust/system/omp/detail/tabulate.h +17 -0
  1861. cuda/cccl/headers/include/thrust/system/omp/detail/temporary_buffer.h +16 -0
  1862. cuda/cccl/headers/include/thrust/system/omp/detail/transform.h +17 -0
  1863. cuda/cccl/headers/include/thrust/system/omp/detail/transform_reduce.h +17 -0
  1864. cuda/cccl/headers/include/thrust/system/omp/detail/transform_scan.h +17 -0
  1865. cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_copy.h +17 -0
  1866. cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_fill.h +17 -0
  1867. cuda/cccl/headers/include/thrust/system/omp/detail/unique.h +52 -0
  1868. cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.h +56 -0
  1869. cuda/cccl/headers/include/thrust/system/omp/execution_policy.h +62 -0
  1870. cuda/cccl/headers/include/thrust/system/omp/memory.h +153 -0
  1871. cuda/cccl/headers/include/thrust/system/omp/memory_resource.h +71 -0
  1872. cuda/cccl/headers/include/thrust/system/omp/pointer.h +120 -0
  1873. cuda/cccl/headers/include/thrust/system/omp/vector.h +96 -0
  1874. cuda/cccl/headers/include/thrust/system/system_error.h +183 -0
  1875. cuda/cccl/headers/include/thrust/system/tbb/detail/adjacent_difference.h +33 -0
  1876. cuda/cccl/headers/include/thrust/system/tbb/detail/assign_value.h +17 -0
  1877. cuda/cccl/headers/include/thrust/system/tbb/detail/binary_search.h +17 -0
  1878. cuda/cccl/headers/include/thrust/system/tbb/detail/copy.h +59 -0
  1879. cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.h +117 -0
  1880. cuda/cccl/headers/include/thrust/system/tbb/detail/count.h +17 -0
  1881. cuda/cccl/headers/include/thrust/system/tbb/detail/equal.h +17 -0
  1882. cuda/cccl/headers/include/thrust/system/tbb/detail/execution_policy.h +113 -0
  1883. cuda/cccl/headers/include/thrust/system/tbb/detail/extrema.h +45 -0
  1884. cuda/cccl/headers/include/thrust/system/tbb/detail/fill.h +17 -0
  1885. cuda/cccl/headers/include/thrust/system/tbb/detail/find.h +28 -0
  1886. cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.h +72 -0
  1887. cuda/cccl/headers/include/thrust/system/tbb/detail/gather.h +17 -0
  1888. cuda/cccl/headers/include/thrust/system/tbb/detail/generate.h +17 -0
  1889. cuda/cccl/headers/include/thrust/system/tbb/detail/get_value.h +17 -0
  1890. cuda/cccl/headers/include/thrust/system/tbb/detail/inner_product.h +17 -0
  1891. cuda/cccl/headers/include/thrust/system/tbb/detail/iter_swap.h +17 -0
  1892. cuda/cccl/headers/include/thrust/system/tbb/detail/logical.h +17 -0
  1893. cuda/cccl/headers/include/thrust/system/tbb/detail/malloc_and_free.h +17 -0
  1894. cuda/cccl/headers/include/thrust/system/tbb/detail/merge.h +302 -0
  1895. cuda/cccl/headers/include/thrust/system/tbb/detail/mismatch.h +17 -0
  1896. cuda/cccl/headers/include/thrust/system/tbb/detail/partition.h +79 -0
  1897. cuda/cccl/headers/include/thrust/system/tbb/detail/per_device_resource.h +16 -0
  1898. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.h +121 -0
  1899. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.h +378 -0
  1900. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_intervals.h +118 -0
  1901. cuda/cccl/headers/include/thrust/system/tbb/detail/remove.h +66 -0
  1902. cuda/cccl/headers/include/thrust/system/tbb/detail/replace.h +17 -0
  1903. cuda/cccl/headers/include/thrust/system/tbb/detail/reverse.h +17 -0
  1904. cuda/cccl/headers/include/thrust/system/tbb/detail/scan.h +294 -0
  1905. cuda/cccl/headers/include/thrust/system/tbb/detail/scan_by_key.h +20 -0
  1906. cuda/cccl/headers/include/thrust/system/tbb/detail/scatter.h +17 -0
  1907. cuda/cccl/headers/include/thrust/system/tbb/detail/sequence.h +17 -0
  1908. cuda/cccl/headers/include/thrust/system/tbb/detail/set_operations.h +17 -0
  1909. cuda/cccl/headers/include/thrust/system/tbb/detail/sort.h +273 -0
  1910. cuda/cccl/headers/include/thrust/system/tbb/detail/swap_ranges.h +17 -0
  1911. cuda/cccl/headers/include/thrust/system/tbb/detail/tabulate.h +17 -0
  1912. cuda/cccl/headers/include/thrust/system/tbb/detail/temporary_buffer.h +16 -0
  1913. cuda/cccl/headers/include/thrust/system/tbb/detail/transform.h +17 -0
  1914. cuda/cccl/headers/include/thrust/system/tbb/detail/transform_reduce.h +17 -0
  1915. cuda/cccl/headers/include/thrust/system/tbb/detail/transform_scan.h +17 -0
  1916. cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_copy.h +17 -0
  1917. cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_fill.h +17 -0
  1918. cuda/cccl/headers/include/thrust/system/tbb/detail/unique.h +49 -0
  1919. cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.h +55 -0
  1920. cuda/cccl/headers/include/thrust/system/tbb/execution_policy.h +62 -0
  1921. cuda/cccl/headers/include/thrust/system/tbb/memory.h +139 -0
  1922. cuda/cccl/headers/include/thrust/system/tbb/memory_resource.h +57 -0
  1923. cuda/cccl/headers/include/thrust/system/tbb/pointer.h +106 -0
  1924. cuda/cccl/headers/include/thrust/system/tbb/vector.h +82 -0
  1925. cuda/cccl/headers/include/thrust/system_error.h +57 -0
  1926. cuda/cccl/headers/include/thrust/tabulate.h +125 -0
  1927. cuda/cccl/headers/include/thrust/transform.h +1056 -0
  1928. cuda/cccl/headers/include/thrust/transform_reduce.h +190 -0
  1929. cuda/cccl/headers/include/thrust/transform_scan.h +442 -0
  1930. cuda/cccl/headers/include/thrust/tuple.h +139 -0
  1931. cuda/cccl/headers/include/thrust/type_traits/integer_sequence.h +256 -0
  1932. cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +156 -0
  1933. cuda/cccl/headers/include/thrust/type_traits/is_execution_policy.h +65 -0
  1934. cuda/cccl/headers/include/thrust/type_traits/is_operator_less_or_greater_function_object.h +180 -0
  1935. cuda/cccl/headers/include/thrust/type_traits/is_operator_plus_function_object.h +112 -0
  1936. cuda/cccl/headers/include/thrust/type_traits/is_trivially_relocatable.h +333 -0
  1937. cuda/cccl/headers/include/thrust/type_traits/logical_metafunctions.h +42 -0
  1938. cuda/cccl/headers/include/thrust/type_traits/unwrap_contiguous_iterator.h +63 -0
  1939. cuda/cccl/headers/include/thrust/uninitialized_copy.h +300 -0
  1940. cuda/cccl/headers/include/thrust/uninitialized_fill.h +268 -0
  1941. cuda/cccl/headers/include/thrust/unique.h +1089 -0
  1942. cuda/cccl/headers/include/thrust/universal_allocator.h +101 -0
  1943. cuda/cccl/headers/include/thrust/universal_ptr.h +34 -0
  1944. cuda/cccl/headers/include/thrust/universal_vector.h +80 -0
  1945. cuda/cccl/headers/include/thrust/version.h +93 -0
  1946. cuda/cccl/headers/include/thrust/zip_function.h +149 -0
  1947. cuda/cccl/headers/include_paths.py +51 -0
  1948. cuda/cccl/headers/lib/cmake/cccl/cccl-config-version.cmake +25 -0
  1949. cuda/cccl/headers/lib/cmake/cccl/cccl-config.cmake +143 -0
  1950. cuda/cccl/headers/lib/cmake/cub/cub-config-version.cmake +29 -0
  1951. cuda/cccl/headers/lib/cmake/cub/cub-config.cmake +172 -0
  1952. cuda/cccl/headers/lib/cmake/cub/cub-header-search.cmake +15 -0
  1953. cuda/cccl/headers/lib/cmake/libcudacxx/libcudacxx-config-version.cmake +37 -0
  1954. cuda/cccl/headers/lib/cmake/libcudacxx/libcudacxx-config.cmake +297 -0
  1955. cuda/cccl/headers/lib/cmake/libcudacxx/libcudacxx-header-search.cmake +15 -0
  1956. cuda/cccl/headers/lib/cmake/thrust/FindTBB.cmake +498 -0
  1957. cuda/cccl/headers/lib/cmake/thrust/README.md +258 -0
  1958. cuda/cccl/headers/lib/cmake/thrust/thrust-config-version.cmake +37 -0
  1959. cuda/cccl/headers/lib/cmake/thrust/thrust-config.cmake +983 -0
  1960. cuda/cccl/headers/lib/cmake/thrust/thrust-header-search.cmake +15 -0
  1961. cuda/cccl/parallel/__init__.py +9 -0
  1962. cuda/cccl/parallel/experimental/__init__.py +24 -0
  1963. cuda/cccl/py.typed +0 -0
  1964. cuda/compute/__init__.py +91 -0
  1965. cuda/compute/_bindings.py +79 -0
  1966. cuda/compute/_bindings.pyi +516 -0
  1967. cuda/compute/_bindings_impl.pyx +2470 -0
  1968. cuda/compute/_caching.py +83 -0
  1969. cuda/compute/_cccl_interop.py +354 -0
  1970. cuda/compute/_odr_helpers.py +238 -0
  1971. cuda/compute/_utils/__init__.py +0 -0
  1972. cuda/compute/_utils/protocols.py +145 -0
  1973. cuda/compute/_utils/temp_storage_buffer.py +87 -0
  1974. cuda/compute/algorithms/__init__.py +62 -0
  1975. cuda/compute/algorithms/_histogram.py +243 -0
  1976. cuda/compute/algorithms/_reduce.py +205 -0
  1977. cuda/compute/algorithms/_scan.py +344 -0
  1978. cuda/compute/algorithms/_segmented_reduce.py +265 -0
  1979. cuda/compute/algorithms/_select.py +196 -0
  1980. cuda/compute/algorithms/_sort/__init__.py +23 -0
  1981. cuda/compute/algorithms/_sort/_merge_sort.py +235 -0
  1982. cuda/compute/algorithms/_sort/_radix_sort.py +263 -0
  1983. cuda/compute/algorithms/_sort/_segmented_sort.py +288 -0
  1984. cuda/compute/algorithms/_sort/_sort_common.py +52 -0
  1985. cuda/compute/algorithms/_three_way_partition.py +292 -0
  1986. cuda/compute/algorithms/_transform.py +317 -0
  1987. cuda/compute/algorithms/_unique_by_key.py +259 -0
  1988. cuda/compute/cccl/.gitkeep +0 -0
  1989. cuda/compute/cu12/_bindings_impl.cpython-312-x86_64-linux-gnu.so +0 -0
  1990. cuda/compute/cu12/cccl/libcccl.c.parallel.so +0 -0
  1991. cuda/compute/cu13/_bindings_impl.cpython-312-x86_64-linux-gnu.so +0 -0
  1992. cuda/compute/cu13/cccl/libcccl.c.parallel.so +0 -0
  1993. cuda/compute/determinism.py +3 -0
  1994. cuda/compute/iterators/__init__.py +23 -0
  1995. cuda/compute/iterators/_factories.py +251 -0
  1996. cuda/compute/iterators/_iterators.py +680 -0
  1997. cuda/compute/iterators/_permutation_iterator.py +266 -0
  1998. cuda/compute/iterators/_zip_iterator.py +268 -0
  1999. cuda/compute/numba_utils.py +54 -0
  2000. cuda/compute/op.py +140 -0
  2001. cuda/compute/struct.py +520 -0
  2002. cuda/compute/typing.py +36 -0
  2003. cuda/coop/__init__.py +8 -0
  2004. cuda/coop/_caching.py +48 -0
  2005. cuda/coop/_common.py +275 -0
  2006. cuda/coop/_nvrtc.py +92 -0
  2007. cuda/coop/_scan_op.py +181 -0
  2008. cuda/coop/_types.py +937 -0
  2009. cuda/coop/_typing.py +107 -0
  2010. cuda/coop/block/__init__.py +39 -0
  2011. cuda/coop/block/_block_exchange.py +251 -0
  2012. cuda/coop/block/_block_load_store.py +215 -0
  2013. cuda/coop/block/_block_merge_sort.py +125 -0
  2014. cuda/coop/block/_block_radix_sort.py +214 -0
  2015. cuda/coop/block/_block_reduce.py +294 -0
  2016. cuda/coop/block/_block_scan.py +983 -0
  2017. cuda/coop/warp/__init__.py +9 -0
  2018. cuda/coop/warp/_warp_merge_sort.py +92 -0
  2019. cuda/coop/warp/_warp_reduce.py +153 -0
  2020. cuda/coop/warp/_warp_scan.py +78 -0
  2021. cuda_cccl-0.4.3.dist-info/METADATA +84 -0
  2022. cuda_cccl-0.4.3.dist-info/RECORD +2024 -0
  2023. cuda_cccl-0.4.3.dist-info/WHEEL +5 -0
  2024. cuda_cccl-0.4.3.dist-info/licenses/LICENSE +1 -0
@@ -0,0 +1,1863 @@
1
+ // SPDX-FileCopyrightText: Copyright (c) 2011, Duane Merrill. All rights reserved.
2
+ // SPDX-FileCopyrightText: Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
3
+ // SPDX-License-Identifier: BSD-3
4
+
5
+ //! @file
6
+ //! @rst
7
+ //! The ``cub::WarpScan`` class provides :ref:`collective <collective-primitives>` methods for
8
+ //! computing a parallel prefix scan of items partitioned across a CUDA thread warp.
9
+ //! @endrst
10
+
11
+ #pragma once
12
+
13
+ #include <cub/config.cuh>
14
+
15
+ #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
16
+ # pragma GCC system_header
17
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
18
+ # pragma clang system_header
19
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
20
+ # pragma system_header
21
+ #endif // no system header
22
+
23
+ #include <cub/thread/thread_operators.cuh>
24
+ #include <cub/util_type.cuh>
25
+ #include <cub/warp/specializations/warp_scan_shfl.cuh>
26
+ #include <cub/warp/specializations/warp_scan_smem.cuh>
27
+
28
+ #include <cuda/__ptx/instructions/get_sreg.h>
29
+ #include <cuda/std/__functional/operations.h>
30
+ #include <cuda/std/__type_traits/conditional.h>
31
+
32
+ CUB_NAMESPACE_BEGIN
33
+
34
+ //! @rst
35
+ //! The WarpScan class provides :ref:`collective <collective-primitives>` methods for computing a
36
+ //! parallel prefix scan of items partitioned across a CUDA thread warp.
37
+ //!
38
+ //! .. image:: ../../img/warp_scan_logo.png
39
+ //! :align: center
40
+ //!
41
+ //! Overview
42
+ //! ++++++++++++++++++++++++++
43
+ //!
44
+ //! * Given a list of input elements and a binary reduction operator, a
45
+ //! `prefix scan <http://en.wikipedia.org/wiki/Prefix_sum>`__ produces an output list where each
46
+ //! element is computed to be the reduction of the elements occurring earlier in the input list.
47
+ //! *Prefix sum* connotes a prefix scan with the addition operator. The term *inclusive*
48
+ //! indicates that the *i*\ :sup:`th` output reduction incorporates the *i*\ :sup:`th` input.
49
+ //! The term *exclusive* indicates the *i*\ :sup:`th` input is not incorporated into
50
+ //! the *i*\ :sup:`th` output reduction.
51
+ //! * Supports non-commutative scan operators
52
+ //! * Supports "logical" warps smaller than the physical warp size
53
+ //! (e.g., a logical warp of 8 threads)
54
+ //! * The number of entrant threads must be an multiple of ``LOGICAL_WARP_THREADS``
55
+ //!
56
+ //! Performance Considerations
57
+ //! ++++++++++++++++++++++++++
58
+ //!
59
+ //! * Uses special instructions when applicable (e.g., warp ``SHFL``)
60
+ //! * Uses synchronization-free communication between warp lanes when applicable
61
+ //! * Incurs zero bank conflicts for most types
62
+ //! * Computation is slightly more efficient (i.e., having lower instruction overhead) for:
63
+ //!
64
+ //! * Summation (**vs.** generic scan)
65
+ //! * The architecture's warp size is a whole multiple of ``LOGICAL_WARP_THREADS``
66
+ //!
67
+ //! Simple Examples
68
+ //! ++++++++++++++++++++++++++
69
+ //!
70
+ //! @warpcollective{WarpScan}
71
+ //!
72
+ //! The code snippet below illustrates four concurrent warp prefix sums within a block of
73
+ //! 128 threads (one per each of the 32-thread warps).
74
+ //!
75
+ //! .. code-block:: c++
76
+ //!
77
+ //! #include <cub/cub.cuh>
78
+ //!
79
+ //! __global__ void ExampleKernel(...)
80
+ //! {
81
+ //! // Specialize WarpScan for type int
82
+ //! using WarpScan = cub::WarpScan<int>;
83
+ //!
84
+ //! // Allocate WarpScan shared memory for 4 warps
85
+ //! __shared__ typename WarpScan::TempStorage temp_storage[4];
86
+ //!
87
+ //! // Obtain one input item per thread
88
+ //! int thread_data = ...
89
+ //!
90
+ //! // Compute warp-wide prefix sums
91
+ //! int warp_id = threadIdx.x / 32;
92
+ //! WarpScan(temp_storage[warp_id]).ExclusiveSum(thread_data, thread_data);
93
+ //! }
94
+ //!
95
+ //! Suppose the set of input ``thread_data`` across the block of threads is
96
+ //! ``{1, 1, 1, 1, ...}``. The corresponding output ``thread_data`` in each of the four warps of
97
+ //! threads will be ``0, 1, 2, 3, ..., 31}``.
98
+ //!
99
+ //! The code snippet below illustrates a single warp prefix sum within a block of
100
+ //! 128 threads.
101
+ //!
102
+ //! .. code-block:: c++
103
+ //!
104
+ //! #include <cub/cub.cuh>
105
+ //!
106
+ //! __global__ void ExampleKernel(...)
107
+ //! {
108
+ //! // Specialize WarpScan for type int
109
+ //! using WarpScan = cub::WarpScan<int>;
110
+ //!
111
+ //! // Allocate WarpScan shared memory for one warp
112
+ //! __shared__ typename WarpScan::TempStorage temp_storage;
113
+ //! ...
114
+ //!
115
+ //! // Only the first warp performs a prefix sum
116
+ //! if (threadIdx.x < 32)
117
+ //! {
118
+ //! // Obtain one input item per thread
119
+ //! int thread_data = ...
120
+ //!
121
+ //! // Compute warp-wide prefix sums
122
+ //! WarpScan(temp_storage).ExclusiveSum(thread_data, thread_data);
123
+ //! }
124
+ //! }
125
+ //!
126
+ //! Suppose the set of input ``thread_data`` across the warp of threads is
127
+ //! ``{1, 1, 1, 1, ...}``. The corresponding output ``thread_data`` will be
128
+ //! ``{0, 1, 2, 3, ..., 31}``.
129
+ //! @endrst
130
+ //!
131
+ //! @tparam T
132
+ //! The scan input/output element type
133
+ //!
134
+ //! @tparam LOGICAL_WARP_THREADS
135
+ //! **[optional]** The number of threads per "logical" warp (may be less than the number of
136
+ //! hardware warp threads). Default is the warp size associated with the CUDA Compute Capability
137
+ //! targeted by the compiler (e.g., 32 threads for SM20).
138
+ //!
139
+ template <typename T, int LOGICAL_WARP_THREADS = detail::warp_threads>
140
+ class WarpScan
141
+ {
142
+ private:
143
+ /******************************************************************************
144
+ * Constants and type definitions
145
+ ******************************************************************************/
146
+
147
+ /// Whether the logical warp size and the PTX warp size coincide
148
+ static constexpr bool IS_ARCH_WARP = (LOGICAL_WARP_THREADS == detail::warp_threads);
149
+
150
+ /// Whether the logical warp size is a power-of-two
151
+ static constexpr bool IS_POW_OF_TWO = ((LOGICAL_WARP_THREADS & (LOGICAL_WARP_THREADS - 1)) == 0);
152
+
153
+ /// Whether the data type is an integer (which has fully-associative addition)
154
+ static constexpr bool IS_INTEGER = cuda::std::is_integral_v<T>;
155
+
156
+ /// Internal specialization.
157
+ /// Use SHFL-based scan if LOGICAL_WARP_THREADS is a power-of-two
158
+ using InternalWarpScan = ::cuda::std::
159
+ _If<IS_POW_OF_TWO, detail::WarpScanShfl<T, LOGICAL_WARP_THREADS>, detail::WarpScanSmem<T, LOGICAL_WARP_THREADS>>;
160
+
161
+ /// Shared memory storage layout type for WarpScan
162
+ using _TempStorage = typename InternalWarpScan::TempStorage;
163
+
164
+ /******************************************************************************
165
+ * Thread fields
166
+ ******************************************************************************/
167
+
168
+ /// Shared storage reference
169
+ _TempStorage& temp_storage;
170
+ unsigned int lane_id;
171
+
172
+ /******************************************************************************
173
+ * Public types
174
+ ******************************************************************************/
175
+
176
+ public:
177
+ /// @smemstorage{WarpScan}
178
+ struct TempStorage : Uninitialized<_TempStorage>
179
+ {};
180
+
181
+ //! @name Collective constructors
182
+ //! @{
183
+
184
+ //! @brief Collective constructor using the specified memory allocation as temporary storage.
185
+ //! Logical warp and lane identifiers are constructed from `threadIdx.x`.
186
+ //!
187
+ //! @param[in] temp_storage
188
+ //! Reference to memory allocation having layout type TempStorage
189
+ _CCCL_DEVICE _CCCL_FORCEINLINE WarpScan(TempStorage& temp_storage)
190
+ : temp_storage(temp_storage.Alias())
191
+ , lane_id(IS_ARCH_WARP ? ::cuda::ptx::get_sreg_laneid() : ::cuda::ptx::get_sreg_laneid() % LOGICAL_WARP_THREADS)
192
+ {}
193
+
194
+ //! @} end member group
195
+ //! @name Inclusive prefix sums
196
+ //! @{
197
+
198
+ //! @rst
199
+ //! Computes an inclusive prefix sum across the calling warp.
200
+ //!
201
+ //! * @smemwarpreuse
202
+ //!
203
+ //! Snippet
204
+ //! +++++++
205
+ //!
206
+ //! The code snippet below illustrates four concurrent warp-wide inclusive prefix sums within a
207
+ //! block of 128 threads (one per each of the 32-thread warps).
208
+ //!
209
+ //! .. code-block:: c++
210
+ //!
211
+ //! #include <cub/cub.cuh>
212
+ //!
213
+ //! __global__ void ExampleKernel(...)
214
+ //! {
215
+ //! // Specialize WarpScan for type int
216
+ //! using WarpScan = cub::WarpScan<int>;
217
+ //!
218
+ //! // Allocate WarpScan shared memory for 4 warps
219
+ //! __shared__ typename WarpScan::TempStorage temp_storage[4];
220
+ //!
221
+ //! // Obtain one input item per thread
222
+ //! int thread_data = ...
223
+ //!
224
+ //! // Compute inclusive warp-wide prefix sums
225
+ //! int warp_id = threadIdx.x / 32;
226
+ //! WarpScan(temp_storage[warp_id]).InclusiveSum(thread_data, thread_data);
227
+ //! }
228
+ //!
229
+ //! Suppose the set of input ``thread_data`` across the block of threads is
230
+ //! ``{1, 1, 1, 1, ...}``. The corresponding output ``thread_data`` in each of the four warps
231
+ //! of threads will be ``1, 2, 3, ..., 32}``.
232
+ //! @endrst
233
+ //!
234
+ //! @param[in] input
235
+ //! Calling thread's input item.
236
+ //!
237
+ //! @param[out] inclusive_output
238
+ //! Calling thread's output item. May be aliased with `input`.
239
+ _CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveSum(T input, T& inclusive_output)
240
+ {
241
+ InclusiveScan(input, inclusive_output, ::cuda::std::plus<>{});
242
+ }
243
+
244
+ //! @rst
245
+ //! Computes an inclusive prefix sum across the calling warp.
246
+ //! Also provides every thread with the warp-wide ``warp_aggregate`` of all inputs.
247
+ //!
248
+ //! * @smemwarpreuse
249
+ //!
250
+ //! Snippet
251
+ //! +++++++
252
+ //!
253
+ //! The code snippet below illustrates four concurrent warp-wide inclusive prefix sums within a
254
+ //! block of 128 threads (one per each of the 32-thread warps).
255
+ //!
256
+ //! .. code-block:: c++
257
+ //!
258
+ //! #include <cub/cub.cuh>
259
+ //!
260
+ //! __global__ void ExampleKernel(...)
261
+ //! {
262
+ //! // Specialize WarpScan for type int
263
+ //! using WarpScan = cub::WarpScan<int>;
264
+ //!
265
+ //! // Allocate WarpScan shared memory for 4 warps
266
+ //! __shared__ typename WarpScan::TempStorage temp_storage[4];
267
+ //!
268
+ //! // Obtain one input item per thread
269
+ //! int thread_data = ...
270
+ //!
271
+ //! // Compute inclusive warp-wide prefix sums
272
+ //! int warp_aggregate;
273
+ //! int warp_id = threadIdx.x / 32;
274
+ //! WarpScan(temp_storage[warp_id]).InclusiveSum(thread_data, thread_data, warp_aggregate);
275
+ //! }
276
+ //!
277
+ //! Suppose the set of input ``thread_data`` across the block of threads is
278
+ //! ``{1, 1, 1, 1, ...}``. The corresponding output ``thread_data`` in each of the four warps
279
+ //! of threads will be ``1, 2, 3, ..., 32}``. Furthermore, ``warp_aggregate`` for all threads
280
+ //! in all warps will be ``32``.
281
+ //! @endrst
282
+ //!
283
+ //! @param[in] input
284
+ //! Calling thread's input item
285
+ //!
286
+ //! @param[out] inclusive_output
287
+ //! Calling thread's output item. May be aliased with `input`
288
+ //!
289
+ //! @param[out] warp_aggregate
290
+ //! Warp-wide aggregate reduction of input items
291
+ _CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveSum(T input, T& inclusive_output, T& warp_aggregate)
292
+ {
293
+ InclusiveScan(input, inclusive_output, ::cuda::std::plus<>{}, warp_aggregate);
294
+ }
295
+
296
+ //! @} end member group
297
+ //! @name Exclusive prefix sums
298
+ //! @{
299
+
300
+ //! @rst
301
+ //! Computes an exclusive prefix sum across the calling warp. The value of 0 is applied as the
302
+ //! initial value, and is assigned to ``exclusive_output`` in *lane*\ :sub:`0`.
303
+ //!
304
+ //! * @identityzero
305
+ //! * @smemwarpreuse
306
+ //!
307
+ //! Snippet
308
+ //! +++++++
309
+ //!
310
+ //! The code snippet below illustrates four concurrent warp-wide exclusive prefix sums within a
311
+ //! block of 128 threads (one per each of the 32-thread warps).
312
+ //!
313
+ //! .. code-block:: c++
314
+ //!
315
+ //! #include <cub/cub.cuh>
316
+ //!
317
+ //! __global__ void ExampleKernel(...)
318
+ //! {
319
+ //! // Specialize WarpScan for type int
320
+ //! using WarpScan = cub::WarpScan<int>;
321
+ //!
322
+ //! // Allocate WarpScan shared memory for 4 warps
323
+ //! __shared__ typename WarpScan::TempStorage temp_storage[4];
324
+ //!
325
+ //! // Obtain one input item per thread
326
+ //! int thread_data = ...
327
+ //!
328
+ //! // Compute exclusive warp-wide prefix sums
329
+ //! int warp_id = threadIdx.x / 32;
330
+ //! WarpScan(temp_storage[warp_id]).ExclusiveSum(thread_data, thread_data);
331
+ //! }
332
+ //!
333
+ //! Suppose the set of input ``thread_data`` across the block of threads is
334
+ //! ``{1, 1, 1, 1, ...}``. The corresponding output ``thread_data`` in each of the four warps
335
+ //! of threads will be ``0, 1, 2, ..., 31}``.
336
+ //! @endrst
337
+ //!
338
+ //! @param[in] input
339
+ //! Calling thread's input item.
340
+ //!
341
+ //! @param[out] exclusive_output
342
+ //! Calling thread's output item. May be aliased with `input`.
343
+ _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveSum(T input, T& exclusive_output)
344
+ {
345
+ T initial_value{};
346
+ ExclusiveScan(input, exclusive_output, initial_value, ::cuda::std::plus<>{});
347
+ }
348
+
349
+ //! @rst
350
+ //! Computes an exclusive prefix sum across the calling warp. The value of 0 is applied as the
351
+ //! initial value, and is assigned to ``exclusive_output`` in *lane*\ :sub:`0`.
352
+ //! Also provides every thread with the warp-wide ``warp_aggregate`` of all inputs.
353
+ //!
354
+ //! * @identityzero
355
+ //! * @smemwarpreuse
356
+ //!
357
+ //! Snippet
358
+ //! +++++++
359
+ //!
360
+ //! The code snippet below illustrates four concurrent warp-wide exclusive prefix sums within a
361
+ //! block of 128 threads (one per each of the 32-thread warps).
362
+ //!
363
+ //! .. code-block:: c++
364
+ //!
365
+ //! #include <cub/cub.cuh>
366
+ //!
367
+ //! __global__ void ExampleKernel(...)
368
+ //! {
369
+ //! // Specialize WarpScan for type int
370
+ //! using WarpScan = cub::WarpScan<int>;
371
+ //!
372
+ //! // Allocate WarpScan shared memory for 4 warps
373
+ //! __shared__ typename WarpScan::TempStorage temp_storage[4];
374
+ //!
375
+ //! // Obtain one input item per thread
376
+ //! int thread_data = ...
377
+ //!
378
+ //! // Compute exclusive warp-wide prefix sums
379
+ //! int warp_aggregate;
380
+ //! int warp_id = threadIdx.x / 32;
381
+ //! WarpScan(temp_storage[warp_id]).ExclusiveSum(thread_data,
382
+ //! thread_data,
383
+ //! warp_aggregate);
384
+ //!
385
+ //! Suppose the set of input ``thread_data`` across the block of threads is
386
+ //! ``{1, 1, 1, 1, ...}``. The corresponding output ``thread_data`` in each of the four warps
387
+ //! of threads will be ``0, 1, 2, ..., 31}``. Furthermore, ``warp_aggregate`` for all threads
388
+ //! in all warps will be ``32``.
389
+ //! @endrst
390
+ //!
391
+ //!
392
+ //! @param[in] input
393
+ //! Calling thread's input item
394
+ //!
395
+ //! @param[out] exclusive_output
396
+ //! Calling thread's output item. May be aliased with `input`
397
+ //!
398
+ //! @param[out] warp_aggregate
399
+ //! Warp-wide aggregate reduction of input items
400
+ _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveSum(T input, T& exclusive_output, T& warp_aggregate)
401
+ {
402
+ T initial_value{};
403
+ ExclusiveScan(input, exclusive_output, initial_value, ::cuda::std::plus<>{}, warp_aggregate);
404
+ }
405
+
406
+ //! @} end member group
407
+ //! @name Inclusive prefix scans
408
+ //! @{
409
+
410
+ //! @rst
411
+ //! Computes an inclusive prefix scan using the specified binary scan functor across the
412
+ //! calling warp.
413
+ //!
414
+ //! * @smemwarpreuse
415
+ //!
416
+ //! Snippet
417
+ //! +++++++
418
+ //!
419
+ //! The code snippet below illustrates four concurrent warp-wide inclusive prefix max scans
420
+ //! within a block of 128 threads (one per each of the 32-thread warps).
421
+ //!
422
+ //! .. code-block:: c++
423
+ //!
424
+ //! #include <cub/cub.cuh>
425
+ //!
426
+ //! __global__ void ExampleKernel(...)
427
+ //! {
428
+ //! // Specialize WarpScan for type int
429
+ //! using WarpScan = cub::WarpScan<int>;
430
+ //!
431
+ //! // Allocate WarpScan shared memory for 4 warps
432
+ //! __shared__ typename WarpScan::TempStorage temp_storage[4];
433
+ //!
434
+ //! // Obtain one input item per thread
435
+ //! int thread_data = ...
436
+ //!
437
+ //! // Compute inclusive warp-wide prefix max scans
438
+ //! int warp_id = threadIdx.x / 32;
439
+ //! WarpScan(temp_storage[warp_id]).InclusiveScan(thread_data, thread_data, cuda::maximum<>{});
440
+ //!
441
+ //! Suppose the set of input ``thread_data`` across the block of threads is
442
+ //! ``{0, -1, 2, -3, ..., 126, -127}``. The corresponding output ``thread_data`` in the first
443
+ //! warp would be ``0, 0, 2, 2, ..., 30, 30``, the output for the second warp would be
444
+ //! ``32, 32, 34, 34, ..., 62, 62``, etc.
445
+ //! @endrst
446
+ //!
447
+ //! @tparam ScanOp
448
+ //! **[inferred]** Binary scan operator type having member
449
+ //! `T operator()(const T &a, const T &b)`
450
+ //!
451
+ //! @param[in] input
452
+ //! Calling thread's input item
453
+ //!
454
+ //! @param[out] inclusive_output
455
+ //! Calling thread's output item. May be aliased with `input`
456
+ //!
457
+ //! @param[in] scan_op
458
+ //! Binary scan operator
459
+ template <typename ScanOp>
460
+ _CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveScan(T input, T& inclusive_output, ScanOp scan_op)
461
+ {
462
+ InternalWarpScan(temp_storage).InclusiveScan(input, inclusive_output, scan_op);
463
+ }
464
+
465
+ //! @rst
466
+ //! Computes an inclusive prefix scan using the specified binary scan functor across the
467
+ //! calling warp.
468
+ //!
469
+ //! * @smemwarpreuse
470
+ //!
471
+ //! Snippet
472
+ //! +++++++
473
+ //!
474
+ //! The code snippet below illustrates four concurrent warp-wide inclusive prefix sum scans
475
+ //! within a block of 128 threads (one per each of the 32-thread warps).
476
+ //!
477
+ //! .. literalinclude:: ../../../cub/test/catch2_test_warp_scan_api.cu
478
+ //! :language: c++
479
+ //! :dedent:
480
+ //! :start-after: example-begin inclusive-warp-scan-init-value
481
+ //! :end-before: example-end inclusive-warp-scan-init-value
482
+ //!
483
+ //! Suppose the set of input ``thread_data`` in the first warp is
484
+ //! ``{0, 1, 2, 3, ..., 31}``, in the second warp is ``{1, 2, 3, 4, ..., 32}`` etc.
485
+ //! The corresponding output ``thread_data`` for a max operation in the first
486
+ //! warp would be ``{3, 3, 3, 3, ..., 31}``, the output for the second warp would be
487
+ //! ``{3, 3, 3, 4, ..., 32}``, etc.
488
+ //! @endrst
489
+ //!
490
+ //! @tparam ScanOp
491
+ //! **[inferred]** Binary scan operator type having member
492
+ //! `T operator()(const T &a, const T &b)`
493
+ //!
494
+ //! @param[in] input
495
+ //! Calling thread's input item
496
+ //!
497
+ //! @param[out] inclusive_output
498
+ //! Calling thread's output item. May be aliased with `input`
499
+ //!
500
+ //! @param[in] initial_value
501
+ //! Initial value to seed the inclusive scan (uniform across warp)
502
+ //!
503
+ //! @param[in] scan_op
504
+ //! Binary scan operator
505
+ template <typename ScanOp>
506
+ _CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveScan(T input, T& inclusive_output, T initial_value, ScanOp scan_op)
507
+ {
508
+ InternalWarpScan internal(temp_storage);
509
+
510
+ T exclusive_output;
511
+ internal.InclusiveScan(input, inclusive_output, scan_op);
512
+
513
+ internal.Update(
514
+ input, inclusive_output, exclusive_output, scan_op, initial_value, detail::bool_constant_v<IS_INTEGER>);
515
+ }
516
+
517
+ //! @rst
518
+ //! Computes an inclusive prefix scan using the specified binary scan functor across the
519
+ //! calling warp. Also provides every thread with the warp-wide ``warp_aggregate`` of
520
+ //! all inputs.
521
+ //!
522
+ //! * @smemwarpreuse
523
+ //!
524
+ //! Snippet
525
+ //! +++++++
526
+ //!
527
+ //! The code snippet below illustrates four concurrent warp-wide inclusive prefix max scans
528
+ //! within a block of 128 threads (one per each of the 32-thread warps).
529
+ //!
530
+ //! .. code-block:: c++
531
+ //!
532
+ //! #include <cub/cub.cuh>
533
+ //!
534
+ //! __global__ void ExampleKernel(...)
535
+ //! {
536
+ //! // Specialize WarpScan for type int
537
+ //! using WarpScan = cub::WarpScan<int>;
538
+ //!
539
+ //! // Allocate WarpScan shared memory for 4 warps
540
+ //! __shared__ typename WarpScan::TempStorage temp_storage[4];
541
+ //!
542
+ //! // Obtain one input item per thread
543
+ //! int thread_data = ...
544
+ //!
545
+ //! // Compute inclusive warp-wide prefix max scans
546
+ //! int warp_aggregate;
547
+ //! int warp_id = threadIdx.x / 32;
548
+ //! WarpScan(temp_storage[warp_id]).InclusiveScan(
549
+ //! thread_data, thread_data, cuda::maximum<>{}, warp_aggregate);
550
+ //!
551
+ //! Suppose the set of input ``thread_data`` across the block of threads is
552
+ //! ``{0, -1, 2, -3, ..., 126, -127}``. The corresponding output ``thread_data`` in the first
553
+ //! warp would be ``0, 0, 2, 2, ..., 30, 30``, the output for the second warp would be
554
+ //! ``32, 32, 34, 34, ..., 62, 62``, etc. Furthermore, ``warp_aggregate`` would be assigned
555
+ //! ``30`` for threads in the first warp, ``62`` for threads in the second warp, etc.
556
+ //! @endrst
557
+ //!
558
+ //! @tparam ScanOp
559
+ //! **[inferred]** Binary scan operator type having member
560
+ //! `T operator()(const T &a, const T &b)`
561
+ //!
562
+ //! @param[in] input
563
+ //! Calling thread's input item
564
+ //!
565
+ //! @param[out] inclusive_output
566
+ //! Calling thread's output item. May be aliased with ``input``
567
+ //!
568
+ //! @param[in] scan_op
569
+ //! Binary scan operator
570
+ //!
571
+ //! @param[out] warp_aggregate
572
+ //! Warp-wide aggregate reduction of input items.
573
+ template <typename ScanOp>
574
+ _CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveScan(T input, T& inclusive_output, ScanOp scan_op, T& warp_aggregate)
575
+ {
576
+ InternalWarpScan(temp_storage).InclusiveScan(input, inclusive_output, scan_op, warp_aggregate);
577
+ }
578
+
579
+ //! @rst
580
+ //! Computes an inclusive prefix scan using the specified binary scan functor across the
581
+ //! calling warp. Also provides every thread with the warp-wide ``warp_aggregate`` of
582
+ //! all inputs.
583
+ //!
584
+ //! * @smemwarpreuse
585
+ //!
586
+ //! Snippet
587
+ //! +++++++
588
+ //!
589
+ //! The code snippet below illustrates four concurrent warp-wide inclusive prefix max scans
590
+ //! within a block of 128 threads (one scan per warp).
591
+ //!
592
+ //! .. literalinclude:: ../../../cub/test/catch2_test_warp_scan_api.cu
593
+ //! :language: c++
594
+ //! :dedent:
595
+ //! :start-after: example-begin inclusive-warp-scan-init-value-aggregate
596
+ //! :end-before: example-end inclusive-warp-scan-init-value-aggregate
597
+ //!
598
+ //! Suppose the set of input ``thread_data`` across the block of threads is
599
+ //! ``{1, 1, 1, 1, ..., 1}``. For initial value equal to 3, the corresponding output
600
+ //! ``thread_data`` for a sum operation in the first warp would be
601
+ //! ``{4, 5, 6, 7, ..., 35}``, the output for the second warp would be
602
+ //! ``{4, 5, 6, 7, ..., 35}``, etc. Furthermore, ``warp_aggregate`` would be assigned
603
+ //! ``32`` for threads in each warp.
604
+ //! @endrst
605
+ //!
606
+ //! @tparam ScanOp
607
+ //! **[inferred]** Binary scan operator type having member
608
+ //! `T operator()(const T &a, const T &b)`
609
+ //!
610
+ //! @param[in] input
611
+ //! Calling thread's input item
612
+ //!
613
+ //! @param[out] inclusive_output
614
+ //! Calling thread's output item. May be aliased with ``input``
615
+ //!
616
+ //! @param[in] initial_value
617
+ //! Initial value to seed the inclusive scan (uniform across warp). It is not taken
618
+ //! into account for warp_aggregate.
619
+ //!
620
+ //! @param[in] scan_op
621
+ //! Binary scan operator
622
+ //!
623
+ //! @param[out] warp_aggregate
624
+ //! Warp-wide aggregate reduction of input items.
625
+ template <typename ScanOp>
626
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
627
+ InclusiveScan(T input, T& inclusive_output, T initial_value, ScanOp scan_op, T& warp_aggregate)
628
+ {
629
+ InternalWarpScan internal(temp_storage);
630
+
631
+ // Perform the inclusive scan operation
632
+ internal.InclusiveScan(input, inclusive_output, scan_op);
633
+
634
+ // Update the inclusive_output and warp_aggregate using the Update function
635
+ T exclusive_output;
636
+ internal.Update(
637
+ input,
638
+ inclusive_output,
639
+ exclusive_output,
640
+ warp_aggregate,
641
+ scan_op,
642
+ initial_value,
643
+ detail::bool_constant_v<IS_INTEGER>);
644
+ }
645
+
646
+ #ifndef _CCCL_DOXYGEN_INVOKED // Do not document partial inclusive scans
647
+ //! @rst
648
+ //! Computes an inclusive prefix scan using the specified binary scan functor across the
649
+ //! calling warp. But only the first ``valid_items`` elements (corresponding to warp lanes) are
650
+ //! used in the calculation. The leftover invalid elements are never passed to the binary scan functor.
651
+
652
+ //!
653
+ //! * @smemwarpreuse
654
+ //!
655
+ //! Snippet
656
+ //! +++++++
657
+ //!
658
+ //! The code snippet below illustrates four concurrent warp-wide inclusive prefix max scans
659
+ //! within a block of 128 threads (one per each of the 32-thread warps).
660
+ //!
661
+ //! .. code-block:: c++
662
+ //!
663
+ //! #include <cub/cub.cuh>
664
+ //!
665
+ //! __global__ void ExampleKernel(...)
666
+ //! {
667
+ //! // Specialize WarpScan for type int
668
+ //! using WarpScan = cub::WarpScan<int>;
669
+ //!
670
+ //! // Allocate WarpScan shared memory for 4 warps
671
+ //! __shared__ typename WarpScan::TempStorage temp_storage[4];
672
+ //!
673
+ //! // Obtain one input item per thread
674
+ //! int thread_data = ...
675
+ //! int warp_id = threadIdx.x / 32;
676
+ //! int block_valid_items = 35;
677
+ //! int warp_valid_items = block_valid_items - warp_id * 32;
678
+ //!
679
+ //! // Compute inclusive warp-wide prefix max scans
680
+ //! WarpScan(temp_storage[warp_id]).InclusiveScanPartial(
681
+ //! thread_data, thread_data, cuda::maximum<>{}, warp_valid_items);
682
+ //!
683
+ //! Suppose the set of input ``thread_data`` across the block of threads is
684
+ //! ``{0, -1, 2, -3, ..., 126, -127}``. The corresponding output ``thread_data`` in the first
685
+ //! warp would be ``0, 0, 2, 2, ..., 30, 30``, the output for the second warp would be
686
+ //! ``32, 32, 34, -35, ..., 62, -63`` and the output in the third and fourth warps would remain unmodified.
687
+ //! @endrst
688
+ //!
689
+ //! @tparam ScanOp
690
+ //! **[inferred]** Binary scan operator type having member
691
+ //! `T operator()(const T &a, const T &b)`
692
+ //!
693
+ //! @param[in] input
694
+ //! Calling thread's input item
695
+ //!
696
+ //! @param[out] inclusive_output
697
+ //! Calling thread's output item. May be aliased with `input`
698
+ //!
699
+ //! @param[in] scan_op
700
+ //! Binary scan operator
701
+ //!
702
+ //! @param[in] valid_items
703
+ //! Number of valid items in warp
704
+ template <typename ScanOp>
705
+ _CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveScanPartial(T input, T& inclusive_output, ScanOp scan_op, int valid_items)
706
+ {
707
+ InternalWarpScan(temp_storage).InclusiveScanPartial(input, inclusive_output, scan_op, valid_items);
708
+ }
709
+
710
+ //! @rst
711
+ //! Computes an inclusive prefix scan using the specified binary scan functor across the
712
+ //! calling warp. But only the first ``valid_items`` elements (corresponding to warp lanes) are
713
+ //! used in the calculation. The leftover invalid elements are never passed to the binary scan functor.
714
+
715
+ //!
716
+ //! * @smemwarpreuse
717
+ //!
718
+ //! Snippet
719
+ //! +++++++
720
+ //!
721
+ //! The code snippet below illustrates four concurrent warp-wide inclusive prefix sum scans
722
+ //! within a block of 128 threads (one per each of the 32-thread warps).
723
+ //!
724
+ //! .. literalinclude:: ../../../cub/test/catch2_test_warp_scan_partial_api.cu
725
+ //! :language: c++
726
+ //! :dedent:
727
+ //! :start-after: example-begin inclusive-warp-scan-init-value-partial
728
+ //! :end-before: example-end inclusive-warp-scan-init-value-partial
729
+ //!
730
+ //! Suppose the set of input ``thread_data`` in the first warp is
731
+ //! ``{0, -1, 2, -3, ..., 28, -29, 30, -31}``, in the second warp is ``{1, -2, 3, -4, ..., 29, -30, 31, -32}`` etc.
732
+ //! The corresponding output ``thread_data`` for a max operation in the first
733
+ //! warp would be ``{3, 3, 3, 3, ..., 28, 28, 30, 30}``, the output for the second warp would be
734
+ //! ``{3, 3, 3, 3, ..., 29, 29, 31, -32}``, etc.
735
+ //! @endrst
736
+ //!
737
+ //! @tparam ScanOp
738
+ //! **[inferred]** Binary scan operator type having member
739
+ //! `T operator()(const T &a, const T &b)`
740
+ //!
741
+ //! @param[in] input
742
+ //! Calling thread's input item
743
+ //!
744
+ //! @param[out] inclusive_output
745
+ //! Calling thread's output item. May be aliased with `input`
746
+ //!
747
+ //! @param[in] initial_value
748
+ //! Initial value to seed the inclusive scan (uniform across warp)
749
+ //!
750
+ //! @param[in] scan_op
751
+ //! Binary scan operator
752
+ //!
753
+ //! @param[in] valid_items
754
+ //! Number of valid items in warp
755
+ template <typename ScanOp>
756
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
757
+ InclusiveScanPartial(T input, T& inclusive_output, T initial_value, ScanOp scan_op, int valid_items)
758
+ {
759
+ InternalWarpScan internal(temp_storage);
760
+
761
+ T exclusive_output;
762
+ internal.InclusiveScanPartial(input, inclusive_output, scan_op, valid_items);
763
+
764
+ internal.UpdatePartial(input, inclusive_output, exclusive_output, scan_op, valid_items, initial_value);
765
+ }
766
+
767
+ //! @rst
768
+ //! Computes an inclusive prefix scan using the specified binary scan functor across the
769
+ //! calling warp. But only the first ``valid_items`` elements (corresponding to warp lanes) are
770
+ //! used in the calculation. The leftover invalid elements are never passed to the binary scan functor.
771
+ //! Also provides every thread with the warp-wide ``warp_aggregate`` of all valid inputs. If there are no valid
772
+ //! inputs, the aggregate is undefined.
773
+
774
+ //!
775
+ //! * @smemwarpreuse
776
+ //!
777
+ //! Snippet
778
+ //! +++++++
779
+ //!
780
+ //! The code snippet below illustrates four concurrent warp-wide inclusive prefix max scans
781
+ //! within a block of 128 threads (one per each of the 32-thread warps).
782
+ //!
783
+ //! .. code-block:: c++
784
+ //!
785
+ //! #include <cub/cub.cuh>
786
+ //!
787
+ //! __global__ void ExampleKernel(...)
788
+ //! {
789
+ //! // Specialize WarpScan for type int
790
+ //! using WarpScan = cub::WarpScan<int>;
791
+ //!
792
+ //! // Allocate WarpScan shared memory for 4 warps
793
+ //! __shared__ typename WarpScan::TempStorage temp_storage[4];
794
+ //!
795
+ //! // Obtain one input item per thread
796
+ //! int thread_data = ...
797
+ //! int warp_id = threadIdx.x / 32;
798
+ //! int block_valid_items = 35;
799
+ //! int warp_valid_items = block_valid_items - warp_id * 32;
800
+ //!
801
+ //! // Compute inclusive warp-wide prefix max scans
802
+ //! int warp_aggregate;
803
+ //! WarpScan(temp_storage[warp_id]).InclusiveScan(
804
+ //! thread_data, thread_data, cuda::maximum<>{}, warp_valid_items, warp_aggregate);
805
+ //!
806
+ //! Suppose the set of input ``thread_data`` across the block of threads is
807
+ //! ``{0, -1, 2, -3, ..., 126, -127}``. The corresponding output ``thread_data`` in the first
808
+ //! warp would be ``0, 0, 2, 2, ..., 30, 30``, the output for the second warp would be
809
+ //! ``32, 32, 34, -35, ..., 62, -63`` and the output in the third and fourth warps would remain
810
+ //! unmodified. Furthermore, ``warp_aggregate`` would be assigned ``30`` for threads in
811
+ //! the first warp, ``34`` for threads in the second warp, and undefined for the third and
812
+ //! fourth warps.
813
+ //! @endrst
814
+ //!
815
+ //! @tparam ScanOp
816
+ //! **[inferred]** Binary scan operator type having member
817
+ //! `T operator()(const T &a, const T &b)`
818
+ //!
819
+ //! @param[in] input
820
+ //! Calling thread's input item
821
+ //!
822
+ //! @param[out] inclusive_output
823
+ //! Calling thread's output item. May be aliased with ``input``
824
+ //!
825
+ //! @param[in] scan_op
826
+ //! Binary scan operator
827
+ //!
828
+ //! @param[in] valid_items
829
+ //! Number of valid items in warp
830
+ //!
831
+ //! @param[out] warp_aggregate
832
+ //! Warp-wide aggregate reduction of input items.
833
+ template <typename ScanOp>
834
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
835
+ InclusiveScanPartial(T input, T& inclusive_output, ScanOp scan_op, int valid_items, T& warp_aggregate)
836
+ {
837
+ InternalWarpScan(temp_storage).InclusiveScanPartial(input, inclusive_output, scan_op, valid_items, warp_aggregate);
838
+ }
839
+
840
+ //! @rst
841
+ //! Computes an inclusive prefix scan using the specified binary scan functor across the
842
+ //! calling warp. But only the first ``valid_items`` elements (corresponding to warp lanes) are
843
+ //! used in the calculation. The leftover invalid elements are never passed to the binary scan functor.
844
+ //! Also provides every thread with the warp-wide ``warp_aggregate`` of all valid inputs. If there are no valid
845
+ //! inputs, the aggregate is undefined.
846
+
847
+ //!
848
+ //! * @smemwarpreuse
849
+ //!
850
+ //! Snippet
851
+ //! +++++++
852
+ //!
853
+ //! The code snippet below illustrates four concurrent warp-wide inclusive prefix max scans
854
+ //! within a block of 128 threads (one scan per warp).
855
+ //!
856
+ //! .. literalinclude:: ../../../cub/test/catch2_test_warp_scan_api.cu
857
+ //! :language: c++
858
+ //! :dedent:
859
+ //! :start-after: example-begin inclusive-warp-scan-init-value-aggregate-partial
860
+ //! :end-before: example-end inclusive-warp-scan-init-value-aggregate-partial
861
+ //!
862
+ //! Suppose the set of input ``thread_data`` in the first warp is
863
+ //! ``{0, 0, 0, 1, ..., 1}``, in the second warp is ``{0, 0, 1, ..., 1}`` etc.
864
+ //! For initial value equal to 3, the corresponding output
865
+ //! ``thread_data`` for a sum operation in the first warp would be
866
+ //! ``{3, 3, 3, 4, ..., 29, 30, 31, 32}``, the output for the second warp would be
867
+ //! ``{3, 3, 4, 5, ..., 30, 31, 32, 1}``, etc. Furthermore, ``warp_aggregate`` would be assigned
868
+ //! ``29`` for threads in the first warp, ``30`` for the threads in the second warp, etc.
869
+ //! @endrst
870
+ //!
871
+ //! @tparam ScanOp
872
+ //! **[inferred]** Binary scan operator type having member
873
+ //! `T operator()(const T &a, const T &b)`
874
+ //!
875
+ //! @param[in] input
876
+ //! Calling thread's input item
877
+ //!
878
+ //! @param[out] inclusive_output
879
+ //! Calling thread's output item. May be aliased with ``input``
880
+ //!
881
+ //! @param[in] initial_value
882
+ //! Initial value to seed the inclusive scan (uniform across warp). It is not taken
883
+ //! into account for warp_aggregate.
884
+ //!
885
+ //! @param[in] scan_op
886
+ //! Binary scan operator
887
+ //!
888
+ //! @param[in] valid_items
889
+ //! Number of valid items in warp
890
+ //!
891
+ //! @param[out] warp_aggregate
892
+ //! Warp-wide aggregate reduction of input items.
893
+ template <typename ScanOp>
894
+ _CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveScanPartial(
895
+ T input, T& inclusive_output, T initial_value, ScanOp scan_op, int valid_items, T& warp_aggregate)
896
+ {
897
+ InternalWarpScan internal(temp_storage);
898
+
899
+ // Perform the inclusive scan operation
900
+ internal.InclusiveScanPartial(input, inclusive_output, scan_op, valid_items);
901
+
902
+ // Update the inclusive_output and warp_aggregate using the Update function
903
+ T exclusive_output;
904
+ internal.UpdatePartial(
905
+ input, inclusive_output, exclusive_output, warp_aggregate, scan_op, valid_items, initial_value);
906
+ }
907
+
908
+ #endif // _CCCL_DOXYGEN_INVOKED // Do not document partial inclusive scans
909
+
910
+ //! @} end member group
911
+ //! @name Exclusive prefix scans
912
+ //! @{
913
+
914
+ //! @rst
915
+ //! Computes an exclusive prefix scan using the specified binary scan functor across the
916
+ //! calling warp. Because no initial value is supplied, the ``output`` computed for
917
+ //! *lane*\ :sub:`0` is undefined.
918
+ //!
919
+ //! * @smemwarpreuse
920
+ //!
921
+ //! Snippet
922
+ //! +++++++
923
+ //!
924
+ //! The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans
925
+ //! within a block of 128 threads (one per each of the 32-thread warps).
926
+ //!
927
+ //! .. code-block:: c++
928
+ //!
929
+ //! #include <cub/cub.cuh>
930
+ //!
931
+ //! __global__ void ExampleKernel(...)
932
+ //! {
933
+ //! // Specialize WarpScan for type int
934
+ //! using WarpScan = cub::WarpScan<int>;
935
+ //!
936
+ //! // Allocate WarpScan shared memory for 4 warps
937
+ //! __shared__ typename WarpScan::TempStorage temp_storage[4];
938
+ //!
939
+ //! // Obtain one input item per thread
940
+ //! int thread_data = ...
941
+ //!
942
+ //! // Compute exclusive warp-wide prefix max scans
943
+ //! int warp_id = threadIdx.x / 32;
944
+ //! WarpScan(temp_storage[warp_id]).ExclusiveScan(thread_data, thread_data, cuda::maximum<>{});
945
+ //!
946
+ //! Suppose the set of input ``thread_data`` across the block of threads is
947
+ //! ``{0, -1, 2, -3, ..., 126, -127}``. The corresponding output ``thread_data`` in the first
948
+ //! warp would be ``?, 0, 0, 2, ..., 28, 30``, the output for the second warp would be
949
+ //! ``?, 32, 32, 34, ..., 60, 62``, etc.
950
+ //! (The output ``thread_data`` in warp *lane*\ :sub:`0` is undefined.)
951
+ //! @endrst
952
+ //!
953
+ //! @tparam ScanOp
954
+ //! **[inferred]** Binary scan operator type having member
955
+ //! `T operator()(const T &a, const T &b)`
956
+ //!
957
+ //! @param[in] input
958
+ //! Calling thread's input item
959
+ //!
960
+ //! @param[out] exclusive_output
961
+ //! Calling thread's output item. May be aliased with `input`
962
+ //!
963
+ //! @param[in] scan_op
964
+ //! Binary scan operator
965
+ template <typename ScanOp>
966
+ _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveScan(T input, T& exclusive_output, ScanOp scan_op)
967
+ {
968
+ InternalWarpScan internal(temp_storage);
969
+
970
+ T inclusive_output;
971
+ internal.InclusiveScan(input, inclusive_output, scan_op);
972
+
973
+ internal.Update(input, inclusive_output, exclusive_output, scan_op, detail::bool_constant_v<IS_INTEGER>);
974
+ }
975
+
976
+ //! @rst
977
+ //! Computes an exclusive prefix scan using the specified binary scan functor across the
978
+ //! calling warp.
979
+ //!
980
+ //! * @smemwarpreuse
981
+ //!
982
+ //! Snippet
983
+ //! +++++++
984
+ //!
985
+ //! The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans
986
+ //! within a block of 128 threads (one per each of the 32-thread warps).
987
+ //!
988
+ //! .. code-block:: c++
989
+ //!
990
+ //! #include <cub/cub.cuh>
991
+ //!
992
+ //! __global__ void ExampleKernel(...)
993
+ //! {
994
+ //! // Specialize WarpScan for type int
995
+ //! using WarpScan = cub::WarpScan<int>;
996
+ //!
997
+ //! // Allocate WarpScan shared memory for 4 warps
998
+ //! __shared__ typename WarpScan::TempStorage temp_storage[4];
999
+ //!
1000
+ //! // Obtain one input item per thread
1001
+ //! int thread_data = ...
1002
+ //!
1003
+ //! // Compute exclusive warp-wide prefix max scans
1004
+ //! int warp_id = threadIdx.x / 32;
1005
+ //! WarpScan(temp_storage[warp_id]).ExclusiveScan(thread_data,
1006
+ //! thread_data,
1007
+ //! INT_MIN,
1008
+ //! cuda::maximum<>{});
1009
+ //!
1010
+ //! Suppose the set of input ``thread_data`` across the block of threads is
1011
+ //! ``{0, -1, 2, -3, ..., 126, -127}``. The corresponding output ``thread_data`` in the first
1012
+ //! warp would be ``INT_MIN, 0, 0, 2, ..., 28, 30``, the output for the second warp would be
1013
+ //! ``30, 32, 32, 34, ..., 60, 62``, etc.
1014
+ //! @endrst
1015
+ //!
1016
+ //! @tparam ScanOp
1017
+ //! **[inferred]** Binary scan operator type having member
1018
+ //! `T operator()(const T &a, const T &b)`
1019
+ //!
1020
+ //! @param[in] input
1021
+ //! Calling thread's input item
1022
+ //!
1023
+ //! @param[out] exclusive_output
1024
+ //! Calling thread's output item. May be aliased with `input`
1025
+ //!
1026
+ //! @param[in] initial_value
1027
+ //! Initial value to seed the exclusive scan
1028
+ //!
1029
+ //! @param[in] scan_op
1030
+ //! Binary scan operator
1031
+ template <typename ScanOp>
1032
+ _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveScan(T input, T& exclusive_output, T initial_value, ScanOp scan_op)
1033
+ {
1034
+ InternalWarpScan internal(temp_storage);
1035
+
1036
+ T inclusive_output;
1037
+ internal.InclusiveScan(input, inclusive_output, scan_op);
1038
+
1039
+ internal.Update(
1040
+ input, inclusive_output, exclusive_output, scan_op, initial_value, detail::bool_constant_v<IS_INTEGER>);
1041
+ }
1042
+
1043
+ //! @rst
1044
+ //! Computes an exclusive prefix scan using the specified binary scan functor across the
1045
+ //! calling warp. Because no initial value is supplied, the ``output`` computed for
1046
+ //! *lane*\ :sub:`0` is undefined. Also provides every thread with the warp-wide
1047
+ //! ``warp_aggregate`` of all inputs.
1048
+ //!
1049
+ //! * @smemwarpreuse
1050
+ //!
1051
+ //! Snippet
1052
+ //! +++++++
1053
+ //!
1054
+ //! The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans
1055
+ //! within a block of 128 threads (one per each of the 32-thread warps).
1056
+ //!
1057
+ //! .. code-block:: c++
1058
+ //!
1059
+ //! #include <cub/cub.cuh>
1060
+ //!
1061
+ //! __global__ void ExampleKernel(...)
1062
+ //! {
1063
+ //! // Specialize WarpScan for type int
1064
+ //! using WarpScan = cub::WarpScan<int>;
1065
+ //!
1066
+ //! // Allocate WarpScan shared memory for 4 warps
1067
+ //! __shared__ typename WarpScan::TempStorage temp_storage[4];
1068
+ //!
1069
+ //! // Obtain one input item per thread
1070
+ //! int thread_data = ...
1071
+ //!
1072
+ //! // Compute exclusive warp-wide prefix max scans
1073
+ //! int warp_aggregate;
1074
+ //! int warp_id = threadIdx.x / 32;
1075
+ //! WarpScan(temp_storage[warp_id]).ExclusiveScan(thread_data,
1076
+ //! thread_data,
1077
+ //! cuda::maximum<>{},
1078
+ //! warp_aggregate);
1079
+ //!
1080
+ //! Suppose the set of input ``thread_data`` across the block of threads is
1081
+ //! ``{0, -1, 2, -3, ..., 126, -127}``. The corresponding output ``thread_data`` in the first
1082
+ //! warp would be ``?, 0, 0, 2, ..., 28, 30``, the output for the second warp would be
1083
+ //! ``?, 32, 32, 34, ..., 60, 62``, etc. (The output ``thread_data`` in warp *lane*\ :sub:`0`
1084
+ //! is undefined). Furthermore, ``warp_aggregate`` would be assigned ``30`` for threads in the
1085
+ //! first warp, \p 62 for threads in the second warp, etc.
1086
+ //! @endrst
1087
+ //!
1088
+ //! @tparam ScanOp
1089
+ //! **[inferred]** Binary scan operator type having member
1090
+ //! `T operator()(const T &a, const T &b)`
1091
+ //!
1092
+ //! @param[in] input
1093
+ //! Calling thread's input item
1094
+ //!
1095
+ //! @param[out] exclusive_output
1096
+ //! Calling thread's output item. May be aliased with `input`
1097
+ //!
1098
+ //! @param[in] scan_op
1099
+ //! Binary scan operator
1100
+ //!
1101
+ //! @param[out] warp_aggregate
1102
+ //! Warp-wide aggregate reduction of input items
1103
+ template <typename ScanOp>
1104
+ _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveScan(T input, T& exclusive_output, ScanOp scan_op, T& warp_aggregate)
1105
+ {
1106
+ InternalWarpScan internal(temp_storage);
1107
+
1108
+ T inclusive_output;
1109
+ internal.InclusiveScan(input, inclusive_output, scan_op);
1110
+
1111
+ internal.Update(
1112
+ input, inclusive_output, exclusive_output, warp_aggregate, scan_op, detail::bool_constant_v<IS_INTEGER>);
1113
+ }
1114
+
1115
+ //! @rst
1116
+ //! Computes an exclusive prefix scan using the specified binary scan functor across the
1117
+ //! calling warp. Also provides every thread with the warp-wide ``warp_aggregate`` of
1118
+ //! all inputs.
1119
+ //!
1120
+ //! * @smemwarpreuse
1121
+ //!
1122
+ //! Snippet
1123
+ //! +++++++
1124
+ //!
1125
+ //! The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans
1126
+ //! within a block of 128 threads (one per each of the 32-thread warps).
1127
+ //!
1128
+ //! .. code-block:: c++
1129
+ //!
1130
+ //! #include <cub/cub.cuh>
1131
+ //!
1132
+ //! __global__ void ExampleKernel(...)
1133
+ //! {
1134
+ //! // Specialize WarpScan for type int
1135
+ //! using WarpScan = cub::WarpScan<int>;
1136
+ //!
1137
+ //! // Allocate WarpScan shared memory for 4 warps
1138
+ //! __shared__ typename WarpScan::TempStorage temp_storage[4];
1139
+ //!
1140
+ //! // Obtain one input item per thread
1141
+ //! int thread_data = ...
1142
+ //!
1143
+ //! // Compute exclusive warp-wide prefix max scans
1144
+ //! int warp_aggregate;
1145
+ //! int warp_id = threadIdx.x / 32;
1146
+ //! WarpScan(temp_storage[warp_id]).ExclusiveScan(thread_data,
1147
+ //! thread_data,
1148
+ //! INT_MIN,
1149
+ //! cuda::maximum<>{},
1150
+ //! warp_aggregate);
1151
+ //!
1152
+ //! Suppose the set of input ``thread_data`` across the block of threads is
1153
+ //! ``{0, -1, 2, -3, ..., 126, -127}``. The corresponding output ``thread_data`` in the first
1154
+ //! warp would be ``INT_MIN, 0, 0, 2, ..., 28, 30``, the output for the second warp would be
1155
+ //! ``INT_MIN, 32, 32, 34, ..., 60, 62``, etc. Furthermore, ``warp_aggregate`` would be assigned
1156
+ //! ``30`` for threads in the first warp, ``62`` for threads in the second warp, etc.
1157
+ //! @endrst
1158
+ //!
1159
+ //! @tparam ScanOp
1160
+ //! **[inferred]** Binary scan operator type having member
1161
+ //! `T operator()(const T &a, const T &b)`
1162
+ //!
1163
+ //! @param[in] input
1164
+ //! Calling thread's input item
1165
+ //!
1166
+ //! @param[out] exclusive_output
1167
+ //! Calling thread's output item. May be aliased with `input`
1168
+ //!
1169
+ //! @param[in] initial_value
1170
+ //! Initial value to seed the exclusive scan
1171
+ //!
1172
+ //! @param[in] scan_op
1173
+ //! Binary scan operator
1174
+ //!
1175
+ //! @param[out] warp_aggregate
1176
+ //! Warp-wide aggregate reduction of input items
1177
+ //!
1178
+ template <typename ScanOp>
1179
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
1180
+ ExclusiveScan(T input, T& exclusive_output, T initial_value, ScanOp scan_op, T& warp_aggregate)
1181
+ {
1182
+ InternalWarpScan internal(temp_storage);
1183
+
1184
+ T inclusive_output;
1185
+ internal.InclusiveScan(input, inclusive_output, scan_op);
1186
+
1187
+ internal.Update(
1188
+ input,
1189
+ inclusive_output,
1190
+ exclusive_output,
1191
+ warp_aggregate,
1192
+ scan_op,
1193
+ initial_value,
1194
+ detail::bool_constant_v<IS_INTEGER>);
1195
+ }
1196
+
1197
+ #ifndef _CCCL_DOXYGEN_INVOKED // Do not document partial exclusive scans
1198
+ //! @rst
1199
+ //! Computes an exclusive prefix scan using the specified binary scan functor across the
1200
+ //! calling warp. But only the first ``valid_items`` elements (corresponding to warp lanes) are
1201
+ //! used in the calculation. The leftover invalid elements are never passed to the binary scan functor.
1202
+ //! Because no initial value is supplied, the ``output`` computed for
1203
+ //! *lane*\ :sub:`0` is undefined.
1204
+ //!
1205
+ //! * @smemwarpreuse
1206
+ //!
1207
+ //! Snippet
1208
+ //! +++++++
1209
+ //!
1210
+ //! The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans
1211
+ //! within a block of 128 threads (one per each of the 32-thread warps).
1212
+ //!
1213
+ //! .. code-block:: c++
1214
+ //!
1215
+ //! #include <cub/cub.cuh>
1216
+ //!
1217
+ //! __global__ void ExampleKernel(...)
1218
+ //! {
1219
+ //! // Specialize WarpScan for type int
1220
+ //! using WarpScan = cub::WarpScan<int>;
1221
+ //!
1222
+ //! // Allocate WarpScan shared memory for 4 warps
1223
+ //! __shared__ typename WarpScan::TempStorage temp_storage[4];
1224
+ //!
1225
+ //! // Obtain one input item per thread
1226
+ //! int thread_data = ...
1227
+ //! int warp_id = threadIdx.x / 32;
1228
+ //! int block_valid_items = 35;
1229
+ //! int warp_valid_items = block_valid_items - warp_id * 32;
1230
+ //!
1231
+ //! // Compute exclusive warp-wide prefix max scans
1232
+ //! WarpScan(temp_storage[warp_id]).ExclusiveScanPartial(
1233
+ //! thread_data, thread_data, cuda::maximum<>{}, warp_valid_items);
1234
+ //!
1235
+ //! Suppose the set of input ``thread_data`` across the block of threads is
1236
+ //! ``{0, -1, 2, -3, ..., 126, -127}``. The corresponding output ``thread_data`` in the first
1237
+ //! warp would be ``?, 0, 0, 2, ..., 28, 30``, the output for the second warp would be
1238
+ //! ``?, 32, 32, -35, ..., 62, -63`` and the output in the third and fourth warps would remain unmodified.
1239
+ //! (The output ``thread_data`` in warp *lane*\ :sub:`0` is undefined.)
1240
+ //! @endrst
1241
+ //!
1242
+ //! @tparam ScanOp
1243
+ //! **[inferred]** Binary scan operator type having member
1244
+ //! `T operator()(const T &a, const T &b)`
1245
+ //!
1246
+ //! @param[in] input
1247
+ //! Calling thread's input item
1248
+ //!
1249
+ //! @param[out] exclusive_output
1250
+ //! Calling thread's output item. May be aliased with `input`
1251
+ //!
1252
+ //! @param[in] scan_op
1253
+ //! Binary scan operator
1254
+ //!
1255
+ //! @param[in] valid_items
1256
+ //! Number of valid items in warp
1257
+ template <typename ScanOp>
1258
+ _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveScanPartial(T input, T& exclusive_output, ScanOp scan_op, int valid_items)
1259
+ {
1260
+ InternalWarpScan internal(temp_storage);
1261
+
1262
+ T inclusive_output;
1263
+ internal.InclusiveScanPartial(input, inclusive_output, scan_op, valid_items);
1264
+
1265
+ internal.UpdatePartial(input, inclusive_output, exclusive_output, scan_op, valid_items);
1266
+ }
1267
+
1268
+ //! @rst
1269
+ //! Computes an exclusive prefix scan using the specified binary scan functor across the
1270
+ //! calling warp. But only the first ``valid_items`` elements (corresponding to warp lanes) are
1271
+ //! used in the calculation. The leftover invalid elements are never passed to the binary scan functor.
1272
+
1273
+ //!
1274
+ //! * @smemwarpreuse
1275
+ //!
1276
+ //! Snippet
1277
+ //! +++++++
1278
+ //!
1279
+ //! The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans
1280
+ //! within a block of 128 threads (one per each of the 32-thread warps).
1281
+ //!
1282
+ //! .. code-block:: c++
1283
+ //!
1284
+ //! #include <cub/cub.cuh>
1285
+ //!
1286
+ //! __global__ void ExampleKernel(...)
1287
+ //! {
1288
+ //! // Specialize WarpScan for type int
1289
+ //! using WarpScan = cub::WarpScan<int>;
1290
+ //!
1291
+ //! // Allocate WarpScan shared memory for 4 warps
1292
+ //! __shared__ typename WarpScan::TempStorage temp_storage[4];
1293
+ //!
1294
+ //! // Obtain one input item per thread
1295
+ //! int thread_data = ...
1296
+ //! int warp_id = threadIdx.x / 32;
1297
+ //! int block_valid_items = 35;
1298
+ //! int warp_valid_items = block_valid_items - warp_id * 32;
1299
+ //!
1300
+ //! // Compute exclusive warp-wide prefix max scans
1301
+ //! WarpScan(temp_storage[warp_id]).ExclusiveScanPartial(
1302
+ //! thread_data, thread_data, INT_MIN, cuda::maximum<>{}, warp_valid_items);
1303
+ //!
1304
+ //! Suppose the set of input ``thread_data`` across the block of threads is
1305
+ //! ``{0, -1, 2, -3, ..., 126, -127}``. The corresponding output ``thread_data`` in the first
1306
+ //! warp would be ``INT_MIN, 0, 0, 2, ..., 28, 30``, the output for the second warp would be
1307
+ //! ``30, 32, 32, -35, ..., 62, -63`` and the output in the third and fourth warps would remain unmodified.
1308
+ //! @endrst
1309
+ //!
1310
+ //! @tparam ScanOp
1311
+ //! **[inferred]** Binary scan operator type having member
1312
+ //! `T operator()(const T &a, const T &b)`
1313
+ //!
1314
+ //! @param[in] input
1315
+ //! Calling thread's input item
1316
+ //!
1317
+ //! @param[out] exclusive_output
1318
+ //! Calling thread's output item. May be aliased with `input`
1319
+ //!
1320
+ //! @param[in] initial_value
1321
+ //! Initial value to seed the exclusive scan
1322
+ //!
1323
+ //! @param[in] scan_op
1324
+ //! Binary scan operator
1325
+ //!
1326
+ //! @param[in] valid_items
1327
+ //! Number of valid items in warp
1328
+ template <typename ScanOp>
1329
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
1330
+ ExclusiveScanPartial(T input, T& exclusive_output, T initial_value, ScanOp scan_op, int valid_items)
1331
+ {
1332
+ InternalWarpScan internal(temp_storage);
1333
+
1334
+ T inclusive_output;
1335
+ internal.InclusiveScanPartial(input, inclusive_output, scan_op, valid_items);
1336
+
1337
+ internal.UpdatePartial(input, inclusive_output, exclusive_output, scan_op, valid_items, initial_value);
1338
+ }
1339
+
1340
+ //! @rst
1341
+ //! Computes an exclusive prefix scan using the specified binary scan functor across the
1342
+ //! calling warp. But only the first ``valid_items`` elements (corresponding to warp lanes) are
1343
+ //! used in the calculation. The leftover invalid elements are never passed to the binary scan functor.
1344
+ //! Because no initial value is supplied, the ``output`` computed for *lane*\ :sub:`0` is undefined.
1345
+ //! Also provides every thread with the warp-wide ``warp_aggregate`` of all valid inputs. If there are no valid
1346
+ //! inputs, the aggregate is undefined.
1347
+ //!
1348
+ //! * @smemwarpreuse
1349
+ //!
1350
+ //! Snippet
1351
+ //! +++++++
1352
+ //!
1353
+ //! The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans
1354
+ //! within a block of 128 threads (one per each of the 32-thread warps).
1355
+ //!
1356
+ //! .. code-block:: c++
1357
+ //!
1358
+ //! #include <cub/cub.cuh>
1359
+ //!
1360
+ //! __global__ void ExampleKernel(...)
1361
+ //! {
1362
+ //! // Specialize WarpScan for type int
1363
+ //! using WarpScan = cub::WarpScan<int>;
1364
+ //!
1365
+ //! // Allocate WarpScan shared memory for 4 warps
1366
+ //! __shared__ typename WarpScan::TempStorage temp_storage[4];
1367
+ //!
1368
+ //! // Obtain one input item per thread
1369
+ //! int thread_data = ...
1370
+ //! int warp_id = threadIdx.x / 32;
1371
+ //! int block_valid_items = 35;
1372
+ //! int warp_valid_items = block_valid_items - warp_id * 32;
1373
+ //!
1374
+ //! // Compute exclusive warp-wide prefix max scans
1375
+ //! int warp_aggregate;
1376
+ //! WarpScan(temp_storage[warp_id]).ExclusiveScanPartial(
1377
+ //! thread_data, thread_data, cuda::maximum<>{}, warp_valid_items, warp_aggregate);
1378
+ //!
1379
+ //! Suppose the set of input ``thread_data`` across the block of threads is
1380
+ //! ``{0, -1, 2, -3, ..., 126, -127}``. The corresponding output ``thread_data`` in the first
1381
+ //! warp would be ``?, 0, 0, 2, ..., 28, 30``, the output for the second warp would be
1382
+ //! ``?, 32, 32, -35, ..., 62, -63``, and the output in the third and fourth warps would remain unmodified
1383
+ //! (The output ``thread_data`` in warp *lane*\ :sub:`0` is undefined). Furthermore, ``warp_aggregate``
1384
+ //! would be assigned ``30`` for threads in the first warp, ``34`` for threads in the second warp and
1385
+ //! undefined for the third and fourth warps.
1386
+ //! @endrst
1387
+ //!
1388
+ //! @tparam ScanOp
1389
+ //! **[inferred]** Binary scan operator type having member
1390
+ //! `T operator()(const T &a, const T &b)`
1391
+ //!
1392
+ //! @param[in] input
1393
+ //! Calling thread's input item
1394
+ //!
1395
+ //! @param[out] exclusive_output
1396
+ //! Calling thread's output item. May be aliased with `input`
1397
+ //!
1398
+ //! @param[in] scan_op
1399
+ //! Binary scan operator
1400
+ //!
1401
+ //! @param[in] valid_items
1402
+ //! Number of valid items in warp
1403
+ //!
1404
+ //! @param[out] warp_aggregate
1405
+ //! Warp-wide aggregate reduction of input items
1406
+ template <typename ScanOp>
1407
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
1408
+ ExclusiveScanPartial(T input, T& exclusive_output, ScanOp scan_op, int valid_items, T& warp_aggregate)
1409
+ {
1410
+ InternalWarpScan internal(temp_storage);
1411
+
1412
+ T inclusive_output;
1413
+ internal.InclusiveScanPartial(input, inclusive_output, scan_op, valid_items);
1414
+
1415
+ internal.UpdatePartial(input, inclusive_output, exclusive_output, warp_aggregate, scan_op, valid_items);
1416
+ }
1417
+
1418
+ //! @rst
1419
+ //! Computes an exclusive prefix scan using the specified binary scan functor across the
1420
+ //! calling warp. But only the first ``valid_items`` elements (corresponding to warp lanes) are
1421
+ //! used in the calculation. The leftover invalid elements are never passed to the binary scan functor.
1422
+ //! Also provides every thread with the warp-wide ``warp_aggregate`` of all valid inputs. If there are no valid
1423
+ //! inputs, the aggregate is undefined.
1424
+
1425
+ //!
1426
+ //! * @smemwarpreuse
1427
+ //!
1428
+ //! Snippet
1429
+ //! +++++++
1430
+ //!
1431
+ //! The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans
1432
+ //! within a block of 128 threads (one per each of the 32-thread warps).
1433
+ //!
1434
+ //! .. code-block:: c++
1435
+ //!
1436
+ //! #include <cub/cub.cuh>
1437
+ //!
1438
+ //! __global__ void ExampleKernel(...)
1439
+ //! {
1440
+ //! // Specialize WarpScan for type int
1441
+ //! using WarpScan = cub::WarpScan<int>;
1442
+ //!
1443
+ //! // Allocate WarpScan shared memory for 4 warps
1444
+ //! __shared__ typename WarpScan::TempStorage temp_storage[4];
1445
+ //!
1446
+ //! // Obtain one input item per thread
1447
+ //! int thread_data = ...
1448
+ //! int warp_id = threadIdx.x / 32;
1449
+ //! int block_valid_items = 35;
1450
+ //! int warp_valid_items = block_valid_items - warp_id * 32;
1451
+ //!
1452
+ //! // Compute exclusive warp-wide prefix max scans
1453
+ //! int warp_aggregate;
1454
+ //! WarpScan(temp_storage[warp_id]).ExclusiveScanPartial(
1455
+ //! thread_data, thread_data, INT_MIN, cuda::maximum<>{}, warp_valid_items, warp_aggregate);
1456
+ //!
1457
+ //! Suppose the set of input ``thread_data`` across the block of threads is
1458
+ //! ``{0, -1, 2, -3, ..., 126, -127}``. The corresponding output ``thread_data`` in the first
1459
+ //! warp would be ``INT_MIN, 0, 0, 2, ..., 28, 30``, the output for the second warp would be
1460
+ //! ``INT_MIN, 32, 32, -35, ..., 62, -63``, and the output in the third and fourth warps would
1461
+ //! remain unmodified. Furthermore, ``warp_aggregate`` would be assigned
1462
+ //! ``30`` for threads in the first warp, ``34`` for threads in the second warp and undefined
1463
+ //! for the third and fourth warps.
1464
+ //! @endrst
1465
+ //!
1466
+ //! @tparam ScanOp
1467
+ //! **[inferred]** Binary scan operator type having member
1468
+ //! `T operator()(const T &a, const T &b)`
1469
+ //!
1470
+ //! @param[in] input
1471
+ //! Calling thread's input item
1472
+ //!
1473
+ //! @param[out] exclusive_output
1474
+ //! Calling thread's output item. May be aliased with `input`
1475
+ //!
1476
+ //! @param[in] initial_value
1477
+ //! Initial value to seed the exclusive scan
1478
+ //!
1479
+ //! @param[in] scan_op
1480
+ //! Binary scan operator
1481
+ //!
1482
+ //! @param[in] valid_items
1483
+ //! Number of valid items in warp
1484
+ //!
1485
+ //! @param[out] warp_aggregate
1486
+ //! Warp-wide aggregate reduction of input items
1487
+ //!
1488
+ template <typename ScanOp>
1489
+ _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveScanPartial(
1490
+ T input, T& exclusive_output, T initial_value, ScanOp scan_op, int valid_items, T& warp_aggregate)
1491
+ {
1492
+ InternalWarpScan internal(temp_storage);
1493
+
1494
+ T inclusive_output;
1495
+ internal.InclusiveScanPartial(input, inclusive_output, scan_op, valid_items);
1496
+
1497
+ internal.UpdatePartial(
1498
+ input, inclusive_output, exclusive_output, warp_aggregate, scan_op, valid_items, initial_value);
1499
+ }
1500
+
1501
+ #endif // _CCCL_DOXYGEN_INVOKED // Do not document partial exclusive scans
1502
+
1503
+ //! @} end member group
1504
+ //! @name Combination (inclusive & exclusive) prefix scans
1505
+ //! @{
1506
+
1507
+ //! @rst
1508
+ //! Computes both inclusive and exclusive prefix scans using the specified binary scan functor
1509
+ //! across the calling warp. Because no initial value is supplied, the ``exclusive_output``
1510
+ //! computed for *lane*\ :sub:`0` is undefined.
1511
+ //!
1512
+ //! * @smemwarpreuse
1513
+ //!
1514
+ //! Snippet
1515
+ //! +++++++
1516
+ //!
1517
+ //! The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans
1518
+ //! within a block of 128 threads (one per each of the 32-thread warps).
1519
+ //!
1520
+ //! .. code-block:: c++
1521
+ //!
1522
+ //! #include <cub/cub.cuh>
1523
+ //!
1524
+ //! __global__ void ExampleKernel(...)
1525
+ //! {
1526
+ //! // Specialize WarpScan for type int
1527
+ //! using WarpScan = cub::WarpScan<int>;
1528
+ //!
1529
+ //! // Allocate WarpScan shared memory for 4 warps
1530
+ //! __shared__ typename WarpScan::TempStorage temp_storage[4];
1531
+ //!
1532
+ //! // Obtain one input item per thread
1533
+ //! int thread_data = ...
1534
+ //!
1535
+ //! // Compute exclusive warp-wide prefix max scans
1536
+ //! int inclusive_partial, exclusive_partial;
1537
+ //! WarpScan(temp_storage[warp_id]).Scan(thread_data,
1538
+ //! inclusive_partial,
1539
+ //! exclusive_partial,
1540
+ //! cuda::maximum<>{});
1541
+ //!
1542
+ //! Suppose the set of input ``thread_data`` across the block of threads is
1543
+ //! ``{0, -1, 2, -3, ..., 126, -127}``. The corresponding output ``inclusive_partial`` in the
1544
+ //! first warp would be ``0, 0, 2, 2, ..., 30, 30``, the output for the second warp would be
1545
+ //! ``32, 32, 34, 34, ..., 62, 62``, etc. The corresponding output ``exclusive_partial`` in the
1546
+ //! first warp would be ``?, 0, 0, 2, ..., 28, 30``, the output for the second warp would be
1547
+ //! ``?, 32, 32, 34, ..., 60, 62``, etc.
1548
+ //! (The output ``thread_data`` in warp *lane*\ :sub:`0` is undefined.)
1549
+ //! @endrst
1550
+ //!
1551
+ //! @tparam ScanOp
1552
+ //! **[inferred]** Binary scan operator type having member
1553
+ //! `T operator()(const T &a, const T &b)`
1554
+ //!
1555
+ //! @param[in] input
1556
+ //! Calling thread's input item
1557
+ //!
1558
+ //! @param[out] inclusive_output
1559
+ //! Calling thread's inclusive-scan output item
1560
+ //!
1561
+ //! @param[out] exclusive_output
1562
+ //! Calling thread's exclusive-scan output item
1563
+ //!
1564
+ //! @param[in] scan_op
1565
+ //! Binary scan operator
1566
+ template <typename ScanOp>
1567
+ _CCCL_DEVICE _CCCL_FORCEINLINE void Scan(T input, T& inclusive_output, T& exclusive_output, ScanOp scan_op)
1568
+ {
1569
+ InternalWarpScan internal(temp_storage);
1570
+
1571
+ internal.InclusiveScan(input, inclusive_output, scan_op);
1572
+
1573
+ internal.Update(input, inclusive_output, exclusive_output, scan_op, detail::bool_constant_v<IS_INTEGER>);
1574
+ }
1575
+
1576
+ //! @rst
1577
+ //! Computes both inclusive and exclusive prefix scans using the specified binary scan functor
1578
+ //! across the calling warp.
1579
+ //!
1580
+ //! * @smemwarpreuse
1581
+ //!
1582
+ //! Snippet
1583
+ //! +++++++
1584
+ //!
1585
+ //! The code snippet below illustrates four concurrent warp-wide prefix max scans within a
1586
+ //! block of 128 threads (one per each of the 32-thread warps).
1587
+ //!
1588
+ //! .. code-block:: c++
1589
+ //!
1590
+ //! #include <cub/cub.cuh>
1591
+ //!
1592
+ //! __global__ void ExampleKernel(...)
1593
+ //! {
1594
+ //! // Specialize WarpScan for type int
1595
+ //! using WarpScan = cub::WarpScan<int>;
1596
+ //!
1597
+ //! // Allocate WarpScan shared memory for 4 warps
1598
+ //! __shared__ typename WarpScan::TempStorage temp_storage[4];
1599
+ //!
1600
+ //! // Obtain one input item per thread
1601
+ //! int thread_data = ...
1602
+ //!
1603
+ //! // Compute inclusive warp-wide prefix max scans
1604
+ //! int warp_id = threadIdx.x / 32;
1605
+ //! int inclusive_partial, exclusive_partial;
1606
+ //! WarpScan(temp_storage[warp_id]).Scan(thread_data,
1607
+ //! inclusive_partial,
1608
+ //! exclusive_partial,
1609
+ //! INT_MIN,
1610
+ //! cuda::maximum<>{});
1611
+ //!
1612
+ //! Suppose the set of input ``thread_data`` across the block of threads is
1613
+ //! ``{0, -1, 2, -3, ..., 126, -127}``. The corresponding output ``inclusive_partial`` in the
1614
+ //! first warp would be ``0, 0, 2, 2, ..., 30, 30``, the output for the second warp would be
1615
+ //! ``32, 32, 34, 34, ..., 62, 62``, etc. The corresponding output ``exclusive_partial`` in the
1616
+ //! first warp would be ``INT_MIN, 0, 0, 2, ..., 28, 30``, the output for the second warp would
1617
+ //! be ``INT_MIN, 32, 32, 34, ..., 60, 62``, etc.
1618
+ //! @endrst
1619
+ //!
1620
+ //! @tparam ScanOp
1621
+ //! **[inferred]** Binary scan operator type having member
1622
+ //! `T operator()(const T &a, const T &b)`
1623
+ //!
1624
+ //! @param[in] input
1625
+ //! Calling thread's input item
1626
+ //!
1627
+ //! @param[out] inclusive_output
1628
+ //! Calling thread's inclusive-scan output item
1629
+ //!
1630
+ //! @param[out] exclusive_output
1631
+ //! Calling thread's exclusive-scan output item
1632
+ //!
1633
+ //! @param[in] initial_value
1634
+ //! Initial value to seed the exclusive scan
1635
+ //!
1636
+ //! @param[in] scan_op
1637
+ //! Binary scan operator
1638
+ template <typename ScanOp>
1639
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
1640
+ Scan(T input, T& inclusive_output, T& exclusive_output, T initial_value, ScanOp scan_op)
1641
+ {
1642
+ InternalWarpScan internal(temp_storage);
1643
+
1644
+ internal.InclusiveScan(input, inclusive_output, scan_op);
1645
+
1646
+ internal.Update(
1647
+ input, inclusive_output, exclusive_output, scan_op, initial_value, detail::bool_constant_v<IS_INTEGER>);
1648
+ }
1649
+
1650
+ #ifndef _CCCL_DOXYGEN_INVOKED // Do not document partial combined scans
1651
+ //! @rst
1652
+ //! Computes both inclusive and exclusive prefix scans using the specified binary scan functor
1653
+ //! across the calling warp. But only the first ``valid_items`` elements (corresponding to warp lanes) are
1654
+ //! used in the calculation. The leftover invalid elements are never passed to the binary scan functor.
1655
+ //! Because no initial value is supplied, the ``exclusive_output``
1656
+ //! computed for *lane*\ :sub:`0` is undefined.
1657
+ //!
1658
+ //! * @smemwarpreuse
1659
+ //!
1660
+ //! Snippet
1661
+ //! +++++++
1662
+ //!
1663
+ //! The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans
1664
+ //! within a block of 128 threads (one per each of the 32-thread warps).
1665
+ //!
1666
+ //! .. code-block:: c++
1667
+ //!
1668
+ //! #include <cub/cub.cuh>
1669
+ //!
1670
+ //! __global__ void ExampleKernel(...)
1671
+ //! {
1672
+ //! // Specialize WarpScan for type int
1673
+ //! using WarpScan = cub::WarpScan<int>;
1674
+ //!
1675
+ //! // Allocate WarpScan shared memory for 4 warps
1676
+ //! __shared__ typename WarpScan::TempStorage temp_storage[4];
1677
+ //!
1678
+ //! // Obtain one input item per thread
1679
+ //! int thread_data = ...
1680
+ //! int warp_id = threadIdx.x / 32;
1681
+ //! int block_valid_items = 35;
1682
+ //! int warp_valid_items = block_valid_items - warp_id * 32;
1683
+ //!
1684
+ //! // Compute exclusive warp-wide prefix max scans
1685
+ //! int inclusive_partial, exclusive_partial;
1686
+ //! WarpScan(temp_storage[warp_id]).ScanPartial(
1687
+ //! thread_data, inclusive_partial, exclusive_partial, cuda::maximum<>{}, warp_valid_items);
1688
+ //!
1689
+ //! Suppose the set of input ``thread_data`` across the block of threads is
1690
+ //! ``{0, -1, 2, -3, ..., 126, -127}``. The corresponding output ``inclusive_partial`` in the
1691
+ //! first warp would be ``0, 0, 2, 2, ..., 30, 30``, the output for the second warp would be
1692
+ //! ``32, 32, 34, -35, ..., 62, -63``. The corresponding output ``exclusive_partial`` in the
1693
+ //! first warp would be ``?, 0, 0, 2, ..., 28, 30``, the output for the second warp would be
1694
+ //! ``?, 32, 32, -35, ..., 62, -63``. The third and fourth warps ``inclusive_partial`` and
1695
+ //! ``exclusive_parttial`` would remain unmodified.
1696
+ //! (The output ``thread_data`` in warp *lane*\ :sub:`0` is undefined.)
1697
+ //! @endrst
1698
+ //!
1699
+ //! @tparam ScanOp
1700
+ //! **[inferred]** Binary scan operator type having member
1701
+ //! `T operator()(const T &a, const T &b)`
1702
+ //!
1703
+ //! @param[in] input
1704
+ //! Calling thread's input item
1705
+ //!
1706
+ //! @param[out] inclusive_output
1707
+ //! Calling thread's inclusive-scan output item
1708
+ //!
1709
+ //! @param[out] exclusive_output
1710
+ //! Calling thread's exclusive-scan output item
1711
+ //!
1712
+ //! @param[in] scan_op
1713
+ //! Binary scan operator
1714
+ //!
1715
+ //! @param[in] valid_items
1716
+ //! Number of valid items in warp
1717
+ template <typename ScanOp>
1718
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
1719
+ ScanPartial(T input, T& inclusive_output, T& exclusive_output, ScanOp scan_op, int valid_items)
1720
+ {
1721
+ InternalWarpScan internal(temp_storage);
1722
+
1723
+ internal.InclusiveScanPartial(input, inclusive_output, scan_op, valid_items);
1724
+
1725
+ internal.UpdatePartial(input, inclusive_output, exclusive_output, scan_op, valid_items);
1726
+ }
1727
+
1728
+ //! @rst
1729
+ //! Computes both inclusive and exclusive prefix scans using the specified binary scan functor
1730
+ //! across the calling warp. But only the first ``valid_items`` elements (corresponding to warp lanes) are
1731
+ //! used in the calculation. The leftover invalid elements are never passed to the binary scan functor.
1732
+
1733
+ //!
1734
+ //! * @smemwarpreuse
1735
+ //!
1736
+ //! Snippet
1737
+ //! +++++++
1738
+ //!
1739
+ //! The code snippet below illustrates four concurrent warp-wide prefix max scans within a
1740
+ //! block of 128 threads (one per each of the 32-thread warps).
1741
+ //!
1742
+ //! .. code-block:: c++
1743
+ //!
1744
+ //! #include <cub/cub.cuh>
1745
+ //!
1746
+ //! __global__ void ExampleKernel(...)
1747
+ //! {
1748
+ //! // Specialize WarpScan for type int
1749
+ //! using WarpScan = cub::WarpScan<int>;
1750
+ //!
1751
+ //! // Allocate WarpScan shared memory for 4 warps
1752
+ //! __shared__ typename WarpScan::TempStorage temp_storage[4];
1753
+ //!
1754
+ //! // Obtain one input item per thread
1755
+ //! int thread_data = ...
1756
+ //! int warp_id = threadIdx.x / 32;
1757
+ //! int block_valid_items = 35;
1758
+ //! int warp_valid_items = block_valid_items - warp_id * 32;
1759
+ //!
1760
+ //! // Compute inclusive warp-wide prefix max scans
1761
+ //! int inclusive_partial, exclusive_partial;
1762
+ //! WarpScan(temp_storage[warp_id]).ScanPartial(
1763
+ //! thread_data, inclusive_partial, exclusive_partial, INT_MIN, cuda::maximum<>{}, warp_valid_items);
1764
+ //!
1765
+ //! Suppose the set of input ``thread_data`` across the block of threads is
1766
+ //! ``{0, -1, 2, -3, ..., 126, -127}``. The corresponding output ``inclusive_partial`` in the
1767
+ //! first warp would be ``0, 0, 2, 2, ..., 30, 30``, the output for the second warp would be
1768
+ //! ``32, 32, 34, -35, ..., 62, -63``. The corresponding output ``exclusive_partial`` in the
1769
+ //! first warp would be ``INT_MIN, 0, 0, 2, ..., 28, 30``, the output for the second warp would
1770
+ //! be ``INT_MIN, 32, 32, -35, ..., 62, -63``. The third and fourth warps ``inclusive_partial`` and
1771
+ //! ``exclusive_parttial`` would remain unmodified.
1772
+
1773
+ //! @endrst
1774
+ //!
1775
+ //! @tparam ScanOp
1776
+ //! **[inferred]** Binary scan operator type having member
1777
+ //! `T operator()(const T &a, const T &b)`
1778
+ //!
1779
+ //! @param[in] input
1780
+ //! Calling thread's input item
1781
+ //!
1782
+ //! @param[out] inclusive_output
1783
+ //! Calling thread's inclusive-scan output item
1784
+ //!
1785
+ //! @param[out] exclusive_output
1786
+ //! Calling thread's exclusive-scan output item
1787
+ //!
1788
+ //! @param[in] initial_value
1789
+ //! Initial value to seed the exclusive scan
1790
+ //!
1791
+ //! @param[in] scan_op
1792
+ //! Binary scan operator
1793
+ //!
1794
+ //! @param[in] valid_items
1795
+ //! Number of valid items in warp
1796
+ template <typename ScanOp>
1797
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
1798
+ ScanPartial(T input, T& inclusive_output, T& exclusive_output, T initial_value, ScanOp scan_op, int valid_items)
1799
+ {
1800
+ InternalWarpScan internal(temp_storage);
1801
+
1802
+ internal.InclusiveScanPartial(input, inclusive_output, scan_op, valid_items);
1803
+
1804
+ internal.UpdatePartial(input, inclusive_output, exclusive_output, scan_op, valid_items, initial_value);
1805
+ }
1806
+
1807
+ #endif // _CCCL_DOXYGEN_INVOKED // Do not document partial combined scans
1808
+
1809
+ //! @} end member group
1810
+ //! @name Data exchange
1811
+ //! @{
1812
+
1813
+ //! @rst
1814
+ //! Broadcast the value ``input`` from *lane*\ :sub:`src_lane` to all lanes in the warp
1815
+ //!
1816
+ //! * @smemwarpreuse
1817
+ //!
1818
+ //! Snippet
1819
+ //! +++++++
1820
+ //!
1821
+ //! The code snippet below illustrates the warp-wide broadcasts of values from *lane*\ :sub:`0`
1822
+ //! in each of four warps to all other threads in those warps.
1823
+ //!
1824
+ //! .. code-block:: c++
1825
+ //!
1826
+ //! #include <cub/cub.cuh>
1827
+ //!
1828
+ //! __global__ void ExampleKernel(...)
1829
+ //! {
1830
+ //! // Specialize WarpScan for type int
1831
+ //! using WarpScan = cub::WarpScan<int>;
1832
+ //!
1833
+ //! // Allocate WarpScan shared memory for 4 warps
1834
+ //! __shared__ typename WarpScan::TempStorage temp_storage[4];
1835
+ //!
1836
+ //! // Obtain one input item per thread
1837
+ //! int thread_data = ...
1838
+ //!
1839
+ //! // Broadcast from lane0 in each warp to all other threads in the warp
1840
+ //! int warp_id = threadIdx.x / 32;
1841
+ //! thread_data = WarpScan(temp_storage[warp_id]).Broadcast(thread_data, 0);
1842
+ //!
1843
+ //! Suppose the set of input ``thread_data`` across the block of threads is
1844
+ //! ``{0, 1, 2, 3, ..., 127}``. The corresponding output ``thread_data`` will be
1845
+ //! ``{0, 0, ..., 0}`` in warp\ :sub:`0`,
1846
+ //! ``{32, 32, ..., 32}`` in warp\ :sub:`1`,
1847
+ //! ``{64, 64, ..., 64}`` in warp\ :sub:`2`, etc.
1848
+ //! @endrst
1849
+ //!
1850
+ //! @param[in] input
1851
+ //! The value to broadcast
1852
+ //!
1853
+ //! @param[in] src_lane
1854
+ //! Which warp lane is to do the broadcasting
1855
+ _CCCL_DEVICE _CCCL_FORCEINLINE T Broadcast(T input, unsigned int src_lane)
1856
+ {
1857
+ return InternalWarpScan(temp_storage).Broadcast(input, src_lane);
1858
+ }
1859
+
1860
+ //@} end member group
1861
+ };
1862
+
1863
+ CUB_NAMESPACE_END