cuda-cccl 0.4.3__cp312-cp312-manylinux_2_26_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2024):
  1. cuda/cccl/__init__.py +27 -0
  2. cuda/cccl/_cuda_version_utils.py +24 -0
  3. cuda/cccl/cooperative/__init__.py +9 -0
  4. cuda/cccl/cooperative/experimental/__init__.py +24 -0
  5. cuda/cccl/headers/__init__.py +7 -0
  6. cuda/cccl/headers/include/__init__.py +1 -0
  7. cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +233 -0
  8. cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +1158 -0
  9. cuda/cccl/headers/include/cub/agent/agent_for.cuh +55 -0
  10. cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +699 -0
  11. cuda/cccl/headers/include/cub/agent/agent_merge.cuh +365 -0
  12. cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +721 -0
  13. cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +756 -0
  14. cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +277 -0
  15. cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +715 -0
  16. cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +546 -0
  17. cuda/cccl/headers/include/cub/agent/agent_reduce.cuh +592 -0
  18. cuda/cccl/headers/include/cub/agent/agent_reduce_by_key.cuh +780 -0
  19. cuda/cccl/headers/include/cub/agent/agent_rle.cuh +1092 -0
  20. cuda/cccl/headers/include/cub/agent/agent_scan.cuh +564 -0
  21. cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +448 -0
  22. cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +263 -0
  23. cuda/cccl/headers/include/cub/agent/agent_segmented_scan.cuh +292 -0
  24. cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +1090 -0
  25. cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +346 -0
  26. cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +584 -0
  27. cuda/cccl/headers/include/cub/agent/agent_topk.cuh +762 -0
  28. cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +599 -0
  29. cuda/cccl/headers/include/cub/agent/single_pass_scan_operators.cuh +1384 -0
  30. cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +939 -0
  31. cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +1200 -0
  32. cuda/cccl/headers/include/cub/block/block_exchange.cuh +1279 -0
  33. cuda/cccl/headers/include/cub/block/block_histogram.cuh +396 -0
  34. cuda/cccl/headers/include/cub/block/block_load.cuh +1269 -0
  35. cuda/cccl/headers/include/cub/block/block_load_to_shared.cuh +437 -0
  36. cuda/cccl/headers/include/cub/block/block_merge_sort.cuh +771 -0
  37. cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +1215 -0
  38. cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +2129 -0
  39. cuda/cccl/headers/include/cub/block/block_raking_layout.cuh +124 -0
  40. cuda/cccl/headers/include/cub/block/block_reduce.cuh +661 -0
  41. cuda/cccl/headers/include/cub/block/block_run_length_decode.cuh +406 -0
  42. cuda/cccl/headers/include/cub/block/block_scan.cuh +2168 -0
  43. cuda/cccl/headers/include/cub/block/block_shuffle.cuh +319 -0
  44. cuda/cccl/headers/include/cub/block/block_store.cuh +1238 -0
  45. cuda/cccl/headers/include/cub/block/radix_rank_sort_operations.cuh +597 -0
  46. cuda/cccl/headers/include/cub/block/specializations/block_histogram_atomic.cuh +62 -0
  47. cuda/cccl/headers/include/cub/block/specializations/block_histogram_sort.cuh +209 -0
  48. cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking.cuh +230 -0
  49. cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking_commutative_only.cuh +207 -0
  50. cuda/cccl/headers/include/cub/block/specializations/block_reduce_warp_reductions.cuh +257 -0
  51. cuda/cccl/headers/include/cub/block/specializations/block_scan_raking.cuh +766 -0
  52. cuda/cccl/headers/include/cub/block/specializations/block_scan_warp_scans.cuh +514 -0
  53. cuda/cccl/headers/include/cub/config.cuh +29 -0
  54. cuda/cccl/headers/include/cub/cub.cuh +96 -0
  55. cuda/cccl/headers/include/cub/detail/array_utils.cuh +54 -0
  56. cuda/cccl/headers/include/cub/detail/choose_offset.cuh +135 -0
  57. cuda/cccl/headers/include/cub/detail/detect_cuda_runtime.cuh +50 -0
  58. cuda/cccl/headers/include/cub/detail/device_double_buffer.cuh +94 -0
  59. cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +60 -0
  60. cuda/cccl/headers/include/cub/detail/env_dispatch.cuh +87 -0
  61. cuda/cccl/headers/include/cub/detail/fast_modulo_division.cuh +227 -0
  62. cuda/cccl/headers/include/cub/detail/integer_utils.cuh +87 -0
  63. cuda/cccl/headers/include/cub/detail/launcher/cuda_driver.cuh +149 -0
  64. cuda/cccl/headers/include/cub/detail/launcher/cuda_runtime.cuh +103 -0
  65. cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +112 -0
  66. cuda/cccl/headers/include/cub/detail/ptx-json/array.cuh +41 -0
  67. cuda/cccl/headers/include/cub/detail/ptx-json/json.cuh +39 -0
  68. cuda/cccl/headers/include/cub/detail/ptx-json/object.cuh +71 -0
  69. cuda/cccl/headers/include/cub/detail/ptx-json/string.cuh +79 -0
  70. cuda/cccl/headers/include/cub/detail/ptx-json/value.cuh +95 -0
  71. cuda/cccl/headers/include/cub/detail/ptx-json-parser.cuh +39 -0
  72. cuda/cccl/headers/include/cub/detail/rfa.cuh +706 -0
  73. cuda/cccl/headers/include/cub/detail/strong_load.cuh +163 -0
  74. cuda/cccl/headers/include/cub/detail/strong_store.cuh +194 -0
  75. cuda/cccl/headers/include/cub/detail/temporary_storage.cuh +377 -0
  76. cuda/cccl/headers/include/cub/detail/type_traits.cuh +185 -0
  77. cuda/cccl/headers/include/cub/detail/uninitialized_copy.cuh +48 -0
  78. cuda/cccl/headers/include/cub/detail/unsafe_bitcast.cuh +33 -0
  79. cuda/cccl/headers/include/cub/device/device_adjacent_difference.cuh +572 -0
  80. cuda/cccl/headers/include/cub/device/device_copy.cuh +276 -0
  81. cuda/cccl/headers/include/cub/device/device_for.cuh +1061 -0
  82. cuda/cccl/headers/include/cub/device/device_histogram.cuh +1485 -0
  83. cuda/cccl/headers/include/cub/device/device_memcpy.cuh +171 -0
  84. cuda/cccl/headers/include/cub/device/device_merge.cuh +203 -0
  85. cuda/cccl/headers/include/cub/device/device_merge_sort.cuh +955 -0
  86. cuda/cccl/headers/include/cub/device/device_partition.cuh +644 -0
  87. cuda/cccl/headers/include/cub/device/device_radix_sort.cuh +3413 -0
  88. cuda/cccl/headers/include/cub/device/device_reduce.cuh +2303 -0
  89. cuda/cccl/headers/include/cub/device/device_run_length_encode.cuh +346 -0
  90. cuda/cccl/headers/include/cub/device/device_scan.cuh +2152 -0
  91. cuda/cccl/headers/include/cub/device/device_segmented_radix_sort.cuh +1472 -0
  92. cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +1635 -0
  93. cuda/cccl/headers/include/cub/device/device_segmented_scan.cuh +1398 -0
  94. cuda/cccl/headers/include/cub/device/device_segmented_sort.cuh +2787 -0
  95. cuda/cccl/headers/include/cub/device/device_select.cuh +1204 -0
  96. cuda/cccl/headers/include/cub/device/device_topk.cuh +521 -0
  97. cuda/cccl/headers/include/cub/device/device_transform.cuh +666 -0
  98. cuda/cccl/headers/include/cub/device/dispatch/dispatch_adjacent_difference.cuh +289 -0
  99. cuda/cccl/headers/include/cub/device/dispatch/dispatch_batch_memcpy.cuh +694 -0
  100. cuda/cccl/headers/include/cub/device/dispatch/dispatch_common.cuh +50 -0
  101. cuda/cccl/headers/include/cub/device/dispatch/dispatch_copy_mdspan.cuh +77 -0
  102. cuda/cccl/headers/include/cub/device/dispatch/dispatch_fixed_size_segmented_reduce.cuh +349 -0
  103. cuda/cccl/headers/include/cub/device/dispatch/dispatch_for.cuh +160 -0
  104. cuda/cccl/headers/include/cub/device/dispatch/dispatch_histogram.cuh +1849 -0
  105. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +317 -0
  106. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge_sort.cuh +429 -0
  107. cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +1066 -0
  108. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +830 -0
  109. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_by_key.cuh +629 -0
  110. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +479 -0
  111. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +256 -0
  112. cuda/cccl/headers/include/cub/device/dispatch/dispatch_rle.cuh +603 -0
  113. cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan.cuh +447 -0
  114. cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan_by_key.cuh +545 -0
  115. cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_radix_sort.cuh +638 -0
  116. cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_reduce.cuh +410 -0
  117. cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_scan.cuh +278 -0
  118. cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +899 -0
  119. cuda/cccl/headers/include/cub/device/dispatch/dispatch_select_if.cuh +831 -0
  120. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +321 -0
  121. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +454 -0
  122. cuda/cccl/headers/include/cub/device/dispatch/dispatch_three_way_partition.cuh +364 -0
  123. cuda/cccl/headers/include/cub/device/dispatch/dispatch_topk.cuh +626 -0
  124. cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +527 -0
  125. cuda/cccl/headers/include/cub/device/dispatch/dispatch_unique_by_key.cuh +472 -0
  126. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_for_each.cuh +259 -0
  127. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_histogram.cuh +669 -0
  128. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_merge_sort.cuh +332 -0
  129. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_radix_sort.cuh +553 -0
  130. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_reduce.cuh +584 -0
  131. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_scan.cuh +178 -0
  132. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_segmented_radix_sort.cuh +262 -0
  133. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_segmented_reduce.cuh +295 -0
  134. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_segmented_scan.cuh +77 -0
  135. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_segmented_sort.cuh +521 -0
  136. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_three_way_partition.cuh +200 -0
  137. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_transform.cuh +1049 -0
  138. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_unique_by_key.cuh +175 -0
  139. cuda/cccl/headers/include/cub/device/dispatch/tuning/common.cuh +97 -0
  140. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +43 -0
  141. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +94 -0
  142. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +34 -0
  143. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +268 -0
  144. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +108 -0
  145. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +100 -0
  146. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +1045 -0
  147. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce.cuh +681 -0
  148. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +918 -0
  149. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +647 -0
  150. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +571 -0
  151. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +986 -0
  152. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_scan.cuh +108 -0
  153. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +373 -0
  154. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_select_if.cuh +1563 -0
  155. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +415 -0
  156. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_topk.cuh +84 -0
  157. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +476 -0
  158. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +858 -0
  159. cuda/cccl/headers/include/cub/grid/grid_even_share.cuh +203 -0
  160. cuda/cccl/headers/include/cub/grid/grid_mapping.cuh +82 -0
  161. cuda/cccl/headers/include/cub/grid/grid_queue.cuh +175 -0
  162. cuda/cccl/headers/include/cub/iterator/arg_index_input_iterator.cuh +230 -0
  163. cuda/cccl/headers/include/cub/iterator/cache_modified_input_iterator.cuh +235 -0
  164. cuda/cccl/headers/include/cub/iterator/cache_modified_output_iterator.cuh +226 -0
  165. cuda/cccl/headers/include/cub/iterator/tex_obj_input_iterator.cuh +293 -0
  166. cuda/cccl/headers/include/cub/thread/thread_load.cuh +353 -0
  167. cuda/cccl/headers/include/cub/thread/thread_operators.cuh +664 -0
  168. cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +525 -0
  169. cuda/cccl/headers/include/cub/thread/thread_scan.cuh +472 -0
  170. cuda/cccl/headers/include/cub/thread/thread_search.cuh +214 -0
  171. cuda/cccl/headers/include/cub/thread/thread_simd.cuh +456 -0
  172. cuda/cccl/headers/include/cub/thread/thread_sort.cuh +78 -0
  173. cuda/cccl/headers/include/cub/thread/thread_store.cuh +341 -0
  174. cuda/cccl/headers/include/cub/util_allocator.cuh +897 -0
  175. cuda/cccl/headers/include/cub/util_arch.cuh +176 -0
  176. cuda/cccl/headers/include/cub/util_cpp_dialect.cuh +71 -0
  177. cuda/cccl/headers/include/cub/util_debug.cuh +183 -0
  178. cuda/cccl/headers/include/cub/util_device.cuh +838 -0
  179. cuda/cccl/headers/include/cub/util_macro.cuh +73 -0
  180. cuda/cccl/headers/include/cub/util_math.cuh +92 -0
  181. cuda/cccl/headers/include/cub/util_namespace.cuh +152 -0
  182. cuda/cccl/headers/include/cub/util_policy_wrapper_t.cuh +31 -0
  183. cuda/cccl/headers/include/cub/util_ptx.cuh +483 -0
  184. cuda/cccl/headers/include/cub/util_temporary_storage.cuh +93 -0
  185. cuda/cccl/headers/include/cub/util_type.cuh +1084 -0
  186. cuda/cccl/headers/include/cub/util_vsmem.cuh +227 -0
  187. cuda/cccl/headers/include/cub/version.cuh +65 -0
  188. cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_shfl.cuh +304 -0
  189. cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_smem.cuh +152 -0
  190. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +567 -0
  191. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +378 -0
  192. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +922 -0
  193. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_smem.cuh +691 -0
  194. cuda/cccl/headers/include/cub/warp/warp_exchange.cuh +381 -0
  195. cuda/cccl/headers/include/cub/warp/warp_load.cuh +591 -0
  196. cuda/cccl/headers/include/cub/warp/warp_merge_sort.cuh +145 -0
  197. cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +810 -0
  198. cuda/cccl/headers/include/cub/warp/warp_scan.cuh +1863 -0
  199. cuda/cccl/headers/include/cub/warp/warp_store.cuh +498 -0
  200. cuda/cccl/headers/include/cub/warp/warp_utils.cuh +59 -0
  201. cuda/cccl/headers/include/cuda/__algorithm/common.h +68 -0
  202. cuda/cccl/headers/include/cuda/__algorithm/copy.h +199 -0
  203. cuda/cccl/headers/include/cuda/__algorithm/fill.h +110 -0
  204. cuda/cccl/headers/include/cuda/__annotated_ptr/access_property.h +165 -0
  205. cuda/cccl/headers/include/cuda/__annotated_ptr/access_property_encoding.h +171 -0
  206. cuda/cccl/headers/include/cuda/__annotated_ptr/annotated_ptr.h +216 -0
  207. cuda/cccl/headers/include/cuda/__annotated_ptr/annotated_ptr_base.h +100 -0
  208. cuda/cccl/headers/include/cuda/__annotated_ptr/apply_access_property.h +83 -0
  209. cuda/cccl/headers/include/cuda/__annotated_ptr/associate_access_property.h +128 -0
  210. cuda/cccl/headers/include/cuda/__annotated_ptr/createpolicy.h +210 -0
  211. cuda/cccl/headers/include/cuda/__atomic/atomic.h +145 -0
  212. cuda/cccl/headers/include/cuda/__barrier/async_contract_fulfillment.h +39 -0
  213. cuda/cccl/headers/include/cuda/__barrier/barrier.h +65 -0
  214. cuda/cccl/headers/include/cuda/__barrier/barrier_arrive_tx.h +101 -0
  215. cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +528 -0
  216. cuda/cccl/headers/include/cuda/__barrier/barrier_expect_tx.h +73 -0
  217. cuda/cccl/headers/include/cuda/__barrier/barrier_thread_scope.h +60 -0
  218. cuda/cccl/headers/include/cuda/__bit/bit_reverse.h +198 -0
  219. cuda/cccl/headers/include/cuda/__bit/bitfield.h +121 -0
  220. cuda/cccl/headers/include/cuda/__bit/bitmask.h +89 -0
  221. cuda/cccl/headers/include/cuda/__cccl_config +38 -0
  222. cuda/cccl/headers/include/cuda/__cmath/ceil_div.h +123 -0
  223. cuda/cccl/headers/include/cuda/__cmath/fast_modulo_division.h +178 -0
  224. cuda/cccl/headers/include/cuda/__cmath/ilog.h +194 -0
  225. cuda/cccl/headers/include/cuda/__cmath/ipow.h +111 -0
  226. cuda/cccl/headers/include/cuda/__cmath/isqrt.h +80 -0
  227. cuda/cccl/headers/include/cuda/__cmath/mul_hi.h +145 -0
  228. cuda/cccl/headers/include/cuda/__cmath/neg.h +47 -0
  229. cuda/cccl/headers/include/cuda/__cmath/pow2.h +74 -0
  230. cuda/cccl/headers/include/cuda/__cmath/round_down.h +102 -0
  231. cuda/cccl/headers/include/cuda/__cmath/round_up.h +104 -0
  232. cuda/cccl/headers/include/cuda/__cmath/sincos.h +134 -0
  233. cuda/cccl/headers/include/cuda/__cmath/uabs.h +57 -0
  234. cuda/cccl/headers/include/cuda/__complex/complex.h +238 -0
  235. cuda/cccl/headers/include/cuda/__complex/get_real_imag.h +89 -0
  236. cuda/cccl/headers/include/cuda/__complex/traits.h +64 -0
  237. cuda/cccl/headers/include/cuda/__complex_ +28 -0
  238. cuda/cccl/headers/include/cuda/__container/buffer.h +891 -0
  239. cuda/cccl/headers/include/cuda/__container/heterogeneous_iterator.h +436 -0
  240. cuda/cccl/headers/include/cuda/__container/uninitialized_async_buffer.h +416 -0
  241. cuda/cccl/headers/include/cuda/__device/all_devices.h +140 -0
  242. cuda/cccl/headers/include/cuda/__device/arch_id.h +194 -0
  243. cuda/cccl/headers/include/cuda/__device/arch_traits.h +553 -0
  244. cuda/cccl/headers/include/cuda/__device/attributes.h +772 -0
  245. cuda/cccl/headers/include/cuda/__device/compute_capability.h +172 -0
  246. cuda/cccl/headers/include/cuda/__device/device_ref.h +168 -0
  247. cuda/cccl/headers/include/cuda/__device/physical_device.h +178 -0
  248. cuda/cccl/headers/include/cuda/__driver/driver_api.h +1041 -0
  249. cuda/cccl/headers/include/cuda/__event/event.h +171 -0
  250. cuda/cccl/headers/include/cuda/__event/event_ref.h +157 -0
  251. cuda/cccl/headers/include/cuda/__event/timed_event.h +120 -0
  252. cuda/cccl/headers/include/cuda/__execution/determinism.h +89 -0
  253. cuda/cccl/headers/include/cuda/__execution/output_ordering.h +87 -0
  254. cuda/cccl/headers/include/cuda/__execution/policy.h +53 -0
  255. cuda/cccl/headers/include/cuda/__execution/require.h +75 -0
  256. cuda/cccl/headers/include/cuda/__execution/tune.h +70 -0
  257. cuda/cccl/headers/include/cuda/__functional/address_stability.h +131 -0
  258. cuda/cccl/headers/include/cuda/__functional/for_each_canceled.h +321 -0
  259. cuda/cccl/headers/include/cuda/__functional/maximum.h +77 -0
  260. cuda/cccl/headers/include/cuda/__functional/minimum.h +77 -0
  261. cuda/cccl/headers/include/cuda/__functional/minimum_maximum_common.h +52 -0
  262. cuda/cccl/headers/include/cuda/__functional/proclaim_return_type.h +106 -0
  263. cuda/cccl/headers/include/cuda/__fwd/barrier.h +38 -0
  264. cuda/cccl/headers/include/cuda/__fwd/complex.h +48 -0
  265. cuda/cccl/headers/include/cuda/__fwd/devices.h +44 -0
  266. cuda/cccl/headers/include/cuda/__fwd/execution_policy.h +47 -0
  267. cuda/cccl/headers/include/cuda/__fwd/get_stream.h +38 -0
  268. cuda/cccl/headers/include/cuda/__fwd/pipeline.h +37 -0
  269. cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +58 -0
  270. cuda/cccl/headers/include/cuda/__hierarchy/dimensions.h +162 -0
  271. cuda/cccl/headers/include/cuda/__hierarchy/hierarchy_dimensions.h +986 -0
  272. cuda/cccl/headers/include/cuda/__hierarchy/hierarchy_levels.h +494 -0
  273. cuda/cccl/headers/include/cuda/__hierarchy/level_dimensions.h +225 -0
  274. cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +315 -0
  275. cuda/cccl/headers/include/cuda/__iterator/counting_iterator.h +490 -0
  276. cuda/cccl/headers/include/cuda/__iterator/discard_iterator.h +324 -0
  277. cuda/cccl/headers/include/cuda/__iterator/permutation_iterator.h +456 -0
  278. cuda/cccl/headers/include/cuda/__iterator/shuffle_iterator.h +334 -0
  279. cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +418 -0
  280. cuda/cccl/headers/include/cuda/__iterator/tabulate_output_iterator.h +367 -0
  281. cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +528 -0
  282. cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +527 -0
  283. cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +486 -0
  284. cuda/cccl/headers/include/cuda/__iterator/zip_common.h +147 -0
  285. cuda/cccl/headers/include/cuda/__iterator/zip_function.h +114 -0
  286. cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +555 -0
  287. cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +589 -0
  288. cuda/cccl/headers/include/cuda/__latch/latch.h +44 -0
  289. cuda/cccl/headers/include/cuda/__launch/configuration.h +754 -0
  290. cuda/cccl/headers/include/cuda/__launch/host_launch.h +115 -0
  291. cuda/cccl/headers/include/cuda/__launch/launch.h +334 -0
  292. cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +531 -0
  293. cuda/cccl/headers/include/cuda/__mdspan/host_device_mdspan.h +239 -0
  294. cuda/cccl/headers/include/cuda/__mdspan/restrict_accessor.h +152 -0
  295. cuda/cccl/headers/include/cuda/__mdspan/restrict_mdspan.h +118 -0
  296. cuda/cccl/headers/include/cuda/__mdspan/shared_memory_accessor.h +208 -0
  297. cuda/cccl/headers/include/cuda/__mdspan/shared_memory_mdspan.h +129 -0
  298. cuda/cccl/headers/include/cuda/__memcpy_async/check_preconditions.h +79 -0
  299. cuda/cccl/headers/include/cuda/__memcpy_async/completion_mechanism.h +47 -0
  300. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_bulk_shared_global.h +77 -0
  301. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_fallback.h +72 -0
  302. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_shared_global.h +148 -0
  303. cuda/cccl/headers/include/cuda/__memcpy_async/dispatch_memcpy_async.h +165 -0
  304. cuda/cccl/headers/include/cuda/__memcpy_async/elect_one.h +52 -0
  305. cuda/cccl/headers/include/cuda/__memcpy_async/is_local_smem_barrier.h +53 -0
  306. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async.h +179 -0
  307. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_barrier.h +99 -0
  308. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_tx.h +102 -0
  309. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_completion.h +164 -0
  310. cuda/cccl/headers/include/cuda/__memcpy_async/try_get_barrier_handle.h +57 -0
  311. cuda/cccl/headers/include/cuda/__memory/address_space.h +256 -0
  312. cuda/cccl/headers/include/cuda/__memory/align_down.h +77 -0
  313. cuda/cccl/headers/include/cuda/__memory/align_up.h +77 -0
  314. cuda/cccl/headers/include/cuda/__memory/aligned_size.h +61 -0
  315. cuda/cccl/headers/include/cuda/__memory/check_address.h +111 -0
  316. cuda/cccl/headers/include/cuda/__memory/discard_memory.h +64 -0
  317. cuda/cccl/headers/include/cuda/__memory/get_device_address.h +58 -0
  318. cuda/cccl/headers/include/cuda/__memory/is_aligned.h +60 -0
  319. cuda/cccl/headers/include/cuda/__memory/is_pointer_accessible.h +278 -0
  320. cuda/cccl/headers/include/cuda/__memory/ptr_in_range.h +92 -0
  321. cuda/cccl/headers/include/cuda/__memory/ptr_rebind.h +75 -0
  322. cuda/cccl/headers/include/cuda/__memory/ranges_overlap.h +125 -0
  323. cuda/cccl/headers/include/cuda/__memory_pool/device_memory_pool.h +166 -0
  324. cuda/cccl/headers/include/cuda/__memory_pool/managed_memory_pool.h +161 -0
  325. cuda/cccl/headers/include/cuda/__memory_pool/memory_pool_base.h +644 -0
  326. cuda/cccl/headers/include/cuda/__memory_pool/pinned_memory_pool.h +218 -0
  327. cuda/cccl/headers/include/cuda/__memory_resource/any_resource.h +882 -0
  328. cuda/cccl/headers/include/cuda/__memory_resource/get_memory_resource.h +82 -0
  329. cuda/cccl/headers/include/cuda/__memory_resource/get_property.h +153 -0
  330. cuda/cccl/headers/include/cuda/__memory_resource/legacy_managed_memory_resource.h +148 -0
  331. cuda/cccl/headers/include/cuda/__memory_resource/legacy_pinned_memory_resource.h +141 -0
  332. cuda/cccl/headers/include/cuda/__memory_resource/properties.h +130 -0
  333. cuda/cccl/headers/include/cuda/__memory_resource/resource.h +135 -0
  334. cuda/cccl/headers/include/cuda/__memory_resource/shared_resource.h +261 -0
  335. cuda/cccl/headers/include/cuda/__memory_resource/synchronous_resource_adapter.h +136 -0
  336. cuda/cccl/headers/include/cuda/__numeric/add_overflow.h +331 -0
  337. cuda/cccl/headers/include/cuda/__numeric/div_overflow.h +150 -0
  338. cuda/cccl/headers/include/cuda/__numeric/narrow.h +108 -0
  339. cuda/cccl/headers/include/cuda/__numeric/overflow_cast.h +59 -0
  340. cuda/cccl/headers/include/cuda/__numeric/overflow_result.h +43 -0
  341. cuda/cccl/headers/include/cuda/__numeric/sub_overflow.h +359 -0
  342. cuda/cccl/headers/include/cuda/__nvtx/nvtx.h +120 -0
  343. cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +2977 -0
  344. cuda/cccl/headers/include/cuda/__ptx/instructions/barrier_cluster.h +43 -0
  345. cuda/cccl/headers/include/cuda/__ptx/instructions/bfind.h +41 -0
  346. cuda/cccl/headers/include/cuda/__ptx/instructions/bmsk.h +41 -0
  347. cuda/cccl/headers/include/cuda/__ptx/instructions/clusterlaunchcontrol.h +41 -0
  348. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk.h +44 -0
  349. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_commit_group.h +43 -0
  350. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h +45 -0
  351. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_wait_group.h +43 -0
  352. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_mbarrier_arrive.h +42 -0
  353. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_reduce_async_bulk.h +60 -0
  354. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_reduce_async_bulk_tensor.h +43 -0
  355. cuda/cccl/headers/include/cuda/__ptx/instructions/elect_sync.h +41 -0
  356. cuda/cccl/headers/include/cuda/__ptx/instructions/exit.h +41 -0
  357. cuda/cccl/headers/include/cuda/__ptx/instructions/fence.h +49 -0
  358. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/barrier_cluster.h +115 -0
  359. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/bfind.h +190 -0
  360. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/bmsk.h +54 -0
  361. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/clusterlaunchcontrol.h +240 -0
  362. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk.h +245 -0
  363. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.h +25 -0
  364. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.h +52 -0
  365. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.h +977 -0
  366. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_gather_scatter.h +302 -0
  367. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.h +631 -0
  368. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.h +46 -0
  369. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive.h +26 -0
  370. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive_noinc.h +26 -0
  371. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.h +1470 -0
  372. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.h +132 -0
  373. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.h +132 -0
  374. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.h +601 -0
  375. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/elect_sync.h +36 -0
  376. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/exit.h +25 -0
  377. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence.h +208 -0
  378. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.h +31 -0
  379. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_alias.h +25 -0
  380. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_async.h +58 -0
  381. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_async_generic_sync_restrict.h +64 -0
  382. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.h +102 -0
  383. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_sync_restrict.h +64 -0
  384. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/get_sreg.h +949 -0
  385. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/getctarank.h +32 -0
  386. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/ld.h +5542 -0
  387. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive.h +399 -0
  388. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.h +184 -0
  389. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.h +34 -0
  390. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_expect_tx.h +102 -0
  391. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_init.h +27 -0
  392. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_inval.h +26 -0
  393. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.h +143 -0
  394. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.h +144 -0
  395. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.h +286 -0
  396. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.h +290 -0
  397. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_ld_reduce.h +2202 -0
  398. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_red.h +1362 -0
  399. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_st.h +236 -0
  400. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/prmt.h +230 -0
  401. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/red_async.h +460 -0
  402. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/setmaxnreg.h +58 -0
  403. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/shl.h +96 -0
  404. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/shr.h +168 -0
  405. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st.h +1490 -0
  406. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st_async.h +123 -0
  407. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st_bulk.h +31 -0
  408. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_alloc.h +120 -0
  409. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_commit.h +91 -0
  410. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_cp.h +693 -0
  411. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_fence.h +50 -0
  412. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_ld.h +11437 -0
  413. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_mma.h +6513 -0
  414. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_mma_ws.h +6726 -0
  415. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_shift.h +40 -0
  416. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_st.h +4767 -0
  417. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_wait.h +48 -0
  418. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.h +71 -0
  419. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tensormap_replace.h +886 -0
  420. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/trap.h +25 -0
  421. cuda/cccl/headers/include/cuda/__ptx/instructions/get_sreg.h +43 -0
  422. cuda/cccl/headers/include/cuda/__ptx/instructions/getctarank.h +43 -0
  423. cuda/cccl/headers/include/cuda/__ptx/instructions/ld.h +41 -0
  424. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_arrive.h +45 -0
  425. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_expect_tx.h +41 -0
  426. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_init.h +43 -0
  427. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_inval.h +41 -0
  428. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_wait.h +46 -0
  429. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_ld_reduce.h +41 -0
  430. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_red.h +41 -0
  431. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_st.h +41 -0
  432. cuda/cccl/headers/include/cuda/__ptx/instructions/prmt.h +41 -0
  433. cuda/cccl/headers/include/cuda/__ptx/instructions/red_async.h +43 -0
  434. cuda/cccl/headers/include/cuda/__ptx/instructions/setmaxnreg.h +41 -0
  435. cuda/cccl/headers/include/cuda/__ptx/instructions/shfl_sync.h +244 -0
  436. cuda/cccl/headers/include/cuda/__ptx/instructions/shl.h +41 -0
  437. cuda/cccl/headers/include/cuda/__ptx/instructions/shr.h +41 -0
  438. cuda/cccl/headers/include/cuda/__ptx/instructions/st.h +41 -0
  439. cuda/cccl/headers/include/cuda/__ptx/instructions/st_async.h +43 -0
  440. cuda/cccl/headers/include/cuda/__ptx/instructions/st_bulk.h +41 -0
  441. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_alloc.h +41 -0
  442. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_commit.h +41 -0
  443. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_cp.h +41 -0
  444. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_fence.h +41 -0
  445. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_ld.h +41 -0
  446. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_mma.h +41 -0
  447. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_mma_ws.h +41 -0
  448. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_shift.h +41 -0
  449. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_st.h +41 -0
  450. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_wait.h +41 -0
  451. cuda/cccl/headers/include/cuda/__ptx/instructions/tensormap_cp_fenceproxy.h +43 -0
  452. cuda/cccl/headers/include/cuda/__ptx/instructions/tensormap_replace.h +43 -0
  453. cuda/cccl/headers/include/cuda/__ptx/instructions/trap.h +41 -0
  454. cuda/cccl/headers/include/cuda/__ptx/pragmas/enable_smem_spilling.h +47 -0
  455. cuda/cccl/headers/include/cuda/__ptx/ptx_dot_variants.h +230 -0
  456. cuda/cccl/headers/include/cuda/__ptx/ptx_helper_functions.h +178 -0
  457. cuda/cccl/headers/include/cuda/__random/feistel_bijection.h +105 -0
  458. cuda/cccl/headers/include/cuda/__random/pcg_engine.h +398 -0
  459. cuda/cccl/headers/include/cuda/__random/random_bijection.h +88 -0
  460. cuda/cccl/headers/include/cuda/__runtime/api_wrapper.h +62 -0
  461. cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +99 -0
  462. cuda/cccl/headers/include/cuda/__runtime/types.h +41 -0
  463. cuda/cccl/headers/include/cuda/__semaphore/counting_semaphore.h +53 -0
  464. cuda/cccl/headers/include/cuda/__stream/get_stream.h +109 -0
  465. cuda/cccl/headers/include/cuda/__stream/internal_streams.h +49 -0
  466. cuda/cccl/headers/include/cuda/__stream/invalid_stream.h +47 -0
  467. cuda/cccl/headers/include/cuda/__stream/launch_transform.h +193 -0
  468. cuda/cccl/headers/include/cuda/__stream/stream.h +145 -0
  469. cuda/cccl/headers/include/cuda/__stream/stream_ref.h +356 -0
  470. cuda/cccl/headers/include/cuda/__tma/make_tma_descriptor.h +657 -0
  471. cuda/cccl/headers/include/cuda/__type_traits/is_floating_point.h +47 -0
  472. cuda/cccl/headers/include/cuda/__type_traits/is_instantiable_with.h +47 -0
  473. cuda/cccl/headers/include/cuda/__type_traits/is_specialization_of.h +37 -0
  474. cuda/cccl/headers/include/cuda/__type_traits/vector_type.h +355 -0
  475. cuda/cccl/headers/include/cuda/__utility/__basic_any/access.h +88 -0
  476. cuda/cccl/headers/include/cuda/__utility/__basic_any/any_cast.h +83 -0
  477. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_base.h +148 -0
  478. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_from.h +96 -0
  479. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_fwd.h +128 -0
  480. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_ptr.h +304 -0
  481. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_ref.h +337 -0
  482. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_value.h +611 -0
  483. cuda/cccl/headers/include/cuda/__utility/__basic_any/conversions.h +170 -0
  484. cuda/cccl/headers/include/cuda/__utility/__basic_any/dynamic_any_cast.h +147 -0
  485. cuda/cccl/headers/include/cuda/__utility/__basic_any/interfaces.h +359 -0
  486. cuda/cccl/headers/include/cuda/__utility/__basic_any/iset.h +142 -0
  487. cuda/cccl/headers/include/cuda/__utility/__basic_any/overrides.h +64 -0
  488. cuda/cccl/headers/include/cuda/__utility/__basic_any/rtti.h +256 -0
  489. cuda/cccl/headers/include/cuda/__utility/__basic_any/semiregular.h +323 -0
  490. cuda/cccl/headers/include/cuda/__utility/__basic_any/storage.h +79 -0
  491. cuda/cccl/headers/include/cuda/__utility/__basic_any/tagged_ptr.h +58 -0
  492. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtcall.h +162 -0
  493. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_functions.h +183 -0
  494. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_ptrs.h +80 -0
  495. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_tables.h +155 -0
  496. cuda/cccl/headers/include/cuda/__utility/basic_any.h +507 -0
  497. cuda/cccl/headers/include/cuda/__utility/immovable.h +50 -0
  498. cuda/cccl/headers/include/cuda/__utility/in_range.h +65 -0
  499. cuda/cccl/headers/include/cuda/__utility/inherit.h +36 -0
  500. cuda/cccl/headers/include/cuda/__utility/no_init.h +29 -0
  501. cuda/cccl/headers/include/cuda/__utility/static_for.h +79 -0
  502. cuda/cccl/headers/include/cuda/__warp/lane_mask.h +326 -0
  503. cuda/cccl/headers/include/cuda/__warp/warp_match_all.h +65 -0
  504. cuda/cccl/headers/include/cuda/__warp/warp_shuffle.h +251 -0
  505. cuda/cccl/headers/include/cuda/access_property +26 -0
  506. cuda/cccl/headers/include/cuda/algorithm +28 -0
  507. cuda/cccl/headers/include/cuda/annotated_ptr +29 -0
  508. cuda/cccl/headers/include/cuda/atomic +27 -0
  509. cuda/cccl/headers/include/cuda/barrier +293 -0
  510. cuda/cccl/headers/include/cuda/bit +29 -0
  511. cuda/cccl/headers/include/cuda/buffer +27 -0
  512. cuda/cccl/headers/include/cuda/cmath +38 -0
  513. cuda/cccl/headers/include/cuda/devices +33 -0
  514. cuda/cccl/headers/include/cuda/discard_memory +32 -0
  515. cuda/cccl/headers/include/cuda/functional +32 -0
  516. cuda/cccl/headers/include/cuda/hierarchy +28 -0
  517. cuda/cccl/headers/include/cuda/iterator +39 -0
  518. cuda/cccl/headers/include/cuda/latch +27 -0
  519. cuda/cccl/headers/include/cuda/launch +28 -0
  520. cuda/cccl/headers/include/cuda/mdspan +29 -0
  521. cuda/cccl/headers/include/cuda/memory +37 -0
  522. cuda/cccl/headers/include/cuda/memory_pool +27 -0
  523. cuda/cccl/headers/include/cuda/memory_resource +41 -0
  524. cuda/cccl/headers/include/cuda/numeric +31 -0
  525. cuda/cccl/headers/include/cuda/pipeline +580 -0
  526. cuda/cccl/headers/include/cuda/ptx +131 -0
  527. cuda/cccl/headers/include/cuda/semaphore +31 -0
  528. cuda/cccl/headers/include/cuda/std/__algorithm/adjacent_find.h +59 -0
  529. cuda/cccl/headers/include/cuda/std/__algorithm/all_of.h +45 -0
  530. cuda/cccl/headers/include/cuda/std/__algorithm/any_of.h +45 -0
  531. cuda/cccl/headers/include/cuda/std/__algorithm/binary_search.h +53 -0
  532. cuda/cccl/headers/include/cuda/std/__algorithm/clamp.h +48 -0
  533. cuda/cccl/headers/include/cuda/std/__algorithm/comp.h +58 -0
  534. cuda/cccl/headers/include/cuda/std/__algorithm/comp_ref_type.h +85 -0
  535. cuda/cccl/headers/include/cuda/std/__algorithm/copy.h +143 -0
  536. cuda/cccl/headers/include/cuda/std/__algorithm/copy_backward.h +80 -0
  537. cuda/cccl/headers/include/cuda/std/__algorithm/copy_if.h +47 -0
  538. cuda/cccl/headers/include/cuda/std/__algorithm/copy_n.h +73 -0
  539. cuda/cccl/headers/include/cuda/std/__algorithm/count.h +49 -0
  540. cuda/cccl/headers/include/cuda/std/__algorithm/count_if.h +49 -0
  541. cuda/cccl/headers/include/cuda/std/__algorithm/equal.h +128 -0
  542. cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +101 -0
  543. cuda/cccl/headers/include/cuda/std/__algorithm/fill.h +58 -0
  544. cuda/cccl/headers/include/cuda/std/__algorithm/fill_n.h +51 -0
  545. cuda/cccl/headers/include/cuda/std/__algorithm/find.h +62 -0
  546. cuda/cccl/headers/include/cuda/std/__algorithm/find_end.h +225 -0
  547. cuda/cccl/headers/include/cuda/std/__algorithm/find_first_of.h +73 -0
  548. cuda/cccl/headers/include/cuda/std/__algorithm/find_if.h +46 -0
  549. cuda/cccl/headers/include/cuda/std/__algorithm/find_if_not.h +46 -0
  550. cuda/cccl/headers/include/cuda/std/__algorithm/for_each.h +42 -0
  551. cuda/cccl/headers/include/cuda/std/__algorithm/for_each_n.h +48 -0
  552. cuda/cccl/headers/include/cuda/std/__algorithm/generate.h +41 -0
  553. cuda/cccl/headers/include/cuda/std/__algorithm/generate_n.h +46 -0
  554. cuda/cccl/headers/include/cuda/std/__algorithm/half_positive.h +49 -0
  555. cuda/cccl/headers/include/cuda/std/__algorithm/in_fun_result.h +55 -0
  556. cuda/cccl/headers/include/cuda/std/__algorithm/includes.h +90 -0
  557. cuda/cccl/headers/include/cuda/std/__algorithm/inplace_merge.h +293 -0
  558. cuda/cccl/headers/include/cuda/std/__algorithm/is_heap.h +50 -0
  559. cuda/cccl/headers/include/cuda/std/__algorithm/is_heap_until.h +83 -0
  560. cuda/cccl/headers/include/cuda/std/__algorithm/is_partitioned.h +57 -0
  561. cuda/cccl/headers/include/cuda/std/__algorithm/is_permutation.h +252 -0
  562. cuda/cccl/headers/include/cuda/std/__algorithm/is_sorted.h +49 -0
  563. cuda/cccl/headers/include/cuda/std/__algorithm/is_sorted_until.h +68 -0
  564. cuda/cccl/headers/include/cuda/std/__algorithm/iter_swap.h +82 -0
  565. cuda/cccl/headers/include/cuda/std/__algorithm/iterator_operations.h +185 -0
  566. cuda/cccl/headers/include/cuda/std/__algorithm/lexicographical_compare.h +68 -0
  567. cuda/cccl/headers/include/cuda/std/__algorithm/lower_bound.h +82 -0
  568. cuda/cccl/headers/include/cuda/std/__algorithm/make_heap.h +70 -0
  569. cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +88 -0
  570. cuda/cccl/headers/include/cuda/std/__algorithm/max.h +62 -0
  571. cuda/cccl/headers/include/cuda/std/__algorithm/max_element.h +67 -0
  572. cuda/cccl/headers/include/cuda/std/__algorithm/merge.h +89 -0
  573. cuda/cccl/headers/include/cuda/std/__algorithm/min.h +62 -0
  574. cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +87 -0
  575. cuda/cccl/headers/include/cuda/std/__algorithm/minmax.h +66 -0
  576. cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +139 -0
  577. cuda/cccl/headers/include/cuda/std/__algorithm/mismatch.h +83 -0
  578. cuda/cccl/headers/include/cuda/std/__algorithm/move.h +91 -0
  579. cuda/cccl/headers/include/cuda/std/__algorithm/move_backward.h +84 -0
  580. cuda/cccl/headers/include/cuda/std/__algorithm/next_permutation.h +88 -0
  581. cuda/cccl/headers/include/cuda/std/__algorithm/none_of.h +45 -0
  582. cuda/cccl/headers/include/cuda/std/__algorithm/nth_element.h +309 -0
  583. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort.h +102 -0
  584. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort_copy.h +122 -0
  585. cuda/cccl/headers/include/cuda/std/__algorithm/partition.h +120 -0
  586. cuda/cccl/headers/include/cuda/std/__algorithm/partition_copy.h +59 -0
  587. cuda/cccl/headers/include/cuda/std/__algorithm/partition_point.h +61 -0
  588. cuda/cccl/headers/include/cuda/std/__algorithm/pop_heap.h +93 -0
  589. cuda/cccl/headers/include/cuda/std/__algorithm/prev_permutation.h +88 -0
  590. cuda/cccl/headers/include/cuda/std/__algorithm/push_heap.h +100 -0
  591. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_find_if.h +78 -0
  592. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_find_if_not.h +85 -0
  593. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_for_each.h +84 -0
  594. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_for_each_n.h +68 -0
  595. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_iterator_concept.h +65 -0
  596. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_min.h +97 -0
  597. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_min_element.h +68 -0
  598. cuda/cccl/headers/include/cuda/std/__algorithm/remove.h +55 -0
  599. cuda/cccl/headers/include/cuda/std/__algorithm/remove_copy.h +47 -0
  600. cuda/cccl/headers/include/cuda/std/__algorithm/remove_copy_if.h +47 -0
  601. cuda/cccl/headers/include/cuda/std/__algorithm/remove_if.h +56 -0
  602. cuda/cccl/headers/include/cuda/std/__algorithm/replace.h +45 -0
  603. cuda/cccl/headers/include/cuda/std/__algorithm/replace_copy.h +54 -0
  604. cuda/cccl/headers/include/cuda/std/__algorithm/replace_copy_if.h +50 -0
  605. cuda/cccl/headers/include/cuda/std/__algorithm/replace_if.h +45 -0
  606. cuda/cccl/headers/include/cuda/std/__algorithm/reverse.h +81 -0
  607. cuda/cccl/headers/include/cuda/std/__algorithm/reverse_copy.h +43 -0
  608. cuda/cccl/headers/include/cuda/std/__algorithm/rotate.h +261 -0
  609. cuda/cccl/headers/include/cuda/std/__algorithm/rotate_copy.h +40 -0
  610. cuda/cccl/headers/include/cuda/std/__algorithm/sample.h +116 -0
  611. cuda/cccl/headers/include/cuda/std/__algorithm/search.h +185 -0
  612. cuda/cccl/headers/include/cuda/std/__algorithm/search_n.h +163 -0
  613. cuda/cccl/headers/include/cuda/std/__algorithm/set_difference.h +95 -0
  614. cuda/cccl/headers/include/cuda/std/__algorithm/set_intersection.h +122 -0
  615. cuda/cccl/headers/include/cuda/std/__algorithm/set_symmetric_difference.h +134 -0
  616. cuda/cccl/headers/include/cuda/std/__algorithm/set_union.h +128 -0
  617. cuda/cccl/headers/include/cuda/std/__algorithm/shift_left.h +84 -0
  618. cuda/cccl/headers/include/cuda/std/__algorithm/shift_right.h +144 -0
  619. cuda/cccl/headers/include/cuda/std/__algorithm/shuffle.h +71 -0
  620. cuda/cccl/headers/include/cuda/std/__algorithm/sift_down.h +139 -0
  621. cuda/cccl/headers/include/cuda/std/__algorithm/sort.h +1097 -0
  622. cuda/cccl/headers/include/cuda/std/__algorithm/sort_heap.h +70 -0
  623. cuda/cccl/headers/include/cuda/std/__algorithm/stable_partition.h +359 -0
  624. cuda/cccl/headers/include/cuda/std/__algorithm/stable_sort.h +321 -0
  625. cuda/cccl/headers/include/cuda/std/__algorithm/swap_ranges.h +78 -0
  626. cuda/cccl/headers/include/cuda/std/__algorithm/transform.h +59 -0
  627. cuda/cccl/headers/include/cuda/std/__algorithm/unique.h +76 -0
  628. cuda/cccl/headers/include/cuda/std/__algorithm/unique_copy.h +153 -0
  629. cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_iter.h +95 -0
  630. cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_range.h +126 -0
  631. cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +83 -0
  632. cuda/cccl/headers/include/cuda/std/__atomic/api/common.h +192 -0
  633. cuda/cccl/headers/include/cuda/std/__atomic/api/owned.h +136 -0
  634. cuda/cccl/headers/include/cuda/std/__atomic/api/reference.h +118 -0
  635. cuda/cccl/headers/include/cuda/std/__atomic/functions/common.h +58 -0
  636. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_local.h +208 -0
  637. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_derived.h +458 -0
  638. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated.h +4436 -0
  639. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h +184 -0
  640. cuda/cccl/headers/include/cuda/std/__atomic/functions/host.h +242 -0
  641. cuda/cccl/headers/include/cuda/std/__atomic/functions.h +33 -0
  642. cuda/cccl/headers/include/cuda/std/__atomic/order.h +158 -0
  643. cuda/cccl/headers/include/cuda/std/__atomic/platform/msvc_to_builtins.h +654 -0
  644. cuda/cccl/headers/include/cuda/std/__atomic/platform.h +93 -0
  645. cuda/cccl/headers/include/cuda/std/__atomic/scopes.h +105 -0
  646. cuda/cccl/headers/include/cuda/std/__atomic/types/base.h +242 -0
  647. cuda/cccl/headers/include/cuda/std/__atomic/types/common.h +103 -0
  648. cuda/cccl/headers/include/cuda/std/__atomic/types/locked.h +225 -0
  649. cuda/cccl/headers/include/cuda/std/__atomic/types/reference.h +72 -0
  650. cuda/cccl/headers/include/cuda/std/__atomic/types/small.h +238 -0
  651. cuda/cccl/headers/include/cuda/std/__atomic/types.h +51 -0
  652. cuda/cccl/headers/include/cuda/std/__atomic/wait/notify_wait.h +95 -0
  653. cuda/cccl/headers/include/cuda/std/__atomic/wait/polling.h +65 -0
  654. cuda/cccl/headers/include/cuda/std/__barrier/barrier.h +227 -0
  655. cuda/cccl/headers/include/cuda/std/__barrier/empty_completion.h +37 -0
  656. cuda/cccl/headers/include/cuda/std/__barrier/poll_tester.h +82 -0
  657. cuda/cccl/headers/include/cuda/std/__bit/bit_cast.h +81 -0
  658. cuda/cccl/headers/include/cuda/std/__bit/blsr.h +51 -0
  659. cuda/cccl/headers/include/cuda/std/__bit/byteswap.h +209 -0
  660. cuda/cccl/headers/include/cuda/std/__bit/countl.h +191 -0
  661. cuda/cccl/headers/include/cuda/std/__bit/countr.h +202 -0
  662. cuda/cccl/headers/include/cuda/std/__bit/endian.h +39 -0
  663. cuda/cccl/headers/include/cuda/std/__bit/has_single_bit.h +43 -0
  664. cuda/cccl/headers/include/cuda/std/__bit/integral.h +125 -0
  665. cuda/cccl/headers/include/cuda/std/__bit/popcount.h +172 -0
  666. cuda/cccl/headers/include/cuda/std/__bit/reference.h +1272 -0
  667. cuda/cccl/headers/include/cuda/std/__bit/rotate.h +185 -0
  668. cuda/cccl/headers/include/cuda/std/__cccl/algorithm_wrapper.h +36 -0
  669. cuda/cccl/headers/include/cuda/std/__cccl/architecture.h +78 -0
  670. cuda/cccl/headers/include/cuda/std/__cccl/assert.h +161 -0
  671. cuda/cccl/headers/include/cuda/std/__cccl/attributes.h +206 -0
  672. cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +494 -0
  673. cuda/cccl/headers/include/cuda/std/__cccl/compiler.h +213 -0
  674. cuda/cccl/headers/include/cuda/std/__cccl/cuda_capabilities.h +51 -0
  675. cuda/cccl/headers/include/cuda/std/__cccl/cuda_toolkit.h +56 -0
  676. cuda/cccl/headers/include/cuda/std/__cccl/deprecated.h +87 -0
  677. cuda/cccl/headers/include/cuda/std/__cccl/diagnostic.h +131 -0
  678. cuda/cccl/headers/include/cuda/std/__cccl/dialect.h +197 -0
  679. cuda/cccl/headers/include/cuda/std/__cccl/epilogue.h +355 -0
  680. cuda/cccl/headers/include/cuda/std/__cccl/exceptions.h +36 -0
  681. cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +74 -0
  682. cuda/cccl/headers/include/cuda/std/__cccl/extended_data_types.h +139 -0
  683. cuda/cccl/headers/include/cuda/std/__cccl/host_std_lib.h +59 -0
  684. cuda/cccl/headers/include/cuda/std/__cccl/is_non_narrowing_convertible.h +71 -0
  685. cuda/cccl/headers/include/cuda/std/__cccl/memory_wrapper.h +36 -0
  686. cuda/cccl/headers/include/cuda/std/__cccl/numeric_wrapper.h +36 -0
  687. cuda/cccl/headers/include/cuda/std/__cccl/os.h +54 -0
  688. cuda/cccl/headers/include/cuda/std/__cccl/preprocessor.h +1288 -0
  689. cuda/cccl/headers/include/cuda/std/__cccl/prologue.h +312 -0
  690. cuda/cccl/headers/include/cuda/std/__cccl/ptx_isa.h +363 -0
  691. cuda/cccl/headers/include/cuda/std/__cccl/rtti.h +72 -0
  692. cuda/cccl/headers/include/cuda/std/__cccl/sequence_access.h +87 -0
  693. cuda/cccl/headers/include/cuda/std/__cccl/system_header.h +38 -0
  694. cuda/cccl/headers/include/cuda/std/__cccl/unreachable.h +31 -0
  695. cuda/cccl/headers/include/cuda/std/__cccl/version.h +26 -0
  696. cuda/cccl/headers/include/cuda/std/__cccl/visibility.h +171 -0
  697. cuda/cccl/headers/include/cuda/std/__charconv/chars_format.h +81 -0
  698. cuda/cccl/headers/include/cuda/std/__charconv/from_chars.h +171 -0
  699. cuda/cccl/headers/include/cuda/std/__charconv/from_chars_result.h +56 -0
  700. cuda/cccl/headers/include/cuda/std/__charconv/to_chars.h +192 -0
  701. cuda/cccl/headers/include/cuda/std/__charconv/to_chars_result.h +56 -0
  702. cuda/cccl/headers/include/cuda/std/__chrono/calendar.h +52 -0
  703. cuda/cccl/headers/include/cuda/std/__chrono/day.h +160 -0
  704. cuda/cccl/headers/include/cuda/std/__chrono/duration.h +499 -0
  705. cuda/cccl/headers/include/cuda/std/__chrono/file_clock.h +53 -0
  706. cuda/cccl/headers/include/cuda/std/__chrono/high_resolution_clock.h +44 -0
  707. cuda/cccl/headers/include/cuda/std/__chrono/month.h +185 -0
  708. cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +58 -0
  709. cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +78 -0
  710. cuda/cccl/headers/include/cuda/std/__chrono/time_point.h +255 -0
  711. cuda/cccl/headers/include/cuda/std/__chrono/year.h +184 -0
  712. cuda/cccl/headers/include/cuda/std/__cmath/abs.h +127 -0
  713. cuda/cccl/headers/include/cuda/std/__cmath/copysign.h +88 -0
  714. cuda/cccl/headers/include/cuda/std/__cmath/error_functions.h +204 -0
  715. cuda/cccl/headers/include/cuda/std/__cmath/exponential_functions.h +783 -0
  716. cuda/cccl/headers/include/cuda/std/__cmath/fdim.h +122 -0
  717. cuda/cccl/headers/include/cuda/std/__cmath/fma.h +129 -0
  718. cuda/cccl/headers/include/cuda/std/__cmath/fpclassify.h +230 -0
  719. cuda/cccl/headers/include/cuda/std/__cmath/gamma.h +204 -0
  720. cuda/cccl/headers/include/cuda/std/__cmath/hyperbolic_functions.h +285 -0
  721. cuda/cccl/headers/include/cuda/std/__cmath/hypot.h +220 -0
  722. cuda/cccl/headers/include/cuda/std/__cmath/inverse_hyperbolic_functions.h +285 -0
  723. cuda/cccl/headers/include/cuda/std/__cmath/inverse_trigonometric_functions.h +370 -0
  724. cuda/cccl/headers/include/cuda/std/__cmath/isfinite.h +166 -0
  725. cuda/cccl/headers/include/cuda/std/__cmath/isinf.h +203 -0
  726. cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +184 -0
  727. cuda/cccl/headers/include/cuda/std/__cmath/isnormal.h +138 -0
  728. cuda/cccl/headers/include/cuda/std/__cmath/lerp.h +101 -0
  729. cuda/cccl/headers/include/cuda/std/__cmath/logarithms.h +533 -0
  730. cuda/cccl/headers/include/cuda/std/__cmath/min_max.h +287 -0
  731. cuda/cccl/headers/include/cuda/std/__cmath/modulo.h +208 -0
  732. cuda/cccl/headers/include/cuda/std/__cmath/nan.h +54 -0
  733. cuda/cccl/headers/include/cuda/std/__cmath/remainder.h +210 -0
  734. cuda/cccl/headers/include/cuda/std/__cmath/roots.h +198 -0
  735. cuda/cccl/headers/include/cuda/std/__cmath/rounding_functions.h +983 -0
  736. cuda/cccl/headers/include/cuda/std/__cmath/signbit.h +56 -0
  737. cuda/cccl/headers/include/cuda/std/__cmath/traits.h +242 -0
  738. cuda/cccl/headers/include/cuda/std/__cmath/trigonometric_functions.h +327 -0
  739. cuda/cccl/headers/include/cuda/std/__complex/arg.h +84 -0
  740. cuda/cccl/headers/include/cuda/std/__complex/complex.h +669 -0
  741. cuda/cccl/headers/include/cuda/std/__complex/exponential_functions.h +403 -0
  742. cuda/cccl/headers/include/cuda/std/__complex/hyperbolic_functions.h +119 -0
  743. cuda/cccl/headers/include/cuda/std/__complex/inverse_hyperbolic_functions.h +522 -0
  744. cuda/cccl/headers/include/cuda/std/__complex/inverse_trigonometric_functions.h +131 -0
  745. cuda/cccl/headers/include/cuda/std/__complex/literals.h +86 -0
  746. cuda/cccl/headers/include/cuda/std/__complex/logarithms.h +302 -0
  747. cuda/cccl/headers/include/cuda/std/__complex/math.h +161 -0
  748. cuda/cccl/headers/include/cuda/std/__complex/nvbf16.h +323 -0
  749. cuda/cccl/headers/include/cuda/std/__complex/nvfp16.h +322 -0
  750. cuda/cccl/headers/include/cuda/std/__complex/roots.h +214 -0
  751. cuda/cccl/headers/include/cuda/std/__complex/trigonometric_functions.h +61 -0
  752. cuda/cccl/headers/include/cuda/std/__complex/tuple.h +107 -0
  753. cuda/cccl/headers/include/cuda/std/__complex/vector_support.h +130 -0
  754. cuda/cccl/headers/include/cuda/std/__concepts/arithmetic.h +56 -0
  755. cuda/cccl/headers/include/cuda/std/__concepts/assignable.h +64 -0
  756. cuda/cccl/headers/include/cuda/std/__concepts/boolean_testable.h +63 -0
  757. cuda/cccl/headers/include/cuda/std/__concepts/class_or_enum.h +45 -0
  758. cuda/cccl/headers/include/cuda/std/__concepts/common_reference_with.h +69 -0
  759. cuda/cccl/headers/include/cuda/std/__concepts/common_with.h +82 -0
  760. cuda/cccl/headers/include/cuda/std/__concepts/concept_macros.h +385 -0
  761. cuda/cccl/headers/include/cuda/std/__concepts/constructible.h +174 -0
  762. cuda/cccl/headers/include/cuda/std/__concepts/convertible_to.h +70 -0
  763. cuda/cccl/headers/include/cuda/std/__concepts/copyable.h +60 -0
  764. cuda/cccl/headers/include/cuda/std/__concepts/derived_from.h +56 -0
  765. cuda/cccl/headers/include/cuda/std/__concepts/destructible.h +76 -0
  766. cuda/cccl/headers/include/cuda/std/__concepts/different_from.h +38 -0
  767. cuda/cccl/headers/include/cuda/std/__concepts/equality_comparable.h +98 -0
  768. cuda/cccl/headers/include/cuda/std/__concepts/invocable.h +80 -0
  769. cuda/cccl/headers/include/cuda/std/__concepts/movable.h +58 -0
  770. cuda/cccl/headers/include/cuda/std/__concepts/predicate.h +54 -0
  771. cuda/cccl/headers/include/cuda/std/__concepts/regular.h +54 -0
  772. cuda/cccl/headers/include/cuda/std/__concepts/relation.h +77 -0
  773. cuda/cccl/headers/include/cuda/std/__concepts/same_as.h +39 -0
  774. cuda/cccl/headers/include/cuda/std/__concepts/semiregular.h +54 -0
  775. cuda/cccl/headers/include/cuda/std/__concepts/swappable.h +206 -0
  776. cuda/cccl/headers/include/cuda/std/__concepts/totally_ordered.h +101 -0
  777. cuda/cccl/headers/include/cuda/std/__cstddef/byte.h +113 -0
  778. cuda/cccl/headers/include/cuda/std/__cstddef/types.h +52 -0
  779. cuda/cccl/headers/include/cuda/std/__cstdlib/abs.h +57 -0
  780. cuda/cccl/headers/include/cuda/std/__cstdlib/aligned_alloc.h +66 -0
  781. cuda/cccl/headers/include/cuda/std/__cstdlib/div.h +96 -0
  782. cuda/cccl/headers/include/cuda/std/__cstdlib/malloc.h +70 -0
  783. cuda/cccl/headers/include/cuda/std/__cstring/memcpy.h +61 -0
  784. cuda/cccl/headers/include/cuda/std/__cstring/memset.h +46 -0
  785. cuda/cccl/headers/include/cuda/std/__exception/cuda_error.h +110 -0
  786. cuda/cccl/headers/include/cuda/std/__exception/exception_macros.h +108 -0
  787. cuda/cccl/headers/include/cuda/std/__exception/format_error.h +62 -0
  788. cuda/cccl/headers/include/cuda/std/__exception/msg_storage.h +41 -0
  789. cuda/cccl/headers/include/cuda/std/__exception/terminate.h +74 -0
  790. cuda/cccl/headers/include/cuda/std/__exception/throw_error.h +120 -0
  791. cuda/cccl/headers/include/cuda/std/__execution/env.h +455 -0
  792. cuda/cccl/headers/include/cuda/std/__execution/policy.h +90 -0
  793. cuda/cccl/headers/include/cuda/std/__expected/bad_expected_access.h +127 -0
  794. cuda/cccl/headers/include/cuda/std/__expected/expected.h +1941 -0
  795. cuda/cccl/headers/include/cuda/std/__expected/expected_base.h +1051 -0
  796. cuda/cccl/headers/include/cuda/std/__expected/unexpect.h +37 -0
  797. cuda/cccl/headers/include/cuda/std/__expected/unexpected.h +164 -0
  798. cuda/cccl/headers/include/cuda/std/__floating_point/arithmetic.h +56 -0
  799. cuda/cccl/headers/include/cuda/std/__floating_point/cast.h +812 -0
  800. cuda/cccl/headers/include/cuda/std/__floating_point/cccl_fp.h +125 -0
  801. cuda/cccl/headers/include/cuda/std/__floating_point/common_type.h +48 -0
  802. cuda/cccl/headers/include/cuda/std/__floating_point/constants.h +375 -0
  803. cuda/cccl/headers/include/cuda/std/__floating_point/conversion_rank_order.h +124 -0
  804. cuda/cccl/headers/include/cuda/std/__floating_point/cuda_fp_types.h +116 -0
  805. cuda/cccl/headers/include/cuda/std/__floating_point/decompose.h +126 -0
  806. cuda/cccl/headers/include/cuda/std/__floating_point/format.h +162 -0
  807. cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +40 -0
  808. cuda/cccl/headers/include/cuda/std/__floating_point/mask.h +78 -0
  809. cuda/cccl/headers/include/cuda/std/__floating_point/native_type.h +81 -0
  810. cuda/cccl/headers/include/cuda/std/__floating_point/overflow_handler.h +139 -0
  811. cuda/cccl/headers/include/cuda/std/__floating_point/properties.h +229 -0
  812. cuda/cccl/headers/include/cuda/std/__floating_point/storage.h +248 -0
  813. cuda/cccl/headers/include/cuda/std/__floating_point/traits.h +172 -0
  814. cuda/cccl/headers/include/cuda/std/__format/buffer.h +48 -0
  815. cuda/cccl/headers/include/cuda/std/__format/concepts.h +69 -0
  816. cuda/cccl/headers/include/cuda/std/__format/format_arg.h +282 -0
  817. cuda/cccl/headers/include/cuda/std/__format/format_arg_store.h +279 -0
  818. cuda/cccl/headers/include/cuda/std/__format/format_args.h +122 -0
  819. cuda/cccl/headers/include/cuda/std/__format/format_context.h +93 -0
  820. cuda/cccl/headers/include/cuda/std/__format/format_error.h +76 -0
  821. cuda/cccl/headers/include/cuda/std/__format/format_integral.h +237 -0
  822. cuda/cccl/headers/include/cuda/std/__format/format_parse_context.h +124 -0
  823. cuda/cccl/headers/include/cuda/std/__format/format_spec_parser.h +1265 -0
  824. cuda/cccl/headers/include/cuda/std/__format/formatter.h +59 -0
  825. cuda/cccl/headers/include/cuda/std/__format/formatters/bool.h +101 -0
  826. cuda/cccl/headers/include/cuda/std/__format/formatters/char.h +124 -0
  827. cuda/cccl/headers/include/cuda/std/__format/formatters/fp.h +101 -0
  828. cuda/cccl/headers/include/cuda/std/__format/formatters/int.h +174 -0
  829. cuda/cccl/headers/include/cuda/std/__format/formatters/ptr.h +104 -0
  830. cuda/cccl/headers/include/cuda/std/__format/formatters/str.h +178 -0
  831. cuda/cccl/headers/include/cuda/std/__format/output_utils.h +272 -0
  832. cuda/cccl/headers/include/cuda/std/__format/parse_arg_id.h +138 -0
  833. cuda/cccl/headers/include/cuda/std/__format_ +45 -0
  834. cuda/cccl/headers/include/cuda/std/__functional/binary_function.h +63 -0
  835. cuda/cccl/headers/include/cuda/std/__functional/binary_negate.h +65 -0
  836. cuda/cccl/headers/include/cuda/std/__functional/bind.h +334 -0
  837. cuda/cccl/headers/include/cuda/std/__functional/bind_back.h +81 -0
  838. cuda/cccl/headers/include/cuda/std/__functional/bind_front.h +74 -0
  839. cuda/cccl/headers/include/cuda/std/__functional/binder1st.h +74 -0
  840. cuda/cccl/headers/include/cuda/std/__functional/binder2nd.h +74 -0
  841. cuda/cccl/headers/include/cuda/std/__functional/compose.h +69 -0
  842. cuda/cccl/headers/include/cuda/std/__functional/default_searcher.h +76 -0
  843. cuda/cccl/headers/include/cuda/std/__functional/function.h +1271 -0
  844. cuda/cccl/headers/include/cuda/std/__functional/hash.h +649 -0
  845. cuda/cccl/headers/include/cuda/std/__functional/identity.h +57 -0
  846. cuda/cccl/headers/include/cuda/std/__functional/invoke.h +298 -0
  847. cuda/cccl/headers/include/cuda/std/__functional/is_transparent.h +41 -0
  848. cuda/cccl/headers/include/cuda/std/__functional/mem_fn.h +66 -0
  849. cuda/cccl/headers/include/cuda/std/__functional/mem_fun_ref.h +211 -0
  850. cuda/cccl/headers/include/cuda/std/__functional/not_fn.h +120 -0
  851. cuda/cccl/headers/include/cuda/std/__functional/operations.h +535 -0
  852. cuda/cccl/headers/include/cuda/std/__functional/perfect_forward.h +128 -0
  853. cuda/cccl/headers/include/cuda/std/__functional/pointer_to_binary_function.h +64 -0
  854. cuda/cccl/headers/include/cuda/std/__functional/pointer_to_unary_function.h +63 -0
  855. cuda/cccl/headers/include/cuda/std/__functional/ranges_operations.h +113 -0
  856. cuda/cccl/headers/include/cuda/std/__functional/reference_wrapper.h +114 -0
  857. cuda/cccl/headers/include/cuda/std/__functional/unary_function.h +62 -0
  858. cuda/cccl/headers/include/cuda/std/__functional/unary_negate.h +65 -0
  859. cuda/cccl/headers/include/cuda/std/__functional/unwrap_ref.h +56 -0
  860. cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +262 -0
  861. cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +53 -0
  862. cuda/cccl/headers/include/cuda/std/__fwd/array.h +42 -0
  863. cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +74 -0
  864. cuda/cccl/headers/include/cuda/std/__fwd/complex.h +75 -0
  865. cuda/cccl/headers/include/cuda/std/__fwd/execution_policy.h +73 -0
  866. cuda/cccl/headers/include/cuda/std/__fwd/expected.h +46 -0
  867. cuda/cccl/headers/include/cuda/std/__fwd/format.h +84 -0
  868. cuda/cccl/headers/include/cuda/std/__fwd/fp.h +37 -0
  869. cuda/cccl/headers/include/cuda/std/__fwd/get.h +122 -0
  870. cuda/cccl/headers/include/cuda/std/__fwd/hash.h +34 -0
  871. cuda/cccl/headers/include/cuda/std/__fwd/ios.h +123 -0
  872. cuda/cccl/headers/include/cuda/std/__fwd/iterator.h +43 -0
  873. cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +122 -0
  874. cuda/cccl/headers/include/cuda/std/__fwd/memory_resource.h +37 -0
  875. cuda/cccl/headers/include/cuda/std/__fwd/optional.h +39 -0
  876. cuda/cccl/headers/include/cuda/std/__fwd/pair.h +47 -0
  877. cuda/cccl/headers/include/cuda/std/__fwd/reference_wrapper.h +39 -0
  878. cuda/cccl/headers/include/cuda/std/__fwd/span.h +45 -0
  879. cuda/cccl/headers/include/cuda/std/__fwd/string.h +112 -0
  880. cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +91 -0
  881. cuda/cccl/headers/include/cuda/std/__fwd/subrange.h +55 -0
  882. cuda/cccl/headers/include/cuda/std/__fwd/tuple.h +37 -0
  883. cuda/cccl/headers/include/cuda/std/__fwd/unexpected.h +40 -0
  884. cuda/cccl/headers/include/cuda/std/__fwd/variant.h +51 -0
  885. cuda/cccl/headers/include/cuda/std/__internal/atomic.h +55 -0
  886. cuda/cccl/headers/include/cuda/std/__internal/cpp_dialect.h +44 -0
  887. cuda/cccl/headers/include/cuda/std/__internal/features.h +104 -0
  888. cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +132 -0
  889. cuda/cccl/headers/include/cuda/std/__internal/pstl_config.h +32 -0
  890. cuda/cccl/headers/include/cuda/std/__internal/thread_api.h +58 -0
  891. cuda/cccl/headers/include/cuda/std/__internal/version.h +52 -0
  892. cuda/cccl/headers/include/cuda/std/__iterator/access.h +128 -0
  893. cuda/cccl/headers/include/cuda/std/__iterator/advance.h +227 -0
  894. cuda/cccl/headers/include/cuda/std/__iterator/back_insert_iterator.h +164 -0
  895. cuda/cccl/headers/include/cuda/std/__iterator/bounded_iter.h +253 -0
  896. cuda/cccl/headers/include/cuda/std/__iterator/concepts.h +645 -0
  897. cuda/cccl/headers/include/cuda/std/__iterator/counted_iterator.h +465 -0
  898. cuda/cccl/headers/include/cuda/std/__iterator/data.h +61 -0
  899. cuda/cccl/headers/include/cuda/std/__iterator/default_sentinel.h +36 -0
  900. cuda/cccl/headers/include/cuda/std/__iterator/distance.h +124 -0
  901. cuda/cccl/headers/include/cuda/std/__iterator/empty.h +53 -0
  902. cuda/cccl/headers/include/cuda/std/__iterator/erase_if_container.h +53 -0
  903. cuda/cccl/headers/include/cuda/std/__iterator/front_insert_iterator.h +100 -0
  904. cuda/cccl/headers/include/cuda/std/__iterator/incrementable_traits.h +143 -0
  905. cuda/cccl/headers/include/cuda/std/__iterator/indirectly_comparable.h +55 -0
  906. cuda/cccl/headers/include/cuda/std/__iterator/insert_iterator.h +107 -0
  907. cuda/cccl/headers/include/cuda/std/__iterator/istream_iterator.h +146 -0
  908. cuda/cccl/headers/include/cuda/std/__iterator/istreambuf_iterator.h +161 -0
  909. cuda/cccl/headers/include/cuda/std/__iterator/iter_move.h +161 -0
  910. cuda/cccl/headers/include/cuda/std/__iterator/iter_swap.h +181 -0
  911. cuda/cccl/headers/include/cuda/std/__iterator/iterator.h +44 -0
  912. cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +847 -0
  913. cuda/cccl/headers/include/cuda/std/__iterator/mergeable.h +72 -0
  914. cuda/cccl/headers/include/cuda/std/__iterator/move_iterator.h +432 -0
  915. cuda/cccl/headers/include/cuda/std/__iterator/move_sentinel.h +73 -0
  916. cuda/cccl/headers/include/cuda/std/__iterator/next.h +101 -0
  917. cuda/cccl/headers/include/cuda/std/__iterator/ostream_iterator.h +95 -0
  918. cuda/cccl/headers/include/cuda/std/__iterator/ostreambuf_iterator.h +100 -0
  919. cuda/cccl/headers/include/cuda/std/__iterator/permutable.h +54 -0
  920. cuda/cccl/headers/include/cuda/std/__iterator/prev.h +90 -0
  921. cuda/cccl/headers/include/cuda/std/__iterator/projected.h +61 -0
  922. cuda/cccl/headers/include/cuda/std/__iterator/readable_traits.h +156 -0
  923. cuda/cccl/headers/include/cuda/std/__iterator/reverse_access.h +142 -0
  924. cuda/cccl/headers/include/cuda/std/__iterator/reverse_iterator.h +366 -0
  925. cuda/cccl/headers/include/cuda/std/__iterator/size.h +69 -0
  926. cuda/cccl/headers/include/cuda/std/__iterator/sortable.h +55 -0
  927. cuda/cccl/headers/include/cuda/std/__iterator/unreachable_sentinel.h +84 -0
  928. cuda/cccl/headers/include/cuda/std/__iterator/wrap_iter.h +245 -0
  929. cuda/cccl/headers/include/cuda/std/__latch/latch.h +88 -0
  930. cuda/cccl/headers/include/cuda/std/__limits/numeric_limits.h +603 -0
  931. cuda/cccl/headers/include/cuda/std/__limits/numeric_limits_ext.h +753 -0
  932. cuda/cccl/headers/include/cuda/std/__linalg/conj_if_needed.h +76 -0
  933. cuda/cccl/headers/include/cuda/std/__linalg/conjugate_transposed.h +52 -0
  934. cuda/cccl/headers/include/cuda/std/__linalg/conjugated.h +136 -0
  935. cuda/cccl/headers/include/cuda/std/__linalg/scaled.h +128 -0
  936. cuda/cccl/headers/include/cuda/std/__linalg/transposed.h +315 -0
  937. cuda/cccl/headers/include/cuda/std/__mdspan/aligned_accessor.h +97 -0
  938. cuda/cccl/headers/include/cuda/std/__mdspan/concepts.h +137 -0
  939. cuda/cccl/headers/include/cuda/std/__mdspan/default_accessor.h +73 -0
  940. cuda/cccl/headers/include/cuda/std/__mdspan/empty_base.h +348 -0
  941. cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +749 -0
  942. cuda/cccl/headers/include/cuda/std/__mdspan/layout_left.h +314 -0
  943. cuda/cccl/headers/include/cuda/std/__mdspan/layout_right.h +307 -0
  944. cuda/cccl/headers/include/cuda/std/__mdspan/layout_stride.h +598 -0
  945. cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +515 -0
  946. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_extents.h +190 -0
  947. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_helper.h +187 -0
  948. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_mapping.h +339 -0
  949. cuda/cccl/headers/include/cuda/std/__memory/addressof.h +90 -0
  950. cuda/cccl/headers/include/cuda/std/__memory/align.h +67 -0
  951. cuda/cccl/headers/include/cuda/std/__memory/allocate_at_least.h +82 -0
  952. cuda/cccl/headers/include/cuda/std/__memory/allocation_guard.h +100 -0
  953. cuda/cccl/headers/include/cuda/std/__memory/allocator.h +327 -0
  954. cuda/cccl/headers/include/cuda/std/__memory/allocator_arg_t.h +85 -0
  955. cuda/cccl/headers/include/cuda/std/__memory/allocator_destructor.h +59 -0
  956. cuda/cccl/headers/include/cuda/std/__memory/allocator_traits.h +517 -0
  957. cuda/cccl/headers/include/cuda/std/__memory/assume_aligned.h +59 -0
  958. cuda/cccl/headers/include/cuda/std/__memory/builtin_new_allocator.h +87 -0
  959. cuda/cccl/headers/include/cuda/std/__memory/compressed_pair.h +225 -0
  960. cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +241 -0
  961. cuda/cccl/headers/include/cuda/std/__memory/destruct_n.h +91 -0
  962. cuda/cccl/headers/include/cuda/std/__memory/is_sufficiently_aligned.h +51 -0
  963. cuda/cccl/headers/include/cuda/std/__memory/pointer_traits.h +246 -0
  964. cuda/cccl/headers/include/cuda/std/__memory/runtime_assume_aligned.h +62 -0
  965. cuda/cccl/headers/include/cuda/std/__memory/temporary_buffer.h +99 -0
  966. cuda/cccl/headers/include/cuda/std/__memory/uninitialized_algorithms.h +678 -0
  967. cuda/cccl/headers/include/cuda/std/__memory/unique_ptr.h +765 -0
  968. cuda/cccl/headers/include/cuda/std/__memory/uses_allocator.h +59 -0
  969. cuda/cccl/headers/include/cuda/std/__memory/voidify.h +41 -0
  970. cuda/cccl/headers/include/cuda/std/__memory_ +34 -0
  971. cuda/cccl/headers/include/cuda/std/__new/allocate.h +131 -0
  972. cuda/cccl/headers/include/cuda/std/__new/bad_alloc.h +57 -0
  973. cuda/cccl/headers/include/cuda/std/__new/device_new.h +30 -0
  974. cuda/cccl/headers/include/cuda/std/__new/launder.h +53 -0
  975. cuda/cccl/headers/include/cuda/std/__new_ +30 -0
  976. cuda/cccl/headers/include/cuda/std/__numeric/accumulate.h +56 -0
  977. cuda/cccl/headers/include/cuda/std/__numeric/adjacent_difference.h +72 -0
  978. cuda/cccl/headers/include/cuda/std/__numeric/exclusive_scan.h +66 -0
  979. cuda/cccl/headers/include/cuda/std/__numeric/gcd_lcm.h +78 -0
  980. cuda/cccl/headers/include/cuda/std/__numeric/inclusive_scan.h +73 -0
  981. cuda/cccl/headers/include/cuda/std/__numeric/inner_product.h +62 -0
  982. cuda/cccl/headers/include/cuda/std/__numeric/iota.h +42 -0
  983. cuda/cccl/headers/include/cuda/std/__numeric/midpoint.h +97 -0
  984. cuda/cccl/headers/include/cuda/std/__numeric/partial_sum.h +69 -0
  985. cuda/cccl/headers/include/cuda/std/__numeric/reduce.h +60 -0
  986. cuda/cccl/headers/include/cuda/std/__numeric/transform_exclusive_scan.h +51 -0
  987. cuda/cccl/headers/include/cuda/std/__numeric/transform_inclusive_scan.h +65 -0
  988. cuda/cccl/headers/include/cuda/std/__numeric/transform_reduce.h +72 -0
  989. cuda/cccl/headers/include/cuda/std/__optional/bad_optional_access.h +74 -0
  990. cuda/cccl/headers/include/cuda/std/__optional/hash.h +53 -0
  991. cuda/cccl/headers/include/cuda/std/__optional/make_optional.h +61 -0
  992. cuda/cccl/headers/include/cuda/std/__optional/nullopt.h +43 -0
  993. cuda/cccl/headers/include/cuda/std/__optional/optional.h +861 -0
  994. cuda/cccl/headers/include/cuda/std/__optional/optional_base.h +439 -0
  995. cuda/cccl/headers/include/cuda/std/__optional/optional_ref.h +324 -0
  996. cuda/cccl/headers/include/cuda/std/__pstl/cuda/for_each_n.h +97 -0
  997. cuda/cccl/headers/include/cuda/std/__pstl/dispatch.h +123 -0
  998. cuda/cccl/headers/include/cuda/std/__pstl/for_each.h +71 -0
  999. cuda/cccl/headers/include/cuda/std/__pstl/for_each_n.h +68 -0
  1000. cuda/cccl/headers/include/cuda/std/__random/bernoulli_distribution.h +173 -0
  1001. cuda/cccl/headers/include/cuda/std/__random/binomial_distribution.h +254 -0
  1002. cuda/cccl/headers/include/cuda/std/__random/cauchy_distribution.h +192 -0
  1003. cuda/cccl/headers/include/cuda/std/__random/chi_squared_distribution.h +179 -0
  1004. cuda/cccl/headers/include/cuda/std/__random/exponential_distribution.h +187 -0
  1005. cuda/cccl/headers/include/cuda/std/__random/extreme_value_distribution.h +196 -0
  1006. cuda/cccl/headers/include/cuda/std/__random/fisher_f_distribution.h +196 -0
  1007. cuda/cccl/headers/include/cuda/std/__random/gamma_distribution.h +257 -0
  1008. cuda/cccl/headers/include/cuda/std/__random/generate_canonical.h +56 -0
  1009. cuda/cccl/headers/include/cuda/std/__random/geometric_distribution.h +179 -0
  1010. cuda/cccl/headers/include/cuda/std/__random/is_seed_sequence.h +39 -0
  1011. cuda/cccl/headers/include/cuda/std/__random/is_valid.h +70 -0
  1012. cuda/cccl/headers/include/cuda/std/__random/linear_congruential_engine.h +398 -0
  1013. cuda/cccl/headers/include/cuda/std/__random/lognormal_distribution.h +174 -0
  1014. cuda/cccl/headers/include/cuda/std/__random/negative_binomial_distribution.h +212 -0
  1015. cuda/cccl/headers/include/cuda/std/__random/normal_distribution.h +232 -0
  1016. cuda/cccl/headers/include/cuda/std/__random/philox_engine.h +562 -0
  1017. cuda/cccl/headers/include/cuda/std/__random/poisson_distribution.h +338 -0
  1018. cuda/cccl/headers/include/cuda/std/__random/seed_seq.h +204 -0
  1019. cuda/cccl/headers/include/cuda/std/__random/student_t_distribution.h +186 -0
  1020. cuda/cccl/headers/include/cuda/std/__random/uniform_int_distribution.h +341 -0
  1021. cuda/cccl/headers/include/cuda/std/__random/uniform_real_distribution.h +192 -0
  1022. cuda/cccl/headers/include/cuda/std/__random/weibull_distribution.h +189 -0
  1023. cuda/cccl/headers/include/cuda/std/__random_ +47 -0
  1024. cuda/cccl/headers/include/cuda/std/__ranges/access.h +303 -0
  1025. cuda/cccl/headers/include/cuda/std/__ranges/all.h +98 -0
  1026. cuda/cccl/headers/include/cuda/std/__ranges/compressed_movable_box.h +889 -0
  1027. cuda/cccl/headers/include/cuda/std/__ranges/concepts.h +302 -0
  1028. cuda/cccl/headers/include/cuda/std/__ranges/counted.h +90 -0
  1029. cuda/cccl/headers/include/cuda/std/__ranges/dangling.h +54 -0
  1030. cuda/cccl/headers/include/cuda/std/__ranges/data.h +136 -0
  1031. cuda/cccl/headers/include/cuda/std/__ranges/drop_view.h +389 -0
  1032. cuda/cccl/headers/include/cuda/std/__ranges/empty.h +109 -0
  1033. cuda/cccl/headers/include/cuda/std/__ranges/empty_view.h +77 -0
  1034. cuda/cccl/headers/include/cuda/std/__ranges/enable_borrowed_range.h +41 -0
  1035. cuda/cccl/headers/include/cuda/std/__ranges/enable_view.h +78 -0
  1036. cuda/cccl/headers/include/cuda/std/__ranges/from_range.h +36 -0
  1037. cuda/cccl/headers/include/cuda/std/__ranges/iota_view.h +264 -0
  1038. cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +409 -0
  1039. cuda/cccl/headers/include/cuda/std/__ranges/non_propagating_cache.h +210 -0
  1040. cuda/cccl/headers/include/cuda/std/__ranges/owning_view.h +163 -0
  1041. cuda/cccl/headers/include/cuda/std/__ranges/range_adaptor.h +111 -0
  1042. cuda/cccl/headers/include/cuda/std/__ranges/rbegin.h +175 -0
  1043. cuda/cccl/headers/include/cuda/std/__ranges/ref_view.h +121 -0
  1044. cuda/cccl/headers/include/cuda/std/__ranges/rend.h +182 -0
  1045. cuda/cccl/headers/include/cuda/std/__ranges/repeat_view.h +346 -0
  1046. cuda/cccl/headers/include/cuda/std/__ranges/single_view.h +155 -0
  1047. cuda/cccl/headers/include/cuda/std/__ranges/size.h +201 -0
  1048. cuda/cccl/headers/include/cuda/std/__ranges/subrange.h +510 -0
  1049. cuda/cccl/headers/include/cuda/std/__ranges/take_view.h +472 -0
  1050. cuda/cccl/headers/include/cuda/std/__ranges/take_while_view.h +259 -0
  1051. cuda/cccl/headers/include/cuda/std/__ranges/transform_view.h +522 -0
  1052. cuda/cccl/headers/include/cuda/std/__ranges/unwrap_end.h +52 -0
  1053. cuda/cccl/headers/include/cuda/std/__ranges/view_interface.h +183 -0
  1054. cuda/cccl/headers/include/cuda/std/__ranges/views.h +38 -0
  1055. cuda/cccl/headers/include/cuda/std/__semaphore/atomic_semaphore.h +234 -0
  1056. cuda/cccl/headers/include/cuda/std/__semaphore/counting_semaphore.h +51 -0
  1057. cuda/cccl/headers/include/cuda/std/__string/char_traits.h +190 -0
  1058. cuda/cccl/headers/include/cuda/std/__string/constexpr_c_functions.h +580 -0
  1059. cuda/cccl/headers/include/cuda/std/__string/helper_functions.h +296 -0
  1060. cuda/cccl/headers/include/cuda/std/__string/string_view.h +244 -0
  1061. cuda/cccl/headers/include/cuda/std/__string_ +29 -0
  1062. cuda/cccl/headers/include/cuda/std/__system_error/errc.h +51 -0
  1063. cuda/cccl/headers/include/cuda/std/__system_error_ +26 -0
  1064. cuda/cccl/headers/include/cuda/std/__thread/threading_support.h +106 -0
  1065. cuda/cccl/headers/include/cuda/std/__thread/threading_support_cuda.h +47 -0
  1066. cuda/cccl/headers/include/cuda/std/__thread/threading_support_external.h +41 -0
  1067. cuda/cccl/headers/include/cuda/std/__thread/threading_support_pthread.h +143 -0
  1068. cuda/cccl/headers/include/cuda/std/__thread/threading_support_win32.h +87 -0
  1069. cuda/cccl/headers/include/cuda/std/__tuple_dir/apply.h +82 -0
  1070. cuda/cccl/headers/include/cuda/std/__tuple_dir/get.h +122 -0
  1071. cuda/cccl/headers/include/cuda/std/__tuple_dir/ignore.h +51 -0
  1072. cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +120 -0
  1073. cuda/cccl/headers/include/cuda/std/__tuple_dir/sfinae_helpers.h +100 -0
  1074. cuda/cccl/headers/include/cuda/std/__tuple_dir/structured_bindings.h +212 -0
  1075. cuda/cccl/headers/include/cuda/std/__tuple_dir/tie.h +55 -0
  1076. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple.h +457 -0
  1077. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_cat.h +158 -0
  1078. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_constraints.h +286 -0
  1079. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_element.h +77 -0
  1080. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_indices.h +44 -0
  1081. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_leaf.h +452 -0
  1082. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +83 -0
  1083. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +68 -0
  1084. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_size.h +79 -0
  1085. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_types.h +35 -0
  1086. cuda/cccl/headers/include/cuda/std/__tuple_dir/vector_types.h +290 -0
  1087. cuda/cccl/headers/include/cuda/std/__type_traits/add_const.h +40 -0
  1088. cuda/cccl/headers/include/cuda/std/__type_traits/add_cv.h +40 -0
  1089. cuda/cccl/headers/include/cuda/std/__type_traits/add_lvalue_reference.h +62 -0
  1090. cuda/cccl/headers/include/cuda/std/__type_traits/add_pointer.h +65 -0
  1091. cuda/cccl/headers/include/cuda/std/__type_traits/add_rvalue_reference.h +62 -0
  1092. cuda/cccl/headers/include/cuda/std/__type_traits/add_volatile.h +40 -0
  1093. cuda/cccl/headers/include/cuda/std/__type_traits/aligned_storage.h +155 -0
  1094. cuda/cccl/headers/include/cuda/std/__type_traits/aligned_union.h +62 -0
  1095. cuda/cccl/headers/include/cuda/std/__type_traits/alignment_of.h +41 -0
  1096. cuda/cccl/headers/include/cuda/std/__type_traits/always_false.h +35 -0
  1097. cuda/cccl/headers/include/cuda/std/__type_traits/can_extract_key.h +68 -0
  1098. cuda/cccl/headers/include/cuda/std/__type_traits/common_reference.h +262 -0
  1099. cuda/cccl/headers/include/cuda/std/__type_traits/common_type.h +173 -0
  1100. cuda/cccl/headers/include/cuda/std/__type_traits/conditional.h +65 -0
  1101. cuda/cccl/headers/include/cuda/std/__type_traits/conjunction.h +67 -0
  1102. cuda/cccl/headers/include/cuda/std/__type_traits/copy_cv.h +50 -0
  1103. cuda/cccl/headers/include/cuda/std/__type_traits/copy_cvref.h +148 -0
  1104. cuda/cccl/headers/include/cuda/std/__type_traits/decay.h +83 -0
  1105. cuda/cccl/headers/include/cuda/std/__type_traits/dependent_type.h +35 -0
  1106. cuda/cccl/headers/include/cuda/std/__type_traits/disjunction.h +77 -0
  1107. cuda/cccl/headers/include/cuda/std/__type_traits/enable_if.h +43 -0
  1108. cuda/cccl/headers/include/cuda/std/__type_traits/extent.h +68 -0
  1109. cuda/cccl/headers/include/cuda/std/__type_traits/fold.h +47 -0
  1110. cuda/cccl/headers/include/cuda/std/__type_traits/has_unique_object_representation.h +46 -0
  1111. cuda/cccl/headers/include/cuda/std/__type_traits/has_virtual_destructor.h +42 -0
  1112. cuda/cccl/headers/include/cuda/std/__type_traits/integral_constant.h +62 -0
  1113. cuda/cccl/headers/include/cuda/std/__type_traits/is_abstract.h +42 -0
  1114. cuda/cccl/headers/include/cuda/std/__type_traits/is_aggregate.h +42 -0
  1115. cuda/cccl/headers/include/cuda/std/__type_traits/is_allocator.h +46 -0
  1116. cuda/cccl/headers/include/cuda/std/__type_traits/is_arithmetic.h +42 -0
  1117. cuda/cccl/headers/include/cuda/std/__type_traits/is_array.h +62 -0
  1118. cuda/cccl/headers/include/cuda/std/__type_traits/is_assignable.h +78 -0
  1119. cuda/cccl/headers/include/cuda/std/__type_traits/is_base_of.h +42 -0
  1120. cuda/cccl/headers/include/cuda/std/__type_traits/is_bounded_array.h +44 -0
  1121. cuda/cccl/headers/include/cuda/std/__type_traits/is_callable.h +60 -0
  1122. cuda/cccl/headers/include/cuda/std/__type_traits/is_char_like_type.h +38 -0
  1123. cuda/cccl/headers/include/cuda/std/__type_traits/is_class.h +42 -0
  1124. cuda/cccl/headers/include/cuda/std/__type_traits/is_comparable.h +78 -0
  1125. cuda/cccl/headers/include/cuda/std/__type_traits/is_compound.h +58 -0
  1126. cuda/cccl/headers/include/cuda/std/__type_traits/is_const.h +56 -0
  1127. cuda/cccl/headers/include/cuda/std/__type_traits/is_constant_evaluated.h +49 -0
  1128. cuda/cccl/headers/include/cuda/std/__type_traits/is_constructible.h +174 -0
  1129. cuda/cccl/headers/include/cuda/std/__type_traits/is_convertible.h +211 -0
  1130. cuda/cccl/headers/include/cuda/std/__type_traits/is_copy_assignable.h +43 -0
  1131. cuda/cccl/headers/include/cuda/std/__type_traits/is_copy_constructible.h +43 -0
  1132. cuda/cccl/headers/include/cuda/std/__type_traits/is_core_convertible.h +47 -0
  1133. cuda/cccl/headers/include/cuda/std/__type_traits/is_corresponding_member.h +42 -0
  1134. cuda/cccl/headers/include/cuda/std/__type_traits/is_default_constructible.h +40 -0
  1135. cuda/cccl/headers/include/cuda/std/__type_traits/is_destructible.h +115 -0
  1136. cuda/cccl/headers/include/cuda/std/__type_traits/is_empty.h +42 -0
  1137. cuda/cccl/headers/include/cuda/std/__type_traits/is_enum.h +42 -0
  1138. cuda/cccl/headers/include/cuda/std/__type_traits/is_execution_policy.h +63 -0
  1139. cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_arithmetic.h +38 -0
  1140. cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_floating_point.h +79 -0
  1141. cuda/cccl/headers/include/cuda/std/__type_traits/is_final.h +42 -0
  1142. cuda/cccl/headers/include/cuda/std/__type_traits/is_floating_point.h +53 -0
  1143. cuda/cccl/headers/include/cuda/std/__type_traits/is_fully_bounded_array.h +47 -0
  1144. cuda/cccl/headers/include/cuda/std/__type_traits/is_function.h +61 -0
  1145. cuda/cccl/headers/include/cuda/std/__type_traits/is_fundamental.h +56 -0
  1146. cuda/cccl/headers/include/cuda/std/__type_traits/is_implicitly_default_constructible.h +57 -0
  1147. cuda/cccl/headers/include/cuda/std/__type_traits/is_integer.h +45 -0
  1148. cuda/cccl/headers/include/cuda/std/__type_traits/is_integral.h +123 -0
  1149. cuda/cccl/headers/include/cuda/std/__type_traits/is_layout_compatible.h +45 -0
  1150. cuda/cccl/headers/include/cuda/std/__type_traits/is_literal_type.h +42 -0
  1151. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_function_pointer.h +79 -0
  1152. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_object_pointer.h +57 -0
  1153. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_pointer.h +57 -0
  1154. cuda/cccl/headers/include/cuda/std/__type_traits/is_move_assignable.h +43 -0
  1155. cuda/cccl/headers/include/cuda/std/__type_traits/is_move_constructible.h +42 -0
  1156. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_assignable.h +70 -0
  1157. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_constructible.h +72 -0
  1158. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_convertible.h +59 -0
  1159. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_copy_assignable.h +60 -0
  1160. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_copy_constructible.h +43 -0
  1161. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_default_constructible.h +54 -0
  1162. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_destructible.h +82 -0
  1163. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_move_assignable.h +60 -0
  1164. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_move_constructible.h +42 -0
  1165. cuda/cccl/headers/include/cuda/std/__type_traits/is_null_pointer.h +43 -0
  1166. cuda/cccl/headers/include/cuda/std/__type_traits/is_object.h +57 -0
  1167. cuda/cccl/headers/include/cuda/std/__type_traits/is_one_of.h +37 -0
  1168. cuda/cccl/headers/include/cuda/std/__type_traits/is_pod.h +42 -0
  1169. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer.h +60 -0
  1170. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer_interconvertible_base_of.h +84 -0
  1171. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer_interconvertible_with_class.h +42 -0
  1172. cuda/cccl/headers/include/cuda/std/__type_traits/is_polymorphic.h +42 -0
  1173. cuda/cccl/headers/include/cuda/std/__type_traits/is_primary_template.h +121 -0
  1174. cuda/cccl/headers/include/cuda/std/__type_traits/is_reference.h +95 -0
  1175. cuda/cccl/headers/include/cuda/std/__type_traits/is_referenceable.h +55 -0
  1176. cuda/cccl/headers/include/cuda/std/__type_traits/is_same.h +88 -0
  1177. cuda/cccl/headers/include/cuda/std/__type_traits/is_scalar.h +60 -0
  1178. cuda/cccl/headers/include/cuda/std/__type_traits/is_scoped_enum.h +49 -0
  1179. cuda/cccl/headers/include/cuda/std/__type_traits/is_signed.h +65 -0
  1180. cuda/cccl/headers/include/cuda/std/__type_traits/is_signed_integer.h +59 -0
  1181. cuda/cccl/headers/include/cuda/std/__type_traits/is_standard_layout.h +42 -0
  1182. cuda/cccl/headers/include/cuda/std/__type_traits/is_swappable.h +200 -0
  1183. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivial.h +42 -0
  1184. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_assignable.h +43 -0
  1185. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_constructible.h +43 -0
  1186. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_assignable.h +46 -0
  1187. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_constructible.h +45 -0
  1188. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copyable.h +42 -0
  1189. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_default_constructible.h +42 -0
  1190. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_destructible.h +58 -0
  1191. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_assignable.h +45 -0
  1192. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_constructible.h +44 -0
  1193. cuda/cccl/headers/include/cuda/std/__type_traits/is_unbounded_array.h +43 -0
  1194. cuda/cccl/headers/include/cuda/std/__type_traits/is_union.h +42 -0
  1195. cuda/cccl/headers/include/cuda/std/__type_traits/is_unsigned.h +66 -0
  1196. cuda/cccl/headers/include/cuda/std/__type_traits/is_unsigned_integer.h +59 -0
  1197. cuda/cccl/headers/include/cuda/std/__type_traits/is_valid_expansion.h +41 -0
  1198. cuda/cccl/headers/include/cuda/std/__type_traits/is_void.h +55 -0
  1199. cuda/cccl/headers/include/cuda/std/__type_traits/is_volatile.h +56 -0
  1200. cuda/cccl/headers/include/cuda/std/__type_traits/lazy.h +35 -0
  1201. cuda/cccl/headers/include/cuda/std/__type_traits/make_const_lvalue_ref.h +36 -0
  1202. cuda/cccl/headers/include/cuda/std/__type_traits/make_nbit_int.h +107 -0
  1203. cuda/cccl/headers/include/cuda/std/__type_traits/make_signed.h +140 -0
  1204. cuda/cccl/headers/include/cuda/std/__type_traits/make_unsigned.h +151 -0
  1205. cuda/cccl/headers/include/cuda/std/__type_traits/maybe_const.h +36 -0
  1206. cuda/cccl/headers/include/cuda/std/__type_traits/nat.h +39 -0
  1207. cuda/cccl/headers/include/cuda/std/__type_traits/negation.h +44 -0
  1208. cuda/cccl/headers/include/cuda/std/__type_traits/num_bits.h +122 -0
  1209. cuda/cccl/headers/include/cuda/std/__type_traits/promote.h +163 -0
  1210. cuda/cccl/headers/include/cuda/std/__type_traits/rank.h +60 -0
  1211. cuda/cccl/headers/include/cuda/std/__type_traits/reference_constructs_from_temporary.h +57 -0
  1212. cuda/cccl/headers/include/cuda/std/__type_traits/reference_converts_from_temporary.h +56 -0
  1213. cuda/cccl/headers/include/cuda/std/__type_traits/remove_all_extents.h +66 -0
  1214. cuda/cccl/headers/include/cuda/std/__type_traits/remove_const.h +59 -0
  1215. cuda/cccl/headers/include/cuda/std/__type_traits/remove_const_ref.h +37 -0
  1216. cuda/cccl/headers/include/cuda/std/__type_traits/remove_cv.h +57 -0
  1217. cuda/cccl/headers/include/cuda/std/__type_traits/remove_cvref.h +57 -0
  1218. cuda/cccl/headers/include/cuda/std/__type_traits/remove_extent.h +65 -0
  1219. cuda/cccl/headers/include/cuda/std/__type_traits/remove_pointer.h +73 -0
  1220. cuda/cccl/headers/include/cuda/std/__type_traits/remove_reference.h +72 -0
  1221. cuda/cccl/headers/include/cuda/std/__type_traits/remove_volatile.h +58 -0
  1222. cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +47 -0
  1223. cuda/cccl/headers/include/cuda/std/__type_traits/type_identity.h +40 -0
  1224. cuda/cccl/headers/include/cuda/std/__type_traits/type_list.h +1067 -0
  1225. cuda/cccl/headers/include/cuda/std/__type_traits/type_set.h +131 -0
  1226. cuda/cccl/headers/include/cuda/std/__type_traits/underlying_type.h +52 -0
  1227. cuda/cccl/headers/include/cuda/std/__type_traits/void_t.h +34 -0
  1228. cuda/cccl/headers/include/cuda/std/__utility/as_const.h +73 -0
  1229. cuda/cccl/headers/include/cuda/std/__utility/auto_cast.h +34 -0
  1230. cuda/cccl/headers/include/cuda/std/__utility/cmp.h +114 -0
  1231. cuda/cccl/headers/include/cuda/std/__utility/convert_to_integral.h +101 -0
  1232. cuda/cccl/headers/include/cuda/std/__utility/ctad_support.h +27 -0
  1233. cuda/cccl/headers/include/cuda/std/__utility/declval.h +76 -0
  1234. cuda/cccl/headers/include/cuda/std/__utility/delegate_constructors.h +51 -0
  1235. cuda/cccl/headers/include/cuda/std/__utility/exception_guard.h +162 -0
  1236. cuda/cccl/headers/include/cuda/std/__utility/exchange.h +46 -0
  1237. cuda/cccl/headers/include/cuda/std/__utility/forward.h +82 -0
  1238. cuda/cccl/headers/include/cuda/std/__utility/forward_like.h +82 -0
  1239. cuda/cccl/headers/include/cuda/std/__utility/in_place.h +86 -0
  1240. cuda/cccl/headers/include/cuda/std/__utility/integer_sequence.h +249 -0
  1241. cuda/cccl/headers/include/cuda/std/__utility/monostate.h +99 -0
  1242. cuda/cccl/headers/include/cuda/std/__utility/move.h +126 -0
  1243. cuda/cccl/headers/include/cuda/std/__utility/pair.h +791 -0
  1244. cuda/cccl/headers/include/cuda/std/__utility/piecewise_construct.h +37 -0
  1245. cuda/cccl/headers/include/cuda/std/__utility/pod_tuple.h +425 -0
  1246. cuda/cccl/headers/include/cuda/std/__utility/priority_tag.h +40 -0
  1247. cuda/cccl/headers/include/cuda/std/__utility/rel_ops.h +61 -0
  1248. cuda/cccl/headers/include/cuda/std/__utility/swap.h +64 -0
  1249. cuda/cccl/headers/include/cuda/std/__utility/to_underlying.h +40 -0
  1250. cuda/cccl/headers/include/cuda/std/__utility/typeid.h +421 -0
  1251. cuda/cccl/headers/include/cuda/std/__utility/undefined.h +34 -0
  1252. cuda/cccl/headers/include/cuda/std/__utility/unreachable.h +37 -0
  1253. cuda/cccl/headers/include/cuda/std/__variant/bad_variant_access.h +74 -0
  1254. cuda/cccl/headers/include/cuda/std/__variant/comparison.h +207 -0
  1255. cuda/cccl/headers/include/cuda/std/__variant/get.h +192 -0
  1256. cuda/cccl/headers/include/cuda/std/__variant/hash.h +82 -0
  1257. cuda/cccl/headers/include/cuda/std/__variant/sfinae_helpers.h +89 -0
  1258. cuda/cccl/headers/include/cuda/std/__variant/variant.h +250 -0
  1259. cuda/cccl/headers/include/cuda/std/__variant/variant_access.h +70 -0
  1260. cuda/cccl/headers/include/cuda/std/__variant/variant_base.h +683 -0
  1261. cuda/cccl/headers/include/cuda/std/__variant/variant_constraints.h +135 -0
  1262. cuda/cccl/headers/include/cuda/std/__variant/variant_match.h +126 -0
  1263. cuda/cccl/headers/include/cuda/std/__variant/variant_traits.h +184 -0
  1264. cuda/cccl/headers/include/cuda/std/__variant/variant_visit.h +225 -0
  1265. cuda/cccl/headers/include/cuda/std/__variant/visit.h +148 -0
  1266. cuda/cccl/headers/include/cuda/std/algorithm +138 -0
  1267. cuda/cccl/headers/include/cuda/std/array +519 -0
  1268. cuda/cccl/headers/include/cuda/std/atomic +810 -0
  1269. cuda/cccl/headers/include/cuda/std/barrier +42 -0
  1270. cuda/cccl/headers/include/cuda/std/bit +35 -0
  1271. cuda/cccl/headers/include/cuda/std/bitset +986 -0
  1272. cuda/cccl/headers/include/cuda/std/cassert +28 -0
  1273. cuda/cccl/headers/include/cuda/std/ccomplex +15 -0
  1274. cuda/cccl/headers/include/cuda/std/cfloat +59 -0
  1275. cuda/cccl/headers/include/cuda/std/charconv +31 -0
  1276. cuda/cccl/headers/include/cuda/std/chrono +26 -0
  1277. cuda/cccl/headers/include/cuda/std/climits +61 -0
  1278. cuda/cccl/headers/include/cuda/std/cmath +87 -0
  1279. cuda/cccl/headers/include/cuda/std/complex +50 -0
  1280. cuda/cccl/headers/include/cuda/std/concepts +48 -0
  1281. cuda/cccl/headers/include/cuda/std/cstddef +28 -0
  1282. cuda/cccl/headers/include/cuda/std/cstdint +178 -0
  1283. cuda/cccl/headers/include/cuda/std/cstdlib +31 -0
  1284. cuda/cccl/headers/include/cuda/std/cstring +110 -0
  1285. cuda/cccl/headers/include/cuda/std/ctime +155 -0
  1286. cuda/cccl/headers/include/cuda/std/detail/__config +22 -0
  1287. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/chrono +2506 -0
  1288. cuda/cccl/headers/include/cuda/std/execution +29 -0
  1289. cuda/cccl/headers/include/cuda/std/expected +30 -0
  1290. cuda/cccl/headers/include/cuda/std/functional +56 -0
  1291. cuda/cccl/headers/include/cuda/std/initializer_list +44 -0
  1292. cuda/cccl/headers/include/cuda/std/inplace_vector +2171 -0
  1293. cuda/cccl/headers/include/cuda/std/iterator +70 -0
  1294. cuda/cccl/headers/include/cuda/std/latch +34 -0
  1295. cuda/cccl/headers/include/cuda/std/limits +28 -0
  1296. cuda/cccl/headers/include/cuda/std/linalg +30 -0
  1297. cuda/cccl/headers/include/cuda/std/mdspan +38 -0
  1298. cuda/cccl/headers/include/cuda/std/memory +40 -0
  1299. cuda/cccl/headers/include/cuda/std/numbers +344 -0
  1300. cuda/cccl/headers/include/cuda/std/numeric +41 -0
  1301. cuda/cccl/headers/include/cuda/std/optional +31 -0
  1302. cuda/cccl/headers/include/cuda/std/ranges +70 -0
  1303. cuda/cccl/headers/include/cuda/std/ratio +416 -0
  1304. cuda/cccl/headers/include/cuda/std/semaphore +31 -0
  1305. cuda/cccl/headers/include/cuda/std/source_location +107 -0
  1306. cuda/cccl/headers/include/cuda/std/span +599 -0
  1307. cuda/cccl/headers/include/cuda/std/string_view +924 -0
  1308. cuda/cccl/headers/include/cuda/std/tuple +43 -0
  1309. cuda/cccl/headers/include/cuda/std/type_traits +176 -0
  1310. cuda/cccl/headers/include/cuda/std/utility +70 -0
  1311. cuda/cccl/headers/include/cuda/std/variant +32 -0
  1312. cuda/cccl/headers/include/cuda/std/version +240 -0
  1313. cuda/cccl/headers/include/cuda/stream +32 -0
  1314. cuda/cccl/headers/include/cuda/stream_ref +59 -0
  1315. cuda/cccl/headers/include/cuda/tma +25 -0
  1316. cuda/cccl/headers/include/cuda/type_traits +27 -0
  1317. cuda/cccl/headers/include/cuda/utility +28 -0
  1318. cuda/cccl/headers/include/cuda/version +16 -0
  1319. cuda/cccl/headers/include/cuda/warp +28 -0
  1320. cuda/cccl/headers/include/cuda/work_stealing +26 -0
  1321. cuda/cccl/headers/include/nv/detail/__preprocessor +169 -0
  1322. cuda/cccl/headers/include/nv/detail/__target_macros +739 -0
  1323. cuda/cccl/headers/include/nv/target +241 -0
  1324. cuda/cccl/headers/include/thrust/addressof.h +22 -0
  1325. cuda/cccl/headers/include/thrust/adjacent_difference.h +254 -0
  1326. cuda/cccl/headers/include/thrust/advance.h +60 -0
  1327. cuda/cccl/headers/include/thrust/allocate_unique.h +301 -0
  1328. cuda/cccl/headers/include/thrust/binary_search.h +1911 -0
  1329. cuda/cccl/headers/include/thrust/complex.h +859 -0
  1330. cuda/cccl/headers/include/thrust/copy.h +506 -0
  1331. cuda/cccl/headers/include/thrust/count.h +245 -0
  1332. cuda/cccl/headers/include/thrust/detail/adjacent_difference.inl +108 -0
  1333. cuda/cccl/headers/include/thrust/detail/alignment.h +81 -0
  1334. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +629 -0
  1335. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +193 -0
  1336. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +95 -0
  1337. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +79 -0
  1338. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +76 -0
  1339. cuda/cccl/headers/include/thrust/detail/allocator/no_throw_allocator.h +74 -0
  1340. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +113 -0
  1341. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +114 -0
  1342. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.h +77 -0
  1343. cuda/cccl/headers/include/thrust/detail/allocator_aware_execution_policy.h +96 -0
  1344. cuda/cccl/headers/include/thrust/detail/binary_search.inl +537 -0
  1345. cuda/cccl/headers/include/thrust/detail/caching_allocator.h +47 -0
  1346. cuda/cccl/headers/include/thrust/detail/complex/arithmetic.h +210 -0
  1347. cuda/cccl/headers/include/thrust/detail/complex/catrig.h +877 -0
  1348. cuda/cccl/headers/include/thrust/detail/complex/catrigf.h +591 -0
  1349. cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +234 -0
  1350. cuda/cccl/headers/include/thrust/detail/complex/ccoshf.h +162 -0
  1351. cuda/cccl/headers/include/thrust/detail/complex/cexp.h +194 -0
  1352. cuda/cccl/headers/include/thrust/detail/complex/cexpf.h +170 -0
  1353. cuda/cccl/headers/include/thrust/detail/complex/clog.h +222 -0
  1354. cuda/cccl/headers/include/thrust/detail/complex/clogf.h +209 -0
  1355. cuda/cccl/headers/include/thrust/detail/complex/complex.inl +263 -0
  1356. cuda/cccl/headers/include/thrust/detail/complex/cpow.h +53 -0
  1357. cuda/cccl/headers/include/thrust/detail/complex/cproj.h +75 -0
  1358. cuda/cccl/headers/include/thrust/detail/complex/csinh.h +228 -0
  1359. cuda/cccl/headers/include/thrust/detail/complex/csinhf.h +169 -0
  1360. cuda/cccl/headers/include/thrust/detail/complex/csqrt.h +178 -0
  1361. cuda/cccl/headers/include/thrust/detail/complex/csqrtf.h +174 -0
  1362. cuda/cccl/headers/include/thrust/detail/complex/ctanh.h +212 -0
  1363. cuda/cccl/headers/include/thrust/detail/complex/ctanhf.h +136 -0
  1364. cuda/cccl/headers/include/thrust/detail/complex/math_private.h +130 -0
  1365. cuda/cccl/headers/include/thrust/detail/complex/stream.h +73 -0
  1366. cuda/cccl/headers/include/thrust/detail/config/compiler.h +38 -0
  1367. cuda/cccl/headers/include/thrust/detail/config/config.h +43 -0
  1368. cuda/cccl/headers/include/thrust/detail/config/cpp_dialect.h +78 -0
  1369. cuda/cccl/headers/include/thrust/detail/config/device_system.h +57 -0
  1370. cuda/cccl/headers/include/thrust/detail/config/host_system.h +50 -0
  1371. cuda/cccl/headers/include/thrust/detail/config/memory_resource.h +41 -0
  1372. cuda/cccl/headers/include/thrust/detail/config/namespace.h +164 -0
  1373. cuda/cccl/headers/include/thrust/detail/config/simple_defines.h +48 -0
  1374. cuda/cccl/headers/include/thrust/detail/config.h +36 -0
  1375. cuda/cccl/headers/include/thrust/detail/contiguous_storage.h +227 -0
  1376. cuda/cccl/headers/include/thrust/detail/contiguous_storage.inl +272 -0
  1377. cuda/cccl/headers/include/thrust/detail/copy.h +70 -0
  1378. cuda/cccl/headers/include/thrust/detail/copy.inl +146 -0
  1379. cuda/cccl/headers/include/thrust/detail/copy_if.h +62 -0
  1380. cuda/cccl/headers/include/thrust/detail/copy_if.inl +114 -0
  1381. cuda/cccl/headers/include/thrust/detail/count.h +55 -0
  1382. cuda/cccl/headers/include/thrust/detail/count.inl +101 -0
  1383. cuda/cccl/headers/include/thrust/detail/device_ptr.inl +48 -0
  1384. cuda/cccl/headers/include/thrust/detail/equal.inl +105 -0
  1385. cuda/cccl/headers/include/thrust/detail/event_error.h +160 -0
  1386. cuda/cccl/headers/include/thrust/detail/execute_with_allocator.h +79 -0
  1387. cuda/cccl/headers/include/thrust/detail/execute_with_allocator_fwd.h +61 -0
  1388. cuda/cccl/headers/include/thrust/detail/execution_policy.h +120 -0
  1389. cuda/cccl/headers/include/thrust/detail/extrema.inl +196 -0
  1390. cuda/cccl/headers/include/thrust/detail/fill.inl +97 -0
  1391. cuda/cccl/headers/include/thrust/detail/find.inl +125 -0
  1392. cuda/cccl/headers/include/thrust/detail/for_each.inl +96 -0
  1393. cuda/cccl/headers/include/thrust/detail/function.h +49 -0
  1394. cuda/cccl/headers/include/thrust/detail/functional/actor.h +213 -0
  1395. cuda/cccl/headers/include/thrust/detail/functional/operators.h +384 -0
  1396. cuda/cccl/headers/include/thrust/detail/gather.inl +185 -0
  1397. cuda/cccl/headers/include/thrust/detail/generate.inl +97 -0
  1398. cuda/cccl/headers/include/thrust/detail/get_iterator_value.h +60 -0
  1399. cuda/cccl/headers/include/thrust/detail/inner_product.inl +130 -0
  1400. cuda/cccl/headers/include/thrust/detail/internal_functional.h +335 -0
  1401. cuda/cccl/headers/include/thrust/detail/logical.inl +125 -0
  1402. cuda/cccl/headers/include/thrust/detail/malloc_and_free.h +89 -0
  1403. cuda/cccl/headers/include/thrust/detail/malloc_and_free_fwd.h +45 -0
  1404. cuda/cccl/headers/include/thrust/detail/memory_algorithms.h +208 -0
  1405. cuda/cccl/headers/include/thrust/detail/merge.inl +288 -0
  1406. cuda/cccl/headers/include/thrust/detail/mismatch.inl +107 -0
  1407. cuda/cccl/headers/include/thrust/detail/nvtx_policy.h +41 -0
  1408. cuda/cccl/headers/include/thrust/detail/overlapped_copy.h +120 -0
  1409. cuda/cccl/headers/include/thrust/detail/partition.inl +390 -0
  1410. cuda/cccl/headers/include/thrust/detail/pointer.h +313 -0
  1411. cuda/cccl/headers/include/thrust/detail/preprocessor.h +652 -0
  1412. cuda/cccl/headers/include/thrust/detail/random_bijection.h +177 -0
  1413. cuda/cccl/headers/include/thrust/detail/range/head_flags.h +114 -0
  1414. cuda/cccl/headers/include/thrust/detail/range/tail_flags.h +128 -0
  1415. cuda/cccl/headers/include/thrust/detail/raw_pointer_cast.h +52 -0
  1416. cuda/cccl/headers/include/thrust/detail/raw_reference_cast.h +191 -0
  1417. cuda/cccl/headers/include/thrust/detail/reduce.inl +396 -0
  1418. cuda/cccl/headers/include/thrust/detail/reference.h +521 -0
  1419. cuda/cccl/headers/include/thrust/detail/reference_forward_declaration.h +35 -0
  1420. cuda/cccl/headers/include/thrust/detail/remove.inl +225 -0
  1421. cuda/cccl/headers/include/thrust/detail/replace.inl +243 -0
  1422. cuda/cccl/headers/include/thrust/detail/reverse.inl +100 -0
  1423. cuda/cccl/headers/include/thrust/detail/scan.inl +536 -0
  1424. cuda/cccl/headers/include/thrust/detail/scatter.inl +169 -0
  1425. cuda/cccl/headers/include/thrust/detail/seq.h +66 -0
  1426. cuda/cccl/headers/include/thrust/detail/sequence.inl +121 -0
  1427. cuda/cccl/headers/include/thrust/detail/set_operations.inl +993 -0
  1428. cuda/cccl/headers/include/thrust/detail/shuffle.inl +86 -0
  1429. cuda/cccl/headers/include/thrust/detail/sort.inl +385 -0
  1430. cuda/cccl/headers/include/thrust/detail/static_assert.h +56 -0
  1431. cuda/cccl/headers/include/thrust/detail/static_map.h +164 -0
  1432. cuda/cccl/headers/include/thrust/detail/swap_ranges.inl +77 -0
  1433. cuda/cccl/headers/include/thrust/detail/tabulate.inl +74 -0
  1434. cuda/cccl/headers/include/thrust/detail/temporary_array.h +150 -0
  1435. cuda/cccl/headers/include/thrust/detail/temporary_array.inl +121 -0
  1436. cuda/cccl/headers/include/thrust/detail/temporary_buffer.h +93 -0
  1437. cuda/cccl/headers/include/thrust/detail/transform_reduce.inl +81 -0
  1438. cuda/cccl/headers/include/thrust/detail/transform_scan.inl +173 -0
  1439. cuda/cccl/headers/include/thrust/detail/trivial_sequence.h +128 -0
  1440. cuda/cccl/headers/include/thrust/detail/tuple_meta_transform.h +59 -0
  1441. cuda/cccl/headers/include/thrust/detail/type_deduction.h +61 -0
  1442. cuda/cccl/headers/include/thrust/detail/type_traits/has_member_function.h +50 -0
  1443. cuda/cccl/headers/include/thrust/detail/type_traits/has_nested_type.h +43 -0
  1444. cuda/cccl/headers/include/thrust/detail/type_traits/is_call_possible.h +162 -0
  1445. cuda/cccl/headers/include/thrust/detail/type_traits/is_commutative.h +67 -0
  1446. cuda/cccl/headers/include/thrust/detail/type_traits/is_metafunction_defined.h +39 -0
  1447. cuda/cccl/headers/include/thrust/detail/type_traits/is_thrust_pointer.h +55 -0
  1448. cuda/cccl/headers/include/thrust/detail/type_traits/iterator/is_output_iterator.h +48 -0
  1449. cuda/cccl/headers/include/thrust/detail/type_traits/minimum_type.h +91 -0
  1450. cuda/cccl/headers/include/thrust/detail/type_traits/pointer_traits.h +332 -0
  1451. cuda/cccl/headers/include/thrust/detail/type_traits.h +143 -0
  1452. cuda/cccl/headers/include/thrust/detail/uninitialized_copy.inl +102 -0
  1453. cuda/cccl/headers/include/thrust/detail/uninitialized_fill.inl +97 -0
  1454. cuda/cccl/headers/include/thrust/detail/unique.inl +391 -0
  1455. cuda/cccl/headers/include/thrust/detail/use_default.h +34 -0
  1456. cuda/cccl/headers/include/thrust/detail/vector_base.h +613 -0
  1457. cuda/cccl/headers/include/thrust/detail/vector_base.inl +1216 -0
  1458. cuda/cccl/headers/include/thrust/device_allocator.h +134 -0
  1459. cuda/cccl/headers/include/thrust/device_delete.h +74 -0
  1460. cuda/cccl/headers/include/thrust/device_free.h +85 -0
  1461. cuda/cccl/headers/include/thrust/device_make_unique.h +56 -0
  1462. cuda/cccl/headers/include/thrust/device_malloc.h +84 -0
  1463. cuda/cccl/headers/include/thrust/device_malloc_allocator.h +190 -0
  1464. cuda/cccl/headers/include/thrust/device_new.h +112 -0
  1465. cuda/cccl/headers/include/thrust/device_new_allocator.h +179 -0
  1466. cuda/cccl/headers/include/thrust/device_ptr.h +196 -0
  1467. cuda/cccl/headers/include/thrust/device_reference.h +983 -0
  1468. cuda/cccl/headers/include/thrust/device_vector.h +576 -0
  1469. cuda/cccl/headers/include/thrust/distance.h +44 -0
  1470. cuda/cccl/headers/include/thrust/equal.h +247 -0
  1471. cuda/cccl/headers/include/thrust/execution_policy.h +252 -0
  1472. cuda/cccl/headers/include/thrust/extrema.h +657 -0
  1473. cuda/cccl/headers/include/thrust/fill.h +200 -0
  1474. cuda/cccl/headers/include/thrust/find.h +382 -0
  1475. cuda/cccl/headers/include/thrust/for_each.h +261 -0
  1476. cuda/cccl/headers/include/thrust/functional.h +399 -0
  1477. cuda/cccl/headers/include/thrust/gather.h +464 -0
  1478. cuda/cccl/headers/include/thrust/generate.h +193 -0
  1479. cuda/cccl/headers/include/thrust/host_vector.h +576 -0
  1480. cuda/cccl/headers/include/thrust/inner_product.h +264 -0
  1481. cuda/cccl/headers/include/thrust/iterator/constant_iterator.h +221 -0
  1482. cuda/cccl/headers/include/thrust/iterator/counting_iterator.h +338 -0
  1483. cuda/cccl/headers/include/thrust/iterator/detail/any_assign.h +48 -0
  1484. cuda/cccl/headers/include/thrust/iterator/detail/any_system_tag.h +43 -0
  1485. cuda/cccl/headers/include/thrust/iterator/detail/device_system_tag.h +43 -0
  1486. cuda/cccl/headers/include/thrust/iterator/detail/host_system_tag.h +42 -0
  1487. cuda/cccl/headers/include/thrust/iterator/detail/iterator_adaptor_base.h +83 -0
  1488. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_system.h +60 -0
  1489. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_traversal.h +65 -0
  1490. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h +56 -0
  1491. cuda/cccl/headers/include/thrust/iterator/detail/iterator_facade_category.h +184 -0
  1492. cuda/cccl/headers/include/thrust/iterator/detail/minimum_system.h +55 -0
  1493. cuda/cccl/headers/include/thrust/iterator/detail/normal_iterator.h +69 -0
  1494. cuda/cccl/headers/include/thrust/iterator/detail/retag.h +104 -0
  1495. cuda/cccl/headers/include/thrust/iterator/detail/tagged_iterator.h +80 -0
  1496. cuda/cccl/headers/include/thrust/iterator/detail/tuple_of_iterator_references.h +171 -0
  1497. cuda/cccl/headers/include/thrust/iterator/discard_iterator.h +163 -0
  1498. cuda/cccl/headers/include/thrust/iterator/iterator_adaptor.h +251 -0
  1499. cuda/cccl/headers/include/thrust/iterator/iterator_categories.h +211 -0
  1500. cuda/cccl/headers/include/thrust/iterator/iterator_facade.h +659 -0
  1501. cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +334 -0
  1502. cuda/cccl/headers/include/thrust/iterator/iterator_traversal_tags.h +64 -0
  1503. cuda/cccl/headers/include/thrust/iterator/offset_iterator.h +194 -0
  1504. cuda/cccl/headers/include/thrust/iterator/permutation_iterator.h +204 -0
  1505. cuda/cccl/headers/include/thrust/iterator/retag.h +72 -0
  1506. cuda/cccl/headers/include/thrust/iterator/reverse_iterator.h +51 -0
  1507. cuda/cccl/headers/include/thrust/iterator/shuffle_iterator.h +190 -0
  1508. cuda/cccl/headers/include/thrust/iterator/strided_iterator.h +152 -0
  1509. cuda/cccl/headers/include/thrust/iterator/tabulate_output_iterator.h +152 -0
  1510. cuda/cccl/headers/include/thrust/iterator/transform_input_output_iterator.h +225 -0
  1511. cuda/cccl/headers/include/thrust/iterator/transform_iterator.h +353 -0
  1512. cuda/cccl/headers/include/thrust/iterator/transform_output_iterator.h +190 -0
  1513. cuda/cccl/headers/include/thrust/iterator/zip_iterator.h +365 -0
  1514. cuda/cccl/headers/include/thrust/logical.h +290 -0
  1515. cuda/cccl/headers/include/thrust/memory.h +299 -0
  1516. cuda/cccl/headers/include/thrust/merge.h +726 -0
  1517. cuda/cccl/headers/include/thrust/mismatch.h +262 -0
  1518. cuda/cccl/headers/include/thrust/mr/allocator.h +227 -0
  1519. cuda/cccl/headers/include/thrust/mr/device_memory_resource.h +46 -0
  1520. cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +526 -0
  1521. cuda/cccl/headers/include/thrust/mr/disjoint_sync_pool.h +116 -0
  1522. cuda/cccl/headers/include/thrust/mr/disjoint_tls_pool.h +65 -0
  1523. cuda/cccl/headers/include/thrust/mr/fancy_pointer_resource.h +65 -0
  1524. cuda/cccl/headers/include/thrust/mr/host_memory_resource.h +42 -0
  1525. cuda/cccl/headers/include/thrust/mr/memory_resource.h +215 -0
  1526. cuda/cccl/headers/include/thrust/mr/new.h +98 -0
  1527. cuda/cccl/headers/include/thrust/mr/polymorphic_adaptor.h +61 -0
  1528. cuda/cccl/headers/include/thrust/mr/pool.h +526 -0
  1529. cuda/cccl/headers/include/thrust/mr/pool_options.h +172 -0
  1530. cuda/cccl/headers/include/thrust/mr/sync_pool.h +112 -0
  1531. cuda/cccl/headers/include/thrust/mr/tls_pool.h +62 -0
  1532. cuda/cccl/headers/include/thrust/mr/universal_memory_resource.h +29 -0
  1533. cuda/cccl/headers/include/thrust/mr/validator.h +54 -0
  1534. cuda/cccl/headers/include/thrust/pair.h +102 -0
  1535. cuda/cccl/headers/include/thrust/partition.h +1392 -0
  1536. cuda/cccl/headers/include/thrust/per_device_resource.h +110 -0
  1537. cuda/cccl/headers/include/thrust/random/detail/discard_block_engine.inl +182 -0
  1538. cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine.inl +153 -0
  1539. cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine_discard.h +97 -0
  1540. cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine.inl +149 -0
  1541. cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h +46 -0
  1542. cuda/cccl/headers/include/thrust/random/detail/mod.h +94 -0
  1543. cuda/cccl/headers/include/thrust/random/detail/normal_distribution.inl +185 -0
  1544. cuda/cccl/headers/include/thrust/random/detail/normal_distribution_base.h +157 -0
  1545. cuda/cccl/headers/include/thrust/random/detail/random_core_access.h +56 -0
  1546. cuda/cccl/headers/include/thrust/random/detail/subtract_with_carry_engine.inl +199 -0
  1547. cuda/cccl/headers/include/thrust/random/detail/uniform_int_distribution.inl +196 -0
  1548. cuda/cccl/headers/include/thrust/random/detail/uniform_real_distribution.inl +198 -0
  1549. cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine.inl +181 -0
  1550. cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine_max.h +186 -0
  1551. cuda/cccl/headers/include/thrust/random/discard_block_engine.h +238 -0
  1552. cuda/cccl/headers/include/thrust/random/linear_congruential_engine.h +287 -0
  1553. cuda/cccl/headers/include/thrust/random/linear_feedback_shift_engine.h +215 -0
  1554. cuda/cccl/headers/include/thrust/random/normal_distribution.h +256 -0
  1555. cuda/cccl/headers/include/thrust/random/subtract_with_carry_engine.h +245 -0
  1556. cuda/cccl/headers/include/thrust/random/uniform_int_distribution.h +259 -0
  1557. cuda/cccl/headers/include/thrust/random/uniform_real_distribution.h +257 -0
  1558. cuda/cccl/headers/include/thrust/random/xor_combine_engine.h +253 -0
  1559. cuda/cccl/headers/include/thrust/random.h +118 -0
  1560. cuda/cccl/headers/include/thrust/reduce.h +1114 -0
  1561. cuda/cccl/headers/include/thrust/remove.h +768 -0
  1562. cuda/cccl/headers/include/thrust/replace.h +826 -0
  1563. cuda/cccl/headers/include/thrust/reverse.h +215 -0
  1564. cuda/cccl/headers/include/thrust/scan.h +1671 -0
  1565. cuda/cccl/headers/include/thrust/scatter.h +446 -0
  1566. cuda/cccl/headers/include/thrust/sequence.h +277 -0
  1567. cuda/cccl/headers/include/thrust/set_operations.h +3027 -0
  1568. cuda/cccl/headers/include/thrust/shuffle.h +182 -0
  1569. cuda/cccl/headers/include/thrust/sort.h +1320 -0
  1570. cuda/cccl/headers/include/thrust/swap.h +147 -0
  1571. cuda/cccl/headers/include/thrust/system/cpp/detail/adjacent_difference.h +30 -0
  1572. cuda/cccl/headers/include/thrust/system/cpp/detail/assign_value.h +30 -0
  1573. cuda/cccl/headers/include/thrust/system/cpp/detail/binary_search.h +32 -0
  1574. cuda/cccl/headers/include/thrust/system/cpp/detail/copy.h +30 -0
  1575. cuda/cccl/headers/include/thrust/system/cpp/detail/copy_if.h +30 -0
  1576. cuda/cccl/headers/include/thrust/system/cpp/detail/count.h +29 -0
  1577. cuda/cccl/headers/include/thrust/system/cpp/detail/equal.h +29 -0
  1578. cuda/cccl/headers/include/thrust/system/cpp/detail/execution_policy.h +113 -0
  1579. cuda/cccl/headers/include/thrust/system/cpp/detail/extrema.h +30 -0
  1580. cuda/cccl/headers/include/thrust/system/cpp/detail/fill.h +29 -0
  1581. cuda/cccl/headers/include/thrust/system/cpp/detail/find.h +30 -0
  1582. cuda/cccl/headers/include/thrust/system/cpp/detail/for_each.h +30 -0
  1583. cuda/cccl/headers/include/thrust/system/cpp/detail/gather.h +29 -0
  1584. cuda/cccl/headers/include/thrust/system/cpp/detail/generate.h +29 -0
  1585. cuda/cccl/headers/include/thrust/system/cpp/detail/get_value.h +30 -0
  1586. cuda/cccl/headers/include/thrust/system/cpp/detail/inner_product.h +29 -0
  1587. cuda/cccl/headers/include/thrust/system/cpp/detail/iter_swap.h +30 -0
  1588. cuda/cccl/headers/include/thrust/system/cpp/detail/logical.h +29 -0
  1589. cuda/cccl/headers/include/thrust/system/cpp/detail/malloc_and_free.h +30 -0
  1590. cuda/cccl/headers/include/thrust/system/cpp/detail/memory.inl +55 -0
  1591. cuda/cccl/headers/include/thrust/system/cpp/detail/merge.h +30 -0
  1592. cuda/cccl/headers/include/thrust/system/cpp/detail/mismatch.h +29 -0
  1593. cuda/cccl/headers/include/thrust/system/cpp/detail/partition.h +30 -0
  1594. cuda/cccl/headers/include/thrust/system/cpp/detail/per_device_resource.h +29 -0
  1595. cuda/cccl/headers/include/thrust/system/cpp/detail/reduce.h +30 -0
  1596. cuda/cccl/headers/include/thrust/system/cpp/detail/reduce_by_key.h +30 -0
  1597. cuda/cccl/headers/include/thrust/system/cpp/detail/remove.h +30 -0
  1598. cuda/cccl/headers/include/thrust/system/cpp/detail/replace.h +29 -0
  1599. cuda/cccl/headers/include/thrust/system/cpp/detail/reverse.h +29 -0
  1600. cuda/cccl/headers/include/thrust/system/cpp/detail/scan.h +30 -0
  1601. cuda/cccl/headers/include/thrust/system/cpp/detail/scan_by_key.h +30 -0
  1602. cuda/cccl/headers/include/thrust/system/cpp/detail/scatter.h +29 -0
  1603. cuda/cccl/headers/include/thrust/system/cpp/detail/sequence.h +29 -0
  1604. cuda/cccl/headers/include/thrust/system/cpp/detail/set_operations.h +30 -0
  1605. cuda/cccl/headers/include/thrust/system/cpp/detail/sort.h +30 -0
  1606. cuda/cccl/headers/include/thrust/system/cpp/detail/swap_ranges.h +29 -0
  1607. cuda/cccl/headers/include/thrust/system/cpp/detail/tabulate.h +29 -0
  1608. cuda/cccl/headers/include/thrust/system/cpp/detail/temporary_buffer.h +29 -0
  1609. cuda/cccl/headers/include/thrust/system/cpp/detail/transform.h +29 -0
  1610. cuda/cccl/headers/include/thrust/system/cpp/detail/transform_reduce.h +29 -0
  1611. cuda/cccl/headers/include/thrust/system/cpp/detail/transform_scan.h +29 -0
  1612. cuda/cccl/headers/include/thrust/system/cpp/detail/uninitialized_copy.h +29 -0
  1613. cuda/cccl/headers/include/thrust/system/cpp/detail/uninitialized_fill.h +29 -0
  1614. cuda/cccl/headers/include/thrust/system/cpp/detail/unique.h +30 -0
  1615. cuda/cccl/headers/include/thrust/system/cpp/detail/unique_by_key.h +30 -0
  1616. cuda/cccl/headers/include/thrust/system/cpp/execution_policy.h +63 -0
  1617. cuda/cccl/headers/include/thrust/system/cpp/memory.h +105 -0
  1618. cuda/cccl/headers/include/thrust/system/cpp/memory_resource.h +70 -0
  1619. cuda/cccl/headers/include/thrust/system/cpp/pointer.h +118 -0
  1620. cuda/cccl/headers/include/thrust/system/cpp/vector.h +95 -0
  1621. cuda/cccl/headers/include/thrust/system/cuda/config.h +126 -0
  1622. cuda/cccl/headers/include/thrust/system/cuda/detail/adjacent_difference.h +218 -0
  1623. cuda/cccl/headers/include/thrust/system/cuda/detail/assign_value.h +124 -0
  1624. cuda/cccl/headers/include/thrust/system/cuda/detail/binary_search.h +29 -0
  1625. cuda/cccl/headers/include/thrust/system/cuda/detail/cdp_dispatch.h +72 -0
  1626. cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +272 -0
  1627. cuda/cccl/headers/include/thrust/system/cuda/detail/copy_if.h +251 -0
  1628. cuda/cccl/headers/include/thrust/system/cuda/detail/core/agent_launcher.h +280 -0
  1629. cuda/cccl/headers/include/thrust/system/cuda/detail/core/triple_chevron_launch.h +162 -0
  1630. cuda/cccl/headers/include/thrust/system/cuda/detail/core/util.h +578 -0
  1631. cuda/cccl/headers/include/thrust/system/cuda/detail/count.h +75 -0
  1632. cuda/cccl/headers/include/thrust/system/cuda/detail/cross_system.h +241 -0
  1633. cuda/cccl/headers/include/thrust/system/cuda/detail/dispatch.h +230 -0
  1634. cuda/cccl/headers/include/thrust/system/cuda/detail/equal.h +62 -0
  1635. cuda/cccl/headers/include/thrust/system/cuda/detail/error.inl +87 -0
  1636. cuda/cccl/headers/include/thrust/system/cuda/detail/execution_policy.h +266 -0
  1637. cuda/cccl/headers/include/thrust/system/cuda/detail/extrema.h +473 -0
  1638. cuda/cccl/headers/include/thrust/system/cuda/detail/fill.h +99 -0
  1639. cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +165 -0
  1640. cuda/cccl/headers/include/thrust/system/cuda/detail/for_each.h +83 -0
  1641. cuda/cccl/headers/include/thrust/system/cuda/detail/gather.h +89 -0
  1642. cuda/cccl/headers/include/thrust/system/cuda/detail/generate.h +59 -0
  1643. cuda/cccl/headers/include/thrust/system/cuda/detail/get_value.h +65 -0
  1644. cuda/cccl/headers/include/thrust/system/cuda/detail/inner_product.h +77 -0
  1645. cuda/cccl/headers/include/thrust/system/cuda/detail/iter_swap.h +79 -0
  1646. cuda/cccl/headers/include/thrust/system/cuda/detail/logical.h +29 -0
  1647. cuda/cccl/headers/include/thrust/system/cuda/detail/make_unsigned_special.h +55 -0
  1648. cuda/cccl/headers/include/thrust/system/cuda/detail/malloc_and_free.h +119 -0
  1649. cuda/cccl/headers/include/thrust/system/cuda/detail/memory.inl +55 -0
  1650. cuda/cccl/headers/include/thrust/system/cuda/detail/merge.h +205 -0
  1651. cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +88 -0
  1652. cuda/cccl/headers/include/thrust/system/cuda/detail/parallel_for.h +79 -0
  1653. cuda/cccl/headers/include/thrust/system/cuda/detail/partition.h +401 -0
  1654. cuda/cccl/headers/include/thrust/system/cuda/detail/per_device_resource.h +70 -0
  1655. cuda/cccl/headers/include/thrust/system/cuda/detail/reduce.h +774 -0
  1656. cuda/cccl/headers/include/thrust/system/cuda/detail/reduce_by_key.h +994 -0
  1657. cuda/cccl/headers/include/thrust/system/cuda/detail/remove.h +105 -0
  1658. cuda/cccl/headers/include/thrust/system/cuda/detail/replace.h +121 -0
  1659. cuda/cccl/headers/include/thrust/system/cuda/detail/reverse.h +83 -0
  1660. cuda/cccl/headers/include/thrust/system/cuda/detail/scan.h +340 -0
  1661. cuda/cccl/headers/include/thrust/system/cuda/detail/scan_by_key.h +412 -0
  1662. cuda/cccl/headers/include/thrust/system/cuda/detail/scatter.h +90 -0
  1663. cuda/cccl/headers/include/thrust/system/cuda/detail/sequence.h +29 -0
  1664. cuda/cccl/headers/include/thrust/system/cuda/detail/set_operations.h +1722 -0
  1665. cuda/cccl/headers/include/thrust/system/cuda/detail/sort.h +473 -0
  1666. cuda/cccl/headers/include/thrust/system/cuda/detail/swap_ranges.h +99 -0
  1667. cuda/cccl/headers/include/thrust/system/cuda/detail/tabulate.h +62 -0
  1668. cuda/cccl/headers/include/thrust/system/cuda/detail/temporary_buffer.h +132 -0
  1669. cuda/cccl/headers/include/thrust/system/cuda/detail/terminate.h +53 -0
  1670. cuda/cccl/headers/include/thrust/system/cuda/detail/transform.h +428 -0
  1671. cuda/cccl/headers/include/thrust/system/cuda/detail/transform_reduce.h +139 -0
  1672. cuda/cccl/headers/include/thrust/system/cuda/detail/transform_scan.h +117 -0
  1673. cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_copy.h +114 -0
  1674. cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_fill.h +102 -0
  1675. cuda/cccl/headers/include/thrust/system/cuda/detail/unique.h +288 -0
  1676. cuda/cccl/headers/include/thrust/system/cuda/detail/unique_by_key.h +307 -0
  1677. cuda/cccl/headers/include/thrust/system/cuda/detail/util.h +253 -0
  1678. cuda/cccl/headers/include/thrust/system/cuda/error.h +159 -0
  1679. cuda/cccl/headers/include/thrust/system/cuda/execution_policy.h +15 -0
  1680. cuda/cccl/headers/include/thrust/system/cuda/memory.h +118 -0
  1681. cuda/cccl/headers/include/thrust/system/cuda/memory_resource.h +115 -0
  1682. cuda/cccl/headers/include/thrust/system/cuda/pointer.h +155 -0
  1683. cuda/cccl/headers/include/thrust/system/cuda/vector.h +104 -0
  1684. cuda/cccl/headers/include/thrust/system/detail/bad_alloc.h +59 -0
  1685. cuda/cccl/headers/include/thrust/system/detail/errno.h +118 -0
  1686. cuda/cccl/headers/include/thrust/system/detail/error_category.inl +298 -0
  1687. cuda/cccl/headers/include/thrust/system/detail/error_code.inl +171 -0
  1688. cuda/cccl/headers/include/thrust/system/detail/error_condition.inl +119 -0
  1689. cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.h +51 -0
  1690. cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.inl +77 -0
  1691. cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.h +159 -0
  1692. cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.inl +370 -0
  1693. cuda/cccl/headers/include/thrust/system/detail/generic/copy.h +43 -0
  1694. cuda/cccl/headers/include/thrust/system/detail/generic/copy.inl +62 -0
  1695. cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.h +56 -0
  1696. cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.inl +145 -0
  1697. cuda/cccl/headers/include/thrust/system/detail/generic/count.h +46 -0
  1698. cuda/cccl/headers/include/thrust/system/detail/generic/count.inl +82 -0
  1699. cuda/cccl/headers/include/thrust/system/detail/generic/equal.h +47 -0
  1700. cuda/cccl/headers/include/thrust/system/detail/generic/equal.inl +58 -0
  1701. cuda/cccl/headers/include/thrust/system/detail/generic/extrema.h +65 -0
  1702. cuda/cccl/headers/include/thrust/system/detail/generic/extrema.inl +246 -0
  1703. cuda/cccl/headers/include/thrust/system/detail/generic/fill.h +52 -0
  1704. cuda/cccl/headers/include/thrust/system/detail/generic/find.h +47 -0
  1705. cuda/cccl/headers/include/thrust/system/detail/generic/find.inl +135 -0
  1706. cuda/cccl/headers/include/thrust/system/detail/generic/for_each.h +56 -0
  1707. cuda/cccl/headers/include/thrust/system/detail/generic/gather.h +71 -0
  1708. cuda/cccl/headers/include/thrust/system/detail/generic/gather.inl +94 -0
  1709. cuda/cccl/headers/include/thrust/system/detail/generic/generate.h +43 -0
  1710. cuda/cccl/headers/include/thrust/system/detail/generic/generate.inl +64 -0
  1711. cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.h +58 -0
  1712. cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.inl +70 -0
  1713. cuda/cccl/headers/include/thrust/system/detail/generic/logical.h +57 -0
  1714. cuda/cccl/headers/include/thrust/system/detail/generic/memory.h +61 -0
  1715. cuda/cccl/headers/include/thrust/system/detail/generic/memory.inl +83 -0
  1716. cuda/cccl/headers/include/thrust/system/detail/generic/merge.h +97 -0
  1717. cuda/cccl/headers/include/thrust/system/detail/generic/merge.inl +146 -0
  1718. cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.h +47 -0
  1719. cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.inl +67 -0
  1720. cuda/cccl/headers/include/thrust/system/detail/generic/partition.h +127 -0
  1721. cuda/cccl/headers/include/thrust/system/detail/generic/partition.inl +208 -0
  1722. cuda/cccl/headers/include/thrust/system/detail/generic/per_device_resource.h +41 -0
  1723. cuda/cccl/headers/include/thrust/system/detail/generic/reduce.h +69 -0
  1724. cuda/cccl/headers/include/thrust/system/detail/generic/reduce.inl +98 -0
  1725. cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.h +81 -0
  1726. cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.inl +183 -0
  1727. cuda/cccl/headers/include/thrust/system/detail/generic/remove.h +84 -0
  1728. cuda/cccl/headers/include/thrust/system/detail/generic/remove.inl +119 -0
  1729. cuda/cccl/headers/include/thrust/system/detail/generic/replace.h +93 -0
  1730. cuda/cccl/headers/include/thrust/system/detail/generic/replace.inl +172 -0
  1731. cuda/cccl/headers/include/thrust/system/detail/generic/reverse.h +46 -0
  1732. cuda/cccl/headers/include/thrust/system/detail/generic/reverse.inl +65 -0
  1733. cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.h +61 -0
  1734. cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.inl +125 -0
  1735. cuda/cccl/headers/include/thrust/system/detail/generic/scan.h +86 -0
  1736. cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.h +124 -0
  1737. cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.inl +229 -0
  1738. cuda/cccl/headers/include/thrust/system/detail/generic/scatter.h +71 -0
  1739. cuda/cccl/headers/include/thrust/system/detail/generic/scatter.inl +83 -0
  1740. cuda/cccl/headers/include/thrust/system/detail/generic/select_system.h +105 -0
  1741. cuda/cccl/headers/include/thrust/system/detail/generic/sequence.h +70 -0
  1742. cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.h +281 -0
  1743. cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.inl +474 -0
  1744. cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.h +52 -0
  1745. cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.inl +123 -0
  1746. cuda/cccl/headers/include/thrust/system/detail/generic/sort.h +111 -0
  1747. cuda/cccl/headers/include/thrust/system/detail/generic/sort.inl +176 -0
  1748. cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.h +42 -0
  1749. cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.inl +73 -0
  1750. cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.h +39 -0
  1751. cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.inl +54 -0
  1752. cuda/cccl/headers/include/thrust/system/detail/generic/tag.h +45 -0
  1753. cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.h +53 -0
  1754. cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.inl +81 -0
  1755. cuda/cccl/headers/include/thrust/system/detail/generic/transform.h +393 -0
  1756. cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.h +48 -0
  1757. cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.inl +54 -0
  1758. cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.h +78 -0
  1759. cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.inl +111 -0
  1760. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.h +43 -0
  1761. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.inl +164 -0
  1762. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.h +43 -0
  1763. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.inl +112 -0
  1764. cuda/cccl/headers/include/thrust/system/detail/generic/unique.h +69 -0
  1765. cuda/cccl/headers/include/thrust/system/detail/generic/unique.inl +112 -0
  1766. cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.h +80 -0
  1767. cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.inl +124 -0
  1768. cuda/cccl/headers/include/thrust/system/detail/internal/decompose.h +115 -0
  1769. cuda/cccl/headers/include/thrust/system/detail/sequential/adjacent_difference.h +68 -0
  1770. cuda/cccl/headers/include/thrust/system/detail/sequential/assign_value.h +40 -0
  1771. cuda/cccl/headers/include/thrust/system/detail/sequential/binary_search.h +136 -0
  1772. cuda/cccl/headers/include/thrust/system/detail/sequential/copy.h +120 -0
  1773. cuda/cccl/headers/include/thrust/system/detail/sequential/copy_backward.h +47 -0
  1774. cuda/cccl/headers/include/thrust/system/detail/sequential/copy_if.h +69 -0
  1775. cuda/cccl/headers/include/thrust/system/detail/sequential/count.h +29 -0
  1776. cuda/cccl/headers/include/thrust/system/detail/sequential/equal.h +29 -0
  1777. cuda/cccl/headers/include/thrust/system/detail/sequential/execution_policy.h +52 -0
  1778. cuda/cccl/headers/include/thrust/system/detail/sequential/extrema.h +109 -0
  1779. cuda/cccl/headers/include/thrust/system/detail/sequential/fill.h +29 -0
  1780. cuda/cccl/headers/include/thrust/system/detail/sequential/find.h +60 -0
  1781. cuda/cccl/headers/include/thrust/system/detail/sequential/for_each.h +72 -0
  1782. cuda/cccl/headers/include/thrust/system/detail/sequential/gather.h +29 -0
  1783. cuda/cccl/headers/include/thrust/system/detail/sequential/general_copy.h +120 -0
  1784. cuda/cccl/headers/include/thrust/system/detail/sequential/generate.h +29 -0
  1785. cuda/cccl/headers/include/thrust/system/detail/sequential/get_value.h +41 -0
  1786. cuda/cccl/headers/include/thrust/system/detail/sequential/inner_product.h +29 -0
  1787. cuda/cccl/headers/include/thrust/system/detail/sequential/insertion_sort.h +139 -0
  1788. cuda/cccl/headers/include/thrust/system/detail/sequential/iter_swap.h +43 -0
  1789. cuda/cccl/headers/include/thrust/system/detail/sequential/logical.h +29 -0
  1790. cuda/cccl/headers/include/thrust/system/detail/sequential/malloc_and_free.h +48 -0
  1791. cuda/cccl/headers/include/thrust/system/detail/sequential/merge.h +147 -0
  1792. cuda/cccl/headers/include/thrust/system/detail/sequential/mismatch.h +29 -0
  1793. cuda/cccl/headers/include/thrust/system/detail/sequential/partition.h +298 -0
  1794. cuda/cccl/headers/include/thrust/system/detail/sequential/per_device_resource.h +29 -0
  1795. cuda/cccl/headers/include/thrust/system/detail/sequential/reduce.h +62 -0
  1796. cuda/cccl/headers/include/thrust/system/detail/sequential/reduce_by_key.h +97 -0
  1797. cuda/cccl/headers/include/thrust/system/detail/sequential/remove.h +177 -0
  1798. cuda/cccl/headers/include/thrust/system/detail/sequential/replace.h +29 -0
  1799. cuda/cccl/headers/include/thrust/system/detail/sequential/reverse.h +29 -0
  1800. cuda/cccl/headers/include/thrust/system/detail/sequential/scan.h +152 -0
  1801. cuda/cccl/headers/include/thrust/system/detail/sequential/scan_by_key.h +143 -0
  1802. cuda/cccl/headers/include/thrust/system/detail/sequential/scatter.h +29 -0
  1803. cuda/cccl/headers/include/thrust/system/detail/sequential/sequence.h +29 -0
  1804. cuda/cccl/headers/include/thrust/system/detail/sequential/set_operations.h +204 -0
  1805. cuda/cccl/headers/include/thrust/system/detail/sequential/sort.h +120 -0
  1806. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.h +353 -0
  1807. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.h +121 -0
  1808. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.h +584 -0
  1809. cuda/cccl/headers/include/thrust/system/detail/sequential/swap_ranges.h +29 -0
  1810. cuda/cccl/headers/include/thrust/system/detail/sequential/tabulate.h +29 -0
  1811. cuda/cccl/headers/include/thrust/system/detail/sequential/temporary_buffer.h +29 -0
  1812. cuda/cccl/headers/include/thrust/system/detail/sequential/transform.h +29 -0
  1813. cuda/cccl/headers/include/thrust/system/detail/sequential/transform_reduce.h +29 -0
  1814. cuda/cccl/headers/include/thrust/system/detail/sequential/transform_scan.h +29 -0
  1815. cuda/cccl/headers/include/thrust/system/detail/sequential/trivial_copy.h +56 -0
  1816. cuda/cccl/headers/include/thrust/system/detail/sequential/uninitialized_copy.h +29 -0
  1817. cuda/cccl/headers/include/thrust/system/detail/sequential/uninitialized_fill.h +29 -0
  1818. cuda/cccl/headers/include/thrust/system/detail/sequential/unique.h +112 -0
  1819. cuda/cccl/headers/include/thrust/system/detail/sequential/unique_by_key.h +105 -0
  1820. cuda/cccl/headers/include/thrust/system/detail/system_error.inl +106 -0
  1821. cuda/cccl/headers/include/thrust/system/error_code.h +508 -0
  1822. cuda/cccl/headers/include/thrust/system/omp/detail/adjacent_difference.h +34 -0
  1823. cuda/cccl/headers/include/thrust/system/omp/detail/assign_value.h +17 -0
  1824. cuda/cccl/headers/include/thrust/system/omp/detail/binary_search.h +57 -0
  1825. cuda/cccl/headers/include/thrust/system/omp/detail/copy.h +61 -0
  1826. cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.h +39 -0
  1827. cuda/cccl/headers/include/thrust/system/omp/detail/count.h +17 -0
  1828. cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.h +48 -0
  1829. cuda/cccl/headers/include/thrust/system/omp/detail/equal.h +17 -0
  1830. cuda/cccl/headers/include/thrust/system/omp/detail/execution_policy.h +132 -0
  1831. cuda/cccl/headers/include/thrust/system/omp/detail/extrema.h +46 -0
  1832. cuda/cccl/headers/include/thrust/system/omp/detail/fill.h +17 -0
  1833. cuda/cccl/headers/include/thrust/system/omp/detail/find.h +33 -0
  1834. cuda/cccl/headers/include/thrust/system/omp/detail/for_each.h +74 -0
  1835. cuda/cccl/headers/include/thrust/system/omp/detail/gather.h +17 -0
  1836. cuda/cccl/headers/include/thrust/system/omp/detail/generate.h +17 -0
  1837. cuda/cccl/headers/include/thrust/system/omp/detail/get_value.h +17 -0
  1838. cuda/cccl/headers/include/thrust/system/omp/detail/inner_product.h +17 -0
  1839. cuda/cccl/headers/include/thrust/system/omp/detail/iter_swap.h +17 -0
  1840. cuda/cccl/headers/include/thrust/system/omp/detail/logical.h +17 -0
  1841. cuda/cccl/headers/include/thrust/system/omp/detail/malloc_and_free.h +17 -0
  1842. cuda/cccl/headers/include/thrust/system/omp/detail/merge.h +17 -0
  1843. cuda/cccl/headers/include/thrust/system/omp/detail/mismatch.h +17 -0
  1844. cuda/cccl/headers/include/thrust/system/omp/detail/partition.h +84 -0
  1845. cuda/cccl/headers/include/thrust/system/omp/detail/per_device_resource.h +16 -0
  1846. cuda/cccl/headers/include/thrust/system/omp/detail/pragma_omp.h +30 -0
  1847. cuda/cccl/headers/include/thrust/system/omp/detail/reduce.h +63 -0
  1848. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.h +48 -0
  1849. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.h +87 -0
  1850. cuda/cccl/headers/include/thrust/system/omp/detail/remove.h +67 -0
  1851. cuda/cccl/headers/include/thrust/system/omp/detail/replace.h +17 -0
  1852. cuda/cccl/headers/include/thrust/system/omp/detail/reverse.h +17 -0
  1853. cuda/cccl/headers/include/thrust/system/omp/detail/scan.h +216 -0
  1854. cuda/cccl/headers/include/thrust/system/omp/detail/scan_by_key.h +23 -0
  1855. cuda/cccl/headers/include/thrust/system/omp/detail/scatter.h +17 -0
  1856. cuda/cccl/headers/include/thrust/system/omp/detail/sequence.h +17 -0
  1857. cuda/cccl/headers/include/thrust/system/omp/detail/set_operations.h +17 -0
  1858. cuda/cccl/headers/include/thrust/system/omp/detail/sort.h +245 -0
  1859. cuda/cccl/headers/include/thrust/system/omp/detail/swap_ranges.h +17 -0
  1860. cuda/cccl/headers/include/thrust/system/omp/detail/tabulate.h +17 -0
  1861. cuda/cccl/headers/include/thrust/system/omp/detail/temporary_buffer.h +16 -0
  1862. cuda/cccl/headers/include/thrust/system/omp/detail/transform.h +17 -0
  1863. cuda/cccl/headers/include/thrust/system/omp/detail/transform_reduce.h +17 -0
  1864. cuda/cccl/headers/include/thrust/system/omp/detail/transform_scan.h +17 -0
  1865. cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_copy.h +17 -0
  1866. cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_fill.h +17 -0
  1867. cuda/cccl/headers/include/thrust/system/omp/detail/unique.h +52 -0
  1868. cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.h +56 -0
  1869. cuda/cccl/headers/include/thrust/system/omp/execution_policy.h +62 -0
  1870. cuda/cccl/headers/include/thrust/system/omp/memory.h +153 -0
  1871. cuda/cccl/headers/include/thrust/system/omp/memory_resource.h +71 -0
  1872. cuda/cccl/headers/include/thrust/system/omp/pointer.h +120 -0
  1873. cuda/cccl/headers/include/thrust/system/omp/vector.h +96 -0
  1874. cuda/cccl/headers/include/thrust/system/system_error.h +183 -0
  1875. cuda/cccl/headers/include/thrust/system/tbb/detail/adjacent_difference.h +33 -0
  1876. cuda/cccl/headers/include/thrust/system/tbb/detail/assign_value.h +17 -0
  1877. cuda/cccl/headers/include/thrust/system/tbb/detail/binary_search.h +17 -0
  1878. cuda/cccl/headers/include/thrust/system/tbb/detail/copy.h +59 -0
  1879. cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.h +117 -0
  1880. cuda/cccl/headers/include/thrust/system/tbb/detail/count.h +17 -0
  1881. cuda/cccl/headers/include/thrust/system/tbb/detail/equal.h +17 -0
  1882. cuda/cccl/headers/include/thrust/system/tbb/detail/execution_policy.h +113 -0
  1883. cuda/cccl/headers/include/thrust/system/tbb/detail/extrema.h +45 -0
  1884. cuda/cccl/headers/include/thrust/system/tbb/detail/fill.h +17 -0
  1885. cuda/cccl/headers/include/thrust/system/tbb/detail/find.h +28 -0
  1886. cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.h +72 -0
  1887. cuda/cccl/headers/include/thrust/system/tbb/detail/gather.h +17 -0
  1888. cuda/cccl/headers/include/thrust/system/tbb/detail/generate.h +17 -0
  1889. cuda/cccl/headers/include/thrust/system/tbb/detail/get_value.h +17 -0
  1890. cuda/cccl/headers/include/thrust/system/tbb/detail/inner_product.h +17 -0
  1891. cuda/cccl/headers/include/thrust/system/tbb/detail/iter_swap.h +17 -0
  1892. cuda/cccl/headers/include/thrust/system/tbb/detail/logical.h +17 -0
  1893. cuda/cccl/headers/include/thrust/system/tbb/detail/malloc_and_free.h +17 -0
  1894. cuda/cccl/headers/include/thrust/system/tbb/detail/merge.h +302 -0
  1895. cuda/cccl/headers/include/thrust/system/tbb/detail/mismatch.h +17 -0
  1896. cuda/cccl/headers/include/thrust/system/tbb/detail/partition.h +79 -0
  1897. cuda/cccl/headers/include/thrust/system/tbb/detail/per_device_resource.h +16 -0
  1898. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.h +121 -0
  1899. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.h +378 -0
  1900. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_intervals.h +118 -0
  1901. cuda/cccl/headers/include/thrust/system/tbb/detail/remove.h +66 -0
  1902. cuda/cccl/headers/include/thrust/system/tbb/detail/replace.h +17 -0
  1903. cuda/cccl/headers/include/thrust/system/tbb/detail/reverse.h +17 -0
  1904. cuda/cccl/headers/include/thrust/system/tbb/detail/scan.h +294 -0
  1905. cuda/cccl/headers/include/thrust/system/tbb/detail/scan_by_key.h +20 -0
  1906. cuda/cccl/headers/include/thrust/system/tbb/detail/scatter.h +17 -0
  1907. cuda/cccl/headers/include/thrust/system/tbb/detail/sequence.h +17 -0
  1908. cuda/cccl/headers/include/thrust/system/tbb/detail/set_operations.h +17 -0
  1909. cuda/cccl/headers/include/thrust/system/tbb/detail/sort.h +273 -0
  1910. cuda/cccl/headers/include/thrust/system/tbb/detail/swap_ranges.h +17 -0
  1911. cuda/cccl/headers/include/thrust/system/tbb/detail/tabulate.h +17 -0
  1912. cuda/cccl/headers/include/thrust/system/tbb/detail/temporary_buffer.h +16 -0
  1913. cuda/cccl/headers/include/thrust/system/tbb/detail/transform.h +17 -0
  1914. cuda/cccl/headers/include/thrust/system/tbb/detail/transform_reduce.h +17 -0
  1915. cuda/cccl/headers/include/thrust/system/tbb/detail/transform_scan.h +17 -0
  1916. cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_copy.h +17 -0
  1917. cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_fill.h +17 -0
  1918. cuda/cccl/headers/include/thrust/system/tbb/detail/unique.h +49 -0
  1919. cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.h +55 -0
  1920. cuda/cccl/headers/include/thrust/system/tbb/execution_policy.h +62 -0
  1921. cuda/cccl/headers/include/thrust/system/tbb/memory.h +139 -0
  1922. cuda/cccl/headers/include/thrust/system/tbb/memory_resource.h +57 -0
  1923. cuda/cccl/headers/include/thrust/system/tbb/pointer.h +106 -0
  1924. cuda/cccl/headers/include/thrust/system/tbb/vector.h +82 -0
  1925. cuda/cccl/headers/include/thrust/system_error.h +57 -0
  1926. cuda/cccl/headers/include/thrust/tabulate.h +125 -0
  1927. cuda/cccl/headers/include/thrust/transform.h +1056 -0
  1928. cuda/cccl/headers/include/thrust/transform_reduce.h +190 -0
  1929. cuda/cccl/headers/include/thrust/transform_scan.h +442 -0
  1930. cuda/cccl/headers/include/thrust/tuple.h +139 -0
  1931. cuda/cccl/headers/include/thrust/type_traits/integer_sequence.h +256 -0
  1932. cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +156 -0
  1933. cuda/cccl/headers/include/thrust/type_traits/is_execution_policy.h +65 -0
  1934. cuda/cccl/headers/include/thrust/type_traits/is_operator_less_or_greater_function_object.h +180 -0
  1935. cuda/cccl/headers/include/thrust/type_traits/is_operator_plus_function_object.h +112 -0
  1936. cuda/cccl/headers/include/thrust/type_traits/is_trivially_relocatable.h +333 -0
  1937. cuda/cccl/headers/include/thrust/type_traits/logical_metafunctions.h +42 -0
  1938. cuda/cccl/headers/include/thrust/type_traits/unwrap_contiguous_iterator.h +63 -0
  1939. cuda/cccl/headers/include/thrust/uninitialized_copy.h +300 -0
  1940. cuda/cccl/headers/include/thrust/uninitialized_fill.h +268 -0
  1941. cuda/cccl/headers/include/thrust/unique.h +1089 -0
  1942. cuda/cccl/headers/include/thrust/universal_allocator.h +101 -0
  1943. cuda/cccl/headers/include/thrust/universal_ptr.h +34 -0
  1944. cuda/cccl/headers/include/thrust/universal_vector.h +80 -0
  1945. cuda/cccl/headers/include/thrust/version.h +93 -0
  1946. cuda/cccl/headers/include/thrust/zip_function.h +149 -0
  1947. cuda/cccl/headers/include_paths.py +51 -0
  1948. cuda/cccl/headers/lib/cmake/cccl/cccl-config-version.cmake +25 -0
  1949. cuda/cccl/headers/lib/cmake/cccl/cccl-config.cmake +143 -0
  1950. cuda/cccl/headers/lib/cmake/cub/cub-config-version.cmake +29 -0
  1951. cuda/cccl/headers/lib/cmake/cub/cub-config.cmake +172 -0
  1952. cuda/cccl/headers/lib/cmake/cub/cub-header-search.cmake +15 -0
  1953. cuda/cccl/headers/lib/cmake/libcudacxx/libcudacxx-config-version.cmake +37 -0
  1954. cuda/cccl/headers/lib/cmake/libcudacxx/libcudacxx-config.cmake +297 -0
  1955. cuda/cccl/headers/lib/cmake/libcudacxx/libcudacxx-header-search.cmake +15 -0
  1956. cuda/cccl/headers/lib/cmake/thrust/FindTBB.cmake +498 -0
  1957. cuda/cccl/headers/lib/cmake/thrust/README.md +258 -0
  1958. cuda/cccl/headers/lib/cmake/thrust/thrust-config-version.cmake +37 -0
  1959. cuda/cccl/headers/lib/cmake/thrust/thrust-config.cmake +983 -0
  1960. cuda/cccl/headers/lib/cmake/thrust/thrust-header-search.cmake +15 -0
  1961. cuda/cccl/parallel/__init__.py +9 -0
  1962. cuda/cccl/parallel/experimental/__init__.py +24 -0
  1963. cuda/cccl/py.typed +0 -0
  1964. cuda/compute/__init__.py +91 -0
  1965. cuda/compute/_bindings.py +79 -0
  1966. cuda/compute/_bindings.pyi +516 -0
  1967. cuda/compute/_bindings_impl.pyx +2470 -0
  1968. cuda/compute/_caching.py +83 -0
  1969. cuda/compute/_cccl_interop.py +354 -0
  1970. cuda/compute/_odr_helpers.py +238 -0
  1971. cuda/compute/_utils/__init__.py +0 -0
  1972. cuda/compute/_utils/protocols.py +145 -0
  1973. cuda/compute/_utils/temp_storage_buffer.py +87 -0
  1974. cuda/compute/algorithms/__init__.py +62 -0
  1975. cuda/compute/algorithms/_histogram.py +243 -0
  1976. cuda/compute/algorithms/_reduce.py +205 -0
  1977. cuda/compute/algorithms/_scan.py +344 -0
  1978. cuda/compute/algorithms/_segmented_reduce.py +265 -0
  1979. cuda/compute/algorithms/_select.py +196 -0
  1980. cuda/compute/algorithms/_sort/__init__.py +23 -0
  1981. cuda/compute/algorithms/_sort/_merge_sort.py +235 -0
  1982. cuda/compute/algorithms/_sort/_radix_sort.py +263 -0
  1983. cuda/compute/algorithms/_sort/_segmented_sort.py +288 -0
  1984. cuda/compute/algorithms/_sort/_sort_common.py +52 -0
  1985. cuda/compute/algorithms/_three_way_partition.py +292 -0
  1986. cuda/compute/algorithms/_transform.py +317 -0
  1987. cuda/compute/algorithms/_unique_by_key.py +259 -0
  1988. cuda/compute/cccl/.gitkeep +0 -0
  1989. cuda/compute/cu12/_bindings_impl.cpython-312-x86_64-linux-gnu.so +0 -0
  1990. cuda/compute/cu12/cccl/libcccl.c.parallel.so +0 -0
  1991. cuda/compute/cu13/_bindings_impl.cpython-312-x86_64-linux-gnu.so +0 -0
  1992. cuda/compute/cu13/cccl/libcccl.c.parallel.so +0 -0
  1993. cuda/compute/determinism.py +3 -0
  1994. cuda/compute/iterators/__init__.py +23 -0
  1995. cuda/compute/iterators/_factories.py +251 -0
  1996. cuda/compute/iterators/_iterators.py +680 -0
  1997. cuda/compute/iterators/_permutation_iterator.py +266 -0
  1998. cuda/compute/iterators/_zip_iterator.py +268 -0
  1999. cuda/compute/numba_utils.py +54 -0
  2000. cuda/compute/op.py +140 -0
  2001. cuda/compute/struct.py +520 -0
  2002. cuda/compute/typing.py +36 -0
  2003. cuda/coop/__init__.py +8 -0
  2004. cuda/coop/_caching.py +48 -0
  2005. cuda/coop/_common.py +275 -0
  2006. cuda/coop/_nvrtc.py +92 -0
  2007. cuda/coop/_scan_op.py +181 -0
  2008. cuda/coop/_types.py +937 -0
  2009. cuda/coop/_typing.py +107 -0
  2010. cuda/coop/block/__init__.py +39 -0
  2011. cuda/coop/block/_block_exchange.py +251 -0
  2012. cuda/coop/block/_block_load_store.py +215 -0
  2013. cuda/coop/block/_block_merge_sort.py +125 -0
  2014. cuda/coop/block/_block_radix_sort.py +214 -0
  2015. cuda/coop/block/_block_reduce.py +294 -0
  2016. cuda/coop/block/_block_scan.py +983 -0
  2017. cuda/coop/warp/__init__.py +9 -0
  2018. cuda/coop/warp/_warp_merge_sort.py +92 -0
  2019. cuda/coop/warp/_warp_reduce.py +153 -0
  2020. cuda/coop/warp/_warp_scan.py +78 -0
  2021. cuda_cccl-0.4.3.dist-info/METADATA +84 -0
  2022. cuda_cccl-0.4.3.dist-info/RECORD +2024 -0
  2023. cuda_cccl-0.4.3.dist-info/WHEEL +5 -0
  2024. cuda_cccl-0.4.3.dist-info/licenses/LICENSE +1 -0
@@ -0,0 +1,2152 @@
1
+ // SPDX-FileCopyrightText: Copyright (c) 2011, Duane Merrill. All rights reserved.
2
+ // SPDX-FileCopyrightText: Copyright (c) 2011-2025, NVIDIA CORPORATION. All rights reserved.
3
+ // SPDX-License-Identifier: BSD-3
4
+
5
+ //! @file
6
+ //! cub::DeviceScan provides device-wide, parallel operations for computing a prefix scan across a sequence of data
7
+ //! items residing within device-accessible memory.
8
+
9
+ #pragma once
10
+
11
+ #include <cub/config.cuh>
12
+
13
+ #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
14
+ # pragma GCC system_header
15
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
16
+ # pragma clang system_header
17
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
18
+ # pragma system_header
19
+ #endif // no system header
20
+
21
+ #include <cub/detail/choose_offset.cuh>
22
+ #include <cub/detail/device_memory_resource.cuh>
23
+ #include <cub/detail/env_dispatch.cuh>
24
+ #include <cub/detail/temporary_storage.cuh>
25
+ #include <cub/device/dispatch/dispatch_scan.cuh>
26
+ #include <cub/device/dispatch/dispatch_scan_by_key.cuh>
27
+ #include <cub/thread/thread_operators.cuh>
28
+
29
+ #include <cuda/__execution/determinism.h>
30
+ #include <cuda/__execution/require.h>
31
+ #include <cuda/__execution/tune.h>
32
+ #include <cuda/__memory_resource/get_memory_resource.h>
33
+ #include <cuda/__stream/get_stream.h>
34
+ #include <cuda/std/__execution/env.h>
35
+ #include <cuda/std/__functional/invoke.h>
36
+
37
+ CUB_NAMESPACE_BEGIN
38
+
39
+ namespace detail::scan
40
+ {
41
+ struct get_tuning_query_t // Empty tag type: the key with which an environment is queried for a scan tuning.
42
+ {};
43
+
44
+ template <class Derived> // Derived is the concrete tuning type that inherits from tuning<Derived>
45
+ struct tuning
46
+ {
47
+ [[nodiscard]] _CCCL_NODEBUG_API constexpr Derived query(const get_tuning_query_t&) const noexcept // answers the tuning query with this object as a Derived
48
+ {
49
+ return static_cast<const Derived&>(*this); // downcast to the derived tuning; the result is returned by value
50
+ }
51
+ };
52
+
53
+ struct default_tuning : tuning<default_tuning> // Tuning passed as the fallback type when an environment is queried for a scan tuning.
54
+ {
55
+ template <typename InputValueT, typename OutputValueT, typename AccumT, typename OffsetT, typename ScanOpT>
56
+ using fn = policy_hub<InputValueT, OutputValueT, AccumT, OffsetT, ScanOpT>; // forwards all five types unchanged to policy_hub (declared outside this chunk)
57
+ };
58
+ } // namespace detail::scan
59
+
60
+ //! @rst
61
+ //! DeviceScan provides device-wide, parallel operations for computing a
62
+ //! prefix scan across a sequence of data items residing within
63
+ //! device-accessible memory.
64
+ //!
65
+ //! Overview
66
+ //! +++++++++++++++++++++++++++++++++++++++++++++
67
+ //!
68
+ //! Given a sequence of input elements and a binary reduction operator, a
69
+ //! `prefix scan <http://en.wikipedia.org/wiki/Prefix_sum>`_ produces an output
70
+ //! sequence where each element is computed to be the reduction of the elements
71
+ //! occurring earlier in the input sequence. *Prefix sum* connotes a prefix scan
72
+ //! with the addition operator. The term *inclusive* indicates that the
73
+ //! *i*\ :sup:`th` output reduction incorporates the *i*\ :sup:`th` input.
74
+ //! The term *exclusive* indicates the *i*\ :sup:`th` input is not
75
+ //! incorporated into the *i*\ :sup:`th` output reduction. When the input and
76
+ //! output sequences are the same, the scan is performed in-place.
77
+ //!
78
+ //! In order to provide an efficient parallel implementation, the binary reduction operator must be associative. That
79
+ //! is, ``op(op(a, b), c)`` must be equivalent to ``op(a, op(b, c))`` for any input values ``a``, ``b``, and ``c``.
80
+ //!
81
+ //! As of CUB 1.0.1 (2013), CUB's device-wide scan APIs have implemented our
82
+ //! *"decoupled look-back"* algorithm for performing global prefix scan with
83
+ //! only a single pass through the input data, as described in our 2016 technical
84
+ //! report [1]_. The central idea is to leverage a small, constant factor of
85
+ //! redundant work in order to overlap the latencies of global prefix
86
+ //! propagation with local computation. As such, our algorithm requires only
87
+ //! ``~2n`` data movement (``n`` inputs are read, ``n`` outputs are written), and
88
+ //! typically proceeds at "memcpy" speeds. Our algorithm supports in-place operations.
89
+ //!
90
+ //! .. [1] Duane Merrill and Michael Garland. `Single-pass Parallel Prefix Scan with Decoupled Look-back
91
+ //! <https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back>`_,
92
+ //! *NVIDIA Technical Report NVR-2016-002*, 2016.
93
+ //!
94
+ //! Usage Considerations
95
+ //! +++++++++++++++++++++++++++++++++++++++++++++
96
+ //!
97
+ //! @cdp_class{DeviceScan}
98
+ //!
99
+ //! Performance
100
+ //! +++++++++++++++++++++++++++++++++++++++++++++
101
+ //!
102
+ //! @linear_performance{prefix scan}
103
+ //!
104
+ //! @endrst
105
+ struct DeviceScan
106
+ {
107
+ //! @cond
108
+ template <typename TuningEnvT, // type that is queried for a scan tuning below
109
+ typename InputIteratorT,
110
+ typename OutputIteratorT,
111
+ typename ScanOpT,
112
+ typename InitValueT,
113
+ typename NumItemsT,
114
+ ::cuda::execution::determinism::__determinism_t Determinism,
115
+ ForceInclusive EnforceInclusive = ForceInclusive::No>
116
+ CUB_RUNTIME_FUNCTION static cudaError_t scan_impl_determinism( // Selects tuning, offset and accumulator types, then forwards to DispatchScan::Dispatch.
117
+ void* d_temp_storage,
118
+ size_t& temp_storage_bytes,
119
+ InputIteratorT d_in,
120
+ OutputIteratorT d_out,
121
+ ScanOpT scan_op,
122
+ InitValueT init,
123
+ NumItemsT num_items,
124
+ ::cuda::execution::determinism::__determinism_holder_t<Determinism>, // unnamed parameter: never read in the body
125
+ cudaStream_t stream)
126
+ {
127
+ using scan_tuning_t = ::cuda::std::execution::
128
+ __query_result_or_t<TuningEnvT, detail::scan::get_tuning_query_t, detail::scan::default_tuning>; // default_tuning is the fallback type argument
129
+
130
+ // Unsigned integer type for global offsets
131
+ using offset_t = detail::choose_offset_t<NumItemsT>;
132
+ // Accumulator type: the initial-value type handed to __accumulator_t is the input value type when InitValueT is NullType, otherwise InitValueT::value_type
133
+ using accum_t =
134
+ ::cuda::std::__accumulator_t<ScanOpT,
135
+ cub::detail::it_value_t<InputIteratorT>,
136
+ ::cuda::std::_If<::cuda::std::is_same_v<InitValueT, NullType>,
137
+ cub::detail::it_value_t<InputIteratorT>,
138
+ typename InitValueT::value_type>>;
139
+ // Policy type produced by the tuning's fn<...> template for these value, accumulator, offset and operator types
140
+ using policy_t = typename scan_tuning_t::
141
+ template fn<detail::it_value_t<InputIteratorT>, detail::it_value_t<OutputIteratorT>, accum_t, offset_t, ScanOpT>;
142
+ // DispatchScan instantiation that receives every type chosen above
143
+ using dispatch_t =
144
+ DispatchScan<InputIteratorT, OutputIteratorT, ScanOpT, InitValueT, offset_t, accum_t, EnforceInclusive, policy_t>;
145
+ // num_items is converted to offset_t before dispatch; the cudaError_t from Dispatch is returned as-is
146
+ return dispatch_t::Dispatch(
147
+ d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, init, static_cast<offset_t>(num_items), stream);
148
+ }
149
+
150
+ template <typename InputIteratorT,
151
+ typename OutputIteratorT,
152
+ typename ScanOpT,
153
+ typename InitValueT,
154
+ typename NumItemsT,
155
+ typename EnvT>
156
+ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t scan_impl_env( // Checks env's determinism requirement at compile time, then runs the scan through detail::dispatch_with_env.
157
+ InputIteratorT d_in, OutputIteratorT d_out, ScanOpT scan_op, InitValueT init, NumItemsT num_items, EnvT env)
158
+ {
159
+ static_assert(!::cuda::std::execution::__queryable_with<EnvT, ::cuda::execution::determinism::__get_determinism_t>,
160
+ "Determinism should be used inside requires to have an effect."); // a determinism query answered directly by EnvT is a compile error
161
+ // Requirements type obtained from EnvT; an empty env<> is the fallback type argument
162
+ using requirements_t = ::cuda::std::execution::
163
+ __query_result_or_t<EnvT, ::cuda::execution::__get_requirements_t, ::cuda::std::execution::env<>>;
164
+ // Determinism type obtained from the requirements; run_to_run_t is the fallback type argument
165
+ using requested_determinism_t =
166
+ ::cuda::std::execution::__query_result_or_t<requirements_t,
167
+ ::cuda::execution::determinism::__get_determinism_t,
168
+ ::cuda::execution::determinism::run_to_run_t>;
169
+
170
+ // Static assert to reject gpu_to_gpu determinism since it's not implemented
171
+ static_assert(!::cuda::std::is_same_v<requested_determinism_t, ::cuda::execution::determinism::gpu_to_gpu_t>,
172
+ "gpu_to_gpu determinism is not supported");
173
+ // not_guaranteed is rejected at compile time as well
174
+ static_assert(!::cuda::std::is_same_v<requested_determinism_t, ::cuda::execution::determinism::not_guaranteed_t>,
175
+ "not_guaranteed determinism is not supported");
176
+ // gpu_to_gpu and not_guaranteed are rejected above; run_to_run is what gets dispatched, unconditionally
177
+ using determinism_t = ::cuda::execution::determinism::run_to_run_t;
178
+
179
+ // Dispatch with environment - handles all boilerplate
180
+ return detail::dispatch_with_env(env, [&]([[maybe_unused]] auto tuning, void* storage, size_t& bytes, auto stream) {
181
+ using tuning_t = decltype(tuning); // only the type of tuning is used; the value itself is never read
182
+ return scan_impl_determinism<tuning_t>(
183
+ storage, bytes, d_in, d_out, scan_op, init, num_items, determinism_t{}, stream);
184
+ });
185
+ }
186
+ //! @endcond
187
+
188
+ //! @name Exclusive scans
189
+ //! @{
190
+
191
+ //! @rst
192
+ //! Computes a device-wide exclusive prefix sum.
193
+ //! The value of ``0`` is applied as the initial value, and is assigned to ``*d_out``.
194
+ //!
195
+ //! - Supports non-commutative sum operators.
196
+ //! - Results are not deterministic for pseudo-associative operators (e.g.,
197
+ //! addition of floating-point types). Results for pseudo-associative
198
+ //! operators may vary from run to run. Additional details can be found in
199
+ //! the @lookback description.
200
+ //! - When ``d_in`` and ``d_out`` are equal, the scan is performed in-place.
201
+ //! The range ``[d_in, d_in + num_items)`` and ``[d_out, d_out + num_items)``
202
+ //! shall not overlap in any other way.
203
+ //! - @devicestorage
204
+ //!
205
+ //! Snippet
206
+ //! +++++++++++++++++++++++++++++++++++++++++++++
207
+ //!
208
+ //! The code snippet below illustrates the exclusive prefix sum of an ``int``
209
+ //! device vector.
210
+ //!
211
+ //! .. code-block:: c++
212
+ //!
213
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
214
+ //!
215
+ //! // Declare, allocate, and initialize device-accessible pointers for
216
+ //! // input and output
217
+ //! int num_items; // e.g., 7
218
+ //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
219
+ //! int *d_out; // e.g., [ , , , , , , ]
220
+ //! ...
221
+ //!
222
+ //! // Determine temporary device storage requirements
223
+ //! void *d_temp_storage = nullptr;
224
+ //! size_t temp_storage_bytes = 0;
225
+ //! cub::DeviceScan::ExclusiveSum(
226
+ //! d_temp_storage, temp_storage_bytes,
227
+ //! d_in, d_out, num_items);
228
+ //!
229
+ //! // Allocate temporary storage
230
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
231
+ //!
232
+ //! // Run exclusive prefix sum
233
+ //! cub::DeviceScan::ExclusiveSum(
234
+ //! d_temp_storage, temp_storage_bytes,
235
+ //! d_in, d_out, num_items);
236
+ //!
237
+ //! // d_out <-- [0, 8, 14, 21, 26, 29, 29]
238
+ //!
239
+ //! @endrst
240
+ //!
241
+ //! @tparam InputIteratorT
242
+ //! **[inferred]** Random-access input iterator type for reading scan inputs @iterator
243
+ //!
244
+ //! @tparam OutputIteratorT
245
+ //! **[inferred]** Random-access output iterator type for writing scan outputs @iterator
246
+ //!
247
+ //! @tparam NumItemsT
248
+ //! **[inferred]** An integral type representing the number of input elements
249
+ //!
250
+ //! @param[in] d_temp_storage
251
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
252
+ //! required allocation size is written to `temp_storage_bytes` and no work is done.
253
+ //!
254
+ //! @param[in,out] temp_storage_bytes
255
+ //! Reference to size in bytes of `d_temp_storage` allocation
256
+ //!
257
+ //! @param[in] d_in
258
+ //! Random-access iterator to the input sequence of data items
259
+ //!
260
+ //! @param[out] d_out
261
+ //! Random-access iterator to the output sequence of data items
262
+ //!
263
+ //! @param[in] num_items
264
+ //! Total number of input items (i.e., the length of `d_in`)
265
+ //!
266
+ //! @param[in] stream
267
+ //! @rst
268
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
269
+ //! @endrst
270
+ template <typename InputIteratorT, typename OutputIteratorT, typename NumItemsT>
271
+ CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveSum(
272
+ void* d_temp_storage,
273
+ size_t& temp_storage_bytes,
274
+ InputIteratorT d_in,
275
+ OutputIteratorT d_out,
276
+ NumItemsT num_items,
277
+ cudaStream_t stream = 0)
278
+ {
279
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::ExclusiveSum");
280
+
281
+ // Unsigned integer type for global offsets
282
+ using OffsetT = detail::choose_offset_t<NumItemsT>;
283
+ using InitT = cub::detail::it_value_t<InputIteratorT>;
284
+
285
+ // Initial value
286
+ InitT init_value{};
287
+
288
+ return DispatchScan<InputIteratorT, OutputIteratorT, ::cuda::std::plus<>, detail::InputValue<InitT>, OffsetT>::
289
+ Dispatch(d_temp_storage,
290
+ temp_storage_bytes,
291
+ d_in,
292
+ d_out,
293
+ ::cuda::std::plus<>{},
294
+ detail::InputValue<InitT>(init_value),
295
+ num_items,
296
+ stream);
297
+ }
298
+
299
+ //! @rst
300
+ //! Computes a device-wide exclusive prefix sum.
301
+ //! The value of ``0`` is applied as the initial value, and is assigned to ``*d_out``.
302
+ //!
303
+ //! - Supports non-commutative sum operators.
304
+ //! - Results are not deterministic for pseudo-associative operators (e.g.,
305
+ //! addition of floating-point types). Results for pseudo-associative
306
+ //! operators may vary from run to run. Additional details can be found in
307
+ //! the @lookback description.
308
+ //! - When ``d_in`` and ``d_out`` are equal, the scan is performed in-place.
309
+ //! The range ``[d_in, d_in + num_items)`` and ``[d_out, d_out + num_items)``
310
+ //! shall not overlap in any other way.
311
+ //! - @devicestorage
312
+ //!
313
+ //! Preconditions
314
+ //! +++++++++++++
315
+ //!
316
+ //! - When ``d_in`` and ``d_out`` are equal, the scan is performed in-place.
317
+ //! The range ``[d_in, d_in + num_items)`` and ``[d_out, d_out + num_items)``
318
+ //! shall not overlap in any other way.
319
+ //! - ``d_in`` and ``d_out`` must not be null pointers
320
+ //!
321
+ //! Snippet
322
+ //! +++++++++++++++++++++++++++++++++++++++++++++
323
+ //!
324
+ //! The code snippet below illustrates a user-defined exclusive-scan of a
325
+ //! device vector of ``float`` data elements.
326
+ //!
327
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_scan_env_api.cu
328
+ //! :language: c++
329
+ //! :dedent:
330
+ //! :start-after: example-begin exclusive-sum-env-determinism
331
+ //! :end-before: example-end exclusive-sum-env-determinism
332
+ //!
333
+ //! @endrst
334
+ //!
335
+ //! @tparam InputIteratorT
336
+ //! **[inferred]** Random-access input iterator type for reading scan inputs @iterator
337
+ //!
338
+ //! @tparam OutputIteratorT
339
+ //! **[inferred]** Random-access output iterator type for writing scan outputs @iterator
340
+ //!
341
+ //! @tparam NumItemsT
342
+ //! **[inferred]** An integral type representing the number of input elements
343
+ //!
344
+ //! @tparam EnvT
345
+ //! **[inferred]** Execution environment type. Default is `::cuda::std::execution::env<>`.
346
+ //!
347
+ //! @param[in] d_in
348
+ //! Random-access iterator to the input sequence of data items
349
+ //!
350
+ //! @param[out] d_out
351
+ //! Random-access iterator to the output sequence of data items
352
+ //!
353
+ //! @param[in] num_items
354
+ //! Total number of input items (i.e., the length of `d_in`)
355
+ //!
356
+ //! @param[in] env
357
+ //! @rst
358
+ //! **[optional]** Execution environment. Default is `::cuda::std::execution::env{}`.
359
+ //! @endrst
360
+ template <typename InputIteratorT,
361
+ typename OutputIteratorT,
362
+ typename NumItemsT,
363
+ typename EnvT = ::cuda::std::execution::env<>>
364
+ [[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t
365
+ ExclusiveSum(InputIteratorT d_in, OutputIteratorT d_out, NumItemsT num_items, EnvT env = {})
366
+ {
367
+ _CCCL_NVTX_RANGE_SCOPE("cub::DeviceScan::ExclusiveSum");
368
+
369
+ using InitT = cub::detail::it_value_t<InputIteratorT>;
370
+ InitT init_value{};
371
+
372
+ return scan_impl_env(d_in, d_out, ::cuda::std::plus<>{}, detail::InputValue<InitT>(init_value), num_items, env);
373
+ }
374
+
375
+ //! @rst
376
+ //! Computes a device-wide exclusive prefix sum in-place.
377
+ //! The value of ``0`` is applied as the initial value, and is assigned to ``*d_data``.
378
+ //!
379
+ //! - Supports non-commutative sum operators.
380
+ //! - Results are not deterministic for pseudo-associative operators (e.g.,
381
+ //! addition of floating-point types). Results for pseudo-associative
382
+ //! operators may vary from run to run. Additional details can be found in
383
+ //! the @lookback description.
384
+ //! - @devicestorage
385
+ //!
386
+ //! Snippet
387
+ //! +++++++++++++++++++++++++++++++++++++++++++++
388
+ //!
389
+ //! The code snippet below illustrates the exclusive prefix sum of an ``int``
390
+ //! device vector.
391
+ //!
392
+ //! .. code-block:: c++
393
+ //!
394
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
395
+ //!
396
+ //! // Declare, allocate, and initialize device-accessible pointers for
397
+ //! // input and output
398
+ //! int num_items; // e.g., 7
399
+ //! int *d_data; // e.g., [8, 6, 7, 5, 3, 0, 9]
400
+ //! ...
401
+ //!
402
+ //! // Determine temporary device storage requirements
403
+ //! void *d_temp_storage = nullptr;
404
+ //! size_t temp_storage_bytes = 0;
405
+ //! cub::DeviceScan::ExclusiveSum(
406
+ //! d_temp_storage, temp_storage_bytes,
407
+ //! d_data, num_items);
408
+ //!
409
+ //! // Allocate temporary storage
410
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
411
+ //!
412
+ //! // Run exclusive prefix sum
413
+ //! cub::DeviceScan::ExclusiveSum(
414
+ //! d_temp_storage, temp_storage_bytes,
415
+ //! d_data, num_items);
416
+ //!
417
+ //! // d_data <-- [0, 8, 14, 21, 26, 29, 29]
418
+ //!
419
+ //! @endrst
420
+ //!
421
+ //! @tparam IteratorT
422
+ //! **[inferred]** Random-access iterator type for reading scan inputs and writing scan outputs
423
+ //!
424
+ //! @tparam NumItemsT
425
+ //! **[inferred]** An integral type representing the number of input elements
426
+ //!
427
+ //! @param[in] d_temp_storage
428
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
429
+ //! required allocation size is written to `temp_storage_bytes` and no work is done.
430
+ //!
431
+ //! @param[in,out] temp_storage_bytes
432
+ //! Reference to size in bytes of `d_temp_storage` allocation
433
+ //!
434
+ //! @param[in,out] d_data
435
+ //! Random-access iterator to the sequence of data items
436
+ //!
437
+ //! @param[in] num_items
438
+ //! Total number of input items (i.e., the length of `d_data`)
439
+ //!
440
+ //! @param[in] stream
441
+ //! @rst
442
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
443
+ //! @endrst
444
+ template <typename IteratorT, typename NumItemsT>
445
+ CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveSum(
446
+ void* d_temp_storage, size_t& temp_storage_bytes, IteratorT d_data, NumItemsT num_items, cudaStream_t stream = 0)
447
+ {
448
+ return ExclusiveSum(d_temp_storage, temp_storage_bytes, d_data, d_data, num_items, stream);
449
+ }
450
+
451
+ //! @rst
452
+ //! Computes a device-wide exclusive prefix scan using the specified
453
+ //! binary associative ``scan_op`` functor. The ``init_value`` value is applied as
454
+ //! the initial value, and is assigned to ``*d_out``.
455
+ //!
456
+ //! - Supports non-commutative scan operators.
457
+ //! - Results are not deterministic for pseudo-associative operators (e.g.,
458
+ //! addition of floating-point types). Results for pseudo-associative
459
+ //! operators may vary from run to run. Additional details can be found in
460
+ //! the @lookback description.
461
+ //! - When ``d_in`` and ``d_out`` are equal, the scan is performed in-place. The
462
+ //! range ``[d_in, d_in + num_items)`` and ``[d_out, d_out + num_items)``
463
+ //! shall not overlap in any other way.
464
+ //! - @devicestorage
465
+ //!
466
+ //! Snippet
467
+ //! +++++++++++++++++++++++++++++++++++++++++++++
468
+ //!
469
+ //! The code snippet below illustrates the exclusive prefix min-scan of an ``int`` device vector
470
+ //!
471
+ //! .. code-block:: c++
472
+ //!
473
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
474
+ //! #include <cuda/std/climits> // for INT_MAX
475
+ //!
476
+ //! // CustomMin functor
477
+ //! struct CustomMin
478
+ //! {
479
+ //! template <typename T>
480
+ //! __host__ __device__ __forceinline__
481
+ //! T operator()(const T &a, const T &b) const {
482
+ //! return (b < a) ? b : a;
483
+ //! }
484
+ //! };
485
+ //!
486
+ //! // Declare, allocate, and initialize device-accessible pointers for
487
+ //! // input and output
488
+ //! int num_items; // e.g., 7
489
+ //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
490
+ //! int *d_out; // e.g., [ , , , , , , ]
491
+ //! CustomMin min_op;
492
+ //! ...
493
+ //!
494
+ //! // Determine temporary device storage requirements for exclusive
495
+ //! // prefix scan
496
+ //! void *d_temp_storage = nullptr;
497
+ //! size_t temp_storage_bytes = 0;
498
+ //! cub::DeviceScan::ExclusiveScan(
499
+ //! d_temp_storage, temp_storage_bytes,
500
+ //! d_in, d_out, min_op, (int) INT_MAX, num_items);
501
+ //!
502
+ //! // Allocate temporary storage for exclusive prefix scan
503
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
504
+ //!
505
+ //! // Run exclusive prefix min-scan
506
+ //! cub::DeviceScan::ExclusiveScan(
507
+ //! d_temp_storage, temp_storage_bytes,
508
+ //! d_in, d_out, min_op, (int) INT_MAX, num_items);
509
+ //!
510
+ //! // d_out <-- [2147483647, 8, 6, 6, 5, 3, 0]
511
+ //!
512
+ //! @endrst
513
+ //!
514
+ //! @tparam InputIteratorT
515
+ //! **[inferred]** Random-access input iterator type for reading scan inputs @iterator
516
+ //!
517
+ //! @tparam OutputIteratorT
518
+ //! **[inferred]** Random-access output iterator type for writing scan outputs @iterator
519
+ //!
520
+ //! @tparam ScanOpT
521
+ //! **[inferred]** Binary associative scan functor type having member `T operator()(const T &a, const T &b)`
522
+ //!
523
+ //! @tparam InitValueT
524
+ //! **[inferred]** Type of the `init_value`
525
+ //!
526
+ //! @tparam NumItemsT
527
+ //! **[inferred]** An integral type representing the number of input elements
528
+ //!
529
+ //! @param[in] d_temp_storage
530
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
531
+ //! required allocation size is written to `temp_storage_bytes` and no work is done.
532
+ //!
533
+ //! @param[in,out] temp_storage_bytes
534
+ //! Reference to size in bytes of `d_temp_storage` allocation
535
+ //!
536
+ //! @param[in] d_in
537
+ //! Random-access iterator to the input sequence of data items
538
+ //!
539
+ //! @param[out] d_out
540
+ //! Random-access iterator to the output sequence of data items
541
+ //!
542
+ //! @param[in] scan_op
543
+ //! Binary associative scan functor
544
+ //!
545
+ //! @param[in] init_value
546
+ //! Initial value to seed the exclusive scan (and is assigned to `*d_out`)
547
+ //!
548
+ //! @param[in] num_items
549
+ //! Total number of input items (i.e., the length of `d_in`)
550
+ //!
551
+ //! @param[in] stream
552
+ //! @rst
553
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
554
+ //! @endrst
555
+ template <typename InputIteratorT, typename OutputIteratorT, typename ScanOpT, typename InitValueT, typename NumItemsT>
556
+ CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan(
557
+ void* d_temp_storage,
558
+ size_t& temp_storage_bytes,
559
+ InputIteratorT d_in,
560
+ OutputIteratorT d_out,
561
+ ScanOpT scan_op,
562
+ InitValueT init_value,
563
+ NumItemsT num_items,
564
+ cudaStream_t stream = 0)
565
+ {
566
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::ExclusiveScan");
567
+
568
+ // Unsigned integer type for global offsets
569
+ using OffsetT = detail::choose_offset_t<NumItemsT>;
570
+
571
+ return DispatchScan<InputIteratorT, OutputIteratorT, ScanOpT, detail::InputValue<InitValueT>, OffsetT>::Dispatch(
572
+ d_temp_storage,
573
+ temp_storage_bytes,
574
+ d_in,
575
+ d_out,
576
+ scan_op,
577
+ detail::InputValue<InitValueT>(init_value),
578
+ num_items,
579
+ stream);
580
+ }
581
+
582
+ //! @rst
583
+ //! Computes a device-wide exclusive prefix scan using the specified
584
+ //! binary associative ``scan_op`` functor. The ``init_value`` value is applied as
585
+ //! the initial value, and is assigned to ``*d_out``.
586
+ //!
587
+ //! - Supports non-commutative scan operators.
588
+ //! - Results are not deterministic for pseudo-associative operators (e.g.,
589
+ //! addition of floating-point types). Results for pseudo-associative
590
+ //! operators may vary from run to run. Additional details can be found in
591
+ //! the @lookback description.
592
+ //! - When ``d_in`` and ``d_out`` are equal, the scan is performed in-place. The
593
+ //! range ``[d_in, d_in + num_items)`` and ``[d_out, d_out + num_items)``
594
+ //! shall not overlap in any other way.
595
+ //! - @devicestorage
596
+ //!
597
+ //! Snippet
598
+ //! +++++++++++++++++++++++++++++++++++++++++++++
599
+ //!
600
+ //! The code snippet below illustrates a user-defined exclusive-scan of a
601
+ //! device vector of ``float`` data elements.
602
+ //!
603
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_scan_env_api.cu
604
+ //! :language: c++
605
+ //! :dedent:
606
+ //! :start-after: example-begin exclusive-scan-env-determinism
607
+ //! :end-before: example-end exclusive-scan-env-determinism
608
+ //!
609
+ //! @endrst
610
+ //!
611
+ //! @tparam InputIteratorT
612
+ //! **[inferred]** Random-access input iterator type for reading scan inputs @iterator
613
+ //!
614
+ //! @tparam OutputIteratorT
615
+ //! **[inferred]** Random-access output iterator type for writing scan outputs @iterator
616
+ //!
617
+ //! @tparam ScanOpT
618
+ //! **[inferred]** Binary associative scan functor type having member `T operator()(const T &a, const T &b)`
619
+ //!
620
+ //! @tparam InitValueT
621
+ //! **[inferred]** Type of the `init_value`
622
+ //!
623
+ //! @tparam NumItemsT
624
+ //! **[inferred]** An integral type representing the number of input elements
625
+ //!
626
+ //! @tparam EnvT
627
+ //! **[inferred]** Execution environment type. Default is `::cuda::std::execution::env<>`.
628
+ //!
629
+ //! @param[in] d_in
630
+ //! Random-access iterator to the input sequence of data items
631
+ //!
632
+ //! @param[out] d_out
633
+ //! Random-access iterator to the output sequence of data items
634
+ //!
635
+ //! @param[in] scan_op
636
+ //! Binary associative scan functor
637
+ //!
638
+ //! @param[in] init_value
639
+ //! Initial value to seed the exclusive scan (and is assigned to `*d_out`)
640
+ //!
641
+ //! @param[in] num_items
642
+ //! Total number of input items (i.e., the length of `d_in`)
643
+ //!
644
+ //! @param[in] env
645
+ //! @rst
646
+ //! **[optional]** Execution environment. Default is `::cuda::std::execution::env{}`.
647
+ //! @endrst
648
+ template <typename InputIteratorT,
649
+ typename OutputIteratorT,
650
+ typename ScanOpT,
651
+ typename InitValueT,
652
+ typename NumItemsT,
653
+ typename EnvT = ::cuda::std::execution::env<>>
654
+ [[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan(
655
+ InputIteratorT d_in,
656
+ OutputIteratorT d_out,
657
+ ScanOpT scan_op,
658
+ InitValueT init_value,
659
+ NumItemsT num_items,
660
+ EnvT env = {})
661
+ {
662
+ _CCCL_NVTX_RANGE_SCOPE("cub::DeviceScan::ExclusiveScan");
663
+
664
+ return scan_impl_env(d_in, d_out, scan_op, detail::InputValue<InitValueT>(init_value), num_items, env);
665
+ }
666
+
667
+ //! @rst
668
+ //! Computes a device-wide exclusive prefix scan using the specified
669
+ //! binary associative ``scan_op`` functor. The ``init_value`` value is applied as
670
+ //! the initial value, and is assigned to ``*d_data``.
671
+ //!
672
+ //! - Supports non-commutative scan operators.
673
+ //! - Results are not deterministic for pseudo-associative operators (e.g.,
674
+ //! addition of floating-point types). Results for pseudo-associative
675
+ //! operators may vary from run to run. Additional details can be found in
676
+ //! the @lookback description.
677
+ //! - @devicestorage
678
+ //!
679
+ //! Snippet
680
+ //! +++++++++++++++++++++++++++++++++++++++++++++
681
+ //!
682
+ //! The code snippet below illustrates the exclusive prefix min-scan of an
683
+ //! ``int`` device vector:
684
+ //!
685
+ //! .. code-block:: c++
686
+ //!
687
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
688
+ //! #include <cuda/std/climits> // for INT_MAX
689
+ //!
690
+ //! // CustomMin functor
691
+ //! struct CustomMin
692
+ //! {
693
+ //! template <typename T>
694
+ //! __host__ __device__ __forceinline__
695
+ //! T operator()(const T &a, const T &b) const {
696
+ //! return (b < a) ? b : a;
697
+ //! }
698
+ //! };
699
+ //!
700
+ //! // Declare, allocate, and initialize device-accessible pointers for
701
+ //! // input and output
702
+ //! int num_items; // e.g., 7
703
+ //! int *d_data; // e.g., [8, 6, 7, 5, 3, 0, 9]
704
+ //! CustomMin min_op;
705
+ //! ...
706
+ //!
707
+ //! // Determine temporary device storage requirements for exclusive
708
+ //! // prefix scan
709
+ //! void *d_temp_storage = nullptr;
710
+ //! size_t temp_storage_bytes = 0;
711
+ //! cub::DeviceScan::ExclusiveScan(
712
+ //! d_temp_storage, temp_storage_bytes,
713
+ //! d_data, min_op, (int) INT_MAX, num_items);
714
+ //!
715
+ //! // Allocate temporary storage for exclusive prefix scan
716
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
717
+ //!
718
+ //! // Run exclusive prefix min-scan
719
+ //! cub::DeviceScan::ExclusiveScan(
720
+ //! d_temp_storage, temp_storage_bytes,
721
+ //! d_data, min_op, (int) INT_MAX, num_items);
722
+ //!
723
+ //! // d_data <-- [2147483647, 8, 6, 6, 5, 3, 0]
724
+ //!
725
+ //! @endrst
726
+ //!
727
+ //! @tparam IteratorT
728
+ //! **[inferred]** Random-access input iterator type for reading scan inputs and writing scan outputs
729
+ //!
730
+ //! @tparam ScanOpT
731
+ //! **[inferred]** Binary associative scan functor type having member `T operator()(const T &a, const T &b)`
732
+ //!
733
+ //! @tparam InitValueT
734
+ //! **[inferred]** Type of the `init_value`
735
+ //!
736
+ //! @tparam NumItemsT
737
+ //! **[inferred]** An integral type representing the number of input elements
738
+ //!
739
+ //! @param[in] d_temp_storage
740
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
741
+ //! required allocation size is written to `temp_storage_bytes` and no work is done.
742
+ //!
743
+ //! @param[in,out] temp_storage_bytes
744
+ //! Reference to size in bytes of `d_temp_storage` allocation
745
+ //!
746
+ //! @param[in,out] d_data
747
+ //! Random-access iterator to the sequence of data items
748
+ //!
749
+ //! @param[in] scan_op
750
+ //! Binary associative scan functor
751
+ //!
752
+ //! @param[in] init_value
753
+ //! Initial value to seed the exclusive scan (and is assigned to `*d_data`)
754
+ //!
755
+ //! @param[in] num_items
756
+ //! Total number of input items (i.e., the length of `d_data`)
757
+ //!
758
+ //! @param[in] stream
759
+ //! @rst
760
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
761
+ //! @endrst
762
+ template <typename IteratorT, typename ScanOpT, typename InitValueT, typename NumItemsT>
763
+ CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan(
764
+ void* d_temp_storage,
765
+ size_t& temp_storage_bytes,
766
+ IteratorT d_data,
767
+ ScanOpT scan_op,
768
+ InitValueT init_value,
769
+ NumItemsT num_items,
770
+ cudaStream_t stream = 0)
771
+ {
772
+ return ExclusiveScan(d_temp_storage, temp_storage_bytes, d_data, d_data, scan_op, init_value, num_items, stream);
773
+ }
774
+
775
+ //! @rst
776
+ //! Computes a device-wide exclusive prefix scan using the specified
777
+ //! binary associative ``scan_op`` functor. The ``init_value`` value is provided as a future value.
778
+ //!
779
+ //! - Supports non-commutative scan operators.
780
+ //! - Results are not deterministic for pseudo-associative operators (e.g.,
781
+ //! addition of floating-point types). Results for pseudo-associative
782
+ //! operators may vary from run to run. Additional details can be found in
783
+ //! the @lookback description.
784
+ //! - When ``d_in`` and ``d_out`` are equal, the scan is performed in-place.
785
+ //! The range ``[d_in, d_in + num_items)`` and ``[d_out, d_out + num_items)``
786
+ //! shall not overlap in any other way.
787
+ //! - @devicestorage
788
+ //!
789
+ //! Snippet
790
+ //! +++++++++++++++++++++++++++++++++++++++++++++
791
+ //!
792
+ //! The code snippet below illustrates the exclusive prefix min-scan of an ``int`` device vector
793
+ //!
794
+ //! .. code-block:: c++
795
+ //!
796
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
797
+ //! #include <cuda/std/climits> // for INT_MAX
798
+ //!
799
+ //! // CustomMin functor
800
+ //! struct CustomMin
801
+ //! {
802
+ //! template <typename T>
803
+ //! __host__ __device__ __forceinline__
804
+ //! T operator()(const T &a, const T &b) const {
805
+ //! return (b < a) ? b : a;
806
+ //! }
807
+ //! };
808
+ //!
809
+ //! // Declare, allocate, and initialize device-accessible pointers for
810
+ //! // input and output
811
+ //! int num_items; // e.g., 7
812
+ //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
813
+ //! int *d_out; // e.g., [ , , , , , , ]
814
+ //! int *d_init_iter; // e.g., INT_MAX
815
+ //! CustomMin min_op;
816
+ //!
817
+ //! auto future_init_value =
818
+ //! cub::FutureValue<int>(d_init_iter);
819
+ //!
820
+ //! ...
821
+ //!
822
+ //! // Determine temporary device storage requirements for exclusive
823
+ //! // prefix scan
824
+ //! void *d_temp_storage = nullptr;
825
+ //! size_t temp_storage_bytes = 0;
826
+ //! cub::DeviceScan::ExclusiveScan(
827
+ //! d_temp_storage, temp_storage_bytes,
828
+ //! d_in, d_out, min_op, future_init_value, num_items);
829
+ //!
830
+ //! // Allocate temporary storage for exclusive prefix scan
831
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
832
+ //!
833
+ //! // Run exclusive prefix min-scan
834
+ //! cub::DeviceScan::ExclusiveScan(
835
+ //! d_temp_storage, temp_storage_bytes,
836
+ //! d_in, d_out, min_op, future_init_value, num_items);
837
+ //!
838
+ //! // d_out <-- [2147483647, 8, 6, 6, 5, 3, 0]
839
+ //!
840
+ //! @endrst
841
+ //!
842
+ //! @tparam InputIteratorT
843
+ //! **[inferred]** Random-access input iterator type for reading scan inputs @iterator
844
+ //!
845
+ //! @tparam OutputIteratorT
846
+ //! **[inferred]** Random-access output iterator type for writing scan outputs @iterator
847
+ //!
848
+ //! @tparam ScanOpT
849
+ //! **[inferred]** Binary associative scan functor type having member `T operator()(const T &a, const T &b)`
850
+ //!
851
+ //! @tparam InitValueT
852
+ //! **[inferred]** Type of the `init_value`
853
+ //!
854
+ //! @tparam NumItemsT
855
+ //! **[inferred]** An integral type representing the number of input elements
856
+ //!
857
+ //! @param[in] d_temp_storage
858
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
859
+ //! required allocation size is written to `temp_storage_bytes` and no work is done.
860
+ //!
861
+ //! @param[in,out] temp_storage_bytes
862
+ //! Reference to size in bytes of `d_temp_storage` allocation
863
+ //!
864
+ //! @param[in] d_in
865
+ //! Pointer to the input sequence of data items
866
+ //!
867
+ //! @param[out] d_out
868
+ //! Pointer to the output sequence of data items
869
+ //!
870
+ //! @param[in] scan_op
871
+ //! Binary associative scan functor
872
+ //!
873
+ //! @param[in] init_value
874
+ //! Initial value to seed the exclusive scan (and is assigned to `*d_out`)
875
+ //!
876
+ //! @param[in] num_items
877
+ //! Total number of input items (i.e., the length of `d_in`)
878
+ //!
879
+ //! @param[in] stream
880
+ //! @rst
881
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
882
+ //! @endrst
883
+ template <typename InputIteratorT,
884
+ typename OutputIteratorT,
885
+ typename ScanOpT,
886
+ typename InitValueT,
887
+ typename InitValueIterT = InitValueT*,
888
+ typename NumItemsT = int>
889
+ CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan(
890
+ void* d_temp_storage,
891
+ size_t& temp_storage_bytes,
892
+ InputIteratorT d_in,
893
+ OutputIteratorT d_out,
894
+ ScanOpT scan_op,
895
+ FutureValue<InitValueT, InitValueIterT> init_value,
896
+ NumItemsT num_items,
897
+ cudaStream_t stream = 0)
898
+ {
899
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::ExclusiveScan");
900
+
901
+ // Unsigned integer type for global offsets
902
+ using OffsetT = detail::choose_offset_t<NumItemsT>;
903
+
904
+ return DispatchScan<InputIteratorT, OutputIteratorT, ScanOpT, detail::InputValue<InitValueT>, OffsetT>::Dispatch(
905
+ d_temp_storage,
906
+ temp_storage_bytes,
907
+ d_in,
908
+ d_out,
909
+ scan_op,
910
+ detail::InputValue<InitValueT>(init_value),
911
+ num_items,
912
+ stream);
913
+ }
914
+
915
+ //! @rst
916
+ //! Computes a device-wide exclusive prefix scan using the specified binary associative ``scan_op`` functor.
917
+ //! The ``init_value`` value is provided as a future value.
918
+ //!
919
+ //! - Supports non-commutative scan operators.
920
+ //! - Results are not deterministic for pseudo-associative operators (e.g.,
921
+ //! addition of floating-point types). Results for pseudo-associative
922
+ //! operators may vary from run to run. Additional details can be found in
923
+ //! the @lookback description.
924
+ //! - @devicestorage
925
+ //!
926
+ //! Snippet
927
+ //! +++++++++++++++++++++++++++++++++++++++++++++
928
+ //!
929
+ //! The code snippet below illustrates the exclusive prefix min-scan of an ``int`` device vector
930
+ //!
931
+ //! .. code-block:: c++
932
+ //!
933
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
934
+ //! #include <cuda/std/climits> // for INT_MAX
935
+ //!
936
+ //! // CustomMin functor
937
+ //! struct CustomMin
938
+ //! {
939
+ //! template <typename T>
940
+ //! __host__ __device__ __forceinline__
941
+ //! T operator()(const T &a, const T &b) const {
942
+ //! return (b < a) ? b : a;
943
+ //! }
944
+ //! };
945
+ //!
946
+ //! // Declare, allocate, and initialize device-accessible pointers for
947
+ //! // input and output
948
+ //! int num_items; // e.g., 7
949
+ //! int *d_data; // e.g., [8, 6, 7, 5, 3, 0, 9]
950
+ //! int *d_init_iter; // e.g., INT_MAX
951
+ //! CustomMin min_op;
952
+ //!
953
+ //! auto future_init_value =
954
+ //! cub::FutureValue<int>(d_init_iter);
955
+ //!
956
+ //! ...
957
+ //!
958
+ //! // Determine temporary device storage requirements for exclusive
959
+ //! // prefix scan
960
+ //! void *d_temp_storage = nullptr;
961
+ //! size_t temp_storage_bytes = 0;
962
+ //! cub::DeviceScan::ExclusiveScan(
963
+ //! d_temp_storage, temp_storage_bytes,
964
+ //! d_data, min_op, future_init_value, num_items);
965
+ //!
966
+ //! // Allocate temporary storage for exclusive prefix scan
967
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
968
+ //!
969
+ //! // Run exclusive prefix min-scan
970
+ //! cub::DeviceScan::ExclusiveScan(
971
+ //! d_temp_storage, temp_storage_bytes,
972
+ //! d_data, min_op, future_init_value, num_items);
973
+ //!
974
+ //! // d_data <-- [2147483647, 8, 6, 6, 5, 3, 0]
975
+ //!
976
+ //! @endrst
977
+ //!
978
+ //! @tparam IteratorT
979
+ //! **[inferred]** Random-access input iterator type for reading scan inputs and writing scan outputs
980
+ //!
981
+ //! @tparam ScanOpT
982
+ //! **[inferred]** Binary associative scan functor type having member `T operator()(const T &a, const T &b)`
983
+ //!
984
+ //! @tparam InitValueT
985
+ //! **[inferred]** Type of the `init_value`
986
+ //!
987
+ //! @tparam NumItemsT
988
+ //! **[inferred]** An integral type representing the number of input elements
989
+ //!
990
+ //! @param[in] d_temp_storage
991
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
992
+ //! required allocation size is written to `temp_storage_bytes` and no work is done.
993
+ //!
994
+ //! @param[in,out] temp_storage_bytes
995
+ //! Reference to size in bytes of `d_temp_storage` allocation
996
+ //!
997
+ //! @param[in,out] d_data
998
+ //! Random-access iterator to the sequence of data items
999
+ //!
1000
+ //! @param[in] scan_op
1001
+ //! Binary associative scan functor
1002
+ //!
1003
+ //! @param[in] init_value
1004
+ //! Initial value to seed the exclusive scan (and is assigned to `*d_data`)
1005
+ //!
1006
+ //! @param[in] num_items
1007
+ //! Total number of input items (i.e., the length of `d_data`)
1008
+ //!
1009
+ //! @param[in] stream
1010
+ //! @rst
1011
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
1012
+ //! @endrst
1013
+ template <typename IteratorT,
1014
+ typename ScanOpT,
1015
+ typename InitValueT,
1016
+ typename InitValueIterT = InitValueT*,
1017
+ typename NumItemsT = int>
1018
+ CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan(
1019
+ void* d_temp_storage,
1020
+ size_t& temp_storage_bytes,
1021
+ IteratorT d_data, // single sequence used for both reading and writing
1022
+ ScanOpT scan_op,
1023
+ FutureValue<InitValueT, InitValueIterT> init_value, // initial value wrapped in FutureValue (the snippet above builds it from a device pointer)
1024
+ NumItemsT num_items,
1025
+ cudaStream_t stream = 0)
1026
+ {
1027
+ return ExclusiveScan(d_temp_storage, temp_storage_bytes, d_data, d_data, scan_op, init_value, num_items, stream); // in-place: forwards to the eight-argument ExclusiveScan overload, passing d_data twice
1028
+ }
1029
+
1030
+ //! @} end member group
1031
+
1032
+ //! @name Inclusive scans
1033
+ //! @{
1034
+
1035
+ //! @rst
1036
+ //! Computes a device-wide inclusive prefix sum.
1037
+ //!
1038
+ //! - Supports non-commutative sum operators.
1039
+ //! - Results are not deterministic for pseudo-associative operators (e.g.,
1040
+ //! addition of floating-point types). Results for pseudo-associative
1041
+ //! operators may vary from run to run. Additional details can be found in
1042
+ //! the @lookback description.
1043
+ //! - When ``d_in`` and ``d_out`` are equal, the scan is performed in-place. The
1044
+ //! range ``[d_in, d_in + num_items)`` and ``[d_out, d_out + num_items)``
1045
+ //! shall not overlap in any other way.
1046
+ //! - @devicestorage
1047
+ //!
1048
+ //! Snippet
1049
+ //! +++++++++++++++++++++++++++++++++++++++++++++
1050
+ //!
1051
+ //! The code snippet below illustrates the inclusive prefix sum of an ``int`` device vector.
1052
+ //!
1053
+ //! .. code-block:: c++
1054
+ //!
1055
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
1056
+ //!
1057
+ //! // Declare, allocate, and initialize device-accessible pointers for
1058
+ //! // input and output
1059
+ //! int num_items; // e.g., 7
1060
+ //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
1061
+ //! int *d_out; // e.g., [ , , , , , , ]
1062
+ //! ...
1063
+ //!
1064
+ //! // Determine temporary device storage requirements for inclusive
1065
+ //! // prefix sum
1066
+ //! void *d_temp_storage = nullptr;
1067
+ //! size_t temp_storage_bytes = 0;
1068
+ //! cub::DeviceScan::InclusiveSum(
1069
+ //! d_temp_storage, temp_storage_bytes,
1070
+ //! d_in, d_out, num_items);
1071
+ //!
1072
+ //! // Allocate temporary storage for inclusive prefix sum
1073
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
1074
+ //!
1075
+ //! // Run inclusive prefix sum
1076
+ //! cub::DeviceScan::InclusiveSum(
1077
+ //! d_temp_storage, temp_storage_bytes,
1078
+ //! d_in, d_out, num_items);
1079
+ //!
1080
+ //! // d_out <-- [8, 14, 21, 26, 29, 29, 38]
1081
+ //!
1082
+ //! @endrst
1083
+ //!
1084
+ //! @tparam InputIteratorT
1085
+ //! **[inferred]** Random-access input iterator type for reading scan inputs @iterator
1086
+ //!
1087
+ //! @tparam OutputIteratorT
1088
+ //! **[inferred]** Random-access output iterator type for writing scan outputs @iterator
1089
+ //!
1090
+ //! @tparam NumItemsT
1091
+ //! **[inferred]** An integral type representing the number of input elements
1092
+ //!
1093
+ //! @param[in] d_temp_storage
1094
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
1095
+ //! required allocation size is written to `temp_storage_bytes` and no work is done.
1096
+ //!
1097
+ //! @param[in,out] temp_storage_bytes
1098
+ //! Reference to size in bytes of `d_temp_storage` allocation
1099
+ //!
1100
+ //! @param[in] d_in
1101
+ //! Random-access iterator to the input sequence of data items
1102
+ //!
1103
+ //! @param[out] d_out
1104
+ //! Random-access iterator to the output sequence of data items
1105
+ //!
1106
+ //! @param[in] num_items
1107
+ //! Total number of input items (i.e., the length of `d_in`)
1108
+ //!
1109
+ //! @param[in] stream
1110
+ //! @rst
1111
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
1112
+ //! @endrst
1113
+ template <typename InputIteratorT, typename OutputIteratorT, typename NumItemsT>
1114
+ CUB_RUNTIME_FUNCTION static cudaError_t InclusiveSum(
1115
+ void* d_temp_storage,
1116
+ size_t& temp_storage_bytes,
1117
+ InputIteratorT d_in,
1118
+ OutputIteratorT d_out,
1119
+ NumItemsT num_items,
1120
+ cudaStream_t stream = 0)
1121
+ {
1122
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::InclusiveSum"); // NVTX range conditioned on d_temp_storage; presumably skipped for the nullptr size query (confirm against the macro)
1123
+
1124
+ // Unsigned integer type for global offsets, derived from NumItemsT
1125
+ using OffsetT = detail::choose_offset_t<NumItemsT>;
1126
+
1127
+ return DispatchScan<InputIteratorT, OutputIteratorT, ::cuda::std::plus<>, NullType, OffsetT>::Dispatch(
1128
+ d_temp_storage, temp_storage_bytes, d_in, d_out, ::cuda::std::plus<>{}, NullType{}, num_items, stream); // sum is a scan with ::cuda::std::plus<>; NullType fills the initial-value argument
1129
+ }
1130
+
1131
+ //! @rst
1132
+ //! Computes a device-wide inclusive prefix sum in-place.
1133
+ //!
1134
+ //! - Supports non-commutative sum operators.
1135
+ //! - Results are not deterministic for pseudo-associative operators (e.g.,
1136
+ //! addition of floating-point types). Results for pseudo-associative
1137
+ //! operators may vary from run to run. Additional details can be found in
1138
+ //! the @lookback description.
1139
+ //! - @devicestorage
1140
+ //!
1141
+ //! Snippet
1142
+ //! +++++++++++++++++++++++++++++++++++++++++++++
1143
+ //!
1144
+ //! The code snippet below illustrates the inclusive prefix sum of an ``int`` device vector.
1145
+ //!
1146
+ //! .. code-block:: c++
1147
+ //!
1148
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
1149
+ //!
1150
+ //! // Declare, allocate, and initialize device-accessible pointers for
1151
+ //! // input and output
1152
+ //! int num_items; // e.g., 7
1153
+ //! int *d_data; // e.g., [8, 6, 7, 5, 3, 0, 9]
1154
+ //! ...
1155
+ //!
1156
+ //! // Determine temporary device storage requirements for inclusive
1157
+ //! // prefix sum
1158
+ //! void *d_temp_storage = nullptr;
1159
+ //! size_t temp_storage_bytes = 0;
1160
+ //! cub::DeviceScan::InclusiveSum(
1161
+ //! d_temp_storage, temp_storage_bytes,
1162
+ //! d_data, num_items);
1163
+ //!
1164
+ //! // Allocate temporary storage for inclusive prefix sum
1165
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
1166
+ //!
1167
+ //! // Run inclusive prefix sum
1168
+ //! cub::DeviceScan::InclusiveSum(
1169
+ //! d_temp_storage, temp_storage_bytes,
1170
+ //! d_data, num_items);
1171
+ //!
1172
+ //! // d_data <-- [8, 14, 21, 26, 29, 29, 38]
1173
+ //!
1174
+ //! @endrst
1175
+ //!
1176
+ //! @tparam IteratorT
1177
+ //! **[inferred]** Random-access input iterator type for reading scan inputs and writing scan outputs
1178
+ //!
1179
+ //! @tparam NumItemsT
1180
+ //! **[inferred]** An integral type representing the number of input elements
1181
+ //!
1182
+ //! @param[in] d_temp_storage
1183
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
1184
+ //! required allocation size is written to `temp_storage_bytes` and no work is done.
1185
+ //!
1186
+ //! @param[in,out] temp_storage_bytes
1187
+ //! Reference to size in bytes of `d_temp_storage` allocation
1188
+ //!
1189
+ //! @param[in,out] d_data
1190
+ //! Random-access iterator to the sequence of data items
1191
+ //!
1192
+ //! @param[in] num_items
1193
+ //! Total number of input items (i.e., the length of `d_data`)
1194
+ //!
1195
+ //! @param[in] stream
1196
+ //! @rst
1197
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
1198
+ //! @endrst
1199
+ template <typename IteratorT, typename NumItemsT>
1200
+ CUB_RUNTIME_FUNCTION static cudaError_t InclusiveSum(
1201
+ void* d_temp_storage, size_t& temp_storage_bytes, IteratorT d_data, NumItemsT num_items, cudaStream_t stream = 0)
1202
+ {
1203
+ return InclusiveSum(d_temp_storage, temp_storage_bytes, d_data, d_data, num_items, stream); // in-place: d_data is passed as both d_in and d_out of the two-iterator overload
1204
+ }
1205
+
1206
+ //! @rst
1207
+ //! Computes a device-wide inclusive prefix scan using the specified binary associative ``scan_op`` functor.
1208
+ //!
1209
+ //! - Supports non-commutative scan operators.
1210
+ //! - Results are not deterministic for pseudo-associative operators (e.g.,
1211
+ //! addition of floating-point types). Results for pseudo-associative
1212
+ //! operators may vary from run to run. Additional details can be found in
1213
+ //! the @lookback description.
1214
+ //! - When ``d_in`` and ``d_out`` are equal, the scan is performed in-place. The
1215
+ //! range ``[d_in, d_in + num_items)`` and ``[d_out, d_out + num_items)``
1216
+ //! shall not overlap in any other way.
1217
+ //! - @devicestorage
1218
+ //!
1219
+ //! Snippet
1220
+ //! +++++++++++++++++++++++++++++++++++++++++++++
1221
+ //!
1222
+ //! The code snippet below illustrates the inclusive prefix min-scan of an ``int`` device vector.
1223
+ //!
1224
+ //! .. code-block:: c++
1225
+ //!
1226
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
1227
+ //! #include <cuda/std/climits> // for INT_MAX
1228
+ //!
1229
+ //! // CustomMin functor
1230
+ //! struct CustomMin
1231
+ //! {
1232
+ //! template <typename T>
1233
+ //! __host__ __device__ __forceinline__
1234
+ //! T operator()(const T &a, const T &b) const {
1235
+ //! return (b < a) ? b : a;
1236
+ //! }
1237
+ //! };
1238
+ //!
1239
+ //! // Declare, allocate, and initialize device-accessible pointers for
1240
+ //! // input and output
1241
+ //! int num_items; // e.g., 7
1242
+ //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
1243
+ //! int *d_out; // e.g., [ , , , , , , ]
1244
+ //! CustomMin min_op;
1245
+ //! ...
1246
+ //!
1247
+ //! // Determine temporary device storage requirements for inclusive
1248
+ //! // prefix scan
1249
+ //! void *d_temp_storage = nullptr;
1250
+ //! size_t temp_storage_bytes = 0;
1251
+ //! cub::DeviceScan::InclusiveScan(
1252
+ //! d_temp_storage, temp_storage_bytes,
1253
+ //! d_in, d_out, min_op, num_items);
1254
+ //!
1255
+ //! // Allocate temporary storage for inclusive prefix scan
1256
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
1257
+ //!
1258
+ //! // Run inclusive prefix min-scan
1259
+ //! cub::DeviceScan::InclusiveScan(
1260
+ //! d_temp_storage, temp_storage_bytes,
1261
+ //! d_in, d_out, min_op, num_items);
1262
+ //!
1263
+ //! // d_out <-- [8, 6, 6, 5, 3, 0, 0]
1264
+ //!
1265
+ //! @endrst
1266
+ //!
1267
+ //! @tparam InputIteratorT
1268
+ //! **[inferred]** Random-access input iterator type for reading scan inputs @iterator
1269
+ //!
1270
+ //! @tparam OutputIteratorT
1271
+ //! **[inferred]** Random-access output iterator type for writing scan outputs @iterator
1272
+ //!
1273
+ //! @tparam ScanOpT
1274
+ //! **[inferred]** Binary associative scan functor type having member `T operator()(const T &a, const T &b)`
1275
+ //!
1276
+ //! @tparam NumItemsT
1277
+ //! **[inferred]** An integral type representing the number of input elements
1278
+ //!
1279
+ //! @param[in] d_temp_storage
1280
+ //! Device-accessible allocation of temporary storage.
1281
+ //! When `nullptr`, the required allocation size is written to
1282
+ //! `temp_storage_bytes` and no work is done.
1283
+ //!
1284
+ //! @param[in,out] temp_storage_bytes
1285
+ //! Reference to size in bytes of `d_temp_storage` allocation
1286
+ //!
1287
+ //! @param[in] d_in
1288
+ //! Random-access iterator to the input sequence of data items
1289
+ //!
1290
+ //! @param[out] d_out
1291
+ //! Random-access iterator to the output sequence of data items
1292
+ //!
1293
+ //! @param[in] scan_op
1294
+ //! Binary associative scan functor
1295
+ //!
1296
+ //! @param[in] num_items
1297
+ //! Total number of input items (i.e., the length of `d_in`)
1298
+ //!
1299
+ //! @param[in] stream
1300
+ //! @rst
1301
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
1302
+ //! @endrst
1303
+ template <typename InputIteratorT, typename OutputIteratorT, typename ScanOpT, typename NumItemsT>
1304
+ CUB_RUNTIME_FUNCTION static cudaError_t InclusiveScan(
1305
+ void* d_temp_storage,
1306
+ size_t& temp_storage_bytes,
1307
+ InputIteratorT d_in,
1308
+ OutputIteratorT d_out,
1309
+ ScanOpT scan_op,
1310
+ NumItemsT num_items,
1311
+ cudaStream_t stream = 0)
1312
+ {
1313
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::InclusiveScan"); // NVTX range conditioned on d_temp_storage; presumably skipped for the nullptr size query (confirm against the macro)
1314
+
1315
+ // Unsigned integer type for global offsets, derived from NumItemsT
1316
+ using OffsetT = detail::choose_offset_t<NumItemsT>;
1317
+
1318
+ return DispatchScan<InputIteratorT, OutputIteratorT, ScanOpT, NullType, OffsetT>::Dispatch(
1319
+ d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, NullType(), num_items, stream); // no initial value: NullType fills the initial-value argument
1320
+ }
1321
+
1322
+ //! @rst
1323
+ //! Computes a device-wide inclusive prefix scan using the specified binary associative ``scan_op`` functor.
1324
+ //! The result of applying the ``scan_op`` binary operator to ``init_value`` value and ``*d_in``
1325
+ //! is assigned to ``*d_out``.
1326
+ //!
1327
+ //! - Supports non-commutative scan operators.
1328
+ //! - Results are not deterministic for pseudo-associative operators (e.g.,
1329
+ //! addition of floating-point types). Results for pseudo-associative
1330
+ //! operators may vary from run to run. Additional details can be found in
1331
+ //! the @lookback description.
1332
+ //! - When ``d_in`` and ``d_out`` are equal, the scan is performed in-place. The
1333
+ //! range ``[d_in, d_in + num_items)`` and ``[d_out, d_out + num_items)``
1334
+ //! shall not overlap in any other way.
1335
+ //! - @devicestorage
1336
+ //!
1337
+ //! Snippet
1338
+ //! +++++++++++++++++++++++++++++++++++++++++++++
1339
+ //!
1340
+ //! The code snippet below illustrates the inclusive max-scan of an ``int`` device vector.
1341
+ //!
1342
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_scan_api.cu
1343
+ //! :language: c++
1344
+ //! :dedent:
1345
+ //! :start-after: example-begin device-inclusive-scan
1346
+ //! :end-before: example-end device-inclusive-scan
1347
+ //!
1348
+ //! @endrst
1349
+ //!
1350
+ //! @tparam InputIteratorT
1351
+ //! **[inferred]** Random-access input iterator type for reading scan inputs @iterator
1352
+ //!
1353
+ //! @tparam OutputIteratorT
1354
+ //! **[inferred]** Random-access output iterator type for writing scan outputs @iterator
1355
+ //!
1356
+ //! @tparam ScanOpT
1357
+ //! **[inferred]** Binary associative scan functor type having member `T operator()(const T &a, const T &b)`
1358
+ //!
1359
+ //! @tparam InitValueT
1360
+ //! **[inferred]** Type of the `init_value`
1361
+ //!
1362
+ //! @tparam NumItemsT
1363
+ //! **[inferred]** An integral type representing the number of input elements
1364
+ //!
1365
+ //! @param[in] d_temp_storage
1366
+ //! Device-accessible allocation of temporary storage.
1367
+ //! When `nullptr`, the required allocation size is written to
1368
+ //! `temp_storage_bytes` and no work is done.
1369
+ //!
1370
+ //! @param[in,out] temp_storage_bytes
1371
+ //! Reference to the size in bytes of the `d_temp_storage` allocation
1372
+ //!
1373
+ //! @param[in] d_in
1374
+ //! Random-access iterator to the input sequence of data items
1375
+ //!
1376
+ //! @param[out] d_out
1377
+ //! Random-access iterator to the output sequence of data items
1378
+ //!
1379
+ //! @param[in] scan_op
1380
+ //! Binary associative scan functor
1381
+ //!
1382
+ //! @param[in] init_value
1383
+ //! Initial value to seed the inclusive scan (`scan_op(init_value, d_in[0])`
1384
+ //! is assigned to `*d_out`)
1385
+ //!
1386
+ //! @param[in] num_items
1387
+ //! Total number of input items (i.e., the length of `d_in`)
1388
+ //!
1389
+ //! @param[in] stream
1390
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream 0.
1391
+ template <typename InputIteratorT, typename OutputIteratorT, typename ScanOpT, typename InitValueT, typename NumItemsT>
1392
+ CUB_RUNTIME_FUNCTION static cudaError_t InclusiveScanInit(
1393
+ void* d_temp_storage,
1394
+ size_t& temp_storage_bytes,
1395
+ InputIteratorT d_in,
1396
+ OutputIteratorT d_out,
1397
+ ScanOpT scan_op,
1398
+ InitValueT init_value,
1399
+ NumItemsT num_items,
1400
+ cudaStream_t stream = 0)
1401
+ {
1402
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::InclusiveScanInit"); // NVTX range conditioned on d_temp_storage; presumably skipped for the nullptr size query (confirm against the macro)
1403
+
1404
+ // Unsigned integer type for global offsets, derived from NumItemsT
1405
+ using OffsetT = detail::choose_offset_t<NumItemsT>;
1406
+ using AccumT = ::cuda::std::__accumulator_t<ScanOpT, cub::detail::it_value_t<InputIteratorT>, InitValueT>; // accumulator type computed from the operator, the input value type and InitValueT
1407
+
1408
+ return DispatchScan<
1409
+ InputIteratorT,
1410
+ OutputIteratorT,
1411
+ ScanOpT,
1412
+ detail::InputValue<InitValueT>,
1413
+ OffsetT,
1414
+ AccumT,
1415
+ ForceInclusive::Yes>::Dispatch(d_temp_storage, // ForceInclusive::Yes: request inclusive output even though an initial value is supplied
1416
+ temp_storage_bytes,
1417
+ d_in,
1418
+ d_out,
1419
+ scan_op,
1420
+ detail::InputValue<InitValueT>(init_value), // by-value initial value wrapped in detail::InputValue for the dispatch layer
1421
+ num_items,
1422
+ stream);
1423
+ }
1424
+
1425
+ //! @rst
1426
+ //! Computes a device-wide inclusive prefix scan using the specified binary associative ``scan_op`` functor.
1427
+ //!
1428
+ //! - Supports non-commutative scan operators.
1429
+ //! - Results are not deterministic for pseudo-associative operators (e.g.,
1430
+ //! addition of floating-point types). Results for pseudo-associative
1431
+ //! operators may vary from run to run. Additional details can be found in
1432
+ //! the @lookback description.
1433
+ //! - @devicestorage
1434
+ //!
1435
+ //! Snippet
1436
+ //! +++++++++++++++++++++++++++++++++++++++++++++
1437
+ //!
1438
+ //! The code snippet below illustrates the inclusive prefix min-scan of an ``int`` device vector.
1439
+ //!
1440
+ //! .. code-block:: c++
1441
+ //!
1442
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
1443
+ //! #include <cuda/std/climits> // for INT_MAX
1444
+ //!
1445
+ //! // CustomMin functor
1446
+ //! struct CustomMin
1447
+ //! {
1448
+ //! template <typename T>
1449
+ //! __host__ __device__ __forceinline__
1450
+ //! T operator()(const T &a, const T &b) const {
1451
+ //! return (b < a) ? b : a;
1452
+ //! }
1453
+ //! };
1454
+ //!
1455
+ //! // Declare, allocate, and initialize device-accessible pointers for
1456
+ //! // input and output
1457
+ //! int num_items; // e.g., 7
1458
+ //! int *d_data; // e.g., [8, 6, 7, 5, 3, 0, 9]
1459
+ //! CustomMin min_op;
1460
+ //! ...
1461
+ //!
1462
+ //! // Determine temporary device storage requirements for inclusive
1463
+ //! // prefix scan
1464
+ //! void *d_temp_storage = nullptr;
1465
+ //! size_t temp_storage_bytes = 0;
1466
+ //! cub::DeviceScan::InclusiveScan(
1467
+ //! d_temp_storage, temp_storage_bytes,
1468
+ //! d_data, min_op, num_items);
1469
+ //!
1470
+ //! // Allocate temporary storage for inclusive prefix scan
1471
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
1472
+ //!
1473
+ //! // Run inclusive prefix min-scan
1474
+ //! cub::DeviceScan::InclusiveScan(
1475
+ //! d_temp_storage, temp_storage_bytes,
1476
+ //! d_data, min_op, num_items);
1477
+ //!
1478
+ //! // d_data <-- [8, 6, 6, 5, 3, 0, 0]
1479
+ //!
1480
+ //! @endrst
1481
+ //!
1482
+ //! @tparam IteratorT
1483
+ //! **[inferred]** Random-access input iterator type for reading scan inputs and writing scan outputs
1484
+ //!
1485
+ //! @tparam ScanOpT
1486
+ //! **[inferred]** Binary associative scan functor type having member `T operator()(const T &a, const T &b)`
1487
+ //!
1488
+ //! @tparam NumItemsT
1489
+ //! **[inferred]** An integral type representing the number of input elements
1490
+ //!
1491
+ //! @param[in] d_temp_storage
1492
+ //! Device-accessible allocation of temporary storage.
1493
+ //! When `nullptr`, the required allocation size is written to
1494
+ //! `temp_storage_bytes` and no work is done.
1495
+ //!
1496
+ //! @param[in,out] temp_storage_bytes
1497
+ //! Reference to size in bytes of `d_temp_storage` allocation
1498
+ //!
1499
+ //! @param[in,out] d_data
1500
+ //! Random-access iterator to the sequence of data items
1501
+ //!
1502
+ //! @param[in] scan_op
1503
+ //! Binary associative scan functor
1504
+ //!
1505
+ //! @param[in] num_items
1506
+ //! Total number of input items (i.e., the length of `d_data`)
1507
+ //!
1508
+ //! @param[in] stream
1509
+ //! @rst
1510
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
1511
+ //! @endrst
1512
+ template <typename IteratorT, typename ScanOpT, typename NumItemsT>
1513
+ CUB_RUNTIME_FUNCTION static cudaError_t InclusiveScan(
1514
+ void* d_temp_storage,
1515
+ size_t& temp_storage_bytes,
1516
+ IteratorT d_data, // single sequence used for both reading and writing
1517
+ ScanOpT scan_op,
1518
+ NumItemsT num_items,
1519
+ cudaStream_t stream = 0)
1520
+ {
1521
+ return InclusiveScan(d_temp_storage, temp_storage_bytes, d_data, d_data, scan_op, num_items, stream); // in-place: d_data is passed as both d_in and d_out of the two-iterator overload
1522
+ }
1523
+ //! @} end member group
1524
+
1525
+ //! @name Scans by key
1526
+ //! @{
1527
+
1528
+ //! @rst
1529
+ //! Computes a device-wide exclusive prefix sum-by-key with key equality
1530
+ //! defined by ``equality_op``. The value of ``0`` is applied as the initial
1531
+ //! value, and is assigned to the beginning of each segment in ``d_values_out``.
1532
+ //!
1533
+ //! - Supports non-commutative sum operators.
1534
+ //! - Results are not deterministic for pseudo-associative operators (e.g.,
1535
+ //! addition of floating-point types). Results for pseudo-associative
1536
+ //! operators may vary from run to run. Additional details can be found in
1537
+ //! the @lookback description.
1538
+ //! - ``d_keys_in`` may equal ``d_values_out`` but the range
1539
+ //! ``[d_keys_in, d_keys_in + num_items)`` and the range
1540
+ //! ``[d_values_out, d_values_out + num_items)`` shall not overlap otherwise.
1541
+ //! - ``d_values_in`` may equal ``d_values_out`` but the range
1542
+ //! ``[d_values_in, d_values_in + num_items)`` and the range
1543
+ //! ``[d_values_out, d_values_out + num_items)`` shall not overlap otherwise.
1544
+ //! - @devicestorage
1545
+ //!
1546
+ //! Snippet
1547
+ //! +++++++++++++++++++++++++++++++++++++++++++++
1548
+ //!
1549
+ //! The code snippet below illustrates the exclusive prefix sum-by-key of an ``int`` device vector.
1550
+ //!
1551
+ //! .. code-block:: c++
1552
+ //!
1553
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
1554
+ //!
1555
+ //! // Declare, allocate, and initialize device-accessible pointers for
1556
+ //! // input and output
1557
+ //! int num_items; // e.g., 7
1558
+ //! int *d_keys_in; // e.g., [0, 0, 1, 1, 1, 2, 2]
1559
+ //! int *d_values_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
1560
+ //! int *d_values_out; // e.g., [ , , , , , , ]
1561
+ //! ...
1562
+ //!
1563
+ //! // Determine temporary device storage requirements
1564
+ //! void *d_temp_storage = nullptr;
1565
+ //! size_t temp_storage_bytes = 0;
1566
+ //! cub::DeviceScan::ExclusiveSumByKey(
1567
+ //! d_temp_storage, temp_storage_bytes,
1568
+ //! d_keys_in, d_values_in, d_values_out, num_items);
1569
+ //!
1570
+ //! // Allocate temporary storage
1571
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
1572
+ //!
1573
+ //! // Run exclusive prefix sum
1574
+ //! cub::DeviceScan::ExclusiveSumByKey(
1575
+ //! d_temp_storage, temp_storage_bytes,
1576
+ //! d_keys_in, d_values_in, d_values_out, num_items);
1577
+ //!
1578
+ //! // d_values_out <-- [0, 8, 0, 7, 12, 0, 0]
1579
+ //!
1580
+ //! @endrst
1581
+ //!
1582
+ //! @tparam KeysInputIteratorT
1583
+ //! **[inferred]** Random-access input iterator type for reading scan keys inputs @iterator
1584
+ //!
1585
+ //! @tparam ValuesInputIteratorT
1586
+ //! **[inferred]** Random-access input iterator type for reading scan values inputs @iterator
1587
+ //!
1588
+ //! @tparam ValuesOutputIteratorT
1589
+ //! **[inferred]** Random-access output iterator type for writing scan values outputs @iterator
1590
+ //!
1591
+ //! @tparam EqualityOpT
1592
+ //! **[inferred]** Functor type having member
1593
+ //! `bool operator()(const T &a, const T &b)` for binary operations that defines the equality of keys
1594
+ //!
1595
+ //! @tparam NumItemsT
1596
+ //! **[inferred]** An integral type representing the number of input elements
1597
+ //!
1598
+ //! @param[in] d_temp_storage
1599
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
1600
+ //! required allocation size is written to `temp_storage_bytes` and no work is done.
1601
+ //!
1602
+ //! @param[in,out] temp_storage_bytes
1603
+ //! Reference to size in bytes of `d_temp_storage` allocation
1604
+ //!
1605
+ //! @param[in] d_keys_in
1606
+ //! Random-access input iterator to the input sequence of key items
1607
+ //!
1608
+ //! @param[in] d_values_in
1609
+ //! Random-access input iterator to the input sequence of value items
1610
+ //!
1611
+ //! @param[out] d_values_out
1612
+ //! Random-access output iterator to the output sequence of value items
1613
+ //!
1614
+ //! @param[in] num_items
1615
+ //! Total number of input items (i.e., the length of `d_keys_in` and `d_values_in`)
1616
+ //!
1617
+ //! @param[in] equality_op
1618
+ //! Binary functor that defines the equality of keys.
1619
+ //! Default is cuda::std::equal_to<>{}.
1620
+ //!
1621
+ //! @param[in] stream
1622
+ //! @rst
1623
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
1624
+ //! @endrst
1625
+ template <typename KeysInputIteratorT,
1626
+ typename ValuesInputIteratorT,
1627
+ typename ValuesOutputIteratorT,
1628
+ typename EqualityOpT = ::cuda::std::equal_to<>,
1629
+ typename NumItemsT = uint32_t>
1630
+ CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveSumByKey(
1631
+ void* d_temp_storage,
1632
+ size_t& temp_storage_bytes,
1633
+ KeysInputIteratorT d_keys_in,
1634
+ ValuesInputIteratorT d_values_in,
1635
+ ValuesOutputIteratorT d_values_out,
1636
+ NumItemsT num_items,
1637
+ EqualityOpT equality_op = EqualityOpT(), // default-constructed comparator when the caller passes none
1638
+ cudaStream_t stream = 0)
1639
+ {
1640
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::ExclusiveSumByKey"); // NVTX range conditioned on d_temp_storage; presumably skipped for the nullptr size query (confirm against the macro)
1641
+
1642
+ // Unsigned integer type for global offsets, derived from NumItemsT
1643
+ using OffsetT = detail::choose_offset_t<NumItemsT>;
1644
+ using InitT = cub::detail::it_value_t<ValuesInputIteratorT>; // the initial value has the value type of the values input iterator
1645
+
1646
+ // Initial value: value-initialized InitT (the documented ``0`` seed written at the start of each segment)
1647
+ InitT init_value{};
1648
+
1649
+ return DispatchScanByKey<
1650
+ KeysInputIteratorT,
1651
+ ValuesInputIteratorT,
1652
+ ValuesOutputIteratorT,
1653
+ EqualityOpT,
1654
+ ::cuda::std::plus<>,
1655
+ InitT,
1656
+ OffsetT>::Dispatch(d_temp_storage,
1657
+ temp_storage_bytes,
1658
+ d_keys_in,
1659
+ d_values_in,
1660
+ d_values_out,
1661
+ equality_op,
1662
+ ::cuda::std::plus<>{}, // sum-by-key is a scan-by-key with ::cuda::std::plus<>
1663
+ init_value,
1664
+ num_items,
1665
+ stream);
1666
+ }
1667
+
1668
+ //! @rst
1669
+ //! Computes a device-wide exclusive prefix scan-by-key using the
1670
+ //! specified binary associative ``scan_op`` functor. The key equality is defined by
1671
+ //! ``equality_op``. The ``init_value`` value is applied as the initial
1672
+ //! value, and is assigned to the beginning of each segment in ``d_values_out``.
1673
+ //!
1674
+ //! - Supports non-commutative scan operators.
1675
+ //! - Results are not deterministic for pseudo-associative operators (e.g.,
1676
+ //! addition of floating-point types). Results for pseudo-associative
1677
+ //! operators may vary from run to run. Additional details can be found in
1678
+ //! the @lookback description.
1679
+ //! - ``d_keys_in`` may equal ``d_values_out`` but the range
1680
+ //! ``[d_keys_in, d_keys_in + num_items)`` and the range
1681
+ //! ``[d_values_out, d_values_out + num_items)`` shall not overlap otherwise.
1682
+ //! - ``d_values_in`` may equal ``d_values_out`` but the range
1683
+ //! ``[d_values_in, d_values_in + num_items)`` and the range
1684
+ //! ``[d_values_out, d_values_out + num_items)`` shall not overlap otherwise.
1685
+ //! - @devicestorage
1686
+ //!
1687
+ //! Snippet
1688
+ //! +++++++++++++++++++++++++++++++++++++++++++++
1689
+ //!
1690
+ //! The code snippet below illustrates the exclusive prefix min-scan-by-key of an ``int`` device vector
1691
+ //!
1692
+ //! .. code-block:: c++
1693
+ //!
1694
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
1695
+ //! #include <cuda/std/climits> // for INT_MAX
1696
+ //!
1697
+ //! // CustomMin functor
1698
+ //! struct CustomMin
1699
+ //! {
1700
+ //! template <typename T>
1701
+ //! __host__ __device__ __forceinline__
1702
+ //! T operator()(const T &a, const T &b) const {
1703
+ //! return (b < a) ? b : a;
1704
+ //! }
1705
+ //! };
1706
+ //!
1707
+ //! // CustomEqual functor
1708
+ //! struct CustomEqual
1709
+ //! {
1710
+ //! template <typename T>
1711
+ //! __host__ __device__ __forceinline__
1712
+ //! bool operator()(const T &a, const T &b) const {
1713
+ //! return a == b;
1714
+ //! }
1715
+ //! };
1716
+ //!
1717
+ //! // Declare, allocate, and initialize device-accessible pointers for
1718
+ //! // input and output
1719
+ //! int num_items; // e.g., 7
1720
+ //! int *d_keys_in; // e.g., [0, 0, 1, 1, 1, 2, 2]
1721
+ //! int *d_values_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
1722
+ //! int *d_values_out; // e.g., [ , , , , , , ]
1723
+ //! CustomMin min_op;
1724
+ //! CustomEqual equality_op;
1725
+ //! ...
1726
+ //!
1727
+ //! // Determine temporary device storage requirements for exclusive
1728
+ //! // prefix scan
1729
+ //! void *d_temp_storage = nullptr;
1730
+ //! size_t temp_storage_bytes = 0;
1731
+ //! cub::DeviceScan::ExclusiveScanByKey(
1732
+ //! d_temp_storage, temp_storage_bytes,
1733
+ //! d_keys_in, d_values_in, d_values_out, min_op,
1734
+ //! (int) INT_MAX, num_items, equality_op);
1735
+ //!
1736
+ //! // Allocate temporary storage for exclusive prefix scan
1737
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
1738
+ //!
1739
+ //! // Run exclusive prefix min-scan
1740
+ //! cub::DeviceScan::ExclusiveScanByKey(
1741
+ //! d_temp_storage, temp_storage_bytes,
1742
+ //! d_keys_in, d_values_in, d_values_out, min_op,
1743
+ //! (int) INT_MAX, num_items, equality_op);
1744
+ //!
1745
+ //! // d_values_out <-- [2147483647, 8, 2147483647, 7, 5, 2147483647, 0]
1746
+ //!
1747
+ //! @endrst
1748
+ //!
1749
+ //! @tparam KeysInputIteratorT
1750
+ //! **[inferred]** Random-access input iterator type for reading scan keys inputs @iterator
1751
+ //!
1752
+ //! @tparam ValuesInputIteratorT
1753
+ //! **[inferred]** Random-access input iterator type for reading scan values inputs @iterator
1754
+ //!
1755
+ //! @tparam ValuesOutputIteratorT
1756
+ //! **[inferred]** Random-access output iterator type for writing scan values outputs @iterator
1757
+ //!
1758
+ //! @tparam ScanOpT
1759
+ //! **[inferred]** Binary associative scan functor type having member `T operator()(const T &a, const T &b)`
1760
+ //!
1761
+ //! @tparam InitValueT
1762
+ //! **[inferred]** Type of the `init_value`
1763
+ //!
1764
+ //! @tparam EqualityOpT
1765
+ //! **[inferred]** Functor type having member
1766
+ //! `bool operator()(const T &a, const T &b)` that defines the equality of keys
1767
+ //!
1768
+ //! @tparam NumItemsT
1769
+ //! **[inferred]** An integral type representing the number of input elements
1770
+ //!
1771
+ //! @param[in] d_temp_storage
1772
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
1773
+ //! required allocation size is written to `temp_storage_bytes` and no work is done.
1774
+ //!
1775
+ //! @param[in,out] temp_storage_bytes
1776
+ //! Reference to size in bytes of `d_temp_storage` allocation
1777
+ //!
1778
+ //! @param[in] d_keys_in
1779
+ //! Random-access input iterator to the input sequence of key items
1780
+ //!
1781
+ //! @param[in] d_values_in
1782
+ //! Random-access input iterator to the input sequence of value items
1783
+ //!
1784
+ //! @param[out] d_values_out
1785
+ //! Random-access output iterator to the output sequence of value items
1786
+ //!
1787
+ //! @param[in] scan_op
1788
+ //! Binary associative scan functor
1789
+ //!
1790
+ //! @param[in] init_value
1791
+ //! Initial value to seed the exclusive scan (and is assigned to the
1792
+ //! beginning of each segment in `d_values_out`)
1793
+ //!
1794
+ //! @param[in] num_items
1795
+ //! Total number of input items (i.e., the length of `d_keys_in` and
1796
+ //! `d_values_in`)
1797
+ //!
1798
+ //! @param[in] equality_op
1799
+ //! Binary functor that defines the equality of keys.
1800
+ //! Default is cuda::std::equal_to<>{}.
1801
+ //!
1802
+ //! @param[in] stream
1803
+ //! @rst
1804
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
1805
+ //! @endrst
1806
+ template <typename KeysInputIteratorT,
1807
+ typename ValuesInputIteratorT,
1808
+ typename ValuesOutputIteratorT,
1809
+ typename ScanOpT,
1810
+ typename InitValueT,
1811
+ typename EqualityOpT = ::cuda::std::equal_to<>,
1812
+ typename NumItemsT = uint32_t>
1813
+ CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScanByKey(
1814
+ void* d_temp_storage,
1815
+ size_t& temp_storage_bytes,
1816
+ KeysInputIteratorT d_keys_in,
1817
+ ValuesInputIteratorT d_values_in,
1818
+ ValuesOutputIteratorT d_values_out,
1819
+ ScanOpT scan_op,
1820
+ InitValueT init_value,
1821
+ NumItemsT num_items,
1822
+ EqualityOpT equality_op = EqualityOpT(),
1823
+ cudaStream_t stream = 0)
1824
+ {
1825
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::ExclusiveScanByKey");
1826
+
1827
+ // Unsigned integer type for global offsets
1828
+ using OffsetT = detail::choose_offset_t<NumItemsT>;
1829
+
1830
+ return DispatchScanByKey<
1831
+ KeysInputIteratorT,
1832
+ ValuesInputIteratorT,
1833
+ ValuesOutputIteratorT,
1834
+ EqualityOpT,
1835
+ ScanOpT,
1836
+ InitValueT,
1837
+ OffsetT>::Dispatch(d_temp_storage,
1838
+ temp_storage_bytes,
1839
+ d_keys_in,
1840
+ d_values_in,
1841
+ d_values_out,
1842
+ equality_op,
1843
+ scan_op,
1844
+ init_value,
1845
+ num_items,
1846
+ stream);
1847
+ }
1848
+
1849
+ //! @rst
1850
+ //! Computes a device-wide inclusive prefix sum-by-key with key equality defined by ``equality_op``.
1851
+ //!
1852
+ //! - Supports non-commutative sum operators.
1853
+ //! - Results are not deterministic for pseudo-associative operators (e.g.,
1854
+ //! addition of floating-point types). Results for pseudo-associative
1855
+ //! operators may vary from run to run. Additional details can be found in
1856
+ //! the @lookback description.
1857
+ //! - ``d_keys_in`` may equal ``d_values_out`` but the range
1858
+ //! ``[d_keys_in, d_keys_in + num_items)`` and the range
1859
+ //! ``[d_values_out, d_values_out + num_items)`` shall not overlap otherwise.
1860
+ //! - ``d_values_in`` may equal ``d_values_out`` but the range
1861
+ //! ``[d_values_in, d_values_in + num_items)`` and the range
1862
+ //! ``[d_values_out, d_values_out + num_items)`` shall not overlap otherwise.
1863
+ //! - @devicestorage
1864
+ //!
1865
+ //! Snippet
1866
+ //! +++++++++++++++++++++++++++++++++++++++++++++
1867
+ //!
1868
+ //! The code snippet below illustrates the inclusive prefix sum-by-key of an ``int`` device vector.
1869
+ //!
1870
+ //! .. code-block:: c++
1871
+ //!
1872
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
1873
+ //!
1874
+ //! // Declare, allocate, and initialize device-accessible pointers for
1875
+ //! // input and output
1876
+ //! int num_items; // e.g., 7
1877
+ //! int *d_keys_in; // e.g., [0, 0, 1, 1, 1, 2, 2]
1878
+ //! int *d_values_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
1879
+ //! int *d_values_out; // e.g., [ , , , , , , ]
1880
+ //! ...
1881
+ //!
1882
+ //! // Determine temporary device storage requirements for inclusive prefix sum
1883
+ //! void *d_temp_storage = nullptr;
1884
+ //! size_t temp_storage_bytes = 0;
1885
+ //! cub::DeviceScan::InclusiveSumByKey(
1886
+ //! d_temp_storage, temp_storage_bytes,
1887
+ //! d_keys_in, d_values_in, d_values_out, num_items);
1888
+ //!
1889
+ //! // Allocate temporary storage for inclusive prefix sum
1890
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
1891
+ //!
1892
+ //! // Run inclusive prefix sum
1893
+ //! cub::DeviceScan::InclusiveSumByKey(
1894
+ //! d_temp_storage, temp_storage_bytes,
1895
+ //! d_keys_in, d_values_in, d_values_out, num_items);
1896
+ //!
1897
+ //! // d_values_out <-- [8, 14, 7, 12, 15, 0, 9]
1898
+ //!
1899
+ //! @endrst
1900
+ //!
1901
+ //! @tparam KeysInputIteratorT
1902
+ //! **[inferred]** Random-access input iterator type for reading scan keys inputs @iterator
1903
+ //!
1904
+ //! @tparam ValuesInputIteratorT
1905
+ //! **[inferred]** Random-access input iterator type for reading scan values inputs @iterator
1906
+ //!
1907
+ //! @tparam ValuesOutputIteratorT
1908
+ //! **[inferred]** Random-access output iterator type for writing scan values outputs @iterator
1909
+ //!
1910
+ //! @tparam EqualityOpT
1911
+ //! **[inferred]** Functor type having member
1912
+ //! `bool operator()(const T &a, const T &b)` that defines the equality of keys
1913
+ //!
1914
+ //! @tparam NumItemsT
1915
+ //! **[inferred]** An integral type representing the number of input elements
1916
+ //!
1917
+ //! @param[in] d_temp_storage
1918
+ //! Device-accessible allocation of temporary storage.
1919
+ //! When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
1920
+ //!
1921
+ //! @param[in,out] temp_storage_bytes
1922
+ //! Reference to size in bytes of `d_temp_storage` allocation
1923
+ //!
1924
+ //! @param[in] d_keys_in
1925
+ //! Random-access input iterator to the input sequence of key items
1926
+ //!
1927
+ //! @param[in] d_values_in
1928
+ //! Random-access input iterator to the input sequence of value items
1929
+ //!
1930
+ //! @param[out] d_values_out
1931
+ //! Random-access output iterator to the output sequence of value items
1932
+ //!
1933
+ //! @param[in] num_items
1934
+ //! Total number of input items (i.e., the length of `d_keys_in` and `d_values_in`)
1935
+ //!
1936
+ //! @param[in] equality_op
1937
+ //! Binary functor that defines the equality of keys.
1938
+ //! Default is cuda::std::equal_to<>{}.
1939
+ //!
1940
+ //! @param[in] stream
1941
+ //! @rst
1942
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
1943
+ //! @endrst
1944
+ template <typename KeysInputIteratorT,
1945
+ typename ValuesInputIteratorT,
1946
+ typename ValuesOutputIteratorT,
1947
+ typename EqualityOpT = ::cuda::std::equal_to<>,
1948
+ typename NumItemsT = uint32_t>
1949
+ CUB_RUNTIME_FUNCTION static cudaError_t InclusiveSumByKey(
1950
+ void* d_temp_storage,
1951
+ size_t& temp_storage_bytes,
1952
+ KeysInputIteratorT d_keys_in,
1953
+ ValuesInputIteratorT d_values_in,
1954
+ ValuesOutputIteratorT d_values_out,
1955
+ NumItemsT num_items,
1956
+ EqualityOpT equality_op = EqualityOpT(),
1957
+ cudaStream_t stream = 0)
1958
+ {
1959
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::InclusiveSumByKey");
1960
+
1961
+ // Unsigned integer type for global offsets
1962
+ using OffsetT = detail::choose_offset_t<NumItemsT>;
1963
+
1964
+ return DispatchScanByKey<
1965
+ KeysInputIteratorT,
1966
+ ValuesInputIteratorT,
1967
+ ValuesOutputIteratorT,
1968
+ EqualityOpT,
1969
+ ::cuda::std::plus<>,
1970
+ NullType,
1971
+ OffsetT>::Dispatch(d_temp_storage,
1972
+ temp_storage_bytes,
1973
+ d_keys_in,
1974
+ d_values_in,
1975
+ d_values_out,
1976
+ equality_op,
1977
+ ::cuda::std::plus<>{},
1978
+ NullType{},
1979
+ num_items,
1980
+ stream);
1981
+ }
1982
+
1983
+ //! @rst
1984
+ //! Computes a device-wide inclusive prefix scan-by-key using the
1985
+ //! specified binary associative ``scan_op`` functor. The key equality is defined by ``equality_op``.
1986
+ //!
1987
+ //! - Supports non-commutative scan operators.
1988
+ //! - Results are not deterministic for pseudo-associative operators (e.g.,
1989
+ //! addition of floating-point types). Results for pseudo-associative
1990
+ //! operators may vary from run to run. Additional details can be found in
1991
+ //! the @lookback description.
1992
+ //! - ``d_keys_in`` may equal ``d_values_out`` but the range
1993
+ //! ``[d_keys_in, d_keys_in + num_items)`` and the range
1994
+ //! ``[d_values_out, d_values_out + num_items)`` shall not overlap otherwise.
1995
+ //! - ``d_values_in`` may equal ``d_values_out`` but the range
1996
+ //! ``[d_values_in, d_values_in + num_items)`` and the range
1997
+ //! ``[d_values_out, d_values_out + num_items)`` shall not overlap otherwise.
1998
+ //! - @devicestorage
1999
+ //!
2000
+ //! Snippet
2001
+ //! +++++++++++++++++++++++++++++++++++++++++++++
2002
+ //!
2003
+ //! The code snippet below illustrates the inclusive prefix min-scan-by-key of an ``int`` device vector.
2004
+ //!
2005
+ //! .. code-block:: c++
2006
+ //!
2007
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
2008
+ //! #include <cuda/std/climits> // for INT_MAX
2009
+ //!
2010
+ //! // CustomMin functor
2011
+ //! struct CustomMin
2012
+ //! {
2013
+ //! template <typename T>
2014
+ //! __host__ __device__ __forceinline__
2015
+ //! T operator()(const T &a, const T &b) const {
2016
+ //! return (b < a) ? b : a;
2017
+ //! }
2018
+ //! };
2019
+ //!
2020
+ //! // CustomEqual functor
2021
+ //! struct CustomEqual
2022
+ //! {
2023
+ //! template <typename T>
2024
+ //! __host__ __device__ __forceinline__
2025
+ //! bool operator()(const T &a, const T &b) const {
2026
+ //! return a == b;
2027
+ //! }
2028
+ //! };
2029
+ //!
2030
+ //! // Declare, allocate, and initialize device-accessible pointers for
2031
+ //! // input and output
2032
+ //! int num_items; // e.g., 7
2033
+ //! int *d_keys_in; // e.g., [0, 0, 1, 1, 1, 2, 2]
2034
+ //! int *d_values_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
2035
+ //! int *d_values_out; // e.g., [ , , , , , , ]
2036
+ //! CustomMin min_op;
2037
+ //! CustomEqual equality_op;
2038
+ //! ...
2039
+ //!
2040
+ //! // Determine temporary device storage requirements for inclusive prefix scan
2041
+ //! void *d_temp_storage = nullptr;
2042
+ //! size_t temp_storage_bytes = 0;
2043
+ //! cub::DeviceScan::InclusiveScanByKey(
2044
+ //! d_temp_storage, temp_storage_bytes,
2045
+ //! d_keys_in, d_values_in, d_values_out, min_op, num_items, equality_op);
2046
+ //!
2047
+ //! // Allocate temporary storage for inclusive prefix scan
2048
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
2049
+ //!
2050
+ //! // Run inclusive prefix min-scan
2051
+ //! cub::DeviceScan::InclusiveScanByKey(
2052
+ //! d_temp_storage, temp_storage_bytes,
2053
+ //! d_keys_in, d_values_in, d_values_out, min_op, num_items, equality_op);
2054
+ //!
2055
+ //! // d_values_out <-- [8, 6, 7, 5, 3, 0, 0]
2056
+ //!
2057
+ //! @endrst
2058
+ //!
2059
+ //! @tparam KeysInputIteratorT
2060
+ //! **[inferred]** Random-access input iterator type for reading scan keys inputs @iterator
2061
+ //!
2062
+ //! @tparam ValuesInputIteratorT
2063
+ //! **[inferred]** Random-access input iterator type for reading scan values inputs @iterator
2064
+ //!
2065
+ //! @tparam ValuesOutputIteratorT
2066
+ //! **[inferred]** Random-access output iterator type for writing scan values outputs @iterator
2067
+ //!
2068
+ //! @tparam ScanOpT
2069
+ //! **[inferred]** Binary associative scan functor type having member `T operator()(const T &a, const T &b)`
2070
+ //!
2071
+ //! @tparam EqualityOpT
2072
+ //! **[inferred]** Functor type having member
2073
+ //! `bool operator()(const T &a, const T &b)` that defines the equality of keys
2074
+ //!
2075
+ //! @tparam NumItemsT
2076
+ //! **[inferred]** An integral type representing the number of input elements
2077
+ //!
2078
+ //! @param[in] d_temp_storage
2079
+ //! Device-accessible allocation of temporary storage.
2080
+ //! When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
2081
+ //!
2082
+ //! @param[in,out] temp_storage_bytes
2083
+ //! Reference to size in bytes of `d_temp_storage` allocation
2084
+ //!
2085
+ //! @param[in] d_keys_in
2086
+ //! Random-access input iterator to the input sequence of key items
2087
+ //!
2088
+ //! @param[in] d_values_in
2089
+ //! Random-access input iterator to the input sequence of value items
2090
+ //!
2091
+ //! @param[out] d_values_out
2092
+ //! Random-access output iterator to the output sequence of value items
2093
+ //!
2094
+ //! @param[in] scan_op
2095
+ //! Binary associative scan functor
2096
+ //!
2097
+ //! @param[in] num_items
2098
+ //! Total number of input items (i.e., the length of `d_keys_in` and `d_values_in`)
2099
+ //!
2100
+ //! @param[in] equality_op
2101
+ //! Binary functor that defines the equality of keys.
2102
+ //! Default is cuda::std::equal_to<>{}.
2103
+ //!
2104
+ //! @param[in] stream
2105
+ //! @rst
2106
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
2107
+ //! @endrst
2108
+ template <typename KeysInputIteratorT,
2109
+ typename ValuesInputIteratorT,
2110
+ typename ValuesOutputIteratorT,
2111
+ typename ScanOpT,
2112
+ typename EqualityOpT = ::cuda::std::equal_to<>,
2113
+ typename NumItemsT = uint32_t>
2114
+ CUB_RUNTIME_FUNCTION static cudaError_t InclusiveScanByKey(
2115
+ void* d_temp_storage,
2116
+ size_t& temp_storage_bytes,
2117
+ KeysInputIteratorT d_keys_in,
2118
+ ValuesInputIteratorT d_values_in,
2119
+ ValuesOutputIteratorT d_values_out,
2120
+ ScanOpT scan_op,
2121
+ NumItemsT num_items,
2122
+ EqualityOpT equality_op = EqualityOpT(),
2123
+ cudaStream_t stream = 0)
2124
+ {
2125
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::InclusiveScanByKey");
2126
+
2127
+ // Unsigned integer type for global offsets
2128
+ using OffsetT = detail::choose_offset_t<NumItemsT>;
2129
+
2130
+ return DispatchScanByKey<
2131
+ KeysInputIteratorT,
2132
+ ValuesInputIteratorT,
2133
+ ValuesOutputIteratorT,
2134
+ EqualityOpT,
2135
+ ScanOpT,
2136
+ NullType,
2137
+ OffsetT>::Dispatch(d_temp_storage,
2138
+ temp_storage_bytes,
2139
+ d_keys_in,
2140
+ d_values_in,
2141
+ d_values_out,
2142
+ equality_op,
2143
+ scan_op,
2144
+ NullType(),
2145
+ num_items,
2146
+ stream);
2147
+ }
2148
+
2149
+ //! @} end member group
2150
+ };
2151
+
2152
+ CUB_NAMESPACE_END