cuda-cccl 0.3.3__cp313-cp313-win_amd64.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cuda-cccl might be problematic. Click here for more details.

Files changed (1968):
  1. cuda/cccl/__init__.py +27 -0
  2. cuda/cccl/_cuda_version_utils.py +24 -0
  3. cuda/cccl/cooperative/__init__.py +9 -0
  4. cuda/cccl/cooperative/experimental/__init__.py +24 -0
  5. cuda/cccl/headers/__init__.py +7 -0
  6. cuda/cccl/headers/include/__init__.py +1 -0
  7. cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +259 -0
  8. cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +1182 -0
  9. cuda/cccl/headers/include/cub/agent/agent_for.cuh +81 -0
  10. cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +709 -0
  11. cuda/cccl/headers/include/cub/agent/agent_merge.cuh +234 -0
  12. cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +748 -0
  13. cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +786 -0
  14. cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +286 -0
  15. cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +703 -0
  16. cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +555 -0
  17. cuda/cccl/headers/include/cub/agent/agent_reduce.cuh +619 -0
  18. cuda/cccl/headers/include/cub/agent/agent_reduce_by_key.cuh +806 -0
  19. cuda/cccl/headers/include/cub/agent/agent_rle.cuh +1124 -0
  20. cuda/cccl/headers/include/cub/agent/agent_scan.cuh +589 -0
  21. cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +474 -0
  22. cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +289 -0
  23. cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +1117 -0
  24. cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +346 -0
  25. cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +606 -0
  26. cuda/cccl/headers/include/cub/agent/agent_topk.cuh +764 -0
  27. cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +631 -0
  28. cuda/cccl/headers/include/cub/agent/single_pass_scan_operators.cuh +1424 -0
  29. cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +963 -0
  30. cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +1227 -0
  31. cuda/cccl/headers/include/cub/block/block_exchange.cuh +1313 -0
  32. cuda/cccl/headers/include/cub/block/block_histogram.cuh +424 -0
  33. cuda/cccl/headers/include/cub/block/block_load.cuh +1264 -0
  34. cuda/cccl/headers/include/cub/block/block_load_to_shared.cuh +432 -0
  35. cuda/cccl/headers/include/cub/block/block_merge_sort.cuh +800 -0
  36. cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +1225 -0
  37. cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +2196 -0
  38. cuda/cccl/headers/include/cub/block/block_raking_layout.cuh +150 -0
  39. cuda/cccl/headers/include/cub/block/block_reduce.cuh +667 -0
  40. cuda/cccl/headers/include/cub/block/block_run_length_decode.cuh +434 -0
  41. cuda/cccl/headers/include/cub/block/block_scan.cuh +2315 -0
  42. cuda/cccl/headers/include/cub/block/block_shuffle.cuh +346 -0
  43. cuda/cccl/headers/include/cub/block/block_store.cuh +1247 -0
  44. cuda/cccl/headers/include/cub/block/radix_rank_sort_operations.cuh +624 -0
  45. cuda/cccl/headers/include/cub/block/specializations/block_histogram_atomic.cuh +86 -0
  46. cuda/cccl/headers/include/cub/block/specializations/block_histogram_sort.cuh +240 -0
  47. cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking.cuh +252 -0
  48. cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking_commutative_only.cuh +238 -0
  49. cuda/cccl/headers/include/cub/block/specializations/block_reduce_warp_reductions.cuh +281 -0
  50. cuda/cccl/headers/include/cub/block/specializations/block_scan_raking.cuh +790 -0
  51. cuda/cccl/headers/include/cub/block/specializations/block_scan_warp_scans.cuh +538 -0
  52. cuda/cccl/headers/include/cub/config.cuh +53 -0
  53. cuda/cccl/headers/include/cub/cub.cuh +120 -0
  54. cuda/cccl/headers/include/cub/detail/array_utils.cuh +78 -0
  55. cuda/cccl/headers/include/cub/detail/choose_offset.cuh +161 -0
  56. cuda/cccl/headers/include/cub/detail/detect_cuda_runtime.cuh +74 -0
  57. cuda/cccl/headers/include/cub/detail/device_double_buffer.cuh +96 -0
  58. cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +62 -0
  59. cuda/cccl/headers/include/cub/detail/fast_modulo_division.cuh +253 -0
  60. cuda/cccl/headers/include/cub/detail/integer_utils.cuh +88 -0
  61. cuda/cccl/headers/include/cub/detail/launcher/cuda_driver.cuh +142 -0
  62. cuda/cccl/headers/include/cub/detail/launcher/cuda_runtime.cuh +100 -0
  63. cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +114 -0
  64. cuda/cccl/headers/include/cub/detail/ptx-json/README.md +71 -0
  65. cuda/cccl/headers/include/cub/detail/ptx-json/array.h +68 -0
  66. cuda/cccl/headers/include/cub/detail/ptx-json/json.h +62 -0
  67. cuda/cccl/headers/include/cub/detail/ptx-json/object.h +100 -0
  68. cuda/cccl/headers/include/cub/detail/ptx-json/string.h +53 -0
  69. cuda/cccl/headers/include/cub/detail/ptx-json/value.h +95 -0
  70. cuda/cccl/headers/include/cub/detail/ptx-json-parser.h +63 -0
  71. cuda/cccl/headers/include/cub/detail/rfa.cuh +731 -0
  72. cuda/cccl/headers/include/cub/detail/strong_load.cuh +189 -0
  73. cuda/cccl/headers/include/cub/detail/strong_store.cuh +220 -0
  74. cuda/cccl/headers/include/cub/detail/temporary_storage.cuh +384 -0
  75. cuda/cccl/headers/include/cub/detail/type_traits.cuh +187 -0
  76. cuda/cccl/headers/include/cub/detail/uninitialized_copy.cuh +73 -0
  77. cuda/cccl/headers/include/cub/detail/unsafe_bitcast.cuh +56 -0
  78. cuda/cccl/headers/include/cub/device/device_adjacent_difference.cuh +596 -0
  79. cuda/cccl/headers/include/cub/device/device_copy.cuh +276 -0
  80. cuda/cccl/headers/include/cub/device/device_for.cuh +1063 -0
  81. cuda/cccl/headers/include/cub/device/device_histogram.cuh +1509 -0
  82. cuda/cccl/headers/include/cub/device/device_memcpy.cuh +195 -0
  83. cuda/cccl/headers/include/cub/device/device_merge.cuh +203 -0
  84. cuda/cccl/headers/include/cub/device/device_merge_sort.cuh +979 -0
  85. cuda/cccl/headers/include/cub/device/device_partition.cuh +668 -0
  86. cuda/cccl/headers/include/cub/device/device_radix_sort.cuh +3437 -0
  87. cuda/cccl/headers/include/cub/device/device_reduce.cuh +2518 -0
  88. cuda/cccl/headers/include/cub/device/device_run_length_encode.cuh +370 -0
  89. cuda/cccl/headers/include/cub/device/device_scan.cuh +2212 -0
  90. cuda/cccl/headers/include/cub/device/device_segmented_radix_sort.cuh +1496 -0
  91. cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +1430 -0
  92. cuda/cccl/headers/include/cub/device/device_segmented_sort.cuh +2811 -0
  93. cuda/cccl/headers/include/cub/device/device_select.cuh +1228 -0
  94. cuda/cccl/headers/include/cub/device/device_topk.cuh +511 -0
  95. cuda/cccl/headers/include/cub/device/device_transform.cuh +668 -0
  96. cuda/cccl/headers/include/cub/device/dispatch/dispatch_adjacent_difference.cuh +315 -0
  97. cuda/cccl/headers/include/cub/device/dispatch/dispatch_batch_memcpy.cuh +719 -0
  98. cuda/cccl/headers/include/cub/device/dispatch/dispatch_common.cuh +43 -0
  99. cuda/cccl/headers/include/cub/device/dispatch/dispatch_copy_mdspan.cuh +79 -0
  100. cuda/cccl/headers/include/cub/device/dispatch/dispatch_for.cuh +198 -0
  101. cuda/cccl/headers/include/cub/device/dispatch/dispatch_histogram.cuh +1046 -0
  102. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +303 -0
  103. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge_sort.cuh +473 -0
  104. cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +1744 -0
  105. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +1310 -0
  106. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_by_key.cuh +655 -0
  107. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +531 -0
  108. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +313 -0
  109. cuda/cccl/headers/include/cub/device/dispatch/dispatch_rle.cuh +615 -0
  110. cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan.cuh +517 -0
  111. cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan_by_key.cuh +602 -0
  112. cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +975 -0
  113. cuda/cccl/headers/include/cub/device/dispatch/dispatch_select_if.cuh +842 -0
  114. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +341 -0
  115. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +440 -0
  116. cuda/cccl/headers/include/cub/device/dispatch/dispatch_three_way_partition.cuh +389 -0
  117. cuda/cccl/headers/include/cub/device/dispatch/dispatch_topk.cuh +627 -0
  118. cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +569 -0
  119. cuda/cccl/headers/include/cub/device/dispatch/dispatch_unique_by_key.cuh +545 -0
  120. cuda/cccl/headers/include/cub/device/dispatch/kernels/for_each.cuh +261 -0
  121. cuda/cccl/headers/include/cub/device/dispatch/kernels/histogram.cuh +505 -0
  122. cuda/cccl/headers/include/cub/device/dispatch/kernels/merge_sort.cuh +334 -0
  123. cuda/cccl/headers/include/cub/device/dispatch/kernels/radix_sort.cuh +803 -0
  124. cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +583 -0
  125. cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +189 -0
  126. cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +321 -0
  127. cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_sort.cuh +522 -0
  128. cuda/cccl/headers/include/cub/device/dispatch/kernels/three_way_partition.cuh +201 -0
  129. cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +1028 -0
  130. cuda/cccl/headers/include/cub/device/dispatch/kernels/unique_by_key.cuh +176 -0
  131. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +67 -0
  132. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +118 -0
  133. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +60 -0
  134. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +275 -0
  135. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +76 -0
  136. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +126 -0
  137. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +1065 -0
  138. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce.cuh +493 -0
  139. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +942 -0
  140. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +673 -0
  141. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +618 -0
  142. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +1010 -0
  143. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +398 -0
  144. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_select_if.cuh +1588 -0
  145. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +440 -0
  146. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_topk.cuh +85 -0
  147. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +481 -0
  148. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +884 -0
  149. cuda/cccl/headers/include/cub/grid/grid_even_share.cuh +227 -0
  150. cuda/cccl/headers/include/cub/grid/grid_mapping.cuh +106 -0
  151. cuda/cccl/headers/include/cub/grid/grid_queue.cuh +202 -0
  152. cuda/cccl/headers/include/cub/iterator/arg_index_input_iterator.cuh +254 -0
  153. cuda/cccl/headers/include/cub/iterator/cache_modified_input_iterator.cuh +259 -0
  154. cuda/cccl/headers/include/cub/iterator/cache_modified_output_iterator.cuh +250 -0
  155. cuda/cccl/headers/include/cub/iterator/tex_obj_input_iterator.cuh +320 -0
  156. cuda/cccl/headers/include/cub/thread/thread_load.cuh +349 -0
  157. cuda/cccl/headers/include/cub/thread/thread_operators.cuh +688 -0
  158. cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +548 -0
  159. cuda/cccl/headers/include/cub/thread/thread_scan.cuh +498 -0
  160. cuda/cccl/headers/include/cub/thread/thread_search.cuh +199 -0
  161. cuda/cccl/headers/include/cub/thread/thread_simd.cuh +458 -0
  162. cuda/cccl/headers/include/cub/thread/thread_sort.cuh +102 -0
  163. cuda/cccl/headers/include/cub/thread/thread_store.cuh +365 -0
  164. cuda/cccl/headers/include/cub/util_allocator.cuh +921 -0
  165. cuda/cccl/headers/include/cub/util_arch.cuh +167 -0
  166. cuda/cccl/headers/include/cub/util_cpp_dialect.cuh +95 -0
  167. cuda/cccl/headers/include/cub/util_debug.cuh +207 -0
  168. cuda/cccl/headers/include/cub/util_device.cuh +800 -0
  169. cuda/cccl/headers/include/cub/util_macro.cuh +97 -0
  170. cuda/cccl/headers/include/cub/util_math.cuh +118 -0
  171. cuda/cccl/headers/include/cub/util_namespace.cuh +176 -0
  172. cuda/cccl/headers/include/cub/util_policy_wrapper_t.cuh +55 -0
  173. cuda/cccl/headers/include/cub/util_ptx.cuh +513 -0
  174. cuda/cccl/headers/include/cub/util_temporary_storage.cuh +122 -0
  175. cuda/cccl/headers/include/cub/util_type.cuh +1120 -0
  176. cuda/cccl/headers/include/cub/util_vsmem.cuh +253 -0
  177. cuda/cccl/headers/include/cub/version.cuh +89 -0
  178. cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_shfl.cuh +329 -0
  179. cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_smem.cuh +177 -0
  180. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +737 -0
  181. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +408 -0
  182. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +952 -0
  183. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_smem.cuh +715 -0
  184. cuda/cccl/headers/include/cub/warp/warp_exchange.cuh +405 -0
  185. cuda/cccl/headers/include/cub/warp/warp_load.cuh +614 -0
  186. cuda/cccl/headers/include/cub/warp/warp_merge_sort.cuh +169 -0
  187. cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +829 -0
  188. cuda/cccl/headers/include/cub/warp/warp_scan.cuh +1890 -0
  189. cuda/cccl/headers/include/cub/warp/warp_store.cuh +521 -0
  190. cuda/cccl/headers/include/cub/warp/warp_utils.cuh +61 -0
  191. cuda/cccl/headers/include/cuda/__algorithm/common.h +68 -0
  192. cuda/cccl/headers/include/cuda/__algorithm/copy.h +196 -0
  193. cuda/cccl/headers/include/cuda/__algorithm/fill.h +107 -0
  194. cuda/cccl/headers/include/cuda/__annotated_ptr/access_property.h +165 -0
  195. cuda/cccl/headers/include/cuda/__annotated_ptr/access_property_encoding.h +172 -0
  196. cuda/cccl/headers/include/cuda/__annotated_ptr/annotated_ptr.h +217 -0
  197. cuda/cccl/headers/include/cuda/__annotated_ptr/annotated_ptr_base.h +100 -0
  198. cuda/cccl/headers/include/cuda/__annotated_ptr/apply_access_property.h +83 -0
  199. cuda/cccl/headers/include/cuda/__annotated_ptr/associate_access_property.h +128 -0
  200. cuda/cccl/headers/include/cuda/__annotated_ptr/createpolicy.h +210 -0
  201. cuda/cccl/headers/include/cuda/__atomic/atomic.h +145 -0
  202. cuda/cccl/headers/include/cuda/__barrier/async_contract_fulfillment.h +39 -0
  203. cuda/cccl/headers/include/cuda/__barrier/barrier.h +65 -0
  204. cuda/cccl/headers/include/cuda/__barrier/barrier_arrive_tx.h +102 -0
  205. cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +487 -0
  206. cuda/cccl/headers/include/cuda/__barrier/barrier_expect_tx.h +74 -0
  207. cuda/cccl/headers/include/cuda/__barrier/barrier_native_handle.h +45 -0
  208. cuda/cccl/headers/include/cuda/__barrier/barrier_thread_scope.h +60 -0
  209. cuda/cccl/headers/include/cuda/__bit/bit_reverse.h +171 -0
  210. cuda/cccl/headers/include/cuda/__bit/bitfield.h +122 -0
  211. cuda/cccl/headers/include/cuda/__bit/bitmask.h +90 -0
  212. cuda/cccl/headers/include/cuda/__cccl_config +37 -0
  213. cuda/cccl/headers/include/cuda/__cmath/ceil_div.h +124 -0
  214. cuda/cccl/headers/include/cuda/__cmath/fast_modulo_division.h +178 -0
  215. cuda/cccl/headers/include/cuda/__cmath/ilog.h +195 -0
  216. cuda/cccl/headers/include/cuda/__cmath/ipow.h +107 -0
  217. cuda/cccl/headers/include/cuda/__cmath/isqrt.h +80 -0
  218. cuda/cccl/headers/include/cuda/__cmath/mul_hi.h +146 -0
  219. cuda/cccl/headers/include/cuda/__cmath/neg.h +47 -0
  220. cuda/cccl/headers/include/cuda/__cmath/pow2.h +74 -0
  221. cuda/cccl/headers/include/cuda/__cmath/round_down.h +102 -0
  222. cuda/cccl/headers/include/cuda/__cmath/round_up.h +104 -0
  223. cuda/cccl/headers/include/cuda/__cmath/uabs.h +57 -0
  224. cuda/cccl/headers/include/cuda/__complex/complex.h +238 -0
  225. cuda/cccl/headers/include/cuda/__complex/get_real_imag.h +89 -0
  226. cuda/cccl/headers/include/cuda/__complex/traits.h +64 -0
  227. cuda/cccl/headers/include/cuda/__complex_ +28 -0
  228. cuda/cccl/headers/include/cuda/__device/all_devices.h +140 -0
  229. cuda/cccl/headers/include/cuda/__device/arch_id.h +176 -0
  230. cuda/cccl/headers/include/cuda/__device/arch_traits.h +537 -0
  231. cuda/cccl/headers/include/cuda/__device/attributes.h +772 -0
  232. cuda/cccl/headers/include/cuda/__device/compute_capability.h +171 -0
  233. cuda/cccl/headers/include/cuda/__device/device_ref.h +156 -0
  234. cuda/cccl/headers/include/cuda/__device/physical_device.h +172 -0
  235. cuda/cccl/headers/include/cuda/__driver/driver_api.h +835 -0
  236. cuda/cccl/headers/include/cuda/__event/event.h +171 -0
  237. cuda/cccl/headers/include/cuda/__event/event_ref.h +157 -0
  238. cuda/cccl/headers/include/cuda/__event/timed_event.h +120 -0
  239. cuda/cccl/headers/include/cuda/__execution/determinism.h +91 -0
  240. cuda/cccl/headers/include/cuda/__execution/output_ordering.h +89 -0
  241. cuda/cccl/headers/include/cuda/__execution/require.h +75 -0
  242. cuda/cccl/headers/include/cuda/__execution/tune.h +70 -0
  243. cuda/cccl/headers/include/cuda/__functional/address_stability.h +131 -0
  244. cuda/cccl/headers/include/cuda/__functional/for_each_canceled.h +321 -0
  245. cuda/cccl/headers/include/cuda/__functional/maximum.h +58 -0
  246. cuda/cccl/headers/include/cuda/__functional/minimum.h +58 -0
  247. cuda/cccl/headers/include/cuda/__functional/proclaim_return_type.h +108 -0
  248. cuda/cccl/headers/include/cuda/__fwd/barrier.h +38 -0
  249. cuda/cccl/headers/include/cuda/__fwd/barrier_native_handle.h +42 -0
  250. cuda/cccl/headers/include/cuda/__fwd/complex.h +48 -0
  251. cuda/cccl/headers/include/cuda/__fwd/devices.h +44 -0
  252. cuda/cccl/headers/include/cuda/__fwd/get_stream.h +38 -0
  253. cuda/cccl/headers/include/cuda/__fwd/pipeline.h +37 -0
  254. cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +58 -0
  255. cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +315 -0
  256. cuda/cccl/headers/include/cuda/__iterator/counting_iterator.h +483 -0
  257. cuda/cccl/headers/include/cuda/__iterator/discard_iterator.h +324 -0
  258. cuda/cccl/headers/include/cuda/__iterator/permutation_iterator.h +456 -0
  259. cuda/cccl/headers/include/cuda/__iterator/shuffle_iterator.h +334 -0
  260. cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +418 -0
  261. cuda/cccl/headers/include/cuda/__iterator/tabulate_output_iterator.h +367 -0
  262. cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +528 -0
  263. cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +527 -0
  264. cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +486 -0
  265. cuda/cccl/headers/include/cuda/__iterator/zip_common.h +148 -0
  266. cuda/cccl/headers/include/cuda/__iterator/zip_function.h +112 -0
  267. cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +557 -0
  268. cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +592 -0
  269. cuda/cccl/headers/include/cuda/__latch/latch.h +44 -0
  270. cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +533 -0
  271. cuda/cccl/headers/include/cuda/__mdspan/host_device_mdspan.h +238 -0
  272. cuda/cccl/headers/include/cuda/__mdspan/restrict_accessor.h +152 -0
  273. cuda/cccl/headers/include/cuda/__mdspan/restrict_mdspan.h +117 -0
  274. cuda/cccl/headers/include/cuda/__memcpy_async/check_preconditions.h +79 -0
  275. cuda/cccl/headers/include/cuda/__memcpy_async/completion_mechanism.h +47 -0
  276. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_bulk_shared_global.h +60 -0
  277. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_fallback.h +72 -0
  278. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_shared_global.h +148 -0
  279. cuda/cccl/headers/include/cuda/__memcpy_async/dispatch_memcpy_async.h +165 -0
  280. cuda/cccl/headers/include/cuda/__memcpy_async/is_local_smem_barrier.h +53 -0
  281. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async.h +179 -0
  282. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_barrier.h +99 -0
  283. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_tx.h +104 -0
  284. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_completion.h +170 -0
  285. cuda/cccl/headers/include/cuda/__memcpy_async/try_get_barrier_handle.h +59 -0
  286. cuda/cccl/headers/include/cuda/__memory/address_space.h +227 -0
  287. cuda/cccl/headers/include/cuda/__memory/align_down.h +56 -0
  288. cuda/cccl/headers/include/cuda/__memory/align_up.h +56 -0
  289. cuda/cccl/headers/include/cuda/__memory/aligned_size.h +61 -0
  290. cuda/cccl/headers/include/cuda/__memory/check_address.h +111 -0
  291. cuda/cccl/headers/include/cuda/__memory/discard_memory.h +64 -0
  292. cuda/cccl/headers/include/cuda/__memory/get_device_address.h +58 -0
  293. cuda/cccl/headers/include/cuda/__memory/is_aligned.h +47 -0
  294. cuda/cccl/headers/include/cuda/__memory/ptr_in_range.h +93 -0
  295. cuda/cccl/headers/include/cuda/__memory/ptr_rebind.h +75 -0
  296. cuda/cccl/headers/include/cuda/__memory_resource/get_memory_resource.h +82 -0
  297. cuda/cccl/headers/include/cuda/__memory_resource/get_property.h +153 -0
  298. cuda/cccl/headers/include/cuda/__memory_resource/properties.h +113 -0
  299. cuda/cccl/headers/include/cuda/__memory_resource/resource.h +125 -0
  300. cuda/cccl/headers/include/cuda/__memory_resource/resource_ref.h +652 -0
  301. cuda/cccl/headers/include/cuda/__numeric/add_overflow.h +306 -0
  302. cuda/cccl/headers/include/cuda/__numeric/narrow.h +108 -0
  303. cuda/cccl/headers/include/cuda/__numeric/overflow_cast.h +59 -0
  304. cuda/cccl/headers/include/cuda/__numeric/overflow_result.h +43 -0
  305. cuda/cccl/headers/include/cuda/__nvtx/nvtx.h +120 -0
  306. cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +2983 -0
  307. cuda/cccl/headers/include/cuda/__ptx/instructions/barrier_cluster.h +43 -0
  308. cuda/cccl/headers/include/cuda/__ptx/instructions/bfind.h +41 -0
  309. cuda/cccl/headers/include/cuda/__ptx/instructions/bmsk.h +41 -0
  310. cuda/cccl/headers/include/cuda/__ptx/instructions/clusterlaunchcontrol.h +41 -0
  311. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk.h +44 -0
  312. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_commit_group.h +43 -0
  313. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h +45 -0
  314. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_wait_group.h +43 -0
  315. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_mbarrier_arrive.h +42 -0
  316. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_reduce_async_bulk.h +60 -0
  317. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_reduce_async_bulk_tensor.h +43 -0
  318. cuda/cccl/headers/include/cuda/__ptx/instructions/elect_sync.h +41 -0
  319. cuda/cccl/headers/include/cuda/__ptx/instructions/exit.h +41 -0
  320. cuda/cccl/headers/include/cuda/__ptx/instructions/fence.h +49 -0
  321. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/barrier_cluster.h +115 -0
  322. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/bfind.h +190 -0
  323. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/bmsk.h +54 -0
  324. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/clusterlaunchcontrol.h +242 -0
  325. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk.h +197 -0
  326. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.h +25 -0
  327. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.h +54 -0
  328. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.h +997 -0
  329. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_gather_scatter.h +318 -0
  330. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.h +671 -0
  331. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.h +46 -0
  332. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive.h +26 -0
  333. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive_noinc.h +26 -0
  334. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.h +1470 -0
  335. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.h +132 -0
  336. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.h +132 -0
  337. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.h +601 -0
  338. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/elect_sync.h +36 -0
  339. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/exit.h +25 -0
  340. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence.h +208 -0
  341. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.h +31 -0
  342. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_alias.h +25 -0
  343. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_async.h +58 -0
  344. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_async_generic_sync_restrict.h +64 -0
  345. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.h +102 -0
  346. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_sync_restrict.h +64 -0
  347. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/get_sreg.h +949 -0
  348. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/getctarank.h +32 -0
  349. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/ld.h +5542 -0
  350. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive.h +399 -0
  351. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.h +184 -0
  352. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.h +34 -0
  353. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_expect_tx.h +102 -0
  354. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_init.h +27 -0
  355. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.h +143 -0
  356. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.h +144 -0
  357. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.h +286 -0
  358. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.h +290 -0
  359. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_ld_reduce.h +2202 -0
  360. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_red.h +1362 -0
  361. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_st.h +236 -0
  362. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/prmt.h +230 -0
  363. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/red_async.h +460 -0
  364. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/shl.h +96 -0
  365. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/shr.h +168 -0
  366. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st.h +1490 -0
  367. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st_async.h +123 -0
  368. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st_bulk.h +31 -0
  369. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_alloc.h +132 -0
  370. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_commit.h +99 -0
  371. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_cp.h +765 -0
  372. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_fence.h +58 -0
  373. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_ld.h +4927 -0
  374. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_mma.h +4291 -0
  375. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_mma_ws.h +7110 -0
  376. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_shift.h +42 -0
  377. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_st.h +5063 -0
  378. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_wait.h +56 -0
  379. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.h +71 -0
  380. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tensormap_replace.h +1030 -0
  381. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/trap.h +25 -0
  382. cuda/cccl/headers/include/cuda/__ptx/instructions/get_sreg.h +43 -0
  383. cuda/cccl/headers/include/cuda/__ptx/instructions/getctarank.h +43 -0
  384. cuda/cccl/headers/include/cuda/__ptx/instructions/ld.h +41 -0
  385. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_arrive.h +45 -0
  386. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_expect_tx.h +41 -0
  387. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_init.h +43 -0
  388. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_wait.h +46 -0
  389. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_ld_reduce.h +41 -0
  390. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_red.h +41 -0
  391. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_st.h +41 -0
  392. cuda/cccl/headers/include/cuda/__ptx/instructions/prmt.h +41 -0
  393. cuda/cccl/headers/include/cuda/__ptx/instructions/red_async.h +43 -0
  394. cuda/cccl/headers/include/cuda/__ptx/instructions/shfl_sync.h +244 -0
  395. cuda/cccl/headers/include/cuda/__ptx/instructions/shl.h +41 -0
  396. cuda/cccl/headers/include/cuda/__ptx/instructions/shr.h +41 -0
  397. cuda/cccl/headers/include/cuda/__ptx/instructions/st.h +41 -0
  398. cuda/cccl/headers/include/cuda/__ptx/instructions/st_async.h +43 -0
  399. cuda/cccl/headers/include/cuda/__ptx/instructions/st_bulk.h +41 -0
  400. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_alloc.h +41 -0
  401. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_commit.h +41 -0
  402. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_cp.h +41 -0
  403. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_fence.h +41 -0
  404. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_ld.h +41 -0
  405. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_mma.h +41 -0
  406. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_mma_ws.h +41 -0
  407. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_shift.h +41 -0
  408. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_st.h +41 -0
  409. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_wait.h +41 -0
  410. cuda/cccl/headers/include/cuda/__ptx/instructions/tensormap_cp_fenceproxy.h +43 -0
  411. cuda/cccl/headers/include/cuda/__ptx/instructions/tensormap_replace.h +43 -0
  412. cuda/cccl/headers/include/cuda/__ptx/instructions/trap.h +41 -0
  413. cuda/cccl/headers/include/cuda/__ptx/pragmas/enable_smem_spilling.h +47 -0
  414. cuda/cccl/headers/include/cuda/__ptx/ptx_dot_variants.h +230 -0
  415. cuda/cccl/headers/include/cuda/__ptx/ptx_helper_functions.h +176 -0
  416. cuda/cccl/headers/include/cuda/__random/feistel_bijection.h +105 -0
  417. cuda/cccl/headers/include/cuda/__random/random_bijection.h +88 -0
  418. cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +99 -0
  419. cuda/cccl/headers/include/cuda/__runtime/types.h +41 -0
  420. cuda/cccl/headers/include/cuda/__semaphore/counting_semaphore.h +53 -0
  421. cuda/cccl/headers/include/cuda/__stream/get_stream.h +110 -0
  422. cuda/cccl/headers/include/cuda/__stream/stream.h +141 -0
  423. cuda/cccl/headers/include/cuda/__stream/stream_ref.h +303 -0
  424. cuda/cccl/headers/include/cuda/__type_traits/is_floating_point.h +47 -0
  425. cuda/cccl/headers/include/cuda/__type_traits/is_specialization_of.h +37 -0
  426. cuda/cccl/headers/include/cuda/__utility/__basic_any/access.h +88 -0
  427. cuda/cccl/headers/include/cuda/__utility/__basic_any/any_cast.h +83 -0
  428. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_base.h +148 -0
  429. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_from.h +96 -0
  430. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_fwd.h +128 -0
  431. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_ptr.h +304 -0
  432. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_ref.h +337 -0
  433. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_value.h +590 -0
  434. cuda/cccl/headers/include/cuda/__utility/__basic_any/conversions.h +169 -0
  435. cuda/cccl/headers/include/cuda/__utility/__basic_any/dynamic_any_cast.h +107 -0
  436. cuda/cccl/headers/include/cuda/__utility/__basic_any/interfaces.h +359 -0
  437. cuda/cccl/headers/include/cuda/__utility/__basic_any/iset.h +142 -0
  438. cuda/cccl/headers/include/cuda/__utility/__basic_any/overrides.h +64 -0
  439. cuda/cccl/headers/include/cuda/__utility/__basic_any/rtti.h +257 -0
  440. cuda/cccl/headers/include/cuda/__utility/__basic_any/semiregular.h +322 -0
  441. cuda/cccl/headers/include/cuda/__utility/__basic_any/storage.h +79 -0
  442. cuda/cccl/headers/include/cuda/__utility/__basic_any/tagged_ptr.h +58 -0
  443. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtcall.h +162 -0
  444. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_functions.h +184 -0
  445. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_ptrs.h +80 -0
  446. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_tables.h +155 -0
  447. cuda/cccl/headers/include/cuda/__utility/basic_any.h +507 -0
  448. cuda/cccl/headers/include/cuda/__utility/immovable.h +50 -0
  449. cuda/cccl/headers/include/cuda/__utility/in_range.h +65 -0
  450. cuda/cccl/headers/include/cuda/__utility/inherit.h +36 -0
  451. cuda/cccl/headers/include/cuda/__utility/no_init.h +29 -0
  452. cuda/cccl/headers/include/cuda/__utility/static_for.h +79 -0
  453. cuda/cccl/headers/include/cuda/__warp/lane_mask.h +326 -0
  454. cuda/cccl/headers/include/cuda/__warp/warp_match_all.h +65 -0
  455. cuda/cccl/headers/include/cuda/__warp/warp_shuffle.h +251 -0
  456. cuda/cccl/headers/include/cuda/access_property +26 -0
  457. cuda/cccl/headers/include/cuda/algorithm +27 -0
  458. cuda/cccl/headers/include/cuda/annotated_ptr +29 -0
  459. cuda/cccl/headers/include/cuda/atomic +27 -0
  460. cuda/cccl/headers/include/cuda/barrier +267 -0
  461. cuda/cccl/headers/include/cuda/bit +29 -0
  462. cuda/cccl/headers/include/cuda/cmath +37 -0
  463. cuda/cccl/headers/include/cuda/devices +33 -0
  464. cuda/cccl/headers/include/cuda/discard_memory +32 -0
  465. cuda/cccl/headers/include/cuda/functional +32 -0
  466. cuda/cccl/headers/include/cuda/iterator +39 -0
  467. cuda/cccl/headers/include/cuda/latch +27 -0
  468. cuda/cccl/headers/include/cuda/mdspan +28 -0
  469. cuda/cccl/headers/include/cuda/memory +35 -0
  470. cuda/cccl/headers/include/cuda/memory_resource +35 -0
  471. cuda/cccl/headers/include/cuda/numeric +29 -0
  472. cuda/cccl/headers/include/cuda/pipeline +579 -0
  473. cuda/cccl/headers/include/cuda/ptx +129 -0
  474. cuda/cccl/headers/include/cuda/semaphore +31 -0
  475. cuda/cccl/headers/include/cuda/std/__algorithm/adjacent_find.h +59 -0
  476. cuda/cccl/headers/include/cuda/std/__algorithm/all_of.h +45 -0
  477. cuda/cccl/headers/include/cuda/std/__algorithm/any_of.h +45 -0
  478. cuda/cccl/headers/include/cuda/std/__algorithm/binary_search.h +53 -0
  479. cuda/cccl/headers/include/cuda/std/__algorithm/clamp.h +48 -0
  480. cuda/cccl/headers/include/cuda/std/__algorithm/comp.h +58 -0
  481. cuda/cccl/headers/include/cuda/std/__algorithm/comp_ref_type.h +85 -0
  482. cuda/cccl/headers/include/cuda/std/__algorithm/copy.h +142 -0
  483. cuda/cccl/headers/include/cuda/std/__algorithm/copy_backward.h +80 -0
  484. cuda/cccl/headers/include/cuda/std/__algorithm/copy_if.h +47 -0
  485. cuda/cccl/headers/include/cuda/std/__algorithm/copy_n.h +73 -0
  486. cuda/cccl/headers/include/cuda/std/__algorithm/count.h +49 -0
  487. cuda/cccl/headers/include/cuda/std/__algorithm/count_if.h +49 -0
  488. cuda/cccl/headers/include/cuda/std/__algorithm/equal.h +128 -0
  489. cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +101 -0
  490. cuda/cccl/headers/include/cuda/std/__algorithm/fill.h +58 -0
  491. cuda/cccl/headers/include/cuda/std/__algorithm/fill_n.h +51 -0
  492. cuda/cccl/headers/include/cuda/std/__algorithm/find.h +62 -0
  493. cuda/cccl/headers/include/cuda/std/__algorithm/find_end.h +225 -0
  494. cuda/cccl/headers/include/cuda/std/__algorithm/find_first_of.h +73 -0
  495. cuda/cccl/headers/include/cuda/std/__algorithm/find_if.h +46 -0
  496. cuda/cccl/headers/include/cuda/std/__algorithm/find_if_not.h +46 -0
  497. cuda/cccl/headers/include/cuda/std/__algorithm/for_each.h +42 -0
  498. cuda/cccl/headers/include/cuda/std/__algorithm/for_each_n.h +48 -0
  499. cuda/cccl/headers/include/cuda/std/__algorithm/generate.h +41 -0
  500. cuda/cccl/headers/include/cuda/std/__algorithm/generate_n.h +46 -0
  501. cuda/cccl/headers/include/cuda/std/__algorithm/half_positive.h +49 -0
  502. cuda/cccl/headers/include/cuda/std/__algorithm/in_fun_result.h +55 -0
  503. cuda/cccl/headers/include/cuda/std/__algorithm/includes.h +90 -0
  504. cuda/cccl/headers/include/cuda/std/__algorithm/is_heap.h +50 -0
  505. cuda/cccl/headers/include/cuda/std/__algorithm/is_heap_until.h +83 -0
  506. cuda/cccl/headers/include/cuda/std/__algorithm/is_partitioned.h +57 -0
  507. cuda/cccl/headers/include/cuda/std/__algorithm/is_permutation.h +252 -0
  508. cuda/cccl/headers/include/cuda/std/__algorithm/is_sorted.h +49 -0
  509. cuda/cccl/headers/include/cuda/std/__algorithm/is_sorted_until.h +68 -0
  510. cuda/cccl/headers/include/cuda/std/__algorithm/iter_swap.h +82 -0
  511. cuda/cccl/headers/include/cuda/std/__algorithm/iterator_operations.h +185 -0
  512. cuda/cccl/headers/include/cuda/std/__algorithm/lexicographical_compare.h +68 -0
  513. cuda/cccl/headers/include/cuda/std/__algorithm/lower_bound.h +82 -0
  514. cuda/cccl/headers/include/cuda/std/__algorithm/make_heap.h +70 -0
  515. cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +88 -0
  516. cuda/cccl/headers/include/cuda/std/__algorithm/max.h +62 -0
  517. cuda/cccl/headers/include/cuda/std/__algorithm/max_element.h +67 -0
  518. cuda/cccl/headers/include/cuda/std/__algorithm/merge.h +89 -0
  519. cuda/cccl/headers/include/cuda/std/__algorithm/min.h +62 -0
  520. cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +87 -0
  521. cuda/cccl/headers/include/cuda/std/__algorithm/minmax.h +66 -0
  522. cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +139 -0
  523. cuda/cccl/headers/include/cuda/std/__algorithm/mismatch.h +83 -0
  524. cuda/cccl/headers/include/cuda/std/__algorithm/move.h +86 -0
  525. cuda/cccl/headers/include/cuda/std/__algorithm/move_backward.h +84 -0
  526. cuda/cccl/headers/include/cuda/std/__algorithm/next_permutation.h +88 -0
  527. cuda/cccl/headers/include/cuda/std/__algorithm/none_of.h +45 -0
  528. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort.h +102 -0
  529. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort_copy.h +122 -0
  530. cuda/cccl/headers/include/cuda/std/__algorithm/partition.h +120 -0
  531. cuda/cccl/headers/include/cuda/std/__algorithm/partition_copy.h +59 -0
  532. cuda/cccl/headers/include/cuda/std/__algorithm/partition_point.h +61 -0
  533. cuda/cccl/headers/include/cuda/std/__algorithm/pop_heap.h +93 -0
  534. cuda/cccl/headers/include/cuda/std/__algorithm/prev_permutation.h +88 -0
  535. cuda/cccl/headers/include/cuda/std/__algorithm/push_heap.h +100 -0
  536. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_for_each.h +84 -0
  537. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_for_each_n.h +68 -0
  538. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_iterator_concept.h +65 -0
  539. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_min.h +98 -0
  540. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_min_element.h +68 -0
  541. cuda/cccl/headers/include/cuda/std/__algorithm/remove.h +55 -0
  542. cuda/cccl/headers/include/cuda/std/__algorithm/remove_copy.h +47 -0
  543. cuda/cccl/headers/include/cuda/std/__algorithm/remove_copy_if.h +47 -0
  544. cuda/cccl/headers/include/cuda/std/__algorithm/remove_if.h +56 -0
  545. cuda/cccl/headers/include/cuda/std/__algorithm/replace.h +45 -0
  546. cuda/cccl/headers/include/cuda/std/__algorithm/replace_copy.h +54 -0
  547. cuda/cccl/headers/include/cuda/std/__algorithm/replace_copy_if.h +50 -0
  548. cuda/cccl/headers/include/cuda/std/__algorithm/replace_if.h +45 -0
  549. cuda/cccl/headers/include/cuda/std/__algorithm/reverse.h +81 -0
  550. cuda/cccl/headers/include/cuda/std/__algorithm/reverse_copy.h +43 -0
  551. cuda/cccl/headers/include/cuda/std/__algorithm/rotate.h +261 -0
  552. cuda/cccl/headers/include/cuda/std/__algorithm/rotate_copy.h +40 -0
  553. cuda/cccl/headers/include/cuda/std/__algorithm/search.h +185 -0
  554. cuda/cccl/headers/include/cuda/std/__algorithm/search_n.h +163 -0
  555. cuda/cccl/headers/include/cuda/std/__algorithm/set_difference.h +95 -0
  556. cuda/cccl/headers/include/cuda/std/__algorithm/set_intersection.h +122 -0
  557. cuda/cccl/headers/include/cuda/std/__algorithm/set_symmetric_difference.h +134 -0
  558. cuda/cccl/headers/include/cuda/std/__algorithm/set_union.h +128 -0
  559. cuda/cccl/headers/include/cuda/std/__algorithm/shift_left.h +84 -0
  560. cuda/cccl/headers/include/cuda/std/__algorithm/shift_right.h +144 -0
  561. cuda/cccl/headers/include/cuda/std/__algorithm/sift_down.h +139 -0
  562. cuda/cccl/headers/include/cuda/std/__algorithm/sort_heap.h +70 -0
  563. cuda/cccl/headers/include/cuda/std/__algorithm/swap_ranges.h +78 -0
  564. cuda/cccl/headers/include/cuda/std/__algorithm/transform.h +59 -0
  565. cuda/cccl/headers/include/cuda/std/__algorithm/unique.h +76 -0
  566. cuda/cccl/headers/include/cuda/std/__algorithm/unique_copy.h +155 -0
  567. cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_iter.h +95 -0
  568. cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_range.h +126 -0
  569. cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +83 -0
  570. cuda/cccl/headers/include/cuda/std/__algorithm_ +26 -0
  571. cuda/cccl/headers/include/cuda/std/__atomic/api/common.h +192 -0
  572. cuda/cccl/headers/include/cuda/std/__atomic/api/owned.h +136 -0
  573. cuda/cccl/headers/include/cuda/std/__atomic/api/reference.h +118 -0
  574. cuda/cccl/headers/include/cuda/std/__atomic/functions/common.h +58 -0
  575. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_local.h +208 -0
  576. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_derived.h +401 -0
  577. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated.h +3971 -0
  578. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h +177 -0
  579. cuda/cccl/headers/include/cuda/std/__atomic/functions/host.h +211 -0
  580. cuda/cccl/headers/include/cuda/std/__atomic/functions.h +33 -0
  581. cuda/cccl/headers/include/cuda/std/__atomic/order.h +159 -0
  582. cuda/cccl/headers/include/cuda/std/__atomic/platform/msvc_to_builtins.h +654 -0
  583. cuda/cccl/headers/include/cuda/std/__atomic/platform.h +93 -0
  584. cuda/cccl/headers/include/cuda/std/__atomic/scopes.h +105 -0
  585. cuda/cccl/headers/include/cuda/std/__atomic/types/base.h +249 -0
  586. cuda/cccl/headers/include/cuda/std/__atomic/types/common.h +104 -0
  587. cuda/cccl/headers/include/cuda/std/__atomic/types/locked.h +225 -0
  588. cuda/cccl/headers/include/cuda/std/__atomic/types/reference.h +72 -0
  589. cuda/cccl/headers/include/cuda/std/__atomic/types/small.h +228 -0
  590. cuda/cccl/headers/include/cuda/std/__atomic/types.h +52 -0
  591. cuda/cccl/headers/include/cuda/std/__atomic/wait/notify_wait.h +95 -0
  592. cuda/cccl/headers/include/cuda/std/__atomic/wait/polling.h +65 -0
  593. cuda/cccl/headers/include/cuda/std/__barrier/barrier.h +227 -0
  594. cuda/cccl/headers/include/cuda/std/__barrier/empty_completion.h +37 -0
  595. cuda/cccl/headers/include/cuda/std/__barrier/poll_tester.h +82 -0
  596. cuda/cccl/headers/include/cuda/std/__bit/bit_cast.h +76 -0
  597. cuda/cccl/headers/include/cuda/std/__bit/byteswap.h +185 -0
  598. cuda/cccl/headers/include/cuda/std/__bit/countl.h +174 -0
  599. cuda/cccl/headers/include/cuda/std/__bit/countr.h +185 -0
  600. cuda/cccl/headers/include/cuda/std/__bit/endian.h +39 -0
  601. cuda/cccl/headers/include/cuda/std/__bit/has_single_bit.h +43 -0
  602. cuda/cccl/headers/include/cuda/std/__bit/integral.h +126 -0
  603. cuda/cccl/headers/include/cuda/std/__bit/popcount.h +154 -0
  604. cuda/cccl/headers/include/cuda/std/__bit/reference.h +1272 -0
  605. cuda/cccl/headers/include/cuda/std/__bit/rotate.h +94 -0
  606. cuda/cccl/headers/include/cuda/std/__cccl/algorithm_wrapper.h +36 -0
  607. cuda/cccl/headers/include/cuda/std/__cccl/architecture.h +78 -0
  608. cuda/cccl/headers/include/cuda/std/__cccl/assert.h +161 -0
  609. cuda/cccl/headers/include/cuda/std/__cccl/attributes.h +206 -0
  610. cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +673 -0
  611. cuda/cccl/headers/include/cuda/std/__cccl/compiler.h +217 -0
  612. cuda/cccl/headers/include/cuda/std/__cccl/cuda_capabilities.h +51 -0
  613. cuda/cccl/headers/include/cuda/std/__cccl/cuda_toolkit.h +56 -0
  614. cuda/cccl/headers/include/cuda/std/__cccl/deprecated.h +88 -0
  615. cuda/cccl/headers/include/cuda/std/__cccl/diagnostic.h +131 -0
  616. cuda/cccl/headers/include/cuda/std/__cccl/dialect.h +123 -0
  617. cuda/cccl/headers/include/cuda/std/__cccl/epilogue.h +344 -0
  618. cuda/cccl/headers/include/cuda/std/__cccl/exceptions.h +91 -0
  619. cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +74 -0
  620. cuda/cccl/headers/include/cuda/std/__cccl/extended_data_types.h +160 -0
  621. cuda/cccl/headers/include/cuda/std/__cccl/host_std_lib.h +52 -0
  622. cuda/cccl/headers/include/cuda/std/__cccl/is_non_narrowing_convertible.h +73 -0
  623. cuda/cccl/headers/include/cuda/std/__cccl/memory_wrapper.h +36 -0
  624. cuda/cccl/headers/include/cuda/std/__cccl/numeric_wrapper.h +36 -0
  625. cuda/cccl/headers/include/cuda/std/__cccl/os.h +54 -0
  626. cuda/cccl/headers/include/cuda/std/__cccl/preprocessor.h +1286 -0
  627. cuda/cccl/headers/include/cuda/std/__cccl/prologue.h +281 -0
  628. cuda/cccl/headers/include/cuda/std/__cccl/ptx_isa.h +253 -0
  629. cuda/cccl/headers/include/cuda/std/__cccl/rtti.h +72 -0
  630. cuda/cccl/headers/include/cuda/std/__cccl/sequence_access.h +87 -0
  631. cuda/cccl/headers/include/cuda/std/__cccl/system_header.h +38 -0
  632. cuda/cccl/headers/include/cuda/std/__cccl/unreachable.h +31 -0
  633. cuda/cccl/headers/include/cuda/std/__cccl/version.h +26 -0
  634. cuda/cccl/headers/include/cuda/std/__cccl/visibility.h +171 -0
  635. cuda/cccl/headers/include/cuda/std/__charconv/chars_format.h +81 -0
  636. cuda/cccl/headers/include/cuda/std/__charconv/from_chars.h +154 -0
  637. cuda/cccl/headers/include/cuda/std/__charconv/from_chars_result.h +56 -0
  638. cuda/cccl/headers/include/cuda/std/__charconv/to_chars.h +148 -0
  639. cuda/cccl/headers/include/cuda/std/__charconv/to_chars_result.h +56 -0
  640. cuda/cccl/headers/include/cuda/std/__charconv_ +31 -0
  641. cuda/cccl/headers/include/cuda/std/__chrono/calendar.h +54 -0
  642. cuda/cccl/headers/include/cuda/std/__chrono/day.h +162 -0
  643. cuda/cccl/headers/include/cuda/std/__chrono/duration.h +503 -0
  644. cuda/cccl/headers/include/cuda/std/__chrono/file_clock.h +55 -0
  645. cuda/cccl/headers/include/cuda/std/__chrono/high_resolution_clock.h +46 -0
  646. cuda/cccl/headers/include/cuda/std/__chrono/month.h +187 -0
  647. cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +60 -0
  648. cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +80 -0
  649. cuda/cccl/headers/include/cuda/std/__chrono/time_point.h +259 -0
  650. cuda/cccl/headers/include/cuda/std/__chrono/year.h +186 -0
  651. cuda/cccl/headers/include/cuda/std/__cmath/abs.h +127 -0
  652. cuda/cccl/headers/include/cuda/std/__cmath/copysign.h +88 -0
  653. cuda/cccl/headers/include/cuda/std/__cmath/error_functions.h +200 -0
  654. cuda/cccl/headers/include/cuda/std/__cmath/exponential_functions.h +784 -0
  655. cuda/cccl/headers/include/cuda/std/__cmath/fdim.h +118 -0
  656. cuda/cccl/headers/include/cuda/std/__cmath/fma.h +125 -0
  657. cuda/cccl/headers/include/cuda/std/__cmath/fpclassify.h +231 -0
  658. cuda/cccl/headers/include/cuda/std/__cmath/gamma.h +205 -0
  659. cuda/cccl/headers/include/cuda/std/__cmath/hyperbolic_functions.h +286 -0
  660. cuda/cccl/headers/include/cuda/std/__cmath/hypot.h +221 -0
  661. cuda/cccl/headers/include/cuda/std/__cmath/inverse_hyperbolic_functions.h +286 -0
  662. cuda/cccl/headers/include/cuda/std/__cmath/inverse_trigonometric_functions.h +371 -0
  663. cuda/cccl/headers/include/cuda/std/__cmath/isfinite.h +167 -0
  664. cuda/cccl/headers/include/cuda/std/__cmath/isinf.h +205 -0
  665. cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +186 -0
  666. cuda/cccl/headers/include/cuda/std/__cmath/isnormal.h +138 -0
  667. cuda/cccl/headers/include/cuda/std/__cmath/lerp.h +101 -0
  668. cuda/cccl/headers/include/cuda/std/__cmath/logarithms.h +534 -0
  669. cuda/cccl/headers/include/cuda/std/__cmath/min_max.h +287 -0
  670. cuda/cccl/headers/include/cuda/std/__cmath/modulo.h +208 -0
  671. cuda/cccl/headers/include/cuda/std/__cmath/nan.h +54 -0
  672. cuda/cccl/headers/include/cuda/std/__cmath/remainder.h +206 -0
  673. cuda/cccl/headers/include/cuda/std/__cmath/roots.h +199 -0
  674. cuda/cccl/headers/include/cuda/std/__cmath/rounding_functions.h +984 -0
  675. cuda/cccl/headers/include/cuda/std/__cmath/signbit.h +56 -0
  676. cuda/cccl/headers/include/cuda/std/__cmath/traits.h +238 -0
  677. cuda/cccl/headers/include/cuda/std/__cmath/trigonometric_functions.h +328 -0
  678. cuda/cccl/headers/include/cuda/std/__complex/arg.h +84 -0
  679. cuda/cccl/headers/include/cuda/std/__complex/complex.h +669 -0
  680. cuda/cccl/headers/include/cuda/std/__complex/exponential_functions.h +411 -0
  681. cuda/cccl/headers/include/cuda/std/__complex/hyperbolic_functions.h +117 -0
  682. cuda/cccl/headers/include/cuda/std/__complex/inverse_hyperbolic_functions.h +216 -0
  683. cuda/cccl/headers/include/cuda/std/__complex/inverse_trigonometric_functions.h +131 -0
  684. cuda/cccl/headers/include/cuda/std/__complex/literals.h +86 -0
  685. cuda/cccl/headers/include/cuda/std/__complex/logarithms.h +303 -0
  686. cuda/cccl/headers/include/cuda/std/__complex/math.h +159 -0
  687. cuda/cccl/headers/include/cuda/std/__complex/nvbf16.h +323 -0
  688. cuda/cccl/headers/include/cuda/std/__complex/nvfp16.h +322 -0
  689. cuda/cccl/headers/include/cuda/std/__complex/roots.h +214 -0
  690. cuda/cccl/headers/include/cuda/std/__complex/trigonometric_functions.h +61 -0
  691. cuda/cccl/headers/include/cuda/std/__complex/tuple.h +107 -0
  692. cuda/cccl/headers/include/cuda/std/__complex/vector_support.h +130 -0
  693. cuda/cccl/headers/include/cuda/std/__concepts/arithmetic.h +56 -0
  694. cuda/cccl/headers/include/cuda/std/__concepts/assignable.h +64 -0
  695. cuda/cccl/headers/include/cuda/std/__concepts/boolean_testable.h +63 -0
  696. cuda/cccl/headers/include/cuda/std/__concepts/class_or_enum.h +45 -0
  697. cuda/cccl/headers/include/cuda/std/__concepts/common_reference_with.h +69 -0
  698. cuda/cccl/headers/include/cuda/std/__concepts/common_with.h +82 -0
  699. cuda/cccl/headers/include/cuda/std/__concepts/concept_macros.h +341 -0
  700. cuda/cccl/headers/include/cuda/std/__concepts/constructible.h +174 -0
  701. cuda/cccl/headers/include/cuda/std/__concepts/convertible_to.h +70 -0
  702. cuda/cccl/headers/include/cuda/std/__concepts/copyable.h +60 -0
  703. cuda/cccl/headers/include/cuda/std/__concepts/derived_from.h +56 -0
  704. cuda/cccl/headers/include/cuda/std/__concepts/destructible.h +76 -0
  705. cuda/cccl/headers/include/cuda/std/__concepts/different_from.h +38 -0
  706. cuda/cccl/headers/include/cuda/std/__concepts/equality_comparable.h +100 -0
  707. cuda/cccl/headers/include/cuda/std/__concepts/invocable.h +80 -0
  708. cuda/cccl/headers/include/cuda/std/__concepts/movable.h +58 -0
  709. cuda/cccl/headers/include/cuda/std/__concepts/predicate.h +54 -0
  710. cuda/cccl/headers/include/cuda/std/__concepts/regular.h +54 -0
  711. cuda/cccl/headers/include/cuda/std/__concepts/relation.h +77 -0
  712. cuda/cccl/headers/include/cuda/std/__concepts/same_as.h +39 -0
  713. cuda/cccl/headers/include/cuda/std/__concepts/semiregular.h +54 -0
  714. cuda/cccl/headers/include/cuda/std/__concepts/swappable.h +206 -0
  715. cuda/cccl/headers/include/cuda/std/__concepts/totally_ordered.h +101 -0
  716. cuda/cccl/headers/include/cuda/std/__cstddef/byte.h +113 -0
  717. cuda/cccl/headers/include/cuda/std/__cstddef/types.h +52 -0
  718. cuda/cccl/headers/include/cuda/std/__cstdlib/abs.h +57 -0
  719. cuda/cccl/headers/include/cuda/std/__cstdlib/aligned_alloc.h +66 -0
  720. cuda/cccl/headers/include/cuda/std/__cstdlib/div.h +96 -0
  721. cuda/cccl/headers/include/cuda/std/__cstdlib/malloc.h +70 -0
  722. cuda/cccl/headers/include/cuda/std/__cstring/memcpy.h +61 -0
  723. cuda/cccl/headers/include/cuda/std/__cstring/memset.h +46 -0
  724. cuda/cccl/headers/include/cuda/std/__cuda/api_wrapper.h +62 -0
  725. cuda/cccl/headers/include/cuda/std/__exception/cuda_error.h +139 -0
  726. cuda/cccl/headers/include/cuda/std/__exception/terminate.h +73 -0
  727. cuda/cccl/headers/include/cuda/std/__execution/env.h +455 -0
  728. cuda/cccl/headers/include/cuda/std/__execution/policy.h +88 -0
  729. cuda/cccl/headers/include/cuda/std/__expected/bad_expected_access.h +127 -0
  730. cuda/cccl/headers/include/cuda/std/__expected/expected.h +1941 -0
  731. cuda/cccl/headers/include/cuda/std/__expected/expected_base.h +1050 -0
  732. cuda/cccl/headers/include/cuda/std/__expected/unexpect.h +37 -0
  733. cuda/cccl/headers/include/cuda/std/__expected/unexpected.h +165 -0
  734. cuda/cccl/headers/include/cuda/std/__floating_point/arithmetic.h +56 -0
  735. cuda/cccl/headers/include/cuda/std/__floating_point/cast.h +812 -0
  736. cuda/cccl/headers/include/cuda/std/__floating_point/cccl_fp.h +125 -0
  737. cuda/cccl/headers/include/cuda/std/__floating_point/common_type.h +48 -0
  738. cuda/cccl/headers/include/cuda/std/__floating_point/constants.h +376 -0
  739. cuda/cccl/headers/include/cuda/std/__floating_point/conversion_rank_order.h +124 -0
  740. cuda/cccl/headers/include/cuda/std/__floating_point/cuda_fp_types.h +116 -0
  741. cuda/cccl/headers/include/cuda/std/__floating_point/decompose.h +69 -0
  742. cuda/cccl/headers/include/cuda/std/__floating_point/format.h +162 -0
  743. cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +40 -0
  744. cuda/cccl/headers/include/cuda/std/__floating_point/mask.h +78 -0
  745. cuda/cccl/headers/include/cuda/std/__floating_point/native_type.h +81 -0
  746. cuda/cccl/headers/include/cuda/std/__floating_point/overflow_handler.h +139 -0
  747. cuda/cccl/headers/include/cuda/std/__floating_point/properties.h +229 -0
  748. cuda/cccl/headers/include/cuda/std/__floating_point/storage.h +248 -0
  749. cuda/cccl/headers/include/cuda/std/__floating_point/traits.h +172 -0
  750. cuda/cccl/headers/include/cuda/std/__format/buffer.h +48 -0
  751. cuda/cccl/headers/include/cuda/std/__format/concepts.h +69 -0
  752. cuda/cccl/headers/include/cuda/std/__format/format_arg.h +282 -0
  753. cuda/cccl/headers/include/cuda/std/__format/format_arg_store.h +279 -0
  754. cuda/cccl/headers/include/cuda/std/__format/format_args.h +122 -0
  755. cuda/cccl/headers/include/cuda/std/__format/format_context.h +92 -0
  756. cuda/cccl/headers/include/cuda/std/__format/format_error.h +76 -0
  757. cuda/cccl/headers/include/cuda/std/__format/format_integral.h +237 -0
  758. cuda/cccl/headers/include/cuda/std/__format/format_parse_context.h +124 -0
  759. cuda/cccl/headers/include/cuda/std/__format/format_spec_parser.h +1230 -0
  760. cuda/cccl/headers/include/cuda/std/__format/formatter.h +59 -0
  761. cuda/cccl/headers/include/cuda/std/__format/formatters/bool.h +101 -0
  762. cuda/cccl/headers/include/cuda/std/__format/formatters/char.h +124 -0
  763. cuda/cccl/headers/include/cuda/std/__format/formatters/fp.h +101 -0
  764. cuda/cccl/headers/include/cuda/std/__format/formatters/int.h +174 -0
  765. cuda/cccl/headers/include/cuda/std/__format/formatters/ptr.h +104 -0
  766. cuda/cccl/headers/include/cuda/std/__format/formatters/str.h +178 -0
  767. cuda/cccl/headers/include/cuda/std/__format/output_utils.h +272 -0
  768. cuda/cccl/headers/include/cuda/std/__format/parse_arg_id.h +138 -0
  769. cuda/cccl/headers/include/cuda/std/__format_ +45 -0
  770. cuda/cccl/headers/include/cuda/std/__functional/binary_function.h +63 -0
  771. cuda/cccl/headers/include/cuda/std/__functional/binary_negate.h +65 -0
  772. cuda/cccl/headers/include/cuda/std/__functional/bind.h +334 -0
  773. cuda/cccl/headers/include/cuda/std/__functional/bind_back.h +80 -0
  774. cuda/cccl/headers/include/cuda/std/__functional/bind_front.h +73 -0
  775. cuda/cccl/headers/include/cuda/std/__functional/binder1st.h +74 -0
  776. cuda/cccl/headers/include/cuda/std/__functional/binder2nd.h +74 -0
  777. cuda/cccl/headers/include/cuda/std/__functional/compose.h +68 -0
  778. cuda/cccl/headers/include/cuda/std/__functional/default_searcher.h +75 -0
  779. cuda/cccl/headers/include/cuda/std/__functional/function.h +1275 -0
  780. cuda/cccl/headers/include/cuda/std/__functional/hash.h +649 -0
  781. cuda/cccl/headers/include/cuda/std/__functional/identity.h +57 -0
  782. cuda/cccl/headers/include/cuda/std/__functional/invoke.h +296 -0
  783. cuda/cccl/headers/include/cuda/std/__functional/is_transparent.h +41 -0
  784. cuda/cccl/headers/include/cuda/std/__functional/mem_fn.h +66 -0
  785. cuda/cccl/headers/include/cuda/std/__functional/mem_fun_ref.h +211 -0
  786. cuda/cccl/headers/include/cuda/std/__functional/not_fn.h +120 -0
  787. cuda/cccl/headers/include/cuda/std/__functional/operations.h +534 -0
  788. cuda/cccl/headers/include/cuda/std/__functional/perfect_forward.h +128 -0
  789. cuda/cccl/headers/include/cuda/std/__functional/pointer_to_binary_function.h +64 -0
  790. cuda/cccl/headers/include/cuda/std/__functional/pointer_to_unary_function.h +63 -0
  791. cuda/cccl/headers/include/cuda/std/__functional/ranges_operations.h +113 -0
  792. cuda/cccl/headers/include/cuda/std/__functional/reference_wrapper.h +113 -0
  793. cuda/cccl/headers/include/cuda/std/__functional/unary_function.h +62 -0
  794. cuda/cccl/headers/include/cuda/std/__functional/unary_negate.h +65 -0
  795. cuda/cccl/headers/include/cuda/std/__functional/unwrap_ref.h +56 -0
  796. cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +262 -0
  797. cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +53 -0
  798. cuda/cccl/headers/include/cuda/std/__fwd/array.h +42 -0
  799. cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +74 -0
  800. cuda/cccl/headers/include/cuda/std/__fwd/complex.h +75 -0
  801. cuda/cccl/headers/include/cuda/std/__fwd/expected.h +46 -0
  802. cuda/cccl/headers/include/cuda/std/__fwd/format.h +84 -0
  803. cuda/cccl/headers/include/cuda/std/__fwd/fp.h +37 -0
  804. cuda/cccl/headers/include/cuda/std/__fwd/get.h +123 -0
  805. cuda/cccl/headers/include/cuda/std/__fwd/hash.h +34 -0
  806. cuda/cccl/headers/include/cuda/std/__fwd/iterator.h +43 -0
  807. cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +122 -0
  808. cuda/cccl/headers/include/cuda/std/__fwd/memory_resource.h +37 -0
  809. cuda/cccl/headers/include/cuda/std/__fwd/optional.h +39 -0
  810. cuda/cccl/headers/include/cuda/std/__fwd/pair.h +47 -0
  811. cuda/cccl/headers/include/cuda/std/__fwd/reference_wrapper.h +34 -0
  812. cuda/cccl/headers/include/cuda/std/__fwd/span.h +45 -0
  813. cuda/cccl/headers/include/cuda/std/__fwd/string.h +112 -0
  814. cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +91 -0
  815. cuda/cccl/headers/include/cuda/std/__fwd/subrange.h +55 -0
  816. cuda/cccl/headers/include/cuda/std/__fwd/tuple.h +34 -0
  817. cuda/cccl/headers/include/cuda/std/__fwd/unexpected.h +40 -0
  818. cuda/cccl/headers/include/cuda/std/__internal/cpp_dialect.h +44 -0
  819. cuda/cccl/headers/include/cuda/std/__internal/features.h +72 -0
  820. cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +143 -0
  821. cuda/cccl/headers/include/cuda/std/__iterator/access.h +128 -0
  822. cuda/cccl/headers/include/cuda/std/__iterator/advance.h +228 -0
  823. cuda/cccl/headers/include/cuda/std/__iterator/back_insert_iterator.h +163 -0
  824. cuda/cccl/headers/include/cuda/std/__iterator/bounded_iter.h +253 -0
  825. cuda/cccl/headers/include/cuda/std/__iterator/concepts.h +645 -0
  826. cuda/cccl/headers/include/cuda/std/__iterator/counted_iterator.h +464 -0
  827. cuda/cccl/headers/include/cuda/std/__iterator/data.h +61 -0
  828. cuda/cccl/headers/include/cuda/std/__iterator/default_sentinel.h +36 -0
  829. cuda/cccl/headers/include/cuda/std/__iterator/distance.h +126 -0
  830. cuda/cccl/headers/include/cuda/std/__iterator/empty.h +53 -0
  831. cuda/cccl/headers/include/cuda/std/__iterator/erase_if_container.h +53 -0
  832. cuda/cccl/headers/include/cuda/std/__iterator/front_insert_iterator.h +99 -0
  833. cuda/cccl/headers/include/cuda/std/__iterator/incrementable_traits.h +143 -0
  834. cuda/cccl/headers/include/cuda/std/__iterator/indirectly_comparable.h +55 -0
  835. cuda/cccl/headers/include/cuda/std/__iterator/insert_iterator.h +107 -0
  836. cuda/cccl/headers/include/cuda/std/__iterator/istream_iterator.h +146 -0
  837. cuda/cccl/headers/include/cuda/std/__iterator/istreambuf_iterator.h +161 -0
  838. cuda/cccl/headers/include/cuda/std/__iterator/iter_move.h +161 -0
  839. cuda/cccl/headers/include/cuda/std/__iterator/iter_swap.h +163 -0
  840. cuda/cccl/headers/include/cuda/std/__iterator/iterator.h +44 -0
  841. cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +847 -0
  842. cuda/cccl/headers/include/cuda/std/__iterator/mergeable.h +72 -0
  843. cuda/cccl/headers/include/cuda/std/__iterator/move_iterator.h +432 -0
  844. cuda/cccl/headers/include/cuda/std/__iterator/move_sentinel.h +73 -0
  845. cuda/cccl/headers/include/cuda/std/__iterator/next.h +101 -0
  846. cuda/cccl/headers/include/cuda/std/__iterator/ostream_iterator.h +95 -0
  847. cuda/cccl/headers/include/cuda/std/__iterator/ostreambuf_iterator.h +100 -0
  848. cuda/cccl/headers/include/cuda/std/__iterator/permutable.h +54 -0
  849. cuda/cccl/headers/include/cuda/std/__iterator/prev.h +90 -0
  850. cuda/cccl/headers/include/cuda/std/__iterator/projected.h +61 -0
  851. cuda/cccl/headers/include/cuda/std/__iterator/readable_traits.h +156 -0
  852. cuda/cccl/headers/include/cuda/std/__iterator/reverse_access.h +142 -0
  853. cuda/cccl/headers/include/cuda/std/__iterator/reverse_iterator.h +371 -0
  854. cuda/cccl/headers/include/cuda/std/__iterator/size.h +69 -0
  855. cuda/cccl/headers/include/cuda/std/__iterator/sortable.h +55 -0
  856. cuda/cccl/headers/include/cuda/std/__iterator/unreachable_sentinel.h +84 -0
  857. cuda/cccl/headers/include/cuda/std/__iterator/wrap_iter.h +245 -0
  858. cuda/cccl/headers/include/cuda/std/__latch/latch.h +88 -0
  859. cuda/cccl/headers/include/cuda/std/__limits/numeric_limits.h +617 -0
  860. cuda/cccl/headers/include/cuda/std/__limits/numeric_limits_ext.h +753 -0
  861. cuda/cccl/headers/include/cuda/std/__linalg/conj_if_needed.h +78 -0
  862. cuda/cccl/headers/include/cuda/std/__linalg/conjugate_transposed.h +54 -0
  863. cuda/cccl/headers/include/cuda/std/__linalg/conjugated.h +139 -0
  864. cuda/cccl/headers/include/cuda/std/__linalg/scaled.h +132 -0
  865. cuda/cccl/headers/include/cuda/std/__linalg/transposed.h +321 -0
  866. cuda/cccl/headers/include/cuda/std/__mdspan/aligned_accessor.h +97 -0
  867. cuda/cccl/headers/include/cuda/std/__mdspan/concepts.h +139 -0
  868. cuda/cccl/headers/include/cuda/std/__mdspan/default_accessor.h +73 -0
  869. cuda/cccl/headers/include/cuda/std/__mdspan/empty_base.h +352 -0
  870. cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +759 -0
  871. cuda/cccl/headers/include/cuda/std/__mdspan/layout_left.h +314 -0
  872. cuda/cccl/headers/include/cuda/std/__mdspan/layout_right.h +307 -0
  873. cuda/cccl/headers/include/cuda/std/__mdspan/layout_stride.h +605 -0
  874. cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +512 -0
  875. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_extents.h +193 -0
  876. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_helper.h +189 -0
  877. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_mapping.h +344 -0
  878. cuda/cccl/headers/include/cuda/std/__memory/addressof.h +67 -0
  879. cuda/cccl/headers/include/cuda/std/__memory/align.h +67 -0
  880. cuda/cccl/headers/include/cuda/std/__memory/allocate_at_least.h +81 -0
  881. cuda/cccl/headers/include/cuda/std/__memory/allocation_guard.h +100 -0
  882. cuda/cccl/headers/include/cuda/std/__memory/allocator.h +320 -0
  883. cuda/cccl/headers/include/cuda/std/__memory/allocator_arg_t.h +84 -0
  884. cuda/cccl/headers/include/cuda/std/__memory/allocator_destructor.h +59 -0
  885. cuda/cccl/headers/include/cuda/std/__memory/allocator_traits.h +525 -0
  886. cuda/cccl/headers/include/cuda/std/__memory/assume_aligned.h +60 -0
  887. cuda/cccl/headers/include/cuda/std/__memory/builtin_new_allocator.h +87 -0
  888. cuda/cccl/headers/include/cuda/std/__memory/compressed_pair.h +225 -0
  889. cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +246 -0
  890. cuda/cccl/headers/include/cuda/std/__memory/destruct_n.h +91 -0
  891. cuda/cccl/headers/include/cuda/std/__memory/is_sufficiently_aligned.h +46 -0
  892. cuda/cccl/headers/include/cuda/std/__memory/pointer_traits.h +246 -0
  893. cuda/cccl/headers/include/cuda/std/__memory/runtime_assume_aligned.h +62 -0
  894. cuda/cccl/headers/include/cuda/std/__memory/temporary_buffer.h +92 -0
  895. cuda/cccl/headers/include/cuda/std/__memory/uninitialized_algorithms.h +678 -0
  896. cuda/cccl/headers/include/cuda/std/__memory/unique_ptr.h +765 -0
  897. cuda/cccl/headers/include/cuda/std/__memory/uses_allocator.h +54 -0
  898. cuda/cccl/headers/include/cuda/std/__memory/voidify.h +41 -0
  899. cuda/cccl/headers/include/cuda/std/__memory_ +34 -0
  900. cuda/cccl/headers/include/cuda/std/__new/allocate.h +126 -0
  901. cuda/cccl/headers/include/cuda/std/__new/bad_alloc.h +57 -0
  902. cuda/cccl/headers/include/cuda/std/__new/launder.h +53 -0
  903. cuda/cccl/headers/include/cuda/std/__new_ +29 -0
  904. cuda/cccl/headers/include/cuda/std/__numeric/accumulate.h +56 -0
  905. cuda/cccl/headers/include/cuda/std/__numeric/adjacent_difference.h +72 -0
  906. cuda/cccl/headers/include/cuda/std/__numeric/exclusive_scan.h +66 -0
  907. cuda/cccl/headers/include/cuda/std/__numeric/gcd_lcm.h +78 -0
  908. cuda/cccl/headers/include/cuda/std/__numeric/inclusive_scan.h +73 -0
  909. cuda/cccl/headers/include/cuda/std/__numeric/inner_product.h +62 -0
  910. cuda/cccl/headers/include/cuda/std/__numeric/iota.h +42 -0
  911. cuda/cccl/headers/include/cuda/std/__numeric/midpoint.h +97 -0
  912. cuda/cccl/headers/include/cuda/std/__numeric/partial_sum.h +69 -0
  913. cuda/cccl/headers/include/cuda/std/__numeric/reduce.h +60 -0
  914. cuda/cccl/headers/include/cuda/std/__numeric/transform_exclusive_scan.h +51 -0
  915. cuda/cccl/headers/include/cuda/std/__numeric/transform_inclusive_scan.h +65 -0
  916. cuda/cccl/headers/include/cuda/std/__numeric/transform_reduce.h +72 -0
  917. cuda/cccl/headers/include/cuda/std/__optional/bad_optional_access.h +74 -0
  918. cuda/cccl/headers/include/cuda/std/__optional/hash.h +53 -0
  919. cuda/cccl/headers/include/cuda/std/__optional/make_optional.h +61 -0
  920. cuda/cccl/headers/include/cuda/std/__optional/nullopt.h +43 -0
  921. cuda/cccl/headers/include/cuda/std/__optional/optional.h +859 -0
  922. cuda/cccl/headers/include/cuda/std/__optional/optional_base.h +433 -0
  923. cuda/cccl/headers/include/cuda/std/__optional/optional_ref.h +324 -0
  924. cuda/cccl/headers/include/cuda/std/__random/generate_canonical.h +56 -0
  925. cuda/cccl/headers/include/cuda/std/__random/is_seed_sequence.h +39 -0
  926. cuda/cccl/headers/include/cuda/std/__random/is_valid.h +106 -0
  927. cuda/cccl/headers/include/cuda/std/__random/linear_congruential_engine.h +398 -0
  928. cuda/cccl/headers/include/cuda/std/__random/uniform_int_distribution.h +335 -0
  929. cuda/cccl/headers/include/cuda/std/__random/uniform_real_distribution.h +183 -0
  930. cuda/cccl/headers/include/cuda/std/__random_ +29 -0
  931. cuda/cccl/headers/include/cuda/std/__ranges/access.h +303 -0
  932. cuda/cccl/headers/include/cuda/std/__ranges/all.h +98 -0
  933. cuda/cccl/headers/include/cuda/std/__ranges/compressed_movable_box.h +892 -0
  934. cuda/cccl/headers/include/cuda/std/__ranges/concepts.h +302 -0
  935. cuda/cccl/headers/include/cuda/std/__ranges/counted.h +90 -0
  936. cuda/cccl/headers/include/cuda/std/__ranges/dangling.h +54 -0
  937. cuda/cccl/headers/include/cuda/std/__ranges/data.h +136 -0
  938. cuda/cccl/headers/include/cuda/std/__ranges/empty.h +109 -0
  939. cuda/cccl/headers/include/cuda/std/__ranges/empty_view.h +77 -0
  940. cuda/cccl/headers/include/cuda/std/__ranges/enable_borrowed_range.h +41 -0
  941. cuda/cccl/headers/include/cuda/std/__ranges/enable_view.h +78 -0
  942. cuda/cccl/headers/include/cuda/std/__ranges/from_range.h +36 -0
  943. cuda/cccl/headers/include/cuda/std/__ranges/iota_view.h +266 -0
  944. cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +410 -0
  945. cuda/cccl/headers/include/cuda/std/__ranges/owning_view.h +162 -0
  946. cuda/cccl/headers/include/cuda/std/__ranges/range_adaptor.h +110 -0
  947. cuda/cccl/headers/include/cuda/std/__ranges/rbegin.h +175 -0
  948. cuda/cccl/headers/include/cuda/std/__ranges/ref_view.h +121 -0
  949. cuda/cccl/headers/include/cuda/std/__ranges/rend.h +182 -0
  950. cuda/cccl/headers/include/cuda/std/__ranges/repeat_view.h +345 -0
  951. cuda/cccl/headers/include/cuda/std/__ranges/single_view.h +155 -0
  952. cuda/cccl/headers/include/cuda/std/__ranges/size.h +201 -0
  953. cuda/cccl/headers/include/cuda/std/__ranges/subrange.h +513 -0
  954. cuda/cccl/headers/include/cuda/std/__ranges/take_view.h +476 -0
  955. cuda/cccl/headers/include/cuda/std/__ranges/take_while_view.h +259 -0
  956. cuda/cccl/headers/include/cuda/std/__ranges/transform_view.h +522 -0
  957. cuda/cccl/headers/include/cuda/std/__ranges/unwrap_end.h +53 -0
  958. cuda/cccl/headers/include/cuda/std/__ranges/view_interface.h +183 -0
  959. cuda/cccl/headers/include/cuda/std/__ranges/views.h +38 -0
  960. cuda/cccl/headers/include/cuda/std/__semaphore/atomic_semaphore.h +234 -0
  961. cuda/cccl/headers/include/cuda/std/__semaphore/counting_semaphore.h +51 -0
  962. cuda/cccl/headers/include/cuda/std/__string/char_traits.h +191 -0
  963. cuda/cccl/headers/include/cuda/std/__string/constexpr_c_functions.h +581 -0
  964. cuda/cccl/headers/include/cuda/std/__string/helper_functions.h +296 -0
  965. cuda/cccl/headers/include/cuda/std/__string/string_view.h +244 -0
  966. cuda/cccl/headers/include/cuda/std/__string_ +29 -0
  967. cuda/cccl/headers/include/cuda/std/__system_error/errc.h +51 -0
  968. cuda/cccl/headers/include/cuda/std/__system_error_ +26 -0
  969. cuda/cccl/headers/include/cuda/std/__thread/threading_support.h +106 -0
  970. cuda/cccl/headers/include/cuda/std/__thread/threading_support_cuda.h +47 -0
  971. cuda/cccl/headers/include/cuda/std/__thread/threading_support_external.h +41 -0
  972. cuda/cccl/headers/include/cuda/std/__thread/threading_support_pthread.h +143 -0
  973. cuda/cccl/headers/include/cuda/std/__thread/threading_support_win32.h +87 -0
  974. cuda/cccl/headers/include/cuda/std/__tuple_dir/ignore.h +51 -0
  975. cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +120 -0
  976. cuda/cccl/headers/include/cuda/std/__tuple_dir/sfinae_helpers.h +260 -0
  977. cuda/cccl/headers/include/cuda/std/__tuple_dir/structured_bindings.h +212 -0
  978. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_element.h +70 -0
  979. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_indices.h +44 -0
  980. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +84 -0
  981. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +68 -0
  982. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_size.h +79 -0
  983. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_types.h +35 -0
  984. cuda/cccl/headers/include/cuda/std/__tuple_dir/vector_types.h +290 -0
  985. cuda/cccl/headers/include/cuda/std/__type_traits/add_const.h +40 -0
  986. cuda/cccl/headers/include/cuda/std/__type_traits/add_cv.h +40 -0
  987. cuda/cccl/headers/include/cuda/std/__type_traits/add_lvalue_reference.h +62 -0
  988. cuda/cccl/headers/include/cuda/std/__type_traits/add_pointer.h +65 -0
  989. cuda/cccl/headers/include/cuda/std/__type_traits/add_rvalue_reference.h +62 -0
  990. cuda/cccl/headers/include/cuda/std/__type_traits/add_volatile.h +40 -0
  991. cuda/cccl/headers/include/cuda/std/__type_traits/aligned_storage.h +149 -0
  992. cuda/cccl/headers/include/cuda/std/__type_traits/aligned_union.h +62 -0
  993. cuda/cccl/headers/include/cuda/std/__type_traits/alignment_of.h +41 -0
  994. cuda/cccl/headers/include/cuda/std/__type_traits/always_false.h +35 -0
  995. cuda/cccl/headers/include/cuda/std/__type_traits/can_extract_key.h +68 -0
  996. cuda/cccl/headers/include/cuda/std/__type_traits/common_reference.h +262 -0
  997. cuda/cccl/headers/include/cuda/std/__type_traits/common_type.h +173 -0
  998. cuda/cccl/headers/include/cuda/std/__type_traits/conditional.h +65 -0
  999. cuda/cccl/headers/include/cuda/std/__type_traits/conjunction.h +67 -0
  1000. cuda/cccl/headers/include/cuda/std/__type_traits/copy_cv.h +50 -0
  1001. cuda/cccl/headers/include/cuda/std/__type_traits/copy_cvref.h +148 -0
  1002. cuda/cccl/headers/include/cuda/std/__type_traits/decay.h +83 -0
  1003. cuda/cccl/headers/include/cuda/std/__type_traits/dependent_type.h +35 -0
  1004. cuda/cccl/headers/include/cuda/std/__type_traits/disjunction.h +77 -0
  1005. cuda/cccl/headers/include/cuda/std/__type_traits/enable_if.h +43 -0
  1006. cuda/cccl/headers/include/cuda/std/__type_traits/extent.h +68 -0
  1007. cuda/cccl/headers/include/cuda/std/__type_traits/fold.h +47 -0
  1008. cuda/cccl/headers/include/cuda/std/__type_traits/has_unique_object_representation.h +46 -0
  1009. cuda/cccl/headers/include/cuda/std/__type_traits/has_virtual_destructor.h +42 -0
  1010. cuda/cccl/headers/include/cuda/std/__type_traits/integral_constant.h +62 -0
  1011. cuda/cccl/headers/include/cuda/std/__type_traits/is_abstract.h +42 -0
  1012. cuda/cccl/headers/include/cuda/std/__type_traits/is_aggregate.h +42 -0
  1013. cuda/cccl/headers/include/cuda/std/__type_traits/is_allocator.h +46 -0
  1014. cuda/cccl/headers/include/cuda/std/__type_traits/is_arithmetic.h +42 -0
  1015. cuda/cccl/headers/include/cuda/std/__type_traits/is_array.h +62 -0
  1016. cuda/cccl/headers/include/cuda/std/__type_traits/is_assignable.h +78 -0
  1017. cuda/cccl/headers/include/cuda/std/__type_traits/is_base_of.h +42 -0
  1018. cuda/cccl/headers/include/cuda/std/__type_traits/is_bounded_array.h +44 -0
  1019. cuda/cccl/headers/include/cuda/std/__type_traits/is_callable.h +60 -0
  1020. cuda/cccl/headers/include/cuda/std/__type_traits/is_char_like_type.h +38 -0
  1021. cuda/cccl/headers/include/cuda/std/__type_traits/is_class.h +42 -0
  1022. cuda/cccl/headers/include/cuda/std/__type_traits/is_compound.h +58 -0
  1023. cuda/cccl/headers/include/cuda/std/__type_traits/is_const.h +56 -0
  1024. cuda/cccl/headers/include/cuda/std/__type_traits/is_constant_evaluated.h +51 -0
  1025. cuda/cccl/headers/include/cuda/std/__type_traits/is_constructible.h +174 -0
  1026. cuda/cccl/headers/include/cuda/std/__type_traits/is_convertible.h +211 -0
  1027. cuda/cccl/headers/include/cuda/std/__type_traits/is_copy_assignable.h +43 -0
  1028. cuda/cccl/headers/include/cuda/std/__type_traits/is_copy_constructible.h +43 -0
  1029. cuda/cccl/headers/include/cuda/std/__type_traits/is_core_convertible.h +47 -0
  1030. cuda/cccl/headers/include/cuda/std/__type_traits/is_corresponding_member.h +42 -0
  1031. cuda/cccl/headers/include/cuda/std/__type_traits/is_default_constructible.h +40 -0
  1032. cuda/cccl/headers/include/cuda/std/__type_traits/is_destructible.h +115 -0
  1033. cuda/cccl/headers/include/cuda/std/__type_traits/is_empty.h +42 -0
  1034. cuda/cccl/headers/include/cuda/std/__type_traits/is_enum.h +42 -0
  1035. cuda/cccl/headers/include/cuda/std/__type_traits/is_execution_policy.h +81 -0
  1036. cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_arithmetic.h +38 -0
  1037. cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_floating_point.h +79 -0
  1038. cuda/cccl/headers/include/cuda/std/__type_traits/is_final.h +42 -0
  1039. cuda/cccl/headers/include/cuda/std/__type_traits/is_floating_point.h +53 -0
  1040. cuda/cccl/headers/include/cuda/std/__type_traits/is_function.h +61 -0
  1041. cuda/cccl/headers/include/cuda/std/__type_traits/is_fundamental.h +56 -0
  1042. cuda/cccl/headers/include/cuda/std/__type_traits/is_implicitly_default_constructible.h +57 -0
  1043. cuda/cccl/headers/include/cuda/std/__type_traits/is_integer.h +45 -0
  1044. cuda/cccl/headers/include/cuda/std/__type_traits/is_integral.h +123 -0
  1045. cuda/cccl/headers/include/cuda/std/__type_traits/is_layout_compatible.h +45 -0
  1046. cuda/cccl/headers/include/cuda/std/__type_traits/is_literal_type.h +42 -0
  1047. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_function_pointer.h +79 -0
  1048. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_object_pointer.h +57 -0
  1049. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_pointer.h +57 -0
  1050. cuda/cccl/headers/include/cuda/std/__type_traits/is_move_assignable.h +43 -0
  1051. cuda/cccl/headers/include/cuda/std/__type_traits/is_move_constructible.h +42 -0
  1052. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_assignable.h +70 -0
  1053. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_constructible.h +84 -0
  1054. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_convertible.h +59 -0
  1055. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_copy_assignable.h +60 -0
  1056. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_copy_constructible.h +43 -0
  1057. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_default_constructible.h +54 -0
  1058. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_destructible.h +82 -0
  1059. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_move_assignable.h +60 -0
  1060. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_move_constructible.h +42 -0
  1061. cuda/cccl/headers/include/cuda/std/__type_traits/is_null_pointer.h +43 -0
  1062. cuda/cccl/headers/include/cuda/std/__type_traits/is_object.h +57 -0
  1063. cuda/cccl/headers/include/cuda/std/__type_traits/is_one_of.h +37 -0
  1064. cuda/cccl/headers/include/cuda/std/__type_traits/is_pod.h +42 -0
  1065. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer.h +60 -0
  1066. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer_interconvertible_base_of.h +84 -0
  1067. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer_interconvertible_with_class.h +42 -0
  1068. cuda/cccl/headers/include/cuda/std/__type_traits/is_polymorphic.h +42 -0
  1069. cuda/cccl/headers/include/cuda/std/__type_traits/is_primary_template.h +121 -0
  1070. cuda/cccl/headers/include/cuda/std/__type_traits/is_reference.h +95 -0
  1071. cuda/cccl/headers/include/cuda/std/__type_traits/is_reference_wrapper.h +50 -0
  1072. cuda/cccl/headers/include/cuda/std/__type_traits/is_referenceable.h +55 -0
  1073. cuda/cccl/headers/include/cuda/std/__type_traits/is_same.h +88 -0
  1074. cuda/cccl/headers/include/cuda/std/__type_traits/is_scalar.h +60 -0
  1075. cuda/cccl/headers/include/cuda/std/__type_traits/is_scoped_enum.h +49 -0
  1076. cuda/cccl/headers/include/cuda/std/__type_traits/is_signed.h +65 -0
  1077. cuda/cccl/headers/include/cuda/std/__type_traits/is_signed_integer.h +59 -0
  1078. cuda/cccl/headers/include/cuda/std/__type_traits/is_standard_layout.h +42 -0
  1079. cuda/cccl/headers/include/cuda/std/__type_traits/is_swappable.h +202 -0
  1080. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivial.h +42 -0
  1081. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_assignable.h +43 -0
  1082. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_constructible.h +43 -0
  1083. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_assignable.h +46 -0
  1084. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_constructible.h +45 -0
  1085. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copyable.h +42 -0
  1086. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_default_constructible.h +42 -0
  1087. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_destructible.h +58 -0
  1088. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_assignable.h +45 -0
  1089. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_constructible.h +44 -0
  1090. cuda/cccl/headers/include/cuda/std/__type_traits/is_unbounded_array.h +43 -0
  1091. cuda/cccl/headers/include/cuda/std/__type_traits/is_union.h +42 -0
  1092. cuda/cccl/headers/include/cuda/std/__type_traits/is_unsigned.h +66 -0
  1093. cuda/cccl/headers/include/cuda/std/__type_traits/is_unsigned_integer.h +59 -0
  1094. cuda/cccl/headers/include/cuda/std/__type_traits/is_valid_expansion.h +41 -0
  1095. cuda/cccl/headers/include/cuda/std/__type_traits/is_void.h +55 -0
  1096. cuda/cccl/headers/include/cuda/std/__type_traits/is_volatile.h +56 -0
  1097. cuda/cccl/headers/include/cuda/std/__type_traits/lazy.h +35 -0
  1098. cuda/cccl/headers/include/cuda/std/__type_traits/make_const_lvalue_ref.h +36 -0
  1099. cuda/cccl/headers/include/cuda/std/__type_traits/make_nbit_int.h +107 -0
  1100. cuda/cccl/headers/include/cuda/std/__type_traits/make_signed.h +140 -0
  1101. cuda/cccl/headers/include/cuda/std/__type_traits/make_unsigned.h +151 -0
  1102. cuda/cccl/headers/include/cuda/std/__type_traits/maybe_const.h +36 -0
  1103. cuda/cccl/headers/include/cuda/std/__type_traits/nat.h +39 -0
  1104. cuda/cccl/headers/include/cuda/std/__type_traits/negation.h +44 -0
  1105. cuda/cccl/headers/include/cuda/std/__type_traits/num_bits.h +122 -0
  1106. cuda/cccl/headers/include/cuda/std/__type_traits/promote.h +163 -0
  1107. cuda/cccl/headers/include/cuda/std/__type_traits/rank.h +60 -0
  1108. cuda/cccl/headers/include/cuda/std/__type_traits/reference_constructs_from_temporary.h +57 -0
  1109. cuda/cccl/headers/include/cuda/std/__type_traits/reference_converts_from_temporary.h +56 -0
  1110. cuda/cccl/headers/include/cuda/std/__type_traits/remove_all_extents.h +66 -0
  1111. cuda/cccl/headers/include/cuda/std/__type_traits/remove_const.h +59 -0
  1112. cuda/cccl/headers/include/cuda/std/__type_traits/remove_const_ref.h +37 -0
  1113. cuda/cccl/headers/include/cuda/std/__type_traits/remove_cv.h +57 -0
  1114. cuda/cccl/headers/include/cuda/std/__type_traits/remove_cvref.h +57 -0
  1115. cuda/cccl/headers/include/cuda/std/__type_traits/remove_extent.h +65 -0
  1116. cuda/cccl/headers/include/cuda/std/__type_traits/remove_pointer.h +73 -0
  1117. cuda/cccl/headers/include/cuda/std/__type_traits/remove_reference.h +72 -0
  1118. cuda/cccl/headers/include/cuda/std/__type_traits/remove_volatile.h +58 -0
  1119. cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +47 -0
  1120. cuda/cccl/headers/include/cuda/std/__type_traits/type_identity.h +40 -0
  1121. cuda/cccl/headers/include/cuda/std/__type_traits/type_list.h +1067 -0
  1122. cuda/cccl/headers/include/cuda/std/__type_traits/type_set.h +131 -0
  1123. cuda/cccl/headers/include/cuda/std/__type_traits/underlying_type.h +52 -0
  1124. cuda/cccl/headers/include/cuda/std/__type_traits/void_t.h +34 -0
  1125. cuda/cccl/headers/include/cuda/std/__utility/as_const.h +52 -0
  1126. cuda/cccl/headers/include/cuda/std/__utility/auto_cast.h +34 -0
  1127. cuda/cccl/headers/include/cuda/std/__utility/cmp.h +116 -0
  1128. cuda/cccl/headers/include/cuda/std/__utility/convert_to_integral.h +101 -0
  1129. cuda/cccl/headers/include/cuda/std/__utility/declval.h +76 -0
  1130. cuda/cccl/headers/include/cuda/std/__utility/exception_guard.h +161 -0
  1131. cuda/cccl/headers/include/cuda/std/__utility/exchange.h +46 -0
  1132. cuda/cccl/headers/include/cuda/std/__utility/forward.h +59 -0
  1133. cuda/cccl/headers/include/cuda/std/__utility/forward_like.h +55 -0
  1134. cuda/cccl/headers/include/cuda/std/__utility/in_place.h +86 -0
  1135. cuda/cccl/headers/include/cuda/std/__utility/integer_sequence.h +251 -0
  1136. cuda/cccl/headers/include/cuda/std/__utility/monostate.h +99 -0
  1137. cuda/cccl/headers/include/cuda/std/__utility/move.h +74 -0
  1138. cuda/cccl/headers/include/cuda/std/__utility/pair.h +791 -0
  1139. cuda/cccl/headers/include/cuda/std/__utility/piecewise_construct.h +37 -0
  1140. cuda/cccl/headers/include/cuda/std/__utility/pod_tuple.h +527 -0
  1141. cuda/cccl/headers/include/cuda/std/__utility/priority_tag.h +40 -0
  1142. cuda/cccl/headers/include/cuda/std/__utility/rel_ops.h +63 -0
  1143. cuda/cccl/headers/include/cuda/std/__utility/swap.h +64 -0
  1144. cuda/cccl/headers/include/cuda/std/__utility/to_underlying.h +40 -0
  1145. cuda/cccl/headers/include/cuda/std/__utility/typeid.h +421 -0
  1146. cuda/cccl/headers/include/cuda/std/__utility/undefined.h +34 -0
  1147. cuda/cccl/headers/include/cuda/std/__utility/unreachable.h +37 -0
  1148. cuda/cccl/headers/include/cuda/std/array +518 -0
  1149. cuda/cccl/headers/include/cuda/std/atomic +810 -0
  1150. cuda/cccl/headers/include/cuda/std/barrier +42 -0
  1151. cuda/cccl/headers/include/cuda/std/bit +35 -0
  1152. cuda/cccl/headers/include/cuda/std/bitset +994 -0
  1153. cuda/cccl/headers/include/cuda/std/cassert +28 -0
  1154. cuda/cccl/headers/include/cuda/std/ccomplex +15 -0
  1155. cuda/cccl/headers/include/cuda/std/cfloat +59 -0
  1156. cuda/cccl/headers/include/cuda/std/chrono +26 -0
  1157. cuda/cccl/headers/include/cuda/std/climits +61 -0
  1158. cuda/cccl/headers/include/cuda/std/cmath +87 -0
  1159. cuda/cccl/headers/include/cuda/std/complex +50 -0
  1160. cuda/cccl/headers/include/cuda/std/concepts +48 -0
  1161. cuda/cccl/headers/include/cuda/std/cstddef +28 -0
  1162. cuda/cccl/headers/include/cuda/std/cstdint +178 -0
  1163. cuda/cccl/headers/include/cuda/std/cstdlib +30 -0
  1164. cuda/cccl/headers/include/cuda/std/cstring +110 -0
  1165. cuda/cccl/headers/include/cuda/std/ctime +154 -0
  1166. cuda/cccl/headers/include/cuda/std/detail/__config +45 -0
  1167. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +207 -0
  1168. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/algorithm +1721 -0
  1169. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/chrono +2509 -0
  1170. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/iosfwd +128 -0
  1171. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/stdexcept +120 -0
  1172. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/tuple +1365 -0
  1173. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/variant +2144 -0
  1174. cuda/cccl/headers/include/cuda/std/execution +29 -0
  1175. cuda/cccl/headers/include/cuda/std/expected +30 -0
  1176. cuda/cccl/headers/include/cuda/std/functional +56 -0
  1177. cuda/cccl/headers/include/cuda/std/initializer_list +44 -0
  1178. cuda/cccl/headers/include/cuda/std/inplace_vector +2170 -0
  1179. cuda/cccl/headers/include/cuda/std/iterator +70 -0
  1180. cuda/cccl/headers/include/cuda/std/latch +34 -0
  1181. cuda/cccl/headers/include/cuda/std/limits +28 -0
  1182. cuda/cccl/headers/include/cuda/std/linalg +30 -0
  1183. cuda/cccl/headers/include/cuda/std/mdspan +38 -0
  1184. cuda/cccl/headers/include/cuda/std/memory +39 -0
  1185. cuda/cccl/headers/include/cuda/std/numbers +346 -0
  1186. cuda/cccl/headers/include/cuda/std/numeric +41 -0
  1187. cuda/cccl/headers/include/cuda/std/optional +31 -0
  1188. cuda/cccl/headers/include/cuda/std/ranges +69 -0
  1189. cuda/cccl/headers/include/cuda/std/ratio +416 -0
  1190. cuda/cccl/headers/include/cuda/std/semaphore +31 -0
  1191. cuda/cccl/headers/include/cuda/std/source_location +83 -0
  1192. cuda/cccl/headers/include/cuda/std/span +628 -0
  1193. cuda/cccl/headers/include/cuda/std/string_view +925 -0
  1194. cuda/cccl/headers/include/cuda/std/tuple +26 -0
  1195. cuda/cccl/headers/include/cuda/std/type_traits +177 -0
  1196. cuda/cccl/headers/include/cuda/std/utility +70 -0
  1197. cuda/cccl/headers/include/cuda/std/variant +25 -0
  1198. cuda/cccl/headers/include/cuda/std/version +240 -0
  1199. cuda/cccl/headers/include/cuda/stream +31 -0
  1200. cuda/cccl/headers/include/cuda/stream_ref +59 -0
  1201. cuda/cccl/headers/include/cuda/type_traits +27 -0
  1202. cuda/cccl/headers/include/cuda/utility +28 -0
  1203. cuda/cccl/headers/include/cuda/version +16 -0
  1204. cuda/cccl/headers/include/cuda/warp +28 -0
  1205. cuda/cccl/headers/include/cuda/work_stealing +26 -0
  1206. cuda/cccl/headers/include/nv/detail/__preprocessor +169 -0
  1207. cuda/cccl/headers/include/nv/detail/__target_macros +718 -0
  1208. cuda/cccl/headers/include/nv/target +240 -0
  1209. cuda/cccl/headers/include/thrust/addressof.h +22 -0
  1210. cuda/cccl/headers/include/thrust/adjacent_difference.h +254 -0
  1211. cuda/cccl/headers/include/thrust/advance.h +57 -0
  1212. cuda/cccl/headers/include/thrust/allocate_unique.h +299 -0
  1213. cuda/cccl/headers/include/thrust/binary_search.h +1910 -0
  1214. cuda/cccl/headers/include/thrust/complex.h +858 -0
  1215. cuda/cccl/headers/include/thrust/copy.h +506 -0
  1216. cuda/cccl/headers/include/thrust/count.h +245 -0
  1217. cuda/cccl/headers/include/thrust/detail/adjacent_difference.inl +95 -0
  1218. cuda/cccl/headers/include/thrust/detail/alignment.h +81 -0
  1219. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +626 -0
  1220. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +192 -0
  1221. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +96 -0
  1222. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +81 -0
  1223. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +78 -0
  1224. cuda/cccl/headers/include/thrust/detail/allocator/no_throw_allocator.h +76 -0
  1225. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +115 -0
  1226. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +116 -0
  1227. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.h +77 -0
  1228. cuda/cccl/headers/include/thrust/detail/allocator_aware_execution_policy.h +99 -0
  1229. cuda/cccl/headers/include/thrust/detail/binary_search.inl +525 -0
  1230. cuda/cccl/headers/include/thrust/detail/caching_allocator.h +47 -0
  1231. cuda/cccl/headers/include/thrust/detail/complex/arithmetic.h +255 -0
  1232. cuda/cccl/headers/include/thrust/detail/complex/c99math.h +64 -0
  1233. cuda/cccl/headers/include/thrust/detail/complex/catrig.h +875 -0
  1234. cuda/cccl/headers/include/thrust/detail/complex/catrigf.h +589 -0
  1235. cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +233 -0
  1236. cuda/cccl/headers/include/thrust/detail/complex/ccoshf.h +161 -0
  1237. cuda/cccl/headers/include/thrust/detail/complex/cexp.h +195 -0
  1238. cuda/cccl/headers/include/thrust/detail/complex/cexpf.h +173 -0
  1239. cuda/cccl/headers/include/thrust/detail/complex/clog.h +223 -0
  1240. cuda/cccl/headers/include/thrust/detail/complex/clogf.h +210 -0
  1241. cuda/cccl/headers/include/thrust/detail/complex/complex.inl +263 -0
  1242. cuda/cccl/headers/include/thrust/detail/complex/cpow.h +50 -0
  1243. cuda/cccl/headers/include/thrust/detail/complex/cproj.h +81 -0
  1244. cuda/cccl/headers/include/thrust/detail/complex/csinh.h +228 -0
  1245. cuda/cccl/headers/include/thrust/detail/complex/csinhf.h +168 -0
  1246. cuda/cccl/headers/include/thrust/detail/complex/csqrt.h +178 -0
  1247. cuda/cccl/headers/include/thrust/detail/complex/csqrtf.h +174 -0
  1248. cuda/cccl/headers/include/thrust/detail/complex/ctanh.h +208 -0
  1249. cuda/cccl/headers/include/thrust/detail/complex/ctanhf.h +133 -0
  1250. cuda/cccl/headers/include/thrust/detail/complex/math_private.h +138 -0
  1251. cuda/cccl/headers/include/thrust/detail/complex/stream.h +73 -0
  1252. cuda/cccl/headers/include/thrust/detail/config/compiler.h +38 -0
  1253. cuda/cccl/headers/include/thrust/detail/config/config.h +43 -0
  1254. cuda/cccl/headers/include/thrust/detail/config/cpp_dialect.h +78 -0
  1255. cuda/cccl/headers/include/thrust/detail/config/device_system.h +55 -0
  1256. cuda/cccl/headers/include/thrust/detail/config/host_system.h +48 -0
  1257. cuda/cccl/headers/include/thrust/detail/config/memory_resource.h +41 -0
  1258. cuda/cccl/headers/include/thrust/detail/config/namespace.h +162 -0
  1259. cuda/cccl/headers/include/thrust/detail/config/simple_defines.h +48 -0
  1260. cuda/cccl/headers/include/thrust/detail/config.h +36 -0
  1261. cuda/cccl/headers/include/thrust/detail/contiguous_storage.h +228 -0
  1262. cuda/cccl/headers/include/thrust/detail/contiguous_storage.inl +273 -0
  1263. cuda/cccl/headers/include/thrust/detail/copy.h +72 -0
  1264. cuda/cccl/headers/include/thrust/detail/copy.inl +129 -0
  1265. cuda/cccl/headers/include/thrust/detail/copy_if.h +62 -0
  1266. cuda/cccl/headers/include/thrust/detail/copy_if.inl +102 -0
  1267. cuda/cccl/headers/include/thrust/detail/count.h +55 -0
  1268. cuda/cccl/headers/include/thrust/detail/count.inl +89 -0
  1269. cuda/cccl/headers/include/thrust/detail/device_ptr.inl +48 -0
  1270. cuda/cccl/headers/include/thrust/detail/equal.inl +93 -0
  1271. cuda/cccl/headers/include/thrust/detail/event_error.h +160 -0
  1272. cuda/cccl/headers/include/thrust/detail/execute_with_allocator.h +81 -0
  1273. cuda/cccl/headers/include/thrust/detail/execute_with_allocator_fwd.h +61 -0
  1274. cuda/cccl/headers/include/thrust/detail/execution_policy.h +120 -0
  1275. cuda/cccl/headers/include/thrust/detail/extrema.inl +184 -0
  1276. cuda/cccl/headers/include/thrust/detail/fill.inl +86 -0
  1277. cuda/cccl/headers/include/thrust/detail/find.inl +113 -0
  1278. cuda/cccl/headers/include/thrust/detail/for_each.inl +84 -0
  1279. cuda/cccl/headers/include/thrust/detail/function.h +49 -0
  1280. cuda/cccl/headers/include/thrust/detail/functional/actor.h +214 -0
  1281. cuda/cccl/headers/include/thrust/detail/functional/operators.h +386 -0
  1282. cuda/cccl/headers/include/thrust/detail/gather.inl +173 -0
  1283. cuda/cccl/headers/include/thrust/detail/generate.inl +86 -0
  1284. cuda/cccl/headers/include/thrust/detail/get_iterator_value.h +62 -0
  1285. cuda/cccl/headers/include/thrust/detail/inner_product.inl +118 -0
  1286. cuda/cccl/headers/include/thrust/detail/internal_functional.h +328 -0
  1287. cuda/cccl/headers/include/thrust/detail/logical.inl +113 -0
  1288. cuda/cccl/headers/include/thrust/detail/malloc_and_free.h +77 -0
  1289. cuda/cccl/headers/include/thrust/detail/malloc_and_free_fwd.h +45 -0
  1290. cuda/cccl/headers/include/thrust/detail/memory_algorithms.h +209 -0
  1291. cuda/cccl/headers/include/thrust/detail/merge.inl +276 -0
  1292. cuda/cccl/headers/include/thrust/detail/mismatch.inl +94 -0
  1293. cuda/cccl/headers/include/thrust/detail/overlapped_copy.h +124 -0
  1294. cuda/cccl/headers/include/thrust/detail/partition.inl +378 -0
  1295. cuda/cccl/headers/include/thrust/detail/pointer.h +309 -0
  1296. cuda/cccl/headers/include/thrust/detail/preprocessor.h +652 -0
  1297. cuda/cccl/headers/include/thrust/detail/random_bijection.h +177 -0
  1298. cuda/cccl/headers/include/thrust/detail/range/head_flags.h +116 -0
  1299. cuda/cccl/headers/include/thrust/detail/range/tail_flags.h +130 -0
  1300. cuda/cccl/headers/include/thrust/detail/raw_pointer_cast.h +52 -0
  1301. cuda/cccl/headers/include/thrust/detail/raw_reference_cast.h +192 -0
  1302. cuda/cccl/headers/include/thrust/detail/reduce.inl +377 -0
  1303. cuda/cccl/headers/include/thrust/detail/reference.h +494 -0
  1304. cuda/cccl/headers/include/thrust/detail/reference_forward_declaration.h +35 -0
  1305. cuda/cccl/headers/include/thrust/detail/remove.inl +213 -0
  1306. cuda/cccl/headers/include/thrust/detail/replace.inl +231 -0
  1307. cuda/cccl/headers/include/thrust/detail/reverse.inl +88 -0
  1308. cuda/cccl/headers/include/thrust/detail/scan.inl +518 -0
  1309. cuda/cccl/headers/include/thrust/detail/scatter.inl +157 -0
  1310. cuda/cccl/headers/include/thrust/detail/seq.h +66 -0
  1311. cuda/cccl/headers/include/thrust/detail/sequence.inl +109 -0
  1312. cuda/cccl/headers/include/thrust/detail/set_operations.inl +981 -0
  1313. cuda/cccl/headers/include/thrust/detail/shuffle.inl +86 -0
  1314. cuda/cccl/headers/include/thrust/detail/sort.inl +373 -0
  1315. cuda/cccl/headers/include/thrust/detail/static_assert.h +58 -0
  1316. cuda/cccl/headers/include/thrust/detail/static_map.h +167 -0
  1317. cuda/cccl/headers/include/thrust/detail/swap_ranges.inl +65 -0
  1318. cuda/cccl/headers/include/thrust/detail/tabulate.inl +62 -0
  1319. cuda/cccl/headers/include/thrust/detail/temporary_array.h +153 -0
  1320. cuda/cccl/headers/include/thrust/detail/temporary_array.inl +120 -0
  1321. cuda/cccl/headers/include/thrust/detail/temporary_buffer.h +81 -0
  1322. cuda/cccl/headers/include/thrust/detail/transform_reduce.inl +69 -0
  1323. cuda/cccl/headers/include/thrust/detail/transform_scan.inl +161 -0
  1324. cuda/cccl/headers/include/thrust/detail/trivial_sequence.h +130 -0
  1325. cuda/cccl/headers/include/thrust/detail/tuple_meta_transform.h +61 -0
  1326. cuda/cccl/headers/include/thrust/detail/type_deduction.h +62 -0
  1327. cuda/cccl/headers/include/thrust/detail/type_traits/has_member_function.h +47 -0
  1328. cuda/cccl/headers/include/thrust/detail/type_traits/has_nested_type.h +43 -0
  1329. cuda/cccl/headers/include/thrust/detail/type_traits/is_call_possible.h +167 -0
  1330. cuda/cccl/headers/include/thrust/detail/type_traits/is_commutative.h +69 -0
  1331. cuda/cccl/headers/include/thrust/detail/type_traits/is_metafunction_defined.h +39 -0
  1332. cuda/cccl/headers/include/thrust/detail/type_traits/is_thrust_pointer.h +59 -0
  1333. cuda/cccl/headers/include/thrust/detail/type_traits/iterator/is_output_iterator.h +46 -0
  1334. cuda/cccl/headers/include/thrust/detail/type_traits/minimum_type.h +89 -0
  1335. cuda/cccl/headers/include/thrust/detail/type_traits/pointer_traits.h +332 -0
  1336. cuda/cccl/headers/include/thrust/detail/type_traits.h +136 -0
  1337. cuda/cccl/headers/include/thrust/detail/uninitialized_copy.inl +90 -0
  1338. cuda/cccl/headers/include/thrust/detail/uninitialized_fill.inl +86 -0
  1339. cuda/cccl/headers/include/thrust/detail/unique.inl +373 -0
  1340. cuda/cccl/headers/include/thrust/detail/use_default.h +34 -0
  1341. cuda/cccl/headers/include/thrust/detail/vector_base.h +613 -0
  1342. cuda/cccl/headers/include/thrust/detail/vector_base.inl +1210 -0
  1343. cuda/cccl/headers/include/thrust/device_allocator.h +134 -0
  1344. cuda/cccl/headers/include/thrust/device_delete.h +74 -0
  1345. cuda/cccl/headers/include/thrust/device_free.h +85 -0
  1346. cuda/cccl/headers/include/thrust/device_make_unique.h +56 -0
  1347. cuda/cccl/headers/include/thrust/device_malloc.h +84 -0
  1348. cuda/cccl/headers/include/thrust/device_malloc_allocator.h +190 -0
  1349. cuda/cccl/headers/include/thrust/device_new.h +112 -0
  1350. cuda/cccl/headers/include/thrust/device_new_allocator.h +179 -0
  1351. cuda/cccl/headers/include/thrust/device_ptr.h +196 -0
  1352. cuda/cccl/headers/include/thrust/device_reference.h +983 -0
  1353. cuda/cccl/headers/include/thrust/device_vector.h +576 -0
  1354. cuda/cccl/headers/include/thrust/distance.h +43 -0
  1355. cuda/cccl/headers/include/thrust/equal.h +247 -0
  1356. cuda/cccl/headers/include/thrust/execution_policy.h +251 -0
  1357. cuda/cccl/headers/include/thrust/extrema.h +657 -0
  1358. cuda/cccl/headers/include/thrust/fill.h +200 -0
  1359. cuda/cccl/headers/include/thrust/find.h +382 -0
  1360. cuda/cccl/headers/include/thrust/for_each.h +261 -0
  1361. cuda/cccl/headers/include/thrust/functional.h +395 -0
  1362. cuda/cccl/headers/include/thrust/gather.h +464 -0
  1363. cuda/cccl/headers/include/thrust/generate.h +193 -0
  1364. cuda/cccl/headers/include/thrust/host_vector.h +576 -0
  1365. cuda/cccl/headers/include/thrust/inner_product.h +264 -0
  1366. cuda/cccl/headers/include/thrust/iterator/constant_iterator.h +221 -0
  1367. cuda/cccl/headers/include/thrust/iterator/counting_iterator.h +335 -0
  1368. cuda/cccl/headers/include/thrust/iterator/detail/any_assign.h +48 -0
  1369. cuda/cccl/headers/include/thrust/iterator/detail/any_system_tag.h +43 -0
  1370. cuda/cccl/headers/include/thrust/iterator/detail/device_system_tag.h +38 -0
  1371. cuda/cccl/headers/include/thrust/iterator/detail/host_system_tag.h +38 -0
  1372. cuda/cccl/headers/include/thrust/iterator/detail/iterator_adaptor_base.h +81 -0
  1373. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_system.h +60 -0
  1374. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_traversal.h +65 -0
  1375. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h +57 -0
  1376. cuda/cccl/headers/include/thrust/iterator/detail/iterator_facade_category.h +182 -0
  1377. cuda/cccl/headers/include/thrust/iterator/detail/minimum_system.h +58 -0
  1378. cuda/cccl/headers/include/thrust/iterator/detail/normal_iterator.h +69 -0
  1379. cuda/cccl/headers/include/thrust/iterator/detail/retag.h +104 -0
  1380. cuda/cccl/headers/include/thrust/iterator/detail/tagged_iterator.h +81 -0
  1381. cuda/cccl/headers/include/thrust/iterator/detail/tuple_of_iterator_references.h +174 -0
  1382. cuda/cccl/headers/include/thrust/iterator/discard_iterator.h +163 -0
  1383. cuda/cccl/headers/include/thrust/iterator/iterator_adaptor.h +251 -0
  1384. cuda/cccl/headers/include/thrust/iterator/iterator_categories.h +211 -0
  1385. cuda/cccl/headers/include/thrust/iterator/iterator_facade.h +659 -0
  1386. cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +334 -0
  1387. cuda/cccl/headers/include/thrust/iterator/iterator_traversal_tags.h +64 -0
  1388. cuda/cccl/headers/include/thrust/iterator/offset_iterator.h +194 -0
  1389. cuda/cccl/headers/include/thrust/iterator/permutation_iterator.h +204 -0
  1390. cuda/cccl/headers/include/thrust/iterator/retag.h +72 -0
  1391. cuda/cccl/headers/include/thrust/iterator/reverse_iterator.h +51 -0
  1392. cuda/cccl/headers/include/thrust/iterator/shuffle_iterator.h +185 -0
  1393. cuda/cccl/headers/include/thrust/iterator/strided_iterator.h +152 -0
  1394. cuda/cccl/headers/include/thrust/iterator/tabulate_output_iterator.h +152 -0
  1395. cuda/cccl/headers/include/thrust/iterator/transform_input_output_iterator.h +226 -0
  1396. cuda/cccl/headers/include/thrust/iterator/transform_iterator.h +351 -0
  1397. cuda/cccl/headers/include/thrust/iterator/transform_output_iterator.h +190 -0
  1398. cuda/cccl/headers/include/thrust/iterator/zip_iterator.h +359 -0
  1399. cuda/cccl/headers/include/thrust/logical.h +290 -0
  1400. cuda/cccl/headers/include/thrust/memory.h +299 -0
  1401. cuda/cccl/headers/include/thrust/merge.h +725 -0
  1402. cuda/cccl/headers/include/thrust/mismatch.h +261 -0
  1403. cuda/cccl/headers/include/thrust/mr/allocator.h +229 -0
  1404. cuda/cccl/headers/include/thrust/mr/device_memory_resource.h +41 -0
  1405. cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +528 -0
  1406. cuda/cccl/headers/include/thrust/mr/disjoint_sync_pool.h +118 -0
  1407. cuda/cccl/headers/include/thrust/mr/disjoint_tls_pool.h +67 -0
  1408. cuda/cccl/headers/include/thrust/mr/fancy_pointer_resource.h +67 -0
  1409. cuda/cccl/headers/include/thrust/mr/host_memory_resource.h +38 -0
  1410. cuda/cccl/headers/include/thrust/mr/memory_resource.h +217 -0
  1411. cuda/cccl/headers/include/thrust/mr/new.h +100 -0
  1412. cuda/cccl/headers/include/thrust/mr/polymorphic_adaptor.h +63 -0
  1413. cuda/cccl/headers/include/thrust/mr/pool.h +528 -0
  1414. cuda/cccl/headers/include/thrust/mr/pool_options.h +174 -0
  1415. cuda/cccl/headers/include/thrust/mr/sync_pool.h +114 -0
  1416. cuda/cccl/headers/include/thrust/mr/tls_pool.h +64 -0
  1417. cuda/cccl/headers/include/thrust/mr/universal_memory_resource.h +29 -0
  1418. cuda/cccl/headers/include/thrust/mr/validator.h +56 -0
  1419. cuda/cccl/headers/include/thrust/pair.h +99 -0
  1420. cuda/cccl/headers/include/thrust/partition.h +1391 -0
  1421. cuda/cccl/headers/include/thrust/per_device_resource.h +98 -0
  1422. cuda/cccl/headers/include/thrust/random/detail/discard_block_engine.inl +184 -0
  1423. cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine.inl +155 -0
  1424. cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine_discard.h +104 -0
  1425. cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine.inl +151 -0
  1426. cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h +53 -0
  1427. cuda/cccl/headers/include/thrust/random/detail/mod.h +101 -0
  1428. cuda/cccl/headers/include/thrust/random/detail/normal_distribution.inl +187 -0
  1429. cuda/cccl/headers/include/thrust/random/detail/normal_distribution_base.h +160 -0
  1430. cuda/cccl/headers/include/thrust/random/detail/random_core_access.h +63 -0
  1431. cuda/cccl/headers/include/thrust/random/detail/subtract_with_carry_engine.inl +201 -0
  1432. cuda/cccl/headers/include/thrust/random/detail/uniform_int_distribution.inl +198 -0
  1433. cuda/cccl/headers/include/thrust/random/detail/uniform_real_distribution.inl +200 -0
  1434. cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine.inl +183 -0
  1435. cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine_max.h +187 -0
  1436. cuda/cccl/headers/include/thrust/random/discard_block_engine.h +240 -0
  1437. cuda/cccl/headers/include/thrust/random/linear_congruential_engine.h +289 -0
  1438. cuda/cccl/headers/include/thrust/random/linear_feedback_shift_engine.h +217 -0
  1439. cuda/cccl/headers/include/thrust/random/normal_distribution.h +257 -0
  1440. cuda/cccl/headers/include/thrust/random/subtract_with_carry_engine.h +247 -0
  1441. cuda/cccl/headers/include/thrust/random/uniform_int_distribution.h +261 -0
  1442. cuda/cccl/headers/include/thrust/random/uniform_real_distribution.h +258 -0
  1443. cuda/cccl/headers/include/thrust/random/xor_combine_engine.h +255 -0
  1444. cuda/cccl/headers/include/thrust/random.h +120 -0
  1445. cuda/cccl/headers/include/thrust/reduce.h +1113 -0
  1446. cuda/cccl/headers/include/thrust/remove.h +768 -0
  1447. cuda/cccl/headers/include/thrust/replace.h +826 -0
  1448. cuda/cccl/headers/include/thrust/reverse.h +215 -0
  1449. cuda/cccl/headers/include/thrust/scan.h +1671 -0
  1450. cuda/cccl/headers/include/thrust/scatter.h +446 -0
  1451. cuda/cccl/headers/include/thrust/sequence.h +277 -0
  1452. cuda/cccl/headers/include/thrust/set_operations.h +3026 -0
  1453. cuda/cccl/headers/include/thrust/shuffle.h +182 -0
  1454. cuda/cccl/headers/include/thrust/sort.h +1320 -0
  1455. cuda/cccl/headers/include/thrust/swap.h +147 -0
  1456. cuda/cccl/headers/include/thrust/system/cpp/detail/adjacent_difference.h +30 -0
  1457. cuda/cccl/headers/include/thrust/system/cpp/detail/assign_value.h +30 -0
  1458. cuda/cccl/headers/include/thrust/system/cpp/detail/binary_search.h +32 -0
  1459. cuda/cccl/headers/include/thrust/system/cpp/detail/copy.h +30 -0
  1460. cuda/cccl/headers/include/thrust/system/cpp/detail/copy_if.h +30 -0
  1461. cuda/cccl/headers/include/thrust/system/cpp/detail/count.h +29 -0
  1462. cuda/cccl/headers/include/thrust/system/cpp/detail/equal.h +29 -0
  1463. cuda/cccl/headers/include/thrust/system/cpp/detail/execution_policy.h +109 -0
  1464. cuda/cccl/headers/include/thrust/system/cpp/detail/extrema.h +30 -0
  1465. cuda/cccl/headers/include/thrust/system/cpp/detail/fill.h +29 -0
  1466. cuda/cccl/headers/include/thrust/system/cpp/detail/find.h +30 -0
  1467. cuda/cccl/headers/include/thrust/system/cpp/detail/for_each.h +30 -0
  1468. cuda/cccl/headers/include/thrust/system/cpp/detail/gather.h +29 -0
  1469. cuda/cccl/headers/include/thrust/system/cpp/detail/generate.h +29 -0
  1470. cuda/cccl/headers/include/thrust/system/cpp/detail/get_value.h +30 -0
  1471. cuda/cccl/headers/include/thrust/system/cpp/detail/inner_product.h +29 -0
  1472. cuda/cccl/headers/include/thrust/system/cpp/detail/iter_swap.h +30 -0
  1473. cuda/cccl/headers/include/thrust/system/cpp/detail/logical.h +29 -0
  1474. cuda/cccl/headers/include/thrust/system/cpp/detail/malloc_and_free.h +30 -0
  1475. cuda/cccl/headers/include/thrust/system/cpp/detail/memory.inl +60 -0
  1476. cuda/cccl/headers/include/thrust/system/cpp/detail/merge.h +30 -0
  1477. cuda/cccl/headers/include/thrust/system/cpp/detail/mismatch.h +29 -0
  1478. cuda/cccl/headers/include/thrust/system/cpp/detail/partition.h +30 -0
  1479. cuda/cccl/headers/include/thrust/system/cpp/detail/per_device_resource.h +29 -0
  1480. cuda/cccl/headers/include/thrust/system/cpp/detail/reduce.h +30 -0
  1481. cuda/cccl/headers/include/thrust/system/cpp/detail/reduce_by_key.h +30 -0
  1482. cuda/cccl/headers/include/thrust/system/cpp/detail/remove.h +30 -0
  1483. cuda/cccl/headers/include/thrust/system/cpp/detail/replace.h +29 -0
  1484. cuda/cccl/headers/include/thrust/system/cpp/detail/reverse.h +29 -0
  1485. cuda/cccl/headers/include/thrust/system/cpp/detail/scan.h +30 -0
  1486. cuda/cccl/headers/include/thrust/system/cpp/detail/scan_by_key.h +30 -0
  1487. cuda/cccl/headers/include/thrust/system/cpp/detail/scatter.h +29 -0
  1488. cuda/cccl/headers/include/thrust/system/cpp/detail/sequence.h +29 -0
  1489. cuda/cccl/headers/include/thrust/system/cpp/detail/set_operations.h +30 -0
  1490. cuda/cccl/headers/include/thrust/system/cpp/detail/sort.h +30 -0
  1491. cuda/cccl/headers/include/thrust/system/cpp/detail/swap_ranges.h +29 -0
  1492. cuda/cccl/headers/include/thrust/system/cpp/detail/tabulate.h +29 -0
  1493. cuda/cccl/headers/include/thrust/system/cpp/detail/temporary_buffer.h +29 -0
  1494. cuda/cccl/headers/include/thrust/system/cpp/detail/transform.h +29 -0
  1495. cuda/cccl/headers/include/thrust/system/cpp/detail/transform_reduce.h +29 -0
  1496. cuda/cccl/headers/include/thrust/system/cpp/detail/transform_scan.h +29 -0
  1497. cuda/cccl/headers/include/thrust/system/cpp/detail/uninitialized_copy.h +29 -0
  1498. cuda/cccl/headers/include/thrust/system/cpp/detail/uninitialized_fill.h +29 -0
  1499. cuda/cccl/headers/include/thrust/system/cpp/detail/unique.h +30 -0
  1500. cuda/cccl/headers/include/thrust/system/cpp/detail/unique_by_key.h +30 -0
  1501. cuda/cccl/headers/include/thrust/system/cpp/execution_policy.h +63 -0
  1502. cuda/cccl/headers/include/thrust/system/cpp/memory.h +106 -0
  1503. cuda/cccl/headers/include/thrust/system/cpp/memory_resource.h +72 -0
  1504. cuda/cccl/headers/include/thrust/system/cpp/pointer.h +120 -0
  1505. cuda/cccl/headers/include/thrust/system/cpp/vector.h +96 -0
  1506. cuda/cccl/headers/include/thrust/system/cuda/config.h +126 -0
  1507. cuda/cccl/headers/include/thrust/system/cuda/detail/adjacent_difference.h +219 -0
  1508. cuda/cccl/headers/include/thrust/system/cuda/detail/assign_value.h +124 -0
  1509. cuda/cccl/headers/include/thrust/system/cuda/detail/binary_search.h +29 -0
  1510. cuda/cccl/headers/include/thrust/system/cuda/detail/cdp_dispatch.h +72 -0
  1511. cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +273 -0
  1512. cuda/cccl/headers/include/thrust/system/cuda/detail/copy_if.h +255 -0
  1513. cuda/cccl/headers/include/thrust/system/cuda/detail/core/agent_launcher.h +289 -0
  1514. cuda/cccl/headers/include/thrust/system/cuda/detail/core/triple_chevron_launch.h +191 -0
  1515. cuda/cccl/headers/include/thrust/system/cuda/detail/core/util.h +593 -0
  1516. cuda/cccl/headers/include/thrust/system/cuda/detail/count.h +75 -0
  1517. cuda/cccl/headers/include/thrust/system/cuda/detail/cross_system.h +243 -0
  1518. cuda/cccl/headers/include/thrust/system/cuda/detail/dispatch.h +233 -0
  1519. cuda/cccl/headers/include/thrust/system/cuda/detail/equal.h +64 -0
  1520. cuda/cccl/headers/include/thrust/system/cuda/detail/error.inl +96 -0
  1521. cuda/cccl/headers/include/thrust/system/cuda/detail/execution_policy.h +264 -0
  1522. cuda/cccl/headers/include/thrust/system/cuda/detail/extrema.h +476 -0
  1523. cuda/cccl/headers/include/thrust/system/cuda/detail/fill.h +100 -0
  1524. cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +170 -0
  1525. cuda/cccl/headers/include/thrust/system/cuda/detail/for_each.h +83 -0
  1526. cuda/cccl/headers/include/thrust/system/cuda/detail/gather.h +91 -0
  1527. cuda/cccl/headers/include/thrust/system/cuda/detail/generate.h +60 -0
  1528. cuda/cccl/headers/include/thrust/system/cuda/detail/get_value.h +65 -0
  1529. cuda/cccl/headers/include/thrust/system/cuda/detail/inner_product.h +75 -0
  1530. cuda/cccl/headers/include/thrust/system/cuda/detail/iter_swap.h +80 -0
  1531. cuda/cccl/headers/include/thrust/system/cuda/detail/logical.h +29 -0
  1532. cuda/cccl/headers/include/thrust/system/cuda/detail/make_unsigned_special.h +61 -0
  1533. cuda/cccl/headers/include/thrust/system/cuda/detail/malloc_and_free.h +121 -0
  1534. cuda/cccl/headers/include/thrust/system/cuda/detail/memory.inl +57 -0
  1535. cuda/cccl/headers/include/thrust/system/cuda/detail/merge.h +228 -0
  1536. cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +223 -0
  1537. cuda/cccl/headers/include/thrust/system/cuda/detail/parallel_for.h +81 -0
  1538. cuda/cccl/headers/include/thrust/system/cuda/detail/partition.h +405 -0
  1539. cuda/cccl/headers/include/thrust/system/cuda/detail/per_device_resource.h +72 -0
  1540. cuda/cccl/headers/include/thrust/system/cuda/detail/reduce.h +785 -0
  1541. cuda/cccl/headers/include/thrust/system/cuda/detail/reduce_by_key.h +1001 -0
  1542. cuda/cccl/headers/include/thrust/system/cuda/detail/remove.h +107 -0
  1543. cuda/cccl/headers/include/thrust/system/cuda/detail/replace.h +122 -0
  1544. cuda/cccl/headers/include/thrust/system/cuda/detail/reverse.h +87 -0
  1545. cuda/cccl/headers/include/thrust/system/cuda/detail/scan.h +341 -0
  1546. cuda/cccl/headers/include/thrust/system/cuda/detail/scan_by_key.h +414 -0
  1547. cuda/cccl/headers/include/thrust/system/cuda/detail/scatter.h +91 -0
  1548. cuda/cccl/headers/include/thrust/system/cuda/detail/sequence.h +29 -0
  1549. cuda/cccl/headers/include/thrust/system/cuda/detail/set_operations.h +1734 -0
  1550. cuda/cccl/headers/include/thrust/system/cuda/detail/sort.h +469 -0
  1551. cuda/cccl/headers/include/thrust/system/cuda/detail/swap_ranges.h +98 -0
  1552. cuda/cccl/headers/include/thrust/system/cuda/detail/tabulate.h +61 -0
  1553. cuda/cccl/headers/include/thrust/system/cuda/detail/temporary_buffer.h +132 -0
  1554. cuda/cccl/headers/include/thrust/system/cuda/detail/terminate.h +53 -0
  1555. cuda/cccl/headers/include/thrust/system/cuda/detail/transform.h +429 -0
  1556. cuda/cccl/headers/include/thrust/system/cuda/detail/transform_reduce.h +143 -0
  1557. cuda/cccl/headers/include/thrust/system/cuda/detail/transform_scan.h +119 -0
  1558. cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_copy.h +117 -0
  1559. cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_fill.h +105 -0
  1560. cuda/cccl/headers/include/thrust/system/cuda/detail/unique.h +289 -0
  1561. cuda/cccl/headers/include/thrust/system/cuda/detail/unique_by_key.h +310 -0
  1562. cuda/cccl/headers/include/thrust/system/cuda/detail/util.h +253 -0
  1563. cuda/cccl/headers/include/thrust/system/cuda/error.h +168 -0
  1564. cuda/cccl/headers/include/thrust/system/cuda/execution_policy.h +15 -0
  1565. cuda/cccl/headers/include/thrust/system/cuda/memory.h +122 -0
  1566. cuda/cccl/headers/include/thrust/system/cuda/memory_resource.h +122 -0
  1567. cuda/cccl/headers/include/thrust/system/cuda/pointer.h +160 -0
  1568. cuda/cccl/headers/include/thrust/system/cuda/vector.h +108 -0
  1569. cuda/cccl/headers/include/thrust/system/detail/adl/adjacent_difference.h +51 -0
  1570. cuda/cccl/headers/include/thrust/system/detail/adl/assign_value.h +51 -0
  1571. cuda/cccl/headers/include/thrust/system/detail/adl/binary_search.h +51 -0
  1572. cuda/cccl/headers/include/thrust/system/detail/adl/copy.h +51 -0
  1573. cuda/cccl/headers/include/thrust/system/detail/adl/copy_if.h +52 -0
  1574. cuda/cccl/headers/include/thrust/system/detail/adl/count.h +51 -0
  1575. cuda/cccl/headers/include/thrust/system/detail/adl/equal.h +51 -0
  1576. cuda/cccl/headers/include/thrust/system/detail/adl/extrema.h +51 -0
  1577. cuda/cccl/headers/include/thrust/system/detail/adl/fill.h +51 -0
  1578. cuda/cccl/headers/include/thrust/system/detail/adl/find.h +51 -0
  1579. cuda/cccl/headers/include/thrust/system/detail/adl/for_each.h +51 -0
  1580. cuda/cccl/headers/include/thrust/system/detail/adl/gather.h +51 -0
  1581. cuda/cccl/headers/include/thrust/system/detail/adl/generate.h +51 -0
  1582. cuda/cccl/headers/include/thrust/system/detail/adl/get_value.h +51 -0
  1583. cuda/cccl/headers/include/thrust/system/detail/adl/inner_product.h +51 -0
  1584. cuda/cccl/headers/include/thrust/system/detail/adl/iter_swap.h +51 -0
  1585. cuda/cccl/headers/include/thrust/system/detail/adl/logical.h +51 -0
  1586. cuda/cccl/headers/include/thrust/system/detail/adl/malloc_and_free.h +51 -0
  1587. cuda/cccl/headers/include/thrust/system/detail/adl/merge.h +51 -0
  1588. cuda/cccl/headers/include/thrust/system/detail/adl/mismatch.h +51 -0
  1589. cuda/cccl/headers/include/thrust/system/detail/adl/partition.h +51 -0
  1590. cuda/cccl/headers/include/thrust/system/detail/adl/per_device_resource.h +51 -0
  1591. cuda/cccl/headers/include/thrust/system/detail/adl/reduce.h +51 -0
  1592. cuda/cccl/headers/include/thrust/system/detail/adl/reduce_by_key.h +51 -0
  1593. cuda/cccl/headers/include/thrust/system/detail/adl/remove.h +51 -0
  1594. cuda/cccl/headers/include/thrust/system/detail/adl/replace.h +51 -0
  1595. cuda/cccl/headers/include/thrust/system/detail/adl/reverse.h +51 -0
  1596. cuda/cccl/headers/include/thrust/system/detail/adl/scan.h +51 -0
  1597. cuda/cccl/headers/include/thrust/system/detail/adl/scan_by_key.h +51 -0
  1598. cuda/cccl/headers/include/thrust/system/detail/adl/scatter.h +51 -0
  1599. cuda/cccl/headers/include/thrust/system/detail/adl/sequence.h +51 -0
  1600. cuda/cccl/headers/include/thrust/system/detail/adl/set_operations.h +51 -0
  1601. cuda/cccl/headers/include/thrust/system/detail/adl/sort.h +51 -0
  1602. cuda/cccl/headers/include/thrust/system/detail/adl/swap_ranges.h +51 -0
  1603. cuda/cccl/headers/include/thrust/system/detail/adl/tabulate.h +51 -0
  1604. cuda/cccl/headers/include/thrust/system/detail/adl/temporary_buffer.h +51 -0
  1605. cuda/cccl/headers/include/thrust/system/detail/adl/transform.h +51 -0
  1606. cuda/cccl/headers/include/thrust/system/detail/adl/transform_reduce.h +51 -0
  1607. cuda/cccl/headers/include/thrust/system/detail/adl/transform_scan.h +51 -0
  1608. cuda/cccl/headers/include/thrust/system/detail/adl/uninitialized_copy.h +51 -0
  1609. cuda/cccl/headers/include/thrust/system/detail/adl/uninitialized_fill.h +51 -0
  1610. cuda/cccl/headers/include/thrust/system/detail/adl/unique.h +51 -0
  1611. cuda/cccl/headers/include/thrust/system/detail/adl/unique_by_key.h +51 -0
  1612. cuda/cccl/headers/include/thrust/system/detail/bad_alloc.h +61 -0
  1613. cuda/cccl/headers/include/thrust/system/detail/errno.h +120 -0
  1614. cuda/cccl/headers/include/thrust/system/detail/error_category.inl +302 -0
  1615. cuda/cccl/headers/include/thrust/system/detail/error_code.inl +173 -0
  1616. cuda/cccl/headers/include/thrust/system/detail/error_condition.inl +121 -0
  1617. cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.h +53 -0
  1618. cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.inl +79 -0
  1619. cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.h +161 -0
  1620. cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.inl +384 -0
  1621. cuda/cccl/headers/include/thrust/system/detail/generic/copy.h +45 -0
  1622. cuda/cccl/headers/include/thrust/system/detail/generic/copy.inl +64 -0
  1623. cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.h +58 -0
  1624. cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.inl +146 -0
  1625. cuda/cccl/headers/include/thrust/system/detail/generic/count.h +48 -0
  1626. cuda/cccl/headers/include/thrust/system/detail/generic/count.inl +84 -0
  1627. cuda/cccl/headers/include/thrust/system/detail/generic/equal.h +49 -0
  1628. cuda/cccl/headers/include/thrust/system/detail/generic/equal.inl +60 -0
  1629. cuda/cccl/headers/include/thrust/system/detail/generic/extrema.h +66 -0
  1630. cuda/cccl/headers/include/thrust/system/detail/generic/extrema.inl +252 -0
  1631. cuda/cccl/headers/include/thrust/system/detail/generic/fill.h +54 -0
  1632. cuda/cccl/headers/include/thrust/system/detail/generic/find.h +49 -0
  1633. cuda/cccl/headers/include/thrust/system/detail/generic/find.inl +137 -0
  1634. cuda/cccl/headers/include/thrust/system/detail/generic/for_each.h +58 -0
  1635. cuda/cccl/headers/include/thrust/system/detail/generic/gather.h +73 -0
  1636. cuda/cccl/headers/include/thrust/system/detail/generic/gather.inl +96 -0
  1637. cuda/cccl/headers/include/thrust/system/detail/generic/generate.h +45 -0
  1638. cuda/cccl/headers/include/thrust/system/detail/generic/generate.inl +63 -0
  1639. cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.h +60 -0
  1640. cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.inl +72 -0
  1641. cuda/cccl/headers/include/thrust/system/detail/generic/logical.h +59 -0
  1642. cuda/cccl/headers/include/thrust/system/detail/generic/memory.h +64 -0
  1643. cuda/cccl/headers/include/thrust/system/detail/generic/memory.inl +86 -0
  1644. cuda/cccl/headers/include/thrust/system/detail/generic/merge.h +99 -0
  1645. cuda/cccl/headers/include/thrust/system/detail/generic/merge.inl +148 -0
  1646. cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.h +49 -0
  1647. cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.inl +68 -0
  1648. cuda/cccl/headers/include/thrust/system/detail/generic/partition.h +129 -0
  1649. cuda/cccl/headers/include/thrust/system/detail/generic/partition.inl +207 -0
  1650. cuda/cccl/headers/include/thrust/system/detail/generic/per_device_resource.h +43 -0
  1651. cuda/cccl/headers/include/thrust/system/detail/generic/reduce.h +71 -0
  1652. cuda/cccl/headers/include/thrust/system/detail/generic/reduce.inl +100 -0
  1653. cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.h +83 -0
  1654. cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.inl +186 -0
  1655. cuda/cccl/headers/include/thrust/system/detail/generic/remove.h +86 -0
  1656. cuda/cccl/headers/include/thrust/system/detail/generic/remove.inl +121 -0
  1657. cuda/cccl/headers/include/thrust/system/detail/generic/replace.h +95 -0
  1658. cuda/cccl/headers/include/thrust/system/detail/generic/replace.inl +175 -0
  1659. cuda/cccl/headers/include/thrust/system/detail/generic/reverse.h +48 -0
  1660. cuda/cccl/headers/include/thrust/system/detail/generic/reverse.inl +67 -0
  1661. cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.h +63 -0
  1662. cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.inl +126 -0
  1663. cuda/cccl/headers/include/thrust/system/detail/generic/scan.h +72 -0
  1664. cuda/cccl/headers/include/thrust/system/detail/generic/scan.inl +85 -0
  1665. cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.h +126 -0
  1666. cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.inl +232 -0
  1667. cuda/cccl/headers/include/thrust/system/detail/generic/scatter.h +73 -0
  1668. cuda/cccl/headers/include/thrust/system/detail/generic/scatter.inl +85 -0
  1669. cuda/cccl/headers/include/thrust/system/detail/generic/select_system.h +104 -0
  1670. cuda/cccl/headers/include/thrust/system/detail/generic/sequence.h +70 -0
  1671. cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.h +282 -0
  1672. cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.inl +476 -0
  1673. cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.h +54 -0
  1674. cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.inl +125 -0
  1675. cuda/cccl/headers/include/thrust/system/detail/generic/sort.h +113 -0
  1676. cuda/cccl/headers/include/thrust/system/detail/generic/sort.inl +175 -0
  1677. cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.h +44 -0
  1678. cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.inl +76 -0
  1679. cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.h +41 -0
  1680. cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.inl +54 -0
  1681. cuda/cccl/headers/include/thrust/system/detail/generic/tag.h +47 -0
  1682. cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.h +54 -0
  1683. cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.inl +82 -0
  1684. cuda/cccl/headers/include/thrust/system/detail/generic/transform.h +395 -0
  1685. cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.h +50 -0
  1686. cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.inl +56 -0
  1687. cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.h +80 -0
  1688. cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.inl +113 -0
  1689. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.h +45 -0
  1690. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.inl +166 -0
  1691. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.h +45 -0
  1692. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.inl +115 -0
  1693. cuda/cccl/headers/include/thrust/system/detail/generic/unique.h +71 -0
  1694. cuda/cccl/headers/include/thrust/system/detail/generic/unique.inl +113 -0
  1695. cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.h +81 -0
  1696. cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.inl +126 -0
  1697. cuda/cccl/headers/include/thrust/system/detail/internal/decompose.h +117 -0
  1698. cuda/cccl/headers/include/thrust/system/detail/sequential/adjacent_difference.h +70 -0
  1699. cuda/cccl/headers/include/thrust/system/detail/sequential/assign_value.h +42 -0
  1700. cuda/cccl/headers/include/thrust/system/detail/sequential/binary_search.h +136 -0
  1701. cuda/cccl/headers/include/thrust/system/detail/sequential/copy.h +49 -0
  1702. cuda/cccl/headers/include/thrust/system/detail/sequential/copy.inl +119 -0
  1703. cuda/cccl/headers/include/thrust/system/detail/sequential/copy_backward.h +49 -0
  1704. cuda/cccl/headers/include/thrust/system/detail/sequential/copy_if.h +71 -0
  1705. cuda/cccl/headers/include/thrust/system/detail/sequential/count.h +29 -0
  1706. cuda/cccl/headers/include/thrust/system/detail/sequential/equal.h +29 -0
  1707. cuda/cccl/headers/include/thrust/system/detail/sequential/execution_policy.h +52 -0
  1708. cuda/cccl/headers/include/thrust/system/detail/sequential/extrema.h +110 -0
  1709. cuda/cccl/headers/include/thrust/system/detail/sequential/fill.h +29 -0
  1710. cuda/cccl/headers/include/thrust/system/detail/sequential/find.h +62 -0
  1711. cuda/cccl/headers/include/thrust/system/detail/sequential/for_each.h +74 -0
  1712. cuda/cccl/headers/include/thrust/system/detail/sequential/gather.h +29 -0
  1713. cuda/cccl/headers/include/thrust/system/detail/sequential/general_copy.h +123 -0
  1714. cuda/cccl/headers/include/thrust/system/detail/sequential/generate.h +29 -0
  1715. cuda/cccl/headers/include/thrust/system/detail/sequential/get_value.h +43 -0
  1716. cuda/cccl/headers/include/thrust/system/detail/sequential/inner_product.h +29 -0
  1717. cuda/cccl/headers/include/thrust/system/detail/sequential/insertion_sort.h +141 -0
  1718. cuda/cccl/headers/include/thrust/system/detail/sequential/iter_swap.h +45 -0
  1719. cuda/cccl/headers/include/thrust/system/detail/sequential/logical.h +29 -0
  1720. cuda/cccl/headers/include/thrust/system/detail/sequential/malloc_and_free.h +50 -0
  1721. cuda/cccl/headers/include/thrust/system/detail/sequential/merge.h +75 -0
  1722. cuda/cccl/headers/include/thrust/system/detail/sequential/merge.inl +145 -0
  1723. cuda/cccl/headers/include/thrust/system/detail/sequential/mismatch.h +29 -0
  1724. cuda/cccl/headers/include/thrust/system/detail/sequential/partition.h +301 -0
  1725. cuda/cccl/headers/include/thrust/system/detail/sequential/per_device_resource.h +29 -0
  1726. cuda/cccl/headers/include/thrust/system/detail/sequential/reduce.h +64 -0
  1727. cuda/cccl/headers/include/thrust/system/detail/sequential/reduce_by_key.h +98 -0
  1728. cuda/cccl/headers/include/thrust/system/detail/sequential/remove.h +179 -0
  1729. cuda/cccl/headers/include/thrust/system/detail/sequential/replace.h +29 -0
  1730. cuda/cccl/headers/include/thrust/system/detail/sequential/reverse.h +29 -0
  1731. cuda/cccl/headers/include/thrust/system/detail/sequential/scan.h +154 -0
  1732. cuda/cccl/headers/include/thrust/system/detail/sequential/scan_by_key.h +145 -0
  1733. cuda/cccl/headers/include/thrust/system/detail/sequential/scatter.h +29 -0
  1734. cuda/cccl/headers/include/thrust/system/detail/sequential/sequence.h +29 -0
  1735. cuda/cccl/headers/include/thrust/system/detail/sequential/set_operations.h +206 -0
  1736. cuda/cccl/headers/include/thrust/system/detail/sequential/sort.h +59 -0
  1737. cuda/cccl/headers/include/thrust/system/detail/sequential/sort.inl +116 -0
  1738. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.h +55 -0
  1739. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.inl +356 -0
  1740. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.h +48 -0
  1741. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.inl +124 -0
  1742. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.h +48 -0
  1743. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.inl +586 -0
  1744. cuda/cccl/headers/include/thrust/system/detail/sequential/swap_ranges.h +29 -0
  1745. cuda/cccl/headers/include/thrust/system/detail/sequential/tabulate.h +29 -0
  1746. cuda/cccl/headers/include/thrust/system/detail/sequential/temporary_buffer.h +29 -0
  1747. cuda/cccl/headers/include/thrust/system/detail/sequential/transform.h +29 -0
  1748. cuda/cccl/headers/include/thrust/system/detail/sequential/transform_reduce.h +29 -0
  1749. cuda/cccl/headers/include/thrust/system/detail/sequential/transform_scan.h +29 -0
  1750. cuda/cccl/headers/include/thrust/system/detail/sequential/trivial_copy.h +58 -0
  1751. cuda/cccl/headers/include/thrust/system/detail/sequential/uninitialized_copy.h +29 -0
  1752. cuda/cccl/headers/include/thrust/system/detail/sequential/uninitialized_fill.h +29 -0
  1753. cuda/cccl/headers/include/thrust/system/detail/sequential/unique.h +115 -0
  1754. cuda/cccl/headers/include/thrust/system/detail/sequential/unique_by_key.h +106 -0
  1755. cuda/cccl/headers/include/thrust/system/detail/system_error.inl +108 -0
  1756. cuda/cccl/headers/include/thrust/system/error_code.h +512 -0
  1757. cuda/cccl/headers/include/thrust/system/omp/detail/adjacent_difference.h +54 -0
  1758. cuda/cccl/headers/include/thrust/system/omp/detail/assign_value.h +30 -0
  1759. cuda/cccl/headers/include/thrust/system/omp/detail/binary_search.h +77 -0
  1760. cuda/cccl/headers/include/thrust/system/omp/detail/copy.h +50 -0
  1761. cuda/cccl/headers/include/thrust/system/omp/detail/copy.inl +74 -0
  1762. cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.h +56 -0
  1763. cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.inl +59 -0
  1764. cuda/cccl/headers/include/thrust/system/omp/detail/count.h +30 -0
  1765. cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.h +50 -0
  1766. cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.inl +65 -0
  1767. cuda/cccl/headers/include/thrust/system/omp/detail/equal.h +30 -0
  1768. cuda/cccl/headers/include/thrust/system/omp/detail/execution_policy.h +127 -0
  1769. cuda/cccl/headers/include/thrust/system/omp/detail/extrema.h +66 -0
  1770. cuda/cccl/headers/include/thrust/system/omp/detail/fill.h +30 -0
  1771. cuda/cccl/headers/include/thrust/system/omp/detail/find.h +53 -0
  1772. cuda/cccl/headers/include/thrust/system/omp/detail/for_each.h +56 -0
  1773. cuda/cccl/headers/include/thrust/system/omp/detail/for_each.inl +87 -0
  1774. cuda/cccl/headers/include/thrust/system/omp/detail/gather.h +30 -0
  1775. cuda/cccl/headers/include/thrust/system/omp/detail/generate.h +30 -0
  1776. cuda/cccl/headers/include/thrust/system/omp/detail/get_value.h +30 -0
  1777. cuda/cccl/headers/include/thrust/system/omp/detail/inner_product.h +30 -0
  1778. cuda/cccl/headers/include/thrust/system/omp/detail/iter_swap.h +30 -0
  1779. cuda/cccl/headers/include/thrust/system/omp/detail/logical.h +30 -0
  1780. cuda/cccl/headers/include/thrust/system/omp/detail/malloc_and_free.h +30 -0
  1781. cuda/cccl/headers/include/thrust/system/omp/detail/memory.inl +93 -0
  1782. cuda/cccl/headers/include/thrust/system/omp/detail/merge.h +30 -0
  1783. cuda/cccl/headers/include/thrust/system/omp/detail/mismatch.h +30 -0
  1784. cuda/cccl/headers/include/thrust/system/omp/detail/partition.h +88 -0
  1785. cuda/cccl/headers/include/thrust/system/omp/detail/partition.inl +102 -0
  1786. cuda/cccl/headers/include/thrust/system/omp/detail/per_device_resource.h +29 -0
  1787. cuda/cccl/headers/include/thrust/system/omp/detail/pragma_omp.h +54 -0
  1788. cuda/cccl/headers/include/thrust/system/omp/detail/reduce.h +54 -0
  1789. cuda/cccl/headers/include/thrust/system/omp/detail/reduce.inl +78 -0
  1790. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.h +64 -0
  1791. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.inl +65 -0
  1792. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.h +59 -0
  1793. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.inl +103 -0
  1794. cuda/cccl/headers/include/thrust/system/omp/detail/remove.h +72 -0
  1795. cuda/cccl/headers/include/thrust/system/omp/detail/remove.inl +87 -0
  1796. cuda/cccl/headers/include/thrust/system/omp/detail/replace.h +30 -0
  1797. cuda/cccl/headers/include/thrust/system/omp/detail/reverse.h +30 -0
  1798. cuda/cccl/headers/include/thrust/system/omp/detail/scan.h +73 -0
  1799. cuda/cccl/headers/include/thrust/system/omp/detail/scan.inl +172 -0
  1800. cuda/cccl/headers/include/thrust/system/omp/detail/scan_by_key.h +36 -0
  1801. cuda/cccl/headers/include/thrust/system/omp/detail/scatter.h +30 -0
  1802. cuda/cccl/headers/include/thrust/system/omp/detail/sequence.h +30 -0
  1803. cuda/cccl/headers/include/thrust/system/omp/detail/set_operations.h +30 -0
  1804. cuda/cccl/headers/include/thrust/system/omp/detail/sort.h +60 -0
  1805. cuda/cccl/headers/include/thrust/system/omp/detail/sort.inl +265 -0
  1806. cuda/cccl/headers/include/thrust/system/omp/detail/swap_ranges.h +30 -0
  1807. cuda/cccl/headers/include/thrust/system/omp/detail/tabulate.h +30 -0
  1808. cuda/cccl/headers/include/thrust/system/omp/detail/temporary_buffer.h +29 -0
  1809. cuda/cccl/headers/include/thrust/system/omp/detail/transform.h +30 -0
  1810. cuda/cccl/headers/include/thrust/system/omp/detail/transform_reduce.h +30 -0
  1811. cuda/cccl/headers/include/thrust/system/omp/detail/transform_scan.h +30 -0
  1812. cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_copy.h +30 -0
  1813. cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_fill.h +30 -0
  1814. cuda/cccl/headers/include/thrust/system/omp/detail/unique.h +60 -0
  1815. cuda/cccl/headers/include/thrust/system/omp/detail/unique.inl +71 -0
  1816. cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.h +67 -0
  1817. cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.inl +75 -0
  1818. cuda/cccl/headers/include/thrust/system/omp/execution_policy.h +62 -0
  1819. cuda/cccl/headers/include/thrust/system/omp/memory.h +111 -0
  1820. cuda/cccl/headers/include/thrust/system/omp/memory_resource.h +75 -0
  1821. cuda/cccl/headers/include/thrust/system/omp/pointer.h +124 -0
  1822. cuda/cccl/headers/include/thrust/system/omp/vector.h +99 -0
  1823. cuda/cccl/headers/include/thrust/system/system_error.h +185 -0
  1824. cuda/cccl/headers/include/thrust/system/tbb/detail/adjacent_difference.h +54 -0
  1825. cuda/cccl/headers/include/thrust/system/tbb/detail/assign_value.h +30 -0
  1826. cuda/cccl/headers/include/thrust/system/tbb/detail/binary_search.h +30 -0
  1827. cuda/cccl/headers/include/thrust/system/tbb/detail/copy.h +50 -0
  1828. cuda/cccl/headers/include/thrust/system/tbb/detail/copy.inl +73 -0
  1829. cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.h +47 -0
  1830. cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.inl +136 -0
  1831. cuda/cccl/headers/include/thrust/system/tbb/detail/count.h +30 -0
  1832. cuda/cccl/headers/include/thrust/system/tbb/detail/equal.h +30 -0
  1833. cuda/cccl/headers/include/thrust/system/tbb/detail/execution_policy.h +109 -0
  1834. cuda/cccl/headers/include/thrust/system/tbb/detail/extrema.h +66 -0
  1835. cuda/cccl/headers/include/thrust/system/tbb/detail/fill.h +30 -0
  1836. cuda/cccl/headers/include/thrust/system/tbb/detail/find.h +49 -0
  1837. cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.h +51 -0
  1838. cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.inl +91 -0
  1839. cuda/cccl/headers/include/thrust/system/tbb/detail/gather.h +30 -0
  1840. cuda/cccl/headers/include/thrust/system/tbb/detail/generate.h +30 -0
  1841. cuda/cccl/headers/include/thrust/system/tbb/detail/get_value.h +30 -0
  1842. cuda/cccl/headers/include/thrust/system/tbb/detail/inner_product.h +30 -0
  1843. cuda/cccl/headers/include/thrust/system/tbb/detail/iter_swap.h +30 -0
  1844. cuda/cccl/headers/include/thrust/system/tbb/detail/logical.h +30 -0
  1845. cuda/cccl/headers/include/thrust/system/tbb/detail/malloc_and_free.h +30 -0
  1846. cuda/cccl/headers/include/thrust/system/tbb/detail/memory.inl +94 -0
  1847. cuda/cccl/headers/include/thrust/system/tbb/detail/merge.h +77 -0
  1848. cuda/cccl/headers/include/thrust/system/tbb/detail/merge.inl +327 -0
  1849. cuda/cccl/headers/include/thrust/system/tbb/detail/mismatch.h +30 -0
  1850. cuda/cccl/headers/include/thrust/system/tbb/detail/partition.h +84 -0
  1851. cuda/cccl/headers/include/thrust/system/tbb/detail/partition.inl +98 -0
  1852. cuda/cccl/headers/include/thrust/system/tbb/detail/per_device_resource.h +29 -0
  1853. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.h +54 -0
  1854. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.inl +137 -0
  1855. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.h +61 -0
  1856. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.inl +400 -0
  1857. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_intervals.h +140 -0
  1858. cuda/cccl/headers/include/thrust/system/tbb/detail/remove.h +76 -0
  1859. cuda/cccl/headers/include/thrust/system/tbb/detail/remove.inl +87 -0
  1860. cuda/cccl/headers/include/thrust/system/tbb/detail/replace.h +30 -0
  1861. cuda/cccl/headers/include/thrust/system/tbb/detail/reverse.h +30 -0
  1862. cuda/cccl/headers/include/thrust/system/tbb/detail/scan.h +59 -0
  1863. cuda/cccl/headers/include/thrust/system/tbb/detail/scan.inl +312 -0
  1864. cuda/cccl/headers/include/thrust/system/tbb/detail/scan_by_key.h +33 -0
  1865. cuda/cccl/headers/include/thrust/system/tbb/detail/scatter.h +30 -0
  1866. cuda/cccl/headers/include/thrust/system/tbb/detail/sequence.h +30 -0
  1867. cuda/cccl/headers/include/thrust/system/tbb/detail/set_operations.h +30 -0
  1868. cuda/cccl/headers/include/thrust/system/tbb/detail/sort.h +60 -0
  1869. cuda/cccl/headers/include/thrust/system/tbb/detail/sort.inl +295 -0
  1870. cuda/cccl/headers/include/thrust/system/tbb/detail/swap_ranges.h +30 -0
  1871. cuda/cccl/headers/include/thrust/system/tbb/detail/tabulate.h +30 -0
  1872. cuda/cccl/headers/include/thrust/system/tbb/detail/temporary_buffer.h +29 -0
  1873. cuda/cccl/headers/include/thrust/system/tbb/detail/transform.h +30 -0
  1874. cuda/cccl/headers/include/thrust/system/tbb/detail/transform_reduce.h +30 -0
  1875. cuda/cccl/headers/include/thrust/system/tbb/detail/transform_scan.h +30 -0
  1876. cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_copy.h +30 -0
  1877. cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_fill.h +30 -0
  1878. cuda/cccl/headers/include/thrust/system/tbb/detail/unique.h +60 -0
  1879. cuda/cccl/headers/include/thrust/system/tbb/detail/unique.inl +71 -0
  1880. cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.h +67 -0
  1881. cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.inl +75 -0
  1882. cuda/cccl/headers/include/thrust/system/tbb/execution_policy.h +62 -0
  1883. cuda/cccl/headers/include/thrust/system/tbb/memory.h +111 -0
  1884. cuda/cccl/headers/include/thrust/system/tbb/memory_resource.h +75 -0
  1885. cuda/cccl/headers/include/thrust/system/tbb/pointer.h +124 -0
  1886. cuda/cccl/headers/include/thrust/system/tbb/vector.h +99 -0
  1887. cuda/cccl/headers/include/thrust/system_error.h +57 -0
  1888. cuda/cccl/headers/include/thrust/tabulate.h +125 -0
  1889. cuda/cccl/headers/include/thrust/transform.h +1045 -0
  1890. cuda/cccl/headers/include/thrust/transform_reduce.h +190 -0
  1891. cuda/cccl/headers/include/thrust/transform_scan.h +442 -0
  1892. cuda/cccl/headers/include/thrust/tuple.h +139 -0
  1893. cuda/cccl/headers/include/thrust/type_traits/integer_sequence.h +261 -0
  1894. cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +154 -0
  1895. cuda/cccl/headers/include/thrust/type_traits/is_execution_policy.h +65 -0
  1896. cuda/cccl/headers/include/thrust/type_traits/is_operator_less_or_greater_function_object.h +184 -0
  1897. cuda/cccl/headers/include/thrust/type_traits/is_operator_plus_function_object.h +116 -0
  1898. cuda/cccl/headers/include/thrust/type_traits/is_trivially_relocatable.h +336 -0
  1899. cuda/cccl/headers/include/thrust/type_traits/logical_metafunctions.h +42 -0
  1900. cuda/cccl/headers/include/thrust/type_traits/unwrap_contiguous_iterator.h +63 -0
  1901. cuda/cccl/headers/include/thrust/uninitialized_copy.h +300 -0
  1902. cuda/cccl/headers/include/thrust/uninitialized_fill.h +268 -0
  1903. cuda/cccl/headers/include/thrust/unique.h +1088 -0
  1904. cuda/cccl/headers/include/thrust/universal_allocator.h +93 -0
  1905. cuda/cccl/headers/include/thrust/universal_ptr.h +34 -0
  1906. cuda/cccl/headers/include/thrust/universal_vector.h +71 -0
  1907. cuda/cccl/headers/include/thrust/version.h +93 -0
  1908. cuda/cccl/headers/include/thrust/zip_function.h +176 -0
  1909. cuda/cccl/headers/include_paths.py +51 -0
  1910. cuda/cccl/parallel/__init__.py +9 -0
  1911. cuda/cccl/parallel/experimental/__init__.py +24 -0
  1912. cuda/cccl/py.typed +0 -0
  1913. cuda/compute/__init__.py +79 -0
  1914. cuda/compute/_bindings.py +79 -0
  1915. cuda/compute/_bindings.pyi +475 -0
  1916. cuda/compute/_bindings_impl.pyx +2273 -0
  1917. cuda/compute/_caching.py +71 -0
  1918. cuda/compute/_cccl_interop.py +422 -0
  1919. cuda/compute/_utils/__init__.py +0 -0
  1920. cuda/compute/_utils/protocols.py +132 -0
  1921. cuda/compute/_utils/temp_storage_buffer.py +86 -0
  1922. cuda/compute/algorithms/__init__.py +54 -0
  1923. cuda/compute/algorithms/_histogram.py +243 -0
  1924. cuda/compute/algorithms/_merge_sort.py +225 -0
  1925. cuda/compute/algorithms/_radix_sort.py +312 -0
  1926. cuda/compute/algorithms/_reduce.py +182 -0
  1927. cuda/compute/algorithms/_scan.py +331 -0
  1928. cuda/compute/algorithms/_segmented_reduce.py +257 -0
  1929. cuda/compute/algorithms/_three_way_partition.py +261 -0
  1930. cuda/compute/algorithms/_transform.py +329 -0
  1931. cuda/compute/algorithms/_unique_by_key.py +252 -0
  1932. cuda/compute/cccl/.gitkeep +0 -0
  1933. cuda/compute/cu12/_bindings_impl.cp313-win_amd64.pyd +0 -0
  1934. cuda/compute/cu12/cccl/cccl.c.parallel.dll +0 -0
  1935. cuda/compute/cu12/cccl/cccl.c.parallel.lib +0 -0
  1936. cuda/compute/cu13/_bindings_impl.cp313-win_amd64.pyd +0 -0
  1937. cuda/compute/cu13/cccl/cccl.c.parallel.dll +0 -0
  1938. cuda/compute/cu13/cccl/cccl.c.parallel.lib +0 -0
  1939. cuda/compute/iterators/__init__.py +21 -0
  1940. cuda/compute/iterators/_factories.py +219 -0
  1941. cuda/compute/iterators/_iterators.py +817 -0
  1942. cuda/compute/iterators/_zip_iterator.py +199 -0
  1943. cuda/compute/numba_utils.py +53 -0
  1944. cuda/compute/op.py +3 -0
  1945. cuda/compute/struct.py +272 -0
  1946. cuda/compute/typing.py +37 -0
  1947. cuda/coop/__init__.py +8 -0
  1948. cuda/coop/_caching.py +48 -0
  1949. cuda/coop/_common.py +275 -0
  1950. cuda/coop/_nvrtc.py +92 -0
  1951. cuda/coop/_scan_op.py +181 -0
  1952. cuda/coop/_types.py +937 -0
  1953. cuda/coop/_typing.py +107 -0
  1954. cuda/coop/block/__init__.py +39 -0
  1955. cuda/coop/block/_block_exchange.py +251 -0
  1956. cuda/coop/block/_block_load_store.py +215 -0
  1957. cuda/coop/block/_block_merge_sort.py +125 -0
  1958. cuda/coop/block/_block_radix_sort.py +214 -0
  1959. cuda/coop/block/_block_reduce.py +294 -0
  1960. cuda/coop/block/_block_scan.py +983 -0
  1961. cuda/coop/warp/__init__.py +9 -0
  1962. cuda/coop/warp/_warp_merge_sort.py +92 -0
  1963. cuda/coop/warp/_warp_reduce.py +153 -0
  1964. cuda/coop/warp/_warp_scan.py +78 -0
  1965. cuda_cccl-0.3.3.dist-info/METADATA +41 -0
  1966. cuda_cccl-0.3.3.dist-info/RECORD +1968 -0
  1967. cuda_cccl-0.3.3.dist-info/WHEEL +5 -0
  1968. cuda_cccl-0.3.3.dist-info/licenses/LICENSE +1 -0
@@ -0,0 +1,2212 @@
1
+ /******************************************************************************
2
+ * Copyright (c) 2011, Duane Merrill. All rights reserved.
3
+ * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved.
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ * * Redistributions of source code must retain the above copyright
8
+ * notice, this list of conditions and the following disclaimer.
9
+ * * Redistributions in binary form must reproduce the above copyright
10
+ * notice, this list of conditions and the following disclaimer in the
11
+ * documentation and/or other materials provided with the distribution.
12
+ * * Neither the name of the NVIDIA CORPORATION nor the
13
+ * names of its contributors may be used to endorse or promote products
14
+ * derived from this software without specific prior written permission.
15
+ *
16
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
+ *
27
+ ******************************************************************************/
28
+
29
+ //! @file
30
+ //! cub::DeviceScan provides device-wide, parallel operations for computing a prefix scan across a sequence of data
31
+ //! items residing within device-accessible memory.
32
+
33
+ #pragma once
34
+
35
+ #include <cub/config.cuh>
36
+
37
+ #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
38
+ # pragma GCC system_header
39
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
40
+ # pragma clang system_header
41
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
42
+ # pragma system_header
43
+ #endif // no system header
44
+
45
+ #include <cub/detail/choose_offset.cuh>
46
+ #include <cub/detail/device_memory_resource.cuh>
47
+ #include <cub/detail/temporary_storage.cuh>
48
+ #include <cub/device/dispatch/dispatch_scan.cuh>
49
+ #include <cub/device/dispatch/dispatch_scan_by_key.cuh>
50
+ #include <cub/thread/thread_operators.cuh>
51
+
52
+ #include <cuda/__execution/determinism.h>
53
+ #include <cuda/__execution/require.h>
54
+ #include <cuda/__execution/tune.h>
55
+ #include <cuda/__memory_resource/get_memory_resource.h>
56
+ #include <cuda/__stream/get_stream.h>
57
+ #include <cuda/std/__execution/env.h>
58
+ #include <cuda/std/__functional/invoke.h>
59
+
60
+ CUB_NAMESPACE_BEGIN
61
+
62
+ namespace detail::scan
63
+ {
64
+ struct get_tuning_query_t // Empty tag type; used as the query argument when asking an environment for its scan tuning.
65
+ {};
66
+
67
+ template <class Derived>
68
+ struct tuning // CRTP base that makes a tuning type answer `get_tuning_query_t` with itself.
69
+ {
70
+ [[nodiscard]] _CCCL_NODEBUG_API constexpr Derived query(const get_tuning_query_t&) const noexcept
71
+ {
72
+ return static_cast<const Derived&>(*this); // returned by value, so the caller receives a copy of the derived object
73
+ }
74
+ };
75
+
76
+ struct default_tuning : tuning<default_tuning> // Tuning used as the fallback type in `scan_impl_determinism`.
77
+ {
78
+ template <typename InputValueT, typename OutputValueT, typename AccumT, typename OffsetT, typename ScanOpT>
79
+ using fn = policy_hub<InputValueT, OutputValueT, AccumT, OffsetT, ScanOpT>; // forwards all five type arguments unchanged to `policy_hub`
80
+ };
81
+
82
+ } // namespace detail::scan
83
+
84
+ //! @rst
85
+ //! DeviceScan provides device-wide, parallel operations for computing a
86
+ //! prefix scan across a sequence of data items residing within
87
+ //! device-accessible memory.
88
+ //!
89
+ //! Overview
90
+ //! +++++++++++++++++++++++++++++++++++++++++++++
91
+ //!
92
+ //! Given a sequence of input elements and a binary reduction operator, a
93
+ //! `prefix scan <http://en.wikipedia.org/wiki/Prefix_sum>`_ produces an output
94
+ //! sequence where each element is computed to be the reduction of the elements
95
+ //! occurring earlier in the input sequence. *Prefix sum* connotes a prefix scan
96
+ //! with the addition operator. The term *inclusive* indicates that the
97
+ //! *i*\ :sup:`th` output reduction incorporates the *i*\ :sup:`th` input.
98
+ //! The term *exclusive* indicates the *i*\ :sup:`th` input is not
99
+ //! incorporated into the *i*\ :sup:`th` output reduction. When the input and
100
+ //! output sequences are the same, the scan is performed in-place.
101
+ //!
102
+ //! In order to provide an efficient parallel implementation, the binary reduction operator must be associative. That
103
+ //! is, ``op(op(a, b), c)`` must be equivalent to ``op(a, op(b, c))`` for any input values ``a``, ``b``, and ``c``.
104
+ //!
105
+ //! As of CUB 1.0.1 (2013), CUB's device-wide scan APIs have implemented our
106
+ //! *"decoupled look-back"* algorithm for performing global prefix scan with
107
+ //! only a single pass through the input data, as described in our 2016 technical
108
+ //! report [1]_. The central idea is to leverage a small, constant factor of
109
+ //! redundant work in order to overlap the latencies of global prefix
110
+ //! propagation with local computation. As such, our algorithm requires only
111
+ //! ``~2n`` data movement (``n`` inputs are read, ``n`` outputs are written), and
112
+ //! typically proceeds at "memcpy" speeds. Our algorithm supports inplace operations.
113
+ //!
114
+ //! .. [1] Duane Merrill and Michael Garland. `Single-pass Parallel Prefix Scan with Decoupled Look-back
115
+ //! <https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back>`_,
116
+ //! *NVIDIA Technical Report NVR-2016-002*, 2016.
117
+ //!
118
+ //! Usage Considerations
119
+ //! +++++++++++++++++++++++++++++++++++++++++++++
120
+ //!
121
+ //! @cdp_class{DeviceScan}
122
+ //!
123
+ //! Performance
124
+ //! +++++++++++++++++++++++++++++++++++++++++++++
125
+ //!
126
+ //! @linear_performance{prefix scan}
127
+ //!
128
+ //! @endrst
129
+ struct DeviceScan
130
+ {
131
+ //! @cond
132
+ template <typename TuningEnvT,
133
+ typename InputIteratorT,
134
+ typename OutputIteratorT,
135
+ typename ScanOpT,
136
+ typename InitValueT,
137
+ typename NumItemsT,
138
+ ::cuda::execution::determinism::__determinism_t Determinism,
139
+ ForceInclusive EnforceInclusive = ForceInclusive::No>
140
+ CUB_RUNTIME_FUNCTION static cudaError_t scan_impl_determinism( // Resolves tuning/offset/accumulator/policy types, then forwards every argument to DispatchScan::Dispatch and returns its cudaError_t.
141
+ void* d_temp_storage,
142
+ size_t& temp_storage_bytes,
143
+ InputIteratorT d_in,
144
+ OutputIteratorT d_out,
145
+ ScanOpT scan_op,
146
+ InitValueT init,
147
+ NumItemsT num_items,
148
+ ::cuda::execution::determinism::__determinism_holder_t<Determinism>, // unnamed tag parameter: neither it nor `Determinism` is referenced in the body
149
+ cudaStream_t stream)
150
+ {
151
+ using scan_tuning_t = ::cuda::std::execution::
152
+ __query_result_or_t<TuningEnvT, detail::scan::get_tuning_query_t, detail::scan::default_tuning>; // presumably yields default_tuning when TuningEnvT does not answer the query — confirm against __query_result_or_t
153
+
154
+ // Unsigned integer type for global offsets
155
+ using offset_t = detail::choose_offset_t<NumItemsT>;
156
+
157
+ using accum_t = // accumulator type computed from the scan operator, the input value type, and the initial-value type
158
+ ::cuda::std::__accumulator_t<ScanOpT,
159
+ cub::detail::it_value_t<InputIteratorT>,
160
+ ::cuda::std::_If<::cuda::std::is_same_v<InitValueT, NullType>, // when InitValueT is NullType the input value type stands in for the init type
161
+ cub::detail::it_value_t<InputIteratorT>,
162
+ typename InitValueT::value_type>>;
163
+
164
+ using policy_t = typename scan_tuning_t::
165
+ template fn<detail::it_value_t<InputIteratorT>, detail::it_value_t<OutputIteratorT>, accum_t, offset_t, ScanOpT>;
166
+
167
+ using dispatch_t =
168
+ DispatchScan<InputIteratorT, OutputIteratorT, ScanOpT, InitValueT, offset_t, accum_t, EnforceInclusive, policy_t>;
169
+
170
+ return dispatch_t::Dispatch(
171
+ d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, init, static_cast<offset_t>(num_items), stream); // num_items is converted to offset_t before dispatch
172
+ }
173
+ //! @endcond
174
+
175
+ //! @cond
176
+ template <typename InputIteratorT,
177
+ typename OutputIteratorT,
178
+ typename ScanOpT,
179
+ typename InitValueT,
180
+ typename NumItemsT,
181
+ ForceInclusive EnforceInclusive = ForceInclusive::No, // NOTE(review): never referenced in the body; both scan_impl_determinism<tuning_t> calls below use that function's own default (ForceInclusive::No) — confirm this is intended
182
+ typename EnvT>
183
+ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t scan_impl_env( // Environment-driven scan: sizes the temporary storage, allocates it, runs the scan, then deallocates; returns the first error encountered.
184
+ InputIteratorT d_in, OutputIteratorT d_out, ScanOpT scan_op, InitValueT init, NumItemsT num_items, EnvT env)
185
+ {
186
+ static_assert(!_CUDA_STD_EXEC::__queryable_with<EnvT, _CUDA_EXEC::determinism::__get_determinism_t>,
187
+ "Determinism should be used inside requires to have an effect.");
188
+
189
+ using requirements_t =
190
+ _CUDA_STD_EXEC::__query_result_or_t<EnvT, _CUDA_EXEC::__get_requirements_t, _CUDA_STD_EXEC::env<>>;
191
+
192
+ using requested_determinism_t = // only inspected by the two static_asserts below; the dispatch itself uses determinism_t
193
+ _CUDA_STD_EXEC::__query_result_or_t<requirements_t, //
194
+ _CUDA_EXEC::determinism::__get_determinism_t,
195
+ _CUDA_EXEC::determinism::run_to_run_t>;
196
+
197
+ // Static assert to reject gpu_to_gpu determinism since it's not implemented
198
+ static_assert(!::cuda::std::is_same_v<requested_determinism_t, ::cuda::execution::determinism::gpu_to_gpu_t>,
199
+ "gpu_to_gpu determinism is not supported");
200
+
201
+ static_assert(!::cuda::std::is_same_v<requested_determinism_t, ::cuda::execution::determinism::not_guaranteed_t>,
202
+ "not_guaranteed determinism is not supported");
203
+
204
+ using determinism_t = ::cuda::execution::determinism::run_to_run_t; // fixed to run_to_run regardless of the requested type
205
+
206
+ // Query relevant properties from the environment
207
+ auto stream = _CUDA_STD_EXEC::__query_or(env, ::cuda::get_stream, ::cuda::stream_ref{cudaStream_t{}});
208
+ auto mr = _CUDA_STD_EXEC::__query_or(env, ::cuda::mr::__get_memory_resource, detail::device_memory_resource{});
209
+
210
+ void* d_temp_storage = nullptr;
211
+ size_t temp_storage_bytes = 0;
212
+
213
+ using tuning_t = _CUDA_STD_EXEC::__query_result_or_t<EnvT, _CUDA_EXEC::__get_tuning_t, _CUDA_STD_EXEC::env<>>;
214
+
215
+ // Query the required temporary storage size
216
+ cudaError_t error = scan_impl_determinism<tuning_t>(
217
+ d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, init, num_items, determinism_t{}, stream.get()); // d_temp_storage is still nullptr on this first call
218
+
219
+ if (error != cudaSuccess)
220
+ {
221
+ return error;
222
+ }
223
+
224
+ // TODO(gevtushenko): use uninitialized buffer when it's available
225
+ error = CubDebug(detail::temporary_storage::allocate(stream, d_temp_storage, temp_storage_bytes, mr));
226
+ if (error != cudaSuccess)
227
+ {
228
+ return error;
229
+ }
230
+
231
+ // Run the algorithm
232
+ error = scan_impl_determinism<tuning_t>(
233
+ d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, init, num_items, determinism_t{}, stream.get());
234
+
235
+ // Try to deallocate regardless of the error to avoid memory leaks
236
+ cudaError_t deallocate_error =
237
+ CubDebug(detail::temporary_storage::deallocate(stream, d_temp_storage, temp_storage_bytes, mr));
238
+
239
+ if (error != cudaSuccess)
240
+ {
241
+ // Scan error takes precedence over deallocation error since it happens first
242
+ return error;
243
+ }
244
+
245
+ return deallocate_error;
246
+ }
247
+ //! @endcond
248
+
249
+ //! @name Exclusive scans
250
+ //! @{
251
+
252
+ //! @rst
253
+ //! Computes a device-wide exclusive prefix sum.
254
+ //! The value of ``0`` is applied as the initial value, and is assigned to ``*d_out``.
255
+ //!
256
+ //! - Supports non-commutative sum operators.
257
+ //! - Results are not deterministic for pseudo-associative operators (e.g.,
258
+ //! addition of floating-point types). Results for pseudo-associative
259
+ //! operators may vary from run to run. Additional details can be found in
260
+ //! the @lookback description.
261
+ //! - When ``d_in`` and ``d_out`` are equal, the scan is performed in-place.
262
+ //! The range ``[d_in, d_in + num_items)`` and ``[d_out, d_out + num_items)``
263
+ //! shall not overlap in any other way.
264
+ //! - @devicestorage
265
+ //!
266
+ //! Snippet
267
+ //! +++++++++++++++++++++++++++++++++++++++++++++
268
+ //!
269
+ //! The code snippet below illustrates the exclusive prefix sum of an ``int``
270
+ //! device vector.
271
+ //!
272
+ //! .. code-block:: c++
273
+ //!
274
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
275
+ //!
276
+ //! // Declare, allocate, and initialize device-accessible pointers for
277
+ //! // input and output
278
+ //! int num_items; // e.g., 7
279
+ //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
280
+ //! int *d_out; // e.g., [ , , , , , , ]
281
+ //! ...
282
+ //!
283
+ //! // Determine temporary device storage requirements
284
+ //! void *d_temp_storage = nullptr;
285
+ //! size_t temp_storage_bytes = 0;
286
+ //! cub::DeviceScan::ExclusiveSum(
287
+ //! d_temp_storage, temp_storage_bytes,
288
+ //! d_in, d_out, num_items);
289
+ //!
290
+ //! // Allocate temporary storage
291
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
292
+ //!
293
+ //! // Run exclusive prefix sum
294
+ //! cub::DeviceScan::ExclusiveSum(
295
+ //! d_temp_storage, temp_storage_bytes,
296
+ //! d_in, d_out, num_items);
297
+ //!
298
+ //! // d_out <-- [0, 8, 14, 21, 26, 29, 29]
299
+ //!
300
+ //! @endrst
301
+ //!
302
+ //! @tparam InputIteratorT
303
+ //! **[inferred]** Random-access input iterator type for reading scan inputs @iterator
304
+ //!
305
+ //! @tparam OutputIteratorT
306
+ //! **[inferred]** Random-access output iterator type for writing scan outputs @iterator
307
+ //!
308
+ //! @tparam NumItemsT
309
+ //! **[inferred]** An integral type representing the number of input elements
310
+ //!
311
+ //! @param[in] d_temp_storage
312
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
313
+ //! required allocation size is written to `temp_storage_bytes` and no work is done.
314
+ //!
315
+ //! @param[in,out] temp_storage_bytes
316
+ //! Reference to size in bytes of `d_temp_storage` allocation
317
+ //!
318
+ //! @param[in] d_in
319
+ //! Random-access iterator to the input sequence of data items
320
+ //!
321
+ //! @param[out] d_out
322
+ //! Random-access iterator to the output sequence of data items
323
+ //!
324
+ //! @param[in] num_items
325
+ //! Total number of input items (i.e., the length of `d_in`)
326
+ //!
327
+ //! @param[in] stream
328
+ //! @rst
329
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
330
+ //! @endrst
331
+ template <typename InputIteratorT, typename OutputIteratorT, typename NumItemsT>
332
+ CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveSum(
333
+ void* d_temp_storage, // nullptr selects the size-query mode described in the documentation above
334
+ size_t& temp_storage_bytes,
335
+ InputIteratorT d_in,
336
+ OutputIteratorT d_out,
337
+ NumItemsT num_items,
338
+ cudaStream_t stream = 0)
339
+ {
340
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::ExclusiveSum"); // NOTE(review): range appears gated on d_temp_storage so the size-query call is not traced - confirm against the macro
341
+
342
+ // Offset type used by the dispatch layer, derived from NumItemsT via detail::choose_offset_t
343
+ using OffsetT = detail::choose_offset_t<NumItemsT>;
344
+ using InitT = cub::detail::it_value_t<InputIteratorT>; // seed type is taken from the input iterator, not from the output iterator
345
+
346
+ // Value-initialized seed: this is the ``0`` initial value promised by the documentation
347
+ InitT init_value{};
348
+
349
+ return DispatchScan<InputIteratorT, OutputIteratorT, ::cuda::std::plus<>, detail::InputValue<InitT>, OffsetT>::
350
+ Dispatch(d_temp_storage,
351
+ temp_storage_bytes,
352
+ d_in,
353
+ d_out,
354
+ ::cuda::std::plus<>{}, // the sum is expressed as a scan with a plus operator
355
+ detail::InputValue<InitT>(init_value),
356
+ num_items,
357
+ stream);
358
+ }
359
+
360
+ //! @rst
361
+ //! Computes a device-wide exclusive prefix sum.
362
+ //! The value of ``0`` is applied as the initial value, and is assigned to ``*d_out``.
363
+ //!
364
+ //! - Supports non-commutative sum operators.
365
+ //! - Results are not deterministic for pseudo-associative operators (e.g.,
366
+ //! addition of floating-point types). Results for pseudo-associative
367
+ //! operators may vary from run to run. Additional details can be found in
368
+ //! the @lookback description.
369
+ //! - When ``d_in`` and ``d_out`` are equal, the scan is performed in-place.
370
+ //! The range ``[d_in, d_in + num_items)`` and ``[d_out, d_out + num_items)``
371
+ //! shall not overlap in any other way.
372
+ //! - @devicestorage
373
+ //!
374
+ //! Preconditions
375
+ //! +++++++++++++
376
+ //!
377
+ //! - When ``d_in`` and ``d_out`` are equal, the scan is performed in-place.
378
+ //! The range ``[d_in, d_in + num_items)`` and ``[d_out, d_out + num_items)``
379
+ //! shall not overlap in any other way.
380
+ //! - ``d_in`` and ``d_out`` must not be null pointers
381
+ //!
382
+ //! Snippet
383
+ //! +++++++++++++++++++++++++++++++++++++++++++++
384
+ //!
385
+ //! The code snippet below illustrates a user-defined exclusive-scan of a
386
+ //! device vector of ``float`` data elements.
387
+ //!
388
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_scan_env_api.cu
389
+ //! :language: c++
390
+ //! :dedent:
391
+ //! :start-after: example-begin exclusive-sum-env-determinism
392
+ //! :end-before: example-end exclusive-sum-env-determinism
393
+ //!
394
+ //! @endrst
395
+ //!
396
+ //! @tparam InputIteratorT
397
+ //! **[inferred]** Random-access input iterator type for reading scan inputs @iterator
398
+ //!
399
+ //! @tparam OutputIteratorT
400
+ //! **[inferred]** Random-access output iterator type for writing scan outputs @iterator
401
+ //!
402
+ //! @tparam NumItemsT
403
+ //! **[inferred]** An integral type representing the number of input elements
404
+ //!
405
+ //! @tparam EnvT
406
+ //! **[inferred]** Execution environment type. Default is `_CUDA_STD_EXEC::env<>`.
407
+ //!
408
+ //! @param[in] d_in
409
+ //! Random-access iterator to the input sequence of data items
410
+ //!
411
+ //! @param[out] d_out
412
+ //! Random-access iterator to the output sequence of data items
413
+ //!
414
+ //! @param[in] num_items
415
+ //! Total number of input items (i.e., the length of `d_in`)
416
+ //!
417
+ //! @param[in] env
418
+ //! @rst
419
+ //! **[optional]** Execution environment. Default is `_CUDA_STD_EXEC::env{}`.
420
+ //! @endrst
421
+ template <typename InputIteratorT, typename OutputIteratorT, typename NumItemsT, typename EnvT = _CUDA_STD_EXEC::env<>>
422
+ [[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t
423
+ ExclusiveSum(InputIteratorT d_in, OutputIteratorT d_out, NumItemsT num_items, EnvT env = {})
424
+ {
425
+ _CCCL_NVTX_RANGE_SCOPE("cub::DeviceScan::ExclusiveSum");
426
+
427
+ using InitT = cub::detail::it_value_t<InputIteratorT>; // seed type is taken from the input iterator
428
+
429
+ // Value-initialized seed for the sum
430
+ InitT init_value{};
431
+
432
+ return scan_impl_env(d_in, d_out, ::cuda::std::plus<>{}, detail::InputValue<InitT>(init_value), num_items, env); // no caller-provided temporary storage in this overload; everything else is forwarded together with env
433
+ }
434
+
435
+ //! @rst
436
+ //! Computes a device-wide exclusive prefix sum in-place.
437
+ //! The value of ``0`` is applied as the initial value, and is assigned to ``*d_data``.
438
+ //!
439
+ //! - Supports non-commutative sum operators.
440
+ //! - Results are not deterministic for pseudo-associative operators (e.g.,
441
+ //! addition of floating-point types). Results for pseudo-associative
442
+ //! operators may vary from run to run. Additional details can be found in
443
+ //! the @lookback description.
444
+ //! - @devicestorage
445
+ //!
446
+ //! Snippet
447
+ //! +++++++++++++++++++++++++++++++++++++++++++++
448
+ //!
449
+ //! The code snippet below illustrates the exclusive prefix sum of an ``int``
450
+ //! device vector.
451
+ //!
452
+ //! .. code-block:: c++
453
+ //!
454
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
455
+ //!
456
+ //! // Declare, allocate, and initialize device-accessible pointers for
457
+ //! // input and output
458
+ //! int num_items; // e.g., 7
459
+ //! int *d_data; // e.g., [8, 6, 7, 5, 3, 0, 9]
460
+ //! ...
461
+ //!
462
+ //! // Determine temporary device storage requirements
463
+ //! void *d_temp_storage = nullptr;
464
+ //! size_t temp_storage_bytes = 0;
465
+ //! cub::DeviceScan::ExclusiveSum(
466
+ //! d_temp_storage, temp_storage_bytes,
467
+ //! d_data, num_items);
468
+ //!
469
+ //! // Allocate temporary storage
470
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
471
+ //!
472
+ //! // Run exclusive prefix sum
473
+ //! cub::DeviceScan::ExclusiveSum(
474
+ //! d_temp_storage, temp_storage_bytes,
475
+ //! d_data, num_items);
476
+ //!
477
+ //! // d_data <-- [0, 8, 14, 21, 26, 29, 29]
478
+ //!
479
+ //! @endrst
480
+ //!
481
+ //! @tparam IteratorT
482
+ //! **[inferred]** Random-access iterator type for reading scan inputs and writing scan outputs
483
+ //!
484
+ //! @tparam NumItemsT
485
+ //! **[inferred]** An integral type representing the number of input elements
486
+ //!
487
+ //! @param[in] d_temp_storage
488
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
489
+ //! required allocation size is written to `temp_storage_bytes` and no work is done.
490
+ //!
491
+ //! @param[in,out] temp_storage_bytes
492
+ //! Reference to size in bytes of `d_temp_storage` allocation
493
+ //!
494
+ //! @param[in,out] d_data
495
+ //! Random-access iterator to the sequence of data items
496
+ //!
497
+ //! @param[in] num_items
498
+ //! Total number of input items (i.e., the length of `d_data`)
499
+ //!
500
+ //! @param[in] stream
501
+ //! @rst
502
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
503
+ //! @endrst
504
+ template <typename IteratorT, typename NumItemsT>
505
+ CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveSum(
506
+ void* d_temp_storage, size_t& temp_storage_bytes, IteratorT d_data, NumItemsT num_items, cudaStream_t stream = 0)
507
+ {
508
+ return ExclusiveSum(d_temp_storage, temp_storage_bytes, d_data, d_data, num_items, stream); // in-place: d_data is forwarded as both the input and the output sequence
509
+ }
510
+
511
+ //! @rst
512
+ //! Computes a device-wide exclusive prefix scan using the specified
513
+ //! binary associative ``scan_op`` functor. The ``init_value`` value is applied as
514
+ //! the initial value, and is assigned to ``*d_out``.
515
+ //!
516
+ //! - Supports non-commutative scan operators.
517
+ //! - Results are not deterministic for pseudo-associative operators (e.g.,
518
+ //! addition of floating-point types). Results for pseudo-associative
519
+ //! operators may vary from run to run. Additional details can be found in
520
+ //! the @lookback description.
521
+ //! - When ``d_in`` and ``d_out`` are equal, the scan is performed in-place. The
522
+ //! range ``[d_in, d_in + num_items)`` and ``[d_out, d_out + num_items)``
523
+ //! shall not overlap in any other way.
524
+ //! - @devicestorage
525
+ //!
526
+ //! Snippet
527
+ //! +++++++++++++++++++++++++++++++++++++++++++++
528
+ //!
529
+ //! The code snippet below illustrates the exclusive prefix min-scan of an ``int`` device vector
530
+ //!
531
+ //! .. code-block:: c++
532
+ //!
533
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
534
+ //! #include <cuda/std/climits> // for INT_MAX
535
+ //!
536
+ //! // CustomMin functor
537
+ //! struct CustomMin
538
+ //! {
539
+ //! template <typename T>
540
+ //! __host__ __device__ __forceinline__
541
+ //! T operator()(const T &a, const T &b) const {
542
+ //! return (b < a) ? b : a;
543
+ //! }
544
+ //! };
545
+ //!
546
+ //! // Declare, allocate, and initialize device-accessible pointers for
547
+ //! // input and output
548
+ //! int num_items; // e.g., 7
549
+ //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
550
+ //! int *d_out; // e.g., [ , , , , , , ]
551
+ //! CustomMin min_op;
552
+ //! ...
553
+ //!
554
+ //! // Determine temporary device storage requirements for exclusive
555
+ //! // prefix scan
556
+ //! void *d_temp_storage = nullptr;
557
+ //! size_t temp_storage_bytes = 0;
558
+ //! cub::DeviceScan::ExclusiveScan(
559
+ //! d_temp_storage, temp_storage_bytes,
560
+ //! d_in, d_out, min_op, (int) INT_MAX, num_items);
561
+ //!
562
+ //! // Allocate temporary storage for exclusive prefix scan
563
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
564
+ //!
565
+ //! // Run exclusive prefix min-scan
566
+ //! cub::DeviceScan::ExclusiveScan(
567
+ //! d_temp_storage, temp_storage_bytes,
568
+ //! d_in, d_out, min_op, (int) INT_MAX, num_items);
569
+ //!
570
+ //! // d_out <-- [2147483647, 8, 6, 6, 5, 3, 0]
571
+ //!
572
+ //! @endrst
573
+ //!
574
+ //! @tparam InputIteratorT
575
+ //! **[inferred]** Random-access input iterator type for reading scan inputs @iterator
576
+ //!
577
+ //! @tparam OutputIteratorT
578
+ //! **[inferred]** Random-access output iterator type for writing scan outputs @iterator
579
+ //!
580
+ //! @tparam ScanOpT
581
+ //! **[inferred]** Binary associative scan functor type having member `T operator()(const T &a, const T &b)`
582
+ //!
583
+ //! @tparam InitValueT
584
+ //! **[inferred]** Type of the `init_value`
585
+ //!
586
+ //! @tparam NumItemsT
587
+ //! **[inferred]** An integral type representing the number of input elements
588
+ //!
589
+ //! @param[in] d_temp_storage
590
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
591
+ //! required allocation size is written to `temp_storage_bytes` and no work is done.
592
+ //!
593
+ //! @param[in,out] temp_storage_bytes
594
+ //! Reference to size in bytes of `d_temp_storage` allocation
595
+ //!
596
+ //! @param[in] d_in
597
+ //! Random-access iterator to the input sequence of data items
598
+ //!
599
+ //! @param[out] d_out
600
+ //! Random-access iterator to the output sequence of data items
601
+ //!
602
+ //! @param[in] scan_op
603
+ //! Binary associative scan functor
604
+ //!
605
+ //! @param[in] init_value
606
+ //! Initial value to seed the exclusive scan (and is assigned to `*d_out`)
607
+ //!
608
+ //! @param[in] num_items
609
+ //! Total number of input items (i.e., the length of `d_in`)
610
+ //!
611
+ //! @param[in] stream
612
+ //! @rst
613
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
614
+ //! @endrst
615
+ template <typename InputIteratorT, typename OutputIteratorT, typename ScanOpT, typename InitValueT, typename NumItemsT>
616
+ CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan(
617
+ void* d_temp_storage, // nullptr selects the size-query mode described in the documentation above
618
+ size_t& temp_storage_bytes,
619
+ InputIteratorT d_in,
620
+ OutputIteratorT d_out,
621
+ ScanOpT scan_op,
622
+ InitValueT init_value,
623
+ NumItemsT num_items,
624
+ cudaStream_t stream = 0)
625
+ {
626
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::ExclusiveScan"); // NOTE(review): range appears gated on d_temp_storage so the size-query call is not traced - confirm against the macro
627
+
628
+ // Offset type used by the dispatch layer, derived from NumItemsT via detail::choose_offset_t
629
+ using OffsetT = detail::choose_offset_t<NumItemsT>;
630
+
631
+ return DispatchScan<InputIteratorT, OutputIteratorT, ScanOpT, detail::InputValue<InitValueT>, OffsetT>::Dispatch(
632
+ d_temp_storage,
633
+ temp_storage_bytes,
634
+ d_in,
635
+ d_out,
636
+ scan_op,
637
+ detail::InputValue<InitValueT>(init_value), // the immediate seed is wrapped in detail::InputValue before dispatch
638
+ num_items,
639
+ stream);
640
+ }
641
+
642
+ //! @rst
643
+ //! Computes a device-wide exclusive prefix scan using the specified
644
+ //! binary associative ``scan_op`` functor. The ``init_value`` value is applied as
645
+ //! the initial value, and is assigned to ``*d_out``.
646
+ //!
647
+ //! - Supports non-commutative scan operators.
648
+ //! - Results are not deterministic for pseudo-associative operators (e.g.,
649
+ //! addition of floating-point types). Results for pseudo-associative
650
+ //! operators may vary from run to run. Additional details can be found in
651
+ //! the @lookback description.
652
+ //! - When ``d_in`` and ``d_out`` are equal, the scan is performed in-place. The
653
+ //! range ``[d_in, d_in + num_items)`` and ``[d_out, d_out + num_items)``
654
+ //! shall not overlap in any other way.
655
+ //! - @devicestorage
656
+ //!
657
+ //! Snippet
658
+ //! +++++++++++++++++++++++++++++++++++++++++++++
659
+ //!
660
+ //! The code snippet below illustrates a user-defined exclusive-scan of a
661
+ //! device vector of ``float`` data elements.
662
+ //!
663
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_scan_env_api.cu
664
+ //! :language: c++
665
+ //! :dedent:
666
+ //! :start-after: example-begin exclusive-scan-env-determinism
667
+ //! :end-before: example-end exclusive-scan-env-determinism
668
+ //!
669
+ //! @endrst
670
+ //!
671
+ //! @tparam InputIteratorT
672
+ //! **[inferred]** Random-access input iterator type for reading scan inputs @iterator
673
+ //!
674
+ //! @tparam OutputIteratorT
675
+ //! **[inferred]** Random-access output iterator type for writing scan outputs @iterator
676
+ //!
677
+ //! @tparam ScanOpT
678
+ //! **[inferred]** Binary associative scan functor type having member `T operator()(const T &a, const T &b)`
679
+ //!
680
+ //! @tparam InitValueT
681
+ //! **[inferred]** Type of the `init_value`
682
+ //!
683
+ //! @tparam NumItemsT
684
+ //! **[inferred]** An integral type representing the number of input elements
685
+ //!
686
+ //! @tparam EnvT
687
+ //! **[inferred]** Execution environment type. Default is `_CUDA_STD_EXEC::env<>`.
688
+ //!
689
+ //! @param[in] d_in
690
+ //! Random-access iterator to the input sequence of data items
691
+ //!
692
+ //! @param[out] d_out
693
+ //! Random-access iterator to the output sequence of data items
694
+ //!
695
+ //! @param[in] scan_op
696
+ //! Binary associative scan functor
697
+ //!
698
+ //! @param[in] init_value
699
+ //! Initial value to seed the exclusive scan (and is assigned to `*d_out`)
700
+ //!
701
+ //! @param[in] num_items
702
+ //! Total number of input items (i.e., the length of `d_in`)
703
+ //!
704
+ //! @param[in] env
705
+ //! @rst
706
+ //! **[optional]** Execution environment. Default is `_CUDA_STD_EXEC::env{}`.
707
+ //! @endrst
708
+ template <typename InputIteratorT,
709
+ typename OutputIteratorT,
710
+ typename ScanOpT,
711
+ typename InitValueT,
712
+ typename NumItemsT,
713
+ typename EnvT = _CUDA_STD_EXEC::env<>>
714
+ [[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan(
715
+ InputIteratorT d_in,
716
+ OutputIteratorT d_out,
717
+ ScanOpT scan_op,
718
+ InitValueT init_value,
719
+ NumItemsT num_items,
720
+ EnvT env = {})
721
+ {
722
+ _CCCL_NVTX_RANGE_SCOPE("cub::DeviceScan::ExclusiveScan");
723
+
724
+ return scan_impl_env(d_in, d_out, scan_op, detail::InputValue<InitValueT>(init_value), num_items, env); // no caller-provided temporary storage in this overload; the seed is wrapped in detail::InputValue and forwarded together with env
725
+ }
726
+
727
+ //! @rst
728
+ //! Computes a device-wide exclusive prefix scan using the specified
729
+ //! binary associative ``scan_op`` functor. The ``init_value`` value is applied as
730
+ //! the initial value, and is assigned to ``*d_data``.
731
+ //!
732
+ //! - Supports non-commutative scan operators.
733
+ //! - Results are not deterministic for pseudo-associative operators (e.g.,
734
+ //! addition of floating-point types). Results for pseudo-associative
735
+ //! operators may vary from run to run. Additional details can be found in
736
+ //! the @lookback description.
737
+ //! - @devicestorage
738
+ //!
739
+ //! Snippet
740
+ //! +++++++++++++++++++++++++++++++++++++++++++++
741
+ //!
742
+ //! The code snippet below illustrates the exclusive prefix min-scan of an
743
+ //! ``int`` device vector:
744
+ //!
745
+ //! .. code-block:: c++
746
+ //!
747
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
748
+ //! #include <cuda/std/climits> // for INT_MAX
749
+ //!
750
+ //! // CustomMin functor
751
+ //! struct CustomMin
752
+ //! {
753
+ //! template <typename T>
754
+ //! __host__ __device__ __forceinline__
755
+ //! T operator()(const T &a, const T &b) const {
756
+ //! return (b < a) ? b : a;
757
+ //! }
758
+ //! };
759
+ //!
760
+ //! // Declare, allocate, and initialize device-accessible pointers for
761
+ //! // input and output
762
+ //! int num_items; // e.g., 7
763
+ //! int *d_data; // e.g., [8, 6, 7, 5, 3, 0, 9]
764
+ //! CustomMin min_op;
765
+ //! ...
766
+ //!
767
+ //! // Determine temporary device storage requirements for exclusive
768
+ //! // prefix scan
769
+ //! void *d_temp_storage = nullptr;
770
+ //! size_t temp_storage_bytes = 0;
771
+ //! cub::DeviceScan::ExclusiveScan(
772
+ //! d_temp_storage, temp_storage_bytes,
773
+ //! d_data, min_op, (int) INT_MAX, num_items);
774
+ //!
775
+ //! // Allocate temporary storage for exclusive prefix scan
776
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
777
+ //!
778
+ //! // Run exclusive prefix min-scan
779
+ //! cub::DeviceScan::ExclusiveScan(
780
+ //! d_temp_storage, temp_storage_bytes,
781
+ //! d_data, min_op, (int) INT_MAX, num_items);
782
+ //!
783
+ //! // d_data <-- [2147483647, 8, 6, 6, 5, 3, 0]
784
+ //!
785
+ //! @endrst
786
+ //!
787
+ //! @tparam IteratorT
788
+ //! **[inferred]** Random-access input iterator type for reading scan inputs and writing scan outputs
789
+ //!
790
+ //! @tparam ScanOpT
791
+ //! **[inferred]** Binary associative scan functor type having member `T operator()(const T &a, const T &b)`
792
+ //!
793
+ //! @tparam InitValueT
794
+ //! **[inferred]** Type of the `init_value`
795
+ //!
796
+ //! @tparam NumItemsT
797
+ //! **[inferred]** An integral type representing the number of input elements
798
+ //!
799
+ //! @param[in] d_temp_storage
800
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
801
+ //! required allocation size is written to `temp_storage_bytes` and no work is done.
802
+ //!
803
+ //! @param[in,out] temp_storage_bytes
804
+ //! Reference to size in bytes of `d_temp_storage` allocation
805
+ //!
806
+ //! @param[in,out] d_data
807
+ //! Random-access iterator to the sequence of data items
808
+ //!
809
+ //! @param[in] scan_op
810
+ //! Binary associative scan functor
811
+ //!
812
+ //! @param[in] init_value
813
+ //! Initial value to seed the exclusive scan (and is assigned to `*d_data`)
814
+ //!
815
+ //! @param[in] num_items
816
+ //! Total number of input items (i.e., the length of `d_data`)
817
+ //!
818
+ //! @param[in] stream
819
+ //! @rst
820
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
821
+ //! @endrst
822
+ template <typename IteratorT, typename ScanOpT, typename InitValueT, typename NumItemsT>
823
+ CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan(
824
+ void* d_temp_storage,
825
+ size_t& temp_storage_bytes,
826
+ IteratorT d_data,
827
+ ScanOpT scan_op,
828
+ InitValueT init_value,
829
+ NumItemsT num_items,
830
+ cudaStream_t stream = 0)
831
+ {
832
+ return ExclusiveScan(d_temp_storage, temp_storage_bytes, d_data, d_data, scan_op, init_value, num_items, stream); // in-place: d_data is forwarded as both the input and the output sequence
833
+ }
834
+
835
+ //! @rst
836
+ //! Computes a device-wide exclusive prefix scan using the specified
837
+ //! binary associative ``scan_op`` functor. The ``init_value`` value is provided as a future value.
838
+ //!
839
+ //! - Supports non-commutative scan operators.
840
+ //! - Results are not deterministic for pseudo-associative operators (e.g.,
841
+ //! addition of floating-point types). Results for pseudo-associative
842
+ //! operators may vary from run to run. Additional details can be found in
843
+ //! the @lookback description.
844
+ //! - When ``d_in`` and ``d_out`` are equal, the scan is performed in-place.
845
+ //! The range ``[d_in, d_in + num_items)`` and ``[d_out, d_out + num_items)``
846
+ //! shall not overlap in any other way.
847
+ //! - @devicestorage
848
+ //!
849
+ //! Snippet
850
+ //! +++++++++++++++++++++++++++++++++++++++++++++
851
+ //!
852
+ //! The code snippet below illustrates the exclusive prefix min-scan of an ``int`` device vector
853
+ //!
854
+ //! .. code-block:: c++
855
+ //!
856
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
857
+ //! #include <cuda/std/climits> // for INT_MAX
858
+ //!
859
+ //! // CustomMin functor
860
+ //! struct CustomMin
861
+ //! {
862
+ //! template <typename T>
863
+ //! __host__ __device__ __forceinline__
864
+ //! T operator()(const T &a, const T &b) const {
865
+ //! return (b < a) ? b : a;
866
+ //! }
867
+ //! };
868
+ //!
869
+ //! // Declare, allocate, and initialize device-accessible pointers for
870
+ //! // input and output
871
+ //! int num_items; // e.g., 7
872
+ //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
873
+ //! int *d_out; // e.g., [ , , , , , , ]
874
+ //! int *d_init_iter; // e.g., INT_MAX
875
+ //! CustomMin min_op;
876
+ //!
877
+ //! auto future_init_value =
878
+ //! cub::FutureValue<int>(d_init_iter);
879
+ //!
880
+ //! ...
881
+ //!
882
+ //! // Determine temporary device storage requirements for exclusive
883
+ //! // prefix scan
884
+ //! void *d_temp_storage = nullptr;
885
+ //! size_t temp_storage_bytes = 0;
886
+ //! cub::DeviceScan::ExclusiveScan(
887
+ //! d_temp_storage, temp_storage_bytes,
888
+ //! d_in, d_out, min_op, future_init_value, num_items);
889
+ //!
890
+ //! // Allocate temporary storage for exclusive prefix scan
891
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
892
+ //!
893
+ //! // Run exclusive prefix min-scan
894
+ //! cub::DeviceScan::ExclusiveScan(
895
+ //! d_temp_storage, temp_storage_bytes,
896
+ //! d_in, d_out, min_op, future_init_value, num_items);
897
+ //!
898
+ //! // d_out <-- [2147483647, 8, 6, 6, 5, 3, 0]
899
+ //!
900
+ //! @endrst
901
+ //!
902
+ //! @tparam InputIteratorT
903
+ //! **[inferred]** Random-access input iterator type for reading scan inputs @iterator
904
+ //!
905
+ //! @tparam OutputIteratorT
906
+ //! **[inferred]** Random-access output iterator type for writing scan outputs @iterator
907
+ //!
908
+ //! @tparam ScanOpT
909
+ //! **[inferred]** Binary associative scan functor type having member `T operator()(const T &a, const T &b)`
910
+ //!
911
+ //! @tparam InitValueT
912
+ //! **[inferred]** Type of the `init_value`
913
+ //!
914
+ //! @tparam NumItemsT
915
+ //! **[inferred]** An integral type representing the number of input elements
916
+ //!
917
+ //! @param[in] d_temp_storage
918
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
919
+ //! required allocation size is written to `temp_storage_bytes` and no work is done.
920
+ //!
921
+ //! @param[in,out] temp_storage_bytes
922
+ //! Reference to size in bytes of `d_temp_storage` allocation
923
+ //!
924
+ //! @param[in] d_in
925
+ //! Random-access iterator to the input sequence of data items
926
+ //!
927
+ //! @param[out] d_out
928
+ //! Random-access iterator to the output sequence of data items
929
+ //!
930
+ //! @param[in] scan_op
931
+ //! Binary associative scan functor
932
+ //!
933
+ //! @param[in] init_value
934
+ //! Initial value to seed the exclusive scan (and is assigned to `*d_out`)
935
+ //!
936
+ //! @param[in] num_items
937
+ //! Total number of input items (i.e., the length of `d_in`)
938
+ //!
939
+ //! @param[in] stream
940
+ //! @rst
941
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
942
+ //! @endrst
943
+ template <typename InputIteratorT,
944
+ typename OutputIteratorT,
945
+ typename ScanOpT,
946
+ typename InitValueT,
947
+ typename InitValueIterT = InitValueT*,
948
+ typename NumItemsT = int>
949
+ CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan(
950
+ void* d_temp_storage, // nullptr selects the size-query mode described in the documentation above
951
+ size_t& temp_storage_bytes,
952
+ InputIteratorT d_in,
953
+ OutputIteratorT d_out,
954
+ ScanOpT scan_op,
955
+ FutureValue<InitValueT, InitValueIterT> init_value, // seed supplied as a future value rather than an immediate value
956
+ NumItemsT num_items,
957
+ cudaStream_t stream = 0)
958
+ {
959
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::ExclusiveScan"); // NOTE(review): range appears gated on d_temp_storage so the size-query call is not traced - confirm against the macro
960
+
961
+ // Offset type used by the dispatch layer, derived from NumItemsT via detail::choose_offset_t
962
+ using OffsetT = detail::choose_offset_t<NumItemsT>;
963
+
964
+ return DispatchScan<InputIteratorT, OutputIteratorT, ScanOpT, detail::InputValue<InitValueT>, OffsetT>::Dispatch(
965
+ d_temp_storage,
966
+ temp_storage_bytes,
967
+ d_in,
968
+ d_out,
969
+ scan_op,
970
+ detail::InputValue<InitValueT>(init_value), // NOTE(review): InputValue<InitValueT> does not name InitValueIterT - confirm this accepts a FutureValue built with a non-default iterator type
971
+ num_items,
972
+ stream);
973
+ }
974
+
975
+ //! @rst
976
+ //! Computes a device-wide exclusive prefix scan using the specified binary associative ``scan_op`` functor.
977
+ //! The ``init_value`` value is provided as a future value.
978
+ //!
979
+ //! - Supports non-commutative scan operators.
980
+ //! - Results are not deterministic for pseudo-associative operators (e.g.,
981
+ //! addition of floating-point types). Results for pseudo-associative
982
+ //! operators may vary from run to run. Additional details can be found in
983
+ //! the @lookback description.
984
+ //! - @devicestorage
985
+ //!
986
+ //! Snippet
987
+ //! +++++++++++++++++++++++++++++++++++++++++++++
988
+ //!
989
+ //! The code snippet below illustrates the exclusive prefix min-scan of an ``int`` device vector
990
+ //!
991
+ //! .. code-block:: c++
992
+ //!
993
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
994
+ //! #include <cuda/std/climits> // for INT_MAX
995
+ //!
996
+ //! // CustomMin functor
997
+ //! struct CustomMin
998
+ //! {
999
+ //! template <typename T>
1000
+ //! __host__ __device__ __forceinline__
1001
+ //! T operator()(const T &a, const T &b) const {
1002
+ //! return (b < a) ? b : a;
1003
+ //! }
1004
+ //! };
1005
+ //!
1006
+ //! // Declare, allocate, and initialize device-accessible pointers for
1007
+ //! // input and output
1008
+ //! int num_items; // e.g., 7
1009
+ //! int *d_data; // e.g., [8, 6, 7, 5, 3, 0, 9]
1010
+ //! int *d_init_iter; // e.g., INT_MAX
1011
+ //! CustomMin min_op;
1012
+ //!
1013
+ //! auto future_init_value =
1014
+ //! cub::FutureValue<int>(d_init_iter);
1015
+ //!
1016
+ //! ...
1017
+ //!
1018
+ //! // Determine temporary device storage requirements for exclusive
1019
+ //! // prefix scan
1020
+ //! void *d_temp_storage = nullptr;
1021
+ //! size_t temp_storage_bytes = 0;
1022
+ //! cub::DeviceScan::ExclusiveScan(
1023
+ //! d_temp_storage, temp_storage_bytes,
1024
+ //! d_data, min_op, future_init_value, num_items);
1025
+ //!
1026
+ //! // Allocate temporary storage for exclusive prefix scan
1027
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
1028
+ //!
1029
+ //! // Run exclusive prefix min-scan
1030
+ //! cub::DeviceScan::ExclusiveScan(
1031
+ //! d_temp_storage, temp_storage_bytes,
1032
+ //! d_data, min_op, future_init_value, num_items);
1033
+ //!
1034
+ //! // d_data <-- [2147483647, 8, 6, 6, 5, 3, 0]
1035
+ //!
1036
+ //! @endrst
1037
+ //!
1038
+ //! @tparam IteratorT
1039
+ //! **[inferred]** Random-access input iterator type for reading scan inputs and writing scan outputs
1040
+ //!
1041
+ //! @tparam ScanOpT
1042
+ //! **[inferred]** Binary associative scan functor type having member `T operator()(const T &a, const T &b)`
1043
+ //!
1044
+ //! @tparam InitValueT
1045
+ //! **[inferred]** Type of the `init_value`
1046
+ //!
1047
+ //! @tparam NumItemsT
1048
+ //! **[inferred]** An integral type representing the number of input elements
1049
+ //!
1050
+ //! @param[in] d_temp_storage
1051
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
1052
+ //! required allocation size is written to `temp_storage_bytes` and no work is done.
1053
+ //!
1054
+ //! @param[in,out] temp_storage_bytes
1055
+ //! Reference to size in bytes of `d_temp_storage` allocation
1056
+ //!
1057
+ //! @param[in,out] d_data
1058
+ //! Random-access iterator to the sequence of data items
1059
+ //!
1060
+ //! @param[in] scan_op
1061
+ //! Binary associative scan functor
1062
+ //!
1063
+ //! @param[in] init_value
1064
+ //! Initial value to seed the exclusive scan (and is assigned to `*d_data`)
1065
+ //!
1066
+ //! @param[in] num_items
1067
+ //! Total number of input items (i.e., the length of `d_data`)
1068
+ //!
1069
+ //! @param[in] stream
1070
+ //! @rst
1071
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
1072
+ //! @endrst
1073
+ template <typename IteratorT,
1074
+ typename ScanOpT,
1075
+ typename InitValueT,
1076
+ typename InitValueIterT = InitValueT*,
1077
+ typename NumItemsT = int>
1078
+ CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan(
1079
+ void* d_temp_storage,
1080
+ size_t& temp_storage_bytes,
1081
+ IteratorT d_data,
1082
+ ScanOpT scan_op,
1083
+ FutureValue<InitValueT, InitValueIterT> init_value,
1084
+ NumItemsT num_items,
1085
+ cudaStream_t stream = 0)
1086
+ {
1087
+ return ExclusiveScan(d_temp_storage, temp_storage_bytes, d_data, d_data, scan_op, init_value, num_items, stream);
1088
+ }
1089
+
1090
+ //! @} end member group
1091
+
1092
+ //! @name Inclusive scans
1093
+ //! @{
1094
+
1095
+ //! @rst
1096
+ //! Computes a device-wide inclusive prefix sum.
1097
+ //!
1098
+ //! - Supports non-commutative sum operators.
1099
+ //! - Results are not deterministic for pseudo-associative operators (e.g.,
1100
+ //! addition of floating-point types). Results for pseudo-associative
1101
+ //! operators may vary from run to run. Additional details can be found in
1102
+ //! the @lookback description.
1103
+ //! - When ``d_in`` and ``d_out`` are equal, the scan is performed in-place. The
1104
+ //! range ``[d_in, d_in + num_items)`` and ``[d_out, d_out + num_items)``
1105
+ //! shall not overlap in any other way.
1106
+ //! - @devicestorage
1107
+ //!
1108
+ //! Snippet
1109
+ //! +++++++++++++++++++++++++++++++++++++++++++++
1110
+ //!
1111
+ //! The code snippet below illustrates the inclusive prefix sum of an ``int`` device vector.
1112
+ //!
1113
+ //! .. code-block:: c++
1114
+ //!
1115
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
1116
+ //!
1117
+ //! // Declare, allocate, and initialize device-accessible pointers for
1118
+ //! // input and output
1119
+ //! int num_items; // e.g., 7
1120
+ //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
1121
+ //! int *d_out; // e.g., [ , , , , , , ]
1122
+ //! ...
1123
+ //!
1124
+ //! // Determine temporary device storage requirements for inclusive
1125
+ //! // prefix sum
1126
+ //! void *d_temp_storage = nullptr;
1127
+ //! size_t temp_storage_bytes = 0;
1128
+ //! cub::DeviceScan::InclusiveSum(
1129
+ //! d_temp_storage, temp_storage_bytes,
1130
+ //! d_in, d_out, num_items);
1131
+ //!
1132
+ //! // Allocate temporary storage for inclusive prefix sum
1133
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
1134
+ //!
1135
+ //! // Run inclusive prefix sum
1136
+ //! cub::DeviceScan::InclusiveSum(
1137
+ //! d_temp_storage, temp_storage_bytes,
1138
+ //! d_in, d_out, num_items);
1139
+ //!
1140
+ //! // d_out <-- [8, 14, 21, 26, 29, 29, 38]
1141
+ //!
1142
+ //! @endrst
1143
+ //!
1144
+ //! @tparam InputIteratorT
1145
+ //! **[inferred]** Random-access input iterator type for reading scan inputs @iterator
1146
+ //!
1147
+ //! @tparam OutputIteratorT
1148
+ //! **[inferred]** Random-access output iterator type for writing scan outputs @iterator
1149
+ //!
1150
+ //! @tparam NumItemsT
1151
+ //! **[inferred]** An integral type representing the number of input elements
1152
+ //!
1153
+ //! @param[in] d_temp_storage
1154
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
1155
+ //! required allocation size is written to `temp_storage_bytes` and no work is done.
1156
+ //!
1157
+ //! @param[in,out] temp_storage_bytes
1158
+ //! Reference to size in bytes of `d_temp_storage` allocation
1159
+ //!
1160
+ //! @param[in] d_in
1161
+ //! Random-access iterator to the input sequence of data items
1162
+ //!
1163
+ //! @param[out] d_out
1164
+ //! Random-access iterator to the output sequence of data items
1165
+ //!
1166
+ //! @param[in] num_items
1167
+ //! Total number of input items (i.e., the length of `d_in`)
1168
+ //!
1169
+ //! @param[in] stream
1170
+ //! @rst
1171
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
1172
+ //! @endrst
1173
+ template <typename InputIteratorT, typename OutputIteratorT, typename NumItemsT>
1174
+ CUB_RUNTIME_FUNCTION static cudaError_t InclusiveSum(
1175
+ void* d_temp_storage,
1176
+ size_t& temp_storage_bytes,
1177
+ InputIteratorT d_in,
1178
+ OutputIteratorT d_out,
1179
+ NumItemsT num_items,
1180
+ cudaStream_t stream = 0)
1181
+ {
1182
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::InclusiveSum");
1183
+
1184
+ // Unsigned integer type for global offsets
1185
+ using OffsetT = detail::choose_offset_t<NumItemsT>;
1186
+
1187
+ return DispatchScan<InputIteratorT, OutputIteratorT, ::cuda::std::plus<>, NullType, OffsetT>::Dispatch(
1188
+ d_temp_storage, temp_storage_bytes, d_in, d_out, ::cuda::std::plus<>{}, NullType{}, num_items, stream);
1189
+ }
1190
+
1191
+ //! @rst
1192
+ //! Computes a device-wide inclusive prefix sum in-place.
1193
+ //!
1194
+ //! - Supports non-commutative sum operators.
1195
+ //! - Results are not deterministic for pseudo-associative operators (e.g.,
1196
+ //! addition of floating-point types). Results for pseudo-associative
1197
+ //! operators may vary from run to run. Additional details can be found in
1198
+ //! the @lookback description.
1199
+ //! - @devicestorage
1200
+ //!
1201
+ //! Snippet
1202
+ //! +++++++++++++++++++++++++++++++++++++++++++++
1203
+ //!
1204
+ //! The code snippet below illustrates the inclusive prefix sum of an ``int`` device vector.
1205
+ //!
1206
+ //! .. code-block:: c++
1207
+ //!
1208
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
1209
+ //!
1210
+ //! // Declare, allocate, and initialize device-accessible pointers for
1211
+ //! // input and output
1212
+ //! int num_items; // e.g., 7
1213
+ //! int *d_data; // e.g., [8, 6, 7, 5, 3, 0, 9]
1214
+ //! ...
1215
+ //!
1216
+ //! // Determine temporary device storage requirements for inclusive
1217
+ //! // prefix sum
1218
+ //! void *d_temp_storage = nullptr;
1219
+ //! size_t temp_storage_bytes = 0;
1220
+ //! cub::DeviceScan::InclusiveSum(
1221
+ //! d_temp_storage, temp_storage_bytes,
1222
+ //! d_data, num_items);
1223
+ //!
1224
+ //! // Allocate temporary storage for inclusive prefix sum
1225
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
1226
+ //!
1227
+ //! // Run inclusive prefix sum
1228
+ //! cub::DeviceScan::InclusiveSum(
1229
+ //! d_temp_storage, temp_storage_bytes,
1230
+ //! d_data, num_items);
1231
+ //!
1232
+ //! // d_data <-- [8, 14, 21, 26, 29, 29, 38]
1233
+ //!
1234
+ //! @endrst
1235
+ //!
1236
+ //! @tparam IteratorT
1237
+ //! **[inferred]** Random-access input iterator type for reading scan inputs and writing scan outputs
1238
+ //!
1239
+ //! @tparam NumItemsT
1240
+ //! **[inferred]** An integral type representing the number of input elements
1241
+ //!
1242
+ //! @param[in] d_temp_storage
1243
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
1244
+ //! required allocation size is written to `temp_storage_bytes` and no work is done.
1245
+ //!
1246
+ //! @param[in,out] temp_storage_bytes
1247
+ //! Reference to size in bytes of `d_temp_storage` allocation
1248
+ //!
1249
+ //! @param[in,out] d_data
1250
+ //! Random-access iterator to the sequence of data items
1251
+ //!
1252
+ //! @param[in] num_items
1253
+ //! Total number of input items (i.e., the length of `d_data`)
1254
+ //!
1255
+ //! @param[in] stream
1256
+ //! @rst
1257
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
1258
+ //! @endrst
1259
+ template <typename IteratorT, typename NumItemsT>
1260
+ CUB_RUNTIME_FUNCTION static cudaError_t InclusiveSum(
1261
+ void* d_temp_storage, size_t& temp_storage_bytes, IteratorT d_data, NumItemsT num_items, cudaStream_t stream = 0)
1262
+ {
1263
+ return InclusiveSum(d_temp_storage, temp_storage_bytes, d_data, d_data, num_items, stream);
1264
+ }
1265
+
1266
+ //! @rst
1267
+ //! Computes a device-wide inclusive prefix scan using the specified binary associative ``scan_op`` functor.
1268
+ //!
1269
+ //! - Supports non-commutative scan operators.
1270
+ //! - Results are not deterministic for pseudo-associative operators (e.g.,
1271
+ //! addition of floating-point types). Results for pseudo-associative
1272
+ //! operators may vary from run to run. Additional details can be found in
1273
+ //! the @lookback description.
1274
+ //! - When ``d_in`` and ``d_out`` are equal, the scan is performed in-place. The
1275
+ //! range ``[d_in, d_in + num_items)`` and ``[d_out, d_out + num_items)``
1276
+ //! shall not overlap in any other way.
1277
+ //! - @devicestorage
1278
+ //!
1279
+ //! Snippet
1280
+ //! +++++++++++++++++++++++++++++++++++++++++++++
1281
+ //!
1282
+ //! The code snippet below illustrates the inclusive prefix min-scan of an ``int`` device vector.
1283
+ //!
1284
+ //! .. code-block:: c++
1285
+ //!
1286
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
1287
+ //! #include <cuda/std/climits> // for INT_MAX
1288
+ //!
1289
+ //! // CustomMin functor
1290
+ //! struct CustomMin
1291
+ //! {
1292
+ //! template <typename T>
1293
+ //! __host__ __device__ __forceinline__
1294
+ //! T operator()(const T &a, const T &b) const {
1295
+ //! return (b < a) ? b : a;
1296
+ //! }
1297
+ //! };
1298
+ //!
1299
+ //! // Declare, allocate, and initialize device-accessible pointers for
1300
+ //! // input and output
1301
+ //! int num_items; // e.g., 7
1302
+ //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
1303
+ //! int *d_out; // e.g., [ , , , , , , ]
1304
+ //! CustomMin min_op;
1305
+ //! ...
1306
+ //!
1307
+ //! // Determine temporary device storage requirements for inclusive
1308
+ //! // prefix scan
1309
+ //! void *d_temp_storage = nullptr;
1310
+ //! size_t temp_storage_bytes = 0;
1311
+ //! cub::DeviceScan::InclusiveScan(
1312
+ //! d_temp_storage, temp_storage_bytes,
1313
+ //! d_in, d_out, min_op, num_items);
1314
+ //!
1315
+ //! // Allocate temporary storage for inclusive prefix scan
1316
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
1317
+ //!
1318
+ //! // Run inclusive prefix min-scan
1319
+ //! cub::DeviceScan::InclusiveScan(
1320
+ //! d_temp_storage, temp_storage_bytes,
1321
+ //! d_in, d_out, min_op, num_items);
1322
+ //!
1323
+ //! // d_out <-- [8, 6, 6, 5, 3, 0, 0]
1324
+ //!
1325
+ //! @endrst
1326
+ //!
1327
+ //! @tparam InputIteratorT
1328
+ //! **[inferred]** Random-access input iterator type for reading scan inputs @iterator
1329
+ //!
1330
+ //! @tparam OutputIteratorT
1331
+ //! **[inferred]** Random-access output iterator type for writing scan outputs @iterator
1332
+ //!
1333
+ //! @tparam ScanOpT
1334
+ //! **[inferred]** Binary associative scan functor type having member `T operator()(const T &a, const T &b)`
1335
+ //!
1336
+ //! @tparam NumItemsT
1337
+ //! **[inferred]** An integral type representing the number of input elements
1338
+ //!
1339
+ //! @param[in] d_temp_storage
1340
+ //! Device-accessible allocation of temporary storage.
1341
+ //! When `nullptr`, the required allocation size is written to
1342
+ //! `temp_storage_bytes` and no work is done.
1343
+ //!
1344
+ //! @param[in,out] temp_storage_bytes
1345
+ //! Reference to size in bytes of `d_temp_storage` allocation
1346
+ //!
1347
+ //! @param[in] d_in
1348
+ //! Random-access iterator to the input sequence of data items
1349
+ //!
1350
+ //! @param[out] d_out
1351
+ //! Random-access iterator to the output sequence of data items
1352
+ //!
1353
+ //! @param[in] scan_op
1354
+ //! Binary associative scan functor
1355
+ //!
1356
+ //! @param[in] num_items
1357
+ //! Total number of input items (i.e., the length of `d_in`)
1358
+ //!
1359
+ //! @param[in] stream
1360
+ //! @rst
1361
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
1362
+ //! @endrst
1363
+ template <typename InputIteratorT, typename OutputIteratorT, typename ScanOpT, typename NumItemsT>
1364
+ CUB_RUNTIME_FUNCTION static cudaError_t InclusiveScan(
1365
+ void* d_temp_storage,
1366
+ size_t& temp_storage_bytes,
1367
+ InputIteratorT d_in,
1368
+ OutputIteratorT d_out,
1369
+ ScanOpT scan_op,
1370
+ NumItemsT num_items,
1371
+ cudaStream_t stream = 0)
1372
+ {
1373
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::InclusiveScan");
1374
+
1375
+ // Unsigned integer type for global offsets
1376
+ using OffsetT = detail::choose_offset_t<NumItemsT>;
1377
+
1378
+ return DispatchScan<InputIteratorT, OutputIteratorT, ScanOpT, NullType, OffsetT>::Dispatch(
1379
+ d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, NullType(), num_items, stream);
1380
+ }
1381
+
1382
+ //! @rst
1383
+ //! Computes a device-wide inclusive prefix scan using the specified binary associative ``scan_op`` functor.
1384
+ //! The result of applying the ``scan_op`` binary operator to ``init_value`` value and ``*d_in``
1385
+ //! is assigned to ``*d_out``.
1386
+ //!
1387
+ //! - Supports non-commutative scan operators.
1388
+ //! - Results are not deterministic for pseudo-associative operators (e.g.,
1389
+ //! addition of floating-point types). Results for pseudo-associative
1390
+ //! operators may vary from run to run. Additional details can be found in
1391
+ //! the @lookback description.
1392
+ //! - When ``d_in`` and ``d_out`` are equal, the scan is performed in-place. The
1393
+ //! range ``[d_in, d_in + num_items)`` and ``[d_out, d_out + num_items)``
1394
+ //! shall not overlap in any other way.
1395
+ //! - @devicestorage
1396
+ //!
1397
+ //! Snippet
1398
+ //! +++++++++++++++++++++++++++++++++++++++++++++
1399
+ //!
1400
+ //! The code snippet below illustrates the inclusive max-scan of an ``int`` device vector.
1401
+ //!
1402
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_scan_api.cu
1403
+ //! :language: c++
1404
+ //! :dedent:
1405
+ //! :start-after: example-begin device-inclusive-scan
1406
+ //! :end-before: example-end device-inclusive-scan
1407
+ //!
1408
+ //! @endrst
1409
+ //!
1410
+ //! @tparam InputIteratorT
1411
+ //! **[inferred]** Random-access input iterator type for reading scan inputs @iterator
1412
+ //!
1413
+ //! @tparam OutputIteratorT
1414
+ //! **[inferred]** Random-access output iterator type for writing scan outputs @iterator
1415
+ //!
1416
+ //! @tparam ScanOpT
1417
+ //! **[inferred]** Binary associative scan functor type having member `T operator()(const T &a, const T &b)`
1418
+ //!
1419
+ //! @tparam InitValueT
1420
+ //! **[inferred]** Type of the `init_value`
1421
+ //!
1422
+ //! @tparam NumItemsT
1423
+ //! **[inferred]** An integral type representing the number of input elements
1424
+ //!
1425
+ //! @param[in] d_temp_storage
1426
+ //! Device-accessible allocation of temporary storage.
1427
+ //! When `nullptr`, the required allocation size is written to
1428
+ //! `temp_storage_bytes` and no work is done.
1429
+ //!
1430
+ //! @param[in,out] temp_storage_bytes
1431
+ //! Reference to the size in bytes of the `d_temp_storage` allocation
1432
+ //!
1433
+ //! @param[in] d_in
1434
+ //! Random-access iterator to the input sequence of data items
1435
+ //!
1436
+ //! @param[out] d_out
1437
+ //! Random-access iterator to the output sequence of data items
1438
+ //!
1439
+ //! @param[in] scan_op
1440
+ //! Binary associative scan functor
1441
+ //!
1442
+ //! @param[in] init_value
1443
+ //! Initial value to seed the inclusive scan (`scan_op(init_value, d_in[0])`
1444
+ //! is assigned to `*d_out`)
1445
+ //!
1446
+ //! @param[in] num_items
1447
+ //! Total number of input items (i.e., the length of `d_in`)
1448
+ //!
1449
+ //! @param[in] stream
1450
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream 0.
1451
+ template <typename InputIteratorT, typename OutputIteratorT, typename ScanOpT, typename InitValueT, typename NumItemsT>
1452
+ CUB_RUNTIME_FUNCTION static cudaError_t InclusiveScanInit(
1453
+ void* d_temp_storage,
1454
+ size_t& temp_storage_bytes,
1455
+ InputIteratorT d_in,
1456
+ OutputIteratorT d_out,
1457
+ ScanOpT scan_op,
1458
+ InitValueT init_value,
1459
+ NumItemsT num_items,
1460
+ cudaStream_t stream = 0)
1461
+ {
1462
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::InclusiveScanInit");
1463
+
1464
+ // Unsigned integer type for global offsets
1465
+ using OffsetT = detail::choose_offset_t<NumItemsT>;
1466
+ using AccumT = ::cuda::std::__accumulator_t<ScanOpT, cub::detail::it_value_t<InputIteratorT>, InitValueT>;
1467
+
1468
+ return DispatchScan<
1469
+ InputIteratorT,
1470
+ OutputIteratorT,
1471
+ ScanOpT,
1472
+ detail::InputValue<InitValueT>,
1473
+ OffsetT,
1474
+ AccumT,
1475
+ ForceInclusive::Yes>::Dispatch(d_temp_storage,
1476
+ temp_storage_bytes,
1477
+ d_in,
1478
+ d_out,
1479
+ scan_op,
1480
+ detail::InputValue<InitValueT>(init_value),
1481
+ num_items,
1482
+ stream);
1483
+ }
1484
+
1485
+ //! @rst
1486
+ //! Computes a device-wide inclusive prefix scan using the specified binary associative ``scan_op`` functor.
1487
+ //!
1488
+ //! - Supports non-commutative scan operators.
1489
+ //! - Results are not deterministic for pseudo-associative operators (e.g.,
1490
+ //! addition of floating-point types). Results for pseudo-associative
1491
+ //! operators may vary from run to run. Additional details can be found in
1492
+ //! the @lookback description.
1493
+ //! - @devicestorage
1494
+ //!
1495
+ //! Snippet
1496
+ //! +++++++++++++++++++++++++++++++++++++++++++++
1497
+ //!
1498
+ //! The code snippet below illustrates the inclusive prefix min-scan of an ``int`` device vector.
1499
+ //!
1500
+ //! .. code-block:: c++
1501
+ //!
1502
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
1503
+ //! #include <cuda/std/climits> // for INT_MAX
1504
+ //!
1505
+ //! // CustomMin functor
1506
+ //! struct CustomMin
1507
+ //! {
1508
+ //! template <typename T>
1509
+ //! __host__ __device__ __forceinline__
1510
+ //! T operator()(const T &a, const T &b) const {
1511
+ //! return (b < a) ? b : a;
1512
+ //! }
1513
+ //! };
1514
+ //!
1515
+ //! // Declare, allocate, and initialize device-accessible pointers for
1516
+ //! // input and output
1517
+ //! int num_items; // e.g., 7
1518
+ //! int *d_data; // e.g., [8, 6, 7, 5, 3, 0, 9]
1519
+ //! CustomMin min_op;
1520
+ //! ...
1521
+ //!
1522
+ //! // Determine temporary device storage requirements for inclusive
1523
+ //! // prefix scan
1524
+ //! void *d_temp_storage = nullptr;
1525
+ //! size_t temp_storage_bytes = 0;
1526
+ //! cub::DeviceScan::InclusiveScan(
1527
+ //! d_temp_storage, temp_storage_bytes,
1528
+ //! d_data, min_op, num_items);
1529
+ //!
1530
+ //! // Allocate temporary storage for inclusive prefix scan
1531
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
1532
+ //!
1533
+ //! // Run inclusive prefix min-scan
1534
+ //! cub::DeviceScan::InclusiveScan(
1535
+ //! d_temp_storage, temp_storage_bytes,
1536
+ //! d_data, min_op, num_items);
1537
+ //!
1538
+ //! // d_data <-- [8, 6, 6, 5, 3, 0, 0]
1539
+ //!
1540
+ //! @endrst
1541
+ //!
1542
+ //! @tparam IteratorT
1543
+ //! **[inferred]** Random-access input iterator type for reading scan inputs and writing scan outputs
1544
+ //!
1545
+ //! @tparam ScanOpT
1546
+ //! **[inferred]** Binary associative scan functor type having member `T operator()(const T &a, const T &b)`
1547
+ //!
1548
+ //! @tparam NumItemsT
1549
+ //! **[inferred]** An integral type representing the number of input elements
1550
+ //!
1551
+ //! @param[in] d_temp_storage
1552
+ //! Device-accessible allocation of temporary storage.
1553
+ //! When `nullptr`, the required allocation size is written to
1554
+ //! `temp_storage_bytes` and no work is done.
1555
+ //!
1556
+ //! @param[in,out] temp_storage_bytes
1557
+ //! Reference to size in bytes of `d_temp_storage` allocation
1558
+ //!
1559
+ //! @param[in,out] d_data
1560
+ //! Random-access iterator to the sequence of data items
1561
+ //!
1562
+ //! @param[in] scan_op
1563
+ //! Binary associative scan functor
1564
+ //!
1565
+ //! @param[in] num_items
1566
+ //! Total number of input items (i.e., the length of `d_data`)
1567
+ //!
1568
+ //! @param[in] stream
1569
+ //! @rst
1570
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
1571
+ //! @endrst
1572
+ template <typename IteratorT, typename ScanOpT, typename NumItemsT>
1573
+ CUB_RUNTIME_FUNCTION static cudaError_t InclusiveScan(
1574
+ void* d_temp_storage,
1575
+ size_t& temp_storage_bytes,
1576
+ IteratorT d_data,
1577
+ ScanOpT scan_op,
1578
+ NumItemsT num_items,
1579
+ cudaStream_t stream = 0)
1580
+ {
1581
+ return InclusiveScan(d_temp_storage, temp_storage_bytes, d_data, d_data, scan_op, num_items, stream);
1582
+ }
1583
+ //! @} end member group
1584
+
1585
+ //! @name Scans by key
1586
+ //! @{
1587
+
1588
+ //! @rst
1589
+ //! Computes a device-wide exclusive prefix sum-by-key with key equality
1590
+ //! defined by ``equality_op``. The value of ``0`` is applied as the initial
1591
+ //! value, and is assigned to the beginning of each segment in ``d_values_out``.
1592
+ //!
1593
+ //! - Supports non-commutative sum operators.
1594
+ //! - Results are not deterministic for pseudo-associative operators (e.g.,
1595
+ //! addition of floating-point types). Results for pseudo-associative
1596
+ //! operators may vary from run to run. Additional details can be found in
1597
+ //! the @lookback description.
1598
+ //! - ``d_keys_in`` may equal ``d_values_out`` but the range
1599
+ //! ``[d_keys_in, d_keys_in + num_items)`` and the range
1600
+ //! ``[d_values_out, d_values_out + num_items)`` shall not overlap otherwise.
1601
+ //! - ``d_values_in`` may equal ``d_values_out`` but the range
1602
+ //! ``[d_values_in, d_values_in + num_items)`` and the range
1603
+ //! ``[d_values_out, d_values_out + num_items)`` shall not overlap otherwise.
1604
+ //! - @devicestorage
1605
+ //!
1606
+ //! Snippet
1607
+ //! +++++++++++++++++++++++++++++++++++++++++++++
1608
+ //!
1609
+ //! The code snippet below illustrates the exclusive prefix sum-by-key of an ``int`` device vector.
1610
+ //!
1611
+ //! .. code-block:: c++
1612
+ //!
1613
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
1614
+ //!
1615
+ //! // Declare, allocate, and initialize device-accessible pointers for
1616
+ //! // input and output
1617
+ //! int num_items; // e.g., 7
1618
+ //! int *d_keys_in; // e.g., [0, 0, 1, 1, 1, 2, 2]
1619
+ //! int *d_values_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
1620
+ //! int *d_values_out; // e.g., [ , , , , , , ]
1621
+ //! ...
1622
+ //!
1623
+ //! // Determine temporary device storage requirements
1624
+ //! void *d_temp_storage = nullptr;
1625
+ //! size_t temp_storage_bytes = 0;
1626
+ //! cub::DeviceScan::ExclusiveSumByKey(
1627
+ //! d_temp_storage, temp_storage_bytes,
1628
+ //! d_keys_in, d_values_in, d_values_out, num_items);
1629
+ //!
1630
+ //! // Allocate temporary storage
1631
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
1632
+ //!
1633
+ //! // Run exclusive prefix sum
1634
+ //! cub::DeviceScan::ExclusiveSumByKey(
1635
+ //! d_temp_storage, temp_storage_bytes,
1636
+ //! d_keys_in, d_values_in, d_values_out, num_items);
1637
+ //!
1638
+ //! // d_values_out <-- [0, 8, 0, 7, 12, 0, 0]
1639
+ //!
1640
+ //! @endrst
1641
+ //!
1642
+ //! @tparam KeysInputIteratorT
1643
+ //! **[inferred]** Random-access input iterator type for reading scan keys inputs @iterator
1644
+ //!
1645
+ //! @tparam ValuesInputIteratorT
1646
+ //! **[inferred]** Random-access input iterator type for reading scan values inputs @iterator
1647
+ //!
1648
+ //! @tparam ValuesOutputIteratorT
1649
+ //! **[inferred]** Random-access output iterator type for writing scan values outputs @iterator
1650
+ //!
1651
+ //! @tparam EqualityOpT
1652
+ //! **[inferred]** Functor type having member
1653
+ //! `bool operator()(const T &a, const T &b)` for binary operations that define the equality of keys
1654
+ //!
1655
+ //! @tparam NumItemsT
1656
+ //! **[inferred]** An integral type representing the number of input elements
1657
+ //!
1658
+ //! @param[in] d_temp_storage
1659
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
1660
+ //! required allocation size is written to `temp_storage_bytes` and no work is done.
1661
+ //!
1662
+ //! @param[in,out] temp_storage_bytes
1663
+ //! Reference to size in bytes of `d_temp_storage` allocation
1664
+ //!
1665
+ //! @param[in] d_keys_in
1666
+ //! Random-access input iterator to the input sequence of key items
1667
+ //!
1668
+ //! @param[in] d_values_in
1669
+ //! Random-access input iterator to the input sequence of value items
1670
+ //!
1671
+ //! @param[out] d_values_out
1672
+ //! Random-access output iterator to the output sequence of value items
1673
+ //!
1674
+ //! @param[in] num_items
1675
+ //! Total number of input items (i.e., the length of `d_keys_in` and `d_values_in`)
1676
+ //!
1677
+ //! @param[in] equality_op
1678
+ //! Binary functor that defines the equality of keys.
1679
+ //! Default is cuda::std::equal_to<>{}.
1680
+ //!
1681
+ //! @param[in] stream
1682
+ //! @rst
1683
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
1684
+ //! @endrst
1685
+ template <typename KeysInputIteratorT,
1686
+ typename ValuesInputIteratorT,
1687
+ typename ValuesOutputIteratorT,
1688
+ typename EqualityOpT = ::cuda::std::equal_to<>,
1689
+ typename NumItemsT = uint32_t>
1690
+ CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveSumByKey(
1691
+ void* d_temp_storage,
1692
+ size_t& temp_storage_bytes,
1693
+ KeysInputIteratorT d_keys_in,
1694
+ ValuesInputIteratorT d_values_in,
1695
+ ValuesOutputIteratorT d_values_out,
1696
+ NumItemsT num_items,
1697
+ EqualityOpT equality_op = EqualityOpT(),
1698
+ cudaStream_t stream = 0)
1699
+ {
1700
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::ExclusiveSumByKey");
1701
+
1702
+ // Unsigned integer type for global offsets
1703
+ using OffsetT = detail::choose_offset_t<NumItemsT>;
1704
+ using InitT = cub::detail::it_value_t<ValuesInputIteratorT>;
1705
+
1706
+ // Initial value
1707
+ InitT init_value{};
1708
+
1709
+ return DispatchScanByKey<
1710
+ KeysInputIteratorT,
1711
+ ValuesInputIteratorT,
1712
+ ValuesOutputIteratorT,
1713
+ EqualityOpT,
1714
+ ::cuda::std::plus<>,
1715
+ InitT,
1716
+ OffsetT>::Dispatch(d_temp_storage,
1717
+ temp_storage_bytes,
1718
+ d_keys_in,
1719
+ d_values_in,
1720
+ d_values_out,
1721
+ equality_op,
1722
+ ::cuda::std::plus<>{},
1723
+ init_value,
1724
+ num_items,
1725
+ stream);
1726
+ }
1727
+
1728
+ //! @rst
1729
+ //! Computes a device-wide exclusive prefix scan-by-key using the
1730
+ //! specified binary associative ``scan_op`` functor. The key equality is defined by
1731
+ //! ``equality_op``. The ``init_value`` value is applied as the initial
1732
+ //! value, and is assigned to the beginning of each segment in ``d_values_out``.
1733
+ //!
1734
+ //! - Supports non-commutative scan operators.
1735
+ //! - Results are not deterministic for pseudo-associative operators (e.g.,
1736
+ //! addition of floating-point types). Results for pseudo-associative
1737
+ //! operators may vary from run to run. Additional details can be found in
1738
+ //! the @lookback description.
1739
+ //! - ``d_keys_in`` may equal ``d_values_out`` but the range
1740
+ //! ``[d_keys_in, d_keys_in + num_items)`` and the range
1741
+ //! ``[d_values_out, d_values_out + num_items)`` shall not overlap otherwise.
1742
+ //! - ``d_values_in`` may equal ``d_values_out`` but the range
1743
+ //! ``[d_values_in, d_values_in + num_items)`` and the range
1744
+ //! ``[d_values_out, d_values_out + num_items)`` shall not overlap otherwise.
1745
+ //! - @devicestorage
1746
+ //!
1747
+ //! Snippet
1748
+ //! +++++++++++++++++++++++++++++++++++++++++++++
1749
+ //!
1750
+ //! The code snippet below illustrates the exclusive prefix min-scan-by-key of an ``int`` device vector
1751
+ //!
1752
+ //! .. code-block:: c++
1753
+ //!
1754
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
1755
+ //! #include <cuda/std/climits> // for INT_MAX
1756
+ //!
1757
+ //! // CustomMin functor
1758
+ //! struct CustomMin
1759
+ //! {
1760
+ //! template <typename T>
1761
+ //! __host__ __device__ __forceinline__
1762
+ //! T operator()(const T &a, const T &b) const {
1763
+ //! return (b < a) ? b : a;
1764
+ //! }
1765
+ //! };
1766
+ //!
1767
+ //! // CustomEqual functor
1768
+ //! struct CustomEqual
1769
+ //! {
1770
+ //! template <typename T>
1771
+ //! __host__ __device__ __forceinline__
1772
+ //! bool operator()(const T &a, const T &b) const {
1773
+ //! return a == b;
1774
+ //! }
1775
+ //! };
1776
+ //!
1777
+ //! // Declare, allocate, and initialize device-accessible pointers for
1778
+ //! // input and output
1779
+ //! int num_items; // e.g., 7
1780
+ //! int *d_keys_in; // e.g., [0, 0, 1, 1, 1, 2, 2]
1781
+ //! int *d_values_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
1782
+ //! int *d_values_out; // e.g., [ , , , , , , ]
1783
+ //! CustomMin min_op;
1784
+ //! CustomEqual equality_op;
1785
+ //! ...
1786
+ //!
1787
+ //! // Determine temporary device storage requirements for exclusive
1788
+ //! // prefix scan
1789
+ //! void *d_temp_storage = nullptr;
1790
+ //! size_t temp_storage_bytes = 0;
1791
+ //! cub::DeviceScan::ExclusiveScanByKey(
1792
+ //! d_temp_storage, temp_storage_bytes,
1793
+ //! d_keys_in, d_values_in, d_values_out, min_op,
1794
+ //! (int) INT_MAX, num_items, equality_op);
1795
+ //!
1796
+ //! // Allocate temporary storage for exclusive prefix scan
1797
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
1798
+ //!
1799
+ //! // Run exclusive prefix min-scan
1800
+ //! cub::DeviceScan::ExclusiveScanByKey(
1801
+ //! d_temp_storage, temp_storage_bytes,
1802
+ //! d_keys_in, d_values_in, d_values_out, min_op,
1803
+ //! (int) INT_MAX, num_items, equality_op);
1804
+ //!
1805
+ //! // d_values_out <-- [2147483647, 8, 2147483647, 7, 5, 2147483647, 0]
1806
+ //!
1807
+ //! @endrst
1808
+ //!
1809
+ //! @tparam KeysInputIteratorT
1810
+ //! **[inferred]** Random-access input iterator type for reading scan keys inputs @iterator
1811
+ //!
1812
+ //! @tparam ValuesInputIteratorT
1813
+ //! **[inferred]** Random-access input iterator type for reading scan values inputs @iterator
1814
+ //!
1815
+ //! @tparam ValuesOutputIteratorT
1816
+ //! **[inferred]** Random-access output iterator type for writing scan values outputs @iterator
1817
+ //!
1818
+ //! @tparam ScanOpT
1819
+ //! **[inferred]** Binary associative scan functor type having member `T operator()(const T &a, const T &b)`
1820
+ //!
1821
+ //! @tparam InitValueT
1822
+ //! **[inferred]** Type of the `init_value`
1823
+ //!
1824
+ //! @tparam EqualityOpT
1825
+ //! **[inferred]** Functor type having member
1826
+ //! `bool operator()(const T &a, const T &b)` for binary operations that define the equality of keys
1827
+ //!
1828
+ //! @tparam NumItemsT
1829
+ //! **[inferred]** An integral type representing the number of input elements
1830
+ //!
1831
+ //! @param[in] d_temp_storage
1832
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
1833
+ //! required allocation size is written to `temp_storage_bytes` and no work is done.
1834
+ //!
1835
+ //! @param[in,out] temp_storage_bytes
1836
+ //! Reference to size in bytes of `d_temp_storage` allocation
1837
+ //!
1838
+ //! @param[in] d_keys_in
1839
+ //! Random-access input iterator to the input sequence of key items
1840
+ //!
1841
+ //! @param[in] d_values_in
1842
+ //! Random-access input iterator to the input sequence of value items
1843
+ //!
1844
+ //! @param[out] d_values_out
1845
+ //! Random-access output iterator to the output sequence of value items
1846
+ //!
1847
+ //! @param[in] scan_op
1848
+ //! Binary associative scan functor
1849
+ //!
1850
+ //! @param[in] init_value
1851
+ //! Initial value to seed the exclusive scan (and is assigned to the
1852
+ //! beginning of each segment in `d_values_out`)
1853
+ //!
1854
+ //! @param[in] num_items
1855
+ //! Total number of input items (i.e., the length of `d_keys_in` and
1856
+ //! `d_values_in`)
1857
+ //!
1858
+ //! @param[in] equality_op
1859
+ //! Binary functor that defines the equality of keys.
1860
+ //! Default is cuda::std::equal_to<>{}.
1861
+ //!
1862
+ //! @param[in] stream
1863
+ //! @rst
1864
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
1865
+ //! @endrst
1866
+ template <typename KeysInputIteratorT,
1867
+ typename ValuesInputIteratorT,
1868
+ typename ValuesOutputIteratorT,
1869
+ typename ScanOpT,
1870
+ typename InitValueT,
1871
+ typename EqualityOpT = ::cuda::std::equal_to<>,
1872
+ typename NumItemsT = uint32_t>
1873
+ CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScanByKey(
1874
+ void* d_temp_storage,
1875
+ size_t& temp_storage_bytes,
1876
+ KeysInputIteratorT d_keys_in,
1877
+ ValuesInputIteratorT d_values_in,
1878
+ ValuesOutputIteratorT d_values_out,
1879
+ ScanOpT scan_op,
1880
+ InitValueT init_value,
1881
+ NumItemsT num_items,
1882
+ EqualityOpT equality_op = EqualityOpT(),
1883
+ cudaStream_t stream = 0)
1884
+ {
1885
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::ExclusiveScanByKey"); // NOTE(review): the range appears to be gated on d_temp_storage (null on the size-query call) - confirm in the macro definition
1886
+
1887
+ // Unsigned integer type for global offsets, selected from NumItemsT by detail::choose_offset_t
1888
+ using OffsetT = detail::choose_offset_t<NumItemsT>;
1889
+
1890
+ return DispatchScanByKey<
1891
+ KeysInputIteratorT,
1892
+ ValuesInputIteratorT,
1893
+ ValuesOutputIteratorT,
1894
+ EqualityOpT, // the dispatcher lists EqualityOpT before ScanOpT, unlike this function's template parameter order
1895
+ ScanOpT,
1896
+ InitValueT,
1897
+ OffsetT>::Dispatch(d_temp_storage,
1898
+ temp_storage_bytes,
1899
+ d_keys_in,
1900
+ d_values_in,
1901
+ d_values_out,
1902
+ equality_op, // passed ahead of scan_op, mirroring the dispatcher's template argument order above
1903
+ scan_op,
1904
+ init_value,
1905
+ num_items, // NumItemsT value; NOTE(review): presumably converted to OffsetT by Dispatch's parameter type - confirm
1906
+ stream);
1907
+ }
1908
+
1909
+ //! @rst
1910
+ //! Computes a device-wide inclusive prefix sum-by-key with key equality defined by ``equality_op``.
1911
+ //!
1912
+ //! - Supports non-commutative sum operators.
1913
+ //! - Results are not deterministic for pseudo-associative operators (e.g.,
1914
+ //! addition of floating-point types). Results for pseudo-associative
1915
+ //! operators may vary from run to run. Additional details can be found in
1916
+ //! the @lookback description.
1917
+ //! - ``d_keys_in`` may equal ``d_values_out`` but the range
1918
+ //! ``[d_keys_in, d_keys_in + num_items)`` and the range
1919
+ //! ``[d_values_out, d_values_out + num_items)`` shall not overlap otherwise.
1920
+ //! - ``d_values_in`` may equal ``d_values_out`` but the range
1921
+ //! ``[d_values_in, d_values_in + num_items)`` and the range
1922
+ //! ``[d_values_out, d_values_out + num_items)`` shall not overlap otherwise.
1923
+ //! - @devicestorage
1924
+ //!
1925
+ //! Snippet
1926
+ //! +++++++++++++++++++++++++++++++++++++++++++++
1927
+ //!
1928
+ //! The code snippet below illustrates the inclusive prefix sum-by-key of an ``int`` device vector.
1929
+ //!
1930
+ //! .. code-block:: c++
1931
+ //!
1932
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
1933
+ //!
1934
+ //! // Declare, allocate, and initialize device-accessible pointers for
1935
+ //! // input and output
1936
+ //! int num_items; // e.g., 7
1937
+ //! int *d_keys_in; // e.g., [0, 0, 1, 1, 1, 2, 2]
1938
+ //! int *d_values_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
1939
+ //! int *d_values_out; // e.g., [ , , , , , , ]
1940
+ //! ...
1941
+ //!
1942
+ //! // Determine temporary device storage requirements for inclusive prefix sum
1943
+ //! void *d_temp_storage = nullptr;
1944
+ //! size_t temp_storage_bytes = 0;
1945
+ //! cub::DeviceScan::InclusiveSumByKey(
1946
+ //! d_temp_storage, temp_storage_bytes,
1947
+ //! d_keys_in, d_values_in, d_values_out, num_items);
1948
+ //!
1949
+ //! // Allocate temporary storage for inclusive prefix sum
1950
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
1951
+ //!
1952
+ //! // Run inclusive prefix sum
1953
+ //! cub::DeviceScan::InclusiveSumByKey(
1954
+ //! d_temp_storage, temp_storage_bytes,
1955
+ //! d_keys_in, d_values_in, d_values_out, num_items);
1956
+ //!
1957
+ //! // d_values_out <-- [8, 14, 7, 12, 15, 0, 9]
1958
+ //!
1959
+ //! @endrst
1960
+ //!
1961
+ //! @tparam KeysInputIteratorT
1962
+ //! **[inferred]** Random-access input iterator type for reading scan keys inputs @iterator
1963
+ //!
1964
+ //! @tparam ValuesInputIteratorT
1965
+ //! **[inferred]** Random-access input iterator type for reading scan values inputs @iterator
1966
+ //!
1967
+ //! @tparam ValuesOutputIteratorT
1968
+ //! **[inferred]** Random-access output iterator type for writing scan values outputs @iterator
1969
+ //!
1970
+ //! @tparam EqualityOpT
1971
+ //! **[inferred]** Functor type having member
1972
+ //! `bool operator()(const T &a, const T &b)`, a binary predicate that defines the equality of keys
1973
+ //!
1974
+ //! @tparam NumItemsT
1975
+ //! **[inferred]** An integral type representing the number of input elements
1976
+ //!
1977
+ //! @param[in] d_temp_storage
1978
+ //! Device-accessible allocation of temporary storage.
1979
+ //! When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
1980
+ //!
1981
+ //! @param[in,out] temp_storage_bytes
1982
+ //! Reference to size in bytes of `d_temp_storage` allocation
1983
+ //!
1984
+ //! @param[in] d_keys_in
1985
+ //! Random-access input iterator to the input sequence of key items
1986
+ //!
1987
+ //! @param[in] d_values_in
1988
+ //! Random-access input iterator to the input sequence of value items
1989
+ //!
1990
+ //! @param[out] d_values_out
1991
+ //! Random-access output iterator to the output sequence of value items
1992
+ //!
1993
+ //! @param[in] num_items
1994
+ //! Total number of input items (i.e., the length of `d_keys_in` and `d_values_in`)
1995
+ //!
1996
+ //! @param[in] equality_op
1997
+ //! Binary functor that defines the equality of keys.
1998
+ //! Default is cuda::std::equal_to<>{}.
1999
+ //!
2000
+ //! @param[in] stream
2001
+ //! @rst
2002
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
2003
+ //! @endrst
2004
+ template <typename KeysInputIteratorT,
2005
+ typename ValuesInputIteratorT,
2006
+ typename ValuesOutputIteratorT,
2007
+ typename EqualityOpT = ::cuda::std::equal_to<>,
2008
+ typename NumItemsT = uint32_t>
2009
+ CUB_RUNTIME_FUNCTION static cudaError_t InclusiveSumByKey(
2010
+ void* d_temp_storage,
2011
+ size_t& temp_storage_bytes,
2012
+ KeysInputIteratorT d_keys_in,
2013
+ ValuesInputIteratorT d_values_in,
2014
+ ValuesOutputIteratorT d_values_out,
2015
+ NumItemsT num_items,
2016
+ EqualityOpT equality_op = EqualityOpT(),
2017
+ cudaStream_t stream = 0)
2018
+ {
2019
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::InclusiveSumByKey"); // NOTE(review): the range appears to be gated on d_temp_storage (null on the size-query call) - confirm in the macro definition
2020
+
2021
+ // Unsigned integer type for global offsets, selected from NumItemsT by detail::choose_offset_t
2022
+ using OffsetT = detail::choose_offset_t<NumItemsT>;
2023
+
2024
+ return DispatchScanByKey<
2025
+ KeysInputIteratorT,
2026
+ ValuesInputIteratorT,
2027
+ ValuesOutputIteratorT,
2028
+ EqualityOpT,
2029
+ ::cuda::std::plus<>, // summation is hard-wired as the scan operator; this overload takes no scan_op
2030
+ NullType, // fills the slot where ExclusiveScanByKey passes InitValueT; this overload takes no init_value
2031
+ OffsetT>::Dispatch(d_temp_storage,
2032
+ temp_storage_bytes,
2033
+ d_keys_in,
2034
+ d_values_in,
2035
+ d_values_out,
2036
+ equality_op,
2037
+ ::cuda::std::plus<>{}, // default-constructed instance of the operator type named above
2038
+ NullType{}, // placeholder argument for the NullType slot
2039
+ num_items, // NumItemsT value; NOTE(review): presumably converted to OffsetT by Dispatch's parameter type - confirm
2040
+ stream);
2041
+ }
2042
+
2043
+ //! @rst
2044
+ //! Computes a device-wide inclusive prefix scan-by-key using the
2045
+ //! specified binary associative ``scan_op`` functor. The key equality is defined by ``equality_op``.
2046
+ //!
2047
+ //! - Supports non-commutative scan operators.
2048
+ //! - Results are not deterministic for pseudo-associative operators (e.g.,
2049
+ //! addition of floating-point types). Results for pseudo-associative
2050
+ //! operators may vary from run to run. Additional details can be found in
2051
+ //! the @lookback description.
2052
+ //! - ``d_keys_in`` may equal ``d_values_out`` but the range
2053
+ //! ``[d_keys_in, d_keys_in + num_items)`` and the range
2054
+ //! ``[d_values_out, d_values_out + num_items)`` shall not overlap otherwise.
2055
+ //! - ``d_values_in`` may equal ``d_values_out`` but the range
2056
+ //! ``[d_values_in, d_values_in + num_items)`` and the range
2057
+ //! ``[d_values_out, d_values_out + num_items)`` shall not overlap otherwise.
2058
+ //! - @devicestorage
2059
+ //!
2060
+ //! Snippet
2061
+ //! +++++++++++++++++++++++++++++++++++++++++++++
2062
+ //!
2063
+ //! The code snippet below illustrates the inclusive prefix min-scan-by-key of an ``int`` device vector.
2064
+ //!
2065
+ //! .. code-block:: c++
2066
+ //!
2067
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
2068
+ //! #include <cuda/std/climits> // for INT_MAX
2069
+ //!
2070
+ //! // CustomMin functor
2071
+ //! struct CustomMin
2072
+ //! {
2073
+ //! template <typename T>
2074
+ //! __host__ __device__ __forceinline__
2075
+ //! T operator()(const T &a, const T &b) const {
2076
+ //! return (b < a) ? b : a;
2077
+ //! }
2078
+ //! };
2079
+ //!
2080
+ //! // CustomEqual functor
2081
+ //! struct CustomEqual
2082
+ //! {
2083
+ //! template <typename T>
2084
+ //! __host__ __device__ __forceinline__
2085
+ //! bool operator()(const T &a, const T &b) const {
2086
+ //! return a == b;
2087
+ //! }
2088
+ //! };
2089
+ //!
2090
+ //! // Declare, allocate, and initialize device-accessible pointers for
2091
+ //! // input and output
2092
+ //! int num_items; // e.g., 7
2093
+ //! int *d_keys_in; // e.g., [0, 0, 1, 1, 1, 2, 2]
2094
+ //! int *d_values_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
2095
+ //! int *d_values_out; // e.g., [ , , , , , , ]
2096
+ //! CustomMin min_op;
2097
+ //! CustomEqual equality_op;
2098
+ //! ...
2099
+ //!
2100
+ //! // Determine temporary device storage requirements for inclusive prefix scan
2101
+ //! void *d_temp_storage = nullptr;
2102
+ //! size_t temp_storage_bytes = 0;
2103
+ //! cub::DeviceScan::InclusiveScanByKey(
2104
+ //! d_temp_storage, temp_storage_bytes,
2105
+ //! d_keys_in, d_values_in, d_values_out, min_op, num_items, equality_op);
2106
+ //!
2107
+ //! // Allocate temporary storage for inclusive prefix scan
2108
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
2109
+ //!
2110
+ //! // Run inclusive prefix min-scan
2111
+ //! cub::DeviceScan::InclusiveScanByKey(
2112
+ //! d_temp_storage, temp_storage_bytes,
2113
+ //! d_keys_in, d_values_in, d_values_out, min_op, num_items, equality_op);
2114
+ //!
2115
+ //! // d_values_out <-- [8, 6, 7, 5, 3, 0, 0]
2116
+ //!
2117
+ //! @endrst
2118
+ //!
2119
+ //! @tparam KeysInputIteratorT
2120
+ //! **[inferred]** Random-access input iterator type for reading scan keys inputs @iterator
2121
+ //!
2122
+ //! @tparam ValuesInputIteratorT
2123
+ //! **[inferred]** Random-access input iterator type for reading scan values inputs @iterator
2124
+ //!
2125
+ //! @tparam ValuesOutputIteratorT
2126
+ //! **[inferred]** Random-access output iterator type for writing scan values outputs @iterator
2127
+ //!
2128
+ //! @tparam ScanOpT
2129
+ //! **[inferred]** Binary associative scan functor type having member `T operator()(const T &a, const T &b)`
2130
+ //!
2131
+ //! @tparam EqualityOpT
2132
+ //! **[inferred]** Functor type having member
2133
+ //! `bool operator()(const T &a, const T &b)`, a binary predicate that defines the equality of keys
2134
+ //!
2135
+ //! @tparam NumItemsT
2136
+ //! **[inferred]** An integral type representing the number of input elements
2137
+ //!
2138
+ //! @param[in] d_temp_storage
2139
+ //! Device-accessible allocation of temporary storage.
2140
+ //! When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
2141
+ //!
2142
+ //! @param[in,out] temp_storage_bytes
2143
+ //! Reference to size in bytes of `d_temp_storage` allocation
2144
+ //!
2145
+ //! @param[in] d_keys_in
2146
+ //! Random-access input iterator to the input sequence of key items
2147
+ //!
2148
+ //! @param[in] d_values_in
2149
+ //! Random-access input iterator to the input sequence of value items
2150
+ //!
2151
+ //! @param[out] d_values_out
2152
+ //! Random-access output iterator to the output sequence of value items
2153
+ //!
2154
+ //! @param[in] scan_op
2155
+ //! Binary associative scan functor
2156
+ //!
2157
+ //! @param[in] num_items
2158
+ //! Total number of input items (i.e., the length of `d_keys_in` and `d_values_in`)
2159
+ //!
2160
+ //! @param[in] equality_op
2161
+ //! Binary functor that defines the equality of keys.
2162
+ //! Default is cuda::std::equal_to<>{}.
2163
+ //!
2164
+ //! @param[in] stream
2165
+ //! @rst
2166
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
2167
+ //! @endrst
2168
+ template <typename KeysInputIteratorT,
2169
+ typename ValuesInputIteratorT,
2170
+ typename ValuesOutputIteratorT,
2171
+ typename ScanOpT,
2172
+ typename EqualityOpT = ::cuda::std::equal_to<>,
2173
+ typename NumItemsT = uint32_t>
2174
+ CUB_RUNTIME_FUNCTION static cudaError_t InclusiveScanByKey(
2175
+ void* d_temp_storage,
2176
+ size_t& temp_storage_bytes,
2177
+ KeysInputIteratorT d_keys_in,
2178
+ ValuesInputIteratorT d_values_in,
2179
+ ValuesOutputIteratorT d_values_out,
2180
+ ScanOpT scan_op,
2181
+ NumItemsT num_items,
2182
+ EqualityOpT equality_op = EqualityOpT(),
2183
+ cudaStream_t stream = 0)
2184
+ {
2185
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::InclusiveScanByKey"); // NOTE(review): the range appears to be gated on d_temp_storage (null on the size-query call) - confirm in the macro definition
2186
+
2187
+ // Unsigned integer type for global offsets, selected from NumItemsT by detail::choose_offset_t
2188
+ using OffsetT = detail::choose_offset_t<NumItemsT>;
2189
+
2190
+ return DispatchScanByKey<
2191
+ KeysInputIteratorT,
2192
+ ValuesInputIteratorT,
2193
+ ValuesOutputIteratorT,
2194
+ EqualityOpT, // the dispatcher lists EqualityOpT before ScanOpT, unlike this function's template parameter order
2195
+ ScanOpT,
2196
+ NullType, // fills the slot where ExclusiveScanByKey passes InitValueT; this overload takes no init_value
2197
+ OffsetT>::Dispatch(d_temp_storage,
2198
+ temp_storage_bytes,
2199
+ d_keys_in,
2200
+ d_values_in,
2201
+ d_values_out,
2202
+ equality_op, // passed ahead of scan_op, mirroring the dispatcher's template argument order above
2203
+ scan_op,
2204
+ NullType(), // placeholder argument for the NullType slot
2205
+ num_items, // NumItemsT value; NOTE(review): presumably converted to OffsetT by Dispatch's parameter type - confirm
2206
+ stream);
2207
+ }
2208
+
2209
+ //! @} end member group
2210
+ };
2211
+
2212
+ CUB_NAMESPACE_END