cuda-cccl 0.3.4__cp311-cp311-manylinux_2_24_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cuda-cccl might be problematic. Click here for more details.

Files changed (1926)
  1. cuda/cccl/__init__.py +27 -0
  2. cuda/cccl/_cuda_version_utils.py +24 -0
  3. cuda/cccl/cooperative/__init__.py +9 -0
  4. cuda/cccl/cooperative/experimental/__init__.py +24 -0
  5. cuda/cccl/headers/__init__.py +7 -0
  6. cuda/cccl/headers/include/__init__.py +1 -0
  7. cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +233 -0
  8. cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +1158 -0
  9. cuda/cccl/headers/include/cub/agent/agent_for.cuh +55 -0
  10. cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +677 -0
  11. cuda/cccl/headers/include/cub/agent/agent_merge.cuh +234 -0
  12. cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +722 -0
  13. cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +761 -0
  14. cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +282 -0
  15. cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +702 -0
  16. cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +552 -0
  17. cuda/cccl/headers/include/cub/agent/agent_reduce.cuh +592 -0
  18. cuda/cccl/headers/include/cub/agent/agent_reduce_by_key.cuh +780 -0
  19. cuda/cccl/headers/include/cub/agent/agent_rle.cuh +1095 -0
  20. cuda/cccl/headers/include/cub/agent/agent_scan.cuh +562 -0
  21. cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +448 -0
  22. cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +263 -0
  23. cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +1088 -0
  24. cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +320 -0
  25. cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +584 -0
  26. cuda/cccl/headers/include/cub/agent/agent_topk.cuh +762 -0
  27. cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +605 -0
  28. cuda/cccl/headers/include/cub/agent/single_pass_scan_operators.cuh +1399 -0
  29. cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +939 -0
  30. cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +1203 -0
  31. cuda/cccl/headers/include/cub/block/block_exchange.cuh +1279 -0
  32. cuda/cccl/headers/include/cub/block/block_histogram.cuh +400 -0
  33. cuda/cccl/headers/include/cub/block/block_load.cuh +1242 -0
  34. cuda/cccl/headers/include/cub/block/block_load_to_shared.cuh +416 -0
  35. cuda/cccl/headers/include/cub/block/block_merge_sort.cuh +771 -0
  36. cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +1203 -0
  37. cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +2132 -0
  38. cuda/cccl/headers/include/cub/block/block_raking_layout.cuh +126 -0
  39. cuda/cccl/headers/include/cub/block/block_reduce.cuh +642 -0
  40. cuda/cccl/headers/include/cub/block/block_run_length_decode.cuh +406 -0
  41. cuda/cccl/headers/include/cub/block/block_scan.cuh +2287 -0
  42. cuda/cccl/headers/include/cub/block/block_shuffle.cuh +322 -0
  43. cuda/cccl/headers/include/cub/block/block_store.cuh +1223 -0
  44. cuda/cccl/headers/include/cub/block/radix_rank_sort_operations.cuh +597 -0
  45. cuda/cccl/headers/include/cub/block/specializations/block_histogram_atomic.cuh +62 -0
  46. cuda/cccl/headers/include/cub/block/specializations/block_histogram_sort.cuh +216 -0
  47. cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking.cuh +230 -0
  48. cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking_commutative_only.cuh +214 -0
  49. cuda/cccl/headers/include/cub/block/specializations/block_reduce_warp_reductions.cuh +257 -0
  50. cuda/cccl/headers/include/cub/block/specializations/block_scan_raking.cuh +766 -0
  51. cuda/cccl/headers/include/cub/block/specializations/block_scan_warp_scans.cuh +514 -0
  52. cuda/cccl/headers/include/cub/config.cuh +29 -0
  53. cuda/cccl/headers/include/cub/cub.cuh +96 -0
  54. cuda/cccl/headers/include/cub/detail/array_utils.cuh +54 -0
  55. cuda/cccl/headers/include/cub/detail/choose_offset.cuh +135 -0
  56. cuda/cccl/headers/include/cub/detail/detect_cuda_runtime.cuh +50 -0
  57. cuda/cccl/headers/include/cub/detail/device_double_buffer.cuh +94 -0
  58. cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +60 -0
  59. cuda/cccl/headers/include/cub/detail/fast_modulo_division.cuh +227 -0
  60. cuda/cccl/headers/include/cub/detail/integer_utils.cuh +86 -0
  61. cuda/cccl/headers/include/cub/detail/launcher/cuda_driver.cuh +140 -0
  62. cuda/cccl/headers/include/cub/detail/launcher/cuda_runtime.cuh +98 -0
  63. cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +112 -0
  64. cuda/cccl/headers/include/cub/detail/ptx-json/README.md +66 -0
  65. cuda/cccl/headers/include/cub/detail/ptx-json/array.h +41 -0
  66. cuda/cccl/headers/include/cub/detail/ptx-json/json.h +39 -0
  67. cuda/cccl/headers/include/cub/detail/ptx-json/object.h +71 -0
  68. cuda/cccl/headers/include/cub/detail/ptx-json/string.h +79 -0
  69. cuda/cccl/headers/include/cub/detail/ptx-json/value.h +95 -0
  70. cuda/cccl/headers/include/cub/detail/ptx-json-parser.h +39 -0
  71. cuda/cccl/headers/include/cub/detail/rfa.cuh +706 -0
  72. cuda/cccl/headers/include/cub/detail/strong_load.cuh +163 -0
  73. cuda/cccl/headers/include/cub/detail/strong_store.cuh +194 -0
  74. cuda/cccl/headers/include/cub/detail/temporary_storage.cuh +377 -0
  75. cuda/cccl/headers/include/cub/detail/type_traits.cuh +185 -0
  76. cuda/cccl/headers/include/cub/detail/uninitialized_copy.cuh +48 -0
  77. cuda/cccl/headers/include/cub/detail/unsafe_bitcast.cuh +33 -0
  78. cuda/cccl/headers/include/cub/device/device_adjacent_difference.cuh +572 -0
  79. cuda/cccl/headers/include/cub/device/device_copy.cuh +276 -0
  80. cuda/cccl/headers/include/cub/device/device_for.cuh +1061 -0
  81. cuda/cccl/headers/include/cub/device/device_histogram.cuh +1485 -0
  82. cuda/cccl/headers/include/cub/device/device_memcpy.cuh +171 -0
  83. cuda/cccl/headers/include/cub/device/device_merge.cuh +203 -0
  84. cuda/cccl/headers/include/cub/device/device_merge_sort.cuh +955 -0
  85. cuda/cccl/headers/include/cub/device/device_partition.cuh +644 -0
  86. cuda/cccl/headers/include/cub/device/device_radix_sort.cuh +3413 -0
  87. cuda/cccl/headers/include/cub/device/device_reduce.cuh +2497 -0
  88. cuda/cccl/headers/include/cub/device/device_run_length_encode.cuh +346 -0
  89. cuda/cccl/headers/include/cub/device/device_scan.cuh +2187 -0
  90. cuda/cccl/headers/include/cub/device/device_segmented_radix_sort.cuh +1472 -0
  91. cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +1406 -0
  92. cuda/cccl/headers/include/cub/device/device_segmented_sort.cuh +2787 -0
  93. cuda/cccl/headers/include/cub/device/device_select.cuh +1204 -0
  94. cuda/cccl/headers/include/cub/device/device_topk.cuh +511 -0
  95. cuda/cccl/headers/include/cub/device/device_transform.cuh +668 -0
  96. cuda/cccl/headers/include/cub/device/dispatch/dispatch_adjacent_difference.cuh +289 -0
  97. cuda/cccl/headers/include/cub/device/dispatch/dispatch_batch_memcpy.cuh +694 -0
  98. cuda/cccl/headers/include/cub/device/dispatch/dispatch_common.cuh +43 -0
  99. cuda/cccl/headers/include/cub/device/dispatch/dispatch_copy_mdspan.cuh +77 -0
  100. cuda/cccl/headers/include/cub/device/dispatch/dispatch_for.cuh +172 -0
  101. cuda/cccl/headers/include/cub/device/dispatch/dispatch_histogram.cuh +1026 -0
  102. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +303 -0
  103. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge_sort.cuh +449 -0
  104. cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +1719 -0
  105. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +1283 -0
  106. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_by_key.cuh +629 -0
  107. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +504 -0
  108. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +312 -0
  109. cuda/cccl/headers/include/cub/device/dispatch/dispatch_rle.cuh +603 -0
  110. cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan.cuh +491 -0
  111. cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan_by_key.cuh +577 -0
  112. cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +951 -0
  113. cuda/cccl/headers/include/cub/device/dispatch/dispatch_select_if.cuh +818 -0
  114. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +339 -0
  115. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +455 -0
  116. cuda/cccl/headers/include/cub/device/dispatch/dispatch_three_way_partition.cuh +364 -0
  117. cuda/cccl/headers/include/cub/device/dispatch/dispatch_topk.cuh +626 -0
  118. cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +541 -0
  119. cuda/cccl/headers/include/cub/device/dispatch/dispatch_unique_by_key.cuh +521 -0
  120. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_for_each.cuh +259 -0
  121. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_histogram.cuh +497 -0
  122. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_merge_sort.cuh +332 -0
  123. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_radix_sort.cuh +801 -0
  124. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_reduce.cuh +557 -0
  125. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_scan.cuh +163 -0
  126. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_segmented_reduce.cuh +295 -0
  127. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_segmented_sort.cuh +521 -0
  128. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_three_way_partition.cuh +200 -0
  129. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_transform.cuh +1028 -0
  130. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_unique_by_key.cuh +175 -0
  131. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +43 -0
  132. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +94 -0
  133. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +34 -0
  134. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +255 -0
  135. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +52 -0
  136. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +100 -0
  137. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +1063 -0
  138. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce.cuh +468 -0
  139. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +918 -0
  140. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +647 -0
  141. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +594 -0
  142. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +986 -0
  143. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +373 -0
  144. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_select_if.cuh +1563 -0
  145. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +415 -0
  146. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_topk.cuh +84 -0
  147. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +456 -0
  148. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +858 -0
  149. cuda/cccl/headers/include/cub/grid/grid_even_share.cuh +203 -0
  150. cuda/cccl/headers/include/cub/grid/grid_mapping.cuh +82 -0
  151. cuda/cccl/headers/include/cub/grid/grid_queue.cuh +178 -0
  152. cuda/cccl/headers/include/cub/iterator/arg_index_input_iterator.cuh +230 -0
  153. cuda/cccl/headers/include/cub/iterator/cache_modified_input_iterator.cuh +235 -0
  154. cuda/cccl/headers/include/cub/iterator/cache_modified_output_iterator.cuh +226 -0
  155. cuda/cccl/headers/include/cub/iterator/tex_obj_input_iterator.cuh +296 -0
  156. cuda/cccl/headers/include/cub/thread/thread_load.cuh +324 -0
  157. cuda/cccl/headers/include/cub/thread/thread_operators.cuh +664 -0
  158. cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +525 -0
  159. cuda/cccl/headers/include/cub/thread/thread_scan.cuh +472 -0
  160. cuda/cccl/headers/include/cub/thread/thread_search.cuh +175 -0
  161. cuda/cccl/headers/include/cub/thread/thread_simd.cuh +456 -0
  162. cuda/cccl/headers/include/cub/thread/thread_sort.cuh +78 -0
  163. cuda/cccl/headers/include/cub/thread/thread_store.cuh +341 -0
  164. cuda/cccl/headers/include/cub/util_allocator.cuh +897 -0
  165. cuda/cccl/headers/include/cub/util_arch.cuh +141 -0
  166. cuda/cccl/headers/include/cub/util_cpp_dialect.cuh +71 -0
  167. cuda/cccl/headers/include/cub/util_debug.cuh +183 -0
  168. cuda/cccl/headers/include/cub/util_device.cuh +759 -0
  169. cuda/cccl/headers/include/cub/util_macro.cuh +73 -0
  170. cuda/cccl/headers/include/cub/util_math.cuh +92 -0
  171. cuda/cccl/headers/include/cub/util_namespace.cuh +151 -0
  172. cuda/cccl/headers/include/cub/util_policy_wrapper_t.cuh +31 -0
  173. cuda/cccl/headers/include/cub/util_ptx.cuh +489 -0
  174. cuda/cccl/headers/include/cub/util_temporary_storage.cuh +96 -0
  175. cuda/cccl/headers/include/cub/util_type.cuh +1093 -0
  176. cuda/cccl/headers/include/cub/util_vsmem.cuh +227 -0
  177. cuda/cccl/headers/include/cub/version.cuh +65 -0
  178. cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_shfl.cuh +304 -0
  179. cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_smem.cuh +152 -0
  180. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +713 -0
  181. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +378 -0
  182. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +928 -0
  183. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_smem.cuh +691 -0
  184. cuda/cccl/headers/include/cub/warp/warp_exchange.cuh +381 -0
  185. cuda/cccl/headers/include/cub/warp/warp_load.cuh +591 -0
  186. cuda/cccl/headers/include/cub/warp/warp_merge_sort.cuh +145 -0
  187. cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +810 -0
  188. cuda/cccl/headers/include/cub/warp/warp_scan.cuh +1866 -0
  189. cuda/cccl/headers/include/cub/warp/warp_store.cuh +498 -0
  190. cuda/cccl/headers/include/cub/warp/warp_utils.cuh +59 -0
  191. cuda/cccl/headers/include/cuda/__algorithm/common.h +68 -0
  192. cuda/cccl/headers/include/cuda/__algorithm/copy.h +196 -0
  193. cuda/cccl/headers/include/cuda/__algorithm/fill.h +107 -0
  194. cuda/cccl/headers/include/cuda/__annotated_ptr/access_property.h +165 -0
  195. cuda/cccl/headers/include/cuda/__annotated_ptr/access_property_encoding.h +172 -0
  196. cuda/cccl/headers/include/cuda/__annotated_ptr/annotated_ptr.h +217 -0
  197. cuda/cccl/headers/include/cuda/__annotated_ptr/annotated_ptr_base.h +100 -0
  198. cuda/cccl/headers/include/cuda/__annotated_ptr/apply_access_property.h +83 -0
  199. cuda/cccl/headers/include/cuda/__annotated_ptr/associate_access_property.h +128 -0
  200. cuda/cccl/headers/include/cuda/__annotated_ptr/createpolicy.h +210 -0
  201. cuda/cccl/headers/include/cuda/__atomic/atomic.h +145 -0
  202. cuda/cccl/headers/include/cuda/__barrier/async_contract_fulfillment.h +39 -0
  203. cuda/cccl/headers/include/cuda/__barrier/barrier.h +65 -0
  204. cuda/cccl/headers/include/cuda/__barrier/barrier_arrive_tx.h +101 -0
  205. cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +529 -0
  206. cuda/cccl/headers/include/cuda/__barrier/barrier_expect_tx.h +73 -0
  207. cuda/cccl/headers/include/cuda/__barrier/barrier_thread_scope.h +60 -0
  208. cuda/cccl/headers/include/cuda/__bit/bit_reverse.h +171 -0
  209. cuda/cccl/headers/include/cuda/__bit/bitfield.h +122 -0
  210. cuda/cccl/headers/include/cuda/__bit/bitmask.h +90 -0
  211. cuda/cccl/headers/include/cuda/__cccl_config +37 -0
  212. cuda/cccl/headers/include/cuda/__cmath/ceil_div.h +124 -0
  213. cuda/cccl/headers/include/cuda/__cmath/fast_modulo_division.h +178 -0
  214. cuda/cccl/headers/include/cuda/__cmath/ilog.h +195 -0
  215. cuda/cccl/headers/include/cuda/__cmath/ipow.h +107 -0
  216. cuda/cccl/headers/include/cuda/__cmath/isqrt.h +80 -0
  217. cuda/cccl/headers/include/cuda/__cmath/mul_hi.h +146 -0
  218. cuda/cccl/headers/include/cuda/__cmath/neg.h +47 -0
  219. cuda/cccl/headers/include/cuda/__cmath/pow2.h +74 -0
  220. cuda/cccl/headers/include/cuda/__cmath/round_down.h +102 -0
  221. cuda/cccl/headers/include/cuda/__cmath/round_up.h +104 -0
  222. cuda/cccl/headers/include/cuda/__cmath/uabs.h +57 -0
  223. cuda/cccl/headers/include/cuda/__complex/complex.h +238 -0
  224. cuda/cccl/headers/include/cuda/__complex/get_real_imag.h +89 -0
  225. cuda/cccl/headers/include/cuda/__complex/traits.h +64 -0
  226. cuda/cccl/headers/include/cuda/__complex_ +28 -0
  227. cuda/cccl/headers/include/cuda/__device/all_devices.h +140 -0
  228. cuda/cccl/headers/include/cuda/__device/arch_id.h +176 -0
  229. cuda/cccl/headers/include/cuda/__device/arch_traits.h +537 -0
  230. cuda/cccl/headers/include/cuda/__device/attributes.h +772 -0
  231. cuda/cccl/headers/include/cuda/__device/compute_capability.h +171 -0
  232. cuda/cccl/headers/include/cuda/__device/device_ref.h +156 -0
  233. cuda/cccl/headers/include/cuda/__device/physical_device.h +172 -0
  234. cuda/cccl/headers/include/cuda/__driver/driver_api.h +848 -0
  235. cuda/cccl/headers/include/cuda/__event/event.h +171 -0
  236. cuda/cccl/headers/include/cuda/__event/event_ref.h +157 -0
  237. cuda/cccl/headers/include/cuda/__event/timed_event.h +120 -0
  238. cuda/cccl/headers/include/cuda/__execution/determinism.h +89 -0
  239. cuda/cccl/headers/include/cuda/__execution/output_ordering.h +87 -0
  240. cuda/cccl/headers/include/cuda/__execution/require.h +75 -0
  241. cuda/cccl/headers/include/cuda/__execution/tune.h +70 -0
  242. cuda/cccl/headers/include/cuda/__functional/address_stability.h +131 -0
  243. cuda/cccl/headers/include/cuda/__functional/for_each_canceled.h +321 -0
  244. cuda/cccl/headers/include/cuda/__functional/maximum.h +76 -0
  245. cuda/cccl/headers/include/cuda/__functional/minimum.h +76 -0
  246. cuda/cccl/headers/include/cuda/__functional/minimum_maximum_common.h +52 -0
  247. cuda/cccl/headers/include/cuda/__functional/proclaim_return_type.h +106 -0
  248. cuda/cccl/headers/include/cuda/__fwd/barrier.h +38 -0
  249. cuda/cccl/headers/include/cuda/__fwd/complex.h +48 -0
  250. cuda/cccl/headers/include/cuda/__fwd/devices.h +44 -0
  251. cuda/cccl/headers/include/cuda/__fwd/get_stream.h +38 -0
  252. cuda/cccl/headers/include/cuda/__fwd/pipeline.h +37 -0
  253. cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +58 -0
  254. cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +315 -0
  255. cuda/cccl/headers/include/cuda/__iterator/counting_iterator.h +492 -0
  256. cuda/cccl/headers/include/cuda/__iterator/discard_iterator.h +324 -0
  257. cuda/cccl/headers/include/cuda/__iterator/permutation_iterator.h +456 -0
  258. cuda/cccl/headers/include/cuda/__iterator/shuffle_iterator.h +334 -0
  259. cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +418 -0
  260. cuda/cccl/headers/include/cuda/__iterator/tabulate_output_iterator.h +367 -0
  261. cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +528 -0
  262. cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +527 -0
  263. cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +486 -0
  264. cuda/cccl/headers/include/cuda/__iterator/zip_common.h +148 -0
  265. cuda/cccl/headers/include/cuda/__iterator/zip_function.h +114 -0
  266. cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +557 -0
  267. cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +592 -0
  268. cuda/cccl/headers/include/cuda/__latch/latch.h +44 -0
  269. cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +532 -0
  270. cuda/cccl/headers/include/cuda/__mdspan/host_device_mdspan.h +238 -0
  271. cuda/cccl/headers/include/cuda/__mdspan/restrict_accessor.h +152 -0
  272. cuda/cccl/headers/include/cuda/__mdspan/restrict_mdspan.h +117 -0
  273. cuda/cccl/headers/include/cuda/__memcpy_async/check_preconditions.h +79 -0
  274. cuda/cccl/headers/include/cuda/__memcpy_async/completion_mechanism.h +47 -0
  275. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_bulk_shared_global.h +81 -0
  276. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_fallback.h +72 -0
  277. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_shared_global.h +148 -0
  278. cuda/cccl/headers/include/cuda/__memcpy_async/dispatch_memcpy_async.h +165 -0
  279. cuda/cccl/headers/include/cuda/__memcpy_async/elect_one.h +52 -0
  280. cuda/cccl/headers/include/cuda/__memcpy_async/is_local_smem_barrier.h +53 -0
  281. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async.h +179 -0
  282. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_barrier.h +99 -0
  283. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_tx.h +103 -0
  284. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_completion.h +164 -0
  285. cuda/cccl/headers/include/cuda/__memcpy_async/try_get_barrier_handle.h +58 -0
  286. cuda/cccl/headers/include/cuda/__memory/address_space.h +227 -0
  287. cuda/cccl/headers/include/cuda/__memory/align_down.h +56 -0
  288. cuda/cccl/headers/include/cuda/__memory/align_up.h +56 -0
  289. cuda/cccl/headers/include/cuda/__memory/aligned_size.h +61 -0
  290. cuda/cccl/headers/include/cuda/__memory/check_address.h +111 -0
  291. cuda/cccl/headers/include/cuda/__memory/discard_memory.h +64 -0
  292. cuda/cccl/headers/include/cuda/__memory/get_device_address.h +58 -0
  293. cuda/cccl/headers/include/cuda/__memory/is_aligned.h +47 -0
  294. cuda/cccl/headers/include/cuda/__memory/ptr_in_range.h +93 -0
  295. cuda/cccl/headers/include/cuda/__memory/ptr_rebind.h +75 -0
  296. cuda/cccl/headers/include/cuda/__memory/ranges_overlap.h +126 -0
  297. cuda/cccl/headers/include/cuda/__memory_resource/any_resource.h +898 -0
  298. cuda/cccl/headers/include/cuda/__memory_resource/device_memory_pool.h +149 -0
  299. cuda/cccl/headers/include/cuda/__memory_resource/get_memory_resource.h +82 -0
  300. cuda/cccl/headers/include/cuda/__memory_resource/get_property.h +153 -0
  301. cuda/cccl/headers/include/cuda/__memory_resource/legacy_managed_memory_resource.h +148 -0
  302. cuda/cccl/headers/include/cuda/__memory_resource/legacy_pinned_memory_resource.h +139 -0
  303. cuda/cccl/headers/include/cuda/__memory_resource/managed_memory_pool.h +146 -0
  304. cuda/cccl/headers/include/cuda/__memory_resource/memory_resource_base.h +578 -0
  305. cuda/cccl/headers/include/cuda/__memory_resource/pinned_memory_pool.h +188 -0
  306. cuda/cccl/headers/include/cuda/__memory_resource/properties.h +113 -0
  307. cuda/cccl/headers/include/cuda/__memory_resource/resource.h +159 -0
  308. cuda/cccl/headers/include/cuda/__numeric/add_overflow.h +316 -0
  309. cuda/cccl/headers/include/cuda/__numeric/div_overflow.h +150 -0
  310. cuda/cccl/headers/include/cuda/__numeric/narrow.h +108 -0
  311. cuda/cccl/headers/include/cuda/__numeric/overflow_cast.h +59 -0
  312. cuda/cccl/headers/include/cuda/__numeric/overflow_result.h +43 -0
  313. cuda/cccl/headers/include/cuda/__numeric/sub_overflow.h +344 -0
  314. cuda/cccl/headers/include/cuda/__nvtx/nvtx.h +120 -0
  315. cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +2977 -0
  316. cuda/cccl/headers/include/cuda/__ptx/instructions/barrier_cluster.h +43 -0
  317. cuda/cccl/headers/include/cuda/__ptx/instructions/bfind.h +41 -0
  318. cuda/cccl/headers/include/cuda/__ptx/instructions/bmsk.h +41 -0
  319. cuda/cccl/headers/include/cuda/__ptx/instructions/clusterlaunchcontrol.h +41 -0
  320. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk.h +44 -0
  321. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_commit_group.h +43 -0
  322. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h +45 -0
  323. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_wait_group.h +43 -0
  324. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_mbarrier_arrive.h +42 -0
  325. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_reduce_async_bulk.h +60 -0
  326. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_reduce_async_bulk_tensor.h +43 -0
  327. cuda/cccl/headers/include/cuda/__ptx/instructions/elect_sync.h +41 -0
  328. cuda/cccl/headers/include/cuda/__ptx/instructions/exit.h +41 -0
  329. cuda/cccl/headers/include/cuda/__ptx/instructions/fence.h +49 -0
  330. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/barrier_cluster.h +115 -0
  331. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/bfind.h +190 -0
  332. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/bmsk.h +54 -0
  333. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/clusterlaunchcontrol.h +242 -0
  334. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk.h +197 -0
  335. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.h +25 -0
  336. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.h +54 -0
  337. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.h +997 -0
  338. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_gather_scatter.h +318 -0
  339. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.h +671 -0
  340. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.h +46 -0
  341. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive.h +26 -0
  342. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive_noinc.h +26 -0
  343. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.h +1470 -0
  344. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.h +132 -0
  345. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.h +132 -0
  346. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.h +601 -0
  347. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/elect_sync.h +36 -0
  348. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/exit.h +25 -0
  349. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence.h +208 -0
  350. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.h +31 -0
  351. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_alias.h +25 -0
  352. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_async.h +58 -0
  353. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_async_generic_sync_restrict.h +64 -0
  354. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.h +102 -0
  355. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_sync_restrict.h +64 -0
  356. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/get_sreg.h +949 -0
  357. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/getctarank.h +32 -0
  358. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/ld.h +5542 -0
  359. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive.h +399 -0
  360. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.h +184 -0
  361. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.h +34 -0
  362. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_expect_tx.h +102 -0
  363. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_init.h +27 -0
  364. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.h +143 -0
  365. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.h +144 -0
  366. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.h +286 -0
  367. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.h +290 -0
  368. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_ld_reduce.h +2202 -0
  369. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_red.h +1362 -0
  370. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_st.h +236 -0
  371. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/prmt.h +230 -0
  372. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/red_async.h +460 -0
  373. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/shl.h +96 -0
  374. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/shr.h +168 -0
  375. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st.h +1490 -0
  376. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st_async.h +123 -0
  377. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st_bulk.h +31 -0
  378. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_alloc.h +132 -0
  379. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_commit.h +99 -0
  380. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_cp.h +765 -0
  381. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_fence.h +58 -0
  382. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_ld.h +4927 -0
  383. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_mma.h +4291 -0
  384. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_mma_ws.h +7110 -0
  385. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_shift.h +42 -0
  386. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_st.h +5063 -0
  387. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_wait.h +56 -0
  388. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.h +71 -0
  389. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tensormap_replace.h +1030 -0
  390. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/trap.h +25 -0
  391. cuda/cccl/headers/include/cuda/__ptx/instructions/get_sreg.h +43 -0
  392. cuda/cccl/headers/include/cuda/__ptx/instructions/getctarank.h +43 -0
  393. cuda/cccl/headers/include/cuda/__ptx/instructions/ld.h +41 -0
  394. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_arrive.h +45 -0
  395. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_expect_tx.h +41 -0
  396. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_init.h +43 -0
  397. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_wait.h +46 -0
  398. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_ld_reduce.h +41 -0
  399. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_red.h +41 -0
  400. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_st.h +41 -0
  401. cuda/cccl/headers/include/cuda/__ptx/instructions/prmt.h +41 -0
  402. cuda/cccl/headers/include/cuda/__ptx/instructions/red_async.h +43 -0
  403. cuda/cccl/headers/include/cuda/__ptx/instructions/shfl_sync.h +244 -0
  404. cuda/cccl/headers/include/cuda/__ptx/instructions/shl.h +41 -0
  405. cuda/cccl/headers/include/cuda/__ptx/instructions/shr.h +41 -0
  406. cuda/cccl/headers/include/cuda/__ptx/instructions/st.h +41 -0
  407. cuda/cccl/headers/include/cuda/__ptx/instructions/st_async.h +43 -0
  408. cuda/cccl/headers/include/cuda/__ptx/instructions/st_bulk.h +41 -0
  409. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_alloc.h +41 -0
  410. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_commit.h +41 -0
  411. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_cp.h +41 -0
  412. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_fence.h +41 -0
  413. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_ld.h +41 -0
  414. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_mma.h +41 -0
  415. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_mma_ws.h +41 -0
  416. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_shift.h +41 -0
  417. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_st.h +41 -0
  418. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_wait.h +41 -0
  419. cuda/cccl/headers/include/cuda/__ptx/instructions/tensormap_cp_fenceproxy.h +43 -0
  420. cuda/cccl/headers/include/cuda/__ptx/instructions/tensormap_replace.h +43 -0
  421. cuda/cccl/headers/include/cuda/__ptx/instructions/trap.h +41 -0
  422. cuda/cccl/headers/include/cuda/__ptx/pragmas/enable_smem_spilling.h +47 -0
  423. cuda/cccl/headers/include/cuda/__ptx/ptx_dot_variants.h +230 -0
  424. cuda/cccl/headers/include/cuda/__ptx/ptx_helper_functions.h +176 -0
  425. cuda/cccl/headers/include/cuda/__random/feistel_bijection.h +105 -0
  426. cuda/cccl/headers/include/cuda/__random/random_bijection.h +88 -0
  427. cuda/cccl/headers/include/cuda/__runtime/api_wrapper.h +62 -0
  428. cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +99 -0
  429. cuda/cccl/headers/include/cuda/__runtime/types.h +41 -0
  430. cuda/cccl/headers/include/cuda/__semaphore/counting_semaphore.h +53 -0
  431. cuda/cccl/headers/include/cuda/__stream/get_stream.h +109 -0
  432. cuda/cccl/headers/include/cuda/__stream/internal_streams.h +44 -0
  433. cuda/cccl/headers/include/cuda/__stream/stream.h +141 -0
  434. cuda/cccl/headers/include/cuda/__stream/stream_ref.h +303 -0
  435. cuda/cccl/headers/include/cuda/__type_traits/is_floating_point.h +47 -0
  436. cuda/cccl/headers/include/cuda/__type_traits/is_specialization_of.h +37 -0
  437. cuda/cccl/headers/include/cuda/__utility/__basic_any/access.h +88 -0
  438. cuda/cccl/headers/include/cuda/__utility/__basic_any/any_cast.h +83 -0
  439. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_base.h +148 -0
  440. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_from.h +96 -0
  441. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_fwd.h +128 -0
  442. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_ptr.h +304 -0
  443. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_ref.h +337 -0
  444. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_value.h +591 -0
  445. cuda/cccl/headers/include/cuda/__utility/__basic_any/conversions.h +169 -0
  446. cuda/cccl/headers/include/cuda/__utility/__basic_any/dynamic_any_cast.h +107 -0
  447. cuda/cccl/headers/include/cuda/__utility/__basic_any/interfaces.h +359 -0
  448. cuda/cccl/headers/include/cuda/__utility/__basic_any/iset.h +142 -0
  449. cuda/cccl/headers/include/cuda/__utility/__basic_any/overrides.h +64 -0
  450. cuda/cccl/headers/include/cuda/__utility/__basic_any/rtti.h +257 -0
  451. cuda/cccl/headers/include/cuda/__utility/__basic_any/semiregular.h +323 -0
  452. cuda/cccl/headers/include/cuda/__utility/__basic_any/storage.h +79 -0
  453. cuda/cccl/headers/include/cuda/__utility/__basic_any/tagged_ptr.h +58 -0
  454. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtcall.h +163 -0
  455. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_functions.h +184 -0
  456. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_ptrs.h +80 -0
  457. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_tables.h +155 -0
  458. cuda/cccl/headers/include/cuda/__utility/basic_any.h +507 -0
  459. cuda/cccl/headers/include/cuda/__utility/immovable.h +50 -0
  460. cuda/cccl/headers/include/cuda/__utility/in_range.h +65 -0
  461. cuda/cccl/headers/include/cuda/__utility/inherit.h +36 -0
  462. cuda/cccl/headers/include/cuda/__utility/no_init.h +29 -0
  463. cuda/cccl/headers/include/cuda/__utility/static_for.h +79 -0
  464. cuda/cccl/headers/include/cuda/__warp/lane_mask.h +326 -0
  465. cuda/cccl/headers/include/cuda/__warp/warp_match_all.h +65 -0
  466. cuda/cccl/headers/include/cuda/__warp/warp_shuffle.h +251 -0
  467. cuda/cccl/headers/include/cuda/access_property +26 -0
  468. cuda/cccl/headers/include/cuda/algorithm +27 -0
  469. cuda/cccl/headers/include/cuda/annotated_ptr +29 -0
  470. cuda/cccl/headers/include/cuda/atomic +27 -0
  471. cuda/cccl/headers/include/cuda/barrier +293 -0
  472. cuda/cccl/headers/include/cuda/bit +29 -0
  473. cuda/cccl/headers/include/cuda/cmath +37 -0
  474. cuda/cccl/headers/include/cuda/devices +33 -0
  475. cuda/cccl/headers/include/cuda/discard_memory +32 -0
  476. cuda/cccl/headers/include/cuda/functional +32 -0
  477. cuda/cccl/headers/include/cuda/iterator +39 -0
  478. cuda/cccl/headers/include/cuda/latch +27 -0
  479. cuda/cccl/headers/include/cuda/mdspan +28 -0
  480. cuda/cccl/headers/include/cuda/memory +36 -0
  481. cuda/cccl/headers/include/cuda/memory_resource +40 -0
  482. cuda/cccl/headers/include/cuda/numeric +31 -0
  483. cuda/cccl/headers/include/cuda/pipeline +580 -0
  484. cuda/cccl/headers/include/cuda/ptx +129 -0
  485. cuda/cccl/headers/include/cuda/semaphore +31 -0
  486. cuda/cccl/headers/include/cuda/std/__algorithm/adjacent_find.h +59 -0
  487. cuda/cccl/headers/include/cuda/std/__algorithm/all_of.h +45 -0
  488. cuda/cccl/headers/include/cuda/std/__algorithm/any_of.h +45 -0
  489. cuda/cccl/headers/include/cuda/std/__algorithm/binary_search.h +53 -0
  490. cuda/cccl/headers/include/cuda/std/__algorithm/clamp.h +48 -0
  491. cuda/cccl/headers/include/cuda/std/__algorithm/comp.h +58 -0
  492. cuda/cccl/headers/include/cuda/std/__algorithm/comp_ref_type.h +85 -0
  493. cuda/cccl/headers/include/cuda/std/__algorithm/copy.h +142 -0
  494. cuda/cccl/headers/include/cuda/std/__algorithm/copy_backward.h +80 -0
  495. cuda/cccl/headers/include/cuda/std/__algorithm/copy_if.h +47 -0
  496. cuda/cccl/headers/include/cuda/std/__algorithm/copy_n.h +73 -0
  497. cuda/cccl/headers/include/cuda/std/__algorithm/count.h +49 -0
  498. cuda/cccl/headers/include/cuda/std/__algorithm/count_if.h +49 -0
  499. cuda/cccl/headers/include/cuda/std/__algorithm/equal.h +128 -0
  500. cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +101 -0
  501. cuda/cccl/headers/include/cuda/std/__algorithm/fill.h +58 -0
  502. cuda/cccl/headers/include/cuda/std/__algorithm/fill_n.h +51 -0
  503. cuda/cccl/headers/include/cuda/std/__algorithm/find.h +62 -0
  504. cuda/cccl/headers/include/cuda/std/__algorithm/find_end.h +225 -0
  505. cuda/cccl/headers/include/cuda/std/__algorithm/find_first_of.h +73 -0
  506. cuda/cccl/headers/include/cuda/std/__algorithm/find_if.h +46 -0
  507. cuda/cccl/headers/include/cuda/std/__algorithm/find_if_not.h +46 -0
  508. cuda/cccl/headers/include/cuda/std/__algorithm/for_each.h +42 -0
  509. cuda/cccl/headers/include/cuda/std/__algorithm/for_each_n.h +48 -0
  510. cuda/cccl/headers/include/cuda/std/__algorithm/generate.h +41 -0
  511. cuda/cccl/headers/include/cuda/std/__algorithm/generate_n.h +46 -0
  512. cuda/cccl/headers/include/cuda/std/__algorithm/half_positive.h +49 -0
  513. cuda/cccl/headers/include/cuda/std/__algorithm/in_fun_result.h +55 -0
  514. cuda/cccl/headers/include/cuda/std/__algorithm/includes.h +90 -0
  515. cuda/cccl/headers/include/cuda/std/__algorithm/is_heap.h +50 -0
  516. cuda/cccl/headers/include/cuda/std/__algorithm/is_heap_until.h +83 -0
  517. cuda/cccl/headers/include/cuda/std/__algorithm/is_partitioned.h +57 -0
  518. cuda/cccl/headers/include/cuda/std/__algorithm/is_permutation.h +252 -0
  519. cuda/cccl/headers/include/cuda/std/__algorithm/is_sorted.h +49 -0
  520. cuda/cccl/headers/include/cuda/std/__algorithm/is_sorted_until.h +68 -0
  521. cuda/cccl/headers/include/cuda/std/__algorithm/iter_swap.h +82 -0
  522. cuda/cccl/headers/include/cuda/std/__algorithm/iterator_operations.h +185 -0
  523. cuda/cccl/headers/include/cuda/std/__algorithm/lexicographical_compare.h +68 -0
  524. cuda/cccl/headers/include/cuda/std/__algorithm/lower_bound.h +82 -0
  525. cuda/cccl/headers/include/cuda/std/__algorithm/make_heap.h +70 -0
  526. cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +88 -0
  527. cuda/cccl/headers/include/cuda/std/__algorithm/max.h +62 -0
  528. cuda/cccl/headers/include/cuda/std/__algorithm/max_element.h +67 -0
  529. cuda/cccl/headers/include/cuda/std/__algorithm/merge.h +89 -0
  530. cuda/cccl/headers/include/cuda/std/__algorithm/min.h +62 -0
  531. cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +87 -0
  532. cuda/cccl/headers/include/cuda/std/__algorithm/minmax.h +66 -0
  533. cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +139 -0
  534. cuda/cccl/headers/include/cuda/std/__algorithm/mismatch.h +83 -0
  535. cuda/cccl/headers/include/cuda/std/__algorithm/move.h +86 -0
  536. cuda/cccl/headers/include/cuda/std/__algorithm/move_backward.h +84 -0
  537. cuda/cccl/headers/include/cuda/std/__algorithm/next_permutation.h +88 -0
  538. cuda/cccl/headers/include/cuda/std/__algorithm/none_of.h +45 -0
  539. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort.h +102 -0
  540. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort_copy.h +122 -0
  541. cuda/cccl/headers/include/cuda/std/__algorithm/partition.h +120 -0
  542. cuda/cccl/headers/include/cuda/std/__algorithm/partition_copy.h +59 -0
  543. cuda/cccl/headers/include/cuda/std/__algorithm/partition_point.h +61 -0
  544. cuda/cccl/headers/include/cuda/std/__algorithm/pop_heap.h +93 -0
  545. cuda/cccl/headers/include/cuda/std/__algorithm/prev_permutation.h +88 -0
  546. cuda/cccl/headers/include/cuda/std/__algorithm/push_heap.h +100 -0
  547. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_for_each.h +84 -0
  548. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_for_each_n.h +68 -0
  549. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_iterator_concept.h +65 -0
  550. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_min.h +98 -0
  551. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_min_element.h +68 -0
  552. cuda/cccl/headers/include/cuda/std/__algorithm/remove.h +55 -0
  553. cuda/cccl/headers/include/cuda/std/__algorithm/remove_copy.h +47 -0
  554. cuda/cccl/headers/include/cuda/std/__algorithm/remove_copy_if.h +47 -0
  555. cuda/cccl/headers/include/cuda/std/__algorithm/remove_if.h +56 -0
  556. cuda/cccl/headers/include/cuda/std/__algorithm/replace.h +45 -0
  557. cuda/cccl/headers/include/cuda/std/__algorithm/replace_copy.h +54 -0
  558. cuda/cccl/headers/include/cuda/std/__algorithm/replace_copy_if.h +50 -0
  559. cuda/cccl/headers/include/cuda/std/__algorithm/replace_if.h +45 -0
  560. cuda/cccl/headers/include/cuda/std/__algorithm/reverse.h +81 -0
  561. cuda/cccl/headers/include/cuda/std/__algorithm/reverse_copy.h +43 -0
  562. cuda/cccl/headers/include/cuda/std/__algorithm/rotate.h +261 -0
  563. cuda/cccl/headers/include/cuda/std/__algorithm/rotate_copy.h +40 -0
  564. cuda/cccl/headers/include/cuda/std/__algorithm/search.h +185 -0
  565. cuda/cccl/headers/include/cuda/std/__algorithm/search_n.h +163 -0
  566. cuda/cccl/headers/include/cuda/std/__algorithm/set_difference.h +95 -0
  567. cuda/cccl/headers/include/cuda/std/__algorithm/set_intersection.h +122 -0
  568. cuda/cccl/headers/include/cuda/std/__algorithm/set_symmetric_difference.h +134 -0
  569. cuda/cccl/headers/include/cuda/std/__algorithm/set_union.h +128 -0
  570. cuda/cccl/headers/include/cuda/std/__algorithm/shift_left.h +84 -0
  571. cuda/cccl/headers/include/cuda/std/__algorithm/shift_right.h +144 -0
  572. cuda/cccl/headers/include/cuda/std/__algorithm/sift_down.h +139 -0
  573. cuda/cccl/headers/include/cuda/std/__algorithm/sort_heap.h +70 -0
  574. cuda/cccl/headers/include/cuda/std/__algorithm/swap_ranges.h +78 -0
  575. cuda/cccl/headers/include/cuda/std/__algorithm/transform.h +59 -0
  576. cuda/cccl/headers/include/cuda/std/__algorithm/unique.h +76 -0
  577. cuda/cccl/headers/include/cuda/std/__algorithm/unique_copy.h +153 -0
  578. cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_iter.h +95 -0
  579. cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_range.h +126 -0
  580. cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +83 -0
  581. cuda/cccl/headers/include/cuda/std/__algorithm_ +26 -0
  582. cuda/cccl/headers/include/cuda/std/__atomic/api/common.h +192 -0
  583. cuda/cccl/headers/include/cuda/std/__atomic/api/owned.h +136 -0
  584. cuda/cccl/headers/include/cuda/std/__atomic/api/reference.h +118 -0
  585. cuda/cccl/headers/include/cuda/std/__atomic/functions/common.h +58 -0
  586. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_local.h +208 -0
  587. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_derived.h +458 -0
  588. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated.h +4437 -0
  589. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h +184 -0
  590. cuda/cccl/headers/include/cuda/std/__atomic/functions/host.h +242 -0
  591. cuda/cccl/headers/include/cuda/std/__atomic/functions.h +33 -0
  592. cuda/cccl/headers/include/cuda/std/__atomic/order.h +159 -0
  593. cuda/cccl/headers/include/cuda/std/__atomic/platform/msvc_to_builtins.h +654 -0
  594. cuda/cccl/headers/include/cuda/std/__atomic/platform.h +93 -0
  595. cuda/cccl/headers/include/cuda/std/__atomic/scopes.h +105 -0
  596. cuda/cccl/headers/include/cuda/std/__atomic/types/base.h +249 -0
  597. cuda/cccl/headers/include/cuda/std/__atomic/types/common.h +104 -0
  598. cuda/cccl/headers/include/cuda/std/__atomic/types/locked.h +225 -0
  599. cuda/cccl/headers/include/cuda/std/__atomic/types/reference.h +72 -0
  600. cuda/cccl/headers/include/cuda/std/__atomic/types/small.h +238 -0
  601. cuda/cccl/headers/include/cuda/std/__atomic/types.h +51 -0
  602. cuda/cccl/headers/include/cuda/std/__atomic/wait/notify_wait.h +95 -0
  603. cuda/cccl/headers/include/cuda/std/__atomic/wait/polling.h +65 -0
  604. cuda/cccl/headers/include/cuda/std/__barrier/barrier.h +227 -0
  605. cuda/cccl/headers/include/cuda/std/__barrier/empty_completion.h +37 -0
  606. cuda/cccl/headers/include/cuda/std/__barrier/poll_tester.h +82 -0
  607. cuda/cccl/headers/include/cuda/std/__bit/bit_cast.h +76 -0
  608. cuda/cccl/headers/include/cuda/std/__bit/byteswap.h +209 -0
  609. cuda/cccl/headers/include/cuda/std/__bit/countl.h +174 -0
  610. cuda/cccl/headers/include/cuda/std/__bit/countr.h +185 -0
  611. cuda/cccl/headers/include/cuda/std/__bit/endian.h +39 -0
  612. cuda/cccl/headers/include/cuda/std/__bit/has_single_bit.h +43 -0
  613. cuda/cccl/headers/include/cuda/std/__bit/integral.h +126 -0
  614. cuda/cccl/headers/include/cuda/std/__bit/popcount.h +154 -0
  615. cuda/cccl/headers/include/cuda/std/__bit/reference.h +1272 -0
  616. cuda/cccl/headers/include/cuda/std/__bit/rotate.h +94 -0
  617. cuda/cccl/headers/include/cuda/std/__cccl/algorithm_wrapper.h +36 -0
  618. cuda/cccl/headers/include/cuda/std/__cccl/architecture.h +78 -0
  619. cuda/cccl/headers/include/cuda/std/__cccl/assert.h +161 -0
  620. cuda/cccl/headers/include/cuda/std/__cccl/attributes.h +206 -0
  621. cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +645 -0
  622. cuda/cccl/headers/include/cuda/std/__cccl/compiler.h +217 -0
  623. cuda/cccl/headers/include/cuda/std/__cccl/cuda_capabilities.h +51 -0
  624. cuda/cccl/headers/include/cuda/std/__cccl/cuda_toolkit.h +56 -0
  625. cuda/cccl/headers/include/cuda/std/__cccl/deprecated.h +88 -0
  626. cuda/cccl/headers/include/cuda/std/__cccl/diagnostic.h +131 -0
  627. cuda/cccl/headers/include/cuda/std/__cccl/dialect.h +130 -0
  628. cuda/cccl/headers/include/cuda/std/__cccl/epilogue.h +354 -0
  629. cuda/cccl/headers/include/cuda/std/__cccl/exceptions.h +36 -0
  630. cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +74 -0
  631. cuda/cccl/headers/include/cuda/std/__cccl/extended_data_types.h +160 -0
  632. cuda/cccl/headers/include/cuda/std/__cccl/host_std_lib.h +52 -0
  633. cuda/cccl/headers/include/cuda/std/__cccl/is_non_narrowing_convertible.h +71 -0
  634. cuda/cccl/headers/include/cuda/std/__cccl/memory_wrapper.h +36 -0
  635. cuda/cccl/headers/include/cuda/std/__cccl/numeric_wrapper.h +36 -0
  636. cuda/cccl/headers/include/cuda/std/__cccl/os.h +54 -0
  637. cuda/cccl/headers/include/cuda/std/__cccl/preprocessor.h +1286 -0
  638. cuda/cccl/headers/include/cuda/std/__cccl/prologue.h +289 -0
  639. cuda/cccl/headers/include/cuda/std/__cccl/ptx_isa.h +253 -0
  640. cuda/cccl/headers/include/cuda/std/__cccl/rtti.h +72 -0
  641. cuda/cccl/headers/include/cuda/std/__cccl/sequence_access.h +87 -0
  642. cuda/cccl/headers/include/cuda/std/__cccl/system_header.h +38 -0
  643. cuda/cccl/headers/include/cuda/std/__cccl/unreachable.h +31 -0
  644. cuda/cccl/headers/include/cuda/std/__cccl/version.h +26 -0
  645. cuda/cccl/headers/include/cuda/std/__cccl/visibility.h +171 -0
  646. cuda/cccl/headers/include/cuda/std/__charconv/chars_format.h +81 -0
  647. cuda/cccl/headers/include/cuda/std/__charconv/from_chars.h +154 -0
  648. cuda/cccl/headers/include/cuda/std/__charconv/from_chars_result.h +56 -0
  649. cuda/cccl/headers/include/cuda/std/__charconv/to_chars.h +148 -0
  650. cuda/cccl/headers/include/cuda/std/__charconv/to_chars_result.h +56 -0
  651. cuda/cccl/headers/include/cuda/std/__charconv_ +31 -0
  652. cuda/cccl/headers/include/cuda/std/__chrono/calendar.h +52 -0
  653. cuda/cccl/headers/include/cuda/std/__chrono/day.h +160 -0
  654. cuda/cccl/headers/include/cuda/std/__chrono/duration.h +499 -0
  655. cuda/cccl/headers/include/cuda/std/__chrono/file_clock.h +53 -0
  656. cuda/cccl/headers/include/cuda/std/__chrono/high_resolution_clock.h +44 -0
  657. cuda/cccl/headers/include/cuda/std/__chrono/month.h +185 -0
  658. cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +58 -0
  659. cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +78 -0
  660. cuda/cccl/headers/include/cuda/std/__chrono/time_point.h +255 -0
  661. cuda/cccl/headers/include/cuda/std/__chrono/year.h +184 -0
  662. cuda/cccl/headers/include/cuda/std/__cmath/abs.h +127 -0
  663. cuda/cccl/headers/include/cuda/std/__cmath/copysign.h +88 -0
  664. cuda/cccl/headers/include/cuda/std/__cmath/error_functions.h +204 -0
  665. cuda/cccl/headers/include/cuda/std/__cmath/exponential_functions.h +783 -0
  666. cuda/cccl/headers/include/cuda/std/__cmath/fdim.h +122 -0
  667. cuda/cccl/headers/include/cuda/std/__cmath/fma.h +129 -0
  668. cuda/cccl/headers/include/cuda/std/__cmath/fpclassify.h +230 -0
  669. cuda/cccl/headers/include/cuda/std/__cmath/gamma.h +204 -0
  670. cuda/cccl/headers/include/cuda/std/__cmath/hyperbolic_functions.h +285 -0
  671. cuda/cccl/headers/include/cuda/std/__cmath/hypot.h +220 -0
  672. cuda/cccl/headers/include/cuda/std/__cmath/inverse_hyperbolic_functions.h +285 -0
  673. cuda/cccl/headers/include/cuda/std/__cmath/inverse_trigonometric_functions.h +370 -0
  674. cuda/cccl/headers/include/cuda/std/__cmath/isfinite.h +166 -0
  675. cuda/cccl/headers/include/cuda/std/__cmath/isinf.h +204 -0
  676. cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +185 -0
  677. cuda/cccl/headers/include/cuda/std/__cmath/isnormal.h +138 -0
  678. cuda/cccl/headers/include/cuda/std/__cmath/lerp.h +101 -0
  679. cuda/cccl/headers/include/cuda/std/__cmath/logarithms.h +533 -0
  680. cuda/cccl/headers/include/cuda/std/__cmath/min_max.h +287 -0
  681. cuda/cccl/headers/include/cuda/std/__cmath/modulo.h +208 -0
  682. cuda/cccl/headers/include/cuda/std/__cmath/nan.h +54 -0
  683. cuda/cccl/headers/include/cuda/std/__cmath/remainder.h +210 -0
  684. cuda/cccl/headers/include/cuda/std/__cmath/roots.h +198 -0
  685. cuda/cccl/headers/include/cuda/std/__cmath/rounding_functions.h +983 -0
  686. cuda/cccl/headers/include/cuda/std/__cmath/signbit.h +56 -0
  687. cuda/cccl/headers/include/cuda/std/__cmath/traits.h +242 -0
  688. cuda/cccl/headers/include/cuda/std/__cmath/trigonometric_functions.h +327 -0
  689. cuda/cccl/headers/include/cuda/std/__complex/arg.h +84 -0
  690. cuda/cccl/headers/include/cuda/std/__complex/complex.h +669 -0
  691. cuda/cccl/headers/include/cuda/std/__complex/exponential_functions.h +411 -0
  692. cuda/cccl/headers/include/cuda/std/__complex/hyperbolic_functions.h +117 -0
  693. cuda/cccl/headers/include/cuda/std/__complex/inverse_hyperbolic_functions.h +216 -0
  694. cuda/cccl/headers/include/cuda/std/__complex/inverse_trigonometric_functions.h +131 -0
  695. cuda/cccl/headers/include/cuda/std/__complex/literals.h +86 -0
  696. cuda/cccl/headers/include/cuda/std/__complex/logarithms.h +303 -0
  697. cuda/cccl/headers/include/cuda/std/__complex/math.h +159 -0
  698. cuda/cccl/headers/include/cuda/std/__complex/nvbf16.h +323 -0
  699. cuda/cccl/headers/include/cuda/std/__complex/nvfp16.h +322 -0
  700. cuda/cccl/headers/include/cuda/std/__complex/roots.h +214 -0
  701. cuda/cccl/headers/include/cuda/std/__complex/trigonometric_functions.h +61 -0
  702. cuda/cccl/headers/include/cuda/std/__complex/tuple.h +107 -0
  703. cuda/cccl/headers/include/cuda/std/__complex/vector_support.h +130 -0
  704. cuda/cccl/headers/include/cuda/std/__concepts/arithmetic.h +56 -0
  705. cuda/cccl/headers/include/cuda/std/__concepts/assignable.h +64 -0
  706. cuda/cccl/headers/include/cuda/std/__concepts/boolean_testable.h +63 -0
  707. cuda/cccl/headers/include/cuda/std/__concepts/class_or_enum.h +45 -0
  708. cuda/cccl/headers/include/cuda/std/__concepts/common_reference_with.h +69 -0
  709. cuda/cccl/headers/include/cuda/std/__concepts/common_with.h +82 -0
  710. cuda/cccl/headers/include/cuda/std/__concepts/concept_macros.h +367 -0
  711. cuda/cccl/headers/include/cuda/std/__concepts/constructible.h +174 -0
  712. cuda/cccl/headers/include/cuda/std/__concepts/convertible_to.h +70 -0
  713. cuda/cccl/headers/include/cuda/std/__concepts/copyable.h +60 -0
  714. cuda/cccl/headers/include/cuda/std/__concepts/derived_from.h +56 -0
  715. cuda/cccl/headers/include/cuda/std/__concepts/destructible.h +76 -0
  716. cuda/cccl/headers/include/cuda/std/__concepts/different_from.h +38 -0
  717. cuda/cccl/headers/include/cuda/std/__concepts/equality_comparable.h +98 -0
  718. cuda/cccl/headers/include/cuda/std/__concepts/invocable.h +80 -0
  719. cuda/cccl/headers/include/cuda/std/__concepts/movable.h +58 -0
  720. cuda/cccl/headers/include/cuda/std/__concepts/predicate.h +54 -0
  721. cuda/cccl/headers/include/cuda/std/__concepts/regular.h +54 -0
  722. cuda/cccl/headers/include/cuda/std/__concepts/relation.h +77 -0
  723. cuda/cccl/headers/include/cuda/std/__concepts/same_as.h +39 -0
  724. cuda/cccl/headers/include/cuda/std/__concepts/semiregular.h +54 -0
  725. cuda/cccl/headers/include/cuda/std/__concepts/swappable.h +206 -0
  726. cuda/cccl/headers/include/cuda/std/__concepts/totally_ordered.h +101 -0
  727. cuda/cccl/headers/include/cuda/std/__cstddef/byte.h +113 -0
  728. cuda/cccl/headers/include/cuda/std/__cstddef/types.h +52 -0
  729. cuda/cccl/headers/include/cuda/std/__cstdlib/abs.h +57 -0
  730. cuda/cccl/headers/include/cuda/std/__cstdlib/aligned_alloc.h +66 -0
  731. cuda/cccl/headers/include/cuda/std/__cstdlib/div.h +96 -0
  732. cuda/cccl/headers/include/cuda/std/__cstdlib/malloc.h +70 -0
  733. cuda/cccl/headers/include/cuda/std/__cstring/memcpy.h +61 -0
  734. cuda/cccl/headers/include/cuda/std/__cstring/memset.h +46 -0
  735. cuda/cccl/headers/include/cuda/std/__exception/cuda_error.h +118 -0
  736. cuda/cccl/headers/include/cuda/std/__exception/exception_macros.h +93 -0
  737. cuda/cccl/headers/include/cuda/std/__exception/terminate.h +73 -0
  738. cuda/cccl/headers/include/cuda/std/__exception/throw_error.h +120 -0
  739. cuda/cccl/headers/include/cuda/std/__execution/env.h +455 -0
  740. cuda/cccl/headers/include/cuda/std/__execution/policy.h +88 -0
  741. cuda/cccl/headers/include/cuda/std/__expected/bad_expected_access.h +127 -0
  742. cuda/cccl/headers/include/cuda/std/__expected/expected.h +1941 -0
  743. cuda/cccl/headers/include/cuda/std/__expected/expected_base.h +1050 -0
  744. cuda/cccl/headers/include/cuda/std/__expected/unexpect.h +37 -0
  745. cuda/cccl/headers/include/cuda/std/__expected/unexpected.h +164 -0
  746. cuda/cccl/headers/include/cuda/std/__floating_point/arithmetic.h +56 -0
  747. cuda/cccl/headers/include/cuda/std/__floating_point/cast.h +812 -0
  748. cuda/cccl/headers/include/cuda/std/__floating_point/cccl_fp.h +125 -0
  749. cuda/cccl/headers/include/cuda/std/__floating_point/common_type.h +48 -0
  750. cuda/cccl/headers/include/cuda/std/__floating_point/constants.h +376 -0
  751. cuda/cccl/headers/include/cuda/std/__floating_point/conversion_rank_order.h +124 -0
  752. cuda/cccl/headers/include/cuda/std/__floating_point/cuda_fp_types.h +116 -0
  753. cuda/cccl/headers/include/cuda/std/__floating_point/decompose.h +69 -0
  754. cuda/cccl/headers/include/cuda/std/__floating_point/format.h +162 -0
  755. cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +40 -0
  756. cuda/cccl/headers/include/cuda/std/__floating_point/mask.h +78 -0
  757. cuda/cccl/headers/include/cuda/std/__floating_point/native_type.h +81 -0
  758. cuda/cccl/headers/include/cuda/std/__floating_point/overflow_handler.h +139 -0
  759. cuda/cccl/headers/include/cuda/std/__floating_point/properties.h +229 -0
  760. cuda/cccl/headers/include/cuda/std/__floating_point/storage.h +248 -0
  761. cuda/cccl/headers/include/cuda/std/__floating_point/traits.h +172 -0
  762. cuda/cccl/headers/include/cuda/std/__format/buffer.h +48 -0
  763. cuda/cccl/headers/include/cuda/std/__format/concepts.h +69 -0
  764. cuda/cccl/headers/include/cuda/std/__format/format_arg.h +282 -0
  765. cuda/cccl/headers/include/cuda/std/__format/format_arg_store.h +279 -0
  766. cuda/cccl/headers/include/cuda/std/__format/format_args.h +122 -0
  767. cuda/cccl/headers/include/cuda/std/__format/format_context.h +92 -0
  768. cuda/cccl/headers/include/cuda/std/__format/format_error.h +76 -0
  769. cuda/cccl/headers/include/cuda/std/__format/format_integral.h +237 -0
  770. cuda/cccl/headers/include/cuda/std/__format/format_parse_context.h +124 -0
  771. cuda/cccl/headers/include/cuda/std/__format/format_spec_parser.h +1230 -0
  772. cuda/cccl/headers/include/cuda/std/__format/formatter.h +59 -0
  773. cuda/cccl/headers/include/cuda/std/__format/formatters/bool.h +101 -0
  774. cuda/cccl/headers/include/cuda/std/__format/formatters/char.h +124 -0
  775. cuda/cccl/headers/include/cuda/std/__format/formatters/fp.h +101 -0
  776. cuda/cccl/headers/include/cuda/std/__format/formatters/int.h +174 -0
  777. cuda/cccl/headers/include/cuda/std/__format/formatters/ptr.h +104 -0
  778. cuda/cccl/headers/include/cuda/std/__format/formatters/str.h +178 -0
  779. cuda/cccl/headers/include/cuda/std/__format/output_utils.h +272 -0
  780. cuda/cccl/headers/include/cuda/std/__format/parse_arg_id.h +138 -0
  781. cuda/cccl/headers/include/cuda/std/__format_ +45 -0
  782. cuda/cccl/headers/include/cuda/std/__functional/binary_function.h +63 -0
  783. cuda/cccl/headers/include/cuda/std/__functional/binary_negate.h +65 -0
  784. cuda/cccl/headers/include/cuda/std/__functional/bind.h +334 -0
  785. cuda/cccl/headers/include/cuda/std/__functional/bind_back.h +80 -0
  786. cuda/cccl/headers/include/cuda/std/__functional/bind_front.h +73 -0
  787. cuda/cccl/headers/include/cuda/std/__functional/binder1st.h +74 -0
  788. cuda/cccl/headers/include/cuda/std/__functional/binder2nd.h +74 -0
  789. cuda/cccl/headers/include/cuda/std/__functional/compose.h +68 -0
  790. cuda/cccl/headers/include/cuda/std/__functional/default_searcher.h +75 -0
  791. cuda/cccl/headers/include/cuda/std/__functional/function.h +1271 -0
  792. cuda/cccl/headers/include/cuda/std/__functional/hash.h +649 -0
  793. cuda/cccl/headers/include/cuda/std/__functional/identity.h +57 -0
  794. cuda/cccl/headers/include/cuda/std/__functional/invoke.h +296 -0
  795. cuda/cccl/headers/include/cuda/std/__functional/is_transparent.h +41 -0
  796. cuda/cccl/headers/include/cuda/std/__functional/mem_fn.h +66 -0
  797. cuda/cccl/headers/include/cuda/std/__functional/mem_fun_ref.h +211 -0
  798. cuda/cccl/headers/include/cuda/std/__functional/not_fn.h +120 -0
  799. cuda/cccl/headers/include/cuda/std/__functional/operations.h +534 -0
  800. cuda/cccl/headers/include/cuda/std/__functional/perfect_forward.h +128 -0
  801. cuda/cccl/headers/include/cuda/std/__functional/pointer_to_binary_function.h +64 -0
  802. cuda/cccl/headers/include/cuda/std/__functional/pointer_to_unary_function.h +63 -0
  803. cuda/cccl/headers/include/cuda/std/__functional/ranges_operations.h +113 -0
  804. cuda/cccl/headers/include/cuda/std/__functional/reference_wrapper.h +113 -0
  805. cuda/cccl/headers/include/cuda/std/__functional/unary_function.h +62 -0
  806. cuda/cccl/headers/include/cuda/std/__functional/unary_negate.h +65 -0
  807. cuda/cccl/headers/include/cuda/std/__functional/unwrap_ref.h +56 -0
  808. cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +262 -0
  809. cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +53 -0
  810. cuda/cccl/headers/include/cuda/std/__fwd/array.h +42 -0
  811. cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +74 -0
  812. cuda/cccl/headers/include/cuda/std/__fwd/complex.h +75 -0
  813. cuda/cccl/headers/include/cuda/std/__fwd/expected.h +46 -0
  814. cuda/cccl/headers/include/cuda/std/__fwd/format.h +84 -0
  815. cuda/cccl/headers/include/cuda/std/__fwd/fp.h +37 -0
  816. cuda/cccl/headers/include/cuda/std/__fwd/get.h +122 -0
  817. cuda/cccl/headers/include/cuda/std/__fwd/hash.h +34 -0
  818. cuda/cccl/headers/include/cuda/std/__fwd/ios.h +123 -0
  819. cuda/cccl/headers/include/cuda/std/__fwd/iterator.h +43 -0
  820. cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +122 -0
  821. cuda/cccl/headers/include/cuda/std/__fwd/memory_resource.h +37 -0
  822. cuda/cccl/headers/include/cuda/std/__fwd/optional.h +39 -0
  823. cuda/cccl/headers/include/cuda/std/__fwd/pair.h +47 -0
  824. cuda/cccl/headers/include/cuda/std/__fwd/reference_wrapper.h +39 -0
  825. cuda/cccl/headers/include/cuda/std/__fwd/span.h +45 -0
  826. cuda/cccl/headers/include/cuda/std/__fwd/string.h +112 -0
  827. cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +91 -0
  828. cuda/cccl/headers/include/cuda/std/__fwd/subrange.h +55 -0
  829. cuda/cccl/headers/include/cuda/std/__fwd/tuple.h +37 -0
  830. cuda/cccl/headers/include/cuda/std/__fwd/unexpected.h +40 -0
  831. cuda/cccl/headers/include/cuda/std/__fwd/variant.h +51 -0
  832. cuda/cccl/headers/include/cuda/std/__internal/cpp_dialect.h +44 -0
  833. cuda/cccl/headers/include/cuda/std/__internal/features.h +86 -0
  834. cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +143 -0
  835. cuda/cccl/headers/include/cuda/std/__iterator/access.h +128 -0
  836. cuda/cccl/headers/include/cuda/std/__iterator/advance.h +228 -0
  837. cuda/cccl/headers/include/cuda/std/__iterator/back_insert_iterator.h +163 -0
  838. cuda/cccl/headers/include/cuda/std/__iterator/bounded_iter.h +253 -0
  839. cuda/cccl/headers/include/cuda/std/__iterator/concepts.h +645 -0
  840. cuda/cccl/headers/include/cuda/std/__iterator/counted_iterator.h +464 -0
  841. cuda/cccl/headers/include/cuda/std/__iterator/data.h +61 -0
  842. cuda/cccl/headers/include/cuda/std/__iterator/default_sentinel.h +36 -0
  843. cuda/cccl/headers/include/cuda/std/__iterator/distance.h +126 -0
  844. cuda/cccl/headers/include/cuda/std/__iterator/empty.h +53 -0
  845. cuda/cccl/headers/include/cuda/std/__iterator/erase_if_container.h +53 -0
  846. cuda/cccl/headers/include/cuda/std/__iterator/front_insert_iterator.h +99 -0
  847. cuda/cccl/headers/include/cuda/std/__iterator/incrementable_traits.h +143 -0
  848. cuda/cccl/headers/include/cuda/std/__iterator/indirectly_comparable.h +55 -0
  849. cuda/cccl/headers/include/cuda/std/__iterator/insert_iterator.h +107 -0
  850. cuda/cccl/headers/include/cuda/std/__iterator/istream_iterator.h +146 -0
  851. cuda/cccl/headers/include/cuda/std/__iterator/istreambuf_iterator.h +161 -0
  852. cuda/cccl/headers/include/cuda/std/__iterator/iter_move.h +161 -0
  853. cuda/cccl/headers/include/cuda/std/__iterator/iter_swap.h +181 -0
  854. cuda/cccl/headers/include/cuda/std/__iterator/iterator.h +44 -0
  855. cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +847 -0
  856. cuda/cccl/headers/include/cuda/std/__iterator/mergeable.h +72 -0
  857. cuda/cccl/headers/include/cuda/std/__iterator/move_iterator.h +432 -0
  858. cuda/cccl/headers/include/cuda/std/__iterator/move_sentinel.h +73 -0
  859. cuda/cccl/headers/include/cuda/std/__iterator/next.h +101 -0
  860. cuda/cccl/headers/include/cuda/std/__iterator/ostream_iterator.h +95 -0
  861. cuda/cccl/headers/include/cuda/std/__iterator/ostreambuf_iterator.h +100 -0
  862. cuda/cccl/headers/include/cuda/std/__iterator/permutable.h +54 -0
  863. cuda/cccl/headers/include/cuda/std/__iterator/prev.h +90 -0
  864. cuda/cccl/headers/include/cuda/std/__iterator/projected.h +61 -0
  865. cuda/cccl/headers/include/cuda/std/__iterator/readable_traits.h +156 -0
  866. cuda/cccl/headers/include/cuda/std/__iterator/reverse_access.h +142 -0
  867. cuda/cccl/headers/include/cuda/std/__iterator/reverse_iterator.h +366 -0
  868. cuda/cccl/headers/include/cuda/std/__iterator/size.h +69 -0
  869. cuda/cccl/headers/include/cuda/std/__iterator/sortable.h +55 -0
  870. cuda/cccl/headers/include/cuda/std/__iterator/unreachable_sentinel.h +84 -0
  871. cuda/cccl/headers/include/cuda/std/__iterator/wrap_iter.h +245 -0
  872. cuda/cccl/headers/include/cuda/std/__latch/latch.h +88 -0
  873. cuda/cccl/headers/include/cuda/std/__limits/numeric_limits.h +603 -0
  874. cuda/cccl/headers/include/cuda/std/__limits/numeric_limits_ext.h +753 -0
  875. cuda/cccl/headers/include/cuda/std/__linalg/conj_if_needed.h +77 -0
  876. cuda/cccl/headers/include/cuda/std/__linalg/conjugate_transposed.h +52 -0
  877. cuda/cccl/headers/include/cuda/std/__linalg/conjugated.h +137 -0
  878. cuda/cccl/headers/include/cuda/std/__linalg/scaled.h +128 -0
  879. cuda/cccl/headers/include/cuda/std/__linalg/transposed.h +316 -0
  880. cuda/cccl/headers/include/cuda/std/__mdspan/aligned_accessor.h +97 -0
  881. cuda/cccl/headers/include/cuda/std/__mdspan/concepts.h +137 -0
  882. cuda/cccl/headers/include/cuda/std/__mdspan/default_accessor.h +73 -0
  883. cuda/cccl/headers/include/cuda/std/__mdspan/empty_base.h +352 -0
  884. cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +753 -0
  885. cuda/cccl/headers/include/cuda/std/__mdspan/layout_left.h +314 -0
  886. cuda/cccl/headers/include/cuda/std/__mdspan/layout_right.h +307 -0
  887. cuda/cccl/headers/include/cuda/std/__mdspan/layout_stride.h +603 -0
  888. cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +512 -0
  889. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_extents.h +193 -0
  890. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_helper.h +189 -0
  891. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_mapping.h +344 -0
  892. cuda/cccl/headers/include/cuda/std/__memory/addressof.h +67 -0
  893. cuda/cccl/headers/include/cuda/std/__memory/align.h +67 -0
  894. cuda/cccl/headers/include/cuda/std/__memory/allocate_at_least.h +81 -0
  895. cuda/cccl/headers/include/cuda/std/__memory/allocation_guard.h +100 -0
  896. cuda/cccl/headers/include/cuda/std/__memory/allocator.h +320 -0
  897. cuda/cccl/headers/include/cuda/std/__memory/allocator_arg_t.h +85 -0
  898. cuda/cccl/headers/include/cuda/std/__memory/allocator_destructor.h +59 -0
  899. cuda/cccl/headers/include/cuda/std/__memory/allocator_traits.h +526 -0
  900. cuda/cccl/headers/include/cuda/std/__memory/assume_aligned.h +60 -0
  901. cuda/cccl/headers/include/cuda/std/__memory/builtin_new_allocator.h +87 -0
  902. cuda/cccl/headers/include/cuda/std/__memory/compressed_pair.h +225 -0
  903. cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +242 -0
  904. cuda/cccl/headers/include/cuda/std/__memory/destruct_n.h +91 -0
  905. cuda/cccl/headers/include/cuda/std/__memory/is_sufficiently_aligned.h +51 -0
  906. cuda/cccl/headers/include/cuda/std/__memory/pointer_traits.h +246 -0
  907. cuda/cccl/headers/include/cuda/std/__memory/runtime_assume_aligned.h +62 -0
  908. cuda/cccl/headers/include/cuda/std/__memory/temporary_buffer.h +92 -0
  909. cuda/cccl/headers/include/cuda/std/__memory/uninitialized_algorithms.h +679 -0
  910. cuda/cccl/headers/include/cuda/std/__memory/unique_ptr.h +765 -0
  911. cuda/cccl/headers/include/cuda/std/__memory/uses_allocator.h +59 -0
  912. cuda/cccl/headers/include/cuda/std/__memory/voidify.h +41 -0
  913. cuda/cccl/headers/include/cuda/std/__memory_ +34 -0
  914. cuda/cccl/headers/include/cuda/std/__new/allocate.h +131 -0
  915. cuda/cccl/headers/include/cuda/std/__new/bad_alloc.h +57 -0
  916. cuda/cccl/headers/include/cuda/std/__new/device_new.h +30 -0
  917. cuda/cccl/headers/include/cuda/std/__new/launder.h +53 -0
  918. cuda/cccl/headers/include/cuda/std/__new_ +30 -0
  919. cuda/cccl/headers/include/cuda/std/__numeric/accumulate.h +56 -0
  920. cuda/cccl/headers/include/cuda/std/__numeric/adjacent_difference.h +72 -0
  921. cuda/cccl/headers/include/cuda/std/__numeric/exclusive_scan.h +66 -0
  922. cuda/cccl/headers/include/cuda/std/__numeric/gcd_lcm.h +78 -0
  923. cuda/cccl/headers/include/cuda/std/__numeric/inclusive_scan.h +73 -0
  924. cuda/cccl/headers/include/cuda/std/__numeric/inner_product.h +62 -0
  925. cuda/cccl/headers/include/cuda/std/__numeric/iota.h +42 -0
  926. cuda/cccl/headers/include/cuda/std/__numeric/midpoint.h +97 -0
  927. cuda/cccl/headers/include/cuda/std/__numeric/partial_sum.h +69 -0
  928. cuda/cccl/headers/include/cuda/std/__numeric/reduce.h +60 -0
  929. cuda/cccl/headers/include/cuda/std/__numeric/transform_exclusive_scan.h +51 -0
  930. cuda/cccl/headers/include/cuda/std/__numeric/transform_inclusive_scan.h +65 -0
  931. cuda/cccl/headers/include/cuda/std/__numeric/transform_reduce.h +72 -0
  932. cuda/cccl/headers/include/cuda/std/__optional/bad_optional_access.h +74 -0
  933. cuda/cccl/headers/include/cuda/std/__optional/hash.h +53 -0
  934. cuda/cccl/headers/include/cuda/std/__optional/make_optional.h +61 -0
  935. cuda/cccl/headers/include/cuda/std/__optional/nullopt.h +43 -0
  936. cuda/cccl/headers/include/cuda/std/__optional/optional.h +860 -0
  937. cuda/cccl/headers/include/cuda/std/__optional/optional_base.h +433 -0
  938. cuda/cccl/headers/include/cuda/std/__optional/optional_ref.h +324 -0
  939. cuda/cccl/headers/include/cuda/std/__random/generate_canonical.h +56 -0
  940. cuda/cccl/headers/include/cuda/std/__random/is_seed_sequence.h +39 -0
  941. cuda/cccl/headers/include/cuda/std/__random/is_valid.h +106 -0
  942. cuda/cccl/headers/include/cuda/std/__random/linear_congruential_engine.h +398 -0
  943. cuda/cccl/headers/include/cuda/std/__random/philox_engine.h +562 -0
  944. cuda/cccl/headers/include/cuda/std/__random/seed_seq.h +204 -0
  945. cuda/cccl/headers/include/cuda/std/__random/uniform_int_distribution.h +335 -0
  946. cuda/cccl/headers/include/cuda/std/__random/uniform_real_distribution.h +183 -0
  947. cuda/cccl/headers/include/cuda/std/__random_ +31 -0
  948. cuda/cccl/headers/include/cuda/std/__ranges/access.h +303 -0
  949. cuda/cccl/headers/include/cuda/std/__ranges/all.h +98 -0
  950. cuda/cccl/headers/include/cuda/std/__ranges/compressed_movable_box.h +892 -0
  951. cuda/cccl/headers/include/cuda/std/__ranges/concepts.h +302 -0
  952. cuda/cccl/headers/include/cuda/std/__ranges/counted.h +90 -0
  953. cuda/cccl/headers/include/cuda/std/__ranges/dangling.h +54 -0
  954. cuda/cccl/headers/include/cuda/std/__ranges/data.h +136 -0
  955. cuda/cccl/headers/include/cuda/std/__ranges/empty.h +109 -0
  956. cuda/cccl/headers/include/cuda/std/__ranges/empty_view.h +77 -0
  957. cuda/cccl/headers/include/cuda/std/__ranges/enable_borrowed_range.h +41 -0
  958. cuda/cccl/headers/include/cuda/std/__ranges/enable_view.h +78 -0
  959. cuda/cccl/headers/include/cuda/std/__ranges/from_range.h +36 -0
  960. cuda/cccl/headers/include/cuda/std/__ranges/iota_view.h +266 -0
  961. cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +408 -0
  962. cuda/cccl/headers/include/cuda/std/__ranges/owning_view.h +162 -0
  963. cuda/cccl/headers/include/cuda/std/__ranges/range_adaptor.h +110 -0
  964. cuda/cccl/headers/include/cuda/std/__ranges/rbegin.h +175 -0
  965. cuda/cccl/headers/include/cuda/std/__ranges/ref_view.h +121 -0
  966. cuda/cccl/headers/include/cuda/std/__ranges/rend.h +182 -0
  967. cuda/cccl/headers/include/cuda/std/__ranges/repeat_view.h +345 -0
  968. cuda/cccl/headers/include/cuda/std/__ranges/single_view.h +155 -0
  969. cuda/cccl/headers/include/cuda/std/__ranges/size.h +201 -0
  970. cuda/cccl/headers/include/cuda/std/__ranges/subrange.h +513 -0
  971. cuda/cccl/headers/include/cuda/std/__ranges/take_view.h +476 -0
  972. cuda/cccl/headers/include/cuda/std/__ranges/take_while_view.h +259 -0
  973. cuda/cccl/headers/include/cuda/std/__ranges/transform_view.h +522 -0
  974. cuda/cccl/headers/include/cuda/std/__ranges/unwrap_end.h +53 -0
  975. cuda/cccl/headers/include/cuda/std/__ranges/view_interface.h +183 -0
  976. cuda/cccl/headers/include/cuda/std/__ranges/views.h +38 -0
  977. cuda/cccl/headers/include/cuda/std/__semaphore/atomic_semaphore.h +234 -0
  978. cuda/cccl/headers/include/cuda/std/__semaphore/counting_semaphore.h +51 -0
  979. cuda/cccl/headers/include/cuda/std/__string/char_traits.h +191 -0
  980. cuda/cccl/headers/include/cuda/std/__string/constexpr_c_functions.h +581 -0
  981. cuda/cccl/headers/include/cuda/std/__string/helper_functions.h +296 -0
  982. cuda/cccl/headers/include/cuda/std/__string/string_view.h +244 -0
  983. cuda/cccl/headers/include/cuda/std/__string_ +29 -0
  984. cuda/cccl/headers/include/cuda/std/__system_error/errc.h +51 -0
  985. cuda/cccl/headers/include/cuda/std/__system_error_ +26 -0
  986. cuda/cccl/headers/include/cuda/std/__thread/threading_support.h +106 -0
  987. cuda/cccl/headers/include/cuda/std/__thread/threading_support_cuda.h +47 -0
  988. cuda/cccl/headers/include/cuda/std/__thread/threading_support_external.h +41 -0
  989. cuda/cccl/headers/include/cuda/std/__thread/threading_support_pthread.h +143 -0
  990. cuda/cccl/headers/include/cuda/std/__thread/threading_support_win32.h +87 -0
  991. cuda/cccl/headers/include/cuda/std/__tuple_dir/apply.h +82 -0
  992. cuda/cccl/headers/include/cuda/std/__tuple_dir/get.h +122 -0
  993. cuda/cccl/headers/include/cuda/std/__tuple_dir/ignore.h +51 -0
  994. cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +120 -0
  995. cuda/cccl/headers/include/cuda/std/__tuple_dir/sfinae_helpers.h +100 -0
  996. cuda/cccl/headers/include/cuda/std/__tuple_dir/structured_bindings.h +212 -0
  997. cuda/cccl/headers/include/cuda/std/__tuple_dir/tie.h +55 -0
  998. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple.h +457 -0
  999. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_cat.h +158 -0
  1000. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_constraints.h +286 -0
  1001. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_element.h +77 -0
  1002. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_indices.h +44 -0
  1003. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_leaf.h +452 -0
  1004. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +83 -0
  1005. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +68 -0
  1006. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_size.h +79 -0
  1007. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_types.h +35 -0
  1008. cuda/cccl/headers/include/cuda/std/__tuple_dir/vector_types.h +290 -0
  1009. cuda/cccl/headers/include/cuda/std/__type_traits/add_const.h +40 -0
  1010. cuda/cccl/headers/include/cuda/std/__type_traits/add_cv.h +40 -0
  1011. cuda/cccl/headers/include/cuda/std/__type_traits/add_lvalue_reference.h +62 -0
  1012. cuda/cccl/headers/include/cuda/std/__type_traits/add_pointer.h +65 -0
  1013. cuda/cccl/headers/include/cuda/std/__type_traits/add_rvalue_reference.h +62 -0
  1014. cuda/cccl/headers/include/cuda/std/__type_traits/add_volatile.h +40 -0
  1015. cuda/cccl/headers/include/cuda/std/__type_traits/aligned_storage.h +149 -0
  1016. cuda/cccl/headers/include/cuda/std/__type_traits/aligned_union.h +62 -0
  1017. cuda/cccl/headers/include/cuda/std/__type_traits/alignment_of.h +41 -0
  1018. cuda/cccl/headers/include/cuda/std/__type_traits/always_false.h +35 -0
  1019. cuda/cccl/headers/include/cuda/std/__type_traits/can_extract_key.h +68 -0
  1020. cuda/cccl/headers/include/cuda/std/__type_traits/common_reference.h +262 -0
  1021. cuda/cccl/headers/include/cuda/std/__type_traits/common_type.h +173 -0
  1022. cuda/cccl/headers/include/cuda/std/__type_traits/conditional.h +65 -0
  1023. cuda/cccl/headers/include/cuda/std/__type_traits/conjunction.h +67 -0
  1024. cuda/cccl/headers/include/cuda/std/__type_traits/copy_cv.h +50 -0
  1025. cuda/cccl/headers/include/cuda/std/__type_traits/copy_cvref.h +148 -0
  1026. cuda/cccl/headers/include/cuda/std/__type_traits/decay.h +83 -0
  1027. cuda/cccl/headers/include/cuda/std/__type_traits/dependent_type.h +35 -0
  1028. cuda/cccl/headers/include/cuda/std/__type_traits/disjunction.h +77 -0
  1029. cuda/cccl/headers/include/cuda/std/__type_traits/enable_if.h +43 -0
  1030. cuda/cccl/headers/include/cuda/std/__type_traits/extent.h +68 -0
  1031. cuda/cccl/headers/include/cuda/std/__type_traits/fold.h +47 -0
  1032. cuda/cccl/headers/include/cuda/std/__type_traits/has_unique_object_representation.h +46 -0
  1033. cuda/cccl/headers/include/cuda/std/__type_traits/has_virtual_destructor.h +42 -0
  1034. cuda/cccl/headers/include/cuda/std/__type_traits/integral_constant.h +62 -0
  1035. cuda/cccl/headers/include/cuda/std/__type_traits/is_abstract.h +42 -0
  1036. cuda/cccl/headers/include/cuda/std/__type_traits/is_aggregate.h +42 -0
  1037. cuda/cccl/headers/include/cuda/std/__type_traits/is_allocator.h +46 -0
  1038. cuda/cccl/headers/include/cuda/std/__type_traits/is_arithmetic.h +42 -0
  1039. cuda/cccl/headers/include/cuda/std/__type_traits/is_array.h +62 -0
  1040. cuda/cccl/headers/include/cuda/std/__type_traits/is_assignable.h +78 -0
  1041. cuda/cccl/headers/include/cuda/std/__type_traits/is_base_of.h +42 -0
  1042. cuda/cccl/headers/include/cuda/std/__type_traits/is_bounded_array.h +44 -0
  1043. cuda/cccl/headers/include/cuda/std/__type_traits/is_callable.h +60 -0
  1044. cuda/cccl/headers/include/cuda/std/__type_traits/is_char_like_type.h +38 -0
  1045. cuda/cccl/headers/include/cuda/std/__type_traits/is_class.h +42 -0
  1046. cuda/cccl/headers/include/cuda/std/__type_traits/is_comparable.h +78 -0
  1047. cuda/cccl/headers/include/cuda/std/__type_traits/is_compound.h +58 -0
  1048. cuda/cccl/headers/include/cuda/std/__type_traits/is_const.h +56 -0
  1049. cuda/cccl/headers/include/cuda/std/__type_traits/is_constant_evaluated.h +51 -0
  1050. cuda/cccl/headers/include/cuda/std/__type_traits/is_constructible.h +174 -0
  1051. cuda/cccl/headers/include/cuda/std/__type_traits/is_convertible.h +211 -0
  1052. cuda/cccl/headers/include/cuda/std/__type_traits/is_copy_assignable.h +43 -0
  1053. cuda/cccl/headers/include/cuda/std/__type_traits/is_copy_constructible.h +43 -0
  1054. cuda/cccl/headers/include/cuda/std/__type_traits/is_core_convertible.h +47 -0
  1055. cuda/cccl/headers/include/cuda/std/__type_traits/is_corresponding_member.h +42 -0
  1056. cuda/cccl/headers/include/cuda/std/__type_traits/is_default_constructible.h +40 -0
  1057. cuda/cccl/headers/include/cuda/std/__type_traits/is_destructible.h +115 -0
  1058. cuda/cccl/headers/include/cuda/std/__type_traits/is_empty.h +42 -0
  1059. cuda/cccl/headers/include/cuda/std/__type_traits/is_enum.h +42 -0
  1060. cuda/cccl/headers/include/cuda/std/__type_traits/is_execution_policy.h +81 -0
  1061. cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_arithmetic.h +38 -0
  1062. cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_floating_point.h +79 -0
  1063. cuda/cccl/headers/include/cuda/std/__type_traits/is_final.h +42 -0
  1064. cuda/cccl/headers/include/cuda/std/__type_traits/is_floating_point.h +53 -0
  1065. cuda/cccl/headers/include/cuda/std/__type_traits/is_fully_bounded_array.h +47 -0
  1066. cuda/cccl/headers/include/cuda/std/__type_traits/is_function.h +61 -0
  1067. cuda/cccl/headers/include/cuda/std/__type_traits/is_fundamental.h +56 -0
  1068. cuda/cccl/headers/include/cuda/std/__type_traits/is_implicitly_default_constructible.h +57 -0
  1069. cuda/cccl/headers/include/cuda/std/__type_traits/is_integer.h +45 -0
  1070. cuda/cccl/headers/include/cuda/std/__type_traits/is_integral.h +123 -0
  1071. cuda/cccl/headers/include/cuda/std/__type_traits/is_layout_compatible.h +45 -0
  1072. cuda/cccl/headers/include/cuda/std/__type_traits/is_literal_type.h +42 -0
  1073. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_function_pointer.h +79 -0
  1074. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_object_pointer.h +57 -0
  1075. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_pointer.h +57 -0
  1076. cuda/cccl/headers/include/cuda/std/__type_traits/is_move_assignable.h +43 -0
  1077. cuda/cccl/headers/include/cuda/std/__type_traits/is_move_constructible.h +42 -0
  1078. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_assignable.h +70 -0
  1079. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_constructible.h +84 -0
  1080. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_convertible.h +59 -0
  1081. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_copy_assignable.h +60 -0
  1082. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_copy_constructible.h +43 -0
  1083. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_default_constructible.h +54 -0
  1084. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_destructible.h +82 -0
  1085. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_move_assignable.h +60 -0
  1086. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_move_constructible.h +42 -0
  1087. cuda/cccl/headers/include/cuda/std/__type_traits/is_null_pointer.h +43 -0
  1088. cuda/cccl/headers/include/cuda/std/__type_traits/is_object.h +57 -0
  1089. cuda/cccl/headers/include/cuda/std/__type_traits/is_one_of.h +37 -0
  1090. cuda/cccl/headers/include/cuda/std/__type_traits/is_pod.h +42 -0
  1091. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer.h +60 -0
  1092. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer_interconvertible_base_of.h +84 -0
  1093. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer_interconvertible_with_class.h +42 -0
  1094. cuda/cccl/headers/include/cuda/std/__type_traits/is_polymorphic.h +42 -0
  1095. cuda/cccl/headers/include/cuda/std/__type_traits/is_primary_template.h +121 -0
  1096. cuda/cccl/headers/include/cuda/std/__type_traits/is_reference.h +95 -0
  1097. cuda/cccl/headers/include/cuda/std/__type_traits/is_referenceable.h +55 -0
  1098. cuda/cccl/headers/include/cuda/std/__type_traits/is_same.h +88 -0
  1099. cuda/cccl/headers/include/cuda/std/__type_traits/is_scalar.h +60 -0
  1100. cuda/cccl/headers/include/cuda/std/__type_traits/is_scoped_enum.h +49 -0
  1101. cuda/cccl/headers/include/cuda/std/__type_traits/is_signed.h +65 -0
  1102. cuda/cccl/headers/include/cuda/std/__type_traits/is_signed_integer.h +59 -0
  1103. cuda/cccl/headers/include/cuda/std/__type_traits/is_standard_layout.h +42 -0
  1104. cuda/cccl/headers/include/cuda/std/__type_traits/is_swappable.h +200 -0
  1105. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivial.h +42 -0
  1106. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_assignable.h +43 -0
  1107. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_constructible.h +43 -0
  1108. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_assignable.h +46 -0
  1109. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_constructible.h +45 -0
  1110. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copyable.h +42 -0
  1111. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_default_constructible.h +42 -0
  1112. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_destructible.h +58 -0
  1113. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_assignable.h +45 -0
  1114. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_constructible.h +44 -0
  1115. cuda/cccl/headers/include/cuda/std/__type_traits/is_unbounded_array.h +43 -0
  1116. cuda/cccl/headers/include/cuda/std/__type_traits/is_union.h +42 -0
  1117. cuda/cccl/headers/include/cuda/std/__type_traits/is_unsigned.h +66 -0
  1118. cuda/cccl/headers/include/cuda/std/__type_traits/is_unsigned_integer.h +59 -0
  1119. cuda/cccl/headers/include/cuda/std/__type_traits/is_valid_expansion.h +41 -0
  1120. cuda/cccl/headers/include/cuda/std/__type_traits/is_void.h +55 -0
  1121. cuda/cccl/headers/include/cuda/std/__type_traits/is_volatile.h +56 -0
  1122. cuda/cccl/headers/include/cuda/std/__type_traits/lazy.h +35 -0
  1123. cuda/cccl/headers/include/cuda/std/__type_traits/make_const_lvalue_ref.h +36 -0
  1124. cuda/cccl/headers/include/cuda/std/__type_traits/make_nbit_int.h +107 -0
  1125. cuda/cccl/headers/include/cuda/std/__type_traits/make_signed.h +140 -0
  1126. cuda/cccl/headers/include/cuda/std/__type_traits/make_unsigned.h +151 -0
  1127. cuda/cccl/headers/include/cuda/std/__type_traits/maybe_const.h +36 -0
  1128. cuda/cccl/headers/include/cuda/std/__type_traits/nat.h +39 -0
  1129. cuda/cccl/headers/include/cuda/std/__type_traits/negation.h +44 -0
  1130. cuda/cccl/headers/include/cuda/std/__type_traits/num_bits.h +122 -0
  1131. cuda/cccl/headers/include/cuda/std/__type_traits/promote.h +163 -0
  1132. cuda/cccl/headers/include/cuda/std/__type_traits/rank.h +60 -0
  1133. cuda/cccl/headers/include/cuda/std/__type_traits/reference_constructs_from_temporary.h +57 -0
  1134. cuda/cccl/headers/include/cuda/std/__type_traits/reference_converts_from_temporary.h +56 -0
  1135. cuda/cccl/headers/include/cuda/std/__type_traits/remove_all_extents.h +66 -0
  1136. cuda/cccl/headers/include/cuda/std/__type_traits/remove_const.h +59 -0
  1137. cuda/cccl/headers/include/cuda/std/__type_traits/remove_const_ref.h +37 -0
  1138. cuda/cccl/headers/include/cuda/std/__type_traits/remove_cv.h +57 -0
  1139. cuda/cccl/headers/include/cuda/std/__type_traits/remove_cvref.h +57 -0
  1140. cuda/cccl/headers/include/cuda/std/__type_traits/remove_extent.h +65 -0
  1141. cuda/cccl/headers/include/cuda/std/__type_traits/remove_pointer.h +73 -0
  1142. cuda/cccl/headers/include/cuda/std/__type_traits/remove_reference.h +72 -0
  1143. cuda/cccl/headers/include/cuda/std/__type_traits/remove_volatile.h +58 -0
  1144. cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +47 -0
  1145. cuda/cccl/headers/include/cuda/std/__type_traits/type_identity.h +40 -0
  1146. cuda/cccl/headers/include/cuda/std/__type_traits/type_list.h +1067 -0
  1147. cuda/cccl/headers/include/cuda/std/__type_traits/type_set.h +131 -0
  1148. cuda/cccl/headers/include/cuda/std/__type_traits/underlying_type.h +52 -0
  1149. cuda/cccl/headers/include/cuda/std/__type_traits/void_t.h +34 -0
  1150. cuda/cccl/headers/include/cuda/std/__utility/as_const.h +52 -0
  1151. cuda/cccl/headers/include/cuda/std/__utility/auto_cast.h +34 -0
  1152. cuda/cccl/headers/include/cuda/std/__utility/cmp.h +116 -0
  1153. cuda/cccl/headers/include/cuda/std/__utility/convert_to_integral.h +101 -0
  1154. cuda/cccl/headers/include/cuda/std/__utility/declval.h +76 -0
  1155. cuda/cccl/headers/include/cuda/std/__utility/exception_guard.h +161 -0
  1156. cuda/cccl/headers/include/cuda/std/__utility/exchange.h +46 -0
  1157. cuda/cccl/headers/include/cuda/std/__utility/forward.h +59 -0
  1158. cuda/cccl/headers/include/cuda/std/__utility/forward_like.h +55 -0
  1159. cuda/cccl/headers/include/cuda/std/__utility/in_place.h +86 -0
  1160. cuda/cccl/headers/include/cuda/std/__utility/integer_sequence.h +249 -0
  1161. cuda/cccl/headers/include/cuda/std/__utility/monostate.h +99 -0
  1162. cuda/cccl/headers/include/cuda/std/__utility/move.h +74 -0
  1163. cuda/cccl/headers/include/cuda/std/__utility/pair.h +791 -0
  1164. cuda/cccl/headers/include/cuda/std/__utility/piecewise_construct.h +37 -0
  1165. cuda/cccl/headers/include/cuda/std/__utility/pod_tuple.h +527 -0
  1166. cuda/cccl/headers/include/cuda/std/__utility/priority_tag.h +40 -0
  1167. cuda/cccl/headers/include/cuda/std/__utility/rel_ops.h +61 -0
  1168. cuda/cccl/headers/include/cuda/std/__utility/swap.h +64 -0
  1169. cuda/cccl/headers/include/cuda/std/__utility/to_underlying.h +40 -0
  1170. cuda/cccl/headers/include/cuda/std/__utility/typeid.h +421 -0
  1171. cuda/cccl/headers/include/cuda/std/__utility/undefined.h +34 -0
  1172. cuda/cccl/headers/include/cuda/std/__utility/unreachable.h +37 -0
  1173. cuda/cccl/headers/include/cuda/std/__variant/bad_variant_access.h +74 -0
  1174. cuda/cccl/headers/include/cuda/std/__variant/comparison.h +207 -0
  1175. cuda/cccl/headers/include/cuda/std/__variant/get.h +192 -0
  1176. cuda/cccl/headers/include/cuda/std/__variant/hash.h +82 -0
  1177. cuda/cccl/headers/include/cuda/std/__variant/sfinae_helpers.h +89 -0
  1178. cuda/cccl/headers/include/cuda/std/__variant/variant.h +250 -0
  1179. cuda/cccl/headers/include/cuda/std/__variant/variant_access.h +70 -0
  1180. cuda/cccl/headers/include/cuda/std/__variant/variant_base.h +683 -0
  1181. cuda/cccl/headers/include/cuda/std/__variant/variant_constraints.h +135 -0
  1182. cuda/cccl/headers/include/cuda/std/__variant/variant_match.h +126 -0
  1183. cuda/cccl/headers/include/cuda/std/__variant/variant_traits.h +184 -0
  1184. cuda/cccl/headers/include/cuda/std/__variant/variant_visit.h +225 -0
  1185. cuda/cccl/headers/include/cuda/std/__variant/visit.h +148 -0
  1186. cuda/cccl/headers/include/cuda/std/array +518 -0
  1187. cuda/cccl/headers/include/cuda/std/atomic +810 -0
  1188. cuda/cccl/headers/include/cuda/std/barrier +42 -0
  1189. cuda/cccl/headers/include/cuda/std/bit +35 -0
  1190. cuda/cccl/headers/include/cuda/std/bitset +986 -0
  1191. cuda/cccl/headers/include/cuda/std/cassert +28 -0
  1192. cuda/cccl/headers/include/cuda/std/ccomplex +15 -0
  1193. cuda/cccl/headers/include/cuda/std/cfloat +59 -0
  1194. cuda/cccl/headers/include/cuda/std/chrono +26 -0
  1195. cuda/cccl/headers/include/cuda/std/climits +61 -0
  1196. cuda/cccl/headers/include/cuda/std/cmath +87 -0
  1197. cuda/cccl/headers/include/cuda/std/complex +50 -0
  1198. cuda/cccl/headers/include/cuda/std/concepts +48 -0
  1199. cuda/cccl/headers/include/cuda/std/cstddef +28 -0
  1200. cuda/cccl/headers/include/cuda/std/cstdint +178 -0
  1201. cuda/cccl/headers/include/cuda/std/cstdlib +30 -0
  1202. cuda/cccl/headers/include/cuda/std/cstring +110 -0
  1203. cuda/cccl/headers/include/cuda/std/ctime +154 -0
  1204. cuda/cccl/headers/include/cuda/std/detail/__config +45 -0
  1205. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +207 -0
  1206. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/algorithm +1718 -0
  1207. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/chrono +2506 -0
  1208. cuda/cccl/headers/include/cuda/std/execution +29 -0
  1209. cuda/cccl/headers/include/cuda/std/expected +30 -0
  1210. cuda/cccl/headers/include/cuda/std/functional +56 -0
  1211. cuda/cccl/headers/include/cuda/std/initializer_list +44 -0
  1212. cuda/cccl/headers/include/cuda/std/inplace_vector +2171 -0
  1213. cuda/cccl/headers/include/cuda/std/iterator +70 -0
  1214. cuda/cccl/headers/include/cuda/std/latch +34 -0
  1215. cuda/cccl/headers/include/cuda/std/limits +28 -0
  1216. cuda/cccl/headers/include/cuda/std/linalg +30 -0
  1217. cuda/cccl/headers/include/cuda/std/mdspan +38 -0
  1218. cuda/cccl/headers/include/cuda/std/memory +39 -0
  1219. cuda/cccl/headers/include/cuda/std/numbers +344 -0
  1220. cuda/cccl/headers/include/cuda/std/numeric +41 -0
  1221. cuda/cccl/headers/include/cuda/std/optional +31 -0
  1222. cuda/cccl/headers/include/cuda/std/ranges +69 -0
  1223. cuda/cccl/headers/include/cuda/std/ratio +416 -0
  1224. cuda/cccl/headers/include/cuda/std/semaphore +31 -0
  1225. cuda/cccl/headers/include/cuda/std/source_location +83 -0
  1226. cuda/cccl/headers/include/cuda/std/span +628 -0
  1227. cuda/cccl/headers/include/cuda/std/string_view +923 -0
  1228. cuda/cccl/headers/include/cuda/std/tuple +43 -0
  1229. cuda/cccl/headers/include/cuda/std/type_traits +176 -0
  1230. cuda/cccl/headers/include/cuda/std/utility +70 -0
  1231. cuda/cccl/headers/include/cuda/std/variant +32 -0
  1232. cuda/cccl/headers/include/cuda/std/version +240 -0
  1233. cuda/cccl/headers/include/cuda/stream +31 -0
  1234. cuda/cccl/headers/include/cuda/stream_ref +59 -0
  1235. cuda/cccl/headers/include/cuda/type_traits +27 -0
  1236. cuda/cccl/headers/include/cuda/utility +28 -0
  1237. cuda/cccl/headers/include/cuda/version +16 -0
  1238. cuda/cccl/headers/include/cuda/warp +28 -0
  1239. cuda/cccl/headers/include/cuda/work_stealing +26 -0
  1240. cuda/cccl/headers/include/nv/detail/__preprocessor +169 -0
  1241. cuda/cccl/headers/include/nv/detail/__target_macros +718 -0
  1242. cuda/cccl/headers/include/nv/target +236 -0
  1243. cuda/cccl/headers/include/thrust/addressof.h +22 -0
  1244. cuda/cccl/headers/include/thrust/adjacent_difference.h +254 -0
  1245. cuda/cccl/headers/include/thrust/advance.h +57 -0
  1246. cuda/cccl/headers/include/thrust/allocate_unique.h +299 -0
  1247. cuda/cccl/headers/include/thrust/binary_search.h +1910 -0
  1248. cuda/cccl/headers/include/thrust/complex.h +858 -0
  1249. cuda/cccl/headers/include/thrust/copy.h +506 -0
  1250. cuda/cccl/headers/include/thrust/count.h +245 -0
  1251. cuda/cccl/headers/include/thrust/detail/adjacent_difference.inl +108 -0
  1252. cuda/cccl/headers/include/thrust/detail/alignment.h +81 -0
  1253. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +624 -0
  1254. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +191 -0
  1255. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +95 -0
  1256. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +79 -0
  1257. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +76 -0
  1258. cuda/cccl/headers/include/thrust/detail/allocator/no_throw_allocator.h +74 -0
  1259. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +113 -0
  1260. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +114 -0
  1261. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.h +77 -0
  1262. cuda/cccl/headers/include/thrust/detail/allocator_aware_execution_policy.h +95 -0
  1263. cuda/cccl/headers/include/thrust/detail/binary_search.inl +537 -0
  1264. cuda/cccl/headers/include/thrust/detail/caching_allocator.h +47 -0
  1265. cuda/cccl/headers/include/thrust/detail/complex/arithmetic.h +250 -0
  1266. cuda/cccl/headers/include/thrust/detail/complex/c99math.h +58 -0
  1267. cuda/cccl/headers/include/thrust/detail/complex/catrig.h +869 -0
  1268. cuda/cccl/headers/include/thrust/detail/complex/catrigf.h +583 -0
  1269. cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +227 -0
  1270. cuda/cccl/headers/include/thrust/detail/complex/ccoshf.h +155 -0
  1271. cuda/cccl/headers/include/thrust/detail/complex/cexp.h +190 -0
  1272. cuda/cccl/headers/include/thrust/detail/complex/cexpf.h +167 -0
  1273. cuda/cccl/headers/include/thrust/detail/complex/clog.h +217 -0
  1274. cuda/cccl/headers/include/thrust/detail/complex/clogf.h +204 -0
  1275. cuda/cccl/headers/include/thrust/detail/complex/complex.inl +263 -0
  1276. cuda/cccl/headers/include/thrust/detail/complex/cpow.h +50 -0
  1277. cuda/cccl/headers/include/thrust/detail/complex/cproj.h +76 -0
  1278. cuda/cccl/headers/include/thrust/detail/complex/csinh.h +222 -0
  1279. cuda/cccl/headers/include/thrust/detail/complex/csinhf.h +162 -0
  1280. cuda/cccl/headers/include/thrust/detail/complex/csqrt.h +172 -0
  1281. cuda/cccl/headers/include/thrust/detail/complex/csqrtf.h +168 -0
  1282. cuda/cccl/headers/include/thrust/detail/complex/ctanh.h +202 -0
  1283. cuda/cccl/headers/include/thrust/detail/complex/ctanhf.h +127 -0
  1284. cuda/cccl/headers/include/thrust/detail/complex/math_private.h +132 -0
  1285. cuda/cccl/headers/include/thrust/detail/complex/stream.h +73 -0
  1286. cuda/cccl/headers/include/thrust/detail/config/compiler.h +38 -0
  1287. cuda/cccl/headers/include/thrust/detail/config/config.h +43 -0
  1288. cuda/cccl/headers/include/thrust/detail/config/cpp_dialect.h +78 -0
  1289. cuda/cccl/headers/include/thrust/detail/config/device_system.h +57 -0
  1290. cuda/cccl/headers/include/thrust/detail/config/host_system.h +50 -0
  1291. cuda/cccl/headers/include/thrust/detail/config/memory_resource.h +41 -0
  1292. cuda/cccl/headers/include/thrust/detail/config/namespace.h +161 -0
  1293. cuda/cccl/headers/include/thrust/detail/config/simple_defines.h +48 -0
  1294. cuda/cccl/headers/include/thrust/detail/config.h +36 -0
  1295. cuda/cccl/headers/include/thrust/detail/contiguous_storage.h +226 -0
  1296. cuda/cccl/headers/include/thrust/detail/contiguous_storage.inl +271 -0
  1297. cuda/cccl/headers/include/thrust/detail/copy.h +70 -0
  1298. cuda/cccl/headers/include/thrust/detail/copy.inl +139 -0
  1299. cuda/cccl/headers/include/thrust/detail/copy_if.h +62 -0
  1300. cuda/cccl/headers/include/thrust/detail/copy_if.inl +114 -0
  1301. cuda/cccl/headers/include/thrust/detail/count.h +55 -0
  1302. cuda/cccl/headers/include/thrust/detail/count.inl +101 -0
  1303. cuda/cccl/headers/include/thrust/detail/device_ptr.inl +48 -0
  1304. cuda/cccl/headers/include/thrust/detail/equal.inl +105 -0
  1305. cuda/cccl/headers/include/thrust/detail/event_error.h +160 -0
  1306. cuda/cccl/headers/include/thrust/detail/execute_with_allocator.h +79 -0
  1307. cuda/cccl/headers/include/thrust/detail/execute_with_allocator_fwd.h +61 -0
  1308. cuda/cccl/headers/include/thrust/detail/execution_policy.h +120 -0
  1309. cuda/cccl/headers/include/thrust/detail/extrema.inl +196 -0
  1310. cuda/cccl/headers/include/thrust/detail/fill.inl +98 -0
  1311. cuda/cccl/headers/include/thrust/detail/find.inl +125 -0
  1312. cuda/cccl/headers/include/thrust/detail/for_each.inl +96 -0
  1313. cuda/cccl/headers/include/thrust/detail/function.h +49 -0
  1314. cuda/cccl/headers/include/thrust/detail/functional/actor.h +211 -0
  1315. cuda/cccl/headers/include/thrust/detail/functional/operators.h +383 -0
  1316. cuda/cccl/headers/include/thrust/detail/gather.inl +185 -0
  1317. cuda/cccl/headers/include/thrust/detail/generate.inl +98 -0
  1318. cuda/cccl/headers/include/thrust/detail/get_iterator_value.h +60 -0
  1319. cuda/cccl/headers/include/thrust/detail/inner_product.inl +130 -0
  1320. cuda/cccl/headers/include/thrust/detail/internal_functional.h +329 -0
  1321. cuda/cccl/headers/include/thrust/detail/logical.inl +125 -0
  1322. cuda/cccl/headers/include/thrust/detail/malloc_and_free.h +89 -0
  1323. cuda/cccl/headers/include/thrust/detail/malloc_and_free_fwd.h +45 -0
  1324. cuda/cccl/headers/include/thrust/detail/memory_algorithms.h +209 -0
  1325. cuda/cccl/headers/include/thrust/detail/merge.inl +288 -0
  1326. cuda/cccl/headers/include/thrust/detail/mismatch.inl +106 -0
  1327. cuda/cccl/headers/include/thrust/detail/overlapped_copy.h +120 -0
  1328. cuda/cccl/headers/include/thrust/detail/partition.inl +390 -0
  1329. cuda/cccl/headers/include/thrust/detail/pointer.h +309 -0
  1330. cuda/cccl/headers/include/thrust/detail/preprocessor.h +652 -0
  1331. cuda/cccl/headers/include/thrust/detail/random_bijection.h +175 -0
  1332. cuda/cccl/headers/include/thrust/detail/range/head_flags.h +114 -0
  1333. cuda/cccl/headers/include/thrust/detail/range/tail_flags.h +128 -0
  1334. cuda/cccl/headers/include/thrust/detail/raw_pointer_cast.h +52 -0
  1335. cuda/cccl/headers/include/thrust/detail/raw_reference_cast.h +186 -0
  1336. cuda/cccl/headers/include/thrust/detail/reduce.inl +395 -0
  1337. cuda/cccl/headers/include/thrust/detail/reference.h +518 -0
  1338. cuda/cccl/headers/include/thrust/detail/reference_forward_declaration.h +35 -0
  1339. cuda/cccl/headers/include/thrust/detail/remove.inl +225 -0
  1340. cuda/cccl/headers/include/thrust/detail/replace.inl +243 -0
  1341. cuda/cccl/headers/include/thrust/detail/reverse.inl +100 -0
  1342. cuda/cccl/headers/include/thrust/detail/scan.inl +536 -0
  1343. cuda/cccl/headers/include/thrust/detail/scatter.inl +169 -0
  1344. cuda/cccl/headers/include/thrust/detail/seq.h +66 -0
  1345. cuda/cccl/headers/include/thrust/detail/sequence.inl +121 -0
  1346. cuda/cccl/headers/include/thrust/detail/set_operations.inl +993 -0
  1347. cuda/cccl/headers/include/thrust/detail/shuffle.inl +86 -0
  1348. cuda/cccl/headers/include/thrust/detail/sort.inl +385 -0
  1349. cuda/cccl/headers/include/thrust/detail/static_assert.h +56 -0
  1350. cuda/cccl/headers/include/thrust/detail/static_map.h +164 -0
  1351. cuda/cccl/headers/include/thrust/detail/swap_ranges.inl +77 -0
  1352. cuda/cccl/headers/include/thrust/detail/tabulate.inl +74 -0
  1353. cuda/cccl/headers/include/thrust/detail/temporary_array.h +149 -0
  1354. cuda/cccl/headers/include/thrust/detail/temporary_array.inl +119 -0
  1355. cuda/cccl/headers/include/thrust/detail/temporary_buffer.h +92 -0
  1356. cuda/cccl/headers/include/thrust/detail/transform_reduce.inl +81 -0
  1357. cuda/cccl/headers/include/thrust/detail/transform_scan.inl +173 -0
  1358. cuda/cccl/headers/include/thrust/detail/trivial_sequence.h +128 -0
  1359. cuda/cccl/headers/include/thrust/detail/tuple_meta_transform.h +59 -0
  1360. cuda/cccl/headers/include/thrust/detail/type_deduction.h +62 -0
  1361. cuda/cccl/headers/include/thrust/detail/type_traits/has_member_function.h +47 -0
  1362. cuda/cccl/headers/include/thrust/detail/type_traits/has_nested_type.h +43 -0
  1363. cuda/cccl/headers/include/thrust/detail/type_traits/is_call_possible.h +162 -0
  1364. cuda/cccl/headers/include/thrust/detail/type_traits/is_commutative.h +67 -0
  1365. cuda/cccl/headers/include/thrust/detail/type_traits/is_metafunction_defined.h +39 -0
  1366. cuda/cccl/headers/include/thrust/detail/type_traits/is_thrust_pointer.h +55 -0
  1367. cuda/cccl/headers/include/thrust/detail/type_traits/iterator/is_output_iterator.h +46 -0
  1368. cuda/cccl/headers/include/thrust/detail/type_traits/minimum_type.h +89 -0
  1369. cuda/cccl/headers/include/thrust/detail/type_traits/pointer_traits.h +328 -0
  1370. cuda/cccl/headers/include/thrust/detail/type_traits.h +136 -0
  1371. cuda/cccl/headers/include/thrust/detail/uninitialized_copy.inl +102 -0
  1372. cuda/cccl/headers/include/thrust/detail/uninitialized_fill.inl +98 -0
  1373. cuda/cccl/headers/include/thrust/detail/unique.inl +391 -0
  1374. cuda/cccl/headers/include/thrust/detail/use_default.h +34 -0
  1375. cuda/cccl/headers/include/thrust/detail/vector_base.h +611 -0
  1376. cuda/cccl/headers/include/thrust/detail/vector_base.inl +1208 -0
  1377. cuda/cccl/headers/include/thrust/device_allocator.h +134 -0
  1378. cuda/cccl/headers/include/thrust/device_delete.h +74 -0
  1379. cuda/cccl/headers/include/thrust/device_free.h +85 -0
  1380. cuda/cccl/headers/include/thrust/device_make_unique.h +56 -0
  1381. cuda/cccl/headers/include/thrust/device_malloc.h +84 -0
  1382. cuda/cccl/headers/include/thrust/device_malloc_allocator.h +190 -0
  1383. cuda/cccl/headers/include/thrust/device_new.h +112 -0
  1384. cuda/cccl/headers/include/thrust/device_new_allocator.h +179 -0
  1385. cuda/cccl/headers/include/thrust/device_ptr.h +196 -0
  1386. cuda/cccl/headers/include/thrust/device_reference.h +983 -0
  1387. cuda/cccl/headers/include/thrust/device_vector.h +576 -0
  1388. cuda/cccl/headers/include/thrust/distance.h +43 -0
  1389. cuda/cccl/headers/include/thrust/equal.h +247 -0
  1390. cuda/cccl/headers/include/thrust/execution_policy.h +252 -0
  1391. cuda/cccl/headers/include/thrust/extrema.h +657 -0
  1392. cuda/cccl/headers/include/thrust/fill.h +200 -0
  1393. cuda/cccl/headers/include/thrust/find.h +382 -0
  1394. cuda/cccl/headers/include/thrust/for_each.h +261 -0
  1395. cuda/cccl/headers/include/thrust/functional.h +393 -0
  1396. cuda/cccl/headers/include/thrust/gather.h +464 -0
  1397. cuda/cccl/headers/include/thrust/generate.h +193 -0
  1398. cuda/cccl/headers/include/thrust/host_vector.h +576 -0
  1399. cuda/cccl/headers/include/thrust/inner_product.h +264 -0
  1400. cuda/cccl/headers/include/thrust/iterator/constant_iterator.h +221 -0
  1401. cuda/cccl/headers/include/thrust/iterator/counting_iterator.h +335 -0
  1402. cuda/cccl/headers/include/thrust/iterator/detail/any_assign.h +48 -0
  1403. cuda/cccl/headers/include/thrust/iterator/detail/any_system_tag.h +43 -0
  1404. cuda/cccl/headers/include/thrust/iterator/detail/device_system_tag.h +43 -0
  1405. cuda/cccl/headers/include/thrust/iterator/detail/host_system_tag.h +42 -0
  1406. cuda/cccl/headers/include/thrust/iterator/detail/iterator_adaptor_base.h +80 -0
  1407. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_system.h +60 -0
  1408. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_traversal.h +65 -0
  1409. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h +56 -0
  1410. cuda/cccl/headers/include/thrust/iterator/detail/iterator_facade_category.h +181 -0
  1411. cuda/cccl/headers/include/thrust/iterator/detail/minimum_system.h +57 -0
  1412. cuda/cccl/headers/include/thrust/iterator/detail/normal_iterator.h +69 -0
  1413. cuda/cccl/headers/include/thrust/iterator/detail/retag.h +104 -0
  1414. cuda/cccl/headers/include/thrust/iterator/detail/tagged_iterator.h +80 -0
  1415. cuda/cccl/headers/include/thrust/iterator/detail/tuple_of_iterator_references.h +170 -0
  1416. cuda/cccl/headers/include/thrust/iterator/discard_iterator.h +163 -0
  1417. cuda/cccl/headers/include/thrust/iterator/iterator_adaptor.h +251 -0
  1418. cuda/cccl/headers/include/thrust/iterator/iterator_categories.h +211 -0
  1419. cuda/cccl/headers/include/thrust/iterator/iterator_facade.h +659 -0
  1420. cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +334 -0
  1421. cuda/cccl/headers/include/thrust/iterator/iterator_traversal_tags.h +64 -0
  1422. cuda/cccl/headers/include/thrust/iterator/offset_iterator.h +194 -0
  1423. cuda/cccl/headers/include/thrust/iterator/permutation_iterator.h +204 -0
  1424. cuda/cccl/headers/include/thrust/iterator/retag.h +72 -0
  1425. cuda/cccl/headers/include/thrust/iterator/reverse_iterator.h +51 -0
  1426. cuda/cccl/headers/include/thrust/iterator/shuffle_iterator.h +185 -0
  1427. cuda/cccl/headers/include/thrust/iterator/strided_iterator.h +152 -0
  1428. cuda/cccl/headers/include/thrust/iterator/tabulate_output_iterator.h +152 -0
  1429. cuda/cccl/headers/include/thrust/iterator/transform_input_output_iterator.h +225 -0
  1430. cuda/cccl/headers/include/thrust/iterator/transform_iterator.h +349 -0
  1431. cuda/cccl/headers/include/thrust/iterator/transform_output_iterator.h +190 -0
  1432. cuda/cccl/headers/include/thrust/iterator/zip_iterator.h +359 -0
  1433. cuda/cccl/headers/include/thrust/logical.h +290 -0
  1434. cuda/cccl/headers/include/thrust/memory.h +299 -0
  1435. cuda/cccl/headers/include/thrust/merge.h +725 -0
  1436. cuda/cccl/headers/include/thrust/mismatch.h +261 -0
  1437. cuda/cccl/headers/include/thrust/mr/allocator.h +227 -0
  1438. cuda/cccl/headers/include/thrust/mr/device_memory_resource.h +46 -0
  1439. cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +526 -0
  1440. cuda/cccl/headers/include/thrust/mr/disjoint_sync_pool.h +116 -0
  1441. cuda/cccl/headers/include/thrust/mr/disjoint_tls_pool.h +65 -0
  1442. cuda/cccl/headers/include/thrust/mr/fancy_pointer_resource.h +65 -0
  1443. cuda/cccl/headers/include/thrust/mr/host_memory_resource.h +42 -0
  1444. cuda/cccl/headers/include/thrust/mr/memory_resource.h +215 -0
  1445. cuda/cccl/headers/include/thrust/mr/new.h +98 -0
  1446. cuda/cccl/headers/include/thrust/mr/polymorphic_adaptor.h +61 -0
  1447. cuda/cccl/headers/include/thrust/mr/pool.h +526 -0
  1448. cuda/cccl/headers/include/thrust/mr/pool_options.h +172 -0
  1449. cuda/cccl/headers/include/thrust/mr/sync_pool.h +112 -0
  1450. cuda/cccl/headers/include/thrust/mr/tls_pool.h +62 -0
  1451. cuda/cccl/headers/include/thrust/mr/universal_memory_resource.h +29 -0
  1452. cuda/cccl/headers/include/thrust/mr/validator.h +54 -0
  1453. cuda/cccl/headers/include/thrust/pair.h +99 -0
  1454. cuda/cccl/headers/include/thrust/partition.h +1391 -0
  1455. cuda/cccl/headers/include/thrust/per_device_resource.h +110 -0
  1456. cuda/cccl/headers/include/thrust/random/detail/discard_block_engine.inl +182 -0
  1457. cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine.inl +153 -0
  1458. cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine_discard.h +97 -0
  1459. cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine.inl +149 -0
  1460. cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h +46 -0
  1461. cuda/cccl/headers/include/thrust/random/detail/mod.h +94 -0
  1462. cuda/cccl/headers/include/thrust/random/detail/normal_distribution.inl +185 -0
  1463. cuda/cccl/headers/include/thrust/random/detail/normal_distribution_base.h +155 -0
  1464. cuda/cccl/headers/include/thrust/random/detail/random_core_access.h +56 -0
  1465. cuda/cccl/headers/include/thrust/random/detail/subtract_with_carry_engine.inl +199 -0
  1466. cuda/cccl/headers/include/thrust/random/detail/uniform_int_distribution.inl +196 -0
  1467. cuda/cccl/headers/include/thrust/random/detail/uniform_real_distribution.inl +198 -0
  1468. cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine.inl +181 -0
  1469. cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine_max.h +187 -0
  1470. cuda/cccl/headers/include/thrust/random/discard_block_engine.h +238 -0
  1471. cuda/cccl/headers/include/thrust/random/linear_congruential_engine.h +287 -0
  1472. cuda/cccl/headers/include/thrust/random/linear_feedback_shift_engine.h +215 -0
  1473. cuda/cccl/headers/include/thrust/random/normal_distribution.h +255 -0
  1474. cuda/cccl/headers/include/thrust/random/subtract_with_carry_engine.h +245 -0
  1475. cuda/cccl/headers/include/thrust/random/uniform_int_distribution.h +259 -0
  1476. cuda/cccl/headers/include/thrust/random/uniform_real_distribution.h +256 -0
  1477. cuda/cccl/headers/include/thrust/random/xor_combine_engine.h +253 -0
  1478. cuda/cccl/headers/include/thrust/random.h +118 -0
  1479. cuda/cccl/headers/include/thrust/reduce.h +1113 -0
  1480. cuda/cccl/headers/include/thrust/remove.h +768 -0
  1481. cuda/cccl/headers/include/thrust/replace.h +826 -0
  1482. cuda/cccl/headers/include/thrust/reverse.h +215 -0
  1483. cuda/cccl/headers/include/thrust/scan.h +1671 -0
  1484. cuda/cccl/headers/include/thrust/scatter.h +446 -0
  1485. cuda/cccl/headers/include/thrust/sequence.h +277 -0
  1486. cuda/cccl/headers/include/thrust/set_operations.h +3026 -0
  1487. cuda/cccl/headers/include/thrust/shuffle.h +182 -0
  1488. cuda/cccl/headers/include/thrust/sort.h +1320 -0
  1489. cuda/cccl/headers/include/thrust/swap.h +147 -0
  1490. cuda/cccl/headers/include/thrust/system/cpp/detail/adjacent_difference.h +30 -0
  1491. cuda/cccl/headers/include/thrust/system/cpp/detail/assign_value.h +30 -0
  1492. cuda/cccl/headers/include/thrust/system/cpp/detail/binary_search.h +32 -0
  1493. cuda/cccl/headers/include/thrust/system/cpp/detail/copy.h +30 -0
  1494. cuda/cccl/headers/include/thrust/system/cpp/detail/copy_if.h +30 -0
  1495. cuda/cccl/headers/include/thrust/system/cpp/detail/count.h +29 -0
  1496. cuda/cccl/headers/include/thrust/system/cpp/detail/equal.h +29 -0
  1497. cuda/cccl/headers/include/thrust/system/cpp/detail/execution_policy.h +113 -0
  1498. cuda/cccl/headers/include/thrust/system/cpp/detail/extrema.h +30 -0
  1499. cuda/cccl/headers/include/thrust/system/cpp/detail/fill.h +29 -0
  1500. cuda/cccl/headers/include/thrust/system/cpp/detail/find.h +30 -0
  1501. cuda/cccl/headers/include/thrust/system/cpp/detail/for_each.h +30 -0
  1502. cuda/cccl/headers/include/thrust/system/cpp/detail/gather.h +29 -0
  1503. cuda/cccl/headers/include/thrust/system/cpp/detail/generate.h +29 -0
  1504. cuda/cccl/headers/include/thrust/system/cpp/detail/get_value.h +30 -0
  1505. cuda/cccl/headers/include/thrust/system/cpp/detail/inner_product.h +29 -0
  1506. cuda/cccl/headers/include/thrust/system/cpp/detail/iter_swap.h +30 -0
  1507. cuda/cccl/headers/include/thrust/system/cpp/detail/logical.h +29 -0
  1508. cuda/cccl/headers/include/thrust/system/cpp/detail/malloc_and_free.h +30 -0
  1509. cuda/cccl/headers/include/thrust/system/cpp/detail/memory.inl +55 -0
  1510. cuda/cccl/headers/include/thrust/system/cpp/detail/merge.h +30 -0
  1511. cuda/cccl/headers/include/thrust/system/cpp/detail/mismatch.h +29 -0
  1512. cuda/cccl/headers/include/thrust/system/cpp/detail/partition.h +30 -0
  1513. cuda/cccl/headers/include/thrust/system/cpp/detail/per_device_resource.h +29 -0
  1514. cuda/cccl/headers/include/thrust/system/cpp/detail/reduce.h +30 -0
  1515. cuda/cccl/headers/include/thrust/system/cpp/detail/reduce_by_key.h +30 -0
  1516. cuda/cccl/headers/include/thrust/system/cpp/detail/remove.h +30 -0
  1517. cuda/cccl/headers/include/thrust/system/cpp/detail/replace.h +29 -0
  1518. cuda/cccl/headers/include/thrust/system/cpp/detail/reverse.h +29 -0
  1519. cuda/cccl/headers/include/thrust/system/cpp/detail/scan.h +30 -0
  1520. cuda/cccl/headers/include/thrust/system/cpp/detail/scan_by_key.h +30 -0
  1521. cuda/cccl/headers/include/thrust/system/cpp/detail/scatter.h +29 -0
  1522. cuda/cccl/headers/include/thrust/system/cpp/detail/sequence.h +29 -0
  1523. cuda/cccl/headers/include/thrust/system/cpp/detail/set_operations.h +30 -0
  1524. cuda/cccl/headers/include/thrust/system/cpp/detail/sort.h +30 -0
  1525. cuda/cccl/headers/include/thrust/system/cpp/detail/swap_ranges.h +29 -0
  1526. cuda/cccl/headers/include/thrust/system/cpp/detail/tabulate.h +29 -0
  1527. cuda/cccl/headers/include/thrust/system/cpp/detail/temporary_buffer.h +29 -0
  1528. cuda/cccl/headers/include/thrust/system/cpp/detail/transform.h +29 -0
  1529. cuda/cccl/headers/include/thrust/system/cpp/detail/transform_reduce.h +29 -0
  1530. cuda/cccl/headers/include/thrust/system/cpp/detail/transform_scan.h +29 -0
  1531. cuda/cccl/headers/include/thrust/system/cpp/detail/uninitialized_copy.h +29 -0
  1532. cuda/cccl/headers/include/thrust/system/cpp/detail/uninitialized_fill.h +29 -0
  1533. cuda/cccl/headers/include/thrust/system/cpp/detail/unique.h +30 -0
  1534. cuda/cccl/headers/include/thrust/system/cpp/detail/unique_by_key.h +30 -0
  1535. cuda/cccl/headers/include/thrust/system/cpp/execution_policy.h +63 -0
  1536. cuda/cccl/headers/include/thrust/system/cpp/memory.h +105 -0
  1537. cuda/cccl/headers/include/thrust/system/cpp/memory_resource.h +70 -0
  1538. cuda/cccl/headers/include/thrust/system/cpp/pointer.h +118 -0
  1539. cuda/cccl/headers/include/thrust/system/cpp/vector.h +95 -0
  1540. cuda/cccl/headers/include/thrust/system/cuda/config.h +126 -0
  1541. cuda/cccl/headers/include/thrust/system/cuda/detail/adjacent_difference.h +215 -0
  1542. cuda/cccl/headers/include/thrust/system/cuda/detail/assign_value.h +124 -0
  1543. cuda/cccl/headers/include/thrust/system/cuda/detail/binary_search.h +29 -0
  1544. cuda/cccl/headers/include/thrust/system/cuda/detail/cdp_dispatch.h +72 -0
  1545. cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +272 -0
  1546. cuda/cccl/headers/include/thrust/system/cuda/detail/copy_if.h +251 -0
  1547. cuda/cccl/headers/include/thrust/system/cuda/detail/core/agent_launcher.h +282 -0
  1548. cuda/cccl/headers/include/thrust/system/cuda/detail/core/triple_chevron_launch.h +163 -0
  1549. cuda/cccl/headers/include/thrust/system/cuda/detail/core/util.h +586 -0
  1550. cuda/cccl/headers/include/thrust/system/cuda/detail/count.h +73 -0
  1551. cuda/cccl/headers/include/thrust/system/cuda/detail/cross_system.h +241 -0
  1552. cuda/cccl/headers/include/thrust/system/cuda/detail/dispatch.h +231 -0
  1553. cuda/cccl/headers/include/thrust/system/cuda/detail/equal.h +62 -0
  1554. cuda/cccl/headers/include/thrust/system/cuda/detail/error.inl +87 -0
  1555. cuda/cccl/headers/include/thrust/system/cuda/detail/execution_policy.h +266 -0
  1556. cuda/cccl/headers/include/thrust/system/cuda/detail/extrema.h +472 -0
  1557. cuda/cccl/headers/include/thrust/system/cuda/detail/fill.h +99 -0
  1558. cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +165 -0
  1559. cuda/cccl/headers/include/thrust/system/cuda/detail/for_each.h +82 -0
  1560. cuda/cccl/headers/include/thrust/system/cuda/detail/gather.h +89 -0
  1561. cuda/cccl/headers/include/thrust/system/cuda/detail/generate.h +58 -0
  1562. cuda/cccl/headers/include/thrust/system/cuda/detail/get_value.h +65 -0
  1563. cuda/cccl/headers/include/thrust/system/cuda/detail/inner_product.h +75 -0
  1564. cuda/cccl/headers/include/thrust/system/cuda/detail/iter_swap.h +79 -0
  1565. cuda/cccl/headers/include/thrust/system/cuda/detail/logical.h +29 -0
  1566. cuda/cccl/headers/include/thrust/system/cuda/detail/make_unsigned_special.h +55 -0
  1567. cuda/cccl/headers/include/thrust/system/cuda/detail/malloc_and_free.h +119 -0
  1568. cuda/cccl/headers/include/thrust/system/cuda/detail/memory.inl +55 -0
  1569. cuda/cccl/headers/include/thrust/system/cuda/detail/merge.h +204 -0
  1570. cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +88 -0
  1571. cuda/cccl/headers/include/thrust/system/cuda/detail/parallel_for.h +79 -0
  1572. cuda/cccl/headers/include/thrust/system/cuda/detail/partition.h +401 -0
  1573. cuda/cccl/headers/include/thrust/system/cuda/detail/per_device_resource.h +70 -0
  1574. cuda/cccl/headers/include/thrust/system/cuda/detail/reduce.h +780 -0
  1575. cuda/cccl/headers/include/thrust/system/cuda/detail/reduce_by_key.h +997 -0
  1576. cuda/cccl/headers/include/thrust/system/cuda/detail/remove.h +105 -0
  1577. cuda/cccl/headers/include/thrust/system/cuda/detail/replace.h +121 -0
  1578. cuda/cccl/headers/include/thrust/system/cuda/detail/reverse.h +83 -0
  1579. cuda/cccl/headers/include/thrust/system/cuda/detail/scan.h +338 -0
  1580. cuda/cccl/headers/include/thrust/system/cuda/detail/scan_by_key.h +411 -0
  1581. cuda/cccl/headers/include/thrust/system/cuda/detail/scatter.h +89 -0
  1582. cuda/cccl/headers/include/thrust/system/cuda/detail/sequence.h +29 -0
  1583. cuda/cccl/headers/include/thrust/system/cuda/detail/set_operations.h +1732 -0
  1584. cuda/cccl/headers/include/thrust/system/cuda/detail/sort.h +468 -0
  1585. cuda/cccl/headers/include/thrust/system/cuda/detail/swap_ranges.h +98 -0
  1586. cuda/cccl/headers/include/thrust/system/cuda/detail/tabulate.h +61 -0
  1587. cuda/cccl/headers/include/thrust/system/cuda/detail/temporary_buffer.h +130 -0
  1588. cuda/cccl/headers/include/thrust/system/cuda/detail/terminate.h +53 -0
  1589. cuda/cccl/headers/include/thrust/system/cuda/detail/transform.h +428 -0
  1590. cuda/cccl/headers/include/thrust/system/cuda/detail/transform_reduce.h +139 -0
  1591. cuda/cccl/headers/include/thrust/system/cuda/detail/transform_scan.h +117 -0
  1592. cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_copy.h +111 -0
  1593. cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_fill.h +100 -0
  1594. cuda/cccl/headers/include/thrust/system/cuda/detail/unique.h +286 -0
  1595. cuda/cccl/headers/include/thrust/system/cuda/detail/unique_by_key.h +306 -0
  1596. cuda/cccl/headers/include/thrust/system/cuda/detail/util.h +253 -0
  1597. cuda/cccl/headers/include/thrust/system/cuda/error.h +159 -0
  1598. cuda/cccl/headers/include/thrust/system/cuda/execution_policy.h +15 -0
  1599. cuda/cccl/headers/include/thrust/system/cuda/memory.h +118 -0
  1600. cuda/cccl/headers/include/thrust/system/cuda/memory_resource.h +115 -0
  1601. cuda/cccl/headers/include/thrust/system/cuda/pointer.h +155 -0
  1602. cuda/cccl/headers/include/thrust/system/cuda/vector.h +104 -0
  1603. cuda/cccl/headers/include/thrust/system/detail/bad_alloc.h +59 -0
  1604. cuda/cccl/headers/include/thrust/system/detail/errno.h +118 -0
  1605. cuda/cccl/headers/include/thrust/system/detail/error_category.inl +298 -0
  1606. cuda/cccl/headers/include/thrust/system/detail/error_code.inl +171 -0
  1607. cuda/cccl/headers/include/thrust/system/detail/error_condition.inl +119 -0
  1608. cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.h +51 -0
  1609. cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.inl +77 -0
  1610. cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.h +159 -0
  1611. cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.inl +381 -0
  1612. cuda/cccl/headers/include/thrust/system/detail/generic/copy.h +43 -0
  1613. cuda/cccl/headers/include/thrust/system/detail/generic/copy.inl +62 -0
  1614. cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.h +56 -0
  1615. cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.inl +143 -0
  1616. cuda/cccl/headers/include/thrust/system/detail/generic/count.h +46 -0
  1617. cuda/cccl/headers/include/thrust/system/detail/generic/count.inl +82 -0
  1618. cuda/cccl/headers/include/thrust/system/detail/generic/equal.h +47 -0
  1619. cuda/cccl/headers/include/thrust/system/detail/generic/equal.inl +58 -0
  1620. cuda/cccl/headers/include/thrust/system/detail/generic/extrema.h +64 -0
  1621. cuda/cccl/headers/include/thrust/system/detail/generic/extrema.inl +249 -0
  1622. cuda/cccl/headers/include/thrust/system/detail/generic/fill.h +52 -0
  1623. cuda/cccl/headers/include/thrust/system/detail/generic/find.h +47 -0
  1624. cuda/cccl/headers/include/thrust/system/detail/generic/find.inl +135 -0
  1625. cuda/cccl/headers/include/thrust/system/detail/generic/for_each.h +56 -0
  1626. cuda/cccl/headers/include/thrust/system/detail/generic/gather.h +71 -0
  1627. cuda/cccl/headers/include/thrust/system/detail/generic/gather.inl +94 -0
  1628. cuda/cccl/headers/include/thrust/system/detail/generic/generate.h +43 -0
  1629. cuda/cccl/headers/include/thrust/system/detail/generic/generate.inl +63 -0
  1630. cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.h +58 -0
  1631. cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.inl +70 -0
  1632. cuda/cccl/headers/include/thrust/system/detail/generic/logical.h +57 -0
  1633. cuda/cccl/headers/include/thrust/system/detail/generic/memory.h +62 -0
  1634. cuda/cccl/headers/include/thrust/system/detail/generic/memory.inl +83 -0
  1635. cuda/cccl/headers/include/thrust/system/detail/generic/merge.h +97 -0
  1636. cuda/cccl/headers/include/thrust/system/detail/generic/merge.inl +146 -0
  1637. cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.h +47 -0
  1638. cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.inl +66 -0
  1639. cuda/cccl/headers/include/thrust/system/detail/generic/partition.h +127 -0
  1640. cuda/cccl/headers/include/thrust/system/detail/generic/partition.inl +205 -0
  1641. cuda/cccl/headers/include/thrust/system/detail/generic/per_device_resource.h +41 -0
  1642. cuda/cccl/headers/include/thrust/system/detail/generic/reduce.h +69 -0
  1643. cuda/cccl/headers/include/thrust/system/detail/generic/reduce.inl +98 -0
  1644. cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.h +81 -0
  1645. cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.inl +183 -0
  1646. cuda/cccl/headers/include/thrust/system/detail/generic/remove.h +84 -0
  1647. cuda/cccl/headers/include/thrust/system/detail/generic/remove.inl +119 -0
  1648. cuda/cccl/headers/include/thrust/system/detail/generic/replace.h +93 -0
  1649. cuda/cccl/headers/include/thrust/system/detail/generic/replace.inl +172 -0
  1650. cuda/cccl/headers/include/thrust/system/detail/generic/reverse.h +46 -0
  1651. cuda/cccl/headers/include/thrust/system/detail/generic/reverse.inl +65 -0
  1652. cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.h +61 -0
  1653. cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.inl +124 -0
  1654. cuda/cccl/headers/include/thrust/system/detail/generic/scan.h +86 -0
  1655. cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.h +124 -0
  1656. cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.inl +229 -0
  1657. cuda/cccl/headers/include/thrust/system/detail/generic/scatter.h +71 -0
  1658. cuda/cccl/headers/include/thrust/system/detail/generic/scatter.inl +83 -0
  1659. cuda/cccl/headers/include/thrust/system/detail/generic/select_system.h +103 -0
  1660. cuda/cccl/headers/include/thrust/system/detail/generic/sequence.h +70 -0
  1661. cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.h +280 -0
  1662. cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.inl +474 -0
  1663. cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.h +52 -0
  1664. cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.inl +123 -0
  1665. cuda/cccl/headers/include/thrust/system/detail/generic/sort.h +111 -0
  1666. cuda/cccl/headers/include/thrust/system/detail/generic/sort.inl +173 -0
  1667. cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.h +42 -0
  1668. cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.inl +73 -0
  1669. cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.h +39 -0
  1670. cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.inl +52 -0
  1671. cuda/cccl/headers/include/thrust/system/detail/generic/tag.h +45 -0
  1672. cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.h +52 -0
  1673. cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.inl +80 -0
  1674. cuda/cccl/headers/include/thrust/system/detail/generic/transform.h +393 -0
  1675. cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.h +48 -0
  1676. cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.inl +54 -0
  1677. cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.h +78 -0
  1678. cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.inl +111 -0
  1679. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.h +43 -0
  1680. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.inl +164 -0
  1681. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.h +43 -0
  1682. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.inl +112 -0
  1683. cuda/cccl/headers/include/thrust/system/detail/generic/unique.h +69 -0
  1684. cuda/cccl/headers/include/thrust/system/detail/generic/unique.inl +111 -0
  1685. cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.h +79 -0
  1686. cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.inl +124 -0
  1687. cuda/cccl/headers/include/thrust/system/detail/internal/decompose.h +115 -0
  1688. cuda/cccl/headers/include/thrust/system/detail/sequential/adjacent_difference.h +68 -0
  1689. cuda/cccl/headers/include/thrust/system/detail/sequential/assign_value.h +40 -0
  1690. cuda/cccl/headers/include/thrust/system/detail/sequential/binary_search.h +134 -0
  1691. cuda/cccl/headers/include/thrust/system/detail/sequential/copy.h +120 -0
  1692. cuda/cccl/headers/include/thrust/system/detail/sequential/copy_backward.h +47 -0
  1693. cuda/cccl/headers/include/thrust/system/detail/sequential/copy_if.h +69 -0
  1694. cuda/cccl/headers/include/thrust/system/detail/sequential/count.h +29 -0
  1695. cuda/cccl/headers/include/thrust/system/detail/sequential/equal.h +29 -0
  1696. cuda/cccl/headers/include/thrust/system/detail/sequential/execution_policy.h +52 -0
  1697. cuda/cccl/headers/include/thrust/system/detail/sequential/extrema.h +108 -0
  1698. cuda/cccl/headers/include/thrust/system/detail/sequential/fill.h +29 -0
  1699. cuda/cccl/headers/include/thrust/system/detail/sequential/find.h +60 -0
  1700. cuda/cccl/headers/include/thrust/system/detail/sequential/for_each.h +72 -0
  1701. cuda/cccl/headers/include/thrust/system/detail/sequential/gather.h +29 -0
  1702. cuda/cccl/headers/include/thrust/system/detail/sequential/general_copy.h +120 -0
  1703. cuda/cccl/headers/include/thrust/system/detail/sequential/generate.h +29 -0
  1704. cuda/cccl/headers/include/thrust/system/detail/sequential/get_value.h +41 -0
  1705. cuda/cccl/headers/include/thrust/system/detail/sequential/inner_product.h +29 -0
  1706. cuda/cccl/headers/include/thrust/system/detail/sequential/insertion_sort.h +139 -0
  1707. cuda/cccl/headers/include/thrust/system/detail/sequential/iter_swap.h +43 -0
  1708. cuda/cccl/headers/include/thrust/system/detail/sequential/logical.h +29 -0
  1709. cuda/cccl/headers/include/thrust/system/detail/sequential/malloc_and_free.h +48 -0
  1710. cuda/cccl/headers/include/thrust/system/detail/sequential/merge.h +147 -0
  1711. cuda/cccl/headers/include/thrust/system/detail/sequential/mismatch.h +29 -0
  1712. cuda/cccl/headers/include/thrust/system/detail/sequential/partition.h +297 -0
  1713. cuda/cccl/headers/include/thrust/system/detail/sequential/per_device_resource.h +29 -0
  1714. cuda/cccl/headers/include/thrust/system/detail/sequential/reduce.h +62 -0
  1715. cuda/cccl/headers/include/thrust/system/detail/sequential/reduce_by_key.h +96 -0
  1716. cuda/cccl/headers/include/thrust/system/detail/sequential/remove.h +177 -0
  1717. cuda/cccl/headers/include/thrust/system/detail/sequential/replace.h +29 -0
  1718. cuda/cccl/headers/include/thrust/system/detail/sequential/reverse.h +29 -0
  1719. cuda/cccl/headers/include/thrust/system/detail/sequential/scan.h +152 -0
  1720. cuda/cccl/headers/include/thrust/system/detail/sequential/scan_by_key.h +143 -0
  1721. cuda/cccl/headers/include/thrust/system/detail/sequential/scatter.h +29 -0
  1722. cuda/cccl/headers/include/thrust/system/detail/sequential/sequence.h +29 -0
  1723. cuda/cccl/headers/include/thrust/system/detail/sequential/set_operations.h +204 -0
  1724. cuda/cccl/headers/include/thrust/system/detail/sequential/sort.h +120 -0
  1725. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.h +354 -0
  1726. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.h +121 -0
  1727. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.h +584 -0
  1728. cuda/cccl/headers/include/thrust/system/detail/sequential/swap_ranges.h +29 -0
  1729. cuda/cccl/headers/include/thrust/system/detail/sequential/tabulate.h +29 -0
  1730. cuda/cccl/headers/include/thrust/system/detail/sequential/temporary_buffer.h +29 -0
  1731. cuda/cccl/headers/include/thrust/system/detail/sequential/transform.h +29 -0
  1732. cuda/cccl/headers/include/thrust/system/detail/sequential/transform_reduce.h +29 -0
  1733. cuda/cccl/headers/include/thrust/system/detail/sequential/transform_scan.h +29 -0
  1734. cuda/cccl/headers/include/thrust/system/detail/sequential/trivial_copy.h +56 -0
  1735. cuda/cccl/headers/include/thrust/system/detail/sequential/uninitialized_copy.h +29 -0
  1736. cuda/cccl/headers/include/thrust/system/detail/sequential/uninitialized_fill.h +29 -0
  1737. cuda/cccl/headers/include/thrust/system/detail/sequential/unique.h +113 -0
  1738. cuda/cccl/headers/include/thrust/system/detail/sequential/unique_by_key.h +104 -0
  1739. cuda/cccl/headers/include/thrust/system/detail/system_error.inl +106 -0
  1740. cuda/cccl/headers/include/thrust/system/error_code.h +508 -0
  1741. cuda/cccl/headers/include/thrust/system/omp/detail/adjacent_difference.h +34 -0
  1742. cuda/cccl/headers/include/thrust/system/omp/detail/assign_value.h +17 -0
  1743. cuda/cccl/headers/include/thrust/system/omp/detail/binary_search.h +57 -0
  1744. cuda/cccl/headers/include/thrust/system/omp/detail/copy.h +61 -0
  1745. cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.h +39 -0
  1746. cuda/cccl/headers/include/thrust/system/omp/detail/count.h +17 -0
  1747. cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.h +48 -0
  1748. cuda/cccl/headers/include/thrust/system/omp/detail/equal.h +17 -0
  1749. cuda/cccl/headers/include/thrust/system/omp/detail/execution_policy.h +132 -0
  1750. cuda/cccl/headers/include/thrust/system/omp/detail/extrema.h +46 -0
  1751. cuda/cccl/headers/include/thrust/system/omp/detail/fill.h +17 -0
  1752. cuda/cccl/headers/include/thrust/system/omp/detail/find.h +33 -0
  1753. cuda/cccl/headers/include/thrust/system/omp/detail/for_each.h +73 -0
  1754. cuda/cccl/headers/include/thrust/system/omp/detail/gather.h +17 -0
  1755. cuda/cccl/headers/include/thrust/system/omp/detail/generate.h +17 -0
  1756. cuda/cccl/headers/include/thrust/system/omp/detail/get_value.h +17 -0
  1757. cuda/cccl/headers/include/thrust/system/omp/detail/inner_product.h +17 -0
  1758. cuda/cccl/headers/include/thrust/system/omp/detail/iter_swap.h +17 -0
  1759. cuda/cccl/headers/include/thrust/system/omp/detail/logical.h +17 -0
  1760. cuda/cccl/headers/include/thrust/system/omp/detail/malloc_and_free.h +17 -0
  1761. cuda/cccl/headers/include/thrust/system/omp/detail/merge.h +17 -0
  1762. cuda/cccl/headers/include/thrust/system/omp/detail/mismatch.h +17 -0
  1763. cuda/cccl/headers/include/thrust/system/omp/detail/partition.h +83 -0
  1764. cuda/cccl/headers/include/thrust/system/omp/detail/per_device_resource.h +16 -0
  1765. cuda/cccl/headers/include/thrust/system/omp/detail/pragma_omp.h +30 -0
  1766. cuda/cccl/headers/include/thrust/system/omp/detail/reduce.h +62 -0
  1767. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.h +49 -0
  1768. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.h +87 -0
  1769. cuda/cccl/headers/include/thrust/system/omp/detail/remove.h +67 -0
  1770. cuda/cccl/headers/include/thrust/system/omp/detail/replace.h +17 -0
  1771. cuda/cccl/headers/include/thrust/system/omp/detail/reverse.h +17 -0
  1772. cuda/cccl/headers/include/thrust/system/omp/detail/scan.h +189 -0
  1773. cuda/cccl/headers/include/thrust/system/omp/detail/scan_by_key.h +23 -0
  1774. cuda/cccl/headers/include/thrust/system/omp/detail/scatter.h +17 -0
  1775. cuda/cccl/headers/include/thrust/system/omp/detail/sequence.h +17 -0
  1776. cuda/cccl/headers/include/thrust/system/omp/detail/set_operations.h +17 -0
  1777. cuda/cccl/headers/include/thrust/system/omp/detail/sort.h +245 -0
  1778. cuda/cccl/headers/include/thrust/system/omp/detail/swap_ranges.h +17 -0
  1779. cuda/cccl/headers/include/thrust/system/omp/detail/tabulate.h +17 -0
  1780. cuda/cccl/headers/include/thrust/system/omp/detail/temporary_buffer.h +16 -0
  1781. cuda/cccl/headers/include/thrust/system/omp/detail/transform.h +17 -0
  1782. cuda/cccl/headers/include/thrust/system/omp/detail/transform_reduce.h +17 -0
  1783. cuda/cccl/headers/include/thrust/system/omp/detail/transform_scan.h +17 -0
  1784. cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_copy.h +17 -0
  1785. cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_fill.h +17 -0
  1786. cuda/cccl/headers/include/thrust/system/omp/detail/unique.h +51 -0
  1787. cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.h +55 -0
  1788. cuda/cccl/headers/include/thrust/system/omp/execution_policy.h +62 -0
  1789. cuda/cccl/headers/include/thrust/system/omp/memory.h +153 -0
  1790. cuda/cccl/headers/include/thrust/system/omp/memory_resource.h +71 -0
  1791. cuda/cccl/headers/include/thrust/system/omp/pointer.h +120 -0
  1792. cuda/cccl/headers/include/thrust/system/omp/vector.h +96 -0
  1793. cuda/cccl/headers/include/thrust/system/system_error.h +183 -0
  1794. cuda/cccl/headers/include/thrust/system/tbb/detail/adjacent_difference.h +33 -0
  1795. cuda/cccl/headers/include/thrust/system/tbb/detail/assign_value.h +17 -0
  1796. cuda/cccl/headers/include/thrust/system/tbb/detail/binary_search.h +17 -0
  1797. cuda/cccl/headers/include/thrust/system/tbb/detail/copy.h +59 -0
  1798. cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.h +114 -0
  1799. cuda/cccl/headers/include/thrust/system/tbb/detail/count.h +17 -0
  1800. cuda/cccl/headers/include/thrust/system/tbb/detail/equal.h +17 -0
  1801. cuda/cccl/headers/include/thrust/system/tbb/detail/execution_policy.h +113 -0
  1802. cuda/cccl/headers/include/thrust/system/tbb/detail/extrema.h +45 -0
  1803. cuda/cccl/headers/include/thrust/system/tbb/detail/fill.h +17 -0
  1804. cuda/cccl/headers/include/thrust/system/tbb/detail/find.h +28 -0
  1805. cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.h +70 -0
  1806. cuda/cccl/headers/include/thrust/system/tbb/detail/gather.h +17 -0
  1807. cuda/cccl/headers/include/thrust/system/tbb/detail/generate.h +17 -0
  1808. cuda/cccl/headers/include/thrust/system/tbb/detail/get_value.h +17 -0
  1809. cuda/cccl/headers/include/thrust/system/tbb/detail/inner_product.h +17 -0
  1810. cuda/cccl/headers/include/thrust/system/tbb/detail/iter_swap.h +17 -0
  1811. cuda/cccl/headers/include/thrust/system/tbb/detail/logical.h +17 -0
  1812. cuda/cccl/headers/include/thrust/system/tbb/detail/malloc_and_free.h +17 -0
  1813. cuda/cccl/headers/include/thrust/system/tbb/detail/merge.h +302 -0
  1814. cuda/cccl/headers/include/thrust/system/tbb/detail/mismatch.h +17 -0
  1815. cuda/cccl/headers/include/thrust/system/tbb/detail/partition.h +78 -0
  1816. cuda/cccl/headers/include/thrust/system/tbb/detail/per_device_resource.h +16 -0
  1817. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.h +120 -0
  1818. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.h +378 -0
  1819. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_intervals.h +118 -0
  1820. cuda/cccl/headers/include/thrust/system/tbb/detail/remove.h +66 -0
  1821. cuda/cccl/headers/include/thrust/system/tbb/detail/replace.h +17 -0
  1822. cuda/cccl/headers/include/thrust/system/tbb/detail/reverse.h +17 -0
  1823. cuda/cccl/headers/include/thrust/system/tbb/detail/scan.h +294 -0
  1824. cuda/cccl/headers/include/thrust/system/tbb/detail/scan_by_key.h +20 -0
  1825. cuda/cccl/headers/include/thrust/system/tbb/detail/scatter.h +17 -0
  1826. cuda/cccl/headers/include/thrust/system/tbb/detail/sequence.h +17 -0
  1827. cuda/cccl/headers/include/thrust/system/tbb/detail/set_operations.h +17 -0
  1828. cuda/cccl/headers/include/thrust/system/tbb/detail/sort.h +272 -0
  1829. cuda/cccl/headers/include/thrust/system/tbb/detail/swap_ranges.h +17 -0
  1830. cuda/cccl/headers/include/thrust/system/tbb/detail/tabulate.h +17 -0
  1831. cuda/cccl/headers/include/thrust/system/tbb/detail/temporary_buffer.h +16 -0
  1832. cuda/cccl/headers/include/thrust/system/tbb/detail/transform.h +17 -0
  1833. cuda/cccl/headers/include/thrust/system/tbb/detail/transform_reduce.h +17 -0
  1834. cuda/cccl/headers/include/thrust/system/tbb/detail/transform_scan.h +17 -0
  1835. cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_copy.h +17 -0
  1836. cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_fill.h +17 -0
  1837. cuda/cccl/headers/include/thrust/system/tbb/detail/unique.h +50 -0
  1838. cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.h +54 -0
  1839. cuda/cccl/headers/include/thrust/system/tbb/execution_policy.h +62 -0
  1840. cuda/cccl/headers/include/thrust/system/tbb/memory.h +139 -0
  1841. cuda/cccl/headers/include/thrust/system/tbb/memory_resource.h +57 -0
  1842. cuda/cccl/headers/include/thrust/system/tbb/pointer.h +106 -0
  1843. cuda/cccl/headers/include/thrust/system/tbb/vector.h +82 -0
  1844. cuda/cccl/headers/include/thrust/system_error.h +57 -0
  1845. cuda/cccl/headers/include/thrust/tabulate.h +125 -0
  1846. cuda/cccl/headers/include/thrust/transform.h +1056 -0
  1847. cuda/cccl/headers/include/thrust/transform_reduce.h +190 -0
  1848. cuda/cccl/headers/include/thrust/transform_scan.h +442 -0
  1849. cuda/cccl/headers/include/thrust/tuple.h +139 -0
  1850. cuda/cccl/headers/include/thrust/type_traits/integer_sequence.h +257 -0
  1851. cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +153 -0
  1852. cuda/cccl/headers/include/thrust/type_traits/is_execution_policy.h +65 -0
  1853. cuda/cccl/headers/include/thrust/type_traits/is_operator_less_or_greater_function_object.h +180 -0
  1854. cuda/cccl/headers/include/thrust/type_traits/is_operator_plus_function_object.h +112 -0
  1855. cuda/cccl/headers/include/thrust/type_traits/is_trivially_relocatable.h +332 -0
  1856. cuda/cccl/headers/include/thrust/type_traits/logical_metafunctions.h +42 -0
  1857. cuda/cccl/headers/include/thrust/type_traits/unwrap_contiguous_iterator.h +63 -0
  1858. cuda/cccl/headers/include/thrust/uninitialized_copy.h +300 -0
  1859. cuda/cccl/headers/include/thrust/uninitialized_fill.h +268 -0
  1860. cuda/cccl/headers/include/thrust/unique.h +1088 -0
  1861. cuda/cccl/headers/include/thrust/universal_allocator.h +101 -0
  1862. cuda/cccl/headers/include/thrust/universal_ptr.h +34 -0
  1863. cuda/cccl/headers/include/thrust/universal_vector.h +80 -0
  1864. cuda/cccl/headers/include/thrust/version.h +93 -0
  1865. cuda/cccl/headers/include/thrust/zip_function.h +150 -0
  1866. cuda/cccl/headers/include_paths.py +51 -0
  1867. cuda/cccl/parallel/__init__.py +9 -0
  1868. cuda/cccl/parallel/experimental/__init__.py +24 -0
  1869. cuda/cccl/py.typed +0 -0
  1870. cuda/compute/__init__.py +83 -0
  1871. cuda/compute/_bindings.py +79 -0
  1872. cuda/compute/_bindings.pyi +498 -0
  1873. cuda/compute/_bindings_impl.pyx +2415 -0
  1874. cuda/compute/_caching.py +71 -0
  1875. cuda/compute/_cccl_interop.py +422 -0
  1876. cuda/compute/_utils/__init__.py +0 -0
  1877. cuda/compute/_utils/protocols.py +132 -0
  1878. cuda/compute/_utils/temp_storage_buffer.py +86 -0
  1879. cuda/compute/algorithms/__init__.py +58 -0
  1880. cuda/compute/algorithms/_histogram.py +243 -0
  1881. cuda/compute/algorithms/_reduce.py +182 -0
  1882. cuda/compute/algorithms/_scan.py +331 -0
  1883. cuda/compute/algorithms/_segmented_reduce.py +257 -0
  1884. cuda/compute/algorithms/_sort/__init__.py +23 -0
  1885. cuda/compute/algorithms/_sort/_merge_sort.py +225 -0
  1886. cuda/compute/algorithms/_sort/_radix_sort.py +263 -0
  1887. cuda/compute/algorithms/_sort/_segmented_sort.py +288 -0
  1888. cuda/compute/algorithms/_sort/_sort_common.py +52 -0
  1889. cuda/compute/algorithms/_three_way_partition.py +261 -0
  1890. cuda/compute/algorithms/_transform.py +329 -0
  1891. cuda/compute/algorithms/_unique_by_key.py +252 -0
  1892. cuda/compute/cccl/.gitkeep +0 -0
  1893. cuda/compute/cu12/_bindings_impl.cpython-311-aarch64-linux-gnu.so +0 -0
  1894. cuda/compute/cu12/cccl/libcccl.c.parallel.so +0 -0
  1895. cuda/compute/cu13/_bindings_impl.cpython-311-aarch64-linux-gnu.so +0 -0
  1896. cuda/compute/cu13/cccl/libcccl.c.parallel.so +0 -0
  1897. cuda/compute/iterators/__init__.py +21 -0
  1898. cuda/compute/iterators/_factories.py +219 -0
  1899. cuda/compute/iterators/_iterators.py +817 -0
  1900. cuda/compute/iterators/_zip_iterator.py +199 -0
  1901. cuda/compute/numba_utils.py +53 -0
  1902. cuda/compute/op.py +3 -0
  1903. cuda/compute/struct.py +272 -0
  1904. cuda/compute/typing.py +37 -0
  1905. cuda/coop/__init__.py +8 -0
  1906. cuda/coop/_caching.py +48 -0
  1907. cuda/coop/_common.py +275 -0
  1908. cuda/coop/_nvrtc.py +92 -0
  1909. cuda/coop/_scan_op.py +181 -0
  1910. cuda/coop/_types.py +937 -0
  1911. cuda/coop/_typing.py +107 -0
  1912. cuda/coop/block/__init__.py +39 -0
  1913. cuda/coop/block/_block_exchange.py +251 -0
  1914. cuda/coop/block/_block_load_store.py +215 -0
  1915. cuda/coop/block/_block_merge_sort.py +125 -0
  1916. cuda/coop/block/_block_radix_sort.py +214 -0
  1917. cuda/coop/block/_block_reduce.py +294 -0
  1918. cuda/coop/block/_block_scan.py +983 -0
  1919. cuda/coop/warp/__init__.py +9 -0
  1920. cuda/coop/warp/_warp_merge_sort.py +92 -0
  1921. cuda/coop/warp/_warp_reduce.py +153 -0
  1922. cuda/coop/warp/_warp_scan.py +78 -0
  1923. cuda_cccl-0.3.4.dist-info/METADATA +78 -0
  1924. cuda_cccl-0.3.4.dist-info/RECORD +1926 -0
  1925. cuda_cccl-0.3.4.dist-info/WHEEL +5 -0
  1926. cuda_cccl-0.3.4.dist-info/licenses/LICENSE +1 -0
@@ -0,0 +1,2187 @@
1
+ // SPDX-FileCopyrightText: Copyright (c) 2011, Duane Merrill. All rights reserved.
2
+ // SPDX-FileCopyrightText: Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved.
3
+ // SPDX-License-Identifier: BSD-3
4
+
5
+ //! @file
6
+ //! cub::DeviceScan provides device-wide, parallel operations for computing a prefix scan across a sequence of data
7
+ //! items residing within device-accessible memory.
8
+
9
+ #pragma once
10
+
11
+ #include <cub/config.cuh>
12
+
13
+ #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
14
+ # pragma GCC system_header
15
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
16
+ # pragma clang system_header
17
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
18
+ # pragma system_header
19
+ #endif // no system header
20
+
21
+ #include <cub/detail/choose_offset.cuh>
22
+ #include <cub/detail/device_memory_resource.cuh>
23
+ #include <cub/detail/temporary_storage.cuh>
24
+ #include <cub/device/dispatch/dispatch_scan.cuh>
25
+ #include <cub/device/dispatch/dispatch_scan_by_key.cuh>
26
+ #include <cub/thread/thread_operators.cuh>
27
+
28
+ #include <cuda/__execution/determinism.h>
29
+ #include <cuda/__execution/require.h>
30
+ #include <cuda/__execution/tune.h>
31
+ #include <cuda/__memory_resource/get_memory_resource.h>
32
+ #include <cuda/__stream/get_stream.h>
33
+ #include <cuda/std/__execution/env.h>
34
+ #include <cuda/std/__functional/invoke.h>
35
+
36
+ CUB_NAMESPACE_BEGIN
37
+
38
+ namespace detail::scan
39
+ {
40
+ struct get_tuning_query_t // Empty tag type: the query key under which a scan tuning is looked up
41
+ {};
42
+
43
+ template <class Derived>
44
+ struct tuning // CRTP base: gives every derived tuning a query() overload keyed on get_tuning_query_t
45
+ {
46
+ [[nodiscard]] _CCCL_NODEBUG_API constexpr Derived query(const get_tuning_query_t&) const noexcept
47
+ {
48
+ return static_cast<const Derived&>(*this); // the derived tuning object itself, returned by value
49
+ }
50
+ };
51
+
52
+ struct default_tuning : tuning<default_tuning> // Fallback tuning named by DeviceScan::scan_impl_determinism
53
+ {
54
+ template <typename InputValueT, typename OutputValueT, typename AccumT, typename OffsetT, typename ScanOpT>
55
+ using fn = policy_hub<InputValueT, OutputValueT, AccumT, OffsetT, ScanOpT>; // forwards all five types unchanged
56
+ };
57
+ } // namespace detail::scan
58
+
59
+ //! @rst
60
+ //! DeviceScan provides device-wide, parallel operations for computing a
61
+ //! prefix scan across a sequence of data items residing within
62
+ //! device-accessible memory.
63
+ //!
64
+ //! Overview
65
+ //! +++++++++++++++++++++++++++++++++++++++++++++
66
+ //!
67
+ //! Given a sequence of input elements and a binary reduction operator, a
68
+ //! `prefix scan <http://en.wikipedia.org/wiki/Prefix_sum>`_ produces an output
69
+ //! sequence where each element is computed to be the reduction of the elements
70
+ //! occurring earlier in the input sequence. *Prefix sum* connotes a prefix scan
71
+ //! with the addition operator. The term *inclusive* indicates that the
72
+ //! *i*\ :sup:`th` output reduction incorporates the *i*\ :sup:`th` input.
73
+ //! The term *exclusive* indicates the *i*\ :sup:`th` input is not
74
+ //! incorporated into the *i*\ :sup:`th` output reduction. When the input and
75
+ //! output sequences are the same, the scan is performed in-place.
76
+ //!
77
+ //! In order to provide an efficient parallel implementation, the binary reduction operator must be associative. That
78
+ //! is, ``op(op(a, b), c)`` must be equivalent to ``op(a, op(b, c))`` for any input values ``a``, ``b``, and ``c``.
79
+ //!
80
+ //! As of CUB 1.0.1 (2013), CUB's device-wide scan APIs have implemented our
81
+ //! *"decoupled look-back"* algorithm for performing global prefix scan with
82
+ //! only a single pass through the input data, as described in our 2016 technical
83
+ //! report [1]_. The central idea is to leverage a small, constant factor of
84
+ //! redundant work in order to overlap the latencies of global prefix
85
+ //! propagation with local computation. As such, our algorithm requires only
86
+ //! ``~2n`` data movement (``n`` inputs are read, ``n`` outputs are written), and
87
+ //! typically proceeds at "memcpy" speeds. Our algorithm supports inplace operations.
88
+ //!
89
+ //! .. [1] Duane Merrill and Michael Garland. `Single-pass Parallel Prefix Scan with Decoupled Look-back
90
+ //! <https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back>`_,
91
+ //! *NVIDIA Technical Report NVR-2016-002*, 2016.
92
+ //!
93
+ //! Usage Considerations
94
+ //! +++++++++++++++++++++++++++++++++++++++++++++
95
+ //!
96
+ //! @cdp_class{DeviceScan}
97
+ //!
98
+ //! Performance
99
+ //! +++++++++++++++++++++++++++++++++++++++++++++
100
+ //!
101
+ //! @linear_performance{prefix scan}
102
+ //!
103
+ //! @endrst
104
+ struct DeviceScan
105
+ {
106
+ //! @cond
107
+ template <typename TuningEnvT,
108
+ typename InputIteratorT,
109
+ typename OutputIteratorT,
110
+ typename ScanOpT,
111
+ typename InitValueT,
112
+ typename NumItemsT,
113
+ ::cuda::execution::determinism::__determinism_t Determinism,
114
+ ForceInclusive EnforceInclusive = ForceInclusive::No>
115
+ CUB_RUNTIME_FUNCTION static cudaError_t scan_impl_determinism(
116
+ void* d_temp_storage,
117
+ size_t& temp_storage_bytes,
118
+ InputIteratorT d_in,
119
+ OutputIteratorT d_out,
120
+ ScanOpT scan_op,
121
+ InitValueT init,
122
+ NumItemsT num_items,
123
+ ::cuda::execution::determinism::__determinism_holder_t<Determinism>, // unnamed tag: lets callers deduce Determinism; never read in the body
124
+ cudaStream_t stream)
125
+ { // Derives the tuning, offset, accumulator and policy types, then forwards all arguments to DispatchScan::Dispatch
126
+ using scan_tuning_t = ::cuda::std::execution::
127
+ __query_result_or_t<TuningEnvT, detail::scan::get_tuning_query_t, detail::scan::default_tuning>; // presumably TuningEnvT's answer to the tuning query, else default_tuning - confirm
128
+
129
+ // Unsigned integer type for global offsets
130
+ using offset_t = detail::choose_offset_t<NumItemsT>;
131
+
132
+ using accum_t =
133
+ ::cuda::std::__accumulator_t<ScanOpT,
134
+ cub::detail::it_value_t<InputIteratorT>,
135
+ ::cuda::std::_If<::cuda::std::is_same_v<InitValueT, NullType>,
136
+ cub::detail::it_value_t<InputIteratorT>,
137
+ typename InitValueT::value_type>>; // init operand: the input value type when InitValueT is NullType, else InitValueT::value_type
138
+
139
+ using policy_t = typename scan_tuning_t::
140
+ template fn<detail::it_value_t<InputIteratorT>, detail::it_value_t<OutputIteratorT>, accum_t, offset_t, ScanOpT>;
141
+
142
+ using dispatch_t =
143
+ DispatchScan<InputIteratorT, OutputIteratorT, ScanOpT, InitValueT, offset_t, accum_t, EnforceInclusive, policy_t>;
144
+
145
+ return dispatch_t::Dispatch(
146
+ d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, init, static_cast<offset_t>(num_items), stream); // num_items is cast to offset_t explicitly
147
+ }
148
+ //! @endcond
149
+
150
+ //! @cond
151
+ template <typename InputIteratorT,
152
+ typename OutputIteratorT,
153
+ typename ScanOpT,
154
+ typename InitValueT,
155
+ typename NumItemsT,
156
+ ForceInclusive EnforceInclusive = ForceInclusive::No,
157
+ typename EnvT>
158
+ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t scan_impl_env(
159
+ InputIteratorT d_in, OutputIteratorT d_out, ScanOpT scan_op, InitValueT init, NumItemsT num_items, EnvT env)
160
+ { // Environment-based entry: queries the temp storage size, allocates it, runs the scan, then deallocates
161
+ static_assert(!_CUDA_STD_EXEC::__queryable_with<EnvT, _CUDA_EXEC::determinism::__get_determinism_t>, // determinism is only read from the requirements below, never from env directly
162
+ "Determinism should be used inside requires to have an effect.");
163
+
164
+ using requirements_t =
165
+ _CUDA_STD_EXEC::__query_result_or_t<EnvT, _CUDA_EXEC::__get_requirements_t, _CUDA_STD_EXEC::env<>>;
166
+
167
+ using requested_determinism_t =
168
+ _CUDA_STD_EXEC::__query_result_or_t<requirements_t, //
169
+ _CUDA_EXEC::determinism::__get_determinism_t,
170
+ _CUDA_EXEC::determinism::run_to_run_t>;
171
+
172
+ // Static assert to reject gpu_to_gpu determinism since it's not implemented
173
+ static_assert(!::cuda::std::is_same_v<requested_determinism_t, ::cuda::execution::determinism::gpu_to_gpu_t>,
174
+ "gpu_to_gpu determinism is not supported");
175
+
176
+ static_assert(!::cuda::std::is_same_v<requested_determinism_t, ::cuda::execution::determinism::not_guaranteed_t>,
177
+ "not_guaranteed determinism is not supported");
178
+
179
+ using determinism_t = ::cuda::execution::determinism::run_to_run_t; // the dispatch below always uses run_to_run; requested_determinism_t only feeds the asserts above
180
+
181
+ // Query relevant properties from the environment
182
+ auto stream = _CUDA_STD_EXEC::__query_or(env, ::cuda::get_stream, ::cuda::stream_ref{cudaStream_t{}});
183
+ auto mr = _CUDA_STD_EXEC::__query_or(env, ::cuda::mr::__get_memory_resource, detail::device_memory_resource{});
184
+
185
+ void* d_temp_storage = nullptr;
186
+ size_t temp_storage_bytes = 0;
187
+
188
+ using tuning_t = _CUDA_STD_EXEC::__query_result_or_t<EnvT, _CUDA_EXEC::__get_tuning_t, _CUDA_STD_EXEC::env<>>;
189
+
190
+ // Query the required temporary storage size
191
+ cudaError_t error = scan_impl_determinism<tuning_t>( // NOTE(review): EnforceInclusive is not forwarded, so the callee uses its default ForceInclusive::No - confirm intended
192
+ d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, init, num_items, determinism_t{}, stream.get());
193
+
194
+ if (error != cudaSuccess)
195
+ {
196
+ return error;
197
+ }
198
+
199
+ // TODO(gevtushenko): use uninitialized buffer when it's available
200
+ error = CubDebug(detail::temporary_storage::allocate(stream, d_temp_storage, temp_storage_bytes, mr));
201
+ if (error != cudaSuccess)
202
+ {
203
+ return error;
204
+ }
205
+
206
+ // Run the algorithm: same call as the size query above, issued after the allocation step
207
+ error = scan_impl_determinism<tuning_t>(
208
+ d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, init, num_items, determinism_t{}, stream.get());
209
+
210
+ // Try to deallocate regardless of the error to avoid memory leaks
211
+ cudaError_t deallocate_error =
212
+ CubDebug(detail::temporary_storage::deallocate(stream, d_temp_storage, temp_storage_bytes, mr));
213
+
214
+ if (error != cudaSuccess)
215
+ {
216
+ // Scan error takes precedence over deallocation error since it happens first
217
+ return error;
218
+ }
219
+
220
+ return deallocate_error;
221
+ }
222
+ //! @endcond
223
+
224
+ //! @name Exclusive scans
225
+ //! @{
226
+
227
+ //! @rst
228
+ //! Computes a device-wide exclusive prefix sum.
229
+ //! The value of ``0`` is applied as the initial value, and is assigned to ``*d_out``.
230
+ //!
231
+ //! - Supports non-commutative sum operators.
232
+ //! - Results are not deterministic for pseudo-associative operators (e.g.,
233
+ //! addition of floating-point types). Results for pseudo-associative
234
+ //! operators may vary from run to run. Additional details can be found in
235
+ //! the @lookback description.
236
+ //! - When ``d_in`` and ``d_out`` are equal, the scan is performed in-place.
237
+ //! The range ``[d_in, d_in + num_items)`` and ``[d_out, d_out + num_items)``
238
+ //! shall not overlap in any other way.
239
+ //! - @devicestorage
240
+ //!
241
+ //! Snippet
242
+ //! +++++++++++++++++++++++++++++++++++++++++++++
243
+ //!
244
+ //! The code snippet below illustrates the exclusive prefix sum of an ``int``
245
+ //! device vector.
246
+ //!
247
+ //! .. code-block:: c++
248
+ //!
249
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
250
+ //!
251
+ //! // Declare, allocate, and initialize device-accessible pointers for
252
+ //! // input and output
253
+ //! int num_items; // e.g., 7
254
+ //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
255
+ //! int *d_out; // e.g., [ , , , , , , ]
256
+ //! ...
257
+ //!
258
+ //! // Determine temporary device storage requirements
259
+ //! void *d_temp_storage = nullptr;
260
+ //! size_t temp_storage_bytes = 0;
261
+ //! cub::DeviceScan::ExclusiveSum(
262
+ //! d_temp_storage, temp_storage_bytes,
263
+ //! d_in, d_out, num_items);
264
+ //!
265
+ //! // Allocate temporary storage
266
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
267
+ //!
268
+ //! // Run exclusive prefix sum
269
+ //! cub::DeviceScan::ExclusiveSum(
270
+ //! d_temp_storage, temp_storage_bytes,
271
+ //! d_in, d_out, num_items);
272
+ //!
273
+ //! // d_out <-- [0, 8, 14, 21, 26, 29, 29]
274
+ //!
275
+ //! @endrst
276
+ //!
277
+ //! @tparam InputIteratorT
278
+ //! **[inferred]** Random-access input iterator type for reading scan inputs @iterator
279
+ //!
280
+ //! @tparam OutputIteratorT
281
+ //! **[inferred]** Random-access output iterator type for writing scan outputs @iterator
282
+ //!
283
+ //! @tparam NumItemsT
284
+ //! **[inferred]** An integral type representing the number of input elements
285
+ //!
286
+ //! @param[in] d_temp_storage
287
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
288
+ //! required allocation size is written to `temp_storage_bytes` and no work is done.
289
+ //!
290
+ //! @param[in,out] temp_storage_bytes
291
+ //! Reference to size in bytes of `d_temp_storage` allocation
292
+ //!
293
+ //! @param[in] d_in
294
+ //! Random-access iterator to the input sequence of data items
295
+ //!
296
+ //! @param[out] d_out
297
+ //! Random-access iterator to the output sequence of data items
298
+ //!
299
+ //! @param[in] num_items
300
+ //! Total number of input items (i.e., the length of `d_in`)
301
+ //!
302
+ //! @param[in] stream
303
+ //! @rst
304
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
305
+ //! @endrst
306
+ template <typename InputIteratorT, typename OutputIteratorT, typename NumItemsT>
307
+ CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveSum(
308
+ void* d_temp_storage,
309
+ size_t& temp_storage_bytes,
310
+ InputIteratorT d_in,
311
+ OutputIteratorT d_out,
312
+ NumItemsT num_items,
313
+ cudaStream_t stream = 0)
314
+ {
315
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::ExclusiveSum"); // NOTE(review): presumably opens an NVTX range only when d_temp_storage is non-null - confirm macro semantics
316
+
317
+ // Unsigned integer type for global offsets
318
+ using OffsetT = detail::choose_offset_t<NumItemsT>;
319
+ using InitT = cub::detail::it_value_t<InputIteratorT>;
320
+
321
+ // Initial value: a value-initialized InitT
322
+ InitT init_value{};
323
+
324
+ return DispatchScan<InputIteratorT, OutputIteratorT, ::cuda::std::plus<>, detail::InputValue<InitT>, OffsetT>:: // scan op is cuda::std::plus<>; only five template arguments are given explicitly
325
+ Dispatch(d_temp_storage,
326
+ temp_storage_bytes,
327
+ d_in,
328
+ d_out,
329
+ ::cuda::std::plus<>{},
330
+ detail::InputValue<InitT>(init_value),
331
+ num_items, // NOTE(review): no explicit static_cast to OffsetT here, unlike scan_impl_determinism - presumably converted implicitly
332
+ stream);
333
+ }
334
+
335
+ //! @rst
336
+ //! Computes a device-wide exclusive prefix sum.
337
+ //! The value of ``0`` is applied as the initial value, and is assigned to ``*d_out``.
338
+ //!
339
+ //! - Supports non-commutative sum operators.
340
+ //! - Results are not deterministic for pseudo-associative operators (e.g.,
341
+ //! addition of floating-point types). Results for pseudo-associative
342
+ //! operators may vary from run to run. Additional details can be found in
343
+ //! the @lookback description.
344
+ //! - When ``d_in`` and ``d_out`` are equal, the scan is performed in-place.
345
+ //! The range ``[d_in, d_in + num_items)`` and ``[d_out, d_out + num_items)``
346
+ //! shall not overlap in any other way.
347
+ //! - @devicestorage
348
+ //!
349
+ //! Preconditions
350
+ //! +++++++++++++
351
+ //!
352
+ //! - When ``d_in`` and ``d_out`` are equal, the scan is performed in-place.
353
+ //! The range ``[d_in, d_in + num_items)`` and ``[d_out, d_out + num_items)``
354
+ //! shall not overlap in any other way.
355
+ //! - ``d_in`` and ``d_out`` must not be null pointers
356
+ //!
357
+ //! Snippet
358
+ //! +++++++++++++++++++++++++++++++++++++++++++++
359
+ //!
360
+ //! The code snippet below illustrates a user-defined exclusive-scan of a
361
+ //! device vector of ``float`` data elements.
362
+ //!
363
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_scan_env_api.cu
364
+ //! :language: c++
365
+ //! :dedent:
366
+ //! :start-after: example-begin exclusive-sum-env-determinism
367
+ //! :end-before: example-end exclusive-sum-env-determinism
368
+ //!
369
+ //! @endrst
370
+ //!
371
+ //! @tparam InputIteratorT
372
+ //! **[inferred]** Random-access input iterator type for reading scan inputs @iterator
373
+ //!
374
+ //! @tparam OutputIteratorT
375
+ //! **[inferred]** Random-access output iterator type for writing scan outputs @iterator
376
+ //!
377
+ //! @tparam NumItemsT
378
+ //! **[inferred]** An integral type representing the number of input elements
379
+ //!
380
+ //! @tparam EnvT
381
+ //! **[inferred]** Execution environment type. Default is `_CUDA_STD_EXEC::env<>`.
382
+ //!
383
+ //! @param[in] d_in
384
+ //! Random-access iterator to the input sequence of data items
385
+ //!
386
+ //! @param[out] d_out
387
+ //! Random-access iterator to the output sequence of data items
388
+ //!
389
+ //! @param[in] num_items
390
+ //! Total number of input items (i.e., the length of `d_in`)
391
+ //!
392
+ //! @param[in] env
393
+ //! @rst
394
+ //! **[optional]** Execution environment. Default is `_CUDA_STD_EXEC::env{}`.
395
+ //! @endrst
396
+ template <typename InputIteratorT, typename OutputIteratorT, typename NumItemsT, typename EnvT = _CUDA_STD_EXEC::env<>>
397
+ [[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t
398
+ ExclusiveSum(InputIteratorT d_in, OutputIteratorT d_out, NumItemsT num_items, EnvT env = {})
399
+ {
400
+ _CCCL_NVTX_RANGE_SCOPE("cub::DeviceScan::ExclusiveSum");
401
+
402
+ using InitT = cub::detail::it_value_t<InputIteratorT>;
403
+
404
+ // Initial value
405
+ InitT init_value{};
406
+
407
+ return scan_impl_env(d_in, d_out, ::cuda::std::plus<>{}, detail::InputValue<InitT>(init_value), num_items, env);
408
+ }
409
+
410
+ //! @rst
411
+ //! Computes a device-wide exclusive prefix sum in-place.
412
+ //! The value of ``0`` is applied as the initial value, and is assigned to ``*d_data``.
413
+ //!
414
+ //! - Supports non-commutative sum operators.
415
+ //! - Results are not deterministic for pseudo-associative operators (e.g.,
416
+ //! addition of floating-point types). Results for pseudo-associative
417
+ //! operators may vary from run to run. Additional details can be found in
418
+ //! the @lookback description.
419
+ //! - @devicestorage
420
+ //!
421
+ //! Snippet
422
+ //! +++++++++++++++++++++++++++++++++++++++++++++
423
+ //!
424
+ //! The code snippet below illustrates the exclusive prefix sum of an ``int``
425
+ //! device vector.
426
+ //!
427
+ //! .. code-block:: c++
428
+ //!
429
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
430
+ //!
431
+ //! // Declare, allocate, and initialize device-accessible pointers for
432
+ //! // input and output
433
+ //! int num_items; // e.g., 7
434
+ //! int *d_data; // e.g., [8, 6, 7, 5, 3, 0, 9]
435
+ //! ...
436
+ //!
437
+ //! // Determine temporary device storage requirements
438
+ //! void *d_temp_storage = nullptr;
439
+ //! size_t temp_storage_bytes = 0;
440
+ //! cub::DeviceScan::ExclusiveSum(
441
+ //! d_temp_storage, temp_storage_bytes,
442
+ //! d_data, num_items);
443
+ //!
444
+ //! // Allocate temporary storage
445
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
446
+ //!
447
+ //! // Run exclusive prefix sum
448
+ //! cub::DeviceScan::ExclusiveSum(
449
+ //! d_temp_storage, temp_storage_bytes,
450
+ //! d_data, num_items);
451
+ //!
452
+ //! // d_data <-- [0, 8, 14, 21, 26, 29, 29]
453
+ //!
454
+ //! @endrst
455
+ //!
456
+ //! @tparam IteratorT
457
+ //! **[inferred]** Random-access iterator type for reading scan inputs and writing scan outputs
458
+ //!
459
+ //! @tparam NumItemsT
460
+ //! **[inferred]** An integral type representing the number of input elements
461
+ //!
462
+ //! @param[in] d_temp_storage
463
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
464
+ //! required allocation size is written to `temp_storage_bytes` and no work is done.
465
+ //!
466
+ //! @param[in,out] temp_storage_bytes
467
+ //! Reference to size in bytes of `d_temp_storage` allocation
468
+ //!
469
+ //! @param[in,out] d_data
470
+ //! Random-access iterator to the sequence of data items
471
+ //!
472
+ //! @param[in] num_items
473
+ //! Total number of input items (i.e., the length of `d_data`)
474
+ //!
475
+ //! @param[in] stream
476
+ //! @rst
477
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
478
+ //! @endrst
479
+ template <typename IteratorT, typename NumItemsT>
480
+ CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveSum(
481
+ void* d_temp_storage, size_t& temp_storage_bytes, IteratorT d_data, NumItemsT num_items, cudaStream_t stream = 0)
482
+ {
483
+ return ExclusiveSum(d_temp_storage, temp_storage_bytes, d_data, d_data, num_items, stream);
484
+ }
485
+
486
+ //! @rst
487
+ //! Computes a device-wide exclusive prefix scan using the specified
488
+ //! binary associative ``scan_op`` functor. The ``init_value`` value is applied as
489
+ //! the initial value, and is assigned to ``*d_out``.
490
+ //!
491
+ //! - Supports non-commutative scan operators.
492
+ //! - Results are not deterministic for pseudo-associative operators (e.g.,
493
+ //! addition of floating-point types). Results for pseudo-associative
494
+ //! operators may vary from run to run. Additional details can be found in
495
+ //! the @lookback description.
496
+ //! - When ``d_in`` and ``d_out`` are equal, the scan is performed in-place. The
497
+ //! range ``[d_in, d_in + num_items)`` and ``[d_out, d_out + num_items)``
498
+ //! shall not overlap in any other way.
499
+ //! - @devicestorage
500
+ //!
501
+ //! Snippet
502
+ //! +++++++++++++++++++++++++++++++++++++++++++++
503
+ //!
504
+ //! The code snippet below illustrates the exclusive prefix min-scan of an ``int`` device vector
505
+ //!
506
+ //! .. code-block:: c++
507
+ //!
508
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
509
+ //! #include <cuda/std/climits> // for INT_MAX
510
+ //!
511
+ //! // CustomMin functor
512
+ //! struct CustomMin
513
+ //! {
514
+ //! template <typename T>
515
+ //! __host__ __device__ __forceinline__
516
+ //! T operator()(const T &a, const T &b) const {
517
+ //! return (b < a) ? b : a;
518
+ //! }
519
+ //! };
520
+ //!
521
+ //! // Declare, allocate, and initialize device-accessible pointers for
522
+ //! // input and output
523
+ //! int num_items; // e.g., 7
524
+ //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
525
+ //! int *d_out; // e.g., [ , , , , , , ]
526
+ //! CustomMin min_op;
527
+ //! ...
528
+ //!
529
+ //! // Determine temporary device storage requirements for exclusive
530
+ //! // prefix scan
531
+ //! void *d_temp_storage = nullptr;
532
+ //! size_t temp_storage_bytes = 0;
533
+ //! cub::DeviceScan::ExclusiveScan(
534
+ //! d_temp_storage, temp_storage_bytes,
535
+ //! d_in, d_out, min_op, (int) INT_MAX, num_items);
536
+ //!
537
+ //! // Allocate temporary storage for exclusive prefix scan
538
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
539
+ //!
540
+ //! // Run exclusive prefix min-scan
541
+ //! cub::DeviceScan::ExclusiveScan(
542
+ //! d_temp_storage, temp_storage_bytes,
543
+ //! d_in, d_out, min_op, (int) INT_MAX, num_items);
544
+ //!
545
+ //! // d_out <-- [2147483647, 8, 6, 6, 5, 3, 0]
546
+ //!
547
+ //! @endrst
548
+ //!
549
+ //! @tparam InputIteratorT
550
+ //! **[inferred]** Random-access input iterator type for reading scan inputs @iterator
551
+ //!
552
+ //! @tparam OutputIteratorT
553
+ //! **[inferred]** Random-access output iterator type for writing scan outputs @iterator
554
+ //!
555
+ //! @tparam ScanOpT
556
+ //! **[inferred]** Binary associative scan functor type having member `T operator()(const T &a, const T &b)`
557
+ //!
558
+ //! @tparam InitValueT
559
+ //! **[inferred]** Type of the `init_value`
560
+ //!
561
+ //! @tparam NumItemsT
562
+ //! **[inferred]** An integral type representing the number of input elements
563
+ //!
564
+ //! @param[in] d_temp_storage
565
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
566
+ //! required allocation size is written to `temp_storage_bytes` and no work is done.
567
+ //!
568
+ //! @param[in,out] temp_storage_bytes
569
+ //! Reference to size in bytes of `d_temp_storage` allocation
570
+ //!
571
+ //! @param[in] d_in
572
+ //! Random-access iterator to the input sequence of data items
573
+ //!
574
+ //! @param[out] d_out
575
+ //! Random-access iterator to the output sequence of data items
576
+ //!
577
+ //! @param[in] scan_op
578
+ //! Binary associative scan functor
579
+ //!
580
+ //! @param[in] init_value
581
+ //! Initial value to seed the exclusive scan (and is assigned to `*d_out`)
582
+ //!
583
+ //! @param[in] num_items
584
+ //! Total number of input items (i.e., the length of `d_in`)
585
+ //!
586
+ //! @param[in] stream
587
+ //! @rst
588
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
589
+ //! @endrst
590
+ template <typename InputIteratorT, typename OutputIteratorT, typename ScanOpT, typename InitValueT, typename NumItemsT>
591
+ CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan(
592
+ void* d_temp_storage,
593
+ size_t& temp_storage_bytes,
594
+ InputIteratorT d_in,
595
+ OutputIteratorT d_out,
596
+ ScanOpT scan_op,
597
+ InitValueT init_value,
598
+ NumItemsT num_items,
599
+ cudaStream_t stream = 0)
600
+ {
601
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::ExclusiveScan");
602
+
603
+ // Unsigned integer type for global offsets
604
+ using OffsetT = detail::choose_offset_t<NumItemsT>;
605
+
606
+ return DispatchScan<InputIteratorT, OutputIteratorT, ScanOpT, detail::InputValue<InitValueT>, OffsetT>::Dispatch(
607
+ d_temp_storage,
608
+ temp_storage_bytes,
609
+ d_in,
610
+ d_out,
611
+ scan_op,
612
+ detail::InputValue<InitValueT>(init_value),
613
+ num_items,
614
+ stream);
615
+ }
616
+
617
+ //! @rst
618
+ //! Computes a device-wide exclusive prefix scan using the specified
619
+ //! binary associative ``scan_op`` functor. The ``init_value`` value is applied as
620
+ //! the initial value, and is assigned to ``*d_out``.
621
+ //!
622
+ //! - Supports non-commutative scan operators.
623
+ //! - Results are not deterministic for pseudo-associative operators (e.g.,
624
+ //! addition of floating-point types). Results for pseudo-associative
625
+ //! operators may vary from run to run. Additional details can be found in
626
+ //! the @lookback description.
627
+ //! - When ``d_in`` and ``d_out`` are equal, the scan is performed in-place. The
628
+ //! range ``[d_in, d_in + num_items)`` and ``[d_out, d_out + num_items)``
629
+ //! shall not overlap in any other way.
630
+ //! - @devicestorage
631
+ //!
632
+ //! Snippet
633
+ //! +++++++++++++++++++++++++++++++++++++++++++++
634
+ //!
635
+ //! The code snippet below illustrates a user-defined exclusive-scan of a
636
+ //! device vector of ``float`` data elements.
637
+ //!
638
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_scan_env_api.cu
639
+ //! :language: c++
640
+ //! :dedent:
641
+ //! :start-after: example-begin exclusive-scan-env-determinism
642
+ //! :end-before: example-end exclusive-scan-env-determinism
643
+ //!
644
+ //! @endrst
645
+ //!
646
+ //! @tparam InputIteratorT
647
+ //! **[inferred]** Random-access input iterator type for reading scan inputs @iterator
648
+ //!
649
+ //! @tparam OutputIteratorT
650
+ //! **[inferred]** Random-access output iterator type for writing scan outputs @iterator
651
+ //!
652
+ //! @tparam ScanOpT
653
+ //! **[inferred]** Binary associative scan functor type having member `T operator()(const T &a, const T &b)`
654
+ //!
655
+ //! @tparam InitValueT
656
+ //! **[inferred]** Type of the `init_value`
657
+ //!
658
+ //! @tparam NumItemsT
659
+ //! **[inferred]** An integral type representing the number of input elements
660
+ //!
661
+ //! @tparam EnvT
662
+ //! **[inferred]** Execution environment type. Default is `_CUDA_STD_EXEC::env<>`.
663
+ //!
664
+ //! @param[in] d_in
665
+ //! Random-access iterator to the input sequence of data items
666
+ //!
667
+ //! @param[out] d_out
668
+ //! Random-access iterator to the output sequence of data items
669
+ //!
670
+ //! @param[in] scan_op
671
+ //! Binary associative scan functor
672
+ //!
673
+ //! @param[in] init_value
674
+ //! Initial value to seed the exclusive scan (and is assigned to `*d_out`)
675
+ //!
676
+ //! @param[in] num_items
677
+ //! Total number of input items (i.e., the length of `d_in`)
678
+ //!
679
+ //! @param[in] env
680
+ //! @rst
681
+ //! **[optional]** Execution environment. Default is `_CUDA_STD_EXEC::env{}`.
682
+ //! @endrst
683
+ template <typename InputIteratorT,
684
+ typename OutputIteratorT,
685
+ typename ScanOpT,
686
+ typename InitValueT,
687
+ typename NumItemsT,
688
+ typename EnvT = _CUDA_STD_EXEC::env<>>
689
+ [[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan(
690
+ InputIteratorT d_in,
691
+ OutputIteratorT d_out,
692
+ ScanOpT scan_op,
693
+ InitValueT init_value,
694
+ NumItemsT num_items,
695
+ EnvT env = {})
696
+ {
697
+ _CCCL_NVTX_RANGE_SCOPE("cub::DeviceScan::ExclusiveScan");
698
+
699
+ return scan_impl_env(d_in, d_out, scan_op, detail::InputValue<InitValueT>(init_value), num_items, env);
700
+ }
701
+
702
+ //! @rst
703
+ //! Computes a device-wide exclusive prefix scan using the specified
704
+ //! binary associative ``scan_op`` functor. The ``init_value`` value is applied as
705
+ //! the initial value, and is assigned to ``*d_data``.
706
+ //!
707
+ //! - Supports non-commutative scan operators.
708
+ //! - Results are not deterministic for pseudo-associative operators (e.g.,
709
+ //! addition of floating-point types). Results for pseudo-associative
710
+ //! operators may vary from run to run. Additional details can be found in
711
+ //! the @lookback description.
712
+ //! - @devicestorage
713
+ //!
714
+ //! Snippet
715
+ //! +++++++++++++++++++++++++++++++++++++++++++++
716
+ //!
717
+ //! The code snippet below illustrates the exclusive prefix min-scan of an
718
+ //! ``int`` device vector:
719
+ //!
720
+ //! .. code-block:: c++
721
+ //!
722
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
723
+ //! #include <cuda/std/climits> // for INT_MAX
724
+ //!
725
+ //! // CustomMin functor
726
+ //! struct CustomMin
727
+ //! {
728
+ //! template <typename T>
729
+ //! __host__ __device__ __forceinline__
730
+ //! T operator()(const T &a, const T &b) const {
731
+ //! return (b < a) ? b : a;
732
+ //! }
733
+ //! };
734
+ //!
735
+ //! // Declare, allocate, and initialize device-accessible pointers for
736
+ //! // input and output
737
+ //! int num_items; // e.g., 7
738
+ //! int *d_data; // e.g., [8, 6, 7, 5, 3, 0, 9]
739
+ //! CustomMin min_op;
740
+ //! ...
741
+ //!
742
+ //! // Determine temporary device storage requirements for exclusive
743
+ //! // prefix scan
744
+ //! void *d_temp_storage = nullptr;
745
+ //! size_t temp_storage_bytes = 0;
746
+ //! cub::DeviceScan::ExclusiveScan(
747
+ //! d_temp_storage, temp_storage_bytes,
748
+ //! d_data, min_op, (int) INT_MAX, num_items);
749
+ //!
750
+ //! // Allocate temporary storage for exclusive prefix scan
751
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
752
+ //!
753
+ //! // Run exclusive prefix min-scan
754
+ //! cub::DeviceScan::ExclusiveScan(
755
+ //! d_temp_storage, temp_storage_bytes,
756
+ //! d_data, min_op, (int) INT_MAX, num_items);
757
+ //!
758
+ //! // d_data <-- [2147483647, 8, 6, 6, 5, 3, 0]
759
+ //!
760
+ //! @endrst
761
+ //!
762
+ //! @tparam IteratorT
763
+ //! **[inferred]** Random-access iterator type for reading scan inputs and writing scan outputs
764
+ //!
765
+ //! @tparam ScanOpT
766
+ //! **[inferred]** Binary associative scan functor type having member `T operator()(const T &a, const T &b)`
767
+ //!
768
+ //! @tparam InitValueT
769
+ //! **[inferred]** Type of the `init_value`
770
+ //!
771
+ //! @tparam NumItemsT
772
+ //! **[inferred]** An integral type representing the number of input elements
773
+ //!
774
+ //! @param[in] d_temp_storage
775
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
776
+ //! required allocation size is written to `temp_storage_bytes` and no work is done.
777
+ //!
778
+ //! @param[in,out] temp_storage_bytes
779
+ //! Reference to size in bytes of `d_temp_storage` allocation
780
+ //!
781
+ //! @param[in,out] d_data
782
+ //! Random-access iterator to the sequence of data items
783
+ //!
784
+ //! @param[in] scan_op
785
+ //! Binary associative scan functor
786
+ //!
787
+ //! @param[in] init_value
788
+ //! Initial value to seed the exclusive scan (and is assigned to `*d_data`)
789
+ //!
790
+ //! @param[in] num_items
791
+ //! Total number of input items (i.e., the length of `d_data`)
792
+ //!
793
+ //! @param[in] stream
794
+ //! @rst
795
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
796
+ //! @endrst
797
+ template <typename IteratorT, typename ScanOpT, typename InitValueT, typename NumItemsT>
798
+ CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan(
799
+ void* d_temp_storage,
800
+ size_t& temp_storage_bytes,
801
+ IteratorT d_data,
802
+ ScanOpT scan_op,
803
+ InitValueT init_value,
804
+ NumItemsT num_items,
805
+ cudaStream_t stream = 0)
806
+ {
807
+ return ExclusiveScan(d_temp_storage, temp_storage_bytes, d_data, d_data, scan_op, init_value, num_items, stream);
808
+ }
809
+
810
+ //! @rst
811
+ //! Computes a device-wide exclusive prefix scan using the specified
812
+ //! binary associative ``scan_op`` functor. The ``init_value`` value is provided as a future value.
813
+ //!
814
+ //! - Supports non-commutative scan operators.
815
+ //! - Results are not deterministic for pseudo-associative operators (e.g.,
816
+ //! addition of floating-point types). Results for pseudo-associative
817
+ //! operators may vary from run to run. Additional details can be found in
818
+ //! the @lookback description.
819
+ //! - When ``d_in`` and ``d_out`` are equal, the scan is performed in-place.
820
+ //! The range ``[d_in, d_in + num_items)`` and ``[d_out, d_out + num_items)``
821
+ //! shall not overlap in any other way.
822
+ //! - @devicestorage
823
+ //!
824
+ //! Snippet
825
+ //! +++++++++++++++++++++++++++++++++++++++++++++
826
+ //!
827
+ //! The code snippet below illustrates the exclusive prefix min-scan of an ``int`` device vector
828
+ //!
829
+ //! .. code-block:: c++
830
+ //!
831
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
832
+ //! #include <cuda/std/climits> // for INT_MAX
833
+ //!
834
+ //! // CustomMin functor
835
+ //! struct CustomMin
836
+ //! {
837
+ //! template <typename T>
838
+ //! __host__ __device__ __forceinline__
839
+ //! T operator()(const T &a, const T &b) const {
840
+ //! return (b < a) ? b : a;
841
+ //! }
842
+ //! };
843
+ //!
844
+ //! // Declare, allocate, and initialize device-accessible pointers for
845
+ //! // input and output
846
+ //! int num_items; // e.g., 7
847
+ //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
848
+ //! int *d_out; // e.g., [ , , , , , , ]
849
+ //! int *d_init_iter; // e.g., INT_MAX
850
+ //! CustomMin min_op;
851
+ //!
852
+ //! auto future_init_value =
853
+ //! cub::FutureValue<int>(d_init_iter);
854
+ //!
855
+ //! ...
856
+ //!
857
+ //! // Determine temporary device storage requirements for exclusive
858
+ //! // prefix scan
859
+ //! void *d_temp_storage = nullptr;
860
+ //! size_t temp_storage_bytes = 0;
861
+ //! cub::DeviceScan::ExclusiveScan(
862
+ //! d_temp_storage, temp_storage_bytes,
863
+ //! d_in, d_out, min_op, future_init_value, num_items);
864
+ //!
865
+ //! // Allocate temporary storage for exclusive prefix scan
866
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
867
+ //!
868
+ //! // Run exclusive prefix min-scan
869
+ //! cub::DeviceScan::ExclusiveScan(
870
+ //! d_temp_storage, temp_storage_bytes,
871
+ //! d_in, d_out, min_op, future_init_value, num_items);
872
+ //!
873
+ //! // d_out <-- [2147483647, 8, 6, 6, 5, 3, 0]
874
+ //!
875
+ //! @endrst
876
+ //!
877
+ //! @tparam InputIteratorT
878
+ //! **[inferred]** Random-access input iterator type for reading scan inputs @iterator
879
+ //!
880
+ //! @tparam OutputIteratorT
881
+ //! **[inferred]** Random-access output iterator type for writing scan outputs @iterator
882
+ //!
883
+ //! @tparam ScanOpT
884
+ //! **[inferred]** Binary associative scan functor type having member `T operator()(const T &a, const T &b)`
885
+ //!
886
+ //! @tparam InitValueT
887
+ //! **[inferred]** Type of the `init_value`
888
+ //!
889
+ //! @tparam NumItemsT
890
+ //! **[inferred]** An integral type representing the number of input elements
891
+ //!
892
+ //! @param[in] d_temp_storage
893
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
894
+ //! required allocation size is written to `temp_storage_bytes` and no work is done.
895
+ //!
896
+ //! @param[in,out] temp_storage_bytes
897
+ //! Reference to size in bytes of `d_temp_storage` allocation
898
+ //!
899
+ //! @param[in] d_in
900
+ //! Random-access iterator to the input sequence of data items
901
+ //!
902
+ //! @param[out] d_out
903
+ //! Random-access iterator to the output sequence of data items
904
+ //!
905
+ //! @param[in] scan_op
906
+ //! Binary associative scan functor
907
+ //!
908
+ //! @param[in] init_value
909
+ //! Initial value to seed the exclusive scan (and is assigned to `*d_out`)
910
+ //!
911
+ //! @param[in] num_items
912
+ //! Total number of input items (i.e., the length of `d_in`)
913
+ //!
914
+ //! @param[in] stream
915
+ //! @rst
916
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
917
+ //! @endrst
918
+ template <typename InputIteratorT,
919
+ typename OutputIteratorT,
920
+ typename ScanOpT,
921
+ typename InitValueT,
922
+ typename InitValueIterT = InitValueT*,
923
+ typename NumItemsT = int>
924
+ CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan(
925
+ void* d_temp_storage,
926
+ size_t& temp_storage_bytes,
927
+ InputIteratorT d_in,
928
+ OutputIteratorT d_out,
929
+ ScanOpT scan_op,
930
+ FutureValue<InitValueT, InitValueIterT> init_value,
931
+ NumItemsT num_items,
932
+ cudaStream_t stream = 0)
933
+ {
934
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::ExclusiveScan");
935
+
936
+ // Unsigned integer type for global offsets
937
+ using OffsetT = detail::choose_offset_t<NumItemsT>;
938
+
939
+ return DispatchScan<InputIteratorT, OutputIteratorT, ScanOpT, detail::InputValue<InitValueT>, OffsetT>::Dispatch(
940
+ d_temp_storage,
941
+ temp_storage_bytes,
942
+ d_in,
943
+ d_out,
944
+ scan_op,
945
+ detail::InputValue<InitValueT>(init_value),
946
+ num_items,
947
+ stream);
948
+ }
949
+
950
+ //! @rst
951
+ //! Computes a device-wide exclusive prefix scan using the specified binary associative ``scan_op`` functor.
952
+ //! The ``init_value`` value is provided as a future value.
953
+ //!
954
+ //! - Supports non-commutative scan operators.
955
+ //! - Results are not deterministic for pseudo-associative operators (e.g.,
956
+ //! addition of floating-point types). Results for pseudo-associative
957
+ //! operators may vary from run to run. Additional details can be found in
958
+ //! the @lookback description.
959
+ //! - @devicestorage
960
+ //!
961
+ //! Snippet
962
+ //! +++++++++++++++++++++++++++++++++++++++++++++
963
+ //!
964
+ //! The code snippet below illustrates the exclusive prefix min-scan of an ``int`` device vector
965
+ //!
966
+ //! .. code-block:: c++
967
+ //!
968
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
969
+ //! #include <cuda/std/climits> // for INT_MAX
970
+ //!
971
+ //! // CustomMin functor
972
+ //! struct CustomMin
973
+ //! {
974
+ //! template <typename T>
975
+ //! __host__ __device__ __forceinline__
976
+ //! T operator()(const T &a, const T &b) const {
977
+ //! return (b < a) ? b : a;
978
+ //! }
979
+ //! };
980
+ //!
981
+ //! // Declare, allocate, and initialize device-accessible pointers for
982
+ //! // input and output
983
+ //! int num_items; // e.g., 7
984
+ //! int *d_data; // e.g., [8, 6, 7, 5, 3, 0, 9]
985
+ //! int *d_init_iter; // e.g., INT_MAX
986
+ //! CustomMin min_op;
987
+ //!
988
+ //! auto future_init_value =
989
+ //! cub::FutureValue<int>(d_init_iter);
990
+ //!
991
+ //! ...
992
+ //!
993
+ //! // Determine temporary device storage requirements for exclusive
994
+ //! // prefix scan
995
+ //! void *d_temp_storage = nullptr;
996
+ //! size_t temp_storage_bytes = 0;
997
+ //! cub::DeviceScan::ExclusiveScan(
998
+ //! d_temp_storage, temp_storage_bytes,
999
+ //! d_data, min_op, future_init_value, num_items);
1000
+ //!
1001
+ //! // Allocate temporary storage for exclusive prefix scan
1002
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
1003
+ //!
1004
+ //! // Run exclusive prefix min-scan
1005
+ //! cub::DeviceScan::ExclusiveScan(
1006
+ //! d_temp_storage, temp_storage_bytes,
1007
+ //! d_data, min_op, future_init_value, num_items);
1008
+ //!
1009
+ //! // d_data <-- [2147483647, 8, 6, 6, 5, 3, 0]
1010
+ //!
1011
+ //! @endrst
1012
+ //!
1013
+ //! @tparam IteratorT
1014
+ //! **[inferred]** Random-access iterator type for reading scan inputs and writing scan outputs
1015
+ //!
1016
+ //! @tparam ScanOpT
1017
+ //! **[inferred]** Binary associative scan functor type having member `T operator()(const T &a, const T &b)`
1018
+ //!
1019
+ //! @tparam InitValueT
1020
+ //! **[inferred]** Type of the `init_value`
1021
+ //!
1022
+ //! @tparam NumItemsT
1023
+ //! **[inferred]** An integral type representing the number of input elements
1024
+ //!
1025
+ //! @param[in] d_temp_storage
1026
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
1027
+ //! required allocation size is written to `temp_storage_bytes` and no work is done.
1028
+ //!
1029
+ //! @param[in,out] temp_storage_bytes
1030
+ //! Reference to size in bytes of `d_temp_storage` allocation
1031
+ //!
1032
+ //! @param[in,out] d_data
1033
+ //! Random-access iterator to the sequence of data items
1034
+ //!
1035
+ //! @param[in] scan_op
1036
+ //! Binary associative scan functor
1037
+ //!
1038
+ //! @param[in] init_value
1039
+ //! Initial value to seed the exclusive scan (and is assigned to `*d_data`)
1040
+ //!
1041
+ //! @param[in] num_items
1042
+ //! Total number of input items (i.e., the length of `d_data`)
1043
+ //!
1044
+ //! @param[in] stream
1045
+ //! @rst
1046
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
1047
+ //! @endrst
1048
+ template <typename IteratorT,
1049
+ typename ScanOpT,
1050
+ typename InitValueT,
1051
+ typename InitValueIterT = InitValueT*,
1052
+ typename NumItemsT = int>
1053
+ CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan(
1054
+ void* d_temp_storage,
1055
+ size_t& temp_storage_bytes,
1056
+ IteratorT d_data,
1057
+ ScanOpT scan_op,
1058
+ FutureValue<InitValueT, InitValueIterT> init_value,
1059
+ NumItemsT num_items,
1060
+ cudaStream_t stream = 0)
1061
+ {
1062
+ return ExclusiveScan(d_temp_storage, temp_storage_bytes, d_data, d_data, scan_op, init_value, num_items, stream);
1063
+ }
1064
+
1065
+ //! @} end member group
1066
+
1067
+ //! @name Inclusive scans
1068
+ //! @{
1069
+
1070
+ //! @rst
1071
+ //! Computes a device-wide inclusive prefix sum.
1072
+ //!
1073
+ //! - Supports non-commutative sum operators.
1074
+ //! - Results are not deterministic for pseudo-associative operators (e.g.,
1075
+ //! addition of floating-point types). Results for pseudo-associative
1076
+ //! operators may vary from run to run. Additional details can be found in
1077
+ //! the @lookback description.
1078
+ //! - When ``d_in`` and ``d_out`` are equal, the scan is performed in-place. The
1079
+ //! range ``[d_in, d_in + num_items)`` and ``[d_out, d_out + num_items)``
1080
+ //! shall not overlap in any other way.
1081
+ //! - @devicestorage
1082
+ //!
1083
+ //! Snippet
1084
+ //! +++++++++++++++++++++++++++++++++++++++++++++
1085
+ //!
1086
+ //! The code snippet below illustrates the inclusive prefix sum of an ``int`` device vector.
1087
+ //!
1088
+ //! .. code-block:: c++
1089
+ //!
1090
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
1091
+ //!
1092
+ //! // Declare, allocate, and initialize device-accessible pointers for
1093
+ //! // input and output
1094
+ //! int num_items; // e.g., 7
1095
+ //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
1096
+ //! int *d_out; // e.g., [ , , , , , , ]
1097
+ //! ...
1098
+ //!
1099
+ //! // Determine temporary device storage requirements for inclusive
1100
+ //! // prefix sum
1101
+ //! void *d_temp_storage = nullptr;
1102
+ //! size_t temp_storage_bytes = 0;
1103
+ //! cub::DeviceScan::InclusiveSum(
1104
+ //! d_temp_storage, temp_storage_bytes,
1105
+ //! d_in, d_out, num_items);
1106
+ //!
1107
+ //! // Allocate temporary storage for inclusive prefix sum
1108
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
1109
+ //!
1110
+ //! // Run inclusive prefix sum
1111
+ //! cub::DeviceScan::InclusiveSum(
1112
+ //! d_temp_storage, temp_storage_bytes,
1113
+ //! d_in, d_out, num_items);
1114
+ //!
1115
+ //! // d_out <-- [8, 14, 21, 26, 29, 29, 38]
1116
+ //!
1117
+ //! @endrst
1118
+ //!
1119
+ //! @tparam InputIteratorT
1120
+ //! **[inferred]** Random-access input iterator type for reading scan inputs @iterator
1121
+ //!
1122
+ //! @tparam OutputIteratorT
1123
+ //! **[inferred]** Random-access output iterator type for writing scan outputs @iterator
1124
+ //!
1125
+ //! @tparam NumItemsT
1126
+ //! **[inferred]** An integral type representing the number of input elements
1127
+ //!
1128
+ //! @param[in] d_temp_storage
1129
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
1130
+ //! required allocation size is written to `temp_storage_bytes` and no work is done.
1131
+ //!
1132
+ //! @param[in,out] temp_storage_bytes
1133
+ //! Reference to size in bytes of `d_temp_storage` allocation
1134
+ //!
1135
+ //! @param[in] d_in
1136
+ //! Random-access iterator to the input sequence of data items
1137
+ //!
1138
+ //! @param[out] d_out
1139
+ //! Random-access iterator to the output sequence of data items
1140
+ //!
1141
+ //! @param[in] num_items
1142
+ //! Total number of input items (i.e., the length of `d_in`)
1143
+ //!
1144
+ //! @param[in] stream
1145
+ //! @rst
1146
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
1147
+ //! @endrst
1148
+ template <typename InputIteratorT, typename OutputIteratorT, typename NumItemsT>
1149
+ CUB_RUNTIME_FUNCTION static cudaError_t InclusiveSum( // out-of-place overload: separate input and output iterators
1150
+ void* d_temp_storage,
1151
+ size_t& temp_storage_bytes,
1152
+ InputIteratorT d_in,
1153
+ OutputIteratorT d_out,
1154
+ NumItemsT num_items,
1155
+ cudaStream_t stream = 0)
1156
+ {
1157
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::InclusiveSum"); // range is conditioned on d_temp_storage; presumably skipped for the nullptr size query - confirm against the macro definition
1158
+
1159
+ // Unsigned integer type for global offsets, selected from the caller's NumItemsT
1160
+ using OffsetT = detail::choose_offset_t<NumItemsT>;
1161
+
1162
+ return DispatchScan<InputIteratorT, OutputIteratorT, ::cuda::std::plus<>, NullType, OffsetT>::Dispatch( // the sum is a scan with ::cuda::std::plus<> as the operator
1163
+ d_temp_storage, temp_storage_bytes, d_in, d_out, ::cuda::std::plus<>{}, NullType{}, num_items, stream); // NullType{} fills the init-value slot: no seed is supplied
1164
+ }
1165
+
1166
+ //! @rst
1167
+ //! Computes a device-wide inclusive prefix sum in-place.
1168
+ //!
1169
+ //! - Supports non-commutative sum operators.
1170
+ //! - Results are not deterministic for pseudo-associative operators (e.g.,
1171
+ //! addition of floating-point types). Results for pseudo-associative
1172
+ //! operators may vary from run to run. Additional details can be found in
1173
+ //! the @lookback description.
1174
+ //! - @devicestorage
1175
+ //!
1176
+ //! Snippet
1177
+ //! +++++++++++++++++++++++++++++++++++++++++++++
1178
+ //!
1179
+ //! The code snippet below illustrates the inclusive prefix sum of an ``int`` device vector.
1180
+ //!
1181
+ //! .. code-block:: c++
1182
+ //!
1183
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
1184
+ //!
1185
+ //! // Declare, allocate, and initialize device-accessible pointers for
1186
+ //! // input and output
1187
+ //! int num_items; // e.g., 7
1188
+ //! int *d_data; // e.g., [8, 6, 7, 5, 3, 0, 9]
1189
+ //! ...
1190
+ //!
1191
+ //! // Determine temporary device storage requirements for inclusive
1192
+ //! // prefix sum
1193
+ //! void *d_temp_storage = nullptr;
1194
+ //! size_t temp_storage_bytes = 0;
1195
+ //! cub::DeviceScan::InclusiveSum(
1196
+ //! d_temp_storage, temp_storage_bytes,
1197
+ //! d_data, num_items);
1198
+ //!
1199
+ //! // Allocate temporary storage for inclusive prefix sum
1200
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
1201
+ //!
1202
+ //! // Run inclusive prefix sum
1203
+ //! cub::DeviceScan::InclusiveSum(
1204
+ //! d_temp_storage, temp_storage_bytes,
1205
+ //! d_data, num_items);
1206
+ //!
1207
+ //! // d_data <-- [8, 14, 21, 26, 29, 29, 38]
1208
+ //!
1209
+ //! @endrst
1210
+ //!
1211
+ //! @tparam IteratorT
1212
+ //! **[inferred]** Random-access input iterator type for reading scan inputs and writing scan outputs
1213
+ //!
1214
+ //! @tparam NumItemsT
1215
+ //! **[inferred]** An integral type representing the number of input elements
1216
+ //!
1217
+ //! @param[in] d_temp_storage
1218
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
1219
+ //! required allocation size is written to `temp_storage_bytes` and no work is done.
1220
+ //!
1221
+ //! @param[in,out] temp_storage_bytes
1222
+ //! Reference to size in bytes of `d_temp_storage` allocation
1223
+ //!
1224
+ //! @param[in,out] d_data
1225
+ //! Random-access iterator to the sequence of data items
1226
+ //!
1227
+ //! @param[in] num_items
1228
+ //! Total number of input items (i.e., the length of `d_data`)
1229
+ //!
1230
+ //! @param[in] stream
1231
+ //! @rst
1232
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
1233
+ //! @endrst
1234
+ template <typename IteratorT, typename NumItemsT>
1235
+ CUB_RUNTIME_FUNCTION static cudaError_t InclusiveSum( // in-place overload: a single iterator is both read and written
1236
+ void* d_temp_storage, size_t& temp_storage_bytes, IteratorT d_data, NumItemsT num_items, cudaStream_t stream = 0)
1237
+ {
1238
+ return InclusiveSum(d_temp_storage, temp_storage_bytes, d_data, d_data, num_items, stream); // forwards to the two-iterator overload with d_data as both d_in and d_out
1239
+ }
1240
+
1241
+ //! @rst
1242
+ //! Computes a device-wide inclusive prefix scan using the specified binary associative ``scan_op`` functor.
1243
+ //!
1244
+ //! - Supports non-commutative scan operators.
1245
+ //! - Results are not deterministic for pseudo-associative operators (e.g.,
1246
+ //! addition of floating-point types). Results for pseudo-associative
1247
+ //! operators may vary from run to run. Additional details can be found in
1248
+ //! the @lookback description.
1249
+ //! - When ``d_in`` and ``d_out`` are equal, the scan is performed in-place. The
1250
+ //! range ``[d_in, d_in + num_items)`` and ``[d_out, d_out + num_items)``
1251
+ //! shall not overlap in any other way.
1252
+ //! - @devicestorage
1253
+ //!
1254
+ //! Snippet
1255
+ //! +++++++++++++++++++++++++++++++++++++++++++++
1256
+ //!
1257
+ //! The code snippet below illustrates the inclusive prefix min-scan of an ``int`` device vector.
1258
+ //!
1259
+ //! .. code-block:: c++
1260
+ //!
1261
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
1262
+ //! #include <cuda/std/climits> // for INT_MAX
1263
+ //!
1264
+ //! // CustomMin functor
1265
+ //! struct CustomMin
1266
+ //! {
1267
+ //! template <typename T>
1268
+ //! __host__ __device__ __forceinline__
1269
+ //! T operator()(const T &a, const T &b) const {
1270
+ //! return (b < a) ? b : a;
1271
+ //! }
1272
+ //! };
1273
+ //!
1274
+ //! // Declare, allocate, and initialize device-accessible pointers for
1275
+ //! // input and output
1276
+ //! int num_items; // e.g., 7
1277
+ //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
1278
+ //! int *d_out; // e.g., [ , , , , , , ]
1279
+ //! CustomMin min_op;
1280
+ //! ...
1281
+ //!
1282
+ //! // Determine temporary device storage requirements for inclusive
1283
+ //! // prefix scan
1284
+ //! void *d_temp_storage = nullptr;
1285
+ //! size_t temp_storage_bytes = 0;
1286
+ //! cub::DeviceScan::InclusiveScan(
1287
+ //! d_temp_storage, temp_storage_bytes,
1288
+ //! d_in, d_out, min_op, num_items);
1289
+ //!
1290
+ //! // Allocate temporary storage for inclusive prefix scan
1291
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
1292
+ //!
1293
+ //! // Run inclusive prefix min-scan
1294
+ //! cub::DeviceScan::InclusiveScan(
1295
+ //! d_temp_storage, temp_storage_bytes,
1296
+ //! d_in, d_out, min_op, num_items);
1297
+ //!
1298
+ //! // d_out <-- [8, 6, 6, 5, 3, 0, 0]
1299
+ //!
1300
+ //! @endrst
1301
+ //!
1302
+ //! @tparam InputIteratorT
1303
+ //! **[inferred]** Random-access input iterator type for reading scan inputs @iterator
1304
+ //!
1305
+ //! @tparam OutputIteratorT
1306
+ //! **[inferred]** Random-access output iterator type for writing scan outputs @iterator
1307
+ //!
1308
+ //! @tparam ScanOpT
1309
+ //! **[inferred]** Binary associative scan functor type having member `T operator()(const T &a, const T &b)`
1310
+ //!
1311
+ //! @tparam NumItemsT
1312
+ //! **[inferred]** An integral type representing the number of input elements
1313
+ //!
1314
+ //! @param[in] d_temp_storage
1315
+ //! Device-accessible allocation of temporary storage.
1316
+ //! When `nullptr`, the required allocation size is written to
1317
+ //! `temp_storage_bytes` and no work is done.
1318
+ //!
1319
+ //! @param[in,out] temp_storage_bytes
1320
+ //! Reference to size in bytes of `d_temp_storage` allocation
1321
+ //!
1322
+ //! @param[in] d_in
1323
+ //! Random-access iterator to the input sequence of data items
1324
+ //!
1325
+ //! @param[out] d_out
1326
+ //! Random-access iterator to the output sequence of data items
1327
+ //!
1328
+ //! @param[in] scan_op
1329
+ //! Binary associative scan functor
1330
+ //!
1331
+ //! @param[in] num_items
1332
+ //! Total number of input items (i.e., the length of `d_in`)
1333
+ //!
1334
+ //! @param[in] stream
1335
+ //! @rst
1336
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
1337
+ //! @endrst
1338
+ template <typename InputIteratorT, typename OutputIteratorT, typename ScanOpT, typename NumItemsT>
1339
+ CUB_RUNTIME_FUNCTION static cudaError_t InclusiveScan( // out-of-place overload: separate input and output iterators
1340
+ void* d_temp_storage,
1341
+ size_t& temp_storage_bytes,
1342
+ InputIteratorT d_in,
1343
+ OutputIteratorT d_out,
1344
+ ScanOpT scan_op,
1345
+ NumItemsT num_items,
1346
+ cudaStream_t stream = 0)
1347
+ {
1348
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::InclusiveScan"); // range is conditioned on d_temp_storage; presumably skipped for the nullptr size query - confirm against the macro definition
1349
+
1350
+ // Unsigned integer type for global offsets, selected from the caller's NumItemsT
1351
+ using OffsetT = detail::choose_offset_t<NumItemsT>;
1352
+
1353
+ return DispatchScan<InputIteratorT, OutputIteratorT, ScanOpT, NullType, OffsetT>::Dispatch( // caller-supplied scan_op replaces the fixed plus<> used by InclusiveSum
1354
+ d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, NullType(), num_items, stream); // NullType() fills the init-value slot: no seed is supplied
1355
+ }
1356
+
1357
+ //! @rst
1358
+ //! Computes a device-wide inclusive prefix scan using the specified binary associative ``scan_op`` functor.
1359
+ //! The result of applying the ``scan_op`` binary operator to ``init_value`` value and ``*d_in``
1360
+ //! is assigned to ``*d_out``.
1361
+ //!
1362
+ //! - Supports non-commutative scan operators.
1363
+ //! - Results are not deterministic for pseudo-associative operators (e.g.,
1364
+ //! addition of floating-point types). Results for pseudo-associative
1365
+ //! operators may vary from run to run. Additional details can be found in
1366
+ //! the @lookback description.
1367
+ //! - When ``d_in`` and ``d_out`` are equal, the scan is performed in-place. The
1368
+ //! range ``[d_in, d_in + num_items)`` and ``[d_out, d_out + num_items)``
1369
+ //! shall not overlap in any other way.
1370
+ //! - @devicestorage
1371
+ //!
1372
+ //! Snippet
1373
+ //! +++++++++++++++++++++++++++++++++++++++++++++
1374
+ //!
1375
+ //! The code snippet below illustrates the inclusive max-scan of an ``int`` device vector.
1376
+ //!
1377
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_scan_api.cu
1378
+ //! :language: c++
1379
+ //! :dedent:
1380
+ //! :start-after: example-begin device-inclusive-scan
1381
+ //! :end-before: example-end device-inclusive-scan
1382
+ //!
1383
+ //! @endrst
1384
+ //!
1385
+ //! @tparam InputIteratorT
1386
+ //! **[inferred]** Random-access input iterator type for reading scan inputs @iterator
1387
+ //!
1388
+ //! @tparam OutputIteratorT
1389
+ //! **[inferred]** Random-access output iterator type for writing scan outputs @iterator
1390
+ //!
1391
+ //! @tparam ScanOpT
1392
+ //! **[inferred]** Binary associative scan functor type having member `T operator()(const T &a, const T &b)`
1393
+ //!
1394
+ //! @tparam InitValueT
1395
+ //! **[inferred]** Type of the `init_value`
1396
+ //!
1397
+ //! @tparam NumItemsT
1398
+ //! **[inferred]** An integral type representing the number of input elements
1399
+ //!
1400
+ //! @param[in] d_temp_storage
1401
+ //! Device-accessible allocation of temporary storage.
1402
+ //! When `nullptr`, the required allocation size is written to
1403
+ //! `temp_storage_bytes` and no work is done.
1404
+ //!
1405
+ //! @param[in,out] temp_storage_bytes
1406
+ //! Reference to the size in bytes of the `d_temp_storage` allocation
1407
+ //!
1408
+ //! @param[in] d_in
1409
+ //! Random-access iterator to the input sequence of data items
1410
+ //!
1411
+ //! @param[out] d_out
1412
+ //! Random-access iterator to the output sequence of data items
1413
+ //!
1414
+ //! @param[in] scan_op
1415
+ //! Binary associative scan functor
1416
+ //!
1417
+ //! @param[in] init_value
1418
+ //! Initial value to seed the inclusive scan (`scan_op(init_value, d_in[0])`
1419
+ //! is assigned to `*d_out`)
1420
+ //!
1421
+ //! @param[in] num_items
1422
+ //! Total number of input items (i.e., the length of `d_in`)
1423
+ //!
1424
+ //! @param[in] stream
1425
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream 0.
1426
+ template <typename InputIteratorT, typename OutputIteratorT, typename ScanOpT, typename InitValueT, typename NumItemsT>
1427
+ CUB_RUNTIME_FUNCTION static cudaError_t InclusiveScanInit( // inclusive scan seeded with a caller-supplied init_value
1428
+ void* d_temp_storage,
1429
+ size_t& temp_storage_bytes,
1430
+ InputIteratorT d_in,
1431
+ OutputIteratorT d_out,
1432
+ ScanOpT scan_op,
1433
+ InitValueT init_value,
1434
+ NumItemsT num_items,
1435
+ cudaStream_t stream = 0)
1436
+ {
1437
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::InclusiveScanInit"); // range is conditioned on d_temp_storage; presumably skipped for the nullptr size query - confirm against the macro definition
1438
+
1439
+ // Unsigned integer type for global offsets, selected from the caller's NumItemsT
1440
+ using OffsetT = detail::choose_offset_t<NumItemsT>;
1441
+ using AccumT = ::cuda::std::__accumulator_t<ScanOpT, cub::detail::it_value_t<InputIteratorT>, InitValueT>; // accumulator type derived from the scan operator, the input value type and the init type
1442
+
1443
+ return DispatchScan<
1444
+ InputIteratorT,
1445
+ OutputIteratorT,
1446
+ ScanOpT,
1447
+ detail::InputValue<InitValueT>, // the seed is passed wrapped in detail::InputValue rather than as a bare InitValueT
1448
+ OffsetT,
1449
+ AccumT,
1450
+ ForceInclusive::Yes>::Dispatch(d_temp_storage, // NOTE(review): ForceInclusive::Yes presumably keeps the scan inclusive although an init value is present - confirm in DispatchScan
1451
+ temp_storage_bytes,
1452
+ d_in,
1453
+ d_out,
1454
+ scan_op,
1455
+ detail::InputValue<InitValueT>(init_value),
1456
+ num_items,
1457
+ stream);
1458
+ }
1459
+
1460
+ //! @rst
1461
+ //! Computes a device-wide inclusive prefix scan in-place using the specified binary associative ``scan_op`` functor.
1462
+ //!
1463
+ //! - Supports non-commutative scan operators.
1464
+ //! - Results are not deterministic for pseudo-associative operators (e.g.,
1465
+ //! addition of floating-point types). Results for pseudo-associative
1466
+ //! operators may vary from run to run. Additional details can be found in
1467
+ //! the @lookback description.
1468
+ //! - @devicestorage
1469
+ //!
1470
+ //! Snippet
1471
+ //! +++++++++++++++++++++++++++++++++++++++++++++
1472
+ //!
1473
+ //! The code snippet below illustrates the inclusive prefix min-scan of an ``int`` device vector.
1474
+ //!
1475
+ //! .. code-block:: c++
1476
+ //!
1477
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
1478
+ //! #include <cuda/std/climits> // for INT_MAX
1479
+ //!
1480
+ //! // CustomMin functor
1481
+ //! struct CustomMin
1482
+ //! {
1483
+ //! template <typename T>
1484
+ //! __host__ __device__ __forceinline__
1485
+ //! T operator()(const T &a, const T &b) const {
1486
+ //! return (b < a) ? b : a;
1487
+ //! }
1488
+ //! };
1489
+ //!
1490
+ //! // Declare, allocate, and initialize device-accessible pointers for
1491
+ //! // input and output
1492
+ //! int num_items; // e.g., 7
1493
+ //! int *d_data; // e.g., [8, 6, 7, 5, 3, 0, 9]
1494
+ //! CustomMin min_op;
1495
+ //! ...
1496
+ //!
1497
+ //! // Determine temporary device storage requirements for inclusive
1498
+ //! // prefix scan
1499
+ //! void *d_temp_storage = nullptr;
1500
+ //! size_t temp_storage_bytes = 0;
1501
+ //! cub::DeviceScan::InclusiveScan(
1502
+ //! d_temp_storage, temp_storage_bytes,
1503
+ //! d_data, min_op, num_items);
1504
+ //!
1505
+ //! // Allocate temporary storage for inclusive prefix scan
1506
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
1507
+ //!
1508
+ //! // Run inclusive prefix min-scan
1509
+ //! cub::DeviceScan::InclusiveScan(
1510
+ //! d_temp_storage, temp_storage_bytes,
1511
+ //! d_data, min_op, num_items);
1512
+ //!
1513
+ //! // d_data <-- [8, 6, 6, 5, 3, 0, 0]
1514
+ //!
1515
+ //! @endrst
1516
+ //!
1517
+ //! @tparam IteratorT
1518
+ //! **[inferred]** Random-access input iterator type for reading scan inputs and writing scan outputs
1519
+ //!
1520
+ //! @tparam ScanOpT
1521
+ //! **[inferred]** Binary associative scan functor type having member `T operator()(const T &a, const T &b)`
1522
+ //!
1523
+ //! @tparam NumItemsT
1524
+ //! **[inferred]** An integral type representing the number of input elements
1525
+ //!
1526
+ //! @param[in] d_temp_storage
1527
+ //! Device-accessible allocation of temporary storage.
1528
+ //! When `nullptr`, the required allocation size is written to
1529
+ //! `temp_storage_bytes` and no work is done.
1530
+ //!
1531
+ //! @param[in,out] temp_storage_bytes
1532
+ //! Reference to size in bytes of `d_temp_storage` allocation
1533
+ //!
1534
+ //! @param[in,out] d_data
1535
+ //! Random-access iterator to the sequence of data items
1536
+ //!
1537
+ //! @param[in] scan_op
1538
+ //! Binary associative scan functor
1539
+ //!
1540
+ //! @param[in] num_items
1541
+ //! Total number of input items (i.e., the length of `d_data`)
1542
+ //!
1543
+ //! @param[in] stream
1544
+ //! @rst
1545
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
1546
+ //! @endrst
1547
+ template <typename IteratorT, typename ScanOpT, typename NumItemsT>
1548
+ CUB_RUNTIME_FUNCTION static cudaError_t InclusiveScan( // in-place overload: a single iterator is both read and written
1549
+ void* d_temp_storage,
1550
+ size_t& temp_storage_bytes,
1551
+ IteratorT d_data,
1552
+ ScanOpT scan_op,
1553
+ NumItemsT num_items,
1554
+ cudaStream_t stream = 0)
1555
+ {
1556
+ return InclusiveScan(d_temp_storage, temp_storage_bytes, d_data, d_data, scan_op, num_items, stream); // forwards to the two-iterator overload with d_data as both d_in and d_out
1557
+ }
1558
+ //! @} end member group
1559
+
1560
+ //! @name Scans by key
1561
+ //! @{
1562
+
1563
+ //! @rst
1564
+ //! Computes a device-wide exclusive prefix sum-by-key with key equality
1565
+ //! defined by ``equality_op``. The value of ``0`` is applied as the initial
1566
+ //! value, and is assigned to the beginning of each segment in ``d_values_out``.
1567
+ //!
1568
+ //! - Supports non-commutative sum operators.
1569
+ //! - Results are not deterministic for pseudo-associative operators (e.g.,
1570
+ //! addition of floating-point types). Results for pseudo-associative
1571
+ //! operators may vary from run to run. Additional details can be found in
1572
+ //! the @lookback description.
1573
+ //! - ``d_keys_in`` may equal ``d_values_out`` but the range
1574
+ //! ``[d_keys_in, d_keys_in + num_items)`` and the range
1575
+ //! ``[d_values_out, d_values_out + num_items)`` shall not overlap otherwise.
1576
+ //! - ``d_values_in`` may equal ``d_values_out`` but the range
1577
+ //! ``[d_values_in, d_values_in + num_items)`` and the range
1578
+ //! ``[d_values_out, d_values_out + num_items)`` shall not overlap otherwise.
1579
+ //! - @devicestorage
1580
+ //!
1581
+ //! Snippet
1582
+ //! +++++++++++++++++++++++++++++++++++++++++++++
1583
+ //!
1584
+ //! The code snippet below illustrates the exclusive prefix sum-by-key of an ``int`` device vector.
1585
+ //!
1586
+ //! .. code-block:: c++
1587
+ //!
1588
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
1589
+ //!
1590
+ //! // Declare, allocate, and initialize device-accessible pointers for
1591
+ //! // input and output
1592
+ //! int num_items; // e.g., 7
1593
+ //! int *d_keys_in; // e.g., [0, 0, 1, 1, 1, 2, 2]
1594
+ //! int *d_values_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
1595
+ //! int *d_values_out; // e.g., [ , , , , , , ]
1596
+ //! ...
1597
+ //!
1598
+ //! // Determine temporary device storage requirements
1599
+ //! void *d_temp_storage = nullptr;
1600
+ //! size_t temp_storage_bytes = 0;
1601
+ //! cub::DeviceScan::ExclusiveSumByKey(
1602
+ //! d_temp_storage, temp_storage_bytes,
1603
+ //! d_keys_in, d_values_in, d_values_out, num_items);
1604
+ //!
1605
+ //! // Allocate temporary storage
1606
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
1607
+ //!
1608
+ //! // Run exclusive prefix sum
1609
+ //! cub::DeviceScan::ExclusiveSumByKey(
1610
+ //! d_temp_storage, temp_storage_bytes,
1611
+ //! d_keys_in, d_values_in, d_values_out, num_items);
1612
+ //!
1613
+ //! // d_values_out <-- [0, 8, 0, 7, 12, 0, 0]
1614
+ //!
1615
+ //! @endrst
1616
+ //!
1617
+ //! @tparam KeysInputIteratorT
1618
+ //! **[inferred]** Random-access input iterator type for reading scan keys inputs @iterator
1619
+ //!
1620
+ //! @tparam ValuesInputIteratorT
1621
+ //! **[inferred]** Random-access input iterator type for reading scan values inputs @iterator
1622
+ //!
1623
+ //! @tparam ValuesOutputIteratorT
1624
+ //! **[inferred]** Random-access output iterator type for writing scan values outputs @iterator
1625
+ //!
1626
+ //! @tparam EqualityOpT
1627
+ //! **[inferred]** Functor type having member
1628
+ //! `bool operator()(const T &a, const T &b)` that defines the equality of keys
1629
+ //!
1630
+ //! @tparam NumItemsT
1631
+ //! **[inferred]** An integral type representing the number of input elements
1632
+ //!
1633
+ //! @param[in] d_temp_storage
1634
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
1635
+ //! required allocation size is written to `temp_storage_bytes` and no work is done.
1636
+ //!
1637
+ //! @param[in,out] temp_storage_bytes
1638
+ //! Reference to size in bytes of `d_temp_storage` allocation
1639
+ //!
1640
+ //! @param[in] d_keys_in
1641
+ //! Random-access input iterator to the input sequence of key items
1642
+ //!
1643
+ //! @param[in] d_values_in
1644
+ //! Random-access input iterator to the input sequence of value items
1645
+ //!
1646
+ //! @param[out] d_values_out
1647
+ //! Random-access output iterator to the output sequence of value items
1648
+ //!
1649
+ //! @param[in] num_items
1650
+ //! Total number of input items (i.e., the length of `d_keys_in` and `d_values_in`)
1651
+ //!
1652
+ //! @param[in] equality_op
1653
+ //! Binary functor that defines the equality of keys.
1654
+ //! Default is cuda::std::equal_to<>{}.
1655
+ //!
1656
+ //! @param[in] stream
1657
+ //! @rst
1658
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
1659
+ //! @endrst
1660
+ template <typename KeysInputIteratorT,
1661
+ typename ValuesInputIteratorT,
1662
+ typename ValuesOutputIteratorT,
1663
+ typename EqualityOpT = ::cuda::std::equal_to<>,
1664
+ typename NumItemsT = uint32_t>
1665
+ CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveSumByKey( // per-segment exclusive sum; segments are runs of keys that compare equal under equality_op
1666
+ void* d_temp_storage,
1667
+ size_t& temp_storage_bytes,
1668
+ KeysInputIteratorT d_keys_in,
1669
+ ValuesInputIteratorT d_values_in,
1670
+ ValuesOutputIteratorT d_values_out,
1671
+ NumItemsT num_items,
1672
+ EqualityOpT equality_op = EqualityOpT(), // default-constructed comparator (::cuda::std::equal_to<> unless EqualityOpT is overridden)
1673
+ cudaStream_t stream = 0)
1674
+ {
1675
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::ExclusiveSumByKey"); // range is conditioned on d_temp_storage; presumably skipped for the nullptr size query - confirm against the macro definition
1676
+
1677
+ // Unsigned integer type for global offsets, selected from the caller's NumItemsT
1678
+ using OffsetT = detail::choose_offset_t<NumItemsT>;
1679
+ using InitT = cub::detail::it_value_t<ValuesInputIteratorT>; // the seed has the value type of the values input iterator
1680
+
1681
+ // Value-initialized seed (zero for arithmetic types); this is the value documented as starting each segment
1682
+ InitT init_value{};
1683
+
1684
+ return DispatchScanByKey<
1685
+ KeysInputIteratorT,
1686
+ ValuesInputIteratorT,
1687
+ ValuesOutputIteratorT,
1688
+ EqualityOpT,
1689
+ ::cuda::std::plus<>, // the sum is a scan-by-key with plus<> as the operator
1690
+ InitT,
1691
+ OffsetT>::Dispatch(d_temp_storage,
1692
+ temp_storage_bytes,
1693
+ d_keys_in,
1694
+ d_values_in,
1695
+ d_values_out,
1696
+ equality_op, // note: the dispatch layer takes equality_op before the scan operator, unlike this function's own parameter order
1697
+ ::cuda::std::plus<>{},
1698
+ init_value,
1699
+ num_items,
1700
+ stream);
1701
+ }
1702
+
1703
+ //! @rst
1704
+ //! Computes a device-wide exclusive prefix scan-by-key using the
1705
+ //! specified binary associative ``scan_op`` functor. The key equality is defined by
1706
+ //! ``equality_op``. The ``init_value`` value is applied as the initial
1707
+ //! value, and is assigned to the beginning of each segment in ``d_values_out``.
1708
+ //!
1709
+ //! - Supports non-commutative scan operators.
1710
+ //! - Results are not deterministic for pseudo-associative operators (e.g.,
1711
+ //! addition of floating-point types). Results for pseudo-associative
1712
+ //! operators may vary from run to run. Additional details can be found in
1713
+ //! the @lookback description.
1714
+ //! - ``d_keys_in`` may equal ``d_values_out`` but the range
1715
+ //! ``[d_keys_in, d_keys_in + num_items)`` and the range
1716
+ //! ``[d_values_out, d_values_out + num_items)`` shall not overlap otherwise.
1717
+ //! - ``d_values_in`` may equal ``d_values_out`` but the range
1718
+ //! ``[d_values_in, d_values_in + num_items)`` and the range
1719
+ //! ``[d_values_out, d_values_out + num_items)`` shall not overlap otherwise.
1720
+ //! - @devicestorage
1721
+ //!
1722
+ //! Snippet
1723
+ //! +++++++++++++++++++++++++++++++++++++++++++++
1724
+ //!
1725
+ //! The code snippet below illustrates the exclusive prefix min-scan-by-key of an ``int`` device vector
1726
+ //!
1727
+ //! .. code-block:: c++
1728
+ //!
1729
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
1730
+ //! #include <cuda/std/climits> // for INT_MAX
1731
+ //!
1732
+ //! // CustomMin functor
1733
+ //! struct CustomMin
1734
+ //! {
1735
+ //! template <typename T>
1736
+ //! __host__ __device__ __forceinline__
1737
+ //! T operator()(const T &a, const T &b) const {
1738
+ //! return (b < a) ? b : a;
1739
+ //! }
1740
+ //! };
1741
+ //!
1742
+ //! // CustomEqual functor
1743
+ //! struct CustomEqual
1744
+ //! {
1745
+ //! template <typename T>
1746
+ //! __host__ __device__ __forceinline__
1747
+ //! bool operator()(const T &a, const T &b) const {
1748
+ //! return a == b;
1749
+ //! }
1750
+ //! };
1751
+ //!
1752
+ //! // Declare, allocate, and initialize device-accessible pointers for
1753
+ //! // input and output
1754
+ //! int num_items; // e.g., 7
1755
+ //! int *d_keys_in; // e.g., [0, 0, 1, 1, 1, 2, 2]
1756
+ //! int *d_values_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
1757
+ //! int *d_values_out; // e.g., [ , , , , , , ]
1758
+ //! CustomMin min_op;
1759
+ //! CustomEqual equality_op;
1760
+ //! ...
1761
+ //!
1762
+ //! // Determine temporary device storage requirements for exclusive
1763
+ //! // prefix scan
1764
+ //! void *d_temp_storage = nullptr;
1765
+ //! size_t temp_storage_bytes = 0;
1766
+ //! cub::DeviceScan::ExclusiveScanByKey(
1767
+ //! d_temp_storage, temp_storage_bytes,
1768
+ //! d_keys_in, d_values_in, d_values_out, min_op,
1769
+ //! (int) INT_MAX, num_items, equality_op);
1770
+ //!
1771
+ //! // Allocate temporary storage for exclusive prefix scan
1772
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
1773
+ //!
1774
+ //! // Run exclusive prefix min-scan
1775
+ //! cub::DeviceScan::ExclusiveScanByKey(
1776
+ //! d_temp_storage, temp_storage_bytes,
1777
+ //! d_keys_in, d_values_in, d_values_out, min_op,
1778
+ //! (int) INT_MAX, num_items, equality_op);
1779
+ //!
1780
+ //! // d_values_out <-- [2147483647, 8, 2147483647, 7, 5, 2147483647, 0]
1781
+ //!
1782
+ //! @endrst
1783
+ //!
1784
+ //! @tparam KeysInputIteratorT
1785
+ //! **[inferred]** Random-access input iterator type for reading scan keys inputs @iterator
1786
+ //!
1787
+ //! @tparam ValuesInputIteratorT
1788
+ //! **[inferred]** Random-access input iterator type for reading scan values inputs @iterator
1789
+ //!
1790
+ //! @tparam ValuesOutputIteratorT
1791
+ //! **[inferred]** Random-access output iterator type for writing scan values outputs @iterator
1792
+ //!
1793
+ //! @tparam ScanOpT
1794
+ //! **[inferred]** Binary associative scan functor type having member `T operator()(const T &a, const T &b)`
1795
+ //!
1796
+ //! @tparam InitValueT
1797
+ //! **[inferred]** Type of the `init_value`
1798
+ //!
1799
+ //! @tparam EqualityOpT
1800
+ //! **[inferred]** Functor type having member
1801
+ //! `bool operator()(const T &a, const T &b)` that defines the equality of keys
1802
+ //!
1803
+ //! @tparam NumItemsT
1804
+ //! **[inferred]** An integral type representing the number of input elements
1805
+ //!
1806
+ //! @param[in] d_temp_storage
1807
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
1808
+ //! required allocation size is written to `temp_storage_bytes` and no work is done.
1809
+ //!
1810
+ //! @param[in,out] temp_storage_bytes
1811
+ //! Reference to size in bytes of `d_temp_storage` allocation
1812
+ //!
1813
+ //! @param[in] d_keys_in
1814
+ //! Random-access input iterator to the input sequence of key items
1815
+ //!
1816
+ //! @param[in] d_values_in
1817
+ //! Random-access input iterator to the input sequence of value items
1818
+ //!
1819
+ //! @param[out] d_values_out
1820
+ //! Random-access output iterator to the output sequence of value items
1821
+ //!
1822
+ //! @param[in] scan_op
1823
+ //! Binary associative scan functor
1824
+ //!
1825
+ //! @param[in] init_value
1826
+ //! Initial value to seed the exclusive scan (and is assigned to the
1827
+ //! beginning of each segment in `d_values_out`)
1828
+ //!
1829
+ //! @param[in] num_items
1830
+ //! Total number of input items (i.e., the length of `d_keys_in` and
1831
+ //! `d_values_in`)
1832
+ //!
1833
+ //! @param[in] equality_op
1834
+ //! Binary functor that defines the equality of keys.
1835
+ //! Default is cuda::std::equal_to<>{}.
1836
+ //!
1837
+ //! @param[in] stream
1838
+ //! @rst
1839
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
1840
+ //! @endrst
1841
+ template <typename KeysInputIteratorT,
1842
+ typename ValuesInputIteratorT,
1843
+ typename ValuesOutputIteratorT,
1844
+ typename ScanOpT,
1845
+ typename InitValueT,
1846
+ typename EqualityOpT = ::cuda::std::equal_to<>,
1847
+ typename NumItemsT = uint32_t>
1848
+ CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScanByKey( // per-segment exclusive scan; segments are runs of keys that compare equal under equality_op
1849
+ void* d_temp_storage,
1850
+ size_t& temp_storage_bytes,
1851
+ KeysInputIteratorT d_keys_in,
1852
+ ValuesInputIteratorT d_values_in,
1853
+ ValuesOutputIteratorT d_values_out,
1854
+ ScanOpT scan_op,
1855
+ InitValueT init_value,
1856
+ NumItemsT num_items,
1857
+ EqualityOpT equality_op = EqualityOpT(), // default-constructed comparator (::cuda::std::equal_to<> unless EqualityOpT is overridden)
1858
+ cudaStream_t stream = 0)
1859
+ {
1860
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::ExclusiveScanByKey"); // range is conditioned on d_temp_storage; presumably skipped for the nullptr size query - confirm against the macro definition
1861
+
1862
+ // Unsigned integer type for global offsets, selected from the caller's NumItemsT
1863
+ using OffsetT = detail::choose_offset_t<NumItemsT>;
1864
+
1865
+ return DispatchScanByKey<
1866
+ KeysInputIteratorT,
1867
+ ValuesInputIteratorT,
1868
+ ValuesOutputIteratorT,
1869
+ EqualityOpT,
1870
+ ScanOpT,
1871
+ InitValueT, // the caller's init type is forwarded unchanged
1872
+ OffsetT>::Dispatch(d_temp_storage,
1873
+ temp_storage_bytes,
1874
+ d_keys_in,
1875
+ d_values_in,
1876
+ d_values_out,
1877
+ equality_op, // note: the dispatch layer takes equality_op before scan_op, unlike this function's own parameter order
1878
+ scan_op,
1879
+ init_value,
1880
+ num_items,
1881
+ stream);
1882
+ }
1883
+
1884
+ //! @rst
1885
+ //! Computes a device-wide inclusive prefix sum-by-key with key equality defined by ``equality_op``.
1886
+ //!
1887
+ //! - Supports non-commutative sum operators.
1888
+ //! - Results are not deterministic for pseudo-associative operators (e.g.,
1889
+ //! addition of floating-point types). Results for pseudo-associative
1890
+ //! operators may vary from run to run. Additional details can be found in
1891
+ //! the @lookback description.
1892
+ //! - ``d_keys_in`` may equal ``d_values_out`` but the range
1893
+ //! ``[d_keys_in, d_keys_in + num_items)`` and the range
1894
+ //! ``[d_values_out, d_values_out + num_items)`` shall not overlap otherwise.
1895
+ //! - ``d_values_in`` may equal ``d_values_out`` but the range
1896
+ //! ``[d_values_in, d_values_in + num_items)`` and the range
1897
+ //! ``[d_values_out, d_values_out + num_items)`` shall not overlap otherwise.
1898
+ //! - @devicestorage
1899
+ //!
1900
+ //! Snippet
1901
+ //! +++++++++++++++++++++++++++++++++++++++++++++
1902
+ //!
1903
+ //! The code snippet below illustrates the inclusive prefix sum-by-key of an ``int`` device vector.
1904
+ //!
1905
+ //! .. code-block:: c++
1906
+ //!
1907
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
1908
+ //!
1909
+ //! // Declare, allocate, and initialize device-accessible pointers for
1910
+ //! // input and output
1911
+ //! int num_items; // e.g., 7
1912
+ //! int *d_keys_in; // e.g., [0, 0, 1, 1, 1, 2, 2]
1913
+ //! int *d_values_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
1914
+ //! int *d_values_out; // e.g., [ , , , , , , ]
1915
+ //! ...
1916
+ //!
1917
+ //! // Determine temporary device storage requirements for inclusive prefix sum
1918
+ //! void *d_temp_storage = nullptr;
1919
+ //! size_t temp_storage_bytes = 0;
1920
+ //! cub::DeviceScan::InclusiveSumByKey(
1921
+ //! d_temp_storage, temp_storage_bytes,
1922
+ //! d_keys_in, d_values_in, d_values_out, num_items);
1923
+ //!
1924
+ //! // Allocate temporary storage for inclusive prefix sum
1925
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
1926
+ //!
1927
+ //! // Run inclusive prefix sum
1928
+ //! cub::DeviceScan::InclusiveSumByKey(
1929
+ //! d_temp_storage, temp_storage_bytes,
1930
+ //! d_keys_in, d_values_in, d_values_out, num_items);
1931
+ //!
1932
+ //! // d_values_out <-- [8, 14, 7, 12, 15, 0, 9]
1933
+ //!
1934
+ //! @endrst
1935
+ //!
1936
+ //! @tparam KeysInputIteratorT
1937
+ //! **[inferred]** Random-access input iterator type for reading scan keys inputs @iterator
1938
+ //!
1939
+ //! @tparam ValuesInputIteratorT
1940
+ //! **[inferred]** Random-access input iterator type for reading scan values inputs @iterator
1941
+ //!
1942
+ //! @tparam ValuesOutputIteratorT
1943
+ //! **[inferred]** Random-access output iterator type for writing scan values outputs @iterator
1944
+ //!
1945
+ //! @tparam EqualityOpT
1946
+ //! **[inferred]** Functor type having member
1947
+ //! `bool operator()(const T &a, const T &b)` that defines the equality of keys
1948
+ //!
1949
+ //! @tparam NumItemsT
1950
+ //! **[inferred]** An integral type representing the number of input elements
1951
+ //!
1952
+ //! @param[in] d_temp_storage
1953
+ //! Device-accessible allocation of temporary storage.
1954
+ //! When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
1955
+ //!
1956
+ //! @param[in,out] temp_storage_bytes
1957
+ //! Reference to size in bytes of `d_temp_storage` allocation
1958
+ //!
1959
+ //! @param[in] d_keys_in
1960
+ //! Random-access input iterator to the input sequence of key items
1961
+ //!
1962
+ //! @param[in] d_values_in
1963
+ //! Random-access input iterator to the input sequence of value items
1964
+ //!
1965
+ //! @param[out] d_values_out
1966
+ //! Random-access output iterator to the output sequence of value items
1967
+ //!
1968
+ //! @param[in] num_items
1969
+ //! Total number of input items (i.e., the length of `d_keys_in` and `d_values_in`)
1970
+ //!
1971
+ //! @param[in] equality_op
1972
+ //! Binary functor that defines the equality of keys.
1973
+ //! Default is cuda::std::equal_to<>{}.
1974
+ //!
1975
+ //! @param[in] stream
1976
+ //! @rst
1977
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
1978
+ //! @endrst
1979
+ template <typename KeysInputIteratorT,
1980
+ typename ValuesInputIteratorT,
1981
+ typename ValuesOutputIteratorT,
1982
+ typename EqualityOpT = ::cuda::std::equal_to<>,
1983
+ typename NumItemsT = uint32_t>
1984
+ CUB_RUNTIME_FUNCTION static cudaError_t InclusiveSumByKey(
1985
+ void* d_temp_storage,
1986
+ size_t& temp_storage_bytes,
1987
+ KeysInputIteratorT d_keys_in,
1988
+ ValuesInputIteratorT d_values_in,
1989
+ ValuesOutputIteratorT d_values_out,
1990
+ NumItemsT num_items,
1991
+ EqualityOpT equality_op = EqualityOpT(),
1992
+ cudaStream_t stream = 0)
1993
+ {
1994
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::InclusiveSumByKey"); // NOTE(review): presumably opens the NVTX range only when d_temp_storage is non-null (i.e. not during the size query) - confirm against the macro definition
1995
+
1996
+ // Unsigned integer type for global offsets, selected from NumItemsT by detail::choose_offset_t
1997
+ using OffsetT = detail::choose_offset_t<NumItemsT>;
1998
+
1999
+ return DispatchScanByKey< // forward to the by-key scan dispatcher; the prefix sum is expressed as a scan with ::cuda::std::plus<>
2000
+ KeysInputIteratorT,
2001
+ ValuesInputIteratorT,
2002
+ ValuesOutputIteratorT,
2003
+ EqualityOpT,
2004
+ ::cuda::std::plus<>, // scan operator type: addition
2005
+ NullType, // NOTE(review): NullType in this slot presumably means "no initial value" (inclusive scan) - confirm against DispatchScanByKey
2006
+ OffsetT>::Dispatch(d_temp_storage,
2007
+ temp_storage_bytes,
2008
+ d_keys_in,
2009
+ d_values_in,
2010
+ d_values_out,
2011
+ equality_op,
2012
+ ::cuda::std::plus<>{}, // scan operator instance matching the template argument above
2013
+ NullType{},
2014
+ num_items,
2015
+ stream);
2016
+ }
2017
+
2018
+ //! @rst
2019
+ //! Computes a device-wide inclusive prefix scan-by-key using the
2020
+ //! specified binary associative ``scan_op`` functor. The key equality is defined by ``equality_op``.
2021
+ //!
2022
+ //! - Supports non-commutative scan operators.
2023
+ //! - Results are not deterministic for pseudo-associative operators (e.g.,
2024
+ //! addition of floating-point types). Results for pseudo-associative
2025
+ //! operators may vary from run to run. Additional details can be found in
2026
+ //! the @lookback description.
2027
+ //! - ``d_keys_in`` may equal ``d_values_out`` but the range
2028
+ //! ``[d_keys_in, d_keys_in + num_items)`` and the range
2029
+ //! ``[d_values_out, d_values_out + num_items)`` shall not overlap otherwise.
2030
+ //! - ``d_values_in`` may equal ``d_values_out`` but the range
2031
+ //! ``[d_values_in, d_values_in + num_items)`` and the range
2032
+ //! ``[d_values_out, d_values_out + num_items)`` shall not overlap otherwise.
2033
+ //! - @devicestorage
2034
+ //!
2035
+ //! Snippet
2036
+ //! +++++++++++++++++++++++++++++++++++++++++++++
2037
+ //!
2038
+ //! The code snippet below illustrates the inclusive prefix min-scan-by-key of an ``int`` device vector.
2039
+ //!
2040
+ //! .. code-block:: c++
2041
+ //!
2042
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
2043
+ //! #include <cuda/std/climits> // for INT_MAX
2044
+ //!
2045
+ //! // CustomMin functor
2046
+ //! struct CustomMin
2047
+ //! {
2048
+ //! template <typename T>
2049
+ //! __host__ __device__ __forceinline__
2050
+ //! T operator()(const T &a, const T &b) const {
2051
+ //! return (b < a) ? b : a;
2052
+ //! }
2053
+ //! };
2054
+ //!
2055
+ //! // CustomEqual functor
2056
+ //! struct CustomEqual
2057
+ //! {
2058
+ //! template <typename T>
2059
+ //! __host__ __device__ __forceinline__
2060
+ //! bool operator()(const T &a, const T &b) const {
2061
+ //! return a == b;
2062
+ //! }
2063
+ //! };
2064
+ //!
2065
+ //! // Declare, allocate, and initialize device-accessible pointers for
2066
+ //! // input and output
2067
+ //! int num_items; // e.g., 7
2068
+ //! int *d_keys_in; // e.g., [0, 0, 1, 1, 1, 2, 2]
2069
+ //! int *d_values_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
2070
+ //! int *d_values_out; // e.g., [ , , , , , , ]
2071
+ //! CustomMin min_op;
2072
+ //! CustomEqual equality_op;
2073
+ //! ...
2074
+ //!
2075
+ //! // Determine temporary device storage requirements for inclusive prefix scan
2076
+ //! void *d_temp_storage = nullptr;
2077
+ //! size_t temp_storage_bytes = 0;
2078
+ //! cub::DeviceScan::InclusiveScanByKey(
2079
+ //! d_temp_storage, temp_storage_bytes,
2080
+ //! d_keys_in, d_values_in, d_values_out, min_op, num_items, equality_op);
2081
+ //!
2082
+ //! // Allocate temporary storage for inclusive prefix scan
2083
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
2084
+ //!
2085
+ //! // Run inclusive prefix min-scan
2086
+ //! cub::DeviceScan::InclusiveScanByKey(
2087
+ //! d_temp_storage, temp_storage_bytes,
2088
+ //! d_keys_in, d_values_in, d_values_out, min_op, num_items, equality_op);
2089
+ //!
2090
+ //! // d_values_out <-- [8, 6, 7, 5, 3, 0, 0]
2091
+ //!
2092
+ //! @endrst
2093
+ //!
2094
+ //! @tparam KeysInputIteratorT
2095
+ //! **[inferred]** Random-access input iterator type for reading scan keys inputs @iterator
2096
+ //!
2097
+ //! @tparam ValuesInputIteratorT
2098
+ //! **[inferred]** Random-access input iterator type for reading scan values inputs @iterator
2099
+ //!
2100
+ //! @tparam ValuesOutputIteratorT
2101
+ //! **[inferred]** Random-access output iterator type for writing scan values outputs @iterator
2102
+ //!
2103
+ //! @tparam ScanOpT
2104
+ //! **[inferred]** Binary associative scan functor type having member `T operator()(const T &a, const T &b)`
2105
+ //!
2106
+ //! @tparam EqualityOpT
2107
+ //! **[inferred]** Functor type having member
2108
+ //! `bool operator()(const T &a, const T &b)`, a binary predicate that defines the equality of keys
2109
+ //!
2110
+ //! @tparam NumItemsT
2111
+ //! **[inferred]** An integral type representing the number of input elements
2112
+ //!
2113
+ //! @param[in] d_temp_storage
2114
+ //! Device-accessible allocation of temporary storage.
2115
+ //! When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
2116
+ //!
2117
+ //! @param[in,out] temp_storage_bytes
2118
+ //! Reference to size in bytes of `d_temp_storage` allocation
2119
+ //!
2120
+ //! @param[in] d_keys_in
2121
+ //! Random-access input iterator to the input sequence of key items
2122
+ //!
2123
+ //! @param[in] d_values_in
2124
+ //! Random-access input iterator to the input sequence of value items
2125
+ //!
2126
+ //! @param[out] d_values_out
2127
+ //! Random-access output iterator to the output sequence of value items
2128
+ //!
2129
+ //! @param[in] scan_op
2130
+ //! Binary associative scan functor
2131
+ //!
2132
+ //! @param[in] num_items
2133
+ //! Total number of input items (i.e., the length of `d_keys_in` and `d_values_in`)
2134
+ //!
2135
+ //! @param[in] equality_op
2136
+ //! Binary functor that defines the equality of keys.
2137
+ //! Default is cuda::std::equal_to<>{}.
2138
+ //!
2139
+ //! @param[in] stream
2140
+ //! @rst
2141
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
2142
+ //! @endrst
2143
+ template <typename KeysInputIteratorT,
2144
+ typename ValuesInputIteratorT,
2145
+ typename ValuesOutputIteratorT,
2146
+ typename ScanOpT,
2147
+ typename EqualityOpT = ::cuda::std::equal_to<>,
2148
+ typename NumItemsT = uint32_t>
2149
+ CUB_RUNTIME_FUNCTION static cudaError_t InclusiveScanByKey(
2150
+ void* d_temp_storage,
2151
+ size_t& temp_storage_bytes,
2152
+ KeysInputIteratorT d_keys_in,
2153
+ ValuesInputIteratorT d_values_in,
2154
+ ValuesOutputIteratorT d_values_out,
2155
+ ScanOpT scan_op,
2156
+ NumItemsT num_items,
2157
+ EqualityOpT equality_op = EqualityOpT(),
2158
+ cudaStream_t stream = 0)
2159
+ {
2160
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::InclusiveScanByKey"); // NOTE(review): presumably opens the NVTX range only when d_temp_storage is non-null (i.e. not during the size query) - confirm against the macro definition
2161
+
2162
+ // Unsigned integer type for global offsets, selected from NumItemsT by detail::choose_offset_t
2163
+ using OffsetT = detail::choose_offset_t<NumItemsT>;
2164
+
2165
+ return DispatchScanByKey< // forward to the by-key scan dispatcher with the caller-supplied scan operator
2166
+ KeysInputIteratorT,
2167
+ ValuesInputIteratorT,
2168
+ ValuesOutputIteratorT,
2169
+ EqualityOpT,
2170
+ ScanOpT,
2171
+ NullType, // NOTE(review): NullType in this slot presumably means "no initial value" (inclusive scan) - confirm against DispatchScanByKey
2172
+ OffsetT>::Dispatch(d_temp_storage,
2173
+ temp_storage_bytes,
2174
+ d_keys_in,
2175
+ d_values_in,
2176
+ d_values_out,
2177
+ equality_op,
2178
+ scan_op,
2179
+ NullType(),
2180
+ num_items,
2181
+ stream);
2182
+ }
2183
+
2184
+ //! @} end member group
2185
+ };
2186
+
2187
+ CUB_NAMESPACE_END