cuda-cccl 0.3.4__cp311-cp311-manylinux_2_24_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cuda-cccl might be problematic. Click here for more details.

Files changed (1926):
  1. cuda/cccl/__init__.py +27 -0
  2. cuda/cccl/_cuda_version_utils.py +24 -0
  3. cuda/cccl/cooperative/__init__.py +9 -0
  4. cuda/cccl/cooperative/experimental/__init__.py +24 -0
  5. cuda/cccl/headers/__init__.py +7 -0
  6. cuda/cccl/headers/include/__init__.py +1 -0
  7. cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +233 -0
  8. cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +1158 -0
  9. cuda/cccl/headers/include/cub/agent/agent_for.cuh +55 -0
  10. cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +677 -0
  11. cuda/cccl/headers/include/cub/agent/agent_merge.cuh +234 -0
  12. cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +722 -0
  13. cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +761 -0
  14. cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +282 -0
  15. cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +702 -0
  16. cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +552 -0
  17. cuda/cccl/headers/include/cub/agent/agent_reduce.cuh +592 -0
  18. cuda/cccl/headers/include/cub/agent/agent_reduce_by_key.cuh +780 -0
  19. cuda/cccl/headers/include/cub/agent/agent_rle.cuh +1095 -0
  20. cuda/cccl/headers/include/cub/agent/agent_scan.cuh +562 -0
  21. cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +448 -0
  22. cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +263 -0
  23. cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +1088 -0
  24. cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +320 -0
  25. cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +584 -0
  26. cuda/cccl/headers/include/cub/agent/agent_topk.cuh +762 -0
  27. cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +605 -0
  28. cuda/cccl/headers/include/cub/agent/single_pass_scan_operators.cuh +1399 -0
  29. cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +939 -0
  30. cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +1203 -0
  31. cuda/cccl/headers/include/cub/block/block_exchange.cuh +1279 -0
  32. cuda/cccl/headers/include/cub/block/block_histogram.cuh +400 -0
  33. cuda/cccl/headers/include/cub/block/block_load.cuh +1242 -0
  34. cuda/cccl/headers/include/cub/block/block_load_to_shared.cuh +416 -0
  35. cuda/cccl/headers/include/cub/block/block_merge_sort.cuh +771 -0
  36. cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +1203 -0
  37. cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +2132 -0
  38. cuda/cccl/headers/include/cub/block/block_raking_layout.cuh +126 -0
  39. cuda/cccl/headers/include/cub/block/block_reduce.cuh +642 -0
  40. cuda/cccl/headers/include/cub/block/block_run_length_decode.cuh +406 -0
  41. cuda/cccl/headers/include/cub/block/block_scan.cuh +2287 -0
  42. cuda/cccl/headers/include/cub/block/block_shuffle.cuh +322 -0
  43. cuda/cccl/headers/include/cub/block/block_store.cuh +1223 -0
  44. cuda/cccl/headers/include/cub/block/radix_rank_sort_operations.cuh +597 -0
  45. cuda/cccl/headers/include/cub/block/specializations/block_histogram_atomic.cuh +62 -0
  46. cuda/cccl/headers/include/cub/block/specializations/block_histogram_sort.cuh +216 -0
  47. cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking.cuh +230 -0
  48. cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking_commutative_only.cuh +214 -0
  49. cuda/cccl/headers/include/cub/block/specializations/block_reduce_warp_reductions.cuh +257 -0
  50. cuda/cccl/headers/include/cub/block/specializations/block_scan_raking.cuh +766 -0
  51. cuda/cccl/headers/include/cub/block/specializations/block_scan_warp_scans.cuh +514 -0
  52. cuda/cccl/headers/include/cub/config.cuh +29 -0
  53. cuda/cccl/headers/include/cub/cub.cuh +96 -0
  54. cuda/cccl/headers/include/cub/detail/array_utils.cuh +54 -0
  55. cuda/cccl/headers/include/cub/detail/choose_offset.cuh +135 -0
  56. cuda/cccl/headers/include/cub/detail/detect_cuda_runtime.cuh +50 -0
  57. cuda/cccl/headers/include/cub/detail/device_double_buffer.cuh +94 -0
  58. cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +60 -0
  59. cuda/cccl/headers/include/cub/detail/fast_modulo_division.cuh +227 -0
  60. cuda/cccl/headers/include/cub/detail/integer_utils.cuh +86 -0
  61. cuda/cccl/headers/include/cub/detail/launcher/cuda_driver.cuh +140 -0
  62. cuda/cccl/headers/include/cub/detail/launcher/cuda_runtime.cuh +98 -0
  63. cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +112 -0
  64. cuda/cccl/headers/include/cub/detail/ptx-json/README.md +66 -0
  65. cuda/cccl/headers/include/cub/detail/ptx-json/array.h +41 -0
  66. cuda/cccl/headers/include/cub/detail/ptx-json/json.h +39 -0
  67. cuda/cccl/headers/include/cub/detail/ptx-json/object.h +71 -0
  68. cuda/cccl/headers/include/cub/detail/ptx-json/string.h +79 -0
  69. cuda/cccl/headers/include/cub/detail/ptx-json/value.h +95 -0
  70. cuda/cccl/headers/include/cub/detail/ptx-json-parser.h +39 -0
  71. cuda/cccl/headers/include/cub/detail/rfa.cuh +706 -0
  72. cuda/cccl/headers/include/cub/detail/strong_load.cuh +163 -0
  73. cuda/cccl/headers/include/cub/detail/strong_store.cuh +194 -0
  74. cuda/cccl/headers/include/cub/detail/temporary_storage.cuh +377 -0
  75. cuda/cccl/headers/include/cub/detail/type_traits.cuh +185 -0
  76. cuda/cccl/headers/include/cub/detail/uninitialized_copy.cuh +48 -0
  77. cuda/cccl/headers/include/cub/detail/unsafe_bitcast.cuh +33 -0
  78. cuda/cccl/headers/include/cub/device/device_adjacent_difference.cuh +572 -0
  79. cuda/cccl/headers/include/cub/device/device_copy.cuh +276 -0
  80. cuda/cccl/headers/include/cub/device/device_for.cuh +1061 -0
  81. cuda/cccl/headers/include/cub/device/device_histogram.cuh +1485 -0
  82. cuda/cccl/headers/include/cub/device/device_memcpy.cuh +171 -0
  83. cuda/cccl/headers/include/cub/device/device_merge.cuh +203 -0
  84. cuda/cccl/headers/include/cub/device/device_merge_sort.cuh +955 -0
  85. cuda/cccl/headers/include/cub/device/device_partition.cuh +644 -0
  86. cuda/cccl/headers/include/cub/device/device_radix_sort.cuh +3413 -0
  87. cuda/cccl/headers/include/cub/device/device_reduce.cuh +2497 -0
  88. cuda/cccl/headers/include/cub/device/device_run_length_encode.cuh +346 -0
  89. cuda/cccl/headers/include/cub/device/device_scan.cuh +2187 -0
  90. cuda/cccl/headers/include/cub/device/device_segmented_radix_sort.cuh +1472 -0
  91. cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +1406 -0
  92. cuda/cccl/headers/include/cub/device/device_segmented_sort.cuh +2787 -0
  93. cuda/cccl/headers/include/cub/device/device_select.cuh +1204 -0
  94. cuda/cccl/headers/include/cub/device/device_topk.cuh +511 -0
  95. cuda/cccl/headers/include/cub/device/device_transform.cuh +668 -0
  96. cuda/cccl/headers/include/cub/device/dispatch/dispatch_adjacent_difference.cuh +289 -0
  97. cuda/cccl/headers/include/cub/device/dispatch/dispatch_batch_memcpy.cuh +694 -0
  98. cuda/cccl/headers/include/cub/device/dispatch/dispatch_common.cuh +43 -0
  99. cuda/cccl/headers/include/cub/device/dispatch/dispatch_copy_mdspan.cuh +77 -0
  100. cuda/cccl/headers/include/cub/device/dispatch/dispatch_for.cuh +172 -0
  101. cuda/cccl/headers/include/cub/device/dispatch/dispatch_histogram.cuh +1026 -0
  102. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +303 -0
  103. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge_sort.cuh +449 -0
  104. cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +1719 -0
  105. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +1283 -0
  106. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_by_key.cuh +629 -0
  107. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +504 -0
  108. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +312 -0
  109. cuda/cccl/headers/include/cub/device/dispatch/dispatch_rle.cuh +603 -0
  110. cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan.cuh +491 -0
  111. cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan_by_key.cuh +577 -0
  112. cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +951 -0
  113. cuda/cccl/headers/include/cub/device/dispatch/dispatch_select_if.cuh +818 -0
  114. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +339 -0
  115. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +455 -0
  116. cuda/cccl/headers/include/cub/device/dispatch/dispatch_three_way_partition.cuh +364 -0
  117. cuda/cccl/headers/include/cub/device/dispatch/dispatch_topk.cuh +626 -0
  118. cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +541 -0
  119. cuda/cccl/headers/include/cub/device/dispatch/dispatch_unique_by_key.cuh +521 -0
  120. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_for_each.cuh +259 -0
  121. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_histogram.cuh +497 -0
  122. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_merge_sort.cuh +332 -0
  123. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_radix_sort.cuh +801 -0
  124. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_reduce.cuh +557 -0
  125. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_scan.cuh +163 -0
  126. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_segmented_reduce.cuh +295 -0
  127. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_segmented_sort.cuh +521 -0
  128. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_three_way_partition.cuh +200 -0
  129. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_transform.cuh +1028 -0
  130. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_unique_by_key.cuh +175 -0
  131. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +43 -0
  132. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +94 -0
  133. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +34 -0
  134. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +255 -0
  135. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +52 -0
  136. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +100 -0
  137. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +1063 -0
  138. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce.cuh +468 -0
  139. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +918 -0
  140. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +647 -0
  141. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +594 -0
  142. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +986 -0
  143. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +373 -0
  144. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_select_if.cuh +1563 -0
  145. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +415 -0
  146. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_topk.cuh +84 -0
  147. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +456 -0
  148. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +858 -0
  149. cuda/cccl/headers/include/cub/grid/grid_even_share.cuh +203 -0
  150. cuda/cccl/headers/include/cub/grid/grid_mapping.cuh +82 -0
  151. cuda/cccl/headers/include/cub/grid/grid_queue.cuh +178 -0
  152. cuda/cccl/headers/include/cub/iterator/arg_index_input_iterator.cuh +230 -0
  153. cuda/cccl/headers/include/cub/iterator/cache_modified_input_iterator.cuh +235 -0
  154. cuda/cccl/headers/include/cub/iterator/cache_modified_output_iterator.cuh +226 -0
  155. cuda/cccl/headers/include/cub/iterator/tex_obj_input_iterator.cuh +296 -0
  156. cuda/cccl/headers/include/cub/thread/thread_load.cuh +324 -0
  157. cuda/cccl/headers/include/cub/thread/thread_operators.cuh +664 -0
  158. cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +525 -0
  159. cuda/cccl/headers/include/cub/thread/thread_scan.cuh +472 -0
  160. cuda/cccl/headers/include/cub/thread/thread_search.cuh +175 -0
  161. cuda/cccl/headers/include/cub/thread/thread_simd.cuh +456 -0
  162. cuda/cccl/headers/include/cub/thread/thread_sort.cuh +78 -0
  163. cuda/cccl/headers/include/cub/thread/thread_store.cuh +341 -0
  164. cuda/cccl/headers/include/cub/util_allocator.cuh +897 -0
  165. cuda/cccl/headers/include/cub/util_arch.cuh +141 -0
  166. cuda/cccl/headers/include/cub/util_cpp_dialect.cuh +71 -0
  167. cuda/cccl/headers/include/cub/util_debug.cuh +183 -0
  168. cuda/cccl/headers/include/cub/util_device.cuh +759 -0
  169. cuda/cccl/headers/include/cub/util_macro.cuh +73 -0
  170. cuda/cccl/headers/include/cub/util_math.cuh +92 -0
  171. cuda/cccl/headers/include/cub/util_namespace.cuh +151 -0
  172. cuda/cccl/headers/include/cub/util_policy_wrapper_t.cuh +31 -0
  173. cuda/cccl/headers/include/cub/util_ptx.cuh +489 -0
  174. cuda/cccl/headers/include/cub/util_temporary_storage.cuh +96 -0
  175. cuda/cccl/headers/include/cub/util_type.cuh +1093 -0
  176. cuda/cccl/headers/include/cub/util_vsmem.cuh +227 -0
  177. cuda/cccl/headers/include/cub/version.cuh +65 -0
  178. cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_shfl.cuh +304 -0
  179. cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_smem.cuh +152 -0
  180. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +713 -0
  181. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +378 -0
  182. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +928 -0
  183. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_smem.cuh +691 -0
  184. cuda/cccl/headers/include/cub/warp/warp_exchange.cuh +381 -0
  185. cuda/cccl/headers/include/cub/warp/warp_load.cuh +591 -0
  186. cuda/cccl/headers/include/cub/warp/warp_merge_sort.cuh +145 -0
  187. cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +810 -0
  188. cuda/cccl/headers/include/cub/warp/warp_scan.cuh +1866 -0
  189. cuda/cccl/headers/include/cub/warp/warp_store.cuh +498 -0
  190. cuda/cccl/headers/include/cub/warp/warp_utils.cuh +59 -0
  191. cuda/cccl/headers/include/cuda/__algorithm/common.h +68 -0
  192. cuda/cccl/headers/include/cuda/__algorithm/copy.h +196 -0
  193. cuda/cccl/headers/include/cuda/__algorithm/fill.h +107 -0
  194. cuda/cccl/headers/include/cuda/__annotated_ptr/access_property.h +165 -0
  195. cuda/cccl/headers/include/cuda/__annotated_ptr/access_property_encoding.h +172 -0
  196. cuda/cccl/headers/include/cuda/__annotated_ptr/annotated_ptr.h +217 -0
  197. cuda/cccl/headers/include/cuda/__annotated_ptr/annotated_ptr_base.h +100 -0
  198. cuda/cccl/headers/include/cuda/__annotated_ptr/apply_access_property.h +83 -0
  199. cuda/cccl/headers/include/cuda/__annotated_ptr/associate_access_property.h +128 -0
  200. cuda/cccl/headers/include/cuda/__annotated_ptr/createpolicy.h +210 -0
  201. cuda/cccl/headers/include/cuda/__atomic/atomic.h +145 -0
  202. cuda/cccl/headers/include/cuda/__barrier/async_contract_fulfillment.h +39 -0
  203. cuda/cccl/headers/include/cuda/__barrier/barrier.h +65 -0
  204. cuda/cccl/headers/include/cuda/__barrier/barrier_arrive_tx.h +101 -0
  205. cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +529 -0
  206. cuda/cccl/headers/include/cuda/__barrier/barrier_expect_tx.h +73 -0
  207. cuda/cccl/headers/include/cuda/__barrier/barrier_thread_scope.h +60 -0
  208. cuda/cccl/headers/include/cuda/__bit/bit_reverse.h +171 -0
  209. cuda/cccl/headers/include/cuda/__bit/bitfield.h +122 -0
  210. cuda/cccl/headers/include/cuda/__bit/bitmask.h +90 -0
  211. cuda/cccl/headers/include/cuda/__cccl_config +37 -0
  212. cuda/cccl/headers/include/cuda/__cmath/ceil_div.h +124 -0
  213. cuda/cccl/headers/include/cuda/__cmath/fast_modulo_division.h +178 -0
  214. cuda/cccl/headers/include/cuda/__cmath/ilog.h +195 -0
  215. cuda/cccl/headers/include/cuda/__cmath/ipow.h +107 -0
  216. cuda/cccl/headers/include/cuda/__cmath/isqrt.h +80 -0
  217. cuda/cccl/headers/include/cuda/__cmath/mul_hi.h +146 -0
  218. cuda/cccl/headers/include/cuda/__cmath/neg.h +47 -0
  219. cuda/cccl/headers/include/cuda/__cmath/pow2.h +74 -0
  220. cuda/cccl/headers/include/cuda/__cmath/round_down.h +102 -0
  221. cuda/cccl/headers/include/cuda/__cmath/round_up.h +104 -0
  222. cuda/cccl/headers/include/cuda/__cmath/uabs.h +57 -0
  223. cuda/cccl/headers/include/cuda/__complex/complex.h +238 -0
  224. cuda/cccl/headers/include/cuda/__complex/get_real_imag.h +89 -0
  225. cuda/cccl/headers/include/cuda/__complex/traits.h +64 -0
  226. cuda/cccl/headers/include/cuda/__complex_ +28 -0
  227. cuda/cccl/headers/include/cuda/__device/all_devices.h +140 -0
  228. cuda/cccl/headers/include/cuda/__device/arch_id.h +176 -0
  229. cuda/cccl/headers/include/cuda/__device/arch_traits.h +537 -0
  230. cuda/cccl/headers/include/cuda/__device/attributes.h +772 -0
  231. cuda/cccl/headers/include/cuda/__device/compute_capability.h +171 -0
  232. cuda/cccl/headers/include/cuda/__device/device_ref.h +156 -0
  233. cuda/cccl/headers/include/cuda/__device/physical_device.h +172 -0
  234. cuda/cccl/headers/include/cuda/__driver/driver_api.h +848 -0
  235. cuda/cccl/headers/include/cuda/__event/event.h +171 -0
  236. cuda/cccl/headers/include/cuda/__event/event_ref.h +157 -0
  237. cuda/cccl/headers/include/cuda/__event/timed_event.h +120 -0
  238. cuda/cccl/headers/include/cuda/__execution/determinism.h +89 -0
  239. cuda/cccl/headers/include/cuda/__execution/output_ordering.h +87 -0
  240. cuda/cccl/headers/include/cuda/__execution/require.h +75 -0
  241. cuda/cccl/headers/include/cuda/__execution/tune.h +70 -0
  242. cuda/cccl/headers/include/cuda/__functional/address_stability.h +131 -0
  243. cuda/cccl/headers/include/cuda/__functional/for_each_canceled.h +321 -0
  244. cuda/cccl/headers/include/cuda/__functional/maximum.h +76 -0
  245. cuda/cccl/headers/include/cuda/__functional/minimum.h +76 -0
  246. cuda/cccl/headers/include/cuda/__functional/minimum_maximum_common.h +52 -0
  247. cuda/cccl/headers/include/cuda/__functional/proclaim_return_type.h +106 -0
  248. cuda/cccl/headers/include/cuda/__fwd/barrier.h +38 -0
  249. cuda/cccl/headers/include/cuda/__fwd/complex.h +48 -0
  250. cuda/cccl/headers/include/cuda/__fwd/devices.h +44 -0
  251. cuda/cccl/headers/include/cuda/__fwd/get_stream.h +38 -0
  252. cuda/cccl/headers/include/cuda/__fwd/pipeline.h +37 -0
  253. cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +58 -0
  254. cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +315 -0
  255. cuda/cccl/headers/include/cuda/__iterator/counting_iterator.h +492 -0
  256. cuda/cccl/headers/include/cuda/__iterator/discard_iterator.h +324 -0
  257. cuda/cccl/headers/include/cuda/__iterator/permutation_iterator.h +456 -0
  258. cuda/cccl/headers/include/cuda/__iterator/shuffle_iterator.h +334 -0
  259. cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +418 -0
  260. cuda/cccl/headers/include/cuda/__iterator/tabulate_output_iterator.h +367 -0
  261. cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +528 -0
  262. cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +527 -0
  263. cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +486 -0
  264. cuda/cccl/headers/include/cuda/__iterator/zip_common.h +148 -0
  265. cuda/cccl/headers/include/cuda/__iterator/zip_function.h +114 -0
  266. cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +557 -0
  267. cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +592 -0
  268. cuda/cccl/headers/include/cuda/__latch/latch.h +44 -0
  269. cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +532 -0
  270. cuda/cccl/headers/include/cuda/__mdspan/host_device_mdspan.h +238 -0
  271. cuda/cccl/headers/include/cuda/__mdspan/restrict_accessor.h +152 -0
  272. cuda/cccl/headers/include/cuda/__mdspan/restrict_mdspan.h +117 -0
  273. cuda/cccl/headers/include/cuda/__memcpy_async/check_preconditions.h +79 -0
  274. cuda/cccl/headers/include/cuda/__memcpy_async/completion_mechanism.h +47 -0
  275. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_bulk_shared_global.h +81 -0
  276. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_fallback.h +72 -0
  277. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_shared_global.h +148 -0
  278. cuda/cccl/headers/include/cuda/__memcpy_async/dispatch_memcpy_async.h +165 -0
  279. cuda/cccl/headers/include/cuda/__memcpy_async/elect_one.h +52 -0
  280. cuda/cccl/headers/include/cuda/__memcpy_async/is_local_smem_barrier.h +53 -0
  281. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async.h +179 -0
  282. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_barrier.h +99 -0
  283. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_tx.h +103 -0
  284. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_completion.h +164 -0
  285. cuda/cccl/headers/include/cuda/__memcpy_async/try_get_barrier_handle.h +58 -0
  286. cuda/cccl/headers/include/cuda/__memory/address_space.h +227 -0
  287. cuda/cccl/headers/include/cuda/__memory/align_down.h +56 -0
  288. cuda/cccl/headers/include/cuda/__memory/align_up.h +56 -0
  289. cuda/cccl/headers/include/cuda/__memory/aligned_size.h +61 -0
  290. cuda/cccl/headers/include/cuda/__memory/check_address.h +111 -0
  291. cuda/cccl/headers/include/cuda/__memory/discard_memory.h +64 -0
  292. cuda/cccl/headers/include/cuda/__memory/get_device_address.h +58 -0
  293. cuda/cccl/headers/include/cuda/__memory/is_aligned.h +47 -0
  294. cuda/cccl/headers/include/cuda/__memory/ptr_in_range.h +93 -0
  295. cuda/cccl/headers/include/cuda/__memory/ptr_rebind.h +75 -0
  296. cuda/cccl/headers/include/cuda/__memory/ranges_overlap.h +126 -0
  297. cuda/cccl/headers/include/cuda/__memory_resource/any_resource.h +898 -0
  298. cuda/cccl/headers/include/cuda/__memory_resource/device_memory_pool.h +149 -0
  299. cuda/cccl/headers/include/cuda/__memory_resource/get_memory_resource.h +82 -0
  300. cuda/cccl/headers/include/cuda/__memory_resource/get_property.h +153 -0
  301. cuda/cccl/headers/include/cuda/__memory_resource/legacy_managed_memory_resource.h +148 -0
  302. cuda/cccl/headers/include/cuda/__memory_resource/legacy_pinned_memory_resource.h +139 -0
  303. cuda/cccl/headers/include/cuda/__memory_resource/managed_memory_pool.h +146 -0
  304. cuda/cccl/headers/include/cuda/__memory_resource/memory_resource_base.h +578 -0
  305. cuda/cccl/headers/include/cuda/__memory_resource/pinned_memory_pool.h +188 -0
  306. cuda/cccl/headers/include/cuda/__memory_resource/properties.h +113 -0
  307. cuda/cccl/headers/include/cuda/__memory_resource/resource.h +159 -0
  308. cuda/cccl/headers/include/cuda/__numeric/add_overflow.h +316 -0
  309. cuda/cccl/headers/include/cuda/__numeric/div_overflow.h +150 -0
  310. cuda/cccl/headers/include/cuda/__numeric/narrow.h +108 -0
  311. cuda/cccl/headers/include/cuda/__numeric/overflow_cast.h +59 -0
  312. cuda/cccl/headers/include/cuda/__numeric/overflow_result.h +43 -0
  313. cuda/cccl/headers/include/cuda/__numeric/sub_overflow.h +344 -0
  314. cuda/cccl/headers/include/cuda/__nvtx/nvtx.h +120 -0
  315. cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +2977 -0
  316. cuda/cccl/headers/include/cuda/__ptx/instructions/barrier_cluster.h +43 -0
  317. cuda/cccl/headers/include/cuda/__ptx/instructions/bfind.h +41 -0
  318. cuda/cccl/headers/include/cuda/__ptx/instructions/bmsk.h +41 -0
  319. cuda/cccl/headers/include/cuda/__ptx/instructions/clusterlaunchcontrol.h +41 -0
  320. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk.h +44 -0
  321. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_commit_group.h +43 -0
  322. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h +45 -0
  323. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_wait_group.h +43 -0
  324. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_mbarrier_arrive.h +42 -0
  325. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_reduce_async_bulk.h +60 -0
  326. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_reduce_async_bulk_tensor.h +43 -0
  327. cuda/cccl/headers/include/cuda/__ptx/instructions/elect_sync.h +41 -0
  328. cuda/cccl/headers/include/cuda/__ptx/instructions/exit.h +41 -0
  329. cuda/cccl/headers/include/cuda/__ptx/instructions/fence.h +49 -0
  330. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/barrier_cluster.h +115 -0
  331. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/bfind.h +190 -0
  332. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/bmsk.h +54 -0
  333. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/clusterlaunchcontrol.h +242 -0
  334. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk.h +197 -0
  335. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.h +25 -0
  336. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.h +54 -0
  337. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.h +997 -0
  338. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_gather_scatter.h +318 -0
  339. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.h +671 -0
  340. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.h +46 -0
  341. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive.h +26 -0
  342. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive_noinc.h +26 -0
  343. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.h +1470 -0
  344. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.h +132 -0
  345. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.h +132 -0
  346. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.h +601 -0
  347. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/elect_sync.h +36 -0
  348. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/exit.h +25 -0
  349. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence.h +208 -0
  350. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.h +31 -0
  351. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_alias.h +25 -0
  352. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_async.h +58 -0
  353. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_async_generic_sync_restrict.h +64 -0
  354. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.h +102 -0
  355. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_sync_restrict.h +64 -0
  356. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/get_sreg.h +949 -0
  357. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/getctarank.h +32 -0
  358. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/ld.h +5542 -0
  359. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive.h +399 -0
  360. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.h +184 -0
  361. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.h +34 -0
  362. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_expect_tx.h +102 -0
  363. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_init.h +27 -0
  364. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.h +143 -0
  365. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.h +144 -0
  366. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.h +286 -0
  367. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.h +290 -0
  368. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_ld_reduce.h +2202 -0
  369. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_red.h +1362 -0
  370. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_st.h +236 -0
  371. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/prmt.h +230 -0
  372. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/red_async.h +460 -0
  373. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/shl.h +96 -0
  374. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/shr.h +168 -0
  375. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st.h +1490 -0
  376. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st_async.h +123 -0
  377. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st_bulk.h +31 -0
  378. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_alloc.h +132 -0
  379. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_commit.h +99 -0
  380. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_cp.h +765 -0
  381. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_fence.h +58 -0
  382. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_ld.h +4927 -0
  383. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_mma.h +4291 -0
  384. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_mma_ws.h +7110 -0
  385. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_shift.h +42 -0
  386. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_st.h +5063 -0
  387. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_wait.h +56 -0
  388. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.h +71 -0
  389. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tensormap_replace.h +1030 -0
  390. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/trap.h +25 -0
  391. cuda/cccl/headers/include/cuda/__ptx/instructions/get_sreg.h +43 -0
  392. cuda/cccl/headers/include/cuda/__ptx/instructions/getctarank.h +43 -0
  393. cuda/cccl/headers/include/cuda/__ptx/instructions/ld.h +41 -0
  394. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_arrive.h +45 -0
  395. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_expect_tx.h +41 -0
  396. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_init.h +43 -0
  397. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_wait.h +46 -0
  398. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_ld_reduce.h +41 -0
  399. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_red.h +41 -0
  400. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_st.h +41 -0
  401. cuda/cccl/headers/include/cuda/__ptx/instructions/prmt.h +41 -0
  402. cuda/cccl/headers/include/cuda/__ptx/instructions/red_async.h +43 -0
  403. cuda/cccl/headers/include/cuda/__ptx/instructions/shfl_sync.h +244 -0
  404. cuda/cccl/headers/include/cuda/__ptx/instructions/shl.h +41 -0
  405. cuda/cccl/headers/include/cuda/__ptx/instructions/shr.h +41 -0
  406. cuda/cccl/headers/include/cuda/__ptx/instructions/st.h +41 -0
  407. cuda/cccl/headers/include/cuda/__ptx/instructions/st_async.h +43 -0
  408. cuda/cccl/headers/include/cuda/__ptx/instructions/st_bulk.h +41 -0
  409. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_alloc.h +41 -0
  410. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_commit.h +41 -0
  411. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_cp.h +41 -0
  412. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_fence.h +41 -0
  413. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_ld.h +41 -0
  414. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_mma.h +41 -0
  415. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_mma_ws.h +41 -0
  416. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_shift.h +41 -0
  417. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_st.h +41 -0
  418. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_wait.h +41 -0
  419. cuda/cccl/headers/include/cuda/__ptx/instructions/tensormap_cp_fenceproxy.h +43 -0
  420. cuda/cccl/headers/include/cuda/__ptx/instructions/tensormap_replace.h +43 -0
  421. cuda/cccl/headers/include/cuda/__ptx/instructions/trap.h +41 -0
  422. cuda/cccl/headers/include/cuda/__ptx/pragmas/enable_smem_spilling.h +47 -0
  423. cuda/cccl/headers/include/cuda/__ptx/ptx_dot_variants.h +230 -0
  424. cuda/cccl/headers/include/cuda/__ptx/ptx_helper_functions.h +176 -0
  425. cuda/cccl/headers/include/cuda/__random/feistel_bijection.h +105 -0
  426. cuda/cccl/headers/include/cuda/__random/random_bijection.h +88 -0
  427. cuda/cccl/headers/include/cuda/__runtime/api_wrapper.h +62 -0
  428. cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +99 -0
  429. cuda/cccl/headers/include/cuda/__runtime/types.h +41 -0
  430. cuda/cccl/headers/include/cuda/__semaphore/counting_semaphore.h +53 -0
  431. cuda/cccl/headers/include/cuda/__stream/get_stream.h +109 -0
  432. cuda/cccl/headers/include/cuda/__stream/internal_streams.h +44 -0
  433. cuda/cccl/headers/include/cuda/__stream/stream.h +141 -0
  434. cuda/cccl/headers/include/cuda/__stream/stream_ref.h +303 -0
  435. cuda/cccl/headers/include/cuda/__type_traits/is_floating_point.h +47 -0
  436. cuda/cccl/headers/include/cuda/__type_traits/is_specialization_of.h +37 -0
  437. cuda/cccl/headers/include/cuda/__utility/__basic_any/access.h +88 -0
  438. cuda/cccl/headers/include/cuda/__utility/__basic_any/any_cast.h +83 -0
  439. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_base.h +148 -0
  440. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_from.h +96 -0
  441. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_fwd.h +128 -0
  442. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_ptr.h +304 -0
  443. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_ref.h +337 -0
  444. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_value.h +591 -0
  445. cuda/cccl/headers/include/cuda/__utility/__basic_any/conversions.h +169 -0
  446. cuda/cccl/headers/include/cuda/__utility/__basic_any/dynamic_any_cast.h +107 -0
  447. cuda/cccl/headers/include/cuda/__utility/__basic_any/interfaces.h +359 -0
  448. cuda/cccl/headers/include/cuda/__utility/__basic_any/iset.h +142 -0
  449. cuda/cccl/headers/include/cuda/__utility/__basic_any/overrides.h +64 -0
  450. cuda/cccl/headers/include/cuda/__utility/__basic_any/rtti.h +257 -0
  451. cuda/cccl/headers/include/cuda/__utility/__basic_any/semiregular.h +323 -0
  452. cuda/cccl/headers/include/cuda/__utility/__basic_any/storage.h +79 -0
  453. cuda/cccl/headers/include/cuda/__utility/__basic_any/tagged_ptr.h +58 -0
  454. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtcall.h +163 -0
  455. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_functions.h +184 -0
  456. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_ptrs.h +80 -0
  457. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_tables.h +155 -0
  458. cuda/cccl/headers/include/cuda/__utility/basic_any.h +507 -0
  459. cuda/cccl/headers/include/cuda/__utility/immovable.h +50 -0
  460. cuda/cccl/headers/include/cuda/__utility/in_range.h +65 -0
  461. cuda/cccl/headers/include/cuda/__utility/inherit.h +36 -0
  462. cuda/cccl/headers/include/cuda/__utility/no_init.h +29 -0
  463. cuda/cccl/headers/include/cuda/__utility/static_for.h +79 -0
  464. cuda/cccl/headers/include/cuda/__warp/lane_mask.h +326 -0
  465. cuda/cccl/headers/include/cuda/__warp/warp_match_all.h +65 -0
  466. cuda/cccl/headers/include/cuda/__warp/warp_shuffle.h +251 -0
  467. cuda/cccl/headers/include/cuda/access_property +26 -0
  468. cuda/cccl/headers/include/cuda/algorithm +27 -0
  469. cuda/cccl/headers/include/cuda/annotated_ptr +29 -0
  470. cuda/cccl/headers/include/cuda/atomic +27 -0
  471. cuda/cccl/headers/include/cuda/barrier +293 -0
  472. cuda/cccl/headers/include/cuda/bit +29 -0
  473. cuda/cccl/headers/include/cuda/cmath +37 -0
  474. cuda/cccl/headers/include/cuda/devices +33 -0
  475. cuda/cccl/headers/include/cuda/discard_memory +32 -0
  476. cuda/cccl/headers/include/cuda/functional +32 -0
  477. cuda/cccl/headers/include/cuda/iterator +39 -0
  478. cuda/cccl/headers/include/cuda/latch +27 -0
  479. cuda/cccl/headers/include/cuda/mdspan +28 -0
  480. cuda/cccl/headers/include/cuda/memory +36 -0
  481. cuda/cccl/headers/include/cuda/memory_resource +40 -0
  482. cuda/cccl/headers/include/cuda/numeric +31 -0
  483. cuda/cccl/headers/include/cuda/pipeline +580 -0
  484. cuda/cccl/headers/include/cuda/ptx +129 -0
  485. cuda/cccl/headers/include/cuda/semaphore +31 -0
  486. cuda/cccl/headers/include/cuda/std/__algorithm/adjacent_find.h +59 -0
  487. cuda/cccl/headers/include/cuda/std/__algorithm/all_of.h +45 -0
  488. cuda/cccl/headers/include/cuda/std/__algorithm/any_of.h +45 -0
  489. cuda/cccl/headers/include/cuda/std/__algorithm/binary_search.h +53 -0
  490. cuda/cccl/headers/include/cuda/std/__algorithm/clamp.h +48 -0
  491. cuda/cccl/headers/include/cuda/std/__algorithm/comp.h +58 -0
  492. cuda/cccl/headers/include/cuda/std/__algorithm/comp_ref_type.h +85 -0
  493. cuda/cccl/headers/include/cuda/std/__algorithm/copy.h +142 -0
  494. cuda/cccl/headers/include/cuda/std/__algorithm/copy_backward.h +80 -0
  495. cuda/cccl/headers/include/cuda/std/__algorithm/copy_if.h +47 -0
  496. cuda/cccl/headers/include/cuda/std/__algorithm/copy_n.h +73 -0
  497. cuda/cccl/headers/include/cuda/std/__algorithm/count.h +49 -0
  498. cuda/cccl/headers/include/cuda/std/__algorithm/count_if.h +49 -0
  499. cuda/cccl/headers/include/cuda/std/__algorithm/equal.h +128 -0
  500. cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +101 -0
  501. cuda/cccl/headers/include/cuda/std/__algorithm/fill.h +58 -0
  502. cuda/cccl/headers/include/cuda/std/__algorithm/fill_n.h +51 -0
  503. cuda/cccl/headers/include/cuda/std/__algorithm/find.h +62 -0
  504. cuda/cccl/headers/include/cuda/std/__algorithm/find_end.h +225 -0
  505. cuda/cccl/headers/include/cuda/std/__algorithm/find_first_of.h +73 -0
  506. cuda/cccl/headers/include/cuda/std/__algorithm/find_if.h +46 -0
  507. cuda/cccl/headers/include/cuda/std/__algorithm/find_if_not.h +46 -0
  508. cuda/cccl/headers/include/cuda/std/__algorithm/for_each.h +42 -0
  509. cuda/cccl/headers/include/cuda/std/__algorithm/for_each_n.h +48 -0
  510. cuda/cccl/headers/include/cuda/std/__algorithm/generate.h +41 -0
  511. cuda/cccl/headers/include/cuda/std/__algorithm/generate_n.h +46 -0
  512. cuda/cccl/headers/include/cuda/std/__algorithm/half_positive.h +49 -0
  513. cuda/cccl/headers/include/cuda/std/__algorithm/in_fun_result.h +55 -0
  514. cuda/cccl/headers/include/cuda/std/__algorithm/includes.h +90 -0
  515. cuda/cccl/headers/include/cuda/std/__algorithm/is_heap.h +50 -0
  516. cuda/cccl/headers/include/cuda/std/__algorithm/is_heap_until.h +83 -0
  517. cuda/cccl/headers/include/cuda/std/__algorithm/is_partitioned.h +57 -0
  518. cuda/cccl/headers/include/cuda/std/__algorithm/is_permutation.h +252 -0
  519. cuda/cccl/headers/include/cuda/std/__algorithm/is_sorted.h +49 -0
  520. cuda/cccl/headers/include/cuda/std/__algorithm/is_sorted_until.h +68 -0
  521. cuda/cccl/headers/include/cuda/std/__algorithm/iter_swap.h +82 -0
  522. cuda/cccl/headers/include/cuda/std/__algorithm/iterator_operations.h +185 -0
  523. cuda/cccl/headers/include/cuda/std/__algorithm/lexicographical_compare.h +68 -0
  524. cuda/cccl/headers/include/cuda/std/__algorithm/lower_bound.h +82 -0
  525. cuda/cccl/headers/include/cuda/std/__algorithm/make_heap.h +70 -0
  526. cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +88 -0
  527. cuda/cccl/headers/include/cuda/std/__algorithm/max.h +62 -0
  528. cuda/cccl/headers/include/cuda/std/__algorithm/max_element.h +67 -0
  529. cuda/cccl/headers/include/cuda/std/__algorithm/merge.h +89 -0
  530. cuda/cccl/headers/include/cuda/std/__algorithm/min.h +62 -0
  531. cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +87 -0
  532. cuda/cccl/headers/include/cuda/std/__algorithm/minmax.h +66 -0
  533. cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +139 -0
  534. cuda/cccl/headers/include/cuda/std/__algorithm/mismatch.h +83 -0
  535. cuda/cccl/headers/include/cuda/std/__algorithm/move.h +86 -0
  536. cuda/cccl/headers/include/cuda/std/__algorithm/move_backward.h +84 -0
  537. cuda/cccl/headers/include/cuda/std/__algorithm/next_permutation.h +88 -0
  538. cuda/cccl/headers/include/cuda/std/__algorithm/none_of.h +45 -0
  539. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort.h +102 -0
  540. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort_copy.h +122 -0
  541. cuda/cccl/headers/include/cuda/std/__algorithm/partition.h +120 -0
  542. cuda/cccl/headers/include/cuda/std/__algorithm/partition_copy.h +59 -0
  543. cuda/cccl/headers/include/cuda/std/__algorithm/partition_point.h +61 -0
  544. cuda/cccl/headers/include/cuda/std/__algorithm/pop_heap.h +93 -0
  545. cuda/cccl/headers/include/cuda/std/__algorithm/prev_permutation.h +88 -0
  546. cuda/cccl/headers/include/cuda/std/__algorithm/push_heap.h +100 -0
  547. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_for_each.h +84 -0
  548. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_for_each_n.h +68 -0
  549. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_iterator_concept.h +65 -0
  550. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_min.h +98 -0
  551. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_min_element.h +68 -0
  552. cuda/cccl/headers/include/cuda/std/__algorithm/remove.h +55 -0
  553. cuda/cccl/headers/include/cuda/std/__algorithm/remove_copy.h +47 -0
  554. cuda/cccl/headers/include/cuda/std/__algorithm/remove_copy_if.h +47 -0
  555. cuda/cccl/headers/include/cuda/std/__algorithm/remove_if.h +56 -0
  556. cuda/cccl/headers/include/cuda/std/__algorithm/replace.h +45 -0
  557. cuda/cccl/headers/include/cuda/std/__algorithm/replace_copy.h +54 -0
  558. cuda/cccl/headers/include/cuda/std/__algorithm/replace_copy_if.h +50 -0
  559. cuda/cccl/headers/include/cuda/std/__algorithm/replace_if.h +45 -0
  560. cuda/cccl/headers/include/cuda/std/__algorithm/reverse.h +81 -0
  561. cuda/cccl/headers/include/cuda/std/__algorithm/reverse_copy.h +43 -0
  562. cuda/cccl/headers/include/cuda/std/__algorithm/rotate.h +261 -0
  563. cuda/cccl/headers/include/cuda/std/__algorithm/rotate_copy.h +40 -0
  564. cuda/cccl/headers/include/cuda/std/__algorithm/search.h +185 -0
  565. cuda/cccl/headers/include/cuda/std/__algorithm/search_n.h +163 -0
  566. cuda/cccl/headers/include/cuda/std/__algorithm/set_difference.h +95 -0
  567. cuda/cccl/headers/include/cuda/std/__algorithm/set_intersection.h +122 -0
  568. cuda/cccl/headers/include/cuda/std/__algorithm/set_symmetric_difference.h +134 -0
  569. cuda/cccl/headers/include/cuda/std/__algorithm/set_union.h +128 -0
  570. cuda/cccl/headers/include/cuda/std/__algorithm/shift_left.h +84 -0
  571. cuda/cccl/headers/include/cuda/std/__algorithm/shift_right.h +144 -0
  572. cuda/cccl/headers/include/cuda/std/__algorithm/sift_down.h +139 -0
  573. cuda/cccl/headers/include/cuda/std/__algorithm/sort_heap.h +70 -0
  574. cuda/cccl/headers/include/cuda/std/__algorithm/swap_ranges.h +78 -0
  575. cuda/cccl/headers/include/cuda/std/__algorithm/transform.h +59 -0
  576. cuda/cccl/headers/include/cuda/std/__algorithm/unique.h +76 -0
  577. cuda/cccl/headers/include/cuda/std/__algorithm/unique_copy.h +153 -0
  578. cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_iter.h +95 -0
  579. cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_range.h +126 -0
  580. cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +83 -0
  581. cuda/cccl/headers/include/cuda/std/__algorithm_ +26 -0
  582. cuda/cccl/headers/include/cuda/std/__atomic/api/common.h +192 -0
  583. cuda/cccl/headers/include/cuda/std/__atomic/api/owned.h +136 -0
  584. cuda/cccl/headers/include/cuda/std/__atomic/api/reference.h +118 -0
  585. cuda/cccl/headers/include/cuda/std/__atomic/functions/common.h +58 -0
  586. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_local.h +208 -0
  587. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_derived.h +458 -0
  588. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated.h +4437 -0
  589. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h +184 -0
  590. cuda/cccl/headers/include/cuda/std/__atomic/functions/host.h +242 -0
  591. cuda/cccl/headers/include/cuda/std/__atomic/functions.h +33 -0
  592. cuda/cccl/headers/include/cuda/std/__atomic/order.h +159 -0
  593. cuda/cccl/headers/include/cuda/std/__atomic/platform/msvc_to_builtins.h +654 -0
  594. cuda/cccl/headers/include/cuda/std/__atomic/platform.h +93 -0
  595. cuda/cccl/headers/include/cuda/std/__atomic/scopes.h +105 -0
  596. cuda/cccl/headers/include/cuda/std/__atomic/types/base.h +249 -0
  597. cuda/cccl/headers/include/cuda/std/__atomic/types/common.h +104 -0
  598. cuda/cccl/headers/include/cuda/std/__atomic/types/locked.h +225 -0
  599. cuda/cccl/headers/include/cuda/std/__atomic/types/reference.h +72 -0
  600. cuda/cccl/headers/include/cuda/std/__atomic/types/small.h +238 -0
  601. cuda/cccl/headers/include/cuda/std/__atomic/types.h +51 -0
  602. cuda/cccl/headers/include/cuda/std/__atomic/wait/notify_wait.h +95 -0
  603. cuda/cccl/headers/include/cuda/std/__atomic/wait/polling.h +65 -0
  604. cuda/cccl/headers/include/cuda/std/__barrier/barrier.h +227 -0
  605. cuda/cccl/headers/include/cuda/std/__barrier/empty_completion.h +37 -0
  606. cuda/cccl/headers/include/cuda/std/__barrier/poll_tester.h +82 -0
  607. cuda/cccl/headers/include/cuda/std/__bit/bit_cast.h +76 -0
  608. cuda/cccl/headers/include/cuda/std/__bit/byteswap.h +209 -0
  609. cuda/cccl/headers/include/cuda/std/__bit/countl.h +174 -0
  610. cuda/cccl/headers/include/cuda/std/__bit/countr.h +185 -0
  611. cuda/cccl/headers/include/cuda/std/__bit/endian.h +39 -0
  612. cuda/cccl/headers/include/cuda/std/__bit/has_single_bit.h +43 -0
  613. cuda/cccl/headers/include/cuda/std/__bit/integral.h +126 -0
  614. cuda/cccl/headers/include/cuda/std/__bit/popcount.h +154 -0
  615. cuda/cccl/headers/include/cuda/std/__bit/reference.h +1272 -0
  616. cuda/cccl/headers/include/cuda/std/__bit/rotate.h +94 -0
  617. cuda/cccl/headers/include/cuda/std/__cccl/algorithm_wrapper.h +36 -0
  618. cuda/cccl/headers/include/cuda/std/__cccl/architecture.h +78 -0
  619. cuda/cccl/headers/include/cuda/std/__cccl/assert.h +161 -0
  620. cuda/cccl/headers/include/cuda/std/__cccl/attributes.h +206 -0
  621. cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +645 -0
  622. cuda/cccl/headers/include/cuda/std/__cccl/compiler.h +217 -0
  623. cuda/cccl/headers/include/cuda/std/__cccl/cuda_capabilities.h +51 -0
  624. cuda/cccl/headers/include/cuda/std/__cccl/cuda_toolkit.h +56 -0
  625. cuda/cccl/headers/include/cuda/std/__cccl/deprecated.h +88 -0
  626. cuda/cccl/headers/include/cuda/std/__cccl/diagnostic.h +131 -0
  627. cuda/cccl/headers/include/cuda/std/__cccl/dialect.h +130 -0
  628. cuda/cccl/headers/include/cuda/std/__cccl/epilogue.h +354 -0
  629. cuda/cccl/headers/include/cuda/std/__cccl/exceptions.h +36 -0
  630. cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +74 -0
  631. cuda/cccl/headers/include/cuda/std/__cccl/extended_data_types.h +160 -0
  632. cuda/cccl/headers/include/cuda/std/__cccl/host_std_lib.h +52 -0
  633. cuda/cccl/headers/include/cuda/std/__cccl/is_non_narrowing_convertible.h +71 -0
  634. cuda/cccl/headers/include/cuda/std/__cccl/memory_wrapper.h +36 -0
  635. cuda/cccl/headers/include/cuda/std/__cccl/numeric_wrapper.h +36 -0
  636. cuda/cccl/headers/include/cuda/std/__cccl/os.h +54 -0
  637. cuda/cccl/headers/include/cuda/std/__cccl/preprocessor.h +1286 -0
  638. cuda/cccl/headers/include/cuda/std/__cccl/prologue.h +289 -0
  639. cuda/cccl/headers/include/cuda/std/__cccl/ptx_isa.h +253 -0
  640. cuda/cccl/headers/include/cuda/std/__cccl/rtti.h +72 -0
  641. cuda/cccl/headers/include/cuda/std/__cccl/sequence_access.h +87 -0
  642. cuda/cccl/headers/include/cuda/std/__cccl/system_header.h +38 -0
  643. cuda/cccl/headers/include/cuda/std/__cccl/unreachable.h +31 -0
  644. cuda/cccl/headers/include/cuda/std/__cccl/version.h +26 -0
  645. cuda/cccl/headers/include/cuda/std/__cccl/visibility.h +171 -0
  646. cuda/cccl/headers/include/cuda/std/__charconv/chars_format.h +81 -0
  647. cuda/cccl/headers/include/cuda/std/__charconv/from_chars.h +154 -0
  648. cuda/cccl/headers/include/cuda/std/__charconv/from_chars_result.h +56 -0
  649. cuda/cccl/headers/include/cuda/std/__charconv/to_chars.h +148 -0
  650. cuda/cccl/headers/include/cuda/std/__charconv/to_chars_result.h +56 -0
  651. cuda/cccl/headers/include/cuda/std/__charconv_ +31 -0
  652. cuda/cccl/headers/include/cuda/std/__chrono/calendar.h +52 -0
  653. cuda/cccl/headers/include/cuda/std/__chrono/day.h +160 -0
  654. cuda/cccl/headers/include/cuda/std/__chrono/duration.h +499 -0
  655. cuda/cccl/headers/include/cuda/std/__chrono/file_clock.h +53 -0
  656. cuda/cccl/headers/include/cuda/std/__chrono/high_resolution_clock.h +44 -0
  657. cuda/cccl/headers/include/cuda/std/__chrono/month.h +185 -0
  658. cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +58 -0
  659. cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +78 -0
  660. cuda/cccl/headers/include/cuda/std/__chrono/time_point.h +255 -0
  661. cuda/cccl/headers/include/cuda/std/__chrono/year.h +184 -0
  662. cuda/cccl/headers/include/cuda/std/__cmath/abs.h +127 -0
  663. cuda/cccl/headers/include/cuda/std/__cmath/copysign.h +88 -0
  664. cuda/cccl/headers/include/cuda/std/__cmath/error_functions.h +204 -0
  665. cuda/cccl/headers/include/cuda/std/__cmath/exponential_functions.h +783 -0
  666. cuda/cccl/headers/include/cuda/std/__cmath/fdim.h +122 -0
  667. cuda/cccl/headers/include/cuda/std/__cmath/fma.h +129 -0
  668. cuda/cccl/headers/include/cuda/std/__cmath/fpclassify.h +230 -0
  669. cuda/cccl/headers/include/cuda/std/__cmath/gamma.h +204 -0
  670. cuda/cccl/headers/include/cuda/std/__cmath/hyperbolic_functions.h +285 -0
  671. cuda/cccl/headers/include/cuda/std/__cmath/hypot.h +220 -0
  672. cuda/cccl/headers/include/cuda/std/__cmath/inverse_hyperbolic_functions.h +285 -0
  673. cuda/cccl/headers/include/cuda/std/__cmath/inverse_trigonometric_functions.h +370 -0
  674. cuda/cccl/headers/include/cuda/std/__cmath/isfinite.h +166 -0
  675. cuda/cccl/headers/include/cuda/std/__cmath/isinf.h +204 -0
  676. cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +185 -0
  677. cuda/cccl/headers/include/cuda/std/__cmath/isnormal.h +138 -0
  678. cuda/cccl/headers/include/cuda/std/__cmath/lerp.h +101 -0
  679. cuda/cccl/headers/include/cuda/std/__cmath/logarithms.h +533 -0
  680. cuda/cccl/headers/include/cuda/std/__cmath/min_max.h +287 -0
  681. cuda/cccl/headers/include/cuda/std/__cmath/modulo.h +208 -0
  682. cuda/cccl/headers/include/cuda/std/__cmath/nan.h +54 -0
  683. cuda/cccl/headers/include/cuda/std/__cmath/remainder.h +210 -0
  684. cuda/cccl/headers/include/cuda/std/__cmath/roots.h +198 -0
  685. cuda/cccl/headers/include/cuda/std/__cmath/rounding_functions.h +983 -0
  686. cuda/cccl/headers/include/cuda/std/__cmath/signbit.h +56 -0
  687. cuda/cccl/headers/include/cuda/std/__cmath/traits.h +242 -0
  688. cuda/cccl/headers/include/cuda/std/__cmath/trigonometric_functions.h +327 -0
  689. cuda/cccl/headers/include/cuda/std/__complex/arg.h +84 -0
  690. cuda/cccl/headers/include/cuda/std/__complex/complex.h +669 -0
  691. cuda/cccl/headers/include/cuda/std/__complex/exponential_functions.h +411 -0
  692. cuda/cccl/headers/include/cuda/std/__complex/hyperbolic_functions.h +117 -0
  693. cuda/cccl/headers/include/cuda/std/__complex/inverse_hyperbolic_functions.h +216 -0
  694. cuda/cccl/headers/include/cuda/std/__complex/inverse_trigonometric_functions.h +131 -0
  695. cuda/cccl/headers/include/cuda/std/__complex/literals.h +86 -0
  696. cuda/cccl/headers/include/cuda/std/__complex/logarithms.h +303 -0
  697. cuda/cccl/headers/include/cuda/std/__complex/math.h +159 -0
  698. cuda/cccl/headers/include/cuda/std/__complex/nvbf16.h +323 -0
  699. cuda/cccl/headers/include/cuda/std/__complex/nvfp16.h +322 -0
  700. cuda/cccl/headers/include/cuda/std/__complex/roots.h +214 -0
  701. cuda/cccl/headers/include/cuda/std/__complex/trigonometric_functions.h +61 -0
  702. cuda/cccl/headers/include/cuda/std/__complex/tuple.h +107 -0
  703. cuda/cccl/headers/include/cuda/std/__complex/vector_support.h +130 -0
  704. cuda/cccl/headers/include/cuda/std/__concepts/arithmetic.h +56 -0
  705. cuda/cccl/headers/include/cuda/std/__concepts/assignable.h +64 -0
  706. cuda/cccl/headers/include/cuda/std/__concepts/boolean_testable.h +63 -0
  707. cuda/cccl/headers/include/cuda/std/__concepts/class_or_enum.h +45 -0
  708. cuda/cccl/headers/include/cuda/std/__concepts/common_reference_with.h +69 -0
  709. cuda/cccl/headers/include/cuda/std/__concepts/common_with.h +82 -0
  710. cuda/cccl/headers/include/cuda/std/__concepts/concept_macros.h +367 -0
  711. cuda/cccl/headers/include/cuda/std/__concepts/constructible.h +174 -0
  712. cuda/cccl/headers/include/cuda/std/__concepts/convertible_to.h +70 -0
  713. cuda/cccl/headers/include/cuda/std/__concepts/copyable.h +60 -0
  714. cuda/cccl/headers/include/cuda/std/__concepts/derived_from.h +56 -0
  715. cuda/cccl/headers/include/cuda/std/__concepts/destructible.h +76 -0
  716. cuda/cccl/headers/include/cuda/std/__concepts/different_from.h +38 -0
  717. cuda/cccl/headers/include/cuda/std/__concepts/equality_comparable.h +98 -0
  718. cuda/cccl/headers/include/cuda/std/__concepts/invocable.h +80 -0
  719. cuda/cccl/headers/include/cuda/std/__concepts/movable.h +58 -0
  720. cuda/cccl/headers/include/cuda/std/__concepts/predicate.h +54 -0
  721. cuda/cccl/headers/include/cuda/std/__concepts/regular.h +54 -0
  722. cuda/cccl/headers/include/cuda/std/__concepts/relation.h +77 -0
  723. cuda/cccl/headers/include/cuda/std/__concepts/same_as.h +39 -0
  724. cuda/cccl/headers/include/cuda/std/__concepts/semiregular.h +54 -0
  725. cuda/cccl/headers/include/cuda/std/__concepts/swappable.h +206 -0
  726. cuda/cccl/headers/include/cuda/std/__concepts/totally_ordered.h +101 -0
  727. cuda/cccl/headers/include/cuda/std/__cstddef/byte.h +113 -0
  728. cuda/cccl/headers/include/cuda/std/__cstddef/types.h +52 -0
  729. cuda/cccl/headers/include/cuda/std/__cstdlib/abs.h +57 -0
  730. cuda/cccl/headers/include/cuda/std/__cstdlib/aligned_alloc.h +66 -0
  731. cuda/cccl/headers/include/cuda/std/__cstdlib/div.h +96 -0
  732. cuda/cccl/headers/include/cuda/std/__cstdlib/malloc.h +70 -0
  733. cuda/cccl/headers/include/cuda/std/__cstring/memcpy.h +61 -0
  734. cuda/cccl/headers/include/cuda/std/__cstring/memset.h +46 -0
  735. cuda/cccl/headers/include/cuda/std/__exception/cuda_error.h +118 -0
  736. cuda/cccl/headers/include/cuda/std/__exception/exception_macros.h +93 -0
  737. cuda/cccl/headers/include/cuda/std/__exception/terminate.h +73 -0
  738. cuda/cccl/headers/include/cuda/std/__exception/throw_error.h +120 -0
  739. cuda/cccl/headers/include/cuda/std/__execution/env.h +455 -0
  740. cuda/cccl/headers/include/cuda/std/__execution/policy.h +88 -0
  741. cuda/cccl/headers/include/cuda/std/__expected/bad_expected_access.h +127 -0
  742. cuda/cccl/headers/include/cuda/std/__expected/expected.h +1941 -0
  743. cuda/cccl/headers/include/cuda/std/__expected/expected_base.h +1050 -0
  744. cuda/cccl/headers/include/cuda/std/__expected/unexpect.h +37 -0
  745. cuda/cccl/headers/include/cuda/std/__expected/unexpected.h +164 -0
  746. cuda/cccl/headers/include/cuda/std/__floating_point/arithmetic.h +56 -0
  747. cuda/cccl/headers/include/cuda/std/__floating_point/cast.h +812 -0
  748. cuda/cccl/headers/include/cuda/std/__floating_point/cccl_fp.h +125 -0
  749. cuda/cccl/headers/include/cuda/std/__floating_point/common_type.h +48 -0
  750. cuda/cccl/headers/include/cuda/std/__floating_point/constants.h +376 -0
  751. cuda/cccl/headers/include/cuda/std/__floating_point/conversion_rank_order.h +124 -0
  752. cuda/cccl/headers/include/cuda/std/__floating_point/cuda_fp_types.h +116 -0
  753. cuda/cccl/headers/include/cuda/std/__floating_point/decompose.h +69 -0
  754. cuda/cccl/headers/include/cuda/std/__floating_point/format.h +162 -0
  755. cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +40 -0
  756. cuda/cccl/headers/include/cuda/std/__floating_point/mask.h +78 -0
  757. cuda/cccl/headers/include/cuda/std/__floating_point/native_type.h +81 -0
  758. cuda/cccl/headers/include/cuda/std/__floating_point/overflow_handler.h +139 -0
  759. cuda/cccl/headers/include/cuda/std/__floating_point/properties.h +229 -0
  760. cuda/cccl/headers/include/cuda/std/__floating_point/storage.h +248 -0
  761. cuda/cccl/headers/include/cuda/std/__floating_point/traits.h +172 -0
  762. cuda/cccl/headers/include/cuda/std/__format/buffer.h +48 -0
  763. cuda/cccl/headers/include/cuda/std/__format/concepts.h +69 -0
  764. cuda/cccl/headers/include/cuda/std/__format/format_arg.h +282 -0
  765. cuda/cccl/headers/include/cuda/std/__format/format_arg_store.h +279 -0
  766. cuda/cccl/headers/include/cuda/std/__format/format_args.h +122 -0
  767. cuda/cccl/headers/include/cuda/std/__format/format_context.h +92 -0
  768. cuda/cccl/headers/include/cuda/std/__format/format_error.h +76 -0
  769. cuda/cccl/headers/include/cuda/std/__format/format_integral.h +237 -0
  770. cuda/cccl/headers/include/cuda/std/__format/format_parse_context.h +124 -0
  771. cuda/cccl/headers/include/cuda/std/__format/format_spec_parser.h +1230 -0
  772. cuda/cccl/headers/include/cuda/std/__format/formatter.h +59 -0
  773. cuda/cccl/headers/include/cuda/std/__format/formatters/bool.h +101 -0
  774. cuda/cccl/headers/include/cuda/std/__format/formatters/char.h +124 -0
  775. cuda/cccl/headers/include/cuda/std/__format/formatters/fp.h +101 -0
  776. cuda/cccl/headers/include/cuda/std/__format/formatters/int.h +174 -0
  777. cuda/cccl/headers/include/cuda/std/__format/formatters/ptr.h +104 -0
  778. cuda/cccl/headers/include/cuda/std/__format/formatters/str.h +178 -0
  779. cuda/cccl/headers/include/cuda/std/__format/output_utils.h +272 -0
  780. cuda/cccl/headers/include/cuda/std/__format/parse_arg_id.h +138 -0
  781. cuda/cccl/headers/include/cuda/std/__format_ +45 -0
  782. cuda/cccl/headers/include/cuda/std/__functional/binary_function.h +63 -0
  783. cuda/cccl/headers/include/cuda/std/__functional/binary_negate.h +65 -0
  784. cuda/cccl/headers/include/cuda/std/__functional/bind.h +334 -0
  785. cuda/cccl/headers/include/cuda/std/__functional/bind_back.h +80 -0
  786. cuda/cccl/headers/include/cuda/std/__functional/bind_front.h +73 -0
  787. cuda/cccl/headers/include/cuda/std/__functional/binder1st.h +74 -0
  788. cuda/cccl/headers/include/cuda/std/__functional/binder2nd.h +74 -0
  789. cuda/cccl/headers/include/cuda/std/__functional/compose.h +68 -0
  790. cuda/cccl/headers/include/cuda/std/__functional/default_searcher.h +75 -0
  791. cuda/cccl/headers/include/cuda/std/__functional/function.h +1271 -0
  792. cuda/cccl/headers/include/cuda/std/__functional/hash.h +649 -0
  793. cuda/cccl/headers/include/cuda/std/__functional/identity.h +57 -0
  794. cuda/cccl/headers/include/cuda/std/__functional/invoke.h +296 -0
  795. cuda/cccl/headers/include/cuda/std/__functional/is_transparent.h +41 -0
  796. cuda/cccl/headers/include/cuda/std/__functional/mem_fn.h +66 -0
  797. cuda/cccl/headers/include/cuda/std/__functional/mem_fun_ref.h +211 -0
  798. cuda/cccl/headers/include/cuda/std/__functional/not_fn.h +120 -0
  799. cuda/cccl/headers/include/cuda/std/__functional/operations.h +534 -0
  800. cuda/cccl/headers/include/cuda/std/__functional/perfect_forward.h +128 -0
  801. cuda/cccl/headers/include/cuda/std/__functional/pointer_to_binary_function.h +64 -0
  802. cuda/cccl/headers/include/cuda/std/__functional/pointer_to_unary_function.h +63 -0
  803. cuda/cccl/headers/include/cuda/std/__functional/ranges_operations.h +113 -0
  804. cuda/cccl/headers/include/cuda/std/__functional/reference_wrapper.h +113 -0
  805. cuda/cccl/headers/include/cuda/std/__functional/unary_function.h +62 -0
  806. cuda/cccl/headers/include/cuda/std/__functional/unary_negate.h +65 -0
  807. cuda/cccl/headers/include/cuda/std/__functional/unwrap_ref.h +56 -0
  808. cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +262 -0
  809. cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +53 -0
  810. cuda/cccl/headers/include/cuda/std/__fwd/array.h +42 -0
  811. cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +74 -0
  812. cuda/cccl/headers/include/cuda/std/__fwd/complex.h +75 -0
  813. cuda/cccl/headers/include/cuda/std/__fwd/expected.h +46 -0
  814. cuda/cccl/headers/include/cuda/std/__fwd/format.h +84 -0
  815. cuda/cccl/headers/include/cuda/std/__fwd/fp.h +37 -0
  816. cuda/cccl/headers/include/cuda/std/__fwd/get.h +122 -0
  817. cuda/cccl/headers/include/cuda/std/__fwd/hash.h +34 -0
  818. cuda/cccl/headers/include/cuda/std/__fwd/ios.h +123 -0
  819. cuda/cccl/headers/include/cuda/std/__fwd/iterator.h +43 -0
  820. cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +122 -0
  821. cuda/cccl/headers/include/cuda/std/__fwd/memory_resource.h +37 -0
  822. cuda/cccl/headers/include/cuda/std/__fwd/optional.h +39 -0
  823. cuda/cccl/headers/include/cuda/std/__fwd/pair.h +47 -0
  824. cuda/cccl/headers/include/cuda/std/__fwd/reference_wrapper.h +39 -0
  825. cuda/cccl/headers/include/cuda/std/__fwd/span.h +45 -0
  826. cuda/cccl/headers/include/cuda/std/__fwd/string.h +112 -0
  827. cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +91 -0
  828. cuda/cccl/headers/include/cuda/std/__fwd/subrange.h +55 -0
  829. cuda/cccl/headers/include/cuda/std/__fwd/tuple.h +37 -0
  830. cuda/cccl/headers/include/cuda/std/__fwd/unexpected.h +40 -0
  831. cuda/cccl/headers/include/cuda/std/__fwd/variant.h +51 -0
  832. cuda/cccl/headers/include/cuda/std/__internal/cpp_dialect.h +44 -0
  833. cuda/cccl/headers/include/cuda/std/__internal/features.h +86 -0
  834. cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +143 -0
  835. cuda/cccl/headers/include/cuda/std/__iterator/access.h +128 -0
  836. cuda/cccl/headers/include/cuda/std/__iterator/advance.h +228 -0
  837. cuda/cccl/headers/include/cuda/std/__iterator/back_insert_iterator.h +163 -0
  838. cuda/cccl/headers/include/cuda/std/__iterator/bounded_iter.h +253 -0
  839. cuda/cccl/headers/include/cuda/std/__iterator/concepts.h +645 -0
  840. cuda/cccl/headers/include/cuda/std/__iterator/counted_iterator.h +464 -0
  841. cuda/cccl/headers/include/cuda/std/__iterator/data.h +61 -0
  842. cuda/cccl/headers/include/cuda/std/__iterator/default_sentinel.h +36 -0
  843. cuda/cccl/headers/include/cuda/std/__iterator/distance.h +126 -0
  844. cuda/cccl/headers/include/cuda/std/__iterator/empty.h +53 -0
  845. cuda/cccl/headers/include/cuda/std/__iterator/erase_if_container.h +53 -0
  846. cuda/cccl/headers/include/cuda/std/__iterator/front_insert_iterator.h +99 -0
  847. cuda/cccl/headers/include/cuda/std/__iterator/incrementable_traits.h +143 -0
  848. cuda/cccl/headers/include/cuda/std/__iterator/indirectly_comparable.h +55 -0
  849. cuda/cccl/headers/include/cuda/std/__iterator/insert_iterator.h +107 -0
  850. cuda/cccl/headers/include/cuda/std/__iterator/istream_iterator.h +146 -0
  851. cuda/cccl/headers/include/cuda/std/__iterator/istreambuf_iterator.h +161 -0
  852. cuda/cccl/headers/include/cuda/std/__iterator/iter_move.h +161 -0
  853. cuda/cccl/headers/include/cuda/std/__iterator/iter_swap.h +181 -0
  854. cuda/cccl/headers/include/cuda/std/__iterator/iterator.h +44 -0
  855. cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +847 -0
  856. cuda/cccl/headers/include/cuda/std/__iterator/mergeable.h +72 -0
  857. cuda/cccl/headers/include/cuda/std/__iterator/move_iterator.h +432 -0
  858. cuda/cccl/headers/include/cuda/std/__iterator/move_sentinel.h +73 -0
  859. cuda/cccl/headers/include/cuda/std/__iterator/next.h +101 -0
  860. cuda/cccl/headers/include/cuda/std/__iterator/ostream_iterator.h +95 -0
  861. cuda/cccl/headers/include/cuda/std/__iterator/ostreambuf_iterator.h +100 -0
  862. cuda/cccl/headers/include/cuda/std/__iterator/permutable.h +54 -0
  863. cuda/cccl/headers/include/cuda/std/__iterator/prev.h +90 -0
  864. cuda/cccl/headers/include/cuda/std/__iterator/projected.h +61 -0
  865. cuda/cccl/headers/include/cuda/std/__iterator/readable_traits.h +156 -0
  866. cuda/cccl/headers/include/cuda/std/__iterator/reverse_access.h +142 -0
  867. cuda/cccl/headers/include/cuda/std/__iterator/reverse_iterator.h +366 -0
  868. cuda/cccl/headers/include/cuda/std/__iterator/size.h +69 -0
  869. cuda/cccl/headers/include/cuda/std/__iterator/sortable.h +55 -0
  870. cuda/cccl/headers/include/cuda/std/__iterator/unreachable_sentinel.h +84 -0
  871. cuda/cccl/headers/include/cuda/std/__iterator/wrap_iter.h +245 -0
  872. cuda/cccl/headers/include/cuda/std/__latch/latch.h +88 -0
  873. cuda/cccl/headers/include/cuda/std/__limits/numeric_limits.h +603 -0
  874. cuda/cccl/headers/include/cuda/std/__limits/numeric_limits_ext.h +753 -0
  875. cuda/cccl/headers/include/cuda/std/__linalg/conj_if_needed.h +77 -0
  876. cuda/cccl/headers/include/cuda/std/__linalg/conjugate_transposed.h +52 -0
  877. cuda/cccl/headers/include/cuda/std/__linalg/conjugated.h +137 -0
  878. cuda/cccl/headers/include/cuda/std/__linalg/scaled.h +128 -0
  879. cuda/cccl/headers/include/cuda/std/__linalg/transposed.h +316 -0
  880. cuda/cccl/headers/include/cuda/std/__mdspan/aligned_accessor.h +97 -0
  881. cuda/cccl/headers/include/cuda/std/__mdspan/concepts.h +137 -0
  882. cuda/cccl/headers/include/cuda/std/__mdspan/default_accessor.h +73 -0
  883. cuda/cccl/headers/include/cuda/std/__mdspan/empty_base.h +352 -0
  884. cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +753 -0
  885. cuda/cccl/headers/include/cuda/std/__mdspan/layout_left.h +314 -0
  886. cuda/cccl/headers/include/cuda/std/__mdspan/layout_right.h +307 -0
  887. cuda/cccl/headers/include/cuda/std/__mdspan/layout_stride.h +603 -0
  888. cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +512 -0
  889. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_extents.h +193 -0
  890. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_helper.h +189 -0
  891. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_mapping.h +344 -0
  892. cuda/cccl/headers/include/cuda/std/__memory/addressof.h +67 -0
  893. cuda/cccl/headers/include/cuda/std/__memory/align.h +67 -0
  894. cuda/cccl/headers/include/cuda/std/__memory/allocate_at_least.h +81 -0
  895. cuda/cccl/headers/include/cuda/std/__memory/allocation_guard.h +100 -0
  896. cuda/cccl/headers/include/cuda/std/__memory/allocator.h +320 -0
  897. cuda/cccl/headers/include/cuda/std/__memory/allocator_arg_t.h +85 -0
  898. cuda/cccl/headers/include/cuda/std/__memory/allocator_destructor.h +59 -0
  899. cuda/cccl/headers/include/cuda/std/__memory/allocator_traits.h +526 -0
  900. cuda/cccl/headers/include/cuda/std/__memory/assume_aligned.h +60 -0
  901. cuda/cccl/headers/include/cuda/std/__memory/builtin_new_allocator.h +87 -0
  902. cuda/cccl/headers/include/cuda/std/__memory/compressed_pair.h +225 -0
  903. cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +242 -0
  904. cuda/cccl/headers/include/cuda/std/__memory/destruct_n.h +91 -0
  905. cuda/cccl/headers/include/cuda/std/__memory/is_sufficiently_aligned.h +51 -0
  906. cuda/cccl/headers/include/cuda/std/__memory/pointer_traits.h +246 -0
  907. cuda/cccl/headers/include/cuda/std/__memory/runtime_assume_aligned.h +62 -0
  908. cuda/cccl/headers/include/cuda/std/__memory/temporary_buffer.h +92 -0
  909. cuda/cccl/headers/include/cuda/std/__memory/uninitialized_algorithms.h +679 -0
  910. cuda/cccl/headers/include/cuda/std/__memory/unique_ptr.h +765 -0
  911. cuda/cccl/headers/include/cuda/std/__memory/uses_allocator.h +59 -0
  912. cuda/cccl/headers/include/cuda/std/__memory/voidify.h +41 -0
  913. cuda/cccl/headers/include/cuda/std/__memory_ +34 -0
  914. cuda/cccl/headers/include/cuda/std/__new/allocate.h +131 -0
  915. cuda/cccl/headers/include/cuda/std/__new/bad_alloc.h +57 -0
  916. cuda/cccl/headers/include/cuda/std/__new/device_new.h +30 -0
  917. cuda/cccl/headers/include/cuda/std/__new/launder.h +53 -0
  918. cuda/cccl/headers/include/cuda/std/__new_ +30 -0
  919. cuda/cccl/headers/include/cuda/std/__numeric/accumulate.h +56 -0
  920. cuda/cccl/headers/include/cuda/std/__numeric/adjacent_difference.h +72 -0
  921. cuda/cccl/headers/include/cuda/std/__numeric/exclusive_scan.h +66 -0
  922. cuda/cccl/headers/include/cuda/std/__numeric/gcd_lcm.h +78 -0
  923. cuda/cccl/headers/include/cuda/std/__numeric/inclusive_scan.h +73 -0
  924. cuda/cccl/headers/include/cuda/std/__numeric/inner_product.h +62 -0
  925. cuda/cccl/headers/include/cuda/std/__numeric/iota.h +42 -0
  926. cuda/cccl/headers/include/cuda/std/__numeric/midpoint.h +97 -0
  927. cuda/cccl/headers/include/cuda/std/__numeric/partial_sum.h +69 -0
  928. cuda/cccl/headers/include/cuda/std/__numeric/reduce.h +60 -0
  929. cuda/cccl/headers/include/cuda/std/__numeric/transform_exclusive_scan.h +51 -0
  930. cuda/cccl/headers/include/cuda/std/__numeric/transform_inclusive_scan.h +65 -0
  931. cuda/cccl/headers/include/cuda/std/__numeric/transform_reduce.h +72 -0
  932. cuda/cccl/headers/include/cuda/std/__optional/bad_optional_access.h +74 -0
  933. cuda/cccl/headers/include/cuda/std/__optional/hash.h +53 -0
  934. cuda/cccl/headers/include/cuda/std/__optional/make_optional.h +61 -0
  935. cuda/cccl/headers/include/cuda/std/__optional/nullopt.h +43 -0
  936. cuda/cccl/headers/include/cuda/std/__optional/optional.h +860 -0
  937. cuda/cccl/headers/include/cuda/std/__optional/optional_base.h +433 -0
  938. cuda/cccl/headers/include/cuda/std/__optional/optional_ref.h +324 -0
  939. cuda/cccl/headers/include/cuda/std/__random/generate_canonical.h +56 -0
  940. cuda/cccl/headers/include/cuda/std/__random/is_seed_sequence.h +39 -0
  941. cuda/cccl/headers/include/cuda/std/__random/is_valid.h +106 -0
  942. cuda/cccl/headers/include/cuda/std/__random/linear_congruential_engine.h +398 -0
  943. cuda/cccl/headers/include/cuda/std/__random/philox_engine.h +562 -0
  944. cuda/cccl/headers/include/cuda/std/__random/seed_seq.h +204 -0
  945. cuda/cccl/headers/include/cuda/std/__random/uniform_int_distribution.h +335 -0
  946. cuda/cccl/headers/include/cuda/std/__random/uniform_real_distribution.h +183 -0
  947. cuda/cccl/headers/include/cuda/std/__random_ +31 -0
  948. cuda/cccl/headers/include/cuda/std/__ranges/access.h +303 -0
  949. cuda/cccl/headers/include/cuda/std/__ranges/all.h +98 -0
  950. cuda/cccl/headers/include/cuda/std/__ranges/compressed_movable_box.h +892 -0
  951. cuda/cccl/headers/include/cuda/std/__ranges/concepts.h +302 -0
  952. cuda/cccl/headers/include/cuda/std/__ranges/counted.h +90 -0
  953. cuda/cccl/headers/include/cuda/std/__ranges/dangling.h +54 -0
  954. cuda/cccl/headers/include/cuda/std/__ranges/data.h +136 -0
  955. cuda/cccl/headers/include/cuda/std/__ranges/empty.h +109 -0
  956. cuda/cccl/headers/include/cuda/std/__ranges/empty_view.h +77 -0
  957. cuda/cccl/headers/include/cuda/std/__ranges/enable_borrowed_range.h +41 -0
  958. cuda/cccl/headers/include/cuda/std/__ranges/enable_view.h +78 -0
  959. cuda/cccl/headers/include/cuda/std/__ranges/from_range.h +36 -0
  960. cuda/cccl/headers/include/cuda/std/__ranges/iota_view.h +266 -0
  961. cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +408 -0
  962. cuda/cccl/headers/include/cuda/std/__ranges/owning_view.h +162 -0
  963. cuda/cccl/headers/include/cuda/std/__ranges/range_adaptor.h +110 -0
  964. cuda/cccl/headers/include/cuda/std/__ranges/rbegin.h +175 -0
  965. cuda/cccl/headers/include/cuda/std/__ranges/ref_view.h +121 -0
  966. cuda/cccl/headers/include/cuda/std/__ranges/rend.h +182 -0
  967. cuda/cccl/headers/include/cuda/std/__ranges/repeat_view.h +345 -0
  968. cuda/cccl/headers/include/cuda/std/__ranges/single_view.h +155 -0
  969. cuda/cccl/headers/include/cuda/std/__ranges/size.h +201 -0
  970. cuda/cccl/headers/include/cuda/std/__ranges/subrange.h +513 -0
  971. cuda/cccl/headers/include/cuda/std/__ranges/take_view.h +476 -0
  972. cuda/cccl/headers/include/cuda/std/__ranges/take_while_view.h +259 -0
  973. cuda/cccl/headers/include/cuda/std/__ranges/transform_view.h +522 -0
  974. cuda/cccl/headers/include/cuda/std/__ranges/unwrap_end.h +53 -0
  975. cuda/cccl/headers/include/cuda/std/__ranges/view_interface.h +183 -0
  976. cuda/cccl/headers/include/cuda/std/__ranges/views.h +38 -0
  977. cuda/cccl/headers/include/cuda/std/__semaphore/atomic_semaphore.h +234 -0
  978. cuda/cccl/headers/include/cuda/std/__semaphore/counting_semaphore.h +51 -0
  979. cuda/cccl/headers/include/cuda/std/__string/char_traits.h +191 -0
  980. cuda/cccl/headers/include/cuda/std/__string/constexpr_c_functions.h +581 -0
  981. cuda/cccl/headers/include/cuda/std/__string/helper_functions.h +296 -0
  982. cuda/cccl/headers/include/cuda/std/__string/string_view.h +244 -0
  983. cuda/cccl/headers/include/cuda/std/__string_ +29 -0
  984. cuda/cccl/headers/include/cuda/std/__system_error/errc.h +51 -0
  985. cuda/cccl/headers/include/cuda/std/__system_error_ +26 -0
  986. cuda/cccl/headers/include/cuda/std/__thread/threading_support.h +106 -0
  987. cuda/cccl/headers/include/cuda/std/__thread/threading_support_cuda.h +47 -0
  988. cuda/cccl/headers/include/cuda/std/__thread/threading_support_external.h +41 -0
  989. cuda/cccl/headers/include/cuda/std/__thread/threading_support_pthread.h +143 -0
  990. cuda/cccl/headers/include/cuda/std/__thread/threading_support_win32.h +87 -0
  991. cuda/cccl/headers/include/cuda/std/__tuple_dir/apply.h +82 -0
  992. cuda/cccl/headers/include/cuda/std/__tuple_dir/get.h +122 -0
  993. cuda/cccl/headers/include/cuda/std/__tuple_dir/ignore.h +51 -0
  994. cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +120 -0
  995. cuda/cccl/headers/include/cuda/std/__tuple_dir/sfinae_helpers.h +100 -0
  996. cuda/cccl/headers/include/cuda/std/__tuple_dir/structured_bindings.h +212 -0
  997. cuda/cccl/headers/include/cuda/std/__tuple_dir/tie.h +55 -0
  998. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple.h +457 -0
  999. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_cat.h +158 -0
  1000. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_constraints.h +286 -0
  1001. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_element.h +77 -0
  1002. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_indices.h +44 -0
  1003. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_leaf.h +452 -0
  1004. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +83 -0
  1005. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +68 -0
  1006. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_size.h +79 -0
  1007. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_types.h +35 -0
  1008. cuda/cccl/headers/include/cuda/std/__tuple_dir/vector_types.h +290 -0
  1009. cuda/cccl/headers/include/cuda/std/__type_traits/add_const.h +40 -0
  1010. cuda/cccl/headers/include/cuda/std/__type_traits/add_cv.h +40 -0
  1011. cuda/cccl/headers/include/cuda/std/__type_traits/add_lvalue_reference.h +62 -0
  1012. cuda/cccl/headers/include/cuda/std/__type_traits/add_pointer.h +65 -0
  1013. cuda/cccl/headers/include/cuda/std/__type_traits/add_rvalue_reference.h +62 -0
  1014. cuda/cccl/headers/include/cuda/std/__type_traits/add_volatile.h +40 -0
  1015. cuda/cccl/headers/include/cuda/std/__type_traits/aligned_storage.h +149 -0
  1016. cuda/cccl/headers/include/cuda/std/__type_traits/aligned_union.h +62 -0
  1017. cuda/cccl/headers/include/cuda/std/__type_traits/alignment_of.h +41 -0
  1018. cuda/cccl/headers/include/cuda/std/__type_traits/always_false.h +35 -0
  1019. cuda/cccl/headers/include/cuda/std/__type_traits/can_extract_key.h +68 -0
  1020. cuda/cccl/headers/include/cuda/std/__type_traits/common_reference.h +262 -0
  1021. cuda/cccl/headers/include/cuda/std/__type_traits/common_type.h +173 -0
  1022. cuda/cccl/headers/include/cuda/std/__type_traits/conditional.h +65 -0
  1023. cuda/cccl/headers/include/cuda/std/__type_traits/conjunction.h +67 -0
  1024. cuda/cccl/headers/include/cuda/std/__type_traits/copy_cv.h +50 -0
  1025. cuda/cccl/headers/include/cuda/std/__type_traits/copy_cvref.h +148 -0
  1026. cuda/cccl/headers/include/cuda/std/__type_traits/decay.h +83 -0
  1027. cuda/cccl/headers/include/cuda/std/__type_traits/dependent_type.h +35 -0
  1028. cuda/cccl/headers/include/cuda/std/__type_traits/disjunction.h +77 -0
  1029. cuda/cccl/headers/include/cuda/std/__type_traits/enable_if.h +43 -0
  1030. cuda/cccl/headers/include/cuda/std/__type_traits/extent.h +68 -0
  1031. cuda/cccl/headers/include/cuda/std/__type_traits/fold.h +47 -0
  1032. cuda/cccl/headers/include/cuda/std/__type_traits/has_unique_object_representation.h +46 -0
  1033. cuda/cccl/headers/include/cuda/std/__type_traits/has_virtual_destructor.h +42 -0
  1034. cuda/cccl/headers/include/cuda/std/__type_traits/integral_constant.h +62 -0
  1035. cuda/cccl/headers/include/cuda/std/__type_traits/is_abstract.h +42 -0
  1036. cuda/cccl/headers/include/cuda/std/__type_traits/is_aggregate.h +42 -0
  1037. cuda/cccl/headers/include/cuda/std/__type_traits/is_allocator.h +46 -0
  1038. cuda/cccl/headers/include/cuda/std/__type_traits/is_arithmetic.h +42 -0
  1039. cuda/cccl/headers/include/cuda/std/__type_traits/is_array.h +62 -0
  1040. cuda/cccl/headers/include/cuda/std/__type_traits/is_assignable.h +78 -0
  1041. cuda/cccl/headers/include/cuda/std/__type_traits/is_base_of.h +42 -0
  1042. cuda/cccl/headers/include/cuda/std/__type_traits/is_bounded_array.h +44 -0
  1043. cuda/cccl/headers/include/cuda/std/__type_traits/is_callable.h +60 -0
  1044. cuda/cccl/headers/include/cuda/std/__type_traits/is_char_like_type.h +38 -0
  1045. cuda/cccl/headers/include/cuda/std/__type_traits/is_class.h +42 -0
  1046. cuda/cccl/headers/include/cuda/std/__type_traits/is_comparable.h +78 -0
  1047. cuda/cccl/headers/include/cuda/std/__type_traits/is_compound.h +58 -0
  1048. cuda/cccl/headers/include/cuda/std/__type_traits/is_const.h +56 -0
  1049. cuda/cccl/headers/include/cuda/std/__type_traits/is_constant_evaluated.h +51 -0
  1050. cuda/cccl/headers/include/cuda/std/__type_traits/is_constructible.h +174 -0
  1051. cuda/cccl/headers/include/cuda/std/__type_traits/is_convertible.h +211 -0
  1052. cuda/cccl/headers/include/cuda/std/__type_traits/is_copy_assignable.h +43 -0
  1053. cuda/cccl/headers/include/cuda/std/__type_traits/is_copy_constructible.h +43 -0
  1054. cuda/cccl/headers/include/cuda/std/__type_traits/is_core_convertible.h +47 -0
  1055. cuda/cccl/headers/include/cuda/std/__type_traits/is_corresponding_member.h +42 -0
  1056. cuda/cccl/headers/include/cuda/std/__type_traits/is_default_constructible.h +40 -0
  1057. cuda/cccl/headers/include/cuda/std/__type_traits/is_destructible.h +115 -0
  1058. cuda/cccl/headers/include/cuda/std/__type_traits/is_empty.h +42 -0
  1059. cuda/cccl/headers/include/cuda/std/__type_traits/is_enum.h +42 -0
  1060. cuda/cccl/headers/include/cuda/std/__type_traits/is_execution_policy.h +81 -0
  1061. cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_arithmetic.h +38 -0
  1062. cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_floating_point.h +79 -0
  1063. cuda/cccl/headers/include/cuda/std/__type_traits/is_final.h +42 -0
  1064. cuda/cccl/headers/include/cuda/std/__type_traits/is_floating_point.h +53 -0
  1065. cuda/cccl/headers/include/cuda/std/__type_traits/is_fully_bounded_array.h +47 -0
  1066. cuda/cccl/headers/include/cuda/std/__type_traits/is_function.h +61 -0
  1067. cuda/cccl/headers/include/cuda/std/__type_traits/is_fundamental.h +56 -0
  1068. cuda/cccl/headers/include/cuda/std/__type_traits/is_implicitly_default_constructible.h +57 -0
  1069. cuda/cccl/headers/include/cuda/std/__type_traits/is_integer.h +45 -0
  1070. cuda/cccl/headers/include/cuda/std/__type_traits/is_integral.h +123 -0
  1071. cuda/cccl/headers/include/cuda/std/__type_traits/is_layout_compatible.h +45 -0
  1072. cuda/cccl/headers/include/cuda/std/__type_traits/is_literal_type.h +42 -0
  1073. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_function_pointer.h +79 -0
  1074. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_object_pointer.h +57 -0
  1075. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_pointer.h +57 -0
  1076. cuda/cccl/headers/include/cuda/std/__type_traits/is_move_assignable.h +43 -0
  1077. cuda/cccl/headers/include/cuda/std/__type_traits/is_move_constructible.h +42 -0
  1078. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_assignable.h +70 -0
  1079. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_constructible.h +84 -0
  1080. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_convertible.h +59 -0
  1081. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_copy_assignable.h +60 -0
  1082. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_copy_constructible.h +43 -0
  1083. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_default_constructible.h +54 -0
  1084. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_destructible.h +82 -0
  1085. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_move_assignable.h +60 -0
  1086. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_move_constructible.h +42 -0
  1087. cuda/cccl/headers/include/cuda/std/__type_traits/is_null_pointer.h +43 -0
  1088. cuda/cccl/headers/include/cuda/std/__type_traits/is_object.h +57 -0
  1089. cuda/cccl/headers/include/cuda/std/__type_traits/is_one_of.h +37 -0
  1090. cuda/cccl/headers/include/cuda/std/__type_traits/is_pod.h +42 -0
  1091. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer.h +60 -0
  1092. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer_interconvertible_base_of.h +84 -0
  1093. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer_interconvertible_with_class.h +42 -0
  1094. cuda/cccl/headers/include/cuda/std/__type_traits/is_polymorphic.h +42 -0
  1095. cuda/cccl/headers/include/cuda/std/__type_traits/is_primary_template.h +121 -0
  1096. cuda/cccl/headers/include/cuda/std/__type_traits/is_reference.h +95 -0
  1097. cuda/cccl/headers/include/cuda/std/__type_traits/is_referenceable.h +55 -0
  1098. cuda/cccl/headers/include/cuda/std/__type_traits/is_same.h +88 -0
  1099. cuda/cccl/headers/include/cuda/std/__type_traits/is_scalar.h +60 -0
  1100. cuda/cccl/headers/include/cuda/std/__type_traits/is_scoped_enum.h +49 -0
  1101. cuda/cccl/headers/include/cuda/std/__type_traits/is_signed.h +65 -0
  1102. cuda/cccl/headers/include/cuda/std/__type_traits/is_signed_integer.h +59 -0
  1103. cuda/cccl/headers/include/cuda/std/__type_traits/is_standard_layout.h +42 -0
  1104. cuda/cccl/headers/include/cuda/std/__type_traits/is_swappable.h +200 -0
  1105. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivial.h +42 -0
  1106. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_assignable.h +43 -0
  1107. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_constructible.h +43 -0
  1108. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_assignable.h +46 -0
  1109. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_constructible.h +45 -0
  1110. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copyable.h +42 -0
  1111. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_default_constructible.h +42 -0
  1112. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_destructible.h +58 -0
  1113. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_assignable.h +45 -0
  1114. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_constructible.h +44 -0
  1115. cuda/cccl/headers/include/cuda/std/__type_traits/is_unbounded_array.h +43 -0
  1116. cuda/cccl/headers/include/cuda/std/__type_traits/is_union.h +42 -0
  1117. cuda/cccl/headers/include/cuda/std/__type_traits/is_unsigned.h +66 -0
  1118. cuda/cccl/headers/include/cuda/std/__type_traits/is_unsigned_integer.h +59 -0
  1119. cuda/cccl/headers/include/cuda/std/__type_traits/is_valid_expansion.h +41 -0
  1120. cuda/cccl/headers/include/cuda/std/__type_traits/is_void.h +55 -0
  1121. cuda/cccl/headers/include/cuda/std/__type_traits/is_volatile.h +56 -0
  1122. cuda/cccl/headers/include/cuda/std/__type_traits/lazy.h +35 -0
  1123. cuda/cccl/headers/include/cuda/std/__type_traits/make_const_lvalue_ref.h +36 -0
  1124. cuda/cccl/headers/include/cuda/std/__type_traits/make_nbit_int.h +107 -0
  1125. cuda/cccl/headers/include/cuda/std/__type_traits/make_signed.h +140 -0
  1126. cuda/cccl/headers/include/cuda/std/__type_traits/make_unsigned.h +151 -0
  1127. cuda/cccl/headers/include/cuda/std/__type_traits/maybe_const.h +36 -0
  1128. cuda/cccl/headers/include/cuda/std/__type_traits/nat.h +39 -0
  1129. cuda/cccl/headers/include/cuda/std/__type_traits/negation.h +44 -0
  1130. cuda/cccl/headers/include/cuda/std/__type_traits/num_bits.h +122 -0
  1131. cuda/cccl/headers/include/cuda/std/__type_traits/promote.h +163 -0
  1132. cuda/cccl/headers/include/cuda/std/__type_traits/rank.h +60 -0
  1133. cuda/cccl/headers/include/cuda/std/__type_traits/reference_constructs_from_temporary.h +57 -0
  1134. cuda/cccl/headers/include/cuda/std/__type_traits/reference_converts_from_temporary.h +56 -0
  1135. cuda/cccl/headers/include/cuda/std/__type_traits/remove_all_extents.h +66 -0
  1136. cuda/cccl/headers/include/cuda/std/__type_traits/remove_const.h +59 -0
  1137. cuda/cccl/headers/include/cuda/std/__type_traits/remove_const_ref.h +37 -0
  1138. cuda/cccl/headers/include/cuda/std/__type_traits/remove_cv.h +57 -0
  1139. cuda/cccl/headers/include/cuda/std/__type_traits/remove_cvref.h +57 -0
  1140. cuda/cccl/headers/include/cuda/std/__type_traits/remove_extent.h +65 -0
  1141. cuda/cccl/headers/include/cuda/std/__type_traits/remove_pointer.h +73 -0
  1142. cuda/cccl/headers/include/cuda/std/__type_traits/remove_reference.h +72 -0
  1143. cuda/cccl/headers/include/cuda/std/__type_traits/remove_volatile.h +58 -0
  1144. cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +47 -0
  1145. cuda/cccl/headers/include/cuda/std/__type_traits/type_identity.h +40 -0
  1146. cuda/cccl/headers/include/cuda/std/__type_traits/type_list.h +1067 -0
  1147. cuda/cccl/headers/include/cuda/std/__type_traits/type_set.h +131 -0
  1148. cuda/cccl/headers/include/cuda/std/__type_traits/underlying_type.h +52 -0
  1149. cuda/cccl/headers/include/cuda/std/__type_traits/void_t.h +34 -0
  1150. cuda/cccl/headers/include/cuda/std/__utility/as_const.h +52 -0
  1151. cuda/cccl/headers/include/cuda/std/__utility/auto_cast.h +34 -0
  1152. cuda/cccl/headers/include/cuda/std/__utility/cmp.h +116 -0
  1153. cuda/cccl/headers/include/cuda/std/__utility/convert_to_integral.h +101 -0
  1154. cuda/cccl/headers/include/cuda/std/__utility/declval.h +76 -0
  1155. cuda/cccl/headers/include/cuda/std/__utility/exception_guard.h +161 -0
  1156. cuda/cccl/headers/include/cuda/std/__utility/exchange.h +46 -0
  1157. cuda/cccl/headers/include/cuda/std/__utility/forward.h +59 -0
  1158. cuda/cccl/headers/include/cuda/std/__utility/forward_like.h +55 -0
  1159. cuda/cccl/headers/include/cuda/std/__utility/in_place.h +86 -0
  1160. cuda/cccl/headers/include/cuda/std/__utility/integer_sequence.h +249 -0
  1161. cuda/cccl/headers/include/cuda/std/__utility/monostate.h +99 -0
  1162. cuda/cccl/headers/include/cuda/std/__utility/move.h +74 -0
  1163. cuda/cccl/headers/include/cuda/std/__utility/pair.h +791 -0
  1164. cuda/cccl/headers/include/cuda/std/__utility/piecewise_construct.h +37 -0
  1165. cuda/cccl/headers/include/cuda/std/__utility/pod_tuple.h +527 -0
  1166. cuda/cccl/headers/include/cuda/std/__utility/priority_tag.h +40 -0
  1167. cuda/cccl/headers/include/cuda/std/__utility/rel_ops.h +61 -0
  1168. cuda/cccl/headers/include/cuda/std/__utility/swap.h +64 -0
  1169. cuda/cccl/headers/include/cuda/std/__utility/to_underlying.h +40 -0
  1170. cuda/cccl/headers/include/cuda/std/__utility/typeid.h +421 -0
  1171. cuda/cccl/headers/include/cuda/std/__utility/undefined.h +34 -0
  1172. cuda/cccl/headers/include/cuda/std/__utility/unreachable.h +37 -0
  1173. cuda/cccl/headers/include/cuda/std/__variant/bad_variant_access.h +74 -0
  1174. cuda/cccl/headers/include/cuda/std/__variant/comparison.h +207 -0
  1175. cuda/cccl/headers/include/cuda/std/__variant/get.h +192 -0
  1176. cuda/cccl/headers/include/cuda/std/__variant/hash.h +82 -0
  1177. cuda/cccl/headers/include/cuda/std/__variant/sfinae_helpers.h +89 -0
  1178. cuda/cccl/headers/include/cuda/std/__variant/variant.h +250 -0
  1179. cuda/cccl/headers/include/cuda/std/__variant/variant_access.h +70 -0
  1180. cuda/cccl/headers/include/cuda/std/__variant/variant_base.h +683 -0
  1181. cuda/cccl/headers/include/cuda/std/__variant/variant_constraints.h +135 -0
  1182. cuda/cccl/headers/include/cuda/std/__variant/variant_match.h +126 -0
  1183. cuda/cccl/headers/include/cuda/std/__variant/variant_traits.h +184 -0
  1184. cuda/cccl/headers/include/cuda/std/__variant/variant_visit.h +225 -0
  1185. cuda/cccl/headers/include/cuda/std/__variant/visit.h +148 -0
  1186. cuda/cccl/headers/include/cuda/std/array +518 -0
  1187. cuda/cccl/headers/include/cuda/std/atomic +810 -0
  1188. cuda/cccl/headers/include/cuda/std/barrier +42 -0
  1189. cuda/cccl/headers/include/cuda/std/bit +35 -0
  1190. cuda/cccl/headers/include/cuda/std/bitset +986 -0
  1191. cuda/cccl/headers/include/cuda/std/cassert +28 -0
  1192. cuda/cccl/headers/include/cuda/std/ccomplex +15 -0
  1193. cuda/cccl/headers/include/cuda/std/cfloat +59 -0
  1194. cuda/cccl/headers/include/cuda/std/chrono +26 -0
  1195. cuda/cccl/headers/include/cuda/std/climits +61 -0
  1196. cuda/cccl/headers/include/cuda/std/cmath +87 -0
  1197. cuda/cccl/headers/include/cuda/std/complex +50 -0
  1198. cuda/cccl/headers/include/cuda/std/concepts +48 -0
  1199. cuda/cccl/headers/include/cuda/std/cstddef +28 -0
  1200. cuda/cccl/headers/include/cuda/std/cstdint +178 -0
  1201. cuda/cccl/headers/include/cuda/std/cstdlib +30 -0
  1202. cuda/cccl/headers/include/cuda/std/cstring +110 -0
  1203. cuda/cccl/headers/include/cuda/std/ctime +154 -0
  1204. cuda/cccl/headers/include/cuda/std/detail/__config +45 -0
  1205. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +207 -0
  1206. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/algorithm +1718 -0
  1207. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/chrono +2506 -0
  1208. cuda/cccl/headers/include/cuda/std/execution +29 -0
  1209. cuda/cccl/headers/include/cuda/std/expected +30 -0
  1210. cuda/cccl/headers/include/cuda/std/functional +56 -0
  1211. cuda/cccl/headers/include/cuda/std/initializer_list +44 -0
  1212. cuda/cccl/headers/include/cuda/std/inplace_vector +2171 -0
  1213. cuda/cccl/headers/include/cuda/std/iterator +70 -0
  1214. cuda/cccl/headers/include/cuda/std/latch +34 -0
  1215. cuda/cccl/headers/include/cuda/std/limits +28 -0
  1216. cuda/cccl/headers/include/cuda/std/linalg +30 -0
  1217. cuda/cccl/headers/include/cuda/std/mdspan +38 -0
  1218. cuda/cccl/headers/include/cuda/std/memory +39 -0
  1219. cuda/cccl/headers/include/cuda/std/numbers +344 -0
  1220. cuda/cccl/headers/include/cuda/std/numeric +41 -0
  1221. cuda/cccl/headers/include/cuda/std/optional +31 -0
  1222. cuda/cccl/headers/include/cuda/std/ranges +69 -0
  1223. cuda/cccl/headers/include/cuda/std/ratio +416 -0
  1224. cuda/cccl/headers/include/cuda/std/semaphore +31 -0
  1225. cuda/cccl/headers/include/cuda/std/source_location +83 -0
  1226. cuda/cccl/headers/include/cuda/std/span +628 -0
  1227. cuda/cccl/headers/include/cuda/std/string_view +923 -0
  1228. cuda/cccl/headers/include/cuda/std/tuple +43 -0
  1229. cuda/cccl/headers/include/cuda/std/type_traits +176 -0
  1230. cuda/cccl/headers/include/cuda/std/utility +70 -0
  1231. cuda/cccl/headers/include/cuda/std/variant +32 -0
  1232. cuda/cccl/headers/include/cuda/std/version +240 -0
  1233. cuda/cccl/headers/include/cuda/stream +31 -0
  1234. cuda/cccl/headers/include/cuda/stream_ref +59 -0
  1235. cuda/cccl/headers/include/cuda/type_traits +27 -0
  1236. cuda/cccl/headers/include/cuda/utility +28 -0
  1237. cuda/cccl/headers/include/cuda/version +16 -0
  1238. cuda/cccl/headers/include/cuda/warp +28 -0
  1239. cuda/cccl/headers/include/cuda/work_stealing +26 -0
  1240. cuda/cccl/headers/include/nv/detail/__preprocessor +169 -0
  1241. cuda/cccl/headers/include/nv/detail/__target_macros +718 -0
  1242. cuda/cccl/headers/include/nv/target +236 -0
  1243. cuda/cccl/headers/include/thrust/addressof.h +22 -0
  1244. cuda/cccl/headers/include/thrust/adjacent_difference.h +254 -0
  1245. cuda/cccl/headers/include/thrust/advance.h +57 -0
  1246. cuda/cccl/headers/include/thrust/allocate_unique.h +299 -0
  1247. cuda/cccl/headers/include/thrust/binary_search.h +1910 -0
  1248. cuda/cccl/headers/include/thrust/complex.h +858 -0
  1249. cuda/cccl/headers/include/thrust/copy.h +506 -0
  1250. cuda/cccl/headers/include/thrust/count.h +245 -0
  1251. cuda/cccl/headers/include/thrust/detail/adjacent_difference.inl +108 -0
  1252. cuda/cccl/headers/include/thrust/detail/alignment.h +81 -0
  1253. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +624 -0
  1254. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +191 -0
  1255. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +95 -0
  1256. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +79 -0
  1257. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +76 -0
  1258. cuda/cccl/headers/include/thrust/detail/allocator/no_throw_allocator.h +74 -0
  1259. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +113 -0
  1260. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +114 -0
  1261. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.h +77 -0
  1262. cuda/cccl/headers/include/thrust/detail/allocator_aware_execution_policy.h +95 -0
  1263. cuda/cccl/headers/include/thrust/detail/binary_search.inl +537 -0
  1264. cuda/cccl/headers/include/thrust/detail/caching_allocator.h +47 -0
  1265. cuda/cccl/headers/include/thrust/detail/complex/arithmetic.h +250 -0
  1266. cuda/cccl/headers/include/thrust/detail/complex/c99math.h +58 -0
  1267. cuda/cccl/headers/include/thrust/detail/complex/catrig.h +869 -0
  1268. cuda/cccl/headers/include/thrust/detail/complex/catrigf.h +583 -0
  1269. cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +227 -0
  1270. cuda/cccl/headers/include/thrust/detail/complex/ccoshf.h +155 -0
  1271. cuda/cccl/headers/include/thrust/detail/complex/cexp.h +190 -0
  1272. cuda/cccl/headers/include/thrust/detail/complex/cexpf.h +167 -0
  1273. cuda/cccl/headers/include/thrust/detail/complex/clog.h +217 -0
  1274. cuda/cccl/headers/include/thrust/detail/complex/clogf.h +204 -0
  1275. cuda/cccl/headers/include/thrust/detail/complex/complex.inl +263 -0
  1276. cuda/cccl/headers/include/thrust/detail/complex/cpow.h +50 -0
  1277. cuda/cccl/headers/include/thrust/detail/complex/cproj.h +76 -0
  1278. cuda/cccl/headers/include/thrust/detail/complex/csinh.h +222 -0
  1279. cuda/cccl/headers/include/thrust/detail/complex/csinhf.h +162 -0
  1280. cuda/cccl/headers/include/thrust/detail/complex/csqrt.h +172 -0
  1281. cuda/cccl/headers/include/thrust/detail/complex/csqrtf.h +168 -0
  1282. cuda/cccl/headers/include/thrust/detail/complex/ctanh.h +202 -0
  1283. cuda/cccl/headers/include/thrust/detail/complex/ctanhf.h +127 -0
  1284. cuda/cccl/headers/include/thrust/detail/complex/math_private.h +132 -0
  1285. cuda/cccl/headers/include/thrust/detail/complex/stream.h +73 -0
  1286. cuda/cccl/headers/include/thrust/detail/config/compiler.h +38 -0
  1287. cuda/cccl/headers/include/thrust/detail/config/config.h +43 -0
  1288. cuda/cccl/headers/include/thrust/detail/config/cpp_dialect.h +78 -0
  1289. cuda/cccl/headers/include/thrust/detail/config/device_system.h +57 -0
  1290. cuda/cccl/headers/include/thrust/detail/config/host_system.h +50 -0
  1291. cuda/cccl/headers/include/thrust/detail/config/memory_resource.h +41 -0
  1292. cuda/cccl/headers/include/thrust/detail/config/namespace.h +161 -0
  1293. cuda/cccl/headers/include/thrust/detail/config/simple_defines.h +48 -0
  1294. cuda/cccl/headers/include/thrust/detail/config.h +36 -0
  1295. cuda/cccl/headers/include/thrust/detail/contiguous_storage.h +226 -0
  1296. cuda/cccl/headers/include/thrust/detail/contiguous_storage.inl +271 -0
  1297. cuda/cccl/headers/include/thrust/detail/copy.h +70 -0
  1298. cuda/cccl/headers/include/thrust/detail/copy.inl +139 -0
  1299. cuda/cccl/headers/include/thrust/detail/copy_if.h +62 -0
  1300. cuda/cccl/headers/include/thrust/detail/copy_if.inl +114 -0
  1301. cuda/cccl/headers/include/thrust/detail/count.h +55 -0
  1302. cuda/cccl/headers/include/thrust/detail/count.inl +101 -0
  1303. cuda/cccl/headers/include/thrust/detail/device_ptr.inl +48 -0
  1304. cuda/cccl/headers/include/thrust/detail/equal.inl +105 -0
  1305. cuda/cccl/headers/include/thrust/detail/event_error.h +160 -0
  1306. cuda/cccl/headers/include/thrust/detail/execute_with_allocator.h +79 -0
  1307. cuda/cccl/headers/include/thrust/detail/execute_with_allocator_fwd.h +61 -0
  1308. cuda/cccl/headers/include/thrust/detail/execution_policy.h +120 -0
  1309. cuda/cccl/headers/include/thrust/detail/extrema.inl +196 -0
  1310. cuda/cccl/headers/include/thrust/detail/fill.inl +98 -0
  1311. cuda/cccl/headers/include/thrust/detail/find.inl +125 -0
  1312. cuda/cccl/headers/include/thrust/detail/for_each.inl +96 -0
  1313. cuda/cccl/headers/include/thrust/detail/function.h +49 -0
  1314. cuda/cccl/headers/include/thrust/detail/functional/actor.h +211 -0
  1315. cuda/cccl/headers/include/thrust/detail/functional/operators.h +383 -0
  1316. cuda/cccl/headers/include/thrust/detail/gather.inl +185 -0
  1317. cuda/cccl/headers/include/thrust/detail/generate.inl +98 -0
  1318. cuda/cccl/headers/include/thrust/detail/get_iterator_value.h +60 -0
  1319. cuda/cccl/headers/include/thrust/detail/inner_product.inl +130 -0
  1320. cuda/cccl/headers/include/thrust/detail/internal_functional.h +329 -0
  1321. cuda/cccl/headers/include/thrust/detail/logical.inl +125 -0
  1322. cuda/cccl/headers/include/thrust/detail/malloc_and_free.h +89 -0
  1323. cuda/cccl/headers/include/thrust/detail/malloc_and_free_fwd.h +45 -0
  1324. cuda/cccl/headers/include/thrust/detail/memory_algorithms.h +209 -0
  1325. cuda/cccl/headers/include/thrust/detail/merge.inl +288 -0
  1326. cuda/cccl/headers/include/thrust/detail/mismatch.inl +106 -0
  1327. cuda/cccl/headers/include/thrust/detail/overlapped_copy.h +120 -0
  1328. cuda/cccl/headers/include/thrust/detail/partition.inl +390 -0
  1329. cuda/cccl/headers/include/thrust/detail/pointer.h +309 -0
  1330. cuda/cccl/headers/include/thrust/detail/preprocessor.h +652 -0
  1331. cuda/cccl/headers/include/thrust/detail/random_bijection.h +175 -0
  1332. cuda/cccl/headers/include/thrust/detail/range/head_flags.h +114 -0
  1333. cuda/cccl/headers/include/thrust/detail/range/tail_flags.h +128 -0
  1334. cuda/cccl/headers/include/thrust/detail/raw_pointer_cast.h +52 -0
  1335. cuda/cccl/headers/include/thrust/detail/raw_reference_cast.h +186 -0
  1336. cuda/cccl/headers/include/thrust/detail/reduce.inl +395 -0
  1337. cuda/cccl/headers/include/thrust/detail/reference.h +518 -0
  1338. cuda/cccl/headers/include/thrust/detail/reference_forward_declaration.h +35 -0
  1339. cuda/cccl/headers/include/thrust/detail/remove.inl +225 -0
  1340. cuda/cccl/headers/include/thrust/detail/replace.inl +243 -0
  1341. cuda/cccl/headers/include/thrust/detail/reverse.inl +100 -0
  1342. cuda/cccl/headers/include/thrust/detail/scan.inl +536 -0
  1343. cuda/cccl/headers/include/thrust/detail/scatter.inl +169 -0
  1344. cuda/cccl/headers/include/thrust/detail/seq.h +66 -0
  1345. cuda/cccl/headers/include/thrust/detail/sequence.inl +121 -0
  1346. cuda/cccl/headers/include/thrust/detail/set_operations.inl +993 -0
  1347. cuda/cccl/headers/include/thrust/detail/shuffle.inl +86 -0
  1348. cuda/cccl/headers/include/thrust/detail/sort.inl +385 -0
  1349. cuda/cccl/headers/include/thrust/detail/static_assert.h +56 -0
  1350. cuda/cccl/headers/include/thrust/detail/static_map.h +164 -0
  1351. cuda/cccl/headers/include/thrust/detail/swap_ranges.inl +77 -0
  1352. cuda/cccl/headers/include/thrust/detail/tabulate.inl +74 -0
  1353. cuda/cccl/headers/include/thrust/detail/temporary_array.h +149 -0
  1354. cuda/cccl/headers/include/thrust/detail/temporary_array.inl +119 -0
  1355. cuda/cccl/headers/include/thrust/detail/temporary_buffer.h +92 -0
  1356. cuda/cccl/headers/include/thrust/detail/transform_reduce.inl +81 -0
  1357. cuda/cccl/headers/include/thrust/detail/transform_scan.inl +173 -0
  1358. cuda/cccl/headers/include/thrust/detail/trivial_sequence.h +128 -0
  1359. cuda/cccl/headers/include/thrust/detail/tuple_meta_transform.h +59 -0
  1360. cuda/cccl/headers/include/thrust/detail/type_deduction.h +62 -0
  1361. cuda/cccl/headers/include/thrust/detail/type_traits/has_member_function.h +47 -0
  1362. cuda/cccl/headers/include/thrust/detail/type_traits/has_nested_type.h +43 -0
  1363. cuda/cccl/headers/include/thrust/detail/type_traits/is_call_possible.h +162 -0
  1364. cuda/cccl/headers/include/thrust/detail/type_traits/is_commutative.h +67 -0
  1365. cuda/cccl/headers/include/thrust/detail/type_traits/is_metafunction_defined.h +39 -0
  1366. cuda/cccl/headers/include/thrust/detail/type_traits/is_thrust_pointer.h +55 -0
  1367. cuda/cccl/headers/include/thrust/detail/type_traits/iterator/is_output_iterator.h +46 -0
  1368. cuda/cccl/headers/include/thrust/detail/type_traits/minimum_type.h +89 -0
  1369. cuda/cccl/headers/include/thrust/detail/type_traits/pointer_traits.h +328 -0
  1370. cuda/cccl/headers/include/thrust/detail/type_traits.h +136 -0
  1371. cuda/cccl/headers/include/thrust/detail/uninitialized_copy.inl +102 -0
  1372. cuda/cccl/headers/include/thrust/detail/uninitialized_fill.inl +98 -0
  1373. cuda/cccl/headers/include/thrust/detail/unique.inl +391 -0
  1374. cuda/cccl/headers/include/thrust/detail/use_default.h +34 -0
  1375. cuda/cccl/headers/include/thrust/detail/vector_base.h +611 -0
  1376. cuda/cccl/headers/include/thrust/detail/vector_base.inl +1208 -0
  1377. cuda/cccl/headers/include/thrust/device_allocator.h +134 -0
  1378. cuda/cccl/headers/include/thrust/device_delete.h +74 -0
  1379. cuda/cccl/headers/include/thrust/device_free.h +85 -0
  1380. cuda/cccl/headers/include/thrust/device_make_unique.h +56 -0
  1381. cuda/cccl/headers/include/thrust/device_malloc.h +84 -0
  1382. cuda/cccl/headers/include/thrust/device_malloc_allocator.h +190 -0
  1383. cuda/cccl/headers/include/thrust/device_new.h +112 -0
  1384. cuda/cccl/headers/include/thrust/device_new_allocator.h +179 -0
  1385. cuda/cccl/headers/include/thrust/device_ptr.h +196 -0
  1386. cuda/cccl/headers/include/thrust/device_reference.h +983 -0
  1387. cuda/cccl/headers/include/thrust/device_vector.h +576 -0
  1388. cuda/cccl/headers/include/thrust/distance.h +43 -0
  1389. cuda/cccl/headers/include/thrust/equal.h +247 -0
  1390. cuda/cccl/headers/include/thrust/execution_policy.h +252 -0
  1391. cuda/cccl/headers/include/thrust/extrema.h +657 -0
  1392. cuda/cccl/headers/include/thrust/fill.h +200 -0
  1393. cuda/cccl/headers/include/thrust/find.h +382 -0
  1394. cuda/cccl/headers/include/thrust/for_each.h +261 -0
  1395. cuda/cccl/headers/include/thrust/functional.h +393 -0
  1396. cuda/cccl/headers/include/thrust/gather.h +464 -0
  1397. cuda/cccl/headers/include/thrust/generate.h +193 -0
  1398. cuda/cccl/headers/include/thrust/host_vector.h +576 -0
  1399. cuda/cccl/headers/include/thrust/inner_product.h +264 -0
  1400. cuda/cccl/headers/include/thrust/iterator/constant_iterator.h +221 -0
  1401. cuda/cccl/headers/include/thrust/iterator/counting_iterator.h +335 -0
  1402. cuda/cccl/headers/include/thrust/iterator/detail/any_assign.h +48 -0
  1403. cuda/cccl/headers/include/thrust/iterator/detail/any_system_tag.h +43 -0
  1404. cuda/cccl/headers/include/thrust/iterator/detail/device_system_tag.h +43 -0
  1405. cuda/cccl/headers/include/thrust/iterator/detail/host_system_tag.h +42 -0
  1406. cuda/cccl/headers/include/thrust/iterator/detail/iterator_adaptor_base.h +80 -0
  1407. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_system.h +60 -0
  1408. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_traversal.h +65 -0
  1409. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h +56 -0
  1410. cuda/cccl/headers/include/thrust/iterator/detail/iterator_facade_category.h +181 -0
  1411. cuda/cccl/headers/include/thrust/iterator/detail/minimum_system.h +57 -0
  1412. cuda/cccl/headers/include/thrust/iterator/detail/normal_iterator.h +69 -0
  1413. cuda/cccl/headers/include/thrust/iterator/detail/retag.h +104 -0
  1414. cuda/cccl/headers/include/thrust/iterator/detail/tagged_iterator.h +80 -0
  1415. cuda/cccl/headers/include/thrust/iterator/detail/tuple_of_iterator_references.h +170 -0
  1416. cuda/cccl/headers/include/thrust/iterator/discard_iterator.h +163 -0
  1417. cuda/cccl/headers/include/thrust/iterator/iterator_adaptor.h +251 -0
  1418. cuda/cccl/headers/include/thrust/iterator/iterator_categories.h +211 -0
  1419. cuda/cccl/headers/include/thrust/iterator/iterator_facade.h +659 -0
  1420. cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +334 -0
  1421. cuda/cccl/headers/include/thrust/iterator/iterator_traversal_tags.h +64 -0
  1422. cuda/cccl/headers/include/thrust/iterator/offset_iterator.h +194 -0
  1423. cuda/cccl/headers/include/thrust/iterator/permutation_iterator.h +204 -0
  1424. cuda/cccl/headers/include/thrust/iterator/retag.h +72 -0
  1425. cuda/cccl/headers/include/thrust/iterator/reverse_iterator.h +51 -0
  1426. cuda/cccl/headers/include/thrust/iterator/shuffle_iterator.h +185 -0
  1427. cuda/cccl/headers/include/thrust/iterator/strided_iterator.h +152 -0
  1428. cuda/cccl/headers/include/thrust/iterator/tabulate_output_iterator.h +152 -0
  1429. cuda/cccl/headers/include/thrust/iterator/transform_input_output_iterator.h +225 -0
  1430. cuda/cccl/headers/include/thrust/iterator/transform_iterator.h +349 -0
  1431. cuda/cccl/headers/include/thrust/iterator/transform_output_iterator.h +190 -0
  1432. cuda/cccl/headers/include/thrust/iterator/zip_iterator.h +359 -0
  1433. cuda/cccl/headers/include/thrust/logical.h +290 -0
  1434. cuda/cccl/headers/include/thrust/memory.h +299 -0
  1435. cuda/cccl/headers/include/thrust/merge.h +725 -0
  1436. cuda/cccl/headers/include/thrust/mismatch.h +261 -0
  1437. cuda/cccl/headers/include/thrust/mr/allocator.h +227 -0
  1438. cuda/cccl/headers/include/thrust/mr/device_memory_resource.h +46 -0
  1439. cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +526 -0
  1440. cuda/cccl/headers/include/thrust/mr/disjoint_sync_pool.h +116 -0
  1441. cuda/cccl/headers/include/thrust/mr/disjoint_tls_pool.h +65 -0
  1442. cuda/cccl/headers/include/thrust/mr/fancy_pointer_resource.h +65 -0
  1443. cuda/cccl/headers/include/thrust/mr/host_memory_resource.h +42 -0
  1444. cuda/cccl/headers/include/thrust/mr/memory_resource.h +215 -0
  1445. cuda/cccl/headers/include/thrust/mr/new.h +98 -0
  1446. cuda/cccl/headers/include/thrust/mr/polymorphic_adaptor.h +61 -0
  1447. cuda/cccl/headers/include/thrust/mr/pool.h +526 -0
  1448. cuda/cccl/headers/include/thrust/mr/pool_options.h +172 -0
  1449. cuda/cccl/headers/include/thrust/mr/sync_pool.h +112 -0
  1450. cuda/cccl/headers/include/thrust/mr/tls_pool.h +62 -0
  1451. cuda/cccl/headers/include/thrust/mr/universal_memory_resource.h +29 -0
  1452. cuda/cccl/headers/include/thrust/mr/validator.h +54 -0
  1453. cuda/cccl/headers/include/thrust/pair.h +99 -0
  1454. cuda/cccl/headers/include/thrust/partition.h +1391 -0
  1455. cuda/cccl/headers/include/thrust/per_device_resource.h +110 -0
  1456. cuda/cccl/headers/include/thrust/random/detail/discard_block_engine.inl +182 -0
  1457. cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine.inl +153 -0
  1458. cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine_discard.h +97 -0
  1459. cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine.inl +149 -0
  1460. cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h +46 -0
  1461. cuda/cccl/headers/include/thrust/random/detail/mod.h +94 -0
  1462. cuda/cccl/headers/include/thrust/random/detail/normal_distribution.inl +185 -0
  1463. cuda/cccl/headers/include/thrust/random/detail/normal_distribution_base.h +155 -0
  1464. cuda/cccl/headers/include/thrust/random/detail/random_core_access.h +56 -0
  1465. cuda/cccl/headers/include/thrust/random/detail/subtract_with_carry_engine.inl +199 -0
  1466. cuda/cccl/headers/include/thrust/random/detail/uniform_int_distribution.inl +196 -0
  1467. cuda/cccl/headers/include/thrust/random/detail/uniform_real_distribution.inl +198 -0
  1468. cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine.inl +181 -0
  1469. cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine_max.h +187 -0
  1470. cuda/cccl/headers/include/thrust/random/discard_block_engine.h +238 -0
  1471. cuda/cccl/headers/include/thrust/random/linear_congruential_engine.h +287 -0
  1472. cuda/cccl/headers/include/thrust/random/linear_feedback_shift_engine.h +215 -0
  1473. cuda/cccl/headers/include/thrust/random/normal_distribution.h +255 -0
  1474. cuda/cccl/headers/include/thrust/random/subtract_with_carry_engine.h +245 -0
  1475. cuda/cccl/headers/include/thrust/random/uniform_int_distribution.h +259 -0
  1476. cuda/cccl/headers/include/thrust/random/uniform_real_distribution.h +256 -0
  1477. cuda/cccl/headers/include/thrust/random/xor_combine_engine.h +253 -0
  1478. cuda/cccl/headers/include/thrust/random.h +118 -0
  1479. cuda/cccl/headers/include/thrust/reduce.h +1113 -0
  1480. cuda/cccl/headers/include/thrust/remove.h +768 -0
  1481. cuda/cccl/headers/include/thrust/replace.h +826 -0
  1482. cuda/cccl/headers/include/thrust/reverse.h +215 -0
  1483. cuda/cccl/headers/include/thrust/scan.h +1671 -0
  1484. cuda/cccl/headers/include/thrust/scatter.h +446 -0
  1485. cuda/cccl/headers/include/thrust/sequence.h +277 -0
  1486. cuda/cccl/headers/include/thrust/set_operations.h +3026 -0
  1487. cuda/cccl/headers/include/thrust/shuffle.h +182 -0
  1488. cuda/cccl/headers/include/thrust/sort.h +1320 -0
  1489. cuda/cccl/headers/include/thrust/swap.h +147 -0
  1490. cuda/cccl/headers/include/thrust/system/cpp/detail/adjacent_difference.h +30 -0
  1491. cuda/cccl/headers/include/thrust/system/cpp/detail/assign_value.h +30 -0
  1492. cuda/cccl/headers/include/thrust/system/cpp/detail/binary_search.h +32 -0
  1493. cuda/cccl/headers/include/thrust/system/cpp/detail/copy.h +30 -0
  1494. cuda/cccl/headers/include/thrust/system/cpp/detail/copy_if.h +30 -0
  1495. cuda/cccl/headers/include/thrust/system/cpp/detail/count.h +29 -0
  1496. cuda/cccl/headers/include/thrust/system/cpp/detail/equal.h +29 -0
  1497. cuda/cccl/headers/include/thrust/system/cpp/detail/execution_policy.h +113 -0
  1498. cuda/cccl/headers/include/thrust/system/cpp/detail/extrema.h +30 -0
  1499. cuda/cccl/headers/include/thrust/system/cpp/detail/fill.h +29 -0
  1500. cuda/cccl/headers/include/thrust/system/cpp/detail/find.h +30 -0
  1501. cuda/cccl/headers/include/thrust/system/cpp/detail/for_each.h +30 -0
  1502. cuda/cccl/headers/include/thrust/system/cpp/detail/gather.h +29 -0
  1503. cuda/cccl/headers/include/thrust/system/cpp/detail/generate.h +29 -0
  1504. cuda/cccl/headers/include/thrust/system/cpp/detail/get_value.h +30 -0
  1505. cuda/cccl/headers/include/thrust/system/cpp/detail/inner_product.h +29 -0
  1506. cuda/cccl/headers/include/thrust/system/cpp/detail/iter_swap.h +30 -0
  1507. cuda/cccl/headers/include/thrust/system/cpp/detail/logical.h +29 -0
  1508. cuda/cccl/headers/include/thrust/system/cpp/detail/malloc_and_free.h +30 -0
  1509. cuda/cccl/headers/include/thrust/system/cpp/detail/memory.inl +55 -0
  1510. cuda/cccl/headers/include/thrust/system/cpp/detail/merge.h +30 -0
  1511. cuda/cccl/headers/include/thrust/system/cpp/detail/mismatch.h +29 -0
  1512. cuda/cccl/headers/include/thrust/system/cpp/detail/partition.h +30 -0
  1513. cuda/cccl/headers/include/thrust/system/cpp/detail/per_device_resource.h +29 -0
  1514. cuda/cccl/headers/include/thrust/system/cpp/detail/reduce.h +30 -0
  1515. cuda/cccl/headers/include/thrust/system/cpp/detail/reduce_by_key.h +30 -0
  1516. cuda/cccl/headers/include/thrust/system/cpp/detail/remove.h +30 -0
  1517. cuda/cccl/headers/include/thrust/system/cpp/detail/replace.h +29 -0
  1518. cuda/cccl/headers/include/thrust/system/cpp/detail/reverse.h +29 -0
  1519. cuda/cccl/headers/include/thrust/system/cpp/detail/scan.h +30 -0
  1520. cuda/cccl/headers/include/thrust/system/cpp/detail/scan_by_key.h +30 -0
  1521. cuda/cccl/headers/include/thrust/system/cpp/detail/scatter.h +29 -0
  1522. cuda/cccl/headers/include/thrust/system/cpp/detail/sequence.h +29 -0
  1523. cuda/cccl/headers/include/thrust/system/cpp/detail/set_operations.h +30 -0
  1524. cuda/cccl/headers/include/thrust/system/cpp/detail/sort.h +30 -0
  1525. cuda/cccl/headers/include/thrust/system/cpp/detail/swap_ranges.h +29 -0
  1526. cuda/cccl/headers/include/thrust/system/cpp/detail/tabulate.h +29 -0
  1527. cuda/cccl/headers/include/thrust/system/cpp/detail/temporary_buffer.h +29 -0
  1528. cuda/cccl/headers/include/thrust/system/cpp/detail/transform.h +29 -0
  1529. cuda/cccl/headers/include/thrust/system/cpp/detail/transform_reduce.h +29 -0
  1530. cuda/cccl/headers/include/thrust/system/cpp/detail/transform_scan.h +29 -0
  1531. cuda/cccl/headers/include/thrust/system/cpp/detail/uninitialized_copy.h +29 -0
  1532. cuda/cccl/headers/include/thrust/system/cpp/detail/uninitialized_fill.h +29 -0
  1533. cuda/cccl/headers/include/thrust/system/cpp/detail/unique.h +30 -0
  1534. cuda/cccl/headers/include/thrust/system/cpp/detail/unique_by_key.h +30 -0
  1535. cuda/cccl/headers/include/thrust/system/cpp/execution_policy.h +63 -0
  1536. cuda/cccl/headers/include/thrust/system/cpp/memory.h +105 -0
  1537. cuda/cccl/headers/include/thrust/system/cpp/memory_resource.h +70 -0
  1538. cuda/cccl/headers/include/thrust/system/cpp/pointer.h +118 -0
  1539. cuda/cccl/headers/include/thrust/system/cpp/vector.h +95 -0
  1540. cuda/cccl/headers/include/thrust/system/cuda/config.h +126 -0
  1541. cuda/cccl/headers/include/thrust/system/cuda/detail/adjacent_difference.h +215 -0
  1542. cuda/cccl/headers/include/thrust/system/cuda/detail/assign_value.h +124 -0
  1543. cuda/cccl/headers/include/thrust/system/cuda/detail/binary_search.h +29 -0
  1544. cuda/cccl/headers/include/thrust/system/cuda/detail/cdp_dispatch.h +72 -0
  1545. cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +272 -0
  1546. cuda/cccl/headers/include/thrust/system/cuda/detail/copy_if.h +251 -0
  1547. cuda/cccl/headers/include/thrust/system/cuda/detail/core/agent_launcher.h +282 -0
  1548. cuda/cccl/headers/include/thrust/system/cuda/detail/core/triple_chevron_launch.h +163 -0
  1549. cuda/cccl/headers/include/thrust/system/cuda/detail/core/util.h +586 -0
  1550. cuda/cccl/headers/include/thrust/system/cuda/detail/count.h +73 -0
  1551. cuda/cccl/headers/include/thrust/system/cuda/detail/cross_system.h +241 -0
  1552. cuda/cccl/headers/include/thrust/system/cuda/detail/dispatch.h +231 -0
  1553. cuda/cccl/headers/include/thrust/system/cuda/detail/equal.h +62 -0
  1554. cuda/cccl/headers/include/thrust/system/cuda/detail/error.inl +87 -0
  1555. cuda/cccl/headers/include/thrust/system/cuda/detail/execution_policy.h +266 -0
  1556. cuda/cccl/headers/include/thrust/system/cuda/detail/extrema.h +472 -0
  1557. cuda/cccl/headers/include/thrust/system/cuda/detail/fill.h +99 -0
  1558. cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +165 -0
  1559. cuda/cccl/headers/include/thrust/system/cuda/detail/for_each.h +82 -0
  1560. cuda/cccl/headers/include/thrust/system/cuda/detail/gather.h +89 -0
  1561. cuda/cccl/headers/include/thrust/system/cuda/detail/generate.h +58 -0
  1562. cuda/cccl/headers/include/thrust/system/cuda/detail/get_value.h +65 -0
  1563. cuda/cccl/headers/include/thrust/system/cuda/detail/inner_product.h +75 -0
  1564. cuda/cccl/headers/include/thrust/system/cuda/detail/iter_swap.h +79 -0
  1565. cuda/cccl/headers/include/thrust/system/cuda/detail/logical.h +29 -0
  1566. cuda/cccl/headers/include/thrust/system/cuda/detail/make_unsigned_special.h +55 -0
  1567. cuda/cccl/headers/include/thrust/system/cuda/detail/malloc_and_free.h +119 -0
  1568. cuda/cccl/headers/include/thrust/system/cuda/detail/memory.inl +55 -0
  1569. cuda/cccl/headers/include/thrust/system/cuda/detail/merge.h +204 -0
  1570. cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +88 -0
  1571. cuda/cccl/headers/include/thrust/system/cuda/detail/parallel_for.h +79 -0
  1572. cuda/cccl/headers/include/thrust/system/cuda/detail/partition.h +401 -0
  1573. cuda/cccl/headers/include/thrust/system/cuda/detail/per_device_resource.h +70 -0
  1574. cuda/cccl/headers/include/thrust/system/cuda/detail/reduce.h +780 -0
  1575. cuda/cccl/headers/include/thrust/system/cuda/detail/reduce_by_key.h +997 -0
  1576. cuda/cccl/headers/include/thrust/system/cuda/detail/remove.h +105 -0
  1577. cuda/cccl/headers/include/thrust/system/cuda/detail/replace.h +121 -0
  1578. cuda/cccl/headers/include/thrust/system/cuda/detail/reverse.h +83 -0
  1579. cuda/cccl/headers/include/thrust/system/cuda/detail/scan.h +338 -0
  1580. cuda/cccl/headers/include/thrust/system/cuda/detail/scan_by_key.h +411 -0
  1581. cuda/cccl/headers/include/thrust/system/cuda/detail/scatter.h +89 -0
  1582. cuda/cccl/headers/include/thrust/system/cuda/detail/sequence.h +29 -0
  1583. cuda/cccl/headers/include/thrust/system/cuda/detail/set_operations.h +1732 -0
  1584. cuda/cccl/headers/include/thrust/system/cuda/detail/sort.h +468 -0
  1585. cuda/cccl/headers/include/thrust/system/cuda/detail/swap_ranges.h +98 -0
  1586. cuda/cccl/headers/include/thrust/system/cuda/detail/tabulate.h +61 -0
  1587. cuda/cccl/headers/include/thrust/system/cuda/detail/temporary_buffer.h +130 -0
  1588. cuda/cccl/headers/include/thrust/system/cuda/detail/terminate.h +53 -0
  1589. cuda/cccl/headers/include/thrust/system/cuda/detail/transform.h +428 -0
  1590. cuda/cccl/headers/include/thrust/system/cuda/detail/transform_reduce.h +139 -0
  1591. cuda/cccl/headers/include/thrust/system/cuda/detail/transform_scan.h +117 -0
  1592. cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_copy.h +111 -0
  1593. cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_fill.h +100 -0
  1594. cuda/cccl/headers/include/thrust/system/cuda/detail/unique.h +286 -0
  1595. cuda/cccl/headers/include/thrust/system/cuda/detail/unique_by_key.h +306 -0
  1596. cuda/cccl/headers/include/thrust/system/cuda/detail/util.h +253 -0
  1597. cuda/cccl/headers/include/thrust/system/cuda/error.h +159 -0
  1598. cuda/cccl/headers/include/thrust/system/cuda/execution_policy.h +15 -0
  1599. cuda/cccl/headers/include/thrust/system/cuda/memory.h +118 -0
  1600. cuda/cccl/headers/include/thrust/system/cuda/memory_resource.h +115 -0
  1601. cuda/cccl/headers/include/thrust/system/cuda/pointer.h +155 -0
  1602. cuda/cccl/headers/include/thrust/system/cuda/vector.h +104 -0
  1603. cuda/cccl/headers/include/thrust/system/detail/bad_alloc.h +59 -0
  1604. cuda/cccl/headers/include/thrust/system/detail/errno.h +118 -0
  1605. cuda/cccl/headers/include/thrust/system/detail/error_category.inl +298 -0
  1606. cuda/cccl/headers/include/thrust/system/detail/error_code.inl +171 -0
  1607. cuda/cccl/headers/include/thrust/system/detail/error_condition.inl +119 -0
  1608. cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.h +51 -0
  1609. cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.inl +77 -0
  1610. cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.h +159 -0
  1611. cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.inl +381 -0
  1612. cuda/cccl/headers/include/thrust/system/detail/generic/copy.h +43 -0
  1613. cuda/cccl/headers/include/thrust/system/detail/generic/copy.inl +62 -0
  1614. cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.h +56 -0
  1615. cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.inl +143 -0
  1616. cuda/cccl/headers/include/thrust/system/detail/generic/count.h +46 -0
  1617. cuda/cccl/headers/include/thrust/system/detail/generic/count.inl +82 -0
  1618. cuda/cccl/headers/include/thrust/system/detail/generic/equal.h +47 -0
  1619. cuda/cccl/headers/include/thrust/system/detail/generic/equal.inl +58 -0
  1620. cuda/cccl/headers/include/thrust/system/detail/generic/extrema.h +64 -0
  1621. cuda/cccl/headers/include/thrust/system/detail/generic/extrema.inl +249 -0
  1622. cuda/cccl/headers/include/thrust/system/detail/generic/fill.h +52 -0
  1623. cuda/cccl/headers/include/thrust/system/detail/generic/find.h +47 -0
  1624. cuda/cccl/headers/include/thrust/system/detail/generic/find.inl +135 -0
  1625. cuda/cccl/headers/include/thrust/system/detail/generic/for_each.h +56 -0
  1626. cuda/cccl/headers/include/thrust/system/detail/generic/gather.h +71 -0
  1627. cuda/cccl/headers/include/thrust/system/detail/generic/gather.inl +94 -0
  1628. cuda/cccl/headers/include/thrust/system/detail/generic/generate.h +43 -0
  1629. cuda/cccl/headers/include/thrust/system/detail/generic/generate.inl +63 -0
  1630. cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.h +58 -0
  1631. cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.inl +70 -0
  1632. cuda/cccl/headers/include/thrust/system/detail/generic/logical.h +57 -0
  1633. cuda/cccl/headers/include/thrust/system/detail/generic/memory.h +62 -0
  1634. cuda/cccl/headers/include/thrust/system/detail/generic/memory.inl +83 -0
  1635. cuda/cccl/headers/include/thrust/system/detail/generic/merge.h +97 -0
  1636. cuda/cccl/headers/include/thrust/system/detail/generic/merge.inl +146 -0
  1637. cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.h +47 -0
  1638. cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.inl +66 -0
  1639. cuda/cccl/headers/include/thrust/system/detail/generic/partition.h +127 -0
  1640. cuda/cccl/headers/include/thrust/system/detail/generic/partition.inl +205 -0
  1641. cuda/cccl/headers/include/thrust/system/detail/generic/per_device_resource.h +41 -0
  1642. cuda/cccl/headers/include/thrust/system/detail/generic/reduce.h +69 -0
  1643. cuda/cccl/headers/include/thrust/system/detail/generic/reduce.inl +98 -0
  1644. cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.h +81 -0
  1645. cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.inl +183 -0
  1646. cuda/cccl/headers/include/thrust/system/detail/generic/remove.h +84 -0
  1647. cuda/cccl/headers/include/thrust/system/detail/generic/remove.inl +119 -0
  1648. cuda/cccl/headers/include/thrust/system/detail/generic/replace.h +93 -0
  1649. cuda/cccl/headers/include/thrust/system/detail/generic/replace.inl +172 -0
  1650. cuda/cccl/headers/include/thrust/system/detail/generic/reverse.h +46 -0
  1651. cuda/cccl/headers/include/thrust/system/detail/generic/reverse.inl +65 -0
  1652. cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.h +61 -0
  1653. cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.inl +124 -0
  1654. cuda/cccl/headers/include/thrust/system/detail/generic/scan.h +86 -0
  1655. cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.h +124 -0
  1656. cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.inl +229 -0
  1657. cuda/cccl/headers/include/thrust/system/detail/generic/scatter.h +71 -0
  1658. cuda/cccl/headers/include/thrust/system/detail/generic/scatter.inl +83 -0
  1659. cuda/cccl/headers/include/thrust/system/detail/generic/select_system.h +103 -0
  1660. cuda/cccl/headers/include/thrust/system/detail/generic/sequence.h +70 -0
  1661. cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.h +280 -0
  1662. cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.inl +474 -0
  1663. cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.h +52 -0
  1664. cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.inl +123 -0
  1665. cuda/cccl/headers/include/thrust/system/detail/generic/sort.h +111 -0
  1666. cuda/cccl/headers/include/thrust/system/detail/generic/sort.inl +173 -0
  1667. cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.h +42 -0
  1668. cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.inl +73 -0
  1669. cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.h +39 -0
  1670. cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.inl +52 -0
  1671. cuda/cccl/headers/include/thrust/system/detail/generic/tag.h +45 -0
  1672. cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.h +52 -0
  1673. cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.inl +80 -0
  1674. cuda/cccl/headers/include/thrust/system/detail/generic/transform.h +393 -0
  1675. cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.h +48 -0
  1676. cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.inl +54 -0
  1677. cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.h +78 -0
  1678. cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.inl +111 -0
  1679. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.h +43 -0
  1680. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.inl +164 -0
  1681. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.h +43 -0
  1682. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.inl +112 -0
  1683. cuda/cccl/headers/include/thrust/system/detail/generic/unique.h +69 -0
  1684. cuda/cccl/headers/include/thrust/system/detail/generic/unique.inl +111 -0
  1685. cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.h +79 -0
  1686. cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.inl +124 -0
  1687. cuda/cccl/headers/include/thrust/system/detail/internal/decompose.h +115 -0
  1688. cuda/cccl/headers/include/thrust/system/detail/sequential/adjacent_difference.h +68 -0
  1689. cuda/cccl/headers/include/thrust/system/detail/sequential/assign_value.h +40 -0
  1690. cuda/cccl/headers/include/thrust/system/detail/sequential/binary_search.h +134 -0
  1691. cuda/cccl/headers/include/thrust/system/detail/sequential/copy.h +120 -0
  1692. cuda/cccl/headers/include/thrust/system/detail/sequential/copy_backward.h +47 -0
  1693. cuda/cccl/headers/include/thrust/system/detail/sequential/copy_if.h +69 -0
  1694. cuda/cccl/headers/include/thrust/system/detail/sequential/count.h +29 -0
  1695. cuda/cccl/headers/include/thrust/system/detail/sequential/equal.h +29 -0
  1696. cuda/cccl/headers/include/thrust/system/detail/sequential/execution_policy.h +52 -0
  1697. cuda/cccl/headers/include/thrust/system/detail/sequential/extrema.h +108 -0
  1698. cuda/cccl/headers/include/thrust/system/detail/sequential/fill.h +29 -0
  1699. cuda/cccl/headers/include/thrust/system/detail/sequential/find.h +60 -0
  1700. cuda/cccl/headers/include/thrust/system/detail/sequential/for_each.h +72 -0
  1701. cuda/cccl/headers/include/thrust/system/detail/sequential/gather.h +29 -0
  1702. cuda/cccl/headers/include/thrust/system/detail/sequential/general_copy.h +120 -0
  1703. cuda/cccl/headers/include/thrust/system/detail/sequential/generate.h +29 -0
  1704. cuda/cccl/headers/include/thrust/system/detail/sequential/get_value.h +41 -0
  1705. cuda/cccl/headers/include/thrust/system/detail/sequential/inner_product.h +29 -0
  1706. cuda/cccl/headers/include/thrust/system/detail/sequential/insertion_sort.h +139 -0
  1707. cuda/cccl/headers/include/thrust/system/detail/sequential/iter_swap.h +43 -0
  1708. cuda/cccl/headers/include/thrust/system/detail/sequential/logical.h +29 -0
  1709. cuda/cccl/headers/include/thrust/system/detail/sequential/malloc_and_free.h +48 -0
  1710. cuda/cccl/headers/include/thrust/system/detail/sequential/merge.h +147 -0
  1711. cuda/cccl/headers/include/thrust/system/detail/sequential/mismatch.h +29 -0
  1712. cuda/cccl/headers/include/thrust/system/detail/sequential/partition.h +297 -0
  1713. cuda/cccl/headers/include/thrust/system/detail/sequential/per_device_resource.h +29 -0
  1714. cuda/cccl/headers/include/thrust/system/detail/sequential/reduce.h +62 -0
  1715. cuda/cccl/headers/include/thrust/system/detail/sequential/reduce_by_key.h +96 -0
  1716. cuda/cccl/headers/include/thrust/system/detail/sequential/remove.h +177 -0
  1717. cuda/cccl/headers/include/thrust/system/detail/sequential/replace.h +29 -0
  1718. cuda/cccl/headers/include/thrust/system/detail/sequential/reverse.h +29 -0
  1719. cuda/cccl/headers/include/thrust/system/detail/sequential/scan.h +152 -0
  1720. cuda/cccl/headers/include/thrust/system/detail/sequential/scan_by_key.h +143 -0
  1721. cuda/cccl/headers/include/thrust/system/detail/sequential/scatter.h +29 -0
  1722. cuda/cccl/headers/include/thrust/system/detail/sequential/sequence.h +29 -0
  1723. cuda/cccl/headers/include/thrust/system/detail/sequential/set_operations.h +204 -0
  1724. cuda/cccl/headers/include/thrust/system/detail/sequential/sort.h +120 -0
  1725. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.h +354 -0
  1726. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.h +121 -0
  1727. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.h +584 -0
  1728. cuda/cccl/headers/include/thrust/system/detail/sequential/swap_ranges.h +29 -0
  1729. cuda/cccl/headers/include/thrust/system/detail/sequential/tabulate.h +29 -0
  1730. cuda/cccl/headers/include/thrust/system/detail/sequential/temporary_buffer.h +29 -0
  1731. cuda/cccl/headers/include/thrust/system/detail/sequential/transform.h +29 -0
  1732. cuda/cccl/headers/include/thrust/system/detail/sequential/transform_reduce.h +29 -0
  1733. cuda/cccl/headers/include/thrust/system/detail/sequential/transform_scan.h +29 -0
  1734. cuda/cccl/headers/include/thrust/system/detail/sequential/trivial_copy.h +56 -0
  1735. cuda/cccl/headers/include/thrust/system/detail/sequential/uninitialized_copy.h +29 -0
  1736. cuda/cccl/headers/include/thrust/system/detail/sequential/uninitialized_fill.h +29 -0
  1737. cuda/cccl/headers/include/thrust/system/detail/sequential/unique.h +113 -0
  1738. cuda/cccl/headers/include/thrust/system/detail/sequential/unique_by_key.h +104 -0
  1739. cuda/cccl/headers/include/thrust/system/detail/system_error.inl +106 -0
  1740. cuda/cccl/headers/include/thrust/system/error_code.h +508 -0
  1741. cuda/cccl/headers/include/thrust/system/omp/detail/adjacent_difference.h +34 -0
  1742. cuda/cccl/headers/include/thrust/system/omp/detail/assign_value.h +17 -0
  1743. cuda/cccl/headers/include/thrust/system/omp/detail/binary_search.h +57 -0
  1744. cuda/cccl/headers/include/thrust/system/omp/detail/copy.h +61 -0
  1745. cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.h +39 -0
  1746. cuda/cccl/headers/include/thrust/system/omp/detail/count.h +17 -0
  1747. cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.h +48 -0
  1748. cuda/cccl/headers/include/thrust/system/omp/detail/equal.h +17 -0
  1749. cuda/cccl/headers/include/thrust/system/omp/detail/execution_policy.h +132 -0
  1750. cuda/cccl/headers/include/thrust/system/omp/detail/extrema.h +46 -0
  1751. cuda/cccl/headers/include/thrust/system/omp/detail/fill.h +17 -0
  1752. cuda/cccl/headers/include/thrust/system/omp/detail/find.h +33 -0
  1753. cuda/cccl/headers/include/thrust/system/omp/detail/for_each.h +73 -0
  1754. cuda/cccl/headers/include/thrust/system/omp/detail/gather.h +17 -0
  1755. cuda/cccl/headers/include/thrust/system/omp/detail/generate.h +17 -0
  1756. cuda/cccl/headers/include/thrust/system/omp/detail/get_value.h +17 -0
  1757. cuda/cccl/headers/include/thrust/system/omp/detail/inner_product.h +17 -0
  1758. cuda/cccl/headers/include/thrust/system/omp/detail/iter_swap.h +17 -0
  1759. cuda/cccl/headers/include/thrust/system/omp/detail/logical.h +17 -0
  1760. cuda/cccl/headers/include/thrust/system/omp/detail/malloc_and_free.h +17 -0
  1761. cuda/cccl/headers/include/thrust/system/omp/detail/merge.h +17 -0
  1762. cuda/cccl/headers/include/thrust/system/omp/detail/mismatch.h +17 -0
  1763. cuda/cccl/headers/include/thrust/system/omp/detail/partition.h +83 -0
  1764. cuda/cccl/headers/include/thrust/system/omp/detail/per_device_resource.h +16 -0
  1765. cuda/cccl/headers/include/thrust/system/omp/detail/pragma_omp.h +30 -0
  1766. cuda/cccl/headers/include/thrust/system/omp/detail/reduce.h +62 -0
  1767. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.h +49 -0
  1768. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.h +87 -0
  1769. cuda/cccl/headers/include/thrust/system/omp/detail/remove.h +67 -0
  1770. cuda/cccl/headers/include/thrust/system/omp/detail/replace.h +17 -0
  1771. cuda/cccl/headers/include/thrust/system/omp/detail/reverse.h +17 -0
  1772. cuda/cccl/headers/include/thrust/system/omp/detail/scan.h +189 -0
  1773. cuda/cccl/headers/include/thrust/system/omp/detail/scan_by_key.h +23 -0
  1774. cuda/cccl/headers/include/thrust/system/omp/detail/scatter.h +17 -0
  1775. cuda/cccl/headers/include/thrust/system/omp/detail/sequence.h +17 -0
  1776. cuda/cccl/headers/include/thrust/system/omp/detail/set_operations.h +17 -0
  1777. cuda/cccl/headers/include/thrust/system/omp/detail/sort.h +245 -0
  1778. cuda/cccl/headers/include/thrust/system/omp/detail/swap_ranges.h +17 -0
  1779. cuda/cccl/headers/include/thrust/system/omp/detail/tabulate.h +17 -0
  1780. cuda/cccl/headers/include/thrust/system/omp/detail/temporary_buffer.h +16 -0
  1781. cuda/cccl/headers/include/thrust/system/omp/detail/transform.h +17 -0
  1782. cuda/cccl/headers/include/thrust/system/omp/detail/transform_reduce.h +17 -0
  1783. cuda/cccl/headers/include/thrust/system/omp/detail/transform_scan.h +17 -0
  1784. cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_copy.h +17 -0
  1785. cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_fill.h +17 -0
  1786. cuda/cccl/headers/include/thrust/system/omp/detail/unique.h +51 -0
  1787. cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.h +55 -0
  1788. cuda/cccl/headers/include/thrust/system/omp/execution_policy.h +62 -0
  1789. cuda/cccl/headers/include/thrust/system/omp/memory.h +153 -0
  1790. cuda/cccl/headers/include/thrust/system/omp/memory_resource.h +71 -0
  1791. cuda/cccl/headers/include/thrust/system/omp/pointer.h +120 -0
  1792. cuda/cccl/headers/include/thrust/system/omp/vector.h +96 -0
  1793. cuda/cccl/headers/include/thrust/system/system_error.h +183 -0
  1794. cuda/cccl/headers/include/thrust/system/tbb/detail/adjacent_difference.h +33 -0
  1795. cuda/cccl/headers/include/thrust/system/tbb/detail/assign_value.h +17 -0
  1796. cuda/cccl/headers/include/thrust/system/tbb/detail/binary_search.h +17 -0
  1797. cuda/cccl/headers/include/thrust/system/tbb/detail/copy.h +59 -0
  1798. cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.h +114 -0
  1799. cuda/cccl/headers/include/thrust/system/tbb/detail/count.h +17 -0
  1800. cuda/cccl/headers/include/thrust/system/tbb/detail/equal.h +17 -0
  1801. cuda/cccl/headers/include/thrust/system/tbb/detail/execution_policy.h +113 -0
  1802. cuda/cccl/headers/include/thrust/system/tbb/detail/extrema.h +45 -0
  1803. cuda/cccl/headers/include/thrust/system/tbb/detail/fill.h +17 -0
  1804. cuda/cccl/headers/include/thrust/system/tbb/detail/find.h +28 -0
  1805. cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.h +70 -0
  1806. cuda/cccl/headers/include/thrust/system/tbb/detail/gather.h +17 -0
  1807. cuda/cccl/headers/include/thrust/system/tbb/detail/generate.h +17 -0
  1808. cuda/cccl/headers/include/thrust/system/tbb/detail/get_value.h +17 -0
  1809. cuda/cccl/headers/include/thrust/system/tbb/detail/inner_product.h +17 -0
  1810. cuda/cccl/headers/include/thrust/system/tbb/detail/iter_swap.h +17 -0
  1811. cuda/cccl/headers/include/thrust/system/tbb/detail/logical.h +17 -0
  1812. cuda/cccl/headers/include/thrust/system/tbb/detail/malloc_and_free.h +17 -0
  1813. cuda/cccl/headers/include/thrust/system/tbb/detail/merge.h +302 -0
  1814. cuda/cccl/headers/include/thrust/system/tbb/detail/mismatch.h +17 -0
  1815. cuda/cccl/headers/include/thrust/system/tbb/detail/partition.h +78 -0
  1816. cuda/cccl/headers/include/thrust/system/tbb/detail/per_device_resource.h +16 -0
  1817. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.h +120 -0
  1818. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.h +378 -0
  1819. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_intervals.h +118 -0
  1820. cuda/cccl/headers/include/thrust/system/tbb/detail/remove.h +66 -0
  1821. cuda/cccl/headers/include/thrust/system/tbb/detail/replace.h +17 -0
  1822. cuda/cccl/headers/include/thrust/system/tbb/detail/reverse.h +17 -0
  1823. cuda/cccl/headers/include/thrust/system/tbb/detail/scan.h +294 -0
  1824. cuda/cccl/headers/include/thrust/system/tbb/detail/scan_by_key.h +20 -0
  1825. cuda/cccl/headers/include/thrust/system/tbb/detail/scatter.h +17 -0
  1826. cuda/cccl/headers/include/thrust/system/tbb/detail/sequence.h +17 -0
  1827. cuda/cccl/headers/include/thrust/system/tbb/detail/set_operations.h +17 -0
  1828. cuda/cccl/headers/include/thrust/system/tbb/detail/sort.h +272 -0
  1829. cuda/cccl/headers/include/thrust/system/tbb/detail/swap_ranges.h +17 -0
  1830. cuda/cccl/headers/include/thrust/system/tbb/detail/tabulate.h +17 -0
  1831. cuda/cccl/headers/include/thrust/system/tbb/detail/temporary_buffer.h +16 -0
  1832. cuda/cccl/headers/include/thrust/system/tbb/detail/transform.h +17 -0
  1833. cuda/cccl/headers/include/thrust/system/tbb/detail/transform_reduce.h +17 -0
  1834. cuda/cccl/headers/include/thrust/system/tbb/detail/transform_scan.h +17 -0
  1835. cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_copy.h +17 -0
  1836. cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_fill.h +17 -0
  1837. cuda/cccl/headers/include/thrust/system/tbb/detail/unique.h +50 -0
  1838. cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.h +54 -0
  1839. cuda/cccl/headers/include/thrust/system/tbb/execution_policy.h +62 -0
  1840. cuda/cccl/headers/include/thrust/system/tbb/memory.h +139 -0
  1841. cuda/cccl/headers/include/thrust/system/tbb/memory_resource.h +57 -0
  1842. cuda/cccl/headers/include/thrust/system/tbb/pointer.h +106 -0
  1843. cuda/cccl/headers/include/thrust/system/tbb/vector.h +82 -0
  1844. cuda/cccl/headers/include/thrust/system_error.h +57 -0
  1845. cuda/cccl/headers/include/thrust/tabulate.h +125 -0
  1846. cuda/cccl/headers/include/thrust/transform.h +1056 -0
  1847. cuda/cccl/headers/include/thrust/transform_reduce.h +190 -0
  1848. cuda/cccl/headers/include/thrust/transform_scan.h +442 -0
  1849. cuda/cccl/headers/include/thrust/tuple.h +139 -0
  1850. cuda/cccl/headers/include/thrust/type_traits/integer_sequence.h +257 -0
  1851. cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +153 -0
  1852. cuda/cccl/headers/include/thrust/type_traits/is_execution_policy.h +65 -0
  1853. cuda/cccl/headers/include/thrust/type_traits/is_operator_less_or_greater_function_object.h +180 -0
  1854. cuda/cccl/headers/include/thrust/type_traits/is_operator_plus_function_object.h +112 -0
  1855. cuda/cccl/headers/include/thrust/type_traits/is_trivially_relocatable.h +332 -0
  1856. cuda/cccl/headers/include/thrust/type_traits/logical_metafunctions.h +42 -0
  1857. cuda/cccl/headers/include/thrust/type_traits/unwrap_contiguous_iterator.h +63 -0
  1858. cuda/cccl/headers/include/thrust/uninitialized_copy.h +300 -0
  1859. cuda/cccl/headers/include/thrust/uninitialized_fill.h +268 -0
  1860. cuda/cccl/headers/include/thrust/unique.h +1088 -0
  1861. cuda/cccl/headers/include/thrust/universal_allocator.h +101 -0
  1862. cuda/cccl/headers/include/thrust/universal_ptr.h +34 -0
  1863. cuda/cccl/headers/include/thrust/universal_vector.h +80 -0
  1864. cuda/cccl/headers/include/thrust/version.h +93 -0
  1865. cuda/cccl/headers/include/thrust/zip_function.h +150 -0
  1866. cuda/cccl/headers/include_paths.py +51 -0
  1867. cuda/cccl/parallel/__init__.py +9 -0
  1868. cuda/cccl/parallel/experimental/__init__.py +24 -0
  1869. cuda/cccl/py.typed +0 -0
  1870. cuda/compute/__init__.py +83 -0
  1871. cuda/compute/_bindings.py +79 -0
  1872. cuda/compute/_bindings.pyi +498 -0
  1873. cuda/compute/_bindings_impl.pyx +2415 -0
  1874. cuda/compute/_caching.py +71 -0
  1875. cuda/compute/_cccl_interop.py +422 -0
  1876. cuda/compute/_utils/__init__.py +0 -0
  1877. cuda/compute/_utils/protocols.py +132 -0
  1878. cuda/compute/_utils/temp_storage_buffer.py +86 -0
  1879. cuda/compute/algorithms/__init__.py +58 -0
  1880. cuda/compute/algorithms/_histogram.py +243 -0
  1881. cuda/compute/algorithms/_reduce.py +182 -0
  1882. cuda/compute/algorithms/_scan.py +331 -0
  1883. cuda/compute/algorithms/_segmented_reduce.py +257 -0
  1884. cuda/compute/algorithms/_sort/__init__.py +23 -0
  1885. cuda/compute/algorithms/_sort/_merge_sort.py +225 -0
  1886. cuda/compute/algorithms/_sort/_radix_sort.py +263 -0
  1887. cuda/compute/algorithms/_sort/_segmented_sort.py +288 -0
  1888. cuda/compute/algorithms/_sort/_sort_common.py +52 -0
  1889. cuda/compute/algorithms/_three_way_partition.py +261 -0
  1890. cuda/compute/algorithms/_transform.py +329 -0
  1891. cuda/compute/algorithms/_unique_by_key.py +252 -0
  1892. cuda/compute/cccl/.gitkeep +0 -0
  1893. cuda/compute/cu12/_bindings_impl.cpython-311-aarch64-linux-gnu.so +0 -0
  1894. cuda/compute/cu12/cccl/libcccl.c.parallel.so +0 -0
  1895. cuda/compute/cu13/_bindings_impl.cpython-311-aarch64-linux-gnu.so +0 -0
  1896. cuda/compute/cu13/cccl/libcccl.c.parallel.so +0 -0
  1897. cuda/compute/iterators/__init__.py +21 -0
  1898. cuda/compute/iterators/_factories.py +219 -0
  1899. cuda/compute/iterators/_iterators.py +817 -0
  1900. cuda/compute/iterators/_zip_iterator.py +199 -0
  1901. cuda/compute/numba_utils.py +53 -0
  1902. cuda/compute/op.py +3 -0
  1903. cuda/compute/struct.py +272 -0
  1904. cuda/compute/typing.py +37 -0
  1905. cuda/coop/__init__.py +8 -0
  1906. cuda/coop/_caching.py +48 -0
  1907. cuda/coop/_common.py +275 -0
  1908. cuda/coop/_nvrtc.py +92 -0
  1909. cuda/coop/_scan_op.py +181 -0
  1910. cuda/coop/_types.py +937 -0
  1911. cuda/coop/_typing.py +107 -0
  1912. cuda/coop/block/__init__.py +39 -0
  1913. cuda/coop/block/_block_exchange.py +251 -0
  1914. cuda/coop/block/_block_load_store.py +215 -0
  1915. cuda/coop/block/_block_merge_sort.py +125 -0
  1916. cuda/coop/block/_block_radix_sort.py +214 -0
  1917. cuda/coop/block/_block_reduce.py +294 -0
  1918. cuda/coop/block/_block_scan.py +983 -0
  1919. cuda/coop/warp/__init__.py +9 -0
  1920. cuda/coop/warp/_warp_merge_sort.py +92 -0
  1921. cuda/coop/warp/_warp_reduce.py +153 -0
  1922. cuda/coop/warp/_warp_scan.py +78 -0
  1923. cuda_cccl-0.3.4.dist-info/METADATA +78 -0
  1924. cuda_cccl-0.3.4.dist-info/RECORD +1926 -0
  1925. cuda_cccl-0.3.4.dist-info/WHEEL +5 -0
  1926. cuda_cccl-0.3.4.dist-info/licenses/LICENSE +1 -0
@@ -0,0 +1,2497 @@
1
+ // SPDX-FileCopyrightText: Copyright (c) 2011, Duane Merrill. All rights reserved.
2
+ // SPDX-FileCopyrightText: Copyright (c) 2011-2024, NVIDIA CORPORATION. All rights reserved.
3
+ // SPDX-License-Identifier: BSD-3
4
+
5
+ //! @file
6
+ //! cub::DeviceReduce provides device-wide, parallel operations for computing a reduction across a sequence of data
7
+ //! items residing within device-accessible memory.
8
+
9
+ #pragma once
10
+
11
+ #include <cub/config.cuh>
12
+
13
+ #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
14
+ # pragma GCC system_header
15
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
16
+ # pragma clang system_header
17
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
18
+ # pragma system_header
19
+ #endif // no system header
20
+
21
+ #include <cub/detail/choose_offset.cuh>
22
+ #include <cub/detail/device_memory_resource.cuh>
23
+ #include <cub/detail/temporary_storage.cuh>
24
+ #include <cub/device/dispatch/dispatch_reduce_by_key.cuh>
25
+ #include <cub/device/dispatch/dispatch_reduce_deterministic.cuh>
26
+ #include <cub/device/dispatch/dispatch_reduce_nondeterministic.cuh>
27
+ #include <cub/device/dispatch/dispatch_streaming_reduce.cuh>
28
+ #include <cub/thread/thread_operators.cuh>
29
+ #include <cub/util_type.cuh>
30
+
31
+ #include <cuda/__execution/determinism.h>
32
+ #include <cuda/__execution/require.h>
33
+ #include <cuda/__execution/tune.h>
34
+ #include <cuda/__functional/maximum.h>
35
+ #include <cuda/__functional/minimum.h>
36
+ #include <cuda/__iterator/tabulate_output_iterator.h>
37
+ #include <cuda/__memory_resource/get_memory_resource.h>
38
+ #include <cuda/__stream/get_stream.h>
39
+ #include <cuda/__stream/stream_ref.h>
40
+ #include <cuda/std/__execution/env.h>
41
+ #include <cuda/std/__functional/identity.h>
42
+ #include <cuda/std/__functional/invoke.h>
43
+ #include <cuda/std/__functional/operations.h>
44
+ #include <cuda/std/__type_traits/conditional.h>
45
+ #include <cuda/std/__type_traits/is_integral.h>
46
+ #include <cuda/std/__type_traits/is_same.h>
47
+ #include <cuda/std/cstdint>
48
+ #include <cuda/std/limits>
49
+
50
+ CUB_NAMESPACE_BEGIN
51
+
52
+ namespace detail
53
+ {
54
+ template <typename DeterminismT> // DeterminismT: the determinism tag type to test
55
+ inline constexpr bool is_non_deterministic_v = // true only when DeterminismT is exactly determinism::not_guaranteed_t
56
+ ::cuda::std::is_same_v<DeterminismT, ::cuda::execution::determinism::not_guaranteed_t>;
57
+
58
+ namespace reduce
59
+ {
60
+ struct get_tuning_query_t // empty tag type: the query key accepted by tuning<>::query and looked up via __query_result_or_t in reduce_impl
61
+ {};
62
+
63
+ template <class Derived> // CRTP base: Derived is the concrete tuning type that inherits from tuning<Derived>
64
+ struct tuning
65
+ {
66
+ [[nodiscard]] _CCCL_NODEBUG_API constexpr auto query(const get_tuning_query_t&) const noexcept -> Derived // answers a get_tuning_query_t query with a copy of the derived tuning object
67
+ {
68
+ return static_cast<const Derived&>(*this); // downcast is only valid when *this really is a Derived
69
+ }
70
+ };
71
+
72
+ struct default_tuning : tuning<default_tuning> // fallback tuning named by the reduce_impl overloads when the tuning environment has no get_tuning_query_t answer
73
+ {
74
+ template <class AccumT, class Offset, class OpT>
75
+ using fn = policy_hub<AccumT, Offset, OpT>; // policy selector; policy_hub is not declared in this file (presumably from the included dispatch headers - confirm)
76
+ };
77
+
78
+ struct default_rfa_tuning : tuning<default_rfa_tuning> // CRTP base must name this type so the inherited query() returns default_rfa_tuning (and thus the rfa policy hub)
79
+ {
80
+ template <class AccumT, class Offset, class OpT>
81
+ using fn = detail::rfa::policy_hub<AccumT, Offset, OpT>; // policy selector for the gpu_to_gpu reduce_impl overload, which names this struct as its fallback tuning
82
+ };
83
+
84
+ template <typename ExtremumOutIteratorT, typename IndexOutIteratorT>
85
+ struct unzip_and_write_arg_extremum_op // splits a reduced key/value pair across two separate output iterators
86
+ {
87
+ ExtremumOutIteratorT result_out_it; // destination for reduced_result.value
88
+ IndexOutIteratorT index_out_it; // destination for reduced_result.key
89
+
90
+ template <typename IndexT, typename KeyValuePairT>
91
+ _CCCL_DEVICE _CCCL_FORCEINLINE void operator()(IndexT, KeyValuePairT reduced_result) // first argument is unnamed and ignored; KeyValuePairT must expose .key and .value
92
+ {
93
+ *result_out_it = reduced_result.value;
94
+ *index_out_it = reduced_result.key;
95
+ }
96
+ };
97
+ } // namespace reduce
98
+ } // namespace detail
99
+
100
+ //! @rst
101
+ //! DeviceReduce provides device-wide, parallel operations for computing
102
+ //! a reduction across a sequence of data items residing within
103
+ //! device-accessible memory.
104
+ //!
105
+ //! .. image:: ../../img/reduce_logo.png
106
+ //! :align: center
107
+ //!
108
+ //! Overview
109
+ //! ====================================
110
+ //!
111
+ //! A `reduction <http://en.wikipedia.org/wiki/Reduce_(higher-order_function)>`_
112
+ //! (or *fold*) uses a binary combining operator to compute a single aggregate
113
+ //! from a sequence of input elements.
114
+ //!
115
+ //! Usage Considerations
116
+ //! ====================================
117
+ //!
118
+ //! @cdp_class{DeviceReduce}
119
+ //!
120
+ //! Performance
121
+ //! ====================================
122
+ //!
123
+ //! @linear_performance{reduction, reduce-by-key, and run-length encode}
124
+ //!
125
+ //! @endrst
126
+ struct DeviceReduce
127
+ {
128
+ private:
129
+ template <typename TuningEnvT, // generic overload: forwards to DispatchTransformReduce for any __determinism_holder_t<Determinism> tag
130
+ typename InputIteratorT,
131
+ typename OutputIteratorT,
132
+ typename ReductionOpT,
133
+ typename TransformOpT,
134
+ typename T,
135
+ typename NumItemsT,
136
+ ::cuda::execution::determinism::__determinism_t Determinism>
137
+ CUB_RUNTIME_FUNCTION static cudaError_t reduce_impl( // returns whatever cudaError_t the dispatch layer returns
138
+ void* d_temp_storage,
139
+ size_t& temp_storage_bytes,
140
+ InputIteratorT d_in,
141
+ OutputIteratorT d_out,
142
+ NumItemsT num_items,
143
+ ReductionOpT reduction_op,
144
+ TransformOpT transform_op,
145
+ T init,
146
+ ::cuda::execution::determinism::__determinism_holder_t<Determinism>, // unnamed tag: participates in overload selection only, its value is never read
147
+ cudaStream_t stream)
148
+ {
149
+ using offset_t = detail::choose_offset_t<NumItemsT>; // offset type derived from the caller's NumItemsT
150
+ using reduce_tuning_t = ::cuda::std::execution:: // tuning supplied by TuningEnvT for get_tuning_query_t, otherwise default_tuning
151
+ __query_result_or_t<TuningEnvT, detail::reduce::get_tuning_query_t, detail::reduce::default_tuning>;
152
+
153
+ using accum_t = ::cuda::std:: // accumulator type is computed from the result type of transform_op applied to an input value
154
+ __accumulator_t<ReductionOpT, ::cuda::std::invoke_result_t<TransformOpT, detail::it_value_t<InputIteratorT>>, T>;
155
+ using policy_t = typename reduce_tuning_t::template fn<accum_t, offset_t, ReductionOpT>;
156
+
157
+ using dispatch_t =
158
+ DispatchTransformReduce<InputIteratorT, OutputIteratorT, offset_t, ReductionOpT, TransformOpT, T, accum_t, policy_t>;
159
+
160
+ return dispatch_t::Dispatch(
161
+ d_temp_storage,
162
+ temp_storage_bytes,
163
+ d_in,
164
+ d_out,
165
+ static_cast<offset_t>(num_items), // num_items is converted to the chosen offset type before dispatch
166
+ reduction_op,
167
+ init,
168
+ stream, // note: Dispatch takes stream before transform_op, unlike this function's own parameter order
169
+ transform_op);
170
+ }
171
+
172
+ template <typename TuningEnvT, // gpu_to_gpu overload: forwards to detail::DispatchReduceDeterministic
173
+ typename InputIteratorT,
174
+ typename OutputIteratorT,
175
+ typename ReductionOpT,
176
+ typename TransformOpT,
177
+ typename T,
178
+ typename NumItemsT>
179
+ CUB_RUNTIME_FUNCTION static cudaError_t reduce_impl( // returns whatever cudaError_t the dispatch layer returns
180
+ void* d_temp_storage,
181
+ size_t& temp_storage_bytes,
182
+ InputIteratorT d_in,
183
+ OutputIteratorT d_out,
184
+ NumItemsT num_items,
185
+ ReductionOpT, // unnamed: the operator object is not forwarded; only its type feeds accum_t and policy_t below
186
+ TransformOpT transform_op,
187
+ T init,
188
+ ::cuda::execution::determinism::gpu_to_gpu_t, // unnamed tag: selects this overload, its value is never read
189
+ cudaStream_t stream)
190
+ {
191
+ using offset_t = detail::choose_offset_t<NumItemsT>; // offset type derived from the caller's NumItemsT
192
+
193
+ using reduce_tuning_t = ::cuda::std::execution:: // tuning supplied by TuningEnvT for get_tuning_query_t, otherwise default_rfa_tuning
194
+ __query_result_or_t<TuningEnvT, detail::reduce::get_tuning_query_t, detail::reduce::default_rfa_tuning>;
195
+
196
+ using accum_t = ::cuda::std:: // accumulator type is computed from the result type of transform_op applied to an input value
197
+ __accumulator_t<ReductionOpT, ::cuda::std::invoke_result_t<TransformOpT, detail::it_value_t<InputIteratorT>>, T>;
198
+ using policy_t = typename reduce_tuning_t::template fn<accum_t, offset_t, ReductionOpT>;
199
+ using dispatch_t = // ReductionOpT is not among this dispatcher's template arguments
200
+ detail::DispatchReduceDeterministic<InputIteratorT, OutputIteratorT, offset_t, T, TransformOpT, accum_t, policy_t>;
201
+
202
+ return dispatch_t::Dispatch( // num_items is converted to offset_t; no reduction operator is passed
203
+ d_temp_storage, temp_storage_bytes, d_in, d_out, static_cast<offset_t>(num_items), init, stream, transform_op);
204
+ }
205
+
206
+ template <typename TuningEnvT, // not_guaranteed overload: forwards to detail::DispatchReduceNondeterministic
207
+ typename InputIteratorT,
208
+ typename OutputIteratorT,
209
+ typename ReductionOpT,
210
+ typename TransformOpT,
211
+ typename T,
212
+ typename NumItemsT>
213
+ CUB_RUNTIME_FUNCTION static cudaError_t reduce_impl( // returns whatever cudaError_t the dispatch layer returns
214
+ void* d_temp_storage,
215
+ size_t& temp_storage_bytes,
216
+ InputIteratorT d_in,
217
+ OutputIteratorT d_out,
218
+ NumItemsT num_items,
219
+ ReductionOpT reduction_op,
220
+ TransformOpT transform_op,
221
+ T init,
222
+ ::cuda::execution::determinism::not_guaranteed_t, // unnamed tag: selects this overload, its value is never read
223
+ cudaStream_t stream)
224
+ {
225
+ using offset_t = detail::choose_offset_t<NumItemsT>; // offset type derived from the caller's NumItemsT
226
+ using accum_t = ::cuda::std::__accumulator_t<ReductionOpT, detail::it_value_t<InputIteratorT>, T>; // NOTE(review): uses the raw input value type, whereas the other two reduce_impl overloads use invoke_result_t<TransformOpT, ...> - confirm this difference is intended
227
+
228
+ using output_t = THRUST_NS_QUALIFIER::unwrap_contiguous_iterator_t<OutputIteratorT>; // NOTE(review): no thrust header is included directly by this file; presumably reached transitively - confirm
229
+
230
+ using reduce_tuning_t = ::cuda::std::execution:: // tuning supplied by TuningEnvT for get_tuning_query_t, otherwise default_tuning
231
+ __query_result_or_t<TuningEnvT, detail::reduce::get_tuning_query_t, detail::reduce::default_tuning>;
232
+ using policy_t = typename reduce_tuning_t::template fn<accum_t, offset_t, ReductionOpT>;
233
+ using dispatch_t = detail:: // dispatcher is instantiated with the unwrapped output type, not OutputIteratorT
234
+ DispatchReduceNondeterministic<InputIteratorT, output_t, offset_t, ReductionOpT, T, accum_t, TransformOpT, policy_t>;
235
+
236
+ return dispatch_t::Dispatch(
237
+ d_temp_storage,
238
+ temp_storage_bytes,
239
+ d_in,
240
+ THRUST_NS_QUALIFIER::unwrap_contiguous_iterator(d_out), // pass the unwrapped form of d_out to match output_t
241
+ static_cast<offset_t>(num_items), // num_items is converted to the chosen offset type before dispatch
242
+ reduction_op,
243
+ init,
244
+ stream,
245
+ transform_op);
246
+ }
247
+
248
+ public:
249
+ //! @rst
250
+ //! Computes a device-wide reduction using the specified binary ``reduction_op`` functor and initial value ``init``.
251
+ //!
252
+ //! - Does not support binary reduction operators that are non-commutative.
253
+ //! - Provides "run-to-run" determinism for pseudo-associative reduction
254
+ //! (e.g., addition of floating point types) on the same GPU device.
255
+ //! However, results for pseudo-associative reduction may be inconsistent
256
+ //! from one device to another device of a different compute-capability
257
+ //! because CUB can employ different tile-sizing for different architectures.
258
+ //! - The range ``[d_in, d_in + num_items)`` shall not overlap ``d_out``.
259
+ //! - @devicestorage
260
+ //!
261
+ //! Snippet
262
+ //! +++++++++++++++++++++++++++++++++++++++++++++
263
+ //!
264
+ //! The code snippet below illustrates a user-defined min-reduction of a
265
+ //! device vector of ``int`` data elements.
266
+ //!
267
+ //! .. code-block:: c++
268
+ //!
269
+ //! #include <cub/cub.cuh>
270
+ //! // or equivalently <cub/device/device_reduce.cuh>
271
+ //!
272
+ //! // CustomMin functor
273
+ //! struct CustomMin
274
+ //! {
275
+ //! template <typename T>
276
+ //! __device__ __forceinline__
277
+ //! T operator()(const T &a, const T &b) const {
278
+ //! return (b < a) ? b : a;
279
+ //! }
280
+ //! };
281
+ //!
282
+ //! // Declare, allocate, and initialize device-accessible pointers for
283
+ //! // input and output
284
+ //! int num_items; // e.g., 7
285
+ //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
286
+ //! int *d_out; // e.g., [-]
287
+ //! CustomMin min_op;
288
+ //! int init; // e.g., INT_MAX
289
+ //! ...
290
+ //!
291
+ //! // Determine temporary device storage requirements
292
+ //! void *d_temp_storage = nullptr;
293
+ //! size_t temp_storage_bytes = 0;
294
+ //! cub::DeviceReduce::Reduce(
295
+ //! d_temp_storage, temp_storage_bytes,
296
+ //! d_in, d_out, num_items, min_op, init);
297
+ //!
298
+ //! // Allocate temporary storage
299
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
300
+ //!
301
+ //! // Run reduction
302
+ //! cub::DeviceReduce::Reduce(
303
+ //! d_temp_storage, temp_storage_bytes,
304
+ //! d_in, d_out, num_items, min_op, init);
305
+ //!
306
+ //! // d_out <-- [0]
307
+ //!
308
+ //! @endrst
309
+ //!
310
+ //! @tparam InputIteratorT
311
+ //! **[inferred]** Random-access input iterator type for reading input items @iterator
312
+ //!
313
+ //! @tparam OutputIteratorT
314
+ //! **[inferred]** Output iterator type for recording the reduced aggregate @iterator
315
+ //!
316
+ //! @tparam ReductionOpT
317
+ //! **[inferred]** Binary reduction functor type having member `T operator()(const T &a, const T &b)`
318
+ //!
319
+ //! @tparam T
320
+ //! **[inferred]** Data element type that is convertible to the `value` type of `InputIteratorT`
321
+ //!
322
+ //! @tparam NumItemsT
323
+ //! **[inferred]** Type of num_items
324
+ //!
325
+ //! @param[in] d_temp_storage
326
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
327
+ //! required allocation size is written to `temp_storage_bytes` and no work
328
+ //! is done.
329
+ //!
330
+ //! @param[in,out] temp_storage_bytes
331
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
332
+ //!
333
+ //! @param[in] d_in
334
+ //! Pointer to the input sequence of data items
335
+ //!
336
+ //! @param[out] d_out
337
+ //! Pointer to the output aggregate
338
+ //!
339
+ //! @param[in] num_items
340
+ //! Total number of input items (i.e., length of ``d_in``)
341
+ //!
342
+ //! @param[in] reduction_op
343
+ //! Binary reduction functor
344
+ //!
345
+ //! @param[in] init
346
+ //! Initial value of the reduction
347
+ //!
348
+ //! @param[in] stream
349
+ //! @rst
350
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
351
+ //! @endrst
352
+ template <typename InputIteratorT, typename OutputIteratorT, typename ReductionOpT, typename T, typename NumItemsT>
353
+ CUB_RUNTIME_FUNCTION static cudaError_t Reduce( // calls DispatchReduce directly; does not go through the private reduce_impl overloads
354
+ void* d_temp_storage,
355
+ size_t& temp_storage_bytes,
356
+ InputIteratorT d_in,
357
+ OutputIteratorT d_out,
358
+ NumItemsT num_items,
359
+ ReductionOpT reduction_op,
360
+ T init,
361
+ cudaStream_t stream = 0)
362
+ {
363
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceReduce::Reduce"); // presumably opens the NVTX range only when d_temp_storage is non-null, i.e. not during the size query - confirm against the macro
364
+
365
+ // Offset type for global offsets, selected from NumItemsT by detail::choose_offset_t
366
+ using OffsetT = detail::choose_offset_t<NumItemsT>; // NOTE(review): the previous comment called this a signed type; signedness is not visible here - confirm in choose_offset.cuh
367
+
368
+ return DispatchReduce<InputIteratorT, OutputIteratorT, OffsetT, ReductionOpT, T>::Dispatch( // num_items is converted to OffsetT; the dispatcher's cudaError_t is returned unchanged
369
+ d_temp_storage, temp_storage_bytes, d_in, d_out, static_cast<OffsetT>(num_items), reduction_op, init, stream);
370
+ }
371
+
372
+ //! @rst
373
+ //! Computes a device-wide reduction using the specified binary ``reduction_op`` functor and initial value ``init``.
374
+ //!
375
+ //! - Does not support binary reduction operators that are non-commutative.
376
+ //! - By default, provides "run-to-run" determinism for pseudo-associative reduction
377
+ //! (e.g., addition of floating point types) on the same GPU device.
378
+ //! However, results for pseudo-associative reduction may be inconsistent
379
+ //! from one device to another device of a different compute-capability
380
+ //! because CUB can employ different tile-sizing for different architectures.
381
+ //! To request "gpu-to-gpu" determinism, pass ``cuda::execution::require(cuda::execution::determinism::gpu_to_gpu)``
382
+ //! as the `env` parameter.
383
+ //! To request "not-guaranteed" determinism, pass
384
+ //! ``cuda::execution::require(cuda::execution::determinism::not_guaranteed)`` as the `env` parameter.
385
+ //! - The range ``[d_in, d_in + num_items)`` shall not overlap ``d_out``.
386
+ //!
387
+ //! Snippet
388
+ //! +++++++++++++++++++++++++++++++++++++++++++++
389
+ //!
390
+ //! The code snippet below illustrates a user-defined min-reduction of a
391
+ //! device vector of ``int`` data elements.
392
+ //!
393
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_reduce_env_api.cu
394
+ //! :language: c++
395
+ //! :dedent:
396
+ //! :start-after: example-begin reduce-env-determinism
397
+ //! :end-before: example-end reduce-env-determinism
398
+ //!
399
+ //! @endrst
400
+ //!
401
+ //! @tparam InputIteratorT
402
+ //! **[inferred]** Random-access input iterator type for reading input items @iterator
403
+ //!
404
+ //! @tparam OutputIteratorT
405
+ //! **[inferred]** Output iterator type for recording the reduced aggregate @iterator
406
+ //!
407
+ //! @tparam ReductionOpT
408
+ //! **[inferred]** Binary reduction functor type having member `T operator()(const T &a, const T &b)`
409
+ //!
410
+ //! @tparam T
411
+ //! **[inferred]** Data element type that is convertible to the `value` type of `InputIteratorT`
412
+ //!
413
+ //! @tparam NumItemsT
414
+ //! **[inferred]** Type of num_items
415
+ //!
416
+ //! @tparam EnvT
417
+ //! **[inferred]** Execution environment type. Default is ``cuda::std::execution::env<>``.
418
+ //!
419
+ //! @param[in] d_in
420
+ //! Pointer to the input sequence of data items
421
+ //!
422
+ //! @param[out] d_out
423
+ //! Pointer to the output aggregate
424
+ //!
425
+ //! @param[in] num_items
426
+ //! Total number of input items (i.e., length of ``d_in``)
427
+ //!
428
+ //! @param[in] reduction_op
429
+ //! Binary reduction functor
430
+ //!
431
+ //! @param[in] init
432
+ //! Initial value of the reduction
433
+ //!
434
+ //! @param[in] env
435
+ //! @rst
436
+ //! **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``.
437
+ //! @endrst
438
+ template <typename InputIteratorT,
439
+ typename OutputIteratorT,
440
+ typename ReductionOpT,
441
+ typename T,
442
+ typename NumItemsT,
443
+ typename EnvT = ::cuda::std::execution::env<>>
444
+ [[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t Reduce( // environment overload: temporary storage is allocated and released inside this call
445
+ InputIteratorT d_in, OutputIteratorT d_out, NumItemsT num_items, ReductionOpT reduction_op, T init, EnvT env = {})
446
+ {
447
+ _CCCL_NVTX_RANGE_SCOPE("cub::DeviceReduce::Reduce");
448
+
449
+ static_assert(!::cuda::std::execution::__queryable_with<EnvT, ::cuda::execution::determinism::__get_determinism_t>,
450
+ "Determinism should be used inside requires to have an effect.");
451
+ using requirements_t = ::cuda::std::execution:: // result of the requirements query on EnvT, with an empty env as the fallback type
452
+ __query_result_or_t<EnvT, ::cuda::execution::__get_requirements_t, ::cuda::std::execution::env<>>;
453
+ using default_determinism_t = // determinism queried from the requirements, with run_to_run_t as the fallback type
454
+ ::cuda::std::execution::__query_result_or_t<requirements_t, //
455
+ ::cuda::execution::determinism::__get_determinism_t,
456
+ ::cuda::execution::determinism::run_to_run_t>;
457
+
458
+ using accum_t = ::cuda::std::__accumulator_t<ReductionOpT, detail::it_value_t<InputIteratorT>, T>; // accumulator type derived from the operator, the input value type and T
459
+
460
+ constexpr auto gpu_gpu_determinism = // true only when gpu_to_gpu determinism was requested
461
+ ::cuda::std::is_same_v<default_determinism_t, ::cuda::execution::determinism::gpu_to_gpu_t>;
462
+
463
+ // integral types are always gpu-to-gpu deterministic if reduction operator is a simple cuda binary
464
+ // operator, so fallback to run-to-run determinism
465
+ constexpr auto integral_fallback =
466
+ gpu_gpu_determinism && ::cuda::std::is_integral_v<accum_t> && (detail::is_cuda_binary_operator<ReductionOpT>);
467
+
468
+ // use gpu-to-gpu determinism only for float and double types with ::cuda::std::plus operator
469
+ constexpr auto float_double_plus =
470
+ gpu_gpu_determinism && detail::is_one_of_v<accum_t, float, double> && detail::is_cuda_std_plus_v<ReductionOpT>;
471
+
472
+ constexpr auto float_double_min_max_fallback = // float/double with an operator matched by is_cuda_minimum_maximum_v: accepted, and mapped to run_to_run_t by determinism_t below
473
+ gpu_gpu_determinism
474
+ && detail::is_one_of_v<accum_t, float, double> && detail::is_cuda_minimum_maximum_v<ReductionOpT>;
475
+
476
+ constexpr auto supported = // any request other than gpu_to_gpu is always accepted
477
+ integral_fallback || float_double_plus || float_double_min_max_fallback || !gpu_gpu_determinism;
478
+
479
+ // gpu_to_gpu determinism is only supported for integral types with cuda operators, or
480
+ // float and double types with ::cuda::std::plus or cuda minimum/maximum operators
481
+ static_assert(supported, "gpu_to_gpu determinism is unsupported");
482
+
483
+ if constexpr (!supported) // when !supported the static_assert above already rejects the instantiation
484
+ {
485
+ return cudaErrorNotSupported;
486
+ }
487
+ else
488
+ {
489
+ constexpr auto no_determinism = detail::is_non_deterministic_v<default_determinism_t>;
490
+
491
+ // Certain conditions must be met to be able to use the non-deterministic
492
+ // kernel. The output iterator must be a contiguous iterator and the
493
+ // reduction operator must be plus (for now). Additionally, since atomics for types of
494
+ // size < 4B are emulated, they perform poorly, so we fall back to the run-to-run
495
+ // determinism.
496
+ constexpr auto is_contiguous_fallback = // each of these three flags is trivially true unless no_determinism is set
497
+ !no_determinism || THRUST_NS_QUALIFIER::is_contiguous_iterator_v<OutputIteratorT>;
498
+ constexpr auto is_plus_fallback = !no_determinism || detail::is_cuda_std_plus_v<ReductionOpT>;
499
+ constexpr auto is_4b_or_greater = !no_determinism || sizeof(accum_t) >= 4;
500
+
501
+ // If the conditions for gpu-to-gpu determinism or non-deterministic
502
+ // reduction are not met, we fall back to run-to-run determinism.
503
+ using determinism_t = ::cuda::std::conditional_t<
504
+ (gpu_gpu_determinism && (integral_fallback || float_double_min_max_fallback))
505
+ || (no_determinism && !(is_contiguous_fallback && is_plus_fallback && is_4b_or_greater)),
506
+ ::cuda::execution::determinism::run_to_run_t,
507
+ default_determinism_t>;
508
+
509
+ // Query relevant properties from the environment (fallbacks: stream_ref{cudaStream_t{}} and detail::device_memory_resource{})
510
+ auto stream = ::cuda::std::execution::__query_or(env, ::cuda::get_stream, ::cuda::stream_ref{cudaStream_t{}});
511
+ auto mr =
512
+ ::cuda::std::execution::__query_or(env, ::cuda::mr::__get_memory_resource, detail::device_memory_resource{});
513
+
514
+ void* d_temp_storage = nullptr; // still null for the first reduce_impl call, which is the size query
515
+ size_t temp_storage_bytes = 0;
516
+
517
+ using tuning_t = ::cuda::std::execution::
518
+ __query_result_or_t<EnvT, ::cuda::execution::__get_tuning_t, ::cuda::std::execution::env<>>;
519
+
520
+ // Query the required temporary storage size
521
+ cudaError_t error = reduce_impl<tuning_t>(
522
+ d_temp_storage,
523
+ temp_storage_bytes,
524
+ d_in,
525
+ d_out,
526
+ num_items,
527
+ reduction_op,
528
+ ::cuda::std::identity{},
529
+ init,
530
+ determinism_t{},
531
+ stream.get());
532
+ if (error != cudaSuccess)
533
+ {
534
+ return error;
535
+ }
536
+
537
+ // TODO(gevtushenko): use uninitialized buffer when it's available
538
+ error = CubDebug(detail::temporary_storage::allocate(stream, d_temp_storage, temp_storage_bytes, mr));
539
+ if (error != cudaSuccess)
540
+ {
541
+ return error;
542
+ }
543
+
544
+ // Run the algorithm (same arguments as the size query, now with the allocated storage)
545
+ error = reduce_impl<tuning_t>(
546
+ d_temp_storage,
547
+ temp_storage_bytes,
548
+ d_in,
549
+ d_out,
550
+ num_items,
551
+ reduction_op,
552
+ ::cuda::std::identity{},
553
+ init,
554
+ determinism_t{},
555
+ stream.get());
556
+
557
+ // Try to deallocate regardless of the error to avoid memory leaks
558
+ cudaError_t deallocate_error =
559
+ CubDebug(detail::temporary_storage::deallocate(stream, d_temp_storage, temp_storage_bytes, mr));
560
+
561
+ if (error != cudaSuccess)
562
+ {
563
+ // Reduction error takes precedence over deallocation error since it happens first
564
+ return error;
565
+ }
566
+
567
+ return deallocate_error;
568
+ }
569
+ }
570
+
571
+ //! @rst
572
+ //! Computes a device-wide sum using the addition (``+``) operator.
573
+ //!
574
+ //! - Uses ``0`` as the initial value of the reduction.
575
+ //! - Does not support ``+`` operators that are non-commutative.
576
+ //! - Provides "run-to-run" determinism for pseudo-associative reduction
577
+ //! (e.g., addition of floating point types) on the same GPU device.
578
+ //! However, results for pseudo-associative reduction may be inconsistent
579
+ //! from one device to another device of a different compute-capability
580
+ //! because CUB can employ different tile-sizing for different architectures.
581
+ //! To request "gpu-to-gpu" determinism, pass ``cuda::execution::require(cuda::execution::determinism::gpu_to_gpu)``
582
+ //! as the `env` parameter.
583
+ //! To request "not-guaranteed" determinism, pass
584
+ //! ``cuda::execution::require(cuda::execution::determinism::not_guaranteed)`` as the `env` parameter.
585
+ //! - The range ``[d_in, d_in + num_items)`` shall not overlap ``d_out``.
586
+ //!
587
+ //! Snippet
588
+ //! +++++++++++++++++++++++++++++++++++++++++++++
589
+ //!
590
+ //! The code snippet below illustrates the sum-reduction of a
590
+ //! device vector of ``int`` data elements.
592
+ //!
593
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_reduce_env_api.cu
594
+ //! :language: c++
595
+ //! :dedent:
596
+ //! :start-after: example-begin sum-env-determinism
597
+ //! :end-before: example-end sum-env-determinism
598
+ //!
599
+ //! @endrst
600
+ //!
601
+ //! @tparam InputIteratorT
602
+ //! **[inferred]** Random-access input iterator type for reading input items @iterator
603
+ //!
604
+ //! @tparam OutputIteratorT
605
+ //! **[inferred]** Output iterator type for recording the reduced aggregate @iterator
606
+ //!
607
+ //! @tparam NumItemsT
608
+ //! **[inferred]** Type of num_items
609
+ //!
610
+ //! @tparam EnvT
611
+ //! **[inferred]** Execution environment type. Default is `cuda::std::execution::env<>`.
612
+ //!
613
+ //! @param[in] d_in
614
+ //! Pointer to the input sequence of data items
615
+ //!
616
+ //! @param[out] d_out
617
+ //! Pointer to the output aggregate
618
+ //!
619
+ //! @param[in] num_items
620
+ //! Total number of input items (i.e., length of ``d_in``)
621
+ //!
622
+ //! @param[in] env
623
+ //! @rst
624
+ //! **[optional]** Execution environment. Default is `cuda::std::execution::env{}`.
625
+ //! @endrst
626
+ template <typename InputIteratorT,
627
+ typename OutputIteratorT,
628
+ typename NumItemsT,
629
+ typename EnvT = ::cuda::std::execution::env<>>
630
+ [[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t // environment overload: temporary storage is allocated and released inside this call
631
+ Sum(InputIteratorT d_in, OutputIteratorT d_out, NumItemsT num_items, EnvT env = {})
632
+ {
633
+ _CCCL_NVTX_RANGE_SCOPE("cub::DeviceReduce::Sum");
634
+
635
+ static_assert(!::cuda::std::execution::__queryable_with<EnvT, ::cuda::execution::determinism::__get_determinism_t>,
636
+ "Determinism should be used inside requires to have an effect.");
637
+ using requirements_t = ::cuda::std::execution:: // result of the requirements query on EnvT, with an empty env as the fallback type
638
+ __query_result_or_t<EnvT, ::cuda::execution::__get_requirements_t, ::cuda::std::execution::env<>>;
639
+ using default_determinism_t = // determinism queried from the requirements, with run_to_run_t as the fallback type
640
+ ::cuda::std::execution::__query_result_or_t<requirements_t, //
641
+ ::cuda::execution::determinism::__get_determinism_t,
642
+ ::cuda::execution::determinism::run_to_run_t>;
643
+
644
+ constexpr auto no_determinism = detail::is_non_deterministic_v<default_determinism_t>;
645
+
646
+ // The output iterator must be a contiguous iterator or we fall back to
647
+ // run-to-run determinism.
648
+ constexpr auto is_contiguous_fallback = // trivially true unless no_determinism is set
649
+ !no_determinism || THRUST_NS_QUALIFIER::is_contiguous_iterator_v<OutputIteratorT>;
650
+
651
+ using OutputT = cub::detail::non_void_value_t<OutputIteratorT, cub::detail::it_value_t<InputIteratorT>>; // also used as the initial-value type (InitT) below
652
+
653
+ // Since atomics for types of size < 4B are emulated, they perform poorly, so we fall back to the run-to-run
654
+ // determinism.
655
+ constexpr auto is_4b_or_greater = !no_determinism || sizeof(OutputT) >= 4;
656
+
657
+ using determinism_t = // run_to_run_t replaces a non-deterministic request whose conditions are not met; any other request is kept as-is
658
+ ::cuda::std::conditional_t<no_determinism && !(is_contiguous_fallback && is_4b_or_greater),
659
+ ::cuda::execution::determinism::run_to_run_t,
660
+ default_determinism_t>;
661
+
662
+ // Query relevant properties from the environment (fallbacks: stream_ref{cudaStream_t{}} and detail::device_memory_resource{})
663
+ auto stream = ::cuda::std::execution::__query_or(env, ::cuda::get_stream, ::cuda::stream_ref{cudaStream_t{}});
664
+ auto mr =
665
+ ::cuda::std::execution::__query_or(env, ::cuda::mr::__get_memory_resource, detail::device_memory_resource{});
666
+
667
+ void* d_temp_storage = nullptr; // still null for the first reduce_impl call, which is the size query
668
+ size_t temp_storage_bytes = 0;
669
+
670
+ using tuning_t =
671
+ ::cuda::std::execution::__query_result_or_t<EnvT, ::cuda::execution::__get_tuning_t, ::cuda::std::execution::env<>>;
672
+
673
+ using InitT = OutputT;
674
+
675
+ // Query the required temporary storage size
676
+ cudaError_t error = reduce_impl<tuning_t>(
677
+ d_temp_storage,
678
+ temp_storage_bytes,
679
+ d_in,
680
+ d_out,
681
+ num_items,
682
+ ::cuda::std::plus<>{},
683
+ ::cuda::std::identity{},
684
+ InitT{}, // zero-initialize
685
+ determinism_t{},
686
+ stream.get());
687
+ if (error != cudaSuccess)
688
+ {
689
+ return error;
690
+ }
691
+
692
+ // TODO(gevtushenko): use uninitialized buffer when it's available
693
+ error = CubDebug(detail::temporary_storage::allocate(stream, d_temp_storage, temp_storage_bytes, mr));
694
+ if (error != cudaSuccess)
695
+ {
696
+ return error;
697
+ }
698
+
699
+ // Run the algorithm (same arguments as the size query, now with the allocated storage)
700
+ error = reduce_impl<tuning_t>(
701
+ d_temp_storage,
702
+ temp_storage_bytes,
703
+ d_in,
704
+ d_out,
705
+ num_items,
706
+ ::cuda::std::plus<>{},
707
+ ::cuda::std::identity{},
708
+ InitT{}, // zero-initialize
709
+ determinism_t{},
710
+ stream.get());
711
+
712
+ // Try to deallocate regardless of the error to avoid memory leaks
713
+ cudaError_t deallocate_error =
714
+ CubDebug(detail::temporary_storage::deallocate(stream, d_temp_storage, temp_storage_bytes, mr));
715
+
716
+ if (error != cudaSuccess)
717
+ {
718
+ // Reduction error takes precedence over deallocation error since it happens first
719
+ return error;
720
+ }
721
+
722
+ return deallocate_error;
723
+ }
724
+
725
+ //! @rst
726
+ //! Computes a device-wide sum using the addition (``+``) operator.
727
+ //!
728
+ //! - Uses ``0`` as the initial value of the reduction.
729
+ //! - Does not support ``+`` operators that are non-commutative.
730
+ //! - Provides "run-to-run" determinism for pseudo-associative reduction
731
+ //! (e.g., addition of floating point types) on the same GPU device.
732
+ //! However, results for pseudo-associative reduction may be inconsistent
733
+ //! from one device to another device of a different compute-capability
734
+ //! because CUB can employ different tile-sizing for different architectures.
735
+ //! - The range ``[d_in, d_in + num_items)`` shall not overlap ``d_out``.
736
+ //! - @devicestorage
737
+ //!
738
+ //! Snippet
739
+ //! +++++++++++++++++++++++++++++++++++++++++++++
740
+ //!
741
+ //! The code snippet below illustrates the sum-reduction of a device vector
742
+ //! of ``int`` data elements.
743
+ //!
744
+ //! .. code-block:: c++
745
+ //!
746
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_reduce.cuh>
747
+ //!
748
+ //! // Declare, allocate, and initialize device-accessible pointers
749
+ //! // for input and output
750
+ //! int num_items; // e.g., 7
751
+ //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
752
+ //! int *d_out; // e.g., [-]
753
+ //! ...
754
+ //!
755
+ //! // Determine temporary device storage requirements
756
+ //! void *d_temp_storage = nullptr;
757
+ //! size_t temp_storage_bytes = 0;
758
+ //! cub::DeviceReduce::Sum(
759
+ //! d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
760
+ //!
761
+ //! // Allocate temporary storage
762
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
763
+ //!
764
+ //! // Run sum-reduction
765
+ //! cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
766
+ //!
767
+ //! // d_out <-- [38]
768
+ //!
769
+ //! @endrst
770
+ //!
771
+ //! @tparam InputIteratorT
772
+ //! **[inferred]** Random-access input iterator type for reading input items @iterator
773
+ //!
774
+ //! @tparam OutputIteratorT
775
+ //! **[inferred]** Output iterator type for recording the reduced aggregate @iterator
776
+ //!
777
+ //! @tparam NumItemsT
778
+ //! **[inferred]** Type of num_items
779
+ //!
780
+ //! @param[in] d_temp_storage
781
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
782
+ //! required allocation size is written to `temp_storage_bytes` and no work is done.
783
+ //!
784
+ //! @param[in,out] temp_storage_bytes
785
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
786
+ //!
787
+ //! @param[in] d_in
788
+ //! Pointer to the input sequence of data items
789
+ //!
790
+ //! @param[out] d_out
791
+ //! Pointer to the output aggregate
792
+ //!
793
+ //! @param[in] num_items
794
+ //! Total number of input items (i.e., length of `d_in`)
795
+ //!
796
+ //! @param[in] stream
797
+ //! @rst
798
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
799
+ //! @endrst
800
+ template <typename InputIteratorT, typename OutputIteratorT, typename NumItemsT>
801
+ CUB_RUNTIME_FUNCTION static cudaError_t // overload taking caller-provided temporary storage; forwards to DispatchReduce with ::cuda::std::plus<>
802
+ Sum(void* d_temp_storage,
803
+ size_t& temp_storage_bytes,
804
+ InputIteratorT d_in,
805
+ OutputIteratorT d_out,
806
+ NumItemsT num_items,
807
+ cudaStream_t stream = 0)
808
+ {
809
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceReduce::Sum"); // NOTE(review): presumably the range is only opened when d_temp_storage is non-null - confirm against the macro
810
+
811
+ // Integer type for global offsets, chosen from NumItemsT by detail::choose_offset_t (NOTE(review): this comment used to say "signed" - confirm against choose_offset_t)
812
+ using OffsetT = detail::choose_offset_t<NumItemsT>;
813
+
814
+ // The output value type (non_void_value_t is given the input value type as its second argument)
815
+ using OutputT = cub::detail::non_void_value_t<OutputIteratorT, cub::detail::it_value_t<InputIteratorT>>;
816
+
817
+ using InitT = OutputT; // the initial value has the output value type
818
+
819
+ return DispatchReduce<InputIteratorT, OutputIteratorT, OffsetT, ::cuda::std::plus<>, InitT>::Dispatch(
820
+ d_temp_storage,
821
+ temp_storage_bytes,
822
+ d_in,
823
+ d_out,
824
+ static_cast<OffsetT>(num_items),
825
+ ::cuda::std::plus<>{},
826
+ InitT{}, // zero-initialize
827
+ stream);
828
+ }
829
+
830
+ //! @rst
831
+ //! Computes a device-wide minimum using the less-than (``<``) operator.
832
+ //!
833
+ //! - Uses ``cuda::std::numeric_limits<T>::max()`` as the initial value of the reduction.
834
+ //! - Does not support ``<`` operators that are non-commutative.
835
+ //! - Provides "run-to-run" determinism for pseudo-associative reduction
836
+ //! (e.g., addition of floating point types) on the same GPU device.
837
+ //! However, results for pseudo-associative reduction may be inconsistent
838
+ //! from one device to another device of a different compute-capability
839
+ //! because CUB can employ different tile-sizing for different architectures.
840
+ //! - The range ``[d_in, d_in + num_items)`` shall not overlap ``d_out``.
841
+ //! - @devicestorage
842
+ //!
843
+ //! Snippet
844
+ //! +++++++++++++++++++++++++++++++++++++++++++++
845
+ //!
846
+ //! The code snippet below illustrates the min-reduction of a device vector of ``int`` data elements.
847
+ //!
848
+ //! .. code-block:: c++
849
+ //!
850
+ //! #include <cub/cub.cuh>
851
+ //! // or equivalently <cub/device/device_reduce.cuh>
852
+ //!
853
+ //! // Declare, allocate, and initialize device-accessible pointers
854
+ //! // for input and output
855
+ //! int num_items; // e.g., 7
856
+ //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
857
+ //! int *d_out; // e.g., [-]
858
+ //! ...
859
+ //!
860
+ //! // Determine temporary device storage requirements
861
+ //! void *d_temp_storage = nullptr;
862
+ //! size_t temp_storage_bytes = 0;
863
+ //! cub::DeviceReduce::Min(
864
+ //! d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
865
+ //!
866
+ //! // Allocate temporary storage
867
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
868
+ //!
869
+ //! // Run min-reduction
870
+ //! cub::DeviceReduce::Min(
871
+ //! d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
872
+ //!
873
+ //! // d_out <-- [0]
874
+ //!
875
+ //! @endrst
876
+ //!
877
+ //! @tparam InputIteratorT
878
+ //! **[inferred]** Random-access input iterator type for reading input items @iterator
879
+ //!
880
+ //! @tparam OutputIteratorT
881
+ //! **[inferred]** Output iterator type for recording the reduced aggregate @iterator
882
+ //!
883
+ //! @tparam NumItemsT
884
+ //! **[inferred]** Type of num_items
885
+ //!
886
+ //! @param[in] d_temp_storage
887
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
888
+ //! required allocation size is written to `temp_storage_bytes` and no work is done.
889
+ //!
890
+ //! @param[in,out] temp_storage_bytes
891
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
892
+ //!
893
+ //! @param[in] d_in
894
+ //! Pointer to the input sequence of data items
895
+ //!
896
+ //! @param[out] d_out
897
+ //! Pointer to the output aggregate
898
+ //!
899
+ //! @param[in] num_items
900
+ //! Total number of input items (i.e., length of ``d_in``)
901
+ //!
902
+ //! @param[in] stream
903
+ //! @rst
904
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
905
+ //! @endrst
906
+ template <typename InputIteratorT, typename OutputIteratorT, typename NumItemsT>
907
+ CUB_RUNTIME_FUNCTION static cudaError_t // overload taking caller-provided temporary storage; forwards to DispatchReduce with ::cuda::minimum<>
908
+ Min(void* d_temp_storage,
909
+ size_t& temp_storage_bytes,
910
+ InputIteratorT d_in,
911
+ OutputIteratorT d_out,
912
+ NumItemsT num_items,
913
+ cudaStream_t stream = 0)
914
+ {
915
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceReduce::Min"); // NOTE(review): presumably the range is only opened when d_temp_storage is non-null - confirm against the macro
916
+
917
+ using OffsetT = detail::choose_offset_t<NumItemsT>; // Integer type for global offsets, chosen from NumItemsT (NOTE(review): this comment used to say "signed" - confirm against choose_offset_t)
918
+ using InputT = detail::it_value_t<InputIteratorT>;
919
+ using InitT = InputT; // the initial value has the input value type
920
+ using limits_t = ::cuda::std::numeric_limits<InitT>;
921
+ #ifndef CCCL_SUPPRESS_NUMERIC_LIMITS_CHECK_IN_CUB_DEVICE_REDUCE_MIN_MAX
922
+ static_assert(limits_t::is_specialized, // without a specialization limits_t::max() below would not be a meaningful initial value
923
+ "cub::DeviceReduce::Min uses cuda::std::numeric_limits<InputIteratorT::value_type>::max() as initial "
924
+ "value, but cuda::std::numeric_limits is not specialized for the iterator's value type. This is "
925
+ "probably a bug and you should specialize cuda::std::numeric_limits. Define "
926
+ "CCCL_SUPPRESS_NUMERIC_LIMITS_CHECK_IN_CUB_DEVICE_REDUCE_MIN_MAX to suppress this check.");
927
+ #endif // CCCL_SUPPRESS_NUMERIC_LIMITS_CHECK_IN_CUB_DEVICE_REDUCE_MIN_MAX
928
+
929
+ return DispatchReduce<InputIteratorT, OutputIteratorT, OffsetT, ::cuda::minimum<>, InitT>::Dispatch(
930
+ d_temp_storage,
931
+ temp_storage_bytes,
932
+ d_in,
933
+ d_out,
934
+ static_cast<OffsetT>(num_items),
935
+ ::cuda::minimum<>{},
936
+ limits_t::max(), // initial value of the min-reduction
937
+ stream);
938
+ }
939
+
940
+ //! @rst
941
+ //! Computes a device-wide minimum using the less-than (``<``) operator. The result is written to the output
942
+ //! iterator.
943
+ //!
944
+ //! - Uses ``cuda::std::numeric_limits<T>::max()`` as the initial value of the reduction.
945
+ //! - Provides determinism based on the environment's determinism requirements.
946
+ //! To request "run-to-run" determinism, pass ``cuda::execution::require(cuda::execution::determinism::run_to_run)``
947
+ //! as the `env` parameter.
948
+ //! - The range ``[d_in, d_in + num_items)`` shall not overlap ``d_out``.
949
+ //!
950
+ //! Snippet
951
+ //! +++++++++++++++++++++++++++++++++++++++++++++
952
+ //!
953
+ //! The code snippet below illustrates the min-reduction of a device vector of ``int`` data elements.
954
+ //!
955
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_reduce_env_api.cu
956
+ //! :language: c++
957
+ //! :dedent:
958
+ //! :start-after: example-begin min-env-determinism
959
+ //! :end-before: example-end min-env-determinism
960
+ //!
961
+ //! @endrst
962
+ //!
963
+ //! @tparam InputIteratorT
964
+ //! **[inferred]** Random-access input iterator type for reading input items @iterator
965
+ //!
966
+ //! @tparam OutputIteratorT
967
+ //! **[inferred]** Output iterator type for recording the reduced aggregate @iterator
968
+ //!
969
+ //! @tparam NumItemsT
970
+ //! **[inferred]** Type of num_items
971
+ //!
972
+ //! @tparam EnvT
973
+ //! **[inferred]** Execution environment type. Default is `cuda::std::execution::env<>`.
974
+ //!
975
+ //! @param[in] d_in
976
+ //! Pointer to the input sequence of data items
977
+ //!
978
+ //! @param[out] d_out
979
+ //! Pointer to the output aggregate
980
+ //!
981
+ //! @param[in] num_items
982
+ //! Total number of input items (i.e., length of ``d_in``)
983
+ //!
984
+ //! @param[in] env
985
+ //! @rst
986
+ //! **[optional]** Execution environment. Default is `cuda::std::execution::env{}`.
987
+ //! @endrst
988
+ template <typename InputIteratorT,
989
+ typename OutputIteratorT,
990
+ typename NumItemsT,
991
+ typename EnvT = ::cuda::std::execution::env<>>
992
+ [[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t // environment overload: temporary storage is allocated and released inside this call
993
+ Min(InputIteratorT d_in, OutputIteratorT d_out, NumItemsT num_items, EnvT env = {})
994
+ {
995
+ _CCCL_NVTX_RANGE_SCOPE("cub::DeviceReduce::Min");
996
+
997
+ static_assert(!_CUDA_STD_EXEC::__queryable_with<EnvT, _CUDA_EXEC::determinism::__get_determinism_t>,
998
+ "Determinism should be used inside requires to have an effect.");
999
+ using requirements_t = // result of the requirements query on EnvT, with an empty env as the fallback type
1000
+ _CUDA_STD_EXEC::__query_result_or_t<EnvT, _CUDA_EXEC::__get_requirements_t, _CUDA_STD_EXEC::env<>>;
1001
+ using requested_determinism_t = // determinism queried from the requirements, with run_to_run_t as the fallback type
1002
+ _CUDA_STD_EXEC::__query_result_or_t<requirements_t, //
1003
+ _CUDA_EXEC::determinism::__get_determinism_t,
1004
+ _CUDA_EXEC::determinism::run_to_run_t>;
1005
+
1006
+ // Static assert to reject gpu_to_gpu determinism since it's not properly implemented
1007
+ static_assert(!::cuda::std::is_same_v<requested_determinism_t, ::cuda::execution::determinism::gpu_to_gpu_t>,
1008
+ "gpu_to_gpu determinism is not supported");
1009
+
1010
+ // TODO(NaderAlAwar): Relax this once non-deterministic implementation for min / max is available
1011
+ using determinism_t = ::cuda::execution::determinism::run_to_run_t; // every accepted request is executed as run_to_run
1012
+
1013
+ // Query relevant properties from the environment (fallbacks: stream_ref{cudaStream_t{}} and detail::device_memory_resource{})
1014
+ auto stream = _CUDA_STD_EXEC::__query_or(env, ::cuda::get_stream, ::cuda::stream_ref{cudaStream_t{}});
1015
+ auto mr = _CUDA_STD_EXEC::__query_or(env, ::cuda::mr::__get_memory_resource, detail::device_memory_resource{});
1016
+
1017
+ void* d_temp_storage = nullptr; // still null for the first reduce_impl call, which is the size query
1018
+ size_t temp_storage_bytes = 0;
1019
+
1020
+ using tuning_t = _CUDA_STD_EXEC::__query_result_or_t<EnvT, _CUDA_EXEC::__get_tuning_t, _CUDA_STD_EXEC::env<>>;
1021
+
1022
+ using OutputT = cub::detail::non_void_value_t<OutputIteratorT, cub::detail::it_value_t<InputIteratorT>>;
1023
+
1024
+ using InitT = OutputT; // NOTE(review): derived from the output iterator here, while the temp-storage Min overload uses the input value type - confirm this difference is intended
1025
+ using limits_t = ::cuda::std::numeric_limits<InitT>; // NOTE(review): no is_specialized check here, unlike the temp-storage Min overload
1026
+
1027
+ // Query the required temporary storage size
1028
+ cudaError_t error = reduce_impl<tuning_t>(
1029
+ d_temp_storage,
1030
+ temp_storage_bytes,
1031
+ d_in,
1032
+ d_out,
1033
+ num_items,
1034
+ ::cuda::minimum<>{},
1035
+ ::cuda::std::identity{},
1036
+ limits_t::max(),
1037
+ determinism_t{},
1038
+ stream.get());
1039
+ if (error != cudaSuccess)
1040
+ {
1041
+ return error;
1042
+ }
1043
+
1044
+ // TODO(gevtushenko): use uninitialized buffer when it's available
1045
+ error = CubDebug(detail::temporary_storage::allocate(stream, d_temp_storage, temp_storage_bytes, mr));
1046
+ if (error != cudaSuccess)
1047
+ {
1048
+ return error;
1049
+ }
1050
+
1051
+ // Run the algorithm (same arguments as the size query, now with the allocated storage)
1052
+ error = reduce_impl<tuning_t>(
1053
+ d_temp_storage,
1054
+ temp_storage_bytes,
1055
+ d_in,
1056
+ d_out,
1057
+ num_items,
1058
+ ::cuda::minimum<>{},
1059
+ ::cuda::std::identity{},
1060
+ limits_t::max(),
1061
+ determinism_t{},
1062
+ stream.get());
1063
+
1064
+ // Try to deallocate regardless of the error to avoid memory leaks
1065
+ cudaError_t deallocate_error =
1066
+ CubDebug(detail::temporary_storage::deallocate(stream, d_temp_storage, temp_storage_bytes, mr));
1067
+
1068
+ if (error != cudaSuccess)
1069
+ {
1070
+ // Reduction error takes precedence over deallocation error since it happens first
1071
+ return error;
1072
+ }
1073
+
1074
+ return deallocate_error;
1075
+ }
1076
+
1077
+ //! @rst
1078
+ //! Finds the first device-wide minimum using the less-than (``<``) operator and also returns the index of that item.
1079
+ //!
1080
+ //! - The minimum is written to ``d_min_out``
1081
+ //! - The offset of the returned item is written to ``d_index_out``, the offset type being written is of type
1082
+ //! ``cuda::std::int64_t``.
1083
+ //! - For zero-length inputs, ``cuda::std::numeric_limits<T>::max()`` is written to ``d_min_out`` and the index
1084
+ //! ``1`` is written to ``d_index_out``.
1085
+ //! - Does not support ``<`` operators that are non-commutative.
1086
+ //! - Provides "run-to-run" determinism for pseudo-associative reduction
1087
+ //! (e.g., addition of floating point types) on the same GPU device.
1088
+ //! However, results for pseudo-associative reduction may be inconsistent
1089
+ //! from one device to another device of a different compute-capability
1090
+ //! because CUB can employ different tile-sizing for different architectures.
1091
+ //! - The range ``[d_in, d_in + num_items)`` shall not overlap ``d_min_out`` nor ``d_index_out``.
1092
+ //! - @devicestorage
1093
+ //!
1094
+ //! Snippet
1095
+ //! +++++++++++++++++++++++++++++++++++++++++++++
1096
+ //!
1097
+ //! The code snippet below illustrates the argmin-reduction of a device vector
1098
+ //! of ``int`` data elements.
1099
+ //!
1100
+ //! .. code-block:: c++
1101
+ //!
1102
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_reduce.cuh>
1103
+ //! #include <cuda/std/cstdint>
1104
+ //!
1105
+ //! // Declare, allocate, and initialize device-accessible pointers
1106
+ //! // for input and output
1107
+ //! int num_items; // e.g., 7
1108
+ //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
1109
+ //! int *d_min_out; // memory for the minimum value
1110
+ //! cuda::std::int64_t *d_index_out; // memory for the index of the returned value
1111
+ //! ...
1112
+ //!
1113
+ //! // Determine temporary device storage requirements
1114
+ //! void *d_temp_storage = nullptr;
1115
+ //! size_t temp_storage_bytes = 0;
1116
+ //! cub::DeviceReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_min_out, d_index_out,
1117
+ //! num_items);
1118
+ //!
1119
+ //! // Allocate temporary storage
1120
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
1121
+ //!
1122
+ //! // Run argmin-reduction
1123
+ //! cub::DeviceReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_min_out, d_index_out,
1124
+ //! num_items);
1125
+ //!
1126
+ //! // d_min_out <-- 0
1127
+ //! // d_index_out <-- 5
1128
+ //!
1129
+ //! @endrst
1130
+ //!
1131
+ //! @tparam InputIteratorT
1132
+ //! **[inferred]** Random-access input iterator type for reading input items
1133
+ //! (of some type `T`) @iterator
1134
+ //!
1135
+ //! @tparam ExtremumOutIteratorT
1136
+ //! **[inferred]** Output iterator type for recording minimum value
1137
+ //!
1138
+ //! @tparam IndexOutIteratorT
1139
+ //! **[inferred]** Output iterator type for recording index of the returned value
1140
+ //!
1141
+ //! @param[in] d_temp_storage
1142
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
1143
+ //! required allocation size is written to `temp_storage_bytes` and no work is done.
1144
+ //!
1145
+ //! @param[in,out] temp_storage_bytes
1146
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
1147
+ //!
1148
+ //! @param[in] d_in
1149
+ //! Iterator to the input sequence of data items
1150
+ //!
1151
+ //! @param[out] d_min_out
1152
+ //! Iterator to which the minimum value is written
1153
+ //!
1154
+ //! @param[out] d_index_out
1155
+ //! Iterator to which the index of the returned value is written
1156
+ //!
1157
+ //! @param[in] num_items
1158
+ //! Total number of input items (i.e., length of ``d_in``)
1159
+ //!
1160
+ //! @param[in] stream
1161
+ //! @rst
1162
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
1163
+ //! @endrst
1164
+ template <typename InputIteratorT, typename ExtremumOutIteratorT, typename IndexOutIteratorT>
1165
+ CUB_RUNTIME_FUNCTION static cudaError_t ArgMin( // overload taking caller-provided temporary storage; forwards to dispatch_streaming_arg_reduce_t
1166
+ void* d_temp_storage,
1167
+ size_t& temp_storage_bytes,
1168
+ InputIteratorT d_in,
1169
+ ExtremumOutIteratorT d_min_out,
1170
+ IndexOutIteratorT d_index_out,
1171
+ ::cuda::std::int64_t num_items,
1172
+ cudaStream_t stream = 0)
1173
+ {
1174
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceReduce::ArgMin"); // NOTE(review): presumably the range is only opened when d_temp_storage is non-null - confirm against the macro
1175
+
1176
+ // The input type
1177
+ using InputValueT = cub::detail::it_value_t<InputIteratorT>;
1178
+
1179
+ // Offset type used within the kernel and to index within one partition
1180
+ using PerPartitionOffsetT = int;
1181
+
1182
+ // Offset type used to index within the total input in the range [d_in, d_in + num_items)
1183
+ using GlobalOffsetT = ::cuda::std::int64_t;
1184
+
1185
+ // The value type used for the extremum (non_void_value_t is given the input value type as its second argument)
1186
+ using OutputExtremumT = detail::non_void_value_t<ExtremumOutIteratorT, InputValueT>;
1187
+ using InitT = OutputExtremumT;
1188
+
1189
+ // Reduction operation
1190
+ using ReduceOpT = cub::ArgMin;
1191
+
1192
+ // Initial value: numeric_limits<InputValueT>::max(), used to brace-initialize the output extremum type
1193
+ OutputExtremumT initial_value{::cuda::std::numeric_limits<InputValueT>::max()};
1194
+
1195
+ // Tabulate output iterator that unzips the result and writes it to the user-provided output iterators
1196
+ auto out_it = ::cuda::make_tabulate_output_iterator(
1197
+ detail::reduce::unzip_and_write_arg_extremum_op<ExtremumOutIteratorT, IndexOutIteratorT>{d_min_out, d_index_out});
1198
+
1199
+ return detail::reduce::dispatch_streaming_arg_reduce_t< // out_it takes the place of the two user output iterators in the dispatch
1200
+ InputIteratorT,
1201
+ decltype(out_it),
1202
+ PerPartitionOffsetT,
1203
+ GlobalOffsetT,
1204
+ ReduceOpT,
1205
+ InitT>::Dispatch(d_temp_storage,
1206
+ temp_storage_bytes,
1207
+ d_in,
1208
+ out_it,
1209
+ static_cast<GlobalOffsetT>(num_items),
1210
+ ReduceOpT{},
1211
+ initial_value,
1212
+ stream);
1213
+ }
1214
+
1215
+ //! @rst
1216
+ //! Finds the first device-wide minimum using the less-than (``<``) operator and also returns the index of that item.
1217
+ //!
1218
+ //! - The minimum is written to ``d_min_out``
1219
+ //! - The offset of the returned item is written to ``d_index_out``, the offset type being written is of type
1220
+ //! ``cuda::std::int64_t``.
1221
+ //! - For zero-length inputs, ``cuda::std::numeric_limits<T>::max()`` is written to ``d_min_out`` and the index
1222
+ //! ``1`` is written to ``d_index_out``.
1223
+ //! - Does not support ``<`` operators that are non-commutative.
1224
+ //! - Provides determinism based on the environment's determinism requirements.
1225
+ //! To request "run-to-run" determinism, pass ``cuda::execution::require(cuda::execution::determinism::run_to_run)``
1226
+ //! as the ``env`` parameter.
1227
+ //! - The range ``[d_in, d_in + num_items)`` shall not overlap ``d_min_out`` nor ``d_index_out``.
1228
+ //!
1229
+ //! Snippet
1230
+ //! +++++++++++++++++++++++++++++++++++++++++++++
1231
+ //!
1232
+ //! The code snippet below illustrates the argmin-reduction of a device vector of ``int`` data elements.
1233
+ //!
1234
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_reduce_env_api.cu
1235
+ //! :language: c++
1236
+ //! :dedent:
1237
+ //! :start-after: example-begin argmin-env-determinism
1238
+ //! :end-before: example-end argmin-env-determinism
1239
+ //!
1240
+ //! @endrst
1241
+ //!
1242
+ //! @tparam InputIteratorT
1243
+ //! **[inferred]** Random-access input iterator type for reading input items
1244
+ //! (of some type `T`) @iterator
1245
+ //!
1246
+ //! @tparam ExtremumOutIteratorT
1247
+ //! **[inferred]** Output iterator type for recording minimum value
1248
+ //!
1249
+ //! @tparam IndexOutIteratorT
1250
+ //! **[inferred]** Output iterator type for recording index of the returned value
1251
+ //!
1252
+ //! @tparam EnvT
1253
+ //! **[inferred]** Execution environment type. Default is ``cuda::std::execution::env<>``.
1254
+ //!
1255
+ //! @param[in] d_in
1256
+ //! Iterator to the input sequence of data items
1257
+ //!
1258
+ //! @param[out] d_min_out
1259
+ //! Iterator to which the minimum value is written
1260
+ //!
1261
+ //! @param[out] d_index_out
1262
+ //! Iterator to which the index of the returned value is written
1263
+ //!
1264
+ //! @param[in] num_items
1265
+ //! Total number of input items (i.e., length of ``d_in``)
1266
+ //!
1267
+ //! @param[in] env
1268
+ //! @rst
1269
+ //! **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``.
1270
+ //! @endrst
1271
+ template <typename InputIteratorT,
1272
+ typename ExtremumOutIteratorT,
1273
+ typename IndexOutIteratorT,
1274
+ typename EnvT = ::cuda::std::execution::env<>>
1275
+ [[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t
1276
+ ArgMin(InputIteratorT d_in,
1277
+ ExtremumOutIteratorT d_min_out,
1278
+ IndexOutIteratorT d_index_out,
1279
+ ::cuda::std::int64_t num_items,
1280
+ EnvT env = {})
1281
+ {
1282
+ _CCCL_NVTX_RANGE_SCOPE("cub::DeviceReduce::ArgMin");
1283
+
1284
+ static_assert(!_CUDA_STD_EXEC::__queryable_with<EnvT, _CUDA_EXEC::determinism::__get_determinism_t>,
1285
+ "Determinism should be used inside requires to have an effect.");
1286
+ using requirements_t =
1287
+ _CUDA_STD_EXEC::__query_result_or_t<EnvT, _CUDA_EXEC::__get_requirements_t, _CUDA_STD_EXEC::env<>>;
1288
+ using requested_determinism_t =
1289
+ _CUDA_STD_EXEC::__query_result_or_t<requirements_t, //
1290
+ _CUDA_EXEC::determinism::__get_determinism_t,
1291
+ _CUDA_EXEC::determinism::run_to_run_t>;
1292
+
1293
+ // Static assert to reject gpu_to_gpu determinism since it's not properly implemented
1294
+ static_assert(!::cuda::std::is_same_v<requested_determinism_t, ::cuda::execution::determinism::gpu_to_gpu_t>,
1295
+ "gpu_to_gpu determinism is not supported");
1296
+
1297
+ // TODO(NaderAlAwar): Relax this once non-deterministic implementation for min / max is available
1298
+ using determinism_t = ::cuda::execution::determinism::run_to_run_t;
1299
+
1300
+ // Query relevant properties from the environment
1301
+ auto stream = _CUDA_STD_EXEC::__query_or(env, ::cuda::get_stream, ::cuda::stream_ref{cudaStream_t{}});
1302
+ auto mr = _CUDA_STD_EXEC::__query_or(env, ::cuda::mr::__get_memory_resource, detail::device_memory_resource{});
1303
+
1304
+ void* d_temp_storage = nullptr;
1305
+ size_t temp_storage_bytes = 0;
1306
+
1307
+ using tuning_t = _CUDA_STD_EXEC::__query_result_or_t<EnvT, _CUDA_EXEC::__get_tuning_t, _CUDA_STD_EXEC::env<>>;
1308
+
1309
+ // Reduction operation
1310
+ using ReduceOpT = cub::ArgMin;
1311
+ using InputValueT = cub::detail::it_value_t<InputIteratorT>;
1312
+ using PerPartitionOffsetT = int;
1313
+ using GlobalOffsetT = ::cuda::std::int64_t;
1314
+
1315
+ using OutputExtremumT = detail::non_void_value_t<ExtremumOutIteratorT, InputValueT>;
1316
+ using InitT = OutputExtremumT;
1317
+
1318
+ // Initial value
1319
+ OutputExtremumT initial_value{::cuda::std::numeric_limits<InputValueT>::max()};
1320
+
1321
+ // Tabulate output iterator that unzips the result and writes it to the user-provided output iterators
1322
+ auto out_it = ::cuda::make_tabulate_output_iterator(
1323
+ detail::reduce::unzip_and_write_arg_extremum_op<ExtremumOutIteratorT, IndexOutIteratorT>{d_min_out, d_index_out});
1324
+
1325
+ // Query the required temporary storage size
1326
+ cudaError_t error = detail::reduce::dispatch_streaming_arg_reduce_t<
1327
+ InputIteratorT,
1328
+ decltype(out_it),
1329
+ PerPartitionOffsetT,
1330
+ GlobalOffsetT,
1331
+ ReduceOpT,
1332
+ InitT>::Dispatch(d_temp_storage,
1333
+ temp_storage_bytes,
1334
+ d_in,
1335
+ out_it,
1336
+ static_cast<GlobalOffsetT>(num_items),
1337
+ ReduceOpT{},
1338
+ initial_value,
1339
+ stream.get());
1340
+ if (error != cudaSuccess)
1341
+ {
1342
+ return error;
1343
+ }
1344
+
1345
+ // TODO(gevtushenko): use uninitialized buffer when it's available
1346
+ error = CubDebug(detail::temporary_storage::allocate(stream, d_temp_storage, temp_storage_bytes, mr));
1347
+ if (error != cudaSuccess)
1348
+ {
1349
+ return error;
1350
+ }
1351
+
1352
+ // Run the algorithm
1353
+ error = detail::reduce::dispatch_streaming_arg_reduce_t<
1354
+ InputIteratorT,
1355
+ decltype(out_it),
1356
+ PerPartitionOffsetT,
1357
+ GlobalOffsetT,
1358
+ ReduceOpT,
1359
+ InitT>::Dispatch(d_temp_storage,
1360
+ temp_storage_bytes,
1361
+ d_in,
1362
+ out_it,
1363
+ static_cast<GlobalOffsetT>(num_items),
1364
+ ReduceOpT{},
1365
+ initial_value,
1366
+ stream.get());
1367
+
1368
+ // Try to deallocate regardless of the error to avoid memory leaks
1369
+ cudaError_t deallocate_error =
1370
+ CubDebug(detail::temporary_storage::deallocate(stream, d_temp_storage, temp_storage_bytes, mr));
1371
+
1372
+ if (error != cudaSuccess)
1373
+ {
1374
+ // Reduction error takes precedence over deallocation error since it happens first
1375
+ return error;
1376
+ }
1377
+
1378
+ return deallocate_error;
1379
+ }
1380
+
1381
+ //! @rst
1382
+ //! Finds the first device-wide minimum using the less-than (``<``) operator, also returning the index of that item.
1383
+ //!
1384
+ //! - The output value type of ``d_out`` is ``cub::KeyValuePair<int, T>``
1385
+ //! (assuming the value type of ``d_in`` is ``T``)
1386
+ //!
1387
+ //! - The minimum is written to ``d_out.value`` and its offset in the input array is written to ``d_out.key``.
1388
+ //! - The ``{1, cuda::std::numeric_limits<T>::max()}`` tuple is produced for zero-length inputs
1389
+ //!
1390
+ //! - Does not support ``<`` operators that are non-commutative.
1391
+ //! - Provides "run-to-run" determinism for pseudo-associative reduction
1392
+ //! (e.g., addition of floating point types) on the same GPU device.
1393
+ //! However, results for pseudo-associative reduction may be inconsistent
1394
+ //! from one device to another device of a different compute-capability
1395
+ //! because CUB can employ different tile-sizing for different architectures.
1396
+ //! - The range ``[d_in, d_in + num_items)`` shall not overlap ``d_out``.
1397
+ //! - @devicestorage
1398
+ //!
1399
+ //! Snippet
1400
+ //! +++++++++++++++++++++++++++++++++++++++++++++
1401
+ //!
1402
+ //! The code snippet below illustrates the argmin-reduction of a device vector
1403
+ //! of ``int`` data elements.
1404
+ //!
1405
+ //! .. code-block:: c++
1406
+ //!
1407
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_reduce.cuh>
1408
+ //!
1409
+ //! // Declare, allocate, and initialize device-accessible pointers
1410
+ //! // for input and output
1411
+ //! int num_items; // e.g., 7
1412
+ //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
1413
+ //! KeyValuePair<int, int> *d_argmin; // e.g., [{-,-}]
1414
+ //! ...
1415
+ //!
1416
+ //! // Determine temporary device storage requirements
1417
+ //! void *d_temp_storage = nullptr;
1418
+ //! size_t temp_storage_bytes = 0;
1419
+ //! cub::DeviceReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_argmin, num_items);
1420
+ //!
1421
+ //! // Allocate temporary storage
1422
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
1423
+ //!
1424
+ //! // Run argmin-reduction
1425
+ //! cub::DeviceReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_argmin, num_items);
1426
+ //!
1427
+ //! // d_argmin <-- [{5, 0}]
1428
+ //!
1429
+ //! @endrst
1430
+ //!
1431
+ //! @tparam InputIteratorT
1432
+ //! **[inferred]** Random-access input iterator type for reading input items
1433
+ //! (of some type `T`) @iterator
1434
+ //!
1435
+ //! @tparam OutputIteratorT
1436
+ //! **[inferred]** Output iterator type for recording the reduced aggregate
1437
+ //! (having value type ``cub::KeyValuePair<int, T>``) @iterator
1438
+ //!
1439
+ //! @param[in] d_temp_storage
1440
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
1441
+ //! required allocation size is written to `temp_storage_bytes` and no work is done.
1442
+ //!
1443
+ //! @param[in,out] temp_storage_bytes
1444
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
1445
+ //!
1446
+ //! @param[in] d_in
1447
+ //! Pointer to the input sequence of data items
1448
+ //!
1449
+ //! @param[out] d_out
1450
+ //! Pointer to the output aggregate
1451
+ //!
1452
+ //! @param[in] num_items
1453
+ //! Total number of input items (i.e., length of ``d_in``)
1454
+ //!
1455
+ //! @param[in] stream
1456
+ //! @rst
1457
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
1458
+ //! @endrst
1459
+ template <typename InputIteratorT, typename OutputIteratorT>
1460
+ CCCL_DEPRECATED_BECAUSE("CUB has superseded this interface in favor of the ArgMin interface that takes two separate "
1461
+ "iterators: one iterator to which the extremum is written and another iterator to which the "
1462
+ "index of the found extremum is written. ") CUB_RUNTIME_FUNCTION static cudaError_t
1463
+ ArgMin(void* d_temp_storage,
1464
+ size_t& temp_storage_bytes,
1465
+ InputIteratorT d_in,
1466
+ OutputIteratorT d_out,
1467
+ int num_items,
1468
+ cudaStream_t stream = 0)
1469
+ {
1470
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceReduce::ArgMin");
1471
+
1472
+ // Signed integer type for global offsets
1473
+ using OffsetT = int;
1474
+
1475
+ // The input type
1476
+ using InputValueT = cub::detail::it_value_t<InputIteratorT>;
1477
+
1478
+ // The output tuple type
1479
+ using OutputTupleT = cub::detail::non_void_value_t<OutputIteratorT, KeyValuePair<OffsetT, InputValueT>>;
1480
+
1481
+ using AccumT = OutputTupleT;
1482
+
1483
+ using InitT = detail::reduce::empty_problem_init_t<AccumT>;
1484
+
1485
+ // The output value type
1486
+ using OutputValueT = typename OutputTupleT::Value;
1487
+
1488
+ // Wrapped input iterator to produce index-value <OffsetT, InputT> tuples
1489
+ using ArgIndexInputIteratorT = ArgIndexInputIterator<InputIteratorT, OffsetT, OutputValueT>;
1490
+
1491
+ ArgIndexInputIteratorT d_indexed_in(d_in);
1492
+
1493
+ // Initial value
1494
+ InitT initial_value{AccumT(1, ::cuda::std::numeric_limits<InputValueT>::max())};
1495
+
1496
+ return DispatchReduce<ArgIndexInputIteratorT, OutputIteratorT, OffsetT, cub::ArgMin, InitT, AccumT>::Dispatch(
1497
+ d_temp_storage, temp_storage_bytes, d_indexed_in, d_out, num_items, cub::ArgMin(), initial_value, stream);
1498
+ }
1499
+
1500
+ //! @rst
1501
+ //! Computes a device-wide maximum using the greater-than (``>``) operator.
1502
+ //!
1503
+ //! - Uses ``cuda::std::numeric_limits<T>::lowest()`` as the initial value of the reduction.
1504
+ //! - Does not support ``>`` operators that are non-commutative.
1505
+ //! - Provides "run-to-run" determinism for pseudo-associative reduction
1506
+ //! (e.g., addition of floating point types) on the same GPU device.
1507
+ //! However, results for pseudo-associative reduction may be inconsistent
1508
+ //! from one device to another device of a different compute-capability
1509
+ //! because CUB can employ different tile-sizing for different architectures.
1510
+ //! - The range ``[d_in, d_in + num_items)`` shall not overlap ``d_out``.
1511
+ //! - @devicestorage
1512
+ //!
1513
+ //! Snippet
1514
+ //! +++++++++++++++++++++++++++++++++++++++++++++
1515
+ //!
1516
+ //! The code snippet below illustrates the max-reduction of a device vector of ``int`` data elements.
1517
+ //!
1518
+ //! .. code-block:: c++
1519
+ //!
1520
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_reduce.cuh>
1521
+ //!
1522
+ //! // Declare, allocate, and initialize device-accessible pointers
1523
+ //! // for input and output
1524
+ //! int num_items; // e.g., 7
1525
+ //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
1526
+ //! int *d_max; // e.g., [-]
1527
+ //! ...
1528
+ //!
1529
+ //! // Determine temporary device storage requirements
1530
+ //! void *d_temp_storage = nullptr;
1531
+ //! size_t temp_storage_bytes = 0;
1532
+ //! cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_max, num_items);
1533
+ //!
1534
+ //! // Allocate temporary storage
1535
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
1536
+ //!
1537
+ //! // Run max-reduction
1538
+ //! cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_max, num_items);
1539
+ //!
1540
+ //! // d_max <-- [9]
1541
+ //!
1542
+ //! @endrst
1543
+ //!
1544
+ //! @tparam InputIteratorT
1545
+ //! **[inferred]** Random-access input iterator type for reading input items @iterator
1546
+ //!
1547
+ //! @tparam OutputIteratorT
1548
+ //! **[inferred]** Output iterator type for recording the reduced aggregate @iterator
1549
+ //!
1550
+ //! @tparam NumItemsT
1551
+ //! **[inferred]** Type of num_items
1552
+ //!
1553
+ //! @param[in] d_temp_storage
1554
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
1555
+ //! required allocation size is written to `temp_storage_bytes` and no work is done.
1556
+ //!
1557
+ //! @param[in,out] temp_storage_bytes
1558
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
1559
+ //!
1560
+ //! @param[in] d_in
1561
+ //! Pointer to the input sequence of data items
1562
+ //!
1563
+ //! @param[out] d_out
1564
+ //! Pointer to the output aggregate
1565
+ //!
1566
+ //! @param[in] num_items
1567
+ //! Total number of input items (i.e., length of ``d_in``)
1568
+ //!
1569
+ //! @param[in] stream
1570
+ //! @rst
1571
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
1572
+ //! @endrst
1573
+ template <typename InputIteratorT, typename OutputIteratorT, typename NumItemsT>
1574
+ CUB_RUNTIME_FUNCTION static cudaError_t
1575
+ Max(void* d_temp_storage,
1576
+ size_t& temp_storage_bytes,
1577
+ InputIteratorT d_in,
1578
+ OutputIteratorT d_out,
1579
+ NumItemsT num_items,
1580
+ cudaStream_t stream = 0)
1581
+ {
1582
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceReduce::Max");
1583
+
1584
+ // Signed integer type for global offsets
1585
+ using OffsetT = detail::choose_offset_t<NumItemsT>;
1586
+ using InputT = detail::it_value_t<InputIteratorT>;
1587
+ using InitT = InputT;
1588
+ using limits_t = ::cuda::std::numeric_limits<InitT>;
1589
+ #ifndef CCCL_SUPPRESS_NUMERIC_LIMITS_CHECK_IN_CUB_DEVICE_REDUCE_MIN_MAX
1590
+ static_assert(limits_t::is_specialized,
1591
+ "cub::DeviceReduce::Max uses cuda::std::numeric_limits<InputIteratorT::value_type>::lowest() as "
1592
+ "initial value, but cuda::std::numeric_limits is not specialized for the iterator's value type. This "
1593
+ "is probably a bug and you should specialize cuda::std::numeric_limits. Define "
1594
+ "CCCL_SUPPRESS_NUMERIC_LIMITS_CHECK_IN_CUB_DEVICE_REDUCE_MIN_MAX to suppress this check.");
1595
+ #endif // CCCL_SUPPRESS_NUMERIC_LIMITS_CHECK_IN_CUB_DEVICE_REDUCE_MIN_MAX
1596
+
1597
+ return DispatchReduce<InputIteratorT, OutputIteratorT, OffsetT, ::cuda::maximum<>, InitT>::Dispatch(
1598
+ d_temp_storage,
1599
+ temp_storage_bytes,
1600
+ d_in,
1601
+ d_out,
1602
+ static_cast<OffsetT>(num_items),
1603
+ ::cuda::maximum<>{},
1604
+ limits_t::lowest(),
1605
+ stream);
1606
+ }
1607
+
1608
+ //! @rst
1609
+ //! Computes a device-wide maximum using the greater-than (``>``) operator. The result is written to the output
1610
+ //! iterator.
1611
+ //!
1612
+ //! - Uses ``cuda::std::numeric_limits<T>::lowest()`` as the initial value of the reduction.
1613
+ //! - Provides determinism based on the environment's determinism requirements.
1614
+ //! To request "run-to-run" determinism, pass ``cuda::execution::require(cuda::execution::determinism::run_to_run)``
1615
+ //! as the ``env`` parameter.
1616
+ //! - The range ``[d_in, d_in + num_items)`` shall not overlap ``d_out``.
1617
+ //!
1618
+ //! Snippet
1619
+ //! +++++++++++++++++++++++++++++++++++++++++++++
1620
+ //!
1621
+ //! The code snippet below illustrates the max-reduction of a device vector of ``int`` data elements.
1622
+ //!
1623
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_reduce_env_api.cu
1624
+ //! :language: c++
1625
+ //! :dedent:
1626
+ //! :start-after: example-begin max-env-determinism
1627
+ //! :end-before: example-end max-env-determinism
1628
+ //!
1629
+ //! @endrst
1630
+ //!
1631
+ //! @tparam InputIteratorT
1632
+ //! **[inferred]** Random-access input iterator type for reading input items @iterator
1633
+ //!
1634
+ //! @tparam OutputIteratorT
1635
+ //! **[inferred]** Output iterator type for recording the reduced aggregate @iterator
1636
+ //!
1637
+ //! @tparam NumItemsT
1638
+ //! **[inferred]** Type of num_items
1639
+ //!
1640
+ //! @tparam EnvT
1641
+ //! **[inferred]** Execution environment type. Default is `cuda::std::execution::env<>`.
1642
+ //!
1643
+ //! @param[in] d_in
1644
+ //! Pointer to the input sequence of data items
1645
+ //!
1646
+ //! @param[out] d_out
1647
+ //! Pointer to the output aggregate
1648
+ //!
1649
+ //! @param[in] num_items
1650
+ //! Total number of input items (i.e., length of ``d_in``)
1651
+ //!
1652
+ //! @param[in] env
1653
+ //! @rst
1654
+ //! **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``.
1655
+ //! @endrst
1656
+ template <typename InputIteratorT,
1657
+ typename OutputIteratorT,
1658
+ typename NumItemsT,
1659
+ typename EnvT = ::cuda::std::execution::env<>>
1660
+ [[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t
1661
+ Max(InputIteratorT d_in, OutputIteratorT d_out, NumItemsT num_items, EnvT env = {})
1662
+ {
1663
+ _CCCL_NVTX_RANGE_SCOPE("cub::DeviceReduce::Max");
1664
+
1665
+ static_assert(!_CUDA_STD_EXEC::__queryable_with<EnvT, _CUDA_EXEC::determinism::__get_determinism_t>,
1666
+ "Determinism should be used inside requires to have an effect.");
1667
+ using requirements_t =
1668
+ _CUDA_STD_EXEC::__query_result_or_t<EnvT, _CUDA_EXEC::__get_requirements_t, _CUDA_STD_EXEC::env<>>;
1669
+ using requested_determinism_t =
1670
+ _CUDA_STD_EXEC::__query_result_or_t<requirements_t, //
1671
+ _CUDA_EXEC::determinism::__get_determinism_t,
1672
+ _CUDA_EXEC::determinism::run_to_run_t>;
1673
+
1674
+ // Static assert to reject gpu_to_gpu determinism since it's not properly implemented
1675
+ static_assert(!::cuda::std::is_same_v<requested_determinism_t, ::cuda::execution::determinism::gpu_to_gpu_t>,
1676
+ "gpu_to_gpu determinism is not supported");
1677
+
1678
+ // TODO(NaderAlAwar): Relax this once non-deterministic implementation for min / max is available
1679
+ using determinism_t = ::cuda::execution::determinism::run_to_run_t;
1680
+
1681
+ // Query relevant properties from the environment
1682
+ auto stream = _CUDA_STD_EXEC::__query_or(env, ::cuda::get_stream, ::cuda::stream_ref{cudaStream_t{}});
1683
+ auto mr = _CUDA_STD_EXEC::__query_or(env, ::cuda::mr::__get_memory_resource, detail::device_memory_resource{});
1684
+
1685
+ void* d_temp_storage = nullptr;
1686
+ size_t temp_storage_bytes = 0;
1687
+
1688
+ using tuning_t = _CUDA_STD_EXEC::__query_result_or_t<EnvT, _CUDA_EXEC::__get_tuning_t, _CUDA_STD_EXEC::env<>>;
1689
+
1690
+ using OutputT = cub::detail::non_void_value_t<OutputIteratorT, cub::detail::it_value_t<InputIteratorT>>;
1691
+
1692
+ using InitT = OutputT;
1693
+ using limits_t = ::cuda::std::numeric_limits<InitT>;
1694
+
1695
+ // Query the required temporary storage size
1696
+ cudaError_t error = reduce_impl<tuning_t>(
1697
+ d_temp_storage,
1698
+ temp_storage_bytes,
1699
+ d_in,
1700
+ d_out,
1701
+ num_items,
1702
+ ::cuda::maximum<>{},
1703
+ ::cuda::std::identity{},
1704
+ limits_t::lowest(),
1705
+ determinism_t{},
1706
+ stream.get());
1707
+ if (error != cudaSuccess)
1708
+ {
1709
+ return error;
1710
+ }
1711
+
1712
+ // TODO(gevtushenko): use uninitialized buffer when it's available
1713
+ error = CubDebug(detail::temporary_storage::allocate(stream, d_temp_storage, temp_storage_bytes, mr));
1714
+ if (error != cudaSuccess)
1715
+ {
1716
+ return error;
1717
+ }
1718
+
1719
+ // Run the algorithm
1720
+ error = reduce_impl<tuning_t>(
1721
+ d_temp_storage,
1722
+ temp_storage_bytes,
1723
+ d_in,
1724
+ d_out,
1725
+ num_items,
1726
+ ::cuda::maximum<>{},
1727
+ ::cuda::std::identity{},
1728
+ limits_t::lowest(),
1729
+ determinism_t{},
1730
+ stream.get());
1731
+
1732
+ // Try to deallocate regardless of the error to avoid memory leaks
1733
+ cudaError_t deallocate_error =
1734
+ CubDebug(detail::temporary_storage::deallocate(stream, d_temp_storage, temp_storage_bytes, mr));
1735
+
1736
+ if (error != cudaSuccess)
1737
+ {
1738
+ // Reduction error takes precedence over deallocation error since it happens first
1739
+ return error;
1740
+ }
1741
+
1742
+ return deallocate_error;
1743
+ }
1744
+
1745
+ //! @rst
1746
+ //! Finds the first device-wide maximum using the greater-than (``>``) operator and also returns the index of that
1747
+ //! item.
1748
+ //!
1749
+ //! - The maximum is written to ``d_max_out``
1750
+ //! - The offset of the returned item is written to ``d_index_out``, the offset type being written is of type
1751
+ //! ``cuda::std::int64_t``.
1752
+ //! - For zero-length inputs, ``cuda::std::numeric_limits<T>::lowest()`` is written to ``d_max_out`` and the index
1753
+ //! ``1`` is written to ``d_index_out``.
1754
+ //! - Does not support ``>`` operators that are non-commutative.
1755
+ //! - Provides "run-to-run" determinism for pseudo-associative reduction
1756
+ //! (e.g., addition of floating point types) on the same GPU device.
1757
+ //! However, results for pseudo-associative reduction may be inconsistent
1758
+ //! from one device to another device of a different compute-capability
1759
+ //! because CUB can employ different tile-sizing for different architectures.
1760
+ //! - The range ``[d_in, d_in + num_items)`` shall not overlap ``d_max_out`` nor ``d_index_out``.
1761
+ //! - @devicestorage
1762
+ //!
1763
+ //! Snippet
1764
+ //! +++++++++++++++++++++++++++++++++++++++++++++
1765
+ //!
1766
+ //! The code snippet below illustrates the argmax-reduction of a device vector
1767
+ //! of ``int`` data elements.
1768
+ //!
1769
+ //! .. code-block:: c++
1770
+ //!
1771
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_reduce.cuh>
1772
+ //! #include <cuda/std/cstdint>
1773
+ //!
1774
+ //! // Declare, allocate, and initialize device-accessible pointers
1775
+ //! // for input and output
1776
+ //! int num_items; // e.g., 7
1777
+ //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
1778
+ //! int *d_max_out; // memory for the maximum value
1779
+ //! cuda::std::int64_t *d_index_out; // memory for the index of the returned value
1780
+ //! ...
1781
+ //!
1782
+ //! // Determine temporary device storage requirements
1783
+ //! void *d_temp_storage = nullptr;
1784
+ //! size_t temp_storage_bytes = 0;
1785
+ //! cub::DeviceReduce::ArgMax(
1786
+ //! d_temp_storage, temp_storage_bytes, d_in, d_max_out, d_index_out, num_items);
1787
+ //!
1788
+ //! // Allocate temporary storage
1789
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
1790
+ //!
1791
+ //! // Run argmax-reduction
1792
+ //! cub::DeviceReduce::ArgMax(
1793
+ //! d_temp_storage, temp_storage_bytes, d_in, d_max_out, d_index_out, num_items);
1794
+ //!
1795
+ //! // d_max_out <-- 9
1796
+ //! // d_index_out <-- 6
1797
+ //!
1798
+ //! @endrst
1799
+ //!
1800
+ //! @tparam InputIteratorT
1801
+ //! **[inferred]** Random-access input iterator type for reading input items (of some type `T`) @iterator
1802
+ //!
1803
+ //! @tparam ExtremumOutIteratorT
1804
+ //! **[inferred]** Output iterator type for recording maximum value
1805
+ //!
1806
+ //! @tparam IndexOutIteratorT
1807
+ //! **[inferred]** Output iterator type for recording index of the returned value
1808
+ //!
1809
+ //! @param[in] d_temp_storage
1810
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
1811
+ //! required allocation size is written to `temp_storage_bytes` and no work is done.
1812
+ //!
1813
+ //! @param[in,out] temp_storage_bytes
1814
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
1815
+ //!
1816
+ //! @param[in] d_in
1817
+ //! Pointer to the input sequence of data items
1818
+ //!
1819
+ //! @param[out] d_max_out
1820
+ //! Iterator to which the maximum value is written
1821
+ //!
1822
+ //! @param[out] d_index_out
1823
+ //! Iterator to which the index of the returned value is written
1824
+ //!
1825
+ //! @param[in] num_items
1826
+ //! Total number of input items (i.e., length of ``d_in``)
1827
+ //!
1828
+ //! @param[in] stream
1829
+ //! @rst
1830
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
1831
+ //! @endrst
1832
+ template <typename InputIteratorT, typename ExtremumOutIteratorT, typename IndexOutIteratorT>
1833
+ CUB_RUNTIME_FUNCTION static cudaError_t ArgMax(
1834
+ void* d_temp_storage,
1835
+ size_t& temp_storage_bytes,
1836
+ InputIteratorT d_in,
1837
+ ExtremumOutIteratorT d_max_out,
1838
+ IndexOutIteratorT d_index_out,
1839
+ ::cuda::std::int64_t num_items,
1840
+ cudaStream_t stream = 0)
1841
+ {
1842
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceReduce::ArgMax");
1843
+
1844
+ // The input type
1845
+ using InputValueT = cub::detail::it_value_t<InputIteratorT>;
1846
+
1847
+ // Offset type used within the kernel and to index within one partition
1848
+ using PerPartitionOffsetT = int;
1849
+
1850
+ // Offset type used to index within the total input in the range [d_in, d_in + num_items)
1851
+ using GlobalOffsetT = ::cuda::std::int64_t;
1852
+
1853
+ // The value type used for the extremum
1854
+ using OutputExtremumT = detail::non_void_value_t<ExtremumOutIteratorT, InputValueT>;
1855
+ using InitT = OutputExtremumT;
1856
+
1857
+ // Reduction operation
1858
+ using ReduceOpT = cub::ArgMax;
1859
+
1860
+ // Initial value
1861
+ OutputExtremumT initial_value{::cuda::std::numeric_limits<InputValueT>::lowest()};
1862
+
1863
+ // Tabulate output iterator that unzips the result and writes it to the user-provided output iterators
1864
+ auto out_it = ::cuda::make_tabulate_output_iterator(
1865
+ detail::reduce::unzip_and_write_arg_extremum_op<ExtremumOutIteratorT, IndexOutIteratorT>{d_max_out, d_index_out});
1866
+
1867
+ return detail::reduce::dispatch_streaming_arg_reduce_t<
1868
+ InputIteratorT,
1869
+ decltype(out_it),
1870
+ PerPartitionOffsetT,
1871
+ GlobalOffsetT,
1872
+ ReduceOpT,
1873
+ InitT>::Dispatch(d_temp_storage,
1874
+ temp_storage_bytes,
1875
+ d_in,
1876
+ out_it,
1877
+ static_cast<GlobalOffsetT>(num_items),
1878
+ ReduceOpT{},
1879
+ initial_value,
1880
+ stream);
1881
+ }
1882
+
1883
+ //! @rst
1884
+ //! Finds the first device-wide maximum using the greater-than (``>``)
1885
+ //! operator, also returning the index of that item
1886
+ //!
1887
+ //! - The output value type of ``d_out`` is ``cub::KeyValuePair<int, T>``
1888
+ //! (assuming the value type of ``d_in`` is ``T``)
1889
+ //!
1890
+ //! - The maximum is written to ``d_out.value`` and its offset in the input
1891
+ //! array is written to ``d_out.key``.
1892
+ //! - The ``{1, cuda::std::numeric_limits<T>::lowest()}`` tuple is produced for zero-length inputs
1893
+ //!
1894
+ //! - Does not support ``>`` operators that are non-commutative.
1895
+ //! - Provides "run-to-run" determinism for pseudo-associative reduction
1896
+ //! (e.g., addition of floating point types) on the same GPU device.
1897
+ //! However, results for pseudo-associative reduction may be inconsistent
1898
+ //! from one device to another device of a different compute-capability
1899
+ //! because CUB can employ different tile-sizing for different architectures.
1900
+ //! - The range ``[d_in, d_in + num_items)`` shall not overlap ``d_out``.
1901
+ //! - @devicestorage
1902
+ //!
1903
+ //! Snippet
1904
+ //! +++++++++++++++++++++++++++++++++++++++++++++
1905
+ //!
1906
+ //! The code snippet below illustrates the argmax-reduction of a device vector
1907
+ //! of `int` data elements.
1908
+ //!
1909
+ //! .. code-block:: c++
1910
+ //!
1911
+ //! #include <cub/cub.cuh>
1912
+ //! // or equivalently <cub/device/device_reduce.cuh>
1913
+ //!
1914
+ //! // Declare, allocate, and initialize device-accessible pointers
1915
+ //! // for input and output
1916
+ //! int num_items; // e.g., 7
1917
+ //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
1918
+ //! KeyValuePair<int, int> *d_argmax; // e.g., [{-,-}]
1919
+ //! ...
1920
+ //!
1921
+ //! // Determine temporary device storage requirements
1922
+ //! void *d_temp_storage = nullptr;
1923
+ //! size_t temp_storage_bytes = 0;
1924
+ //! cub::DeviceReduce::ArgMax(
1925
+ //! d_temp_storage, temp_storage_bytes, d_in, d_argmax, num_items);
1926
+ //!
1927
+ //! // Allocate temporary storage
1928
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
1929
+ //!
1930
+ //! // Run argmax-reduction
1931
+ //! cub::DeviceReduce::ArgMax(
1932
+ //! d_temp_storage, temp_storage_bytes, d_in, d_argmax, num_items);
1933
+ //!
1934
+ //! // d_argmax <-- [{6, 9}]
1935
+ //!
1936
+ //! @endrst
1937
+ //!
1938
+ //! @tparam InputIteratorT
1939
+ //! **[inferred]** Random-access input iterator type for reading input items (of some type `T`) @iterator
1940
+ //!
1941
+ //! @tparam OutputIteratorT
1942
+ //! **[inferred]** Output iterator type for recording the reduced aggregate
1943
+ //! (having value type `cub::KeyValuePair<int, T>`) @iterator
1944
+ //!
1945
+ //! @param[in] d_temp_storage
1946
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
1947
+ //! required allocation size is written to `temp_storage_bytes` and no work is done.
1948
+ //!
1949
+ //! @param[in,out] temp_storage_bytes
1950
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
1951
+ //!
1952
+ //! @param[in] d_in
1953
+ //! Pointer to the input sequence of data items
1954
+ //!
1955
+ //! @param[out] d_out
1956
+ //! Pointer to the output aggregate
1957
+ //!
1958
+ //! @param[in] num_items
1959
+ //! Total number of input items (i.e., length of ``d_in``)
1960
+ //!
1961
+ //! @param[in] stream
1962
+ //! @rst
1963
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
1964
+ //! @endrst
1965
template <typename InputIteratorT, typename OutputIteratorT>
CCCL_DEPRECATED_BECAUSE("CUB has superseded this interface in favor of the ArgMax interface that takes two separate "
                        "iterators: one iterator to which the extremum is written and another iterator to which the "
                        "index of the found extremum is written. ") CUB_RUNTIME_FUNCTION static cudaError_t
ArgMax(void* d_temp_storage,
       size_t& temp_storage_bytes,
       InputIteratorT d_in,
       OutputIteratorT d_out,
       int num_items,
       cudaStream_t stream = 0)
{
  _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceReduce::ArgMax");

  // This legacy overload counts items, and reports the winning index, with a plain signed int.
  using index_t = int;

  // Element type read from the input iterator.
  using sample_t = cub::detail::it_value_t<InputIteratorT>;

  // Index/value pair written to d_out; falls back to KeyValuePair<int, sample_t> when the
  // output iterator does not expose a value type. The pair doubles as the accumulator type.
  using pair_t       = cub::detail::non_void_value_t<OutputIteratorT, KeyValuePair<index_t, sample_t>>;
  using pair_value_t = typename pair_t::Value;

  // Wrapper type for the value handed to the dispatcher as the initial value.
  using empty_init_t = detail::reduce::empty_problem_init_t<pair_t>;

  // Input adaptor that yields <index, value> pairs instead of bare values.
  using indexed_input_t = ArgIndexInputIterator<InputIteratorT, index_t, pair_value_t>;

  using dispatch_t = DispatchReduce<indexed_input_t, OutputIteratorT, index_t, cub::ArgMax, empty_init_t, pair_t>;

  indexed_input_t indexed_in(d_in);

  // {1, lowest()} is the documented result for zero-length inputs.
  empty_init_t empty_result{pair_t(1, ::cuda::std::numeric_limits<sample_t>::lowest())};

  return dispatch_t::Dispatch(
    d_temp_storage, temp_storage_bytes, indexed_in, d_out, num_items, cub::ArgMax(), empty_result, stream);
}
2005
+
2006
+ //! @rst
2007
+ //! Finds the first device-wide maximum using the greater-than (``>``) operator and also returns the index of that
2008
+ //! item.
2009
+ //!
2010
+ //! - The maximum is written to ``d_max_out``
2011
+ //! - The offset of the returned item is written to ``d_index_out``, the offset type being written is of type
2012
+ //! ``cuda::std::int64_t``.
2013
+ //! - For zero-length inputs, ``cuda::std::numeric_limits<T>::lowest()`` is written to ``d_max_out`` and the index
2014
+ //! ``1`` is written to ``d_index_out``.
2015
+ //! - Does not support ``>`` operators that are non-commutative.
2016
+ //! - Provides determinism based on the environment's determinism requirements.
2017
+ //! To request "run-to-run" determinism, pass ``cuda::execution::require(cuda::execution::determinism::run_to_run)``
2018
+ //! as the `env` parameter.
2019
+ //! - The range ``[d_in, d_in + num_items)`` shall not overlap ``d_max_out`` nor ``d_index_out``.
2020
+ //!
2021
+ //! Snippet
2022
+ //! +++++++++++++++++++++++++++++++++++++++++++++
2023
+ //!
2024
+ //! The code snippet below illustrates the argmax-reduction of a device vector of ``int`` data elements.
2025
+ //!
2026
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_reduce_env_api.cu
2027
+ //! :language: c++
2028
+ //! :dedent:
2029
+ //! :start-after: example-begin argmax-env-determinism
2030
+ //! :end-before: example-end argmax-env-determinism
2031
+ //!
2032
+ //! @endrst
2033
+ //!
2034
+ //! @tparam InputIteratorT
2035
+ //! **[inferred]** Random-access input iterator type for reading input items
2036
+ //! (of some type `T`) @iterator
2037
+ //!
2038
+ //! @tparam ExtremumOutIteratorT
2039
+ //! **[inferred]** Output iterator type for recording maximum value
2040
+ //!
2041
+ //! @tparam IndexOutIteratorT
2042
+ //! **[inferred]** Output iterator type for recording index of the returned value
2043
+ //!
2044
+ //! @tparam EnvT
2045
+ //! **[inferred]** Execution environment type. Default is ``cuda::std::execution::env<>``.
2046
+ //!
2047
+ //! @param[in] d_in
2048
+ //! Iterator to the input sequence of data items
2049
+ //!
2050
+ //! @param[out] d_max_out
2051
+ //! Iterator to which the maximum value is written
2052
+ //!
2053
+ //! @param[out] d_index_out
2054
+ //! Iterator to which the index of the returned value is written
2055
+ //!
2056
+ //! @param[in] num_items
2057
+ //! Total number of input items (i.e., length of ``d_in``)
2058
+ //!
2059
+ //! @param[in] env
2060
+ //! @rst
2061
+ //! **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``.
2062
+ //! @endrst
2063
template <typename InputIteratorT,
          typename ExtremumOutIteratorT,
          typename IndexOutIteratorT,
          typename EnvT = ::cuda::std::execution::env<>>
[[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t
ArgMax(InputIteratorT d_in,
       ExtremumOutIteratorT d_max_out,
       IndexOutIteratorT d_index_out,
       ::cuda::std::int64_t num_items,
       EnvT env = {})
{
  _CCCL_NVTX_RANGE_SCOPE("cub::DeviceReduce::ArgMax");

  // A determinism property placed directly in the environment is rejected at compile time;
  // it has to be wrapped in a requirements query instead.
  static_assert(!_CUDA_STD_EXEC::__queryable_with<EnvT, _CUDA_EXEC::determinism::__get_determinism_t>,
                "Determinism should be used inside requires to have an effect.");
  using requirements_t =
    _CUDA_STD_EXEC::__query_result_or_t<EnvT, _CUDA_EXEC::__get_requirements_t, _CUDA_STD_EXEC::env<>>;
  // run_to_run_t is the fallback type when the requirements carry no determinism entry.
  using requested_determinism_t =
    _CUDA_STD_EXEC::__query_result_or_t<requirements_t, //
                                        _CUDA_EXEC::determinism::__get_determinism_t,
                                        _CUDA_EXEC::determinism::run_to_run_t>;

  // Static assert to reject gpu_to_gpu determinism since it's not properly implemented
  static_assert(!::cuda::std::is_same_v<requested_determinism_t, ::cuda::execution::determinism::gpu_to_gpu_t>,
                "gpu_to_gpu determinism is not supported");

  // TODO(NaderAlAwar): Relax this once non-deterministic implementation for min / max is available
  using determinism_t = ::cuda::execution::determinism::run_to_run_t;

  // Query the stream and memory resource from the environment, with the given fallbacks
  auto stream = _CUDA_STD_EXEC::__query_or(env, ::cuda::get_stream, ::cuda::stream_ref{cudaStream_t{}});
  auto mr     = _CUDA_STD_EXEC::__query_or(env, ::cuda::mr::__get_memory_resource, detail::device_memory_resource{});

  // Temporary storage is managed internally by this overload (size query, allocate, run, free)
  void* d_temp_storage      = nullptr;
  size_t temp_storage_bytes = 0;

  using tuning_t = _CUDA_STD_EXEC::__query_result_or_t<EnvT, _CUDA_EXEC::__get_tuning_t, _CUDA_STD_EXEC::env<>>;

  // Reduction operation
  using ReduceOpT           = cub::ArgMax;
  using InputValueT         = cub::detail::it_value_t<InputIteratorT>;
  using PerPartitionOffsetT = int;
  using GlobalOffsetT       = ::cuda::std::int64_t;

  using OutputExtremumT = detail::non_void_value_t<ExtremumOutIteratorT, InputValueT>;
  using InitT           = OutputExtremumT;

  // Initial value.
  // This must be lowest(): the documentation of this overload states that lowest() is written
  // to d_max_out for zero-length inputs, and the legacy KeyValuePair overload seeds with
  // lowest() as well. max() is the identity of a *minimum* reduction, not of a maximum.
  OutputExtremumT initial_value{::cuda::std::numeric_limits<InputValueT>::lowest()};

  // Tabulate output iterator that unzips the result and writes it to the user-provided output iterators
  auto out_it = ::cuda::make_tabulate_output_iterator(
    detail::reduce::unzip_and_write_arg_extremum_op<ExtremumOutIteratorT, IndexOutIteratorT>{d_max_out, d_index_out});

  // Name the dispatcher once so the size query and the actual run are guaranteed to use the same type
  using dispatch_t = detail::reduce::dispatch_streaming_arg_reduce_t<
    InputIteratorT,
    decltype(out_it),
    PerPartitionOffsetT,
    GlobalOffsetT,
    ReduceOpT,
    InitT>;

  // Query the required temporary storage size (d_temp_storage is still nullptr here)
  cudaError_t error = dispatch_t::Dispatch(
    d_temp_storage,
    temp_storage_bytes,
    d_in,
    out_it,
    static_cast<GlobalOffsetT>(num_items),
    ReduceOpT{},
    initial_value,
    stream.get());
  if (error != cudaSuccess)
  {
    return error;
  }

  // TODO(gevtushenko): use uninitialized buffer when it's available
  error = CubDebug(detail::temporary_storage::allocate(stream, d_temp_storage, temp_storage_bytes, mr));
  if (error != cudaSuccess)
  {
    return error;
  }

  // Run the algorithm
  error = dispatch_t::Dispatch(
    d_temp_storage,
    temp_storage_bytes,
    d_in,
    out_it,
    static_cast<GlobalOffsetT>(num_items),
    ReduceOpT{},
    initial_value,
    stream.get());

  // Try to deallocate regardless of the error to avoid memory leaks
  cudaError_t deallocate_error =
    CubDebug(detail::temporary_storage::deallocate(stream, d_temp_storage, temp_storage_bytes, mr));

  if (error != cudaSuccess)
  {
    // Reduction error takes precedence over deallocation error since it happens first
    return error;
  }

  return deallocate_error;
}
2172
+
2173
+ //! @rst
2174
+ //! Fuses transform and reduce operations
2175
+ //!
2176
+ //! - Does not support binary reduction operators that are non-commutative.
2177
+ //! - Provides "run-to-run" determinism for pseudo-associative reduction
2178
+ //! (e.g., addition of floating point types) on the same GPU device.
2179
+ //! However, results for pseudo-associative reduction may be inconsistent
2180
+ //! from one device to another device of a different compute-capability
2181
+ //! because CUB can employ different tile-sizing for different architectures.
2182
+ //! - The range ``[d_in, d_in + num_items)`` shall not overlap ``d_out``.
2183
+ //! - @devicestorage
2184
+ //!
2185
+ //! Snippet
2186
+ //! +++++++++++++++++++++++++++++++++++++++++++++
2187
+ //!
2188
+ //! The code snippet below illustrates a fused transform-reduce (sum of squares plus an initial value) of a
2189
+ //! device vector of `int` data elements.
2190
+ //!
2191
+ //! .. code-block:: c++
2192
+ //!
2193
+ //! #include <cub/cub.cuh>
2194
+ //! // or equivalently <cub/device/device_reduce.cuh>
2195
+ //!
2196
+ //! thrust::device_vector<int> in = { 1, 2, 3, 4 };
2197
+ //! thrust::device_vector<int> out(1);
2198
+ //!
2199
+ //! size_t temp_storage_bytes = 0;
2200
+ //! uint8_t *d_temp_storage = nullptr;
2201
+ //!
2202
+ //! const int init = 42;
2203
+ //!
2204
+ //! cub::DeviceReduce::TransformReduce(
2205
+ //! d_temp_storage,
2206
+ //! temp_storage_bytes,
2207
+ //! in.begin(),
2208
+ //! out.begin(),
2209
+ //! in.size(),
2210
+ //! cuda::std::plus<>{},
2211
+ //! square_t{},
2212
+ //! init);
2213
+ //!
2214
+ //! thrust::device_vector<uint8_t> temp_storage(temp_storage_bytes);
2215
+ //! d_temp_storage = temp_storage.data().get();
2216
+ //!
2217
+ //! cub::DeviceReduce::TransformReduce(
2218
+ //! d_temp_storage,
2219
+ //! temp_storage_bytes,
2220
+ //! in.begin(),
2221
+ //! out.begin(),
2222
+ //! in.size(),
2223
+ //! cuda::std::plus<>{},
2224
+ //! square_t{},
2225
+ //! init);
2226
+ //!
2227
+ //! // out[0] <-- 72
2228
+ //!
2229
+ //! @endrst
2230
+ //!
2231
+ //! @tparam InputIteratorT
2232
+ //! **[inferred]** Random-access input iterator type for reading input items @iterator
2233
+ //!
2234
+ //! @tparam OutputIteratorT
2235
+ //! **[inferred]** Output iterator type for recording the reduced aggregate @iterator
2236
+ //!
2237
+ //! @tparam ReductionOpT
2238
+ //! **[inferred]** Binary reduction functor type having member `T operator()(const T &a, const T &b)`
2239
+ //!
2240
+ //! @tparam TransformOpT
2241
+ //! **[inferred]** Unary transform functor type having member `auto operator()(const T &a)`
2242
+ //!
2243
+ //! @tparam T
2244
+ //! **[inferred]** Data element type that is convertible to the `value` type of `InputIteratorT`
2245
+ //!
2246
+ //! @tparam NumItemsT
2247
+ //! **[inferred]** Type of num_items
2248
+ //!
2249
+ //! @param[in] d_temp_storage
2250
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
2251
+ //! required allocation size is written to `temp_storage_bytes` and no work is done.
2252
+ //!
2253
+ //! @param[in,out] temp_storage_bytes
2254
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
2255
+ //!
2256
+ //! @param[in] d_in
2257
+ //! Pointer to the input sequence of data items
2258
+ //!
2259
+ //! @param[out] d_out
2260
+ //! Pointer to the output aggregate
2261
+ //!
2262
+ //! @param[in] num_items
2263
+ //! Total number of input items (i.e., length of ``d_in``)
2264
+ //!
2265
+ //! @param[in] reduction_op
2266
+ //! Binary reduction functor
2267
+ //!
2268
+ //! @param[in] transform_op
2269
+ //! Unary transform functor
2270
+ //!
2271
+ //! @param[in] init
2272
+ //! Initial value of the reduction
2273
+ //!
2274
+ //! @param[in] stream
2275
+ //! @rst
2276
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
2277
+ //! @endrst
2278
template <typename InputIteratorT,
          typename OutputIteratorT,
          typename ReductionOpT,
          typename TransformOpT,
          typename T,
          typename NumItemsT>
CUB_RUNTIME_FUNCTION static cudaError_t TransformReduce(
  void* d_temp_storage,
  size_t& temp_storage_bytes,
  InputIteratorT d_in,
  OutputIteratorT d_out,
  NumItemsT num_items,
  ReductionOpT reduction_op,
  TransformOpT transform_op,
  T init,
  cudaStream_t stream = 0)
{
  _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceReduce::TransformReduce");

  // Offset type derived from the caller's item-count type.
  using offset_t = detail::choose_offset_t<NumItemsT>;

  using dispatch_t =
    DispatchTransformReduce<InputIteratorT, OutputIteratorT, offset_t, ReductionOpT, TransformOpT, T>;

  // Mind the dispatcher's argument order: `init` and `stream` come before `transform_op`.
  return dispatch_t::Dispatch(
    d_temp_storage,
    temp_storage_bytes,
    d_in,
    d_out,
    static_cast<offset_t>(num_items),
    reduction_op,
    init,
    stream,
    transform_op);
}
2310
+
2311
+ //! @rst
2312
+ //! Reduces segments of values, where segments are demarcated by corresponding runs of identical keys.
2313
+ //!
2314
+ //! This operation computes segmented reductions within ``d_values_in`` using the specified binary ``reduction_op``
2315
+ //! functor. The segments are identified by "runs" of corresponding keys in `d_keys_in`, where runs are maximal
2316
+ //! ranges of consecutive, identical keys. For the *i*\ :sup:`th` run encountered, the first key of the run and
2317
+ //! the corresponding value aggregate of that run are written to ``d_unique_out[i]`` and ``d_aggregates_out[i]``,
2318
+ //! respectively. The total number of runs encountered is written to ``d_num_runs_out``.
2319
+ //!
2320
+ //! - The ``==`` equality operator is used to determine whether keys are equivalent
2321
+ //! - Provides "run-to-run" determinism for pseudo-associative reduction
2322
+ //! (e.g., addition of floating point types) on the same GPU device.
2323
+ //! However, results for pseudo-associative reduction may be inconsistent
2324
+ //! from one device to another device of a different compute-capability
2325
+ //! because CUB can employ different tile-sizing for different architectures.
2326
+ //! - Let ``out`` be any of
2327
+ //! ``[d_unique_out, d_unique_out + *d_num_runs_out)``
2328
+ //! ``[d_aggregates_out, d_aggregates_out + *d_num_runs_out)``
2329
+ //! ``d_num_runs_out``. The ranges represented by ``out`` shall not overlap
2330
+ //! ``[d_keys_in, d_keys_in + num_items)``,
2331
+ //! ``[d_values_in, d_values_in + num_items)`` nor ``out`` in any way.
2332
+ //! - @devicestorage
2333
+ //!
2334
+ //! Snippet
2335
+ //! +++++++++++++++++++++++++++++++++++++++++++++
2336
+ //!
2337
+ //! The code snippet below illustrates the segmented reduction of ``int`` values grouped by runs of
2338
+ //! associated ``int`` keys.
2339
+ //!
2340
+ //! .. code-block:: c++
2341
+ //!
2342
+ //! #include <cub/cub.cuh>
2343
+ //! // or equivalently <cub/device/device_reduce.cuh>
2344
+ //!
2345
+ //! // CustomMin functor
2346
+ //! struct CustomMin
2347
+ //! {
2348
+ //! template <typename T>
2349
+ //! __device__ __forceinline__
2350
+ //! T operator()(const T &a, const T &b) const {
2351
+ //! return (b < a) ? b : a;
2352
+ //! }
2353
+ //! };
2354
+ //!
2355
+ //! // Declare, allocate, and initialize device-accessible pointers
2356
+ //! // for input and output
2357
+ //! int num_items; // e.g., 8
2358
+ //! int *d_keys_in; // e.g., [0, 2, 2, 9, 5, 5, 5, 8]
2359
+ //! int *d_values_in; // e.g., [0, 7, 1, 6, 2, 5, 3, 4]
2360
+ //! int *d_unique_out; // e.g., [-, -, -, -, -, -, -, -]
2361
+ //! int *d_aggregates_out; // e.g., [-, -, -, -, -, -, -, -]
2362
+ //! int *d_num_runs_out; // e.g., [-]
2363
+ //! CustomMin reduction_op;
2364
+ //! ...
2365
+ //!
2366
+ //! // Determine temporary device storage requirements
2367
+ //! void *d_temp_storage = nullptr;
2368
+ //! size_t temp_storage_bytes = 0;
2369
+ //! cub::DeviceReduce::ReduceByKey(
2370
+ //! d_temp_storage, temp_storage_bytes,
2371
+ //! d_keys_in, d_unique_out, d_values_in,
2372
+ //! d_aggregates_out, d_num_runs_out, reduction_op, num_items);
2373
+ //!
2374
+ //! // Allocate temporary storage
2375
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
2376
+ //!
2377
+ //! // Run reduce-by-key
2378
+ //! cub::DeviceReduce::ReduceByKey(
2379
+ //! d_temp_storage, temp_storage_bytes,
2380
+ //! d_keys_in, d_unique_out, d_values_in,
2381
+ //! d_aggregates_out, d_num_runs_out, reduction_op, num_items);
2382
+ //!
2383
+ //! // d_unique_out <-- [0, 2, 9, 5, 8]
2384
+ //! // d_aggregates_out <-- [0, 1, 6, 2, 4]
2385
+ //! // d_num_runs_out <-- [5]
2386
+ //!
2387
+ //! @endrst
2388
+ //!
2389
+ //! @tparam KeysInputIteratorT
2390
+ //! **[inferred]** Random-access input iterator type for reading input keys @iterator
2391
+ //!
2392
+ //! @tparam UniqueOutputIteratorT
2393
+ //! **[inferred]** Random-access output iterator type for writing unique output keys @iterator
2394
+ //!
2395
+ //! @tparam ValuesInputIteratorT
2396
+ //! **[inferred]** Random-access input iterator type for reading input values @iterator
2397
+ //!
2398
+ //! @tparam AggregatesOutputIteratorT
2399
+ //! **[inferred]** Random-access output iterator type for writing output value aggregates @iterator
2400
+ //!
2401
+ //! @tparam NumRunsOutputIteratorT
2402
+ //! **[inferred]** Output iterator type for recording the number of runs encountered @iterator
2403
+ //!
2404
+ //! @tparam ReductionOpT
2405
+ //! **[inferred]** Binary reduction functor type having member `T operator()(const T &a, const T &b)`
2406
+ //!
2407
+ //! @tparam NumItemsT
2408
+ //! **[inferred]** Type of num_items
2409
+ //!
2410
+ //! @param[in] d_temp_storage
2411
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
2412
+ //! required allocation size is written to `temp_storage_bytes` and no work is done.
2413
+ //!
2414
+ //! @param[in,out] temp_storage_bytes
2415
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
2416
+ //!
2417
+ //! @param[in] d_keys_in
2418
+ //! Pointer to the input sequence of keys
2419
+ //!
2420
+ //! @param[out] d_unique_out
2421
+ //! Pointer to the output sequence of unique keys (one key per run)
2422
+ //!
2423
+ //! @param[in] d_values_in
2424
+ //! Pointer to the input sequence of corresponding values
2425
+ //!
2426
+ //! @param[out] d_aggregates_out
2427
+ //! Pointer to the output sequence of value aggregates
2428
+ //! (one aggregate per run)
2429
+ //!
2430
+ //! @param[out] d_num_runs_out
2431
+ //! Pointer to total number of runs encountered
2432
+ //! (i.e., the length of ``d_unique_out``)
2433
+ //!
2434
+ //! @param[in] reduction_op
2435
+ //! Binary reduction functor
2436
+ //!
2437
+ //! @param[in] num_items
2438
+ //! Total number of associated key+value pairs
2439
+ //! (i.e., the length of ``d_keys_in`` and ``d_values_in``)
2440
+ //!
2441
+ //! @param[in] stream
2442
+ //! @rst
2443
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
2444
+ //! @endrst
2445
template <typename KeysInputIteratorT,
          typename UniqueOutputIteratorT,
          typename ValuesInputIteratorT,
          typename AggregatesOutputIteratorT,
          typename NumRunsOutputIteratorT,
          typename ReductionOpT,
          typename NumItemsT>
CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t ReduceByKey(
  void* d_temp_storage,
  size_t& temp_storage_bytes,
  KeysInputIteratorT d_keys_in,
  UniqueOutputIteratorT d_unique_out,
  ValuesInputIteratorT d_values_in,
  AggregatesOutputIteratorT d_aggregates_out,
  NumRunsOutputIteratorT d_num_runs_out,
  ReductionOpT reduction_op,
  NumItemsT num_items,
  cudaStream_t stream = 0)
{
  _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceReduce::ReduceByKey");

  // Offset type derived from the caller's item-count type.
  using offset_t = detail::choose_offset_t<NumItemsT>;

  // Keys are compared with the default `==`.
  using key_equal_t = ::cuda::std::equal_to<>;

  using dispatch_t = DispatchReduceByKey<
    KeysInputIteratorT,
    UniqueOutputIteratorT,
    ValuesInputIteratorT,
    AggregatesOutputIteratorT,
    NumRunsOutputIteratorT,
    key_equal_t,
    ReductionOpT,
    offset_t>;

  return dispatch_t::Dispatch(
    d_temp_storage,
    temp_storage_bytes,
    d_keys_in,
    d_unique_out,
    d_values_in,
    d_aggregates_out,
    d_num_runs_out,
    key_equal_t{},
    reduction_op,
    static_cast<offset_t>(num_items),
    stream);
}
2496
+ };
2497
+ CUB_NAMESPACE_END