cuda-cccl 0.3.3__cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cuda-cccl might be problematic. Click here for more details.

Files changed (1968)
  1. cuda/cccl/__init__.py +27 -0
  2. cuda/cccl/_cuda_version_utils.py +24 -0
  3. cuda/cccl/cooperative/__init__.py +9 -0
  4. cuda/cccl/cooperative/experimental/__init__.py +24 -0
  5. cuda/cccl/headers/__init__.py +7 -0
  6. cuda/cccl/headers/include/__init__.py +1 -0
  7. cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +259 -0
  8. cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +1182 -0
  9. cuda/cccl/headers/include/cub/agent/agent_for.cuh +81 -0
  10. cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +709 -0
  11. cuda/cccl/headers/include/cub/agent/agent_merge.cuh +234 -0
  12. cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +748 -0
  13. cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +786 -0
  14. cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +286 -0
  15. cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +703 -0
  16. cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +555 -0
  17. cuda/cccl/headers/include/cub/agent/agent_reduce.cuh +619 -0
  18. cuda/cccl/headers/include/cub/agent/agent_reduce_by_key.cuh +806 -0
  19. cuda/cccl/headers/include/cub/agent/agent_rle.cuh +1124 -0
  20. cuda/cccl/headers/include/cub/agent/agent_scan.cuh +589 -0
  21. cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +474 -0
  22. cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +289 -0
  23. cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +1117 -0
  24. cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +346 -0
  25. cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +606 -0
  26. cuda/cccl/headers/include/cub/agent/agent_topk.cuh +764 -0
  27. cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +631 -0
  28. cuda/cccl/headers/include/cub/agent/single_pass_scan_operators.cuh +1424 -0
  29. cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +963 -0
  30. cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +1227 -0
  31. cuda/cccl/headers/include/cub/block/block_exchange.cuh +1313 -0
  32. cuda/cccl/headers/include/cub/block/block_histogram.cuh +424 -0
  33. cuda/cccl/headers/include/cub/block/block_load.cuh +1264 -0
  34. cuda/cccl/headers/include/cub/block/block_load_to_shared.cuh +432 -0
  35. cuda/cccl/headers/include/cub/block/block_merge_sort.cuh +800 -0
  36. cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +1225 -0
  37. cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +2196 -0
  38. cuda/cccl/headers/include/cub/block/block_raking_layout.cuh +150 -0
  39. cuda/cccl/headers/include/cub/block/block_reduce.cuh +667 -0
  40. cuda/cccl/headers/include/cub/block/block_run_length_decode.cuh +434 -0
  41. cuda/cccl/headers/include/cub/block/block_scan.cuh +2315 -0
  42. cuda/cccl/headers/include/cub/block/block_shuffle.cuh +346 -0
  43. cuda/cccl/headers/include/cub/block/block_store.cuh +1247 -0
  44. cuda/cccl/headers/include/cub/block/radix_rank_sort_operations.cuh +624 -0
  45. cuda/cccl/headers/include/cub/block/specializations/block_histogram_atomic.cuh +86 -0
  46. cuda/cccl/headers/include/cub/block/specializations/block_histogram_sort.cuh +240 -0
  47. cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking.cuh +252 -0
  48. cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking_commutative_only.cuh +238 -0
  49. cuda/cccl/headers/include/cub/block/specializations/block_reduce_warp_reductions.cuh +281 -0
  50. cuda/cccl/headers/include/cub/block/specializations/block_scan_raking.cuh +790 -0
  51. cuda/cccl/headers/include/cub/block/specializations/block_scan_warp_scans.cuh +538 -0
  52. cuda/cccl/headers/include/cub/config.cuh +53 -0
  53. cuda/cccl/headers/include/cub/cub.cuh +120 -0
  54. cuda/cccl/headers/include/cub/detail/array_utils.cuh +78 -0
  55. cuda/cccl/headers/include/cub/detail/choose_offset.cuh +161 -0
  56. cuda/cccl/headers/include/cub/detail/detect_cuda_runtime.cuh +74 -0
  57. cuda/cccl/headers/include/cub/detail/device_double_buffer.cuh +96 -0
  58. cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +62 -0
  59. cuda/cccl/headers/include/cub/detail/fast_modulo_division.cuh +253 -0
  60. cuda/cccl/headers/include/cub/detail/integer_utils.cuh +88 -0
  61. cuda/cccl/headers/include/cub/detail/launcher/cuda_driver.cuh +142 -0
  62. cuda/cccl/headers/include/cub/detail/launcher/cuda_runtime.cuh +100 -0
  63. cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +114 -0
  64. cuda/cccl/headers/include/cub/detail/ptx-json/README.md +71 -0
  65. cuda/cccl/headers/include/cub/detail/ptx-json/array.h +68 -0
  66. cuda/cccl/headers/include/cub/detail/ptx-json/json.h +62 -0
  67. cuda/cccl/headers/include/cub/detail/ptx-json/object.h +100 -0
  68. cuda/cccl/headers/include/cub/detail/ptx-json/string.h +53 -0
  69. cuda/cccl/headers/include/cub/detail/ptx-json/value.h +95 -0
  70. cuda/cccl/headers/include/cub/detail/ptx-json-parser.h +63 -0
  71. cuda/cccl/headers/include/cub/detail/rfa.cuh +731 -0
  72. cuda/cccl/headers/include/cub/detail/strong_load.cuh +189 -0
  73. cuda/cccl/headers/include/cub/detail/strong_store.cuh +220 -0
  74. cuda/cccl/headers/include/cub/detail/temporary_storage.cuh +384 -0
  75. cuda/cccl/headers/include/cub/detail/type_traits.cuh +187 -0
  76. cuda/cccl/headers/include/cub/detail/uninitialized_copy.cuh +73 -0
  77. cuda/cccl/headers/include/cub/detail/unsafe_bitcast.cuh +56 -0
  78. cuda/cccl/headers/include/cub/device/device_adjacent_difference.cuh +596 -0
  79. cuda/cccl/headers/include/cub/device/device_copy.cuh +276 -0
  80. cuda/cccl/headers/include/cub/device/device_for.cuh +1063 -0
  81. cuda/cccl/headers/include/cub/device/device_histogram.cuh +1509 -0
  82. cuda/cccl/headers/include/cub/device/device_memcpy.cuh +195 -0
  83. cuda/cccl/headers/include/cub/device/device_merge.cuh +203 -0
  84. cuda/cccl/headers/include/cub/device/device_merge_sort.cuh +979 -0
  85. cuda/cccl/headers/include/cub/device/device_partition.cuh +668 -0
  86. cuda/cccl/headers/include/cub/device/device_radix_sort.cuh +3437 -0
  87. cuda/cccl/headers/include/cub/device/device_reduce.cuh +2518 -0
  88. cuda/cccl/headers/include/cub/device/device_run_length_encode.cuh +370 -0
  89. cuda/cccl/headers/include/cub/device/device_scan.cuh +2212 -0
  90. cuda/cccl/headers/include/cub/device/device_segmented_radix_sort.cuh +1496 -0
  91. cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +1430 -0
  92. cuda/cccl/headers/include/cub/device/device_segmented_sort.cuh +2811 -0
  93. cuda/cccl/headers/include/cub/device/device_select.cuh +1228 -0
  94. cuda/cccl/headers/include/cub/device/device_topk.cuh +511 -0
  95. cuda/cccl/headers/include/cub/device/device_transform.cuh +668 -0
  96. cuda/cccl/headers/include/cub/device/dispatch/dispatch_adjacent_difference.cuh +315 -0
  97. cuda/cccl/headers/include/cub/device/dispatch/dispatch_batch_memcpy.cuh +719 -0
  98. cuda/cccl/headers/include/cub/device/dispatch/dispatch_common.cuh +43 -0
  99. cuda/cccl/headers/include/cub/device/dispatch/dispatch_copy_mdspan.cuh +79 -0
  100. cuda/cccl/headers/include/cub/device/dispatch/dispatch_for.cuh +198 -0
  101. cuda/cccl/headers/include/cub/device/dispatch/dispatch_histogram.cuh +1046 -0
  102. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +303 -0
  103. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge_sort.cuh +473 -0
  104. cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +1744 -0
  105. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +1310 -0
  106. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_by_key.cuh +655 -0
  107. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +531 -0
  108. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +313 -0
  109. cuda/cccl/headers/include/cub/device/dispatch/dispatch_rle.cuh +615 -0
  110. cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan.cuh +517 -0
  111. cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan_by_key.cuh +602 -0
  112. cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +975 -0
  113. cuda/cccl/headers/include/cub/device/dispatch/dispatch_select_if.cuh +842 -0
  114. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +341 -0
  115. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +440 -0
  116. cuda/cccl/headers/include/cub/device/dispatch/dispatch_three_way_partition.cuh +389 -0
  117. cuda/cccl/headers/include/cub/device/dispatch/dispatch_topk.cuh +627 -0
  118. cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +569 -0
  119. cuda/cccl/headers/include/cub/device/dispatch/dispatch_unique_by_key.cuh +545 -0
  120. cuda/cccl/headers/include/cub/device/dispatch/kernels/for_each.cuh +261 -0
  121. cuda/cccl/headers/include/cub/device/dispatch/kernels/histogram.cuh +505 -0
  122. cuda/cccl/headers/include/cub/device/dispatch/kernels/merge_sort.cuh +334 -0
  123. cuda/cccl/headers/include/cub/device/dispatch/kernels/radix_sort.cuh +803 -0
  124. cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +583 -0
  125. cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +189 -0
  126. cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +321 -0
  127. cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_sort.cuh +522 -0
  128. cuda/cccl/headers/include/cub/device/dispatch/kernels/three_way_partition.cuh +201 -0
  129. cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +1028 -0
  130. cuda/cccl/headers/include/cub/device/dispatch/kernels/unique_by_key.cuh +176 -0
  131. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +67 -0
  132. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +118 -0
  133. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +60 -0
  134. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +275 -0
  135. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +76 -0
  136. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +126 -0
  137. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +1065 -0
  138. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce.cuh +493 -0
  139. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +942 -0
  140. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +673 -0
  141. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +618 -0
  142. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +1010 -0
  143. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +398 -0
  144. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_select_if.cuh +1588 -0
  145. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +440 -0
  146. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_topk.cuh +85 -0
  147. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +481 -0
  148. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +884 -0
  149. cuda/cccl/headers/include/cub/grid/grid_even_share.cuh +227 -0
  150. cuda/cccl/headers/include/cub/grid/grid_mapping.cuh +106 -0
  151. cuda/cccl/headers/include/cub/grid/grid_queue.cuh +202 -0
  152. cuda/cccl/headers/include/cub/iterator/arg_index_input_iterator.cuh +254 -0
  153. cuda/cccl/headers/include/cub/iterator/cache_modified_input_iterator.cuh +259 -0
  154. cuda/cccl/headers/include/cub/iterator/cache_modified_output_iterator.cuh +250 -0
  155. cuda/cccl/headers/include/cub/iterator/tex_obj_input_iterator.cuh +320 -0
  156. cuda/cccl/headers/include/cub/thread/thread_load.cuh +349 -0
  157. cuda/cccl/headers/include/cub/thread/thread_operators.cuh +688 -0
  158. cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +548 -0
  159. cuda/cccl/headers/include/cub/thread/thread_scan.cuh +498 -0
  160. cuda/cccl/headers/include/cub/thread/thread_search.cuh +199 -0
  161. cuda/cccl/headers/include/cub/thread/thread_simd.cuh +458 -0
  162. cuda/cccl/headers/include/cub/thread/thread_sort.cuh +102 -0
  163. cuda/cccl/headers/include/cub/thread/thread_store.cuh +365 -0
  164. cuda/cccl/headers/include/cub/util_allocator.cuh +921 -0
  165. cuda/cccl/headers/include/cub/util_arch.cuh +167 -0
  166. cuda/cccl/headers/include/cub/util_cpp_dialect.cuh +95 -0
  167. cuda/cccl/headers/include/cub/util_debug.cuh +207 -0
  168. cuda/cccl/headers/include/cub/util_device.cuh +800 -0
  169. cuda/cccl/headers/include/cub/util_macro.cuh +97 -0
  170. cuda/cccl/headers/include/cub/util_math.cuh +118 -0
  171. cuda/cccl/headers/include/cub/util_namespace.cuh +176 -0
  172. cuda/cccl/headers/include/cub/util_policy_wrapper_t.cuh +55 -0
  173. cuda/cccl/headers/include/cub/util_ptx.cuh +513 -0
  174. cuda/cccl/headers/include/cub/util_temporary_storage.cuh +122 -0
  175. cuda/cccl/headers/include/cub/util_type.cuh +1120 -0
  176. cuda/cccl/headers/include/cub/util_vsmem.cuh +253 -0
  177. cuda/cccl/headers/include/cub/version.cuh +89 -0
  178. cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_shfl.cuh +329 -0
  179. cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_smem.cuh +177 -0
  180. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +737 -0
  181. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +408 -0
  182. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +952 -0
  183. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_smem.cuh +715 -0
  184. cuda/cccl/headers/include/cub/warp/warp_exchange.cuh +405 -0
  185. cuda/cccl/headers/include/cub/warp/warp_load.cuh +614 -0
  186. cuda/cccl/headers/include/cub/warp/warp_merge_sort.cuh +169 -0
  187. cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +829 -0
  188. cuda/cccl/headers/include/cub/warp/warp_scan.cuh +1890 -0
  189. cuda/cccl/headers/include/cub/warp/warp_store.cuh +521 -0
  190. cuda/cccl/headers/include/cub/warp/warp_utils.cuh +61 -0
  191. cuda/cccl/headers/include/cuda/__algorithm/common.h +68 -0
  192. cuda/cccl/headers/include/cuda/__algorithm/copy.h +196 -0
  193. cuda/cccl/headers/include/cuda/__algorithm/fill.h +107 -0
  194. cuda/cccl/headers/include/cuda/__annotated_ptr/access_property.h +165 -0
  195. cuda/cccl/headers/include/cuda/__annotated_ptr/access_property_encoding.h +172 -0
  196. cuda/cccl/headers/include/cuda/__annotated_ptr/annotated_ptr.h +217 -0
  197. cuda/cccl/headers/include/cuda/__annotated_ptr/annotated_ptr_base.h +100 -0
  198. cuda/cccl/headers/include/cuda/__annotated_ptr/apply_access_property.h +83 -0
  199. cuda/cccl/headers/include/cuda/__annotated_ptr/associate_access_property.h +128 -0
  200. cuda/cccl/headers/include/cuda/__annotated_ptr/createpolicy.h +210 -0
  201. cuda/cccl/headers/include/cuda/__atomic/atomic.h +145 -0
  202. cuda/cccl/headers/include/cuda/__barrier/async_contract_fulfillment.h +39 -0
  203. cuda/cccl/headers/include/cuda/__barrier/barrier.h +65 -0
  204. cuda/cccl/headers/include/cuda/__barrier/barrier_arrive_tx.h +102 -0
  205. cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +487 -0
  206. cuda/cccl/headers/include/cuda/__barrier/barrier_expect_tx.h +74 -0
  207. cuda/cccl/headers/include/cuda/__barrier/barrier_native_handle.h +45 -0
  208. cuda/cccl/headers/include/cuda/__barrier/barrier_thread_scope.h +60 -0
  209. cuda/cccl/headers/include/cuda/__bit/bit_reverse.h +171 -0
  210. cuda/cccl/headers/include/cuda/__bit/bitfield.h +122 -0
  211. cuda/cccl/headers/include/cuda/__bit/bitmask.h +90 -0
  212. cuda/cccl/headers/include/cuda/__cccl_config +37 -0
  213. cuda/cccl/headers/include/cuda/__cmath/ceil_div.h +124 -0
  214. cuda/cccl/headers/include/cuda/__cmath/fast_modulo_division.h +178 -0
  215. cuda/cccl/headers/include/cuda/__cmath/ilog.h +195 -0
  216. cuda/cccl/headers/include/cuda/__cmath/ipow.h +107 -0
  217. cuda/cccl/headers/include/cuda/__cmath/isqrt.h +80 -0
  218. cuda/cccl/headers/include/cuda/__cmath/mul_hi.h +146 -0
  219. cuda/cccl/headers/include/cuda/__cmath/neg.h +47 -0
  220. cuda/cccl/headers/include/cuda/__cmath/pow2.h +74 -0
  221. cuda/cccl/headers/include/cuda/__cmath/round_down.h +102 -0
  222. cuda/cccl/headers/include/cuda/__cmath/round_up.h +104 -0
  223. cuda/cccl/headers/include/cuda/__cmath/uabs.h +57 -0
  224. cuda/cccl/headers/include/cuda/__complex/complex.h +238 -0
  225. cuda/cccl/headers/include/cuda/__complex/get_real_imag.h +89 -0
  226. cuda/cccl/headers/include/cuda/__complex/traits.h +64 -0
  227. cuda/cccl/headers/include/cuda/__complex_ +28 -0
  228. cuda/cccl/headers/include/cuda/__device/all_devices.h +140 -0
  229. cuda/cccl/headers/include/cuda/__device/arch_id.h +176 -0
  230. cuda/cccl/headers/include/cuda/__device/arch_traits.h +537 -0
  231. cuda/cccl/headers/include/cuda/__device/attributes.h +772 -0
  232. cuda/cccl/headers/include/cuda/__device/compute_capability.h +171 -0
  233. cuda/cccl/headers/include/cuda/__device/device_ref.h +156 -0
  234. cuda/cccl/headers/include/cuda/__device/physical_device.h +172 -0
  235. cuda/cccl/headers/include/cuda/__driver/driver_api.h +835 -0
  236. cuda/cccl/headers/include/cuda/__event/event.h +171 -0
  237. cuda/cccl/headers/include/cuda/__event/event_ref.h +157 -0
  238. cuda/cccl/headers/include/cuda/__event/timed_event.h +120 -0
  239. cuda/cccl/headers/include/cuda/__execution/determinism.h +91 -0
  240. cuda/cccl/headers/include/cuda/__execution/output_ordering.h +89 -0
  241. cuda/cccl/headers/include/cuda/__execution/require.h +75 -0
  242. cuda/cccl/headers/include/cuda/__execution/tune.h +70 -0
  243. cuda/cccl/headers/include/cuda/__functional/address_stability.h +131 -0
  244. cuda/cccl/headers/include/cuda/__functional/for_each_canceled.h +321 -0
  245. cuda/cccl/headers/include/cuda/__functional/maximum.h +58 -0
  246. cuda/cccl/headers/include/cuda/__functional/minimum.h +58 -0
  247. cuda/cccl/headers/include/cuda/__functional/proclaim_return_type.h +108 -0
  248. cuda/cccl/headers/include/cuda/__fwd/barrier.h +38 -0
  249. cuda/cccl/headers/include/cuda/__fwd/barrier_native_handle.h +42 -0
  250. cuda/cccl/headers/include/cuda/__fwd/complex.h +48 -0
  251. cuda/cccl/headers/include/cuda/__fwd/devices.h +44 -0
  252. cuda/cccl/headers/include/cuda/__fwd/get_stream.h +38 -0
  253. cuda/cccl/headers/include/cuda/__fwd/pipeline.h +37 -0
  254. cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +58 -0
  255. cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +315 -0
  256. cuda/cccl/headers/include/cuda/__iterator/counting_iterator.h +483 -0
  257. cuda/cccl/headers/include/cuda/__iterator/discard_iterator.h +324 -0
  258. cuda/cccl/headers/include/cuda/__iterator/permutation_iterator.h +456 -0
  259. cuda/cccl/headers/include/cuda/__iterator/shuffle_iterator.h +334 -0
  260. cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +418 -0
  261. cuda/cccl/headers/include/cuda/__iterator/tabulate_output_iterator.h +367 -0
  262. cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +528 -0
  263. cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +527 -0
  264. cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +486 -0
  265. cuda/cccl/headers/include/cuda/__iterator/zip_common.h +148 -0
  266. cuda/cccl/headers/include/cuda/__iterator/zip_function.h +112 -0
  267. cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +557 -0
  268. cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +592 -0
  269. cuda/cccl/headers/include/cuda/__latch/latch.h +44 -0
  270. cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +533 -0
  271. cuda/cccl/headers/include/cuda/__mdspan/host_device_mdspan.h +238 -0
  272. cuda/cccl/headers/include/cuda/__mdspan/restrict_accessor.h +152 -0
  273. cuda/cccl/headers/include/cuda/__mdspan/restrict_mdspan.h +117 -0
  274. cuda/cccl/headers/include/cuda/__memcpy_async/check_preconditions.h +79 -0
  275. cuda/cccl/headers/include/cuda/__memcpy_async/completion_mechanism.h +47 -0
  276. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_bulk_shared_global.h +60 -0
  277. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_fallback.h +72 -0
  278. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_shared_global.h +148 -0
  279. cuda/cccl/headers/include/cuda/__memcpy_async/dispatch_memcpy_async.h +165 -0
  280. cuda/cccl/headers/include/cuda/__memcpy_async/is_local_smem_barrier.h +53 -0
  281. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async.h +179 -0
  282. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_barrier.h +99 -0
  283. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_tx.h +104 -0
  284. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_completion.h +170 -0
  285. cuda/cccl/headers/include/cuda/__memcpy_async/try_get_barrier_handle.h +59 -0
  286. cuda/cccl/headers/include/cuda/__memory/address_space.h +227 -0
  287. cuda/cccl/headers/include/cuda/__memory/align_down.h +56 -0
  288. cuda/cccl/headers/include/cuda/__memory/align_up.h +56 -0
  289. cuda/cccl/headers/include/cuda/__memory/aligned_size.h +61 -0
  290. cuda/cccl/headers/include/cuda/__memory/check_address.h +111 -0
  291. cuda/cccl/headers/include/cuda/__memory/discard_memory.h +64 -0
  292. cuda/cccl/headers/include/cuda/__memory/get_device_address.h +58 -0
  293. cuda/cccl/headers/include/cuda/__memory/is_aligned.h +47 -0
  294. cuda/cccl/headers/include/cuda/__memory/ptr_in_range.h +93 -0
  295. cuda/cccl/headers/include/cuda/__memory/ptr_rebind.h +75 -0
  296. cuda/cccl/headers/include/cuda/__memory_resource/get_memory_resource.h +82 -0
  297. cuda/cccl/headers/include/cuda/__memory_resource/get_property.h +153 -0
  298. cuda/cccl/headers/include/cuda/__memory_resource/properties.h +113 -0
  299. cuda/cccl/headers/include/cuda/__memory_resource/resource.h +125 -0
  300. cuda/cccl/headers/include/cuda/__memory_resource/resource_ref.h +652 -0
  301. cuda/cccl/headers/include/cuda/__numeric/add_overflow.h +306 -0
  302. cuda/cccl/headers/include/cuda/__numeric/narrow.h +108 -0
  303. cuda/cccl/headers/include/cuda/__numeric/overflow_cast.h +59 -0
  304. cuda/cccl/headers/include/cuda/__numeric/overflow_result.h +43 -0
  305. cuda/cccl/headers/include/cuda/__nvtx/nvtx.h +120 -0
  306. cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +2983 -0
  307. cuda/cccl/headers/include/cuda/__ptx/instructions/barrier_cluster.h +43 -0
  308. cuda/cccl/headers/include/cuda/__ptx/instructions/bfind.h +41 -0
  309. cuda/cccl/headers/include/cuda/__ptx/instructions/bmsk.h +41 -0
  310. cuda/cccl/headers/include/cuda/__ptx/instructions/clusterlaunchcontrol.h +41 -0
  311. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk.h +44 -0
  312. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_commit_group.h +43 -0
  313. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h +45 -0
  314. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_wait_group.h +43 -0
  315. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_mbarrier_arrive.h +42 -0
  316. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_reduce_async_bulk.h +60 -0
  317. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_reduce_async_bulk_tensor.h +43 -0
  318. cuda/cccl/headers/include/cuda/__ptx/instructions/elect_sync.h +41 -0
  319. cuda/cccl/headers/include/cuda/__ptx/instructions/exit.h +41 -0
  320. cuda/cccl/headers/include/cuda/__ptx/instructions/fence.h +49 -0
  321. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/barrier_cluster.h +115 -0
  322. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/bfind.h +190 -0
  323. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/bmsk.h +54 -0
  324. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/clusterlaunchcontrol.h +242 -0
  325. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk.h +197 -0
  326. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.h +25 -0
  327. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.h +54 -0
  328. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.h +997 -0
  329. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_gather_scatter.h +318 -0
  330. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.h +671 -0
  331. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.h +46 -0
  332. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive.h +26 -0
  333. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive_noinc.h +26 -0
  334. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.h +1470 -0
  335. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.h +132 -0
  336. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.h +132 -0
  337. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.h +601 -0
  338. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/elect_sync.h +36 -0
  339. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/exit.h +25 -0
  340. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence.h +208 -0
  341. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.h +31 -0
  342. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_alias.h +25 -0
  343. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_async.h +58 -0
  344. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_async_generic_sync_restrict.h +64 -0
  345. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.h +102 -0
  346. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_sync_restrict.h +64 -0
  347. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/get_sreg.h +949 -0
  348. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/getctarank.h +32 -0
  349. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/ld.h +5542 -0
  350. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive.h +399 -0
  351. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.h +184 -0
  352. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.h +34 -0
  353. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_expect_tx.h +102 -0
  354. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_init.h +27 -0
  355. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.h +143 -0
  356. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.h +144 -0
  357. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.h +286 -0
  358. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.h +290 -0
  359. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_ld_reduce.h +2202 -0
  360. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_red.h +1362 -0
  361. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_st.h +236 -0
  362. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/prmt.h +230 -0
  363. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/red_async.h +460 -0
  364. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/shl.h +96 -0
  365. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/shr.h +168 -0
  366. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st.h +1490 -0
  367. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st_async.h +123 -0
  368. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st_bulk.h +31 -0
  369. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_alloc.h +132 -0
  370. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_commit.h +99 -0
  371. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_cp.h +765 -0
  372. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_fence.h +58 -0
  373. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_ld.h +4927 -0
  374. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_mma.h +4291 -0
  375. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_mma_ws.h +7110 -0
  376. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_shift.h +42 -0
  377. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_st.h +5063 -0
  378. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_wait.h +56 -0
  379. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.h +71 -0
  380. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tensormap_replace.h +1030 -0
  381. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/trap.h +25 -0
  382. cuda/cccl/headers/include/cuda/__ptx/instructions/get_sreg.h +43 -0
  383. cuda/cccl/headers/include/cuda/__ptx/instructions/getctarank.h +43 -0
  384. cuda/cccl/headers/include/cuda/__ptx/instructions/ld.h +41 -0
  385. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_arrive.h +45 -0
  386. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_expect_tx.h +41 -0
  387. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_init.h +43 -0
  388. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_wait.h +46 -0
  389. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_ld_reduce.h +41 -0
  390. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_red.h +41 -0
  391. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_st.h +41 -0
  392. cuda/cccl/headers/include/cuda/__ptx/instructions/prmt.h +41 -0
  393. cuda/cccl/headers/include/cuda/__ptx/instructions/red_async.h +43 -0
  394. cuda/cccl/headers/include/cuda/__ptx/instructions/shfl_sync.h +244 -0
  395. cuda/cccl/headers/include/cuda/__ptx/instructions/shl.h +41 -0
  396. cuda/cccl/headers/include/cuda/__ptx/instructions/shr.h +41 -0
  397. cuda/cccl/headers/include/cuda/__ptx/instructions/st.h +41 -0
  398. cuda/cccl/headers/include/cuda/__ptx/instructions/st_async.h +43 -0
  399. cuda/cccl/headers/include/cuda/__ptx/instructions/st_bulk.h +41 -0
  400. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_alloc.h +41 -0
  401. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_commit.h +41 -0
  402. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_cp.h +41 -0
  403. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_fence.h +41 -0
  404. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_ld.h +41 -0
  405. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_mma.h +41 -0
  406. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_mma_ws.h +41 -0
  407. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_shift.h +41 -0
  408. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_st.h +41 -0
  409. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_wait.h +41 -0
  410. cuda/cccl/headers/include/cuda/__ptx/instructions/tensormap_cp_fenceproxy.h +43 -0
  411. cuda/cccl/headers/include/cuda/__ptx/instructions/tensormap_replace.h +43 -0
  412. cuda/cccl/headers/include/cuda/__ptx/instructions/trap.h +41 -0
  413. cuda/cccl/headers/include/cuda/__ptx/pragmas/enable_smem_spilling.h +47 -0
  414. cuda/cccl/headers/include/cuda/__ptx/ptx_dot_variants.h +230 -0
  415. cuda/cccl/headers/include/cuda/__ptx/ptx_helper_functions.h +176 -0
  416. cuda/cccl/headers/include/cuda/__random/feistel_bijection.h +105 -0
  417. cuda/cccl/headers/include/cuda/__random/random_bijection.h +88 -0
  418. cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +99 -0
  419. cuda/cccl/headers/include/cuda/__runtime/types.h +41 -0
  420. cuda/cccl/headers/include/cuda/__semaphore/counting_semaphore.h +53 -0
  421. cuda/cccl/headers/include/cuda/__stream/get_stream.h +110 -0
  422. cuda/cccl/headers/include/cuda/__stream/stream.h +141 -0
  423. cuda/cccl/headers/include/cuda/__stream/stream_ref.h +303 -0
  424. cuda/cccl/headers/include/cuda/__type_traits/is_floating_point.h +47 -0
  425. cuda/cccl/headers/include/cuda/__type_traits/is_specialization_of.h +37 -0
  426. cuda/cccl/headers/include/cuda/__utility/__basic_any/access.h +88 -0
  427. cuda/cccl/headers/include/cuda/__utility/__basic_any/any_cast.h +83 -0
  428. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_base.h +148 -0
  429. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_from.h +96 -0
  430. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_fwd.h +128 -0
  431. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_ptr.h +304 -0
  432. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_ref.h +337 -0
  433. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_value.h +590 -0
  434. cuda/cccl/headers/include/cuda/__utility/__basic_any/conversions.h +169 -0
  435. cuda/cccl/headers/include/cuda/__utility/__basic_any/dynamic_any_cast.h +107 -0
  436. cuda/cccl/headers/include/cuda/__utility/__basic_any/interfaces.h +359 -0
  437. cuda/cccl/headers/include/cuda/__utility/__basic_any/iset.h +142 -0
  438. cuda/cccl/headers/include/cuda/__utility/__basic_any/overrides.h +64 -0
  439. cuda/cccl/headers/include/cuda/__utility/__basic_any/rtti.h +257 -0
  440. cuda/cccl/headers/include/cuda/__utility/__basic_any/semiregular.h +322 -0
  441. cuda/cccl/headers/include/cuda/__utility/__basic_any/storage.h +79 -0
  442. cuda/cccl/headers/include/cuda/__utility/__basic_any/tagged_ptr.h +58 -0
  443. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtcall.h +162 -0
  444. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_functions.h +184 -0
  445. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_ptrs.h +80 -0
  446. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_tables.h +155 -0
  447. cuda/cccl/headers/include/cuda/__utility/basic_any.h +507 -0
  448. cuda/cccl/headers/include/cuda/__utility/immovable.h +50 -0
  449. cuda/cccl/headers/include/cuda/__utility/in_range.h +65 -0
  450. cuda/cccl/headers/include/cuda/__utility/inherit.h +36 -0
  451. cuda/cccl/headers/include/cuda/__utility/no_init.h +29 -0
  452. cuda/cccl/headers/include/cuda/__utility/static_for.h +79 -0
  453. cuda/cccl/headers/include/cuda/__warp/lane_mask.h +326 -0
  454. cuda/cccl/headers/include/cuda/__warp/warp_match_all.h +65 -0
  455. cuda/cccl/headers/include/cuda/__warp/warp_shuffle.h +251 -0
  456. cuda/cccl/headers/include/cuda/access_property +26 -0
  457. cuda/cccl/headers/include/cuda/algorithm +27 -0
  458. cuda/cccl/headers/include/cuda/annotated_ptr +29 -0
  459. cuda/cccl/headers/include/cuda/atomic +27 -0
  460. cuda/cccl/headers/include/cuda/barrier +267 -0
  461. cuda/cccl/headers/include/cuda/bit +29 -0
  462. cuda/cccl/headers/include/cuda/cmath +37 -0
  463. cuda/cccl/headers/include/cuda/devices +33 -0
  464. cuda/cccl/headers/include/cuda/discard_memory +32 -0
  465. cuda/cccl/headers/include/cuda/functional +32 -0
  466. cuda/cccl/headers/include/cuda/iterator +39 -0
  467. cuda/cccl/headers/include/cuda/latch +27 -0
  468. cuda/cccl/headers/include/cuda/mdspan +28 -0
  469. cuda/cccl/headers/include/cuda/memory +35 -0
  470. cuda/cccl/headers/include/cuda/memory_resource +35 -0
  471. cuda/cccl/headers/include/cuda/numeric +29 -0
  472. cuda/cccl/headers/include/cuda/pipeline +579 -0
  473. cuda/cccl/headers/include/cuda/ptx +129 -0
  474. cuda/cccl/headers/include/cuda/semaphore +31 -0
  475. cuda/cccl/headers/include/cuda/std/__algorithm/adjacent_find.h +59 -0
  476. cuda/cccl/headers/include/cuda/std/__algorithm/all_of.h +45 -0
  477. cuda/cccl/headers/include/cuda/std/__algorithm/any_of.h +45 -0
  478. cuda/cccl/headers/include/cuda/std/__algorithm/binary_search.h +53 -0
  479. cuda/cccl/headers/include/cuda/std/__algorithm/clamp.h +48 -0
  480. cuda/cccl/headers/include/cuda/std/__algorithm/comp.h +58 -0
  481. cuda/cccl/headers/include/cuda/std/__algorithm/comp_ref_type.h +85 -0
  482. cuda/cccl/headers/include/cuda/std/__algorithm/copy.h +142 -0
  483. cuda/cccl/headers/include/cuda/std/__algorithm/copy_backward.h +80 -0
  484. cuda/cccl/headers/include/cuda/std/__algorithm/copy_if.h +47 -0
  485. cuda/cccl/headers/include/cuda/std/__algorithm/copy_n.h +73 -0
  486. cuda/cccl/headers/include/cuda/std/__algorithm/count.h +49 -0
  487. cuda/cccl/headers/include/cuda/std/__algorithm/count_if.h +49 -0
  488. cuda/cccl/headers/include/cuda/std/__algorithm/equal.h +128 -0
  489. cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +101 -0
  490. cuda/cccl/headers/include/cuda/std/__algorithm/fill.h +58 -0
  491. cuda/cccl/headers/include/cuda/std/__algorithm/fill_n.h +51 -0
  492. cuda/cccl/headers/include/cuda/std/__algorithm/find.h +62 -0
  493. cuda/cccl/headers/include/cuda/std/__algorithm/find_end.h +225 -0
  494. cuda/cccl/headers/include/cuda/std/__algorithm/find_first_of.h +73 -0
  495. cuda/cccl/headers/include/cuda/std/__algorithm/find_if.h +46 -0
  496. cuda/cccl/headers/include/cuda/std/__algorithm/find_if_not.h +46 -0
  497. cuda/cccl/headers/include/cuda/std/__algorithm/for_each.h +42 -0
  498. cuda/cccl/headers/include/cuda/std/__algorithm/for_each_n.h +48 -0
  499. cuda/cccl/headers/include/cuda/std/__algorithm/generate.h +41 -0
  500. cuda/cccl/headers/include/cuda/std/__algorithm/generate_n.h +46 -0
  501. cuda/cccl/headers/include/cuda/std/__algorithm/half_positive.h +49 -0
  502. cuda/cccl/headers/include/cuda/std/__algorithm/in_fun_result.h +55 -0
  503. cuda/cccl/headers/include/cuda/std/__algorithm/includes.h +90 -0
  504. cuda/cccl/headers/include/cuda/std/__algorithm/is_heap.h +50 -0
  505. cuda/cccl/headers/include/cuda/std/__algorithm/is_heap_until.h +83 -0
  506. cuda/cccl/headers/include/cuda/std/__algorithm/is_partitioned.h +57 -0
  507. cuda/cccl/headers/include/cuda/std/__algorithm/is_permutation.h +252 -0
  508. cuda/cccl/headers/include/cuda/std/__algorithm/is_sorted.h +49 -0
  509. cuda/cccl/headers/include/cuda/std/__algorithm/is_sorted_until.h +68 -0
  510. cuda/cccl/headers/include/cuda/std/__algorithm/iter_swap.h +82 -0
  511. cuda/cccl/headers/include/cuda/std/__algorithm/iterator_operations.h +185 -0
  512. cuda/cccl/headers/include/cuda/std/__algorithm/lexicographical_compare.h +68 -0
  513. cuda/cccl/headers/include/cuda/std/__algorithm/lower_bound.h +82 -0
  514. cuda/cccl/headers/include/cuda/std/__algorithm/make_heap.h +70 -0
  515. cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +88 -0
  516. cuda/cccl/headers/include/cuda/std/__algorithm/max.h +62 -0
  517. cuda/cccl/headers/include/cuda/std/__algorithm/max_element.h +67 -0
  518. cuda/cccl/headers/include/cuda/std/__algorithm/merge.h +89 -0
  519. cuda/cccl/headers/include/cuda/std/__algorithm/min.h +62 -0
  520. cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +87 -0
  521. cuda/cccl/headers/include/cuda/std/__algorithm/minmax.h +66 -0
  522. cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +139 -0
  523. cuda/cccl/headers/include/cuda/std/__algorithm/mismatch.h +83 -0
  524. cuda/cccl/headers/include/cuda/std/__algorithm/move.h +86 -0
  525. cuda/cccl/headers/include/cuda/std/__algorithm/move_backward.h +84 -0
  526. cuda/cccl/headers/include/cuda/std/__algorithm/next_permutation.h +88 -0
  527. cuda/cccl/headers/include/cuda/std/__algorithm/none_of.h +45 -0
  528. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort.h +102 -0
  529. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort_copy.h +122 -0
  530. cuda/cccl/headers/include/cuda/std/__algorithm/partition.h +120 -0
  531. cuda/cccl/headers/include/cuda/std/__algorithm/partition_copy.h +59 -0
  532. cuda/cccl/headers/include/cuda/std/__algorithm/partition_point.h +61 -0
  533. cuda/cccl/headers/include/cuda/std/__algorithm/pop_heap.h +93 -0
  534. cuda/cccl/headers/include/cuda/std/__algorithm/prev_permutation.h +88 -0
  535. cuda/cccl/headers/include/cuda/std/__algorithm/push_heap.h +100 -0
  536. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_for_each.h +84 -0
  537. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_for_each_n.h +68 -0
  538. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_iterator_concept.h +65 -0
  539. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_min.h +98 -0
  540. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_min_element.h +68 -0
  541. cuda/cccl/headers/include/cuda/std/__algorithm/remove.h +55 -0
  542. cuda/cccl/headers/include/cuda/std/__algorithm/remove_copy.h +47 -0
  543. cuda/cccl/headers/include/cuda/std/__algorithm/remove_copy_if.h +47 -0
  544. cuda/cccl/headers/include/cuda/std/__algorithm/remove_if.h +56 -0
  545. cuda/cccl/headers/include/cuda/std/__algorithm/replace.h +45 -0
  546. cuda/cccl/headers/include/cuda/std/__algorithm/replace_copy.h +54 -0
  547. cuda/cccl/headers/include/cuda/std/__algorithm/replace_copy_if.h +50 -0
  548. cuda/cccl/headers/include/cuda/std/__algorithm/replace_if.h +45 -0
  549. cuda/cccl/headers/include/cuda/std/__algorithm/reverse.h +81 -0
  550. cuda/cccl/headers/include/cuda/std/__algorithm/reverse_copy.h +43 -0
  551. cuda/cccl/headers/include/cuda/std/__algorithm/rotate.h +261 -0
  552. cuda/cccl/headers/include/cuda/std/__algorithm/rotate_copy.h +40 -0
  553. cuda/cccl/headers/include/cuda/std/__algorithm/search.h +185 -0
  554. cuda/cccl/headers/include/cuda/std/__algorithm/search_n.h +163 -0
  555. cuda/cccl/headers/include/cuda/std/__algorithm/set_difference.h +95 -0
  556. cuda/cccl/headers/include/cuda/std/__algorithm/set_intersection.h +122 -0
  557. cuda/cccl/headers/include/cuda/std/__algorithm/set_symmetric_difference.h +134 -0
  558. cuda/cccl/headers/include/cuda/std/__algorithm/set_union.h +128 -0
  559. cuda/cccl/headers/include/cuda/std/__algorithm/shift_left.h +84 -0
  560. cuda/cccl/headers/include/cuda/std/__algorithm/shift_right.h +144 -0
  561. cuda/cccl/headers/include/cuda/std/__algorithm/sift_down.h +139 -0
  562. cuda/cccl/headers/include/cuda/std/__algorithm/sort_heap.h +70 -0
  563. cuda/cccl/headers/include/cuda/std/__algorithm/swap_ranges.h +78 -0
  564. cuda/cccl/headers/include/cuda/std/__algorithm/transform.h +59 -0
  565. cuda/cccl/headers/include/cuda/std/__algorithm/unique.h +76 -0
  566. cuda/cccl/headers/include/cuda/std/__algorithm/unique_copy.h +155 -0
  567. cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_iter.h +95 -0
  568. cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_range.h +126 -0
  569. cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +83 -0
  570. cuda/cccl/headers/include/cuda/std/__algorithm_ +26 -0
  571. cuda/cccl/headers/include/cuda/std/__atomic/api/common.h +192 -0
  572. cuda/cccl/headers/include/cuda/std/__atomic/api/owned.h +136 -0
  573. cuda/cccl/headers/include/cuda/std/__atomic/api/reference.h +118 -0
  574. cuda/cccl/headers/include/cuda/std/__atomic/functions/common.h +58 -0
  575. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_local.h +208 -0
  576. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_derived.h +401 -0
  577. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated.h +3971 -0
  578. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h +177 -0
  579. cuda/cccl/headers/include/cuda/std/__atomic/functions/host.h +211 -0
  580. cuda/cccl/headers/include/cuda/std/__atomic/functions.h +33 -0
  581. cuda/cccl/headers/include/cuda/std/__atomic/order.h +159 -0
  582. cuda/cccl/headers/include/cuda/std/__atomic/platform/msvc_to_builtins.h +654 -0
  583. cuda/cccl/headers/include/cuda/std/__atomic/platform.h +93 -0
  584. cuda/cccl/headers/include/cuda/std/__atomic/scopes.h +105 -0
  585. cuda/cccl/headers/include/cuda/std/__atomic/types/base.h +249 -0
  586. cuda/cccl/headers/include/cuda/std/__atomic/types/common.h +104 -0
  587. cuda/cccl/headers/include/cuda/std/__atomic/types/locked.h +225 -0
  588. cuda/cccl/headers/include/cuda/std/__atomic/types/reference.h +72 -0
  589. cuda/cccl/headers/include/cuda/std/__atomic/types/small.h +228 -0
  590. cuda/cccl/headers/include/cuda/std/__atomic/types.h +52 -0
  591. cuda/cccl/headers/include/cuda/std/__atomic/wait/notify_wait.h +95 -0
  592. cuda/cccl/headers/include/cuda/std/__atomic/wait/polling.h +65 -0
  593. cuda/cccl/headers/include/cuda/std/__barrier/barrier.h +227 -0
  594. cuda/cccl/headers/include/cuda/std/__barrier/empty_completion.h +37 -0
  595. cuda/cccl/headers/include/cuda/std/__barrier/poll_tester.h +82 -0
  596. cuda/cccl/headers/include/cuda/std/__bit/bit_cast.h +76 -0
  597. cuda/cccl/headers/include/cuda/std/__bit/byteswap.h +185 -0
  598. cuda/cccl/headers/include/cuda/std/__bit/countl.h +174 -0
  599. cuda/cccl/headers/include/cuda/std/__bit/countr.h +185 -0
  600. cuda/cccl/headers/include/cuda/std/__bit/endian.h +39 -0
  601. cuda/cccl/headers/include/cuda/std/__bit/has_single_bit.h +43 -0
  602. cuda/cccl/headers/include/cuda/std/__bit/integral.h +126 -0
  603. cuda/cccl/headers/include/cuda/std/__bit/popcount.h +154 -0
  604. cuda/cccl/headers/include/cuda/std/__bit/reference.h +1272 -0
  605. cuda/cccl/headers/include/cuda/std/__bit/rotate.h +94 -0
  606. cuda/cccl/headers/include/cuda/std/__cccl/algorithm_wrapper.h +36 -0
  607. cuda/cccl/headers/include/cuda/std/__cccl/architecture.h +78 -0
  608. cuda/cccl/headers/include/cuda/std/__cccl/assert.h +161 -0
  609. cuda/cccl/headers/include/cuda/std/__cccl/attributes.h +206 -0
  610. cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +673 -0
  611. cuda/cccl/headers/include/cuda/std/__cccl/compiler.h +217 -0
  612. cuda/cccl/headers/include/cuda/std/__cccl/cuda_capabilities.h +51 -0
  613. cuda/cccl/headers/include/cuda/std/__cccl/cuda_toolkit.h +56 -0
  614. cuda/cccl/headers/include/cuda/std/__cccl/deprecated.h +88 -0
  615. cuda/cccl/headers/include/cuda/std/__cccl/diagnostic.h +131 -0
  616. cuda/cccl/headers/include/cuda/std/__cccl/dialect.h +123 -0
  617. cuda/cccl/headers/include/cuda/std/__cccl/epilogue.h +344 -0
  618. cuda/cccl/headers/include/cuda/std/__cccl/exceptions.h +91 -0
  619. cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +74 -0
  620. cuda/cccl/headers/include/cuda/std/__cccl/extended_data_types.h +160 -0
  621. cuda/cccl/headers/include/cuda/std/__cccl/host_std_lib.h +52 -0
  622. cuda/cccl/headers/include/cuda/std/__cccl/is_non_narrowing_convertible.h +73 -0
  623. cuda/cccl/headers/include/cuda/std/__cccl/memory_wrapper.h +36 -0
  624. cuda/cccl/headers/include/cuda/std/__cccl/numeric_wrapper.h +36 -0
  625. cuda/cccl/headers/include/cuda/std/__cccl/os.h +54 -0
  626. cuda/cccl/headers/include/cuda/std/__cccl/preprocessor.h +1286 -0
  627. cuda/cccl/headers/include/cuda/std/__cccl/prologue.h +281 -0
  628. cuda/cccl/headers/include/cuda/std/__cccl/ptx_isa.h +253 -0
  629. cuda/cccl/headers/include/cuda/std/__cccl/rtti.h +72 -0
  630. cuda/cccl/headers/include/cuda/std/__cccl/sequence_access.h +87 -0
  631. cuda/cccl/headers/include/cuda/std/__cccl/system_header.h +38 -0
  632. cuda/cccl/headers/include/cuda/std/__cccl/unreachable.h +31 -0
  633. cuda/cccl/headers/include/cuda/std/__cccl/version.h +26 -0
  634. cuda/cccl/headers/include/cuda/std/__cccl/visibility.h +171 -0
  635. cuda/cccl/headers/include/cuda/std/__charconv/chars_format.h +81 -0
  636. cuda/cccl/headers/include/cuda/std/__charconv/from_chars.h +154 -0
  637. cuda/cccl/headers/include/cuda/std/__charconv/from_chars_result.h +56 -0
  638. cuda/cccl/headers/include/cuda/std/__charconv/to_chars.h +148 -0
  639. cuda/cccl/headers/include/cuda/std/__charconv/to_chars_result.h +56 -0
  640. cuda/cccl/headers/include/cuda/std/__charconv_ +31 -0
  641. cuda/cccl/headers/include/cuda/std/__chrono/calendar.h +54 -0
  642. cuda/cccl/headers/include/cuda/std/__chrono/day.h +162 -0
  643. cuda/cccl/headers/include/cuda/std/__chrono/duration.h +503 -0
  644. cuda/cccl/headers/include/cuda/std/__chrono/file_clock.h +55 -0
  645. cuda/cccl/headers/include/cuda/std/__chrono/high_resolution_clock.h +46 -0
  646. cuda/cccl/headers/include/cuda/std/__chrono/month.h +187 -0
  647. cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +60 -0
  648. cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +80 -0
  649. cuda/cccl/headers/include/cuda/std/__chrono/time_point.h +259 -0
  650. cuda/cccl/headers/include/cuda/std/__chrono/year.h +186 -0
  651. cuda/cccl/headers/include/cuda/std/__cmath/abs.h +127 -0
  652. cuda/cccl/headers/include/cuda/std/__cmath/copysign.h +88 -0
  653. cuda/cccl/headers/include/cuda/std/__cmath/error_functions.h +200 -0
  654. cuda/cccl/headers/include/cuda/std/__cmath/exponential_functions.h +784 -0
  655. cuda/cccl/headers/include/cuda/std/__cmath/fdim.h +118 -0
  656. cuda/cccl/headers/include/cuda/std/__cmath/fma.h +125 -0
  657. cuda/cccl/headers/include/cuda/std/__cmath/fpclassify.h +231 -0
  658. cuda/cccl/headers/include/cuda/std/__cmath/gamma.h +205 -0
  659. cuda/cccl/headers/include/cuda/std/__cmath/hyperbolic_functions.h +286 -0
  660. cuda/cccl/headers/include/cuda/std/__cmath/hypot.h +221 -0
  661. cuda/cccl/headers/include/cuda/std/__cmath/inverse_hyperbolic_functions.h +286 -0
  662. cuda/cccl/headers/include/cuda/std/__cmath/inverse_trigonometric_functions.h +371 -0
  663. cuda/cccl/headers/include/cuda/std/__cmath/isfinite.h +167 -0
  664. cuda/cccl/headers/include/cuda/std/__cmath/isinf.h +205 -0
  665. cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +186 -0
  666. cuda/cccl/headers/include/cuda/std/__cmath/isnormal.h +138 -0
  667. cuda/cccl/headers/include/cuda/std/__cmath/lerp.h +101 -0
  668. cuda/cccl/headers/include/cuda/std/__cmath/logarithms.h +534 -0
  669. cuda/cccl/headers/include/cuda/std/__cmath/min_max.h +287 -0
  670. cuda/cccl/headers/include/cuda/std/__cmath/modulo.h +208 -0
  671. cuda/cccl/headers/include/cuda/std/__cmath/nan.h +54 -0
  672. cuda/cccl/headers/include/cuda/std/__cmath/remainder.h +206 -0
  673. cuda/cccl/headers/include/cuda/std/__cmath/roots.h +199 -0
  674. cuda/cccl/headers/include/cuda/std/__cmath/rounding_functions.h +984 -0
  675. cuda/cccl/headers/include/cuda/std/__cmath/signbit.h +56 -0
  676. cuda/cccl/headers/include/cuda/std/__cmath/traits.h +238 -0
  677. cuda/cccl/headers/include/cuda/std/__cmath/trigonometric_functions.h +328 -0
  678. cuda/cccl/headers/include/cuda/std/__complex/arg.h +84 -0
  679. cuda/cccl/headers/include/cuda/std/__complex/complex.h +669 -0
  680. cuda/cccl/headers/include/cuda/std/__complex/exponential_functions.h +411 -0
  681. cuda/cccl/headers/include/cuda/std/__complex/hyperbolic_functions.h +117 -0
  682. cuda/cccl/headers/include/cuda/std/__complex/inverse_hyperbolic_functions.h +216 -0
  683. cuda/cccl/headers/include/cuda/std/__complex/inverse_trigonometric_functions.h +131 -0
  684. cuda/cccl/headers/include/cuda/std/__complex/literals.h +86 -0
  685. cuda/cccl/headers/include/cuda/std/__complex/logarithms.h +303 -0
  686. cuda/cccl/headers/include/cuda/std/__complex/math.h +159 -0
  687. cuda/cccl/headers/include/cuda/std/__complex/nvbf16.h +323 -0
  688. cuda/cccl/headers/include/cuda/std/__complex/nvfp16.h +322 -0
  689. cuda/cccl/headers/include/cuda/std/__complex/roots.h +214 -0
  690. cuda/cccl/headers/include/cuda/std/__complex/trigonometric_functions.h +61 -0
  691. cuda/cccl/headers/include/cuda/std/__complex/tuple.h +107 -0
  692. cuda/cccl/headers/include/cuda/std/__complex/vector_support.h +130 -0
  693. cuda/cccl/headers/include/cuda/std/__concepts/arithmetic.h +56 -0
  694. cuda/cccl/headers/include/cuda/std/__concepts/assignable.h +64 -0
  695. cuda/cccl/headers/include/cuda/std/__concepts/boolean_testable.h +63 -0
  696. cuda/cccl/headers/include/cuda/std/__concepts/class_or_enum.h +45 -0
  697. cuda/cccl/headers/include/cuda/std/__concepts/common_reference_with.h +69 -0
  698. cuda/cccl/headers/include/cuda/std/__concepts/common_with.h +82 -0
  699. cuda/cccl/headers/include/cuda/std/__concepts/concept_macros.h +341 -0
  700. cuda/cccl/headers/include/cuda/std/__concepts/constructible.h +174 -0
  701. cuda/cccl/headers/include/cuda/std/__concepts/convertible_to.h +70 -0
  702. cuda/cccl/headers/include/cuda/std/__concepts/copyable.h +60 -0
  703. cuda/cccl/headers/include/cuda/std/__concepts/derived_from.h +56 -0
  704. cuda/cccl/headers/include/cuda/std/__concepts/destructible.h +76 -0
  705. cuda/cccl/headers/include/cuda/std/__concepts/different_from.h +38 -0
  706. cuda/cccl/headers/include/cuda/std/__concepts/equality_comparable.h +100 -0
  707. cuda/cccl/headers/include/cuda/std/__concepts/invocable.h +80 -0
  708. cuda/cccl/headers/include/cuda/std/__concepts/movable.h +58 -0
  709. cuda/cccl/headers/include/cuda/std/__concepts/predicate.h +54 -0
  710. cuda/cccl/headers/include/cuda/std/__concepts/regular.h +54 -0
  711. cuda/cccl/headers/include/cuda/std/__concepts/relation.h +77 -0
  712. cuda/cccl/headers/include/cuda/std/__concepts/same_as.h +39 -0
  713. cuda/cccl/headers/include/cuda/std/__concepts/semiregular.h +54 -0
  714. cuda/cccl/headers/include/cuda/std/__concepts/swappable.h +206 -0
  715. cuda/cccl/headers/include/cuda/std/__concepts/totally_ordered.h +101 -0
  716. cuda/cccl/headers/include/cuda/std/__cstddef/byte.h +113 -0
  717. cuda/cccl/headers/include/cuda/std/__cstddef/types.h +52 -0
  718. cuda/cccl/headers/include/cuda/std/__cstdlib/abs.h +57 -0
  719. cuda/cccl/headers/include/cuda/std/__cstdlib/aligned_alloc.h +66 -0
  720. cuda/cccl/headers/include/cuda/std/__cstdlib/div.h +96 -0
  721. cuda/cccl/headers/include/cuda/std/__cstdlib/malloc.h +70 -0
  722. cuda/cccl/headers/include/cuda/std/__cstring/memcpy.h +61 -0
  723. cuda/cccl/headers/include/cuda/std/__cstring/memset.h +46 -0
  724. cuda/cccl/headers/include/cuda/std/__cuda/api_wrapper.h +62 -0
  725. cuda/cccl/headers/include/cuda/std/__exception/cuda_error.h +139 -0
  726. cuda/cccl/headers/include/cuda/std/__exception/terminate.h +73 -0
  727. cuda/cccl/headers/include/cuda/std/__execution/env.h +455 -0
  728. cuda/cccl/headers/include/cuda/std/__execution/policy.h +88 -0
  729. cuda/cccl/headers/include/cuda/std/__expected/bad_expected_access.h +127 -0
  730. cuda/cccl/headers/include/cuda/std/__expected/expected.h +1941 -0
  731. cuda/cccl/headers/include/cuda/std/__expected/expected_base.h +1050 -0
  732. cuda/cccl/headers/include/cuda/std/__expected/unexpect.h +37 -0
  733. cuda/cccl/headers/include/cuda/std/__expected/unexpected.h +165 -0
  734. cuda/cccl/headers/include/cuda/std/__floating_point/arithmetic.h +56 -0
  735. cuda/cccl/headers/include/cuda/std/__floating_point/cast.h +812 -0
  736. cuda/cccl/headers/include/cuda/std/__floating_point/cccl_fp.h +125 -0
  737. cuda/cccl/headers/include/cuda/std/__floating_point/common_type.h +48 -0
  738. cuda/cccl/headers/include/cuda/std/__floating_point/constants.h +376 -0
  739. cuda/cccl/headers/include/cuda/std/__floating_point/conversion_rank_order.h +124 -0
  740. cuda/cccl/headers/include/cuda/std/__floating_point/cuda_fp_types.h +116 -0
  741. cuda/cccl/headers/include/cuda/std/__floating_point/decompose.h +69 -0
  742. cuda/cccl/headers/include/cuda/std/__floating_point/format.h +162 -0
  743. cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +40 -0
  744. cuda/cccl/headers/include/cuda/std/__floating_point/mask.h +78 -0
  745. cuda/cccl/headers/include/cuda/std/__floating_point/native_type.h +81 -0
  746. cuda/cccl/headers/include/cuda/std/__floating_point/overflow_handler.h +139 -0
  747. cuda/cccl/headers/include/cuda/std/__floating_point/properties.h +229 -0
  748. cuda/cccl/headers/include/cuda/std/__floating_point/storage.h +248 -0
  749. cuda/cccl/headers/include/cuda/std/__floating_point/traits.h +172 -0
  750. cuda/cccl/headers/include/cuda/std/__format/buffer.h +48 -0
  751. cuda/cccl/headers/include/cuda/std/__format/concepts.h +69 -0
  752. cuda/cccl/headers/include/cuda/std/__format/format_arg.h +282 -0
  753. cuda/cccl/headers/include/cuda/std/__format/format_arg_store.h +279 -0
  754. cuda/cccl/headers/include/cuda/std/__format/format_args.h +122 -0
  755. cuda/cccl/headers/include/cuda/std/__format/format_context.h +92 -0
  756. cuda/cccl/headers/include/cuda/std/__format/format_error.h +76 -0
  757. cuda/cccl/headers/include/cuda/std/__format/format_integral.h +237 -0
  758. cuda/cccl/headers/include/cuda/std/__format/format_parse_context.h +124 -0
  759. cuda/cccl/headers/include/cuda/std/__format/format_spec_parser.h +1230 -0
  760. cuda/cccl/headers/include/cuda/std/__format/formatter.h +59 -0
  761. cuda/cccl/headers/include/cuda/std/__format/formatters/bool.h +101 -0
  762. cuda/cccl/headers/include/cuda/std/__format/formatters/char.h +124 -0
  763. cuda/cccl/headers/include/cuda/std/__format/formatters/fp.h +101 -0
  764. cuda/cccl/headers/include/cuda/std/__format/formatters/int.h +174 -0
  765. cuda/cccl/headers/include/cuda/std/__format/formatters/ptr.h +104 -0
  766. cuda/cccl/headers/include/cuda/std/__format/formatters/str.h +178 -0
  767. cuda/cccl/headers/include/cuda/std/__format/output_utils.h +272 -0
  768. cuda/cccl/headers/include/cuda/std/__format/parse_arg_id.h +138 -0
  769. cuda/cccl/headers/include/cuda/std/__format_ +45 -0
  770. cuda/cccl/headers/include/cuda/std/__functional/binary_function.h +63 -0
  771. cuda/cccl/headers/include/cuda/std/__functional/binary_negate.h +65 -0
  772. cuda/cccl/headers/include/cuda/std/__functional/bind.h +334 -0
  773. cuda/cccl/headers/include/cuda/std/__functional/bind_back.h +80 -0
  774. cuda/cccl/headers/include/cuda/std/__functional/bind_front.h +73 -0
  775. cuda/cccl/headers/include/cuda/std/__functional/binder1st.h +74 -0
  776. cuda/cccl/headers/include/cuda/std/__functional/binder2nd.h +74 -0
  777. cuda/cccl/headers/include/cuda/std/__functional/compose.h +68 -0
  778. cuda/cccl/headers/include/cuda/std/__functional/default_searcher.h +75 -0
  779. cuda/cccl/headers/include/cuda/std/__functional/function.h +1275 -0
  780. cuda/cccl/headers/include/cuda/std/__functional/hash.h +649 -0
  781. cuda/cccl/headers/include/cuda/std/__functional/identity.h +57 -0
  782. cuda/cccl/headers/include/cuda/std/__functional/invoke.h +296 -0
  783. cuda/cccl/headers/include/cuda/std/__functional/is_transparent.h +41 -0
  784. cuda/cccl/headers/include/cuda/std/__functional/mem_fn.h +66 -0
  785. cuda/cccl/headers/include/cuda/std/__functional/mem_fun_ref.h +211 -0
  786. cuda/cccl/headers/include/cuda/std/__functional/not_fn.h +120 -0
  787. cuda/cccl/headers/include/cuda/std/__functional/operations.h +534 -0
  788. cuda/cccl/headers/include/cuda/std/__functional/perfect_forward.h +128 -0
  789. cuda/cccl/headers/include/cuda/std/__functional/pointer_to_binary_function.h +64 -0
  790. cuda/cccl/headers/include/cuda/std/__functional/pointer_to_unary_function.h +63 -0
  791. cuda/cccl/headers/include/cuda/std/__functional/ranges_operations.h +113 -0
  792. cuda/cccl/headers/include/cuda/std/__functional/reference_wrapper.h +113 -0
  793. cuda/cccl/headers/include/cuda/std/__functional/unary_function.h +62 -0
  794. cuda/cccl/headers/include/cuda/std/__functional/unary_negate.h +65 -0
  795. cuda/cccl/headers/include/cuda/std/__functional/unwrap_ref.h +56 -0
  796. cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +262 -0
  797. cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +53 -0
  798. cuda/cccl/headers/include/cuda/std/__fwd/array.h +42 -0
  799. cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +74 -0
  800. cuda/cccl/headers/include/cuda/std/__fwd/complex.h +75 -0
  801. cuda/cccl/headers/include/cuda/std/__fwd/expected.h +46 -0
  802. cuda/cccl/headers/include/cuda/std/__fwd/format.h +84 -0
  803. cuda/cccl/headers/include/cuda/std/__fwd/fp.h +37 -0
  804. cuda/cccl/headers/include/cuda/std/__fwd/get.h +123 -0
  805. cuda/cccl/headers/include/cuda/std/__fwd/hash.h +34 -0
  806. cuda/cccl/headers/include/cuda/std/__fwd/iterator.h +43 -0
  807. cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +122 -0
  808. cuda/cccl/headers/include/cuda/std/__fwd/memory_resource.h +37 -0
  809. cuda/cccl/headers/include/cuda/std/__fwd/optional.h +39 -0
  810. cuda/cccl/headers/include/cuda/std/__fwd/pair.h +47 -0
  811. cuda/cccl/headers/include/cuda/std/__fwd/reference_wrapper.h +34 -0
  812. cuda/cccl/headers/include/cuda/std/__fwd/span.h +45 -0
  813. cuda/cccl/headers/include/cuda/std/__fwd/string.h +112 -0
  814. cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +91 -0
  815. cuda/cccl/headers/include/cuda/std/__fwd/subrange.h +55 -0
  816. cuda/cccl/headers/include/cuda/std/__fwd/tuple.h +34 -0
  817. cuda/cccl/headers/include/cuda/std/__fwd/unexpected.h +40 -0
  818. cuda/cccl/headers/include/cuda/std/__internal/cpp_dialect.h +44 -0
  819. cuda/cccl/headers/include/cuda/std/__internal/features.h +72 -0
  820. cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +143 -0
  821. cuda/cccl/headers/include/cuda/std/__iterator/access.h +128 -0
  822. cuda/cccl/headers/include/cuda/std/__iterator/advance.h +228 -0
  823. cuda/cccl/headers/include/cuda/std/__iterator/back_insert_iterator.h +163 -0
  824. cuda/cccl/headers/include/cuda/std/__iterator/bounded_iter.h +253 -0
  825. cuda/cccl/headers/include/cuda/std/__iterator/concepts.h +645 -0
  826. cuda/cccl/headers/include/cuda/std/__iterator/counted_iterator.h +464 -0
  827. cuda/cccl/headers/include/cuda/std/__iterator/data.h +61 -0
  828. cuda/cccl/headers/include/cuda/std/__iterator/default_sentinel.h +36 -0
  829. cuda/cccl/headers/include/cuda/std/__iterator/distance.h +126 -0
  830. cuda/cccl/headers/include/cuda/std/__iterator/empty.h +53 -0
  831. cuda/cccl/headers/include/cuda/std/__iterator/erase_if_container.h +53 -0
  832. cuda/cccl/headers/include/cuda/std/__iterator/front_insert_iterator.h +99 -0
  833. cuda/cccl/headers/include/cuda/std/__iterator/incrementable_traits.h +143 -0
  834. cuda/cccl/headers/include/cuda/std/__iterator/indirectly_comparable.h +55 -0
  835. cuda/cccl/headers/include/cuda/std/__iterator/insert_iterator.h +107 -0
  836. cuda/cccl/headers/include/cuda/std/__iterator/istream_iterator.h +146 -0
  837. cuda/cccl/headers/include/cuda/std/__iterator/istreambuf_iterator.h +161 -0
  838. cuda/cccl/headers/include/cuda/std/__iterator/iter_move.h +161 -0
  839. cuda/cccl/headers/include/cuda/std/__iterator/iter_swap.h +163 -0
  840. cuda/cccl/headers/include/cuda/std/__iterator/iterator.h +44 -0
  841. cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +847 -0
  842. cuda/cccl/headers/include/cuda/std/__iterator/mergeable.h +72 -0
  843. cuda/cccl/headers/include/cuda/std/__iterator/move_iterator.h +432 -0
  844. cuda/cccl/headers/include/cuda/std/__iterator/move_sentinel.h +73 -0
  845. cuda/cccl/headers/include/cuda/std/__iterator/next.h +101 -0
  846. cuda/cccl/headers/include/cuda/std/__iterator/ostream_iterator.h +95 -0
  847. cuda/cccl/headers/include/cuda/std/__iterator/ostreambuf_iterator.h +100 -0
  848. cuda/cccl/headers/include/cuda/std/__iterator/permutable.h +54 -0
  849. cuda/cccl/headers/include/cuda/std/__iterator/prev.h +90 -0
  850. cuda/cccl/headers/include/cuda/std/__iterator/projected.h +61 -0
  851. cuda/cccl/headers/include/cuda/std/__iterator/readable_traits.h +156 -0
  852. cuda/cccl/headers/include/cuda/std/__iterator/reverse_access.h +142 -0
  853. cuda/cccl/headers/include/cuda/std/__iterator/reverse_iterator.h +371 -0
  854. cuda/cccl/headers/include/cuda/std/__iterator/size.h +69 -0
  855. cuda/cccl/headers/include/cuda/std/__iterator/sortable.h +55 -0
  856. cuda/cccl/headers/include/cuda/std/__iterator/unreachable_sentinel.h +84 -0
  857. cuda/cccl/headers/include/cuda/std/__iterator/wrap_iter.h +245 -0
  858. cuda/cccl/headers/include/cuda/std/__latch/latch.h +88 -0
  859. cuda/cccl/headers/include/cuda/std/__limits/numeric_limits.h +617 -0
  860. cuda/cccl/headers/include/cuda/std/__limits/numeric_limits_ext.h +753 -0
  861. cuda/cccl/headers/include/cuda/std/__linalg/conj_if_needed.h +78 -0
  862. cuda/cccl/headers/include/cuda/std/__linalg/conjugate_transposed.h +54 -0
  863. cuda/cccl/headers/include/cuda/std/__linalg/conjugated.h +139 -0
  864. cuda/cccl/headers/include/cuda/std/__linalg/scaled.h +132 -0
  865. cuda/cccl/headers/include/cuda/std/__linalg/transposed.h +321 -0
  866. cuda/cccl/headers/include/cuda/std/__mdspan/aligned_accessor.h +97 -0
  867. cuda/cccl/headers/include/cuda/std/__mdspan/concepts.h +139 -0
  868. cuda/cccl/headers/include/cuda/std/__mdspan/default_accessor.h +73 -0
  869. cuda/cccl/headers/include/cuda/std/__mdspan/empty_base.h +352 -0
  870. cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +759 -0
  871. cuda/cccl/headers/include/cuda/std/__mdspan/layout_left.h +314 -0
  872. cuda/cccl/headers/include/cuda/std/__mdspan/layout_right.h +307 -0
  873. cuda/cccl/headers/include/cuda/std/__mdspan/layout_stride.h +605 -0
  874. cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +512 -0
  875. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_extents.h +193 -0
  876. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_helper.h +189 -0
  877. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_mapping.h +344 -0
  878. cuda/cccl/headers/include/cuda/std/__memory/addressof.h +67 -0
  879. cuda/cccl/headers/include/cuda/std/__memory/align.h +67 -0
  880. cuda/cccl/headers/include/cuda/std/__memory/allocate_at_least.h +81 -0
  881. cuda/cccl/headers/include/cuda/std/__memory/allocation_guard.h +100 -0
  882. cuda/cccl/headers/include/cuda/std/__memory/allocator.h +320 -0
  883. cuda/cccl/headers/include/cuda/std/__memory/allocator_arg_t.h +84 -0
  884. cuda/cccl/headers/include/cuda/std/__memory/allocator_destructor.h +59 -0
  885. cuda/cccl/headers/include/cuda/std/__memory/allocator_traits.h +525 -0
  886. cuda/cccl/headers/include/cuda/std/__memory/assume_aligned.h +60 -0
  887. cuda/cccl/headers/include/cuda/std/__memory/builtin_new_allocator.h +87 -0
  888. cuda/cccl/headers/include/cuda/std/__memory/compressed_pair.h +225 -0
  889. cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +246 -0
  890. cuda/cccl/headers/include/cuda/std/__memory/destruct_n.h +91 -0
  891. cuda/cccl/headers/include/cuda/std/__memory/is_sufficiently_aligned.h +46 -0
  892. cuda/cccl/headers/include/cuda/std/__memory/pointer_traits.h +246 -0
  893. cuda/cccl/headers/include/cuda/std/__memory/runtime_assume_aligned.h +62 -0
  894. cuda/cccl/headers/include/cuda/std/__memory/temporary_buffer.h +92 -0
  895. cuda/cccl/headers/include/cuda/std/__memory/uninitialized_algorithms.h +678 -0
  896. cuda/cccl/headers/include/cuda/std/__memory/unique_ptr.h +765 -0
  897. cuda/cccl/headers/include/cuda/std/__memory/uses_allocator.h +54 -0
  898. cuda/cccl/headers/include/cuda/std/__memory/voidify.h +41 -0
  899. cuda/cccl/headers/include/cuda/std/__memory_ +34 -0
  900. cuda/cccl/headers/include/cuda/std/__new/allocate.h +126 -0
  901. cuda/cccl/headers/include/cuda/std/__new/bad_alloc.h +57 -0
  902. cuda/cccl/headers/include/cuda/std/__new/launder.h +53 -0
  903. cuda/cccl/headers/include/cuda/std/__new_ +29 -0
  904. cuda/cccl/headers/include/cuda/std/__numeric/accumulate.h +56 -0
  905. cuda/cccl/headers/include/cuda/std/__numeric/adjacent_difference.h +72 -0
  906. cuda/cccl/headers/include/cuda/std/__numeric/exclusive_scan.h +66 -0
  907. cuda/cccl/headers/include/cuda/std/__numeric/gcd_lcm.h +78 -0
  908. cuda/cccl/headers/include/cuda/std/__numeric/inclusive_scan.h +73 -0
  909. cuda/cccl/headers/include/cuda/std/__numeric/inner_product.h +62 -0
  910. cuda/cccl/headers/include/cuda/std/__numeric/iota.h +42 -0
  911. cuda/cccl/headers/include/cuda/std/__numeric/midpoint.h +97 -0
  912. cuda/cccl/headers/include/cuda/std/__numeric/partial_sum.h +69 -0
  913. cuda/cccl/headers/include/cuda/std/__numeric/reduce.h +60 -0
  914. cuda/cccl/headers/include/cuda/std/__numeric/transform_exclusive_scan.h +51 -0
  915. cuda/cccl/headers/include/cuda/std/__numeric/transform_inclusive_scan.h +65 -0
  916. cuda/cccl/headers/include/cuda/std/__numeric/transform_reduce.h +72 -0
  917. cuda/cccl/headers/include/cuda/std/__optional/bad_optional_access.h +74 -0
  918. cuda/cccl/headers/include/cuda/std/__optional/hash.h +53 -0
  919. cuda/cccl/headers/include/cuda/std/__optional/make_optional.h +61 -0
  920. cuda/cccl/headers/include/cuda/std/__optional/nullopt.h +43 -0
  921. cuda/cccl/headers/include/cuda/std/__optional/optional.h +859 -0
  922. cuda/cccl/headers/include/cuda/std/__optional/optional_base.h +433 -0
  923. cuda/cccl/headers/include/cuda/std/__optional/optional_ref.h +324 -0
  924. cuda/cccl/headers/include/cuda/std/__random/generate_canonical.h +56 -0
  925. cuda/cccl/headers/include/cuda/std/__random/is_seed_sequence.h +39 -0
  926. cuda/cccl/headers/include/cuda/std/__random/is_valid.h +106 -0
  927. cuda/cccl/headers/include/cuda/std/__random/linear_congruential_engine.h +398 -0
  928. cuda/cccl/headers/include/cuda/std/__random/uniform_int_distribution.h +335 -0
  929. cuda/cccl/headers/include/cuda/std/__random/uniform_real_distribution.h +183 -0
  930. cuda/cccl/headers/include/cuda/std/__random_ +29 -0
  931. cuda/cccl/headers/include/cuda/std/__ranges/access.h +303 -0
  932. cuda/cccl/headers/include/cuda/std/__ranges/all.h +98 -0
  933. cuda/cccl/headers/include/cuda/std/__ranges/compressed_movable_box.h +892 -0
  934. cuda/cccl/headers/include/cuda/std/__ranges/concepts.h +302 -0
  935. cuda/cccl/headers/include/cuda/std/__ranges/counted.h +90 -0
  936. cuda/cccl/headers/include/cuda/std/__ranges/dangling.h +54 -0
  937. cuda/cccl/headers/include/cuda/std/__ranges/data.h +136 -0
  938. cuda/cccl/headers/include/cuda/std/__ranges/empty.h +109 -0
  939. cuda/cccl/headers/include/cuda/std/__ranges/empty_view.h +77 -0
  940. cuda/cccl/headers/include/cuda/std/__ranges/enable_borrowed_range.h +41 -0
  941. cuda/cccl/headers/include/cuda/std/__ranges/enable_view.h +78 -0
  942. cuda/cccl/headers/include/cuda/std/__ranges/from_range.h +36 -0
  943. cuda/cccl/headers/include/cuda/std/__ranges/iota_view.h +266 -0
  944. cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +410 -0
  945. cuda/cccl/headers/include/cuda/std/__ranges/owning_view.h +162 -0
  946. cuda/cccl/headers/include/cuda/std/__ranges/range_adaptor.h +110 -0
  947. cuda/cccl/headers/include/cuda/std/__ranges/rbegin.h +175 -0
  948. cuda/cccl/headers/include/cuda/std/__ranges/ref_view.h +121 -0
  949. cuda/cccl/headers/include/cuda/std/__ranges/rend.h +182 -0
  950. cuda/cccl/headers/include/cuda/std/__ranges/repeat_view.h +345 -0
  951. cuda/cccl/headers/include/cuda/std/__ranges/single_view.h +155 -0
  952. cuda/cccl/headers/include/cuda/std/__ranges/size.h +201 -0
  953. cuda/cccl/headers/include/cuda/std/__ranges/subrange.h +513 -0
  954. cuda/cccl/headers/include/cuda/std/__ranges/take_view.h +476 -0
  955. cuda/cccl/headers/include/cuda/std/__ranges/take_while_view.h +259 -0
  956. cuda/cccl/headers/include/cuda/std/__ranges/transform_view.h +522 -0
  957. cuda/cccl/headers/include/cuda/std/__ranges/unwrap_end.h +53 -0
  958. cuda/cccl/headers/include/cuda/std/__ranges/view_interface.h +183 -0
  959. cuda/cccl/headers/include/cuda/std/__ranges/views.h +38 -0
  960. cuda/cccl/headers/include/cuda/std/__semaphore/atomic_semaphore.h +234 -0
  961. cuda/cccl/headers/include/cuda/std/__semaphore/counting_semaphore.h +51 -0
  962. cuda/cccl/headers/include/cuda/std/__string/char_traits.h +191 -0
  963. cuda/cccl/headers/include/cuda/std/__string/constexpr_c_functions.h +581 -0
  964. cuda/cccl/headers/include/cuda/std/__string/helper_functions.h +296 -0
  965. cuda/cccl/headers/include/cuda/std/__string/string_view.h +244 -0
  966. cuda/cccl/headers/include/cuda/std/__string_ +29 -0
  967. cuda/cccl/headers/include/cuda/std/__system_error/errc.h +51 -0
  968. cuda/cccl/headers/include/cuda/std/__system_error_ +26 -0
  969. cuda/cccl/headers/include/cuda/std/__thread/threading_support.h +106 -0
  970. cuda/cccl/headers/include/cuda/std/__thread/threading_support_cuda.h +47 -0
  971. cuda/cccl/headers/include/cuda/std/__thread/threading_support_external.h +41 -0
  972. cuda/cccl/headers/include/cuda/std/__thread/threading_support_pthread.h +143 -0
  973. cuda/cccl/headers/include/cuda/std/__thread/threading_support_win32.h +87 -0
  974. cuda/cccl/headers/include/cuda/std/__tuple_dir/ignore.h +51 -0
  975. cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +120 -0
  976. cuda/cccl/headers/include/cuda/std/__tuple_dir/sfinae_helpers.h +260 -0
  977. cuda/cccl/headers/include/cuda/std/__tuple_dir/structured_bindings.h +212 -0
  978. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_element.h +70 -0
  979. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_indices.h +44 -0
  980. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +84 -0
  981. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +68 -0
  982. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_size.h +79 -0
  983. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_types.h +35 -0
  984. cuda/cccl/headers/include/cuda/std/__tuple_dir/vector_types.h +290 -0
  985. cuda/cccl/headers/include/cuda/std/__type_traits/add_const.h +40 -0
  986. cuda/cccl/headers/include/cuda/std/__type_traits/add_cv.h +40 -0
  987. cuda/cccl/headers/include/cuda/std/__type_traits/add_lvalue_reference.h +62 -0
  988. cuda/cccl/headers/include/cuda/std/__type_traits/add_pointer.h +65 -0
  989. cuda/cccl/headers/include/cuda/std/__type_traits/add_rvalue_reference.h +62 -0
  990. cuda/cccl/headers/include/cuda/std/__type_traits/add_volatile.h +40 -0
  991. cuda/cccl/headers/include/cuda/std/__type_traits/aligned_storage.h +149 -0
  992. cuda/cccl/headers/include/cuda/std/__type_traits/aligned_union.h +62 -0
  993. cuda/cccl/headers/include/cuda/std/__type_traits/alignment_of.h +41 -0
  994. cuda/cccl/headers/include/cuda/std/__type_traits/always_false.h +35 -0
  995. cuda/cccl/headers/include/cuda/std/__type_traits/can_extract_key.h +68 -0
  996. cuda/cccl/headers/include/cuda/std/__type_traits/common_reference.h +262 -0
  997. cuda/cccl/headers/include/cuda/std/__type_traits/common_type.h +173 -0
  998. cuda/cccl/headers/include/cuda/std/__type_traits/conditional.h +65 -0
  999. cuda/cccl/headers/include/cuda/std/__type_traits/conjunction.h +67 -0
  1000. cuda/cccl/headers/include/cuda/std/__type_traits/copy_cv.h +50 -0
  1001. cuda/cccl/headers/include/cuda/std/__type_traits/copy_cvref.h +148 -0
  1002. cuda/cccl/headers/include/cuda/std/__type_traits/decay.h +83 -0
  1003. cuda/cccl/headers/include/cuda/std/__type_traits/dependent_type.h +35 -0
  1004. cuda/cccl/headers/include/cuda/std/__type_traits/disjunction.h +77 -0
  1005. cuda/cccl/headers/include/cuda/std/__type_traits/enable_if.h +43 -0
  1006. cuda/cccl/headers/include/cuda/std/__type_traits/extent.h +68 -0
  1007. cuda/cccl/headers/include/cuda/std/__type_traits/fold.h +47 -0
  1008. cuda/cccl/headers/include/cuda/std/__type_traits/has_unique_object_representation.h +46 -0
  1009. cuda/cccl/headers/include/cuda/std/__type_traits/has_virtual_destructor.h +42 -0
  1010. cuda/cccl/headers/include/cuda/std/__type_traits/integral_constant.h +62 -0
  1011. cuda/cccl/headers/include/cuda/std/__type_traits/is_abstract.h +42 -0
  1012. cuda/cccl/headers/include/cuda/std/__type_traits/is_aggregate.h +42 -0
  1013. cuda/cccl/headers/include/cuda/std/__type_traits/is_allocator.h +46 -0
  1014. cuda/cccl/headers/include/cuda/std/__type_traits/is_arithmetic.h +42 -0
  1015. cuda/cccl/headers/include/cuda/std/__type_traits/is_array.h +62 -0
  1016. cuda/cccl/headers/include/cuda/std/__type_traits/is_assignable.h +78 -0
  1017. cuda/cccl/headers/include/cuda/std/__type_traits/is_base_of.h +42 -0
  1018. cuda/cccl/headers/include/cuda/std/__type_traits/is_bounded_array.h +44 -0
  1019. cuda/cccl/headers/include/cuda/std/__type_traits/is_callable.h +60 -0
  1020. cuda/cccl/headers/include/cuda/std/__type_traits/is_char_like_type.h +38 -0
  1021. cuda/cccl/headers/include/cuda/std/__type_traits/is_class.h +42 -0
  1022. cuda/cccl/headers/include/cuda/std/__type_traits/is_compound.h +58 -0
  1023. cuda/cccl/headers/include/cuda/std/__type_traits/is_const.h +56 -0
  1024. cuda/cccl/headers/include/cuda/std/__type_traits/is_constant_evaluated.h +51 -0
  1025. cuda/cccl/headers/include/cuda/std/__type_traits/is_constructible.h +174 -0
  1026. cuda/cccl/headers/include/cuda/std/__type_traits/is_convertible.h +211 -0
  1027. cuda/cccl/headers/include/cuda/std/__type_traits/is_copy_assignable.h +43 -0
  1028. cuda/cccl/headers/include/cuda/std/__type_traits/is_copy_constructible.h +43 -0
  1029. cuda/cccl/headers/include/cuda/std/__type_traits/is_core_convertible.h +47 -0
  1030. cuda/cccl/headers/include/cuda/std/__type_traits/is_corresponding_member.h +42 -0
  1031. cuda/cccl/headers/include/cuda/std/__type_traits/is_default_constructible.h +40 -0
  1032. cuda/cccl/headers/include/cuda/std/__type_traits/is_destructible.h +115 -0
  1033. cuda/cccl/headers/include/cuda/std/__type_traits/is_empty.h +42 -0
  1034. cuda/cccl/headers/include/cuda/std/__type_traits/is_enum.h +42 -0
  1035. cuda/cccl/headers/include/cuda/std/__type_traits/is_execution_policy.h +81 -0
  1036. cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_arithmetic.h +38 -0
  1037. cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_floating_point.h +79 -0
  1038. cuda/cccl/headers/include/cuda/std/__type_traits/is_final.h +42 -0
  1039. cuda/cccl/headers/include/cuda/std/__type_traits/is_floating_point.h +53 -0
  1040. cuda/cccl/headers/include/cuda/std/__type_traits/is_function.h +61 -0
  1041. cuda/cccl/headers/include/cuda/std/__type_traits/is_fundamental.h +56 -0
  1042. cuda/cccl/headers/include/cuda/std/__type_traits/is_implicitly_default_constructible.h +57 -0
  1043. cuda/cccl/headers/include/cuda/std/__type_traits/is_integer.h +45 -0
  1044. cuda/cccl/headers/include/cuda/std/__type_traits/is_integral.h +123 -0
  1045. cuda/cccl/headers/include/cuda/std/__type_traits/is_layout_compatible.h +45 -0
  1046. cuda/cccl/headers/include/cuda/std/__type_traits/is_literal_type.h +42 -0
  1047. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_function_pointer.h +79 -0
  1048. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_object_pointer.h +57 -0
  1049. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_pointer.h +57 -0
  1050. cuda/cccl/headers/include/cuda/std/__type_traits/is_move_assignable.h +43 -0
  1051. cuda/cccl/headers/include/cuda/std/__type_traits/is_move_constructible.h +42 -0
  1052. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_assignable.h +70 -0
  1053. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_constructible.h +84 -0
  1054. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_convertible.h +59 -0
  1055. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_copy_assignable.h +60 -0
  1056. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_copy_constructible.h +43 -0
  1057. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_default_constructible.h +54 -0
  1058. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_destructible.h +82 -0
  1059. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_move_assignable.h +60 -0
  1060. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_move_constructible.h +42 -0
  1061. cuda/cccl/headers/include/cuda/std/__type_traits/is_null_pointer.h +43 -0
  1062. cuda/cccl/headers/include/cuda/std/__type_traits/is_object.h +57 -0
  1063. cuda/cccl/headers/include/cuda/std/__type_traits/is_one_of.h +37 -0
  1064. cuda/cccl/headers/include/cuda/std/__type_traits/is_pod.h +42 -0
  1065. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer.h +60 -0
  1066. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer_interconvertible_base_of.h +84 -0
  1067. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer_interconvertible_with_class.h +42 -0
  1068. cuda/cccl/headers/include/cuda/std/__type_traits/is_polymorphic.h +42 -0
  1069. cuda/cccl/headers/include/cuda/std/__type_traits/is_primary_template.h +121 -0
  1070. cuda/cccl/headers/include/cuda/std/__type_traits/is_reference.h +95 -0
  1071. cuda/cccl/headers/include/cuda/std/__type_traits/is_reference_wrapper.h +50 -0
  1072. cuda/cccl/headers/include/cuda/std/__type_traits/is_referenceable.h +55 -0
  1073. cuda/cccl/headers/include/cuda/std/__type_traits/is_same.h +88 -0
  1074. cuda/cccl/headers/include/cuda/std/__type_traits/is_scalar.h +60 -0
  1075. cuda/cccl/headers/include/cuda/std/__type_traits/is_scoped_enum.h +49 -0
  1076. cuda/cccl/headers/include/cuda/std/__type_traits/is_signed.h +65 -0
  1077. cuda/cccl/headers/include/cuda/std/__type_traits/is_signed_integer.h +59 -0
  1078. cuda/cccl/headers/include/cuda/std/__type_traits/is_standard_layout.h +42 -0
  1079. cuda/cccl/headers/include/cuda/std/__type_traits/is_swappable.h +202 -0
  1080. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivial.h +42 -0
  1081. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_assignable.h +43 -0
  1082. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_constructible.h +43 -0
  1083. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_assignable.h +46 -0
  1084. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_constructible.h +45 -0
  1085. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copyable.h +42 -0
  1086. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_default_constructible.h +42 -0
  1087. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_destructible.h +58 -0
  1088. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_assignable.h +45 -0
  1089. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_constructible.h +44 -0
  1090. cuda/cccl/headers/include/cuda/std/__type_traits/is_unbounded_array.h +43 -0
  1091. cuda/cccl/headers/include/cuda/std/__type_traits/is_union.h +42 -0
  1092. cuda/cccl/headers/include/cuda/std/__type_traits/is_unsigned.h +66 -0
  1093. cuda/cccl/headers/include/cuda/std/__type_traits/is_unsigned_integer.h +59 -0
  1094. cuda/cccl/headers/include/cuda/std/__type_traits/is_valid_expansion.h +41 -0
  1095. cuda/cccl/headers/include/cuda/std/__type_traits/is_void.h +55 -0
  1096. cuda/cccl/headers/include/cuda/std/__type_traits/is_volatile.h +56 -0
  1097. cuda/cccl/headers/include/cuda/std/__type_traits/lazy.h +35 -0
  1098. cuda/cccl/headers/include/cuda/std/__type_traits/make_const_lvalue_ref.h +36 -0
  1099. cuda/cccl/headers/include/cuda/std/__type_traits/make_nbit_int.h +107 -0
  1100. cuda/cccl/headers/include/cuda/std/__type_traits/make_signed.h +140 -0
  1101. cuda/cccl/headers/include/cuda/std/__type_traits/make_unsigned.h +151 -0
  1102. cuda/cccl/headers/include/cuda/std/__type_traits/maybe_const.h +36 -0
  1103. cuda/cccl/headers/include/cuda/std/__type_traits/nat.h +39 -0
  1104. cuda/cccl/headers/include/cuda/std/__type_traits/negation.h +44 -0
  1105. cuda/cccl/headers/include/cuda/std/__type_traits/num_bits.h +122 -0
  1106. cuda/cccl/headers/include/cuda/std/__type_traits/promote.h +163 -0
  1107. cuda/cccl/headers/include/cuda/std/__type_traits/rank.h +60 -0
  1108. cuda/cccl/headers/include/cuda/std/__type_traits/reference_constructs_from_temporary.h +57 -0
  1109. cuda/cccl/headers/include/cuda/std/__type_traits/reference_converts_from_temporary.h +56 -0
  1110. cuda/cccl/headers/include/cuda/std/__type_traits/remove_all_extents.h +66 -0
  1111. cuda/cccl/headers/include/cuda/std/__type_traits/remove_const.h +59 -0
  1112. cuda/cccl/headers/include/cuda/std/__type_traits/remove_const_ref.h +37 -0
  1113. cuda/cccl/headers/include/cuda/std/__type_traits/remove_cv.h +57 -0
  1114. cuda/cccl/headers/include/cuda/std/__type_traits/remove_cvref.h +57 -0
  1115. cuda/cccl/headers/include/cuda/std/__type_traits/remove_extent.h +65 -0
  1116. cuda/cccl/headers/include/cuda/std/__type_traits/remove_pointer.h +73 -0
  1117. cuda/cccl/headers/include/cuda/std/__type_traits/remove_reference.h +72 -0
  1118. cuda/cccl/headers/include/cuda/std/__type_traits/remove_volatile.h +58 -0
  1119. cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +47 -0
  1120. cuda/cccl/headers/include/cuda/std/__type_traits/type_identity.h +40 -0
  1121. cuda/cccl/headers/include/cuda/std/__type_traits/type_list.h +1067 -0
  1122. cuda/cccl/headers/include/cuda/std/__type_traits/type_set.h +131 -0
  1123. cuda/cccl/headers/include/cuda/std/__type_traits/underlying_type.h +52 -0
  1124. cuda/cccl/headers/include/cuda/std/__type_traits/void_t.h +34 -0
  1125. cuda/cccl/headers/include/cuda/std/__utility/as_const.h +52 -0
  1126. cuda/cccl/headers/include/cuda/std/__utility/auto_cast.h +34 -0
  1127. cuda/cccl/headers/include/cuda/std/__utility/cmp.h +116 -0
  1128. cuda/cccl/headers/include/cuda/std/__utility/convert_to_integral.h +101 -0
  1129. cuda/cccl/headers/include/cuda/std/__utility/declval.h +76 -0
  1130. cuda/cccl/headers/include/cuda/std/__utility/exception_guard.h +161 -0
  1131. cuda/cccl/headers/include/cuda/std/__utility/exchange.h +46 -0
  1132. cuda/cccl/headers/include/cuda/std/__utility/forward.h +59 -0
  1133. cuda/cccl/headers/include/cuda/std/__utility/forward_like.h +55 -0
  1134. cuda/cccl/headers/include/cuda/std/__utility/in_place.h +86 -0
  1135. cuda/cccl/headers/include/cuda/std/__utility/integer_sequence.h +251 -0
  1136. cuda/cccl/headers/include/cuda/std/__utility/monostate.h +99 -0
  1137. cuda/cccl/headers/include/cuda/std/__utility/move.h +74 -0
  1138. cuda/cccl/headers/include/cuda/std/__utility/pair.h +791 -0
  1139. cuda/cccl/headers/include/cuda/std/__utility/piecewise_construct.h +37 -0
  1140. cuda/cccl/headers/include/cuda/std/__utility/pod_tuple.h +527 -0
  1141. cuda/cccl/headers/include/cuda/std/__utility/priority_tag.h +40 -0
  1142. cuda/cccl/headers/include/cuda/std/__utility/rel_ops.h +63 -0
  1143. cuda/cccl/headers/include/cuda/std/__utility/swap.h +64 -0
  1144. cuda/cccl/headers/include/cuda/std/__utility/to_underlying.h +40 -0
  1145. cuda/cccl/headers/include/cuda/std/__utility/typeid.h +421 -0
  1146. cuda/cccl/headers/include/cuda/std/__utility/undefined.h +34 -0
  1147. cuda/cccl/headers/include/cuda/std/__utility/unreachable.h +37 -0
  1148. cuda/cccl/headers/include/cuda/std/array +518 -0
  1149. cuda/cccl/headers/include/cuda/std/atomic +810 -0
  1150. cuda/cccl/headers/include/cuda/std/barrier +42 -0
  1151. cuda/cccl/headers/include/cuda/std/bit +35 -0
  1152. cuda/cccl/headers/include/cuda/std/bitset +994 -0
  1153. cuda/cccl/headers/include/cuda/std/cassert +28 -0
  1154. cuda/cccl/headers/include/cuda/std/ccomplex +15 -0
  1155. cuda/cccl/headers/include/cuda/std/cfloat +59 -0
  1156. cuda/cccl/headers/include/cuda/std/chrono +26 -0
  1157. cuda/cccl/headers/include/cuda/std/climits +61 -0
  1158. cuda/cccl/headers/include/cuda/std/cmath +87 -0
  1159. cuda/cccl/headers/include/cuda/std/complex +50 -0
  1160. cuda/cccl/headers/include/cuda/std/concepts +48 -0
  1161. cuda/cccl/headers/include/cuda/std/cstddef +28 -0
  1162. cuda/cccl/headers/include/cuda/std/cstdint +178 -0
  1163. cuda/cccl/headers/include/cuda/std/cstdlib +30 -0
  1164. cuda/cccl/headers/include/cuda/std/cstring +110 -0
  1165. cuda/cccl/headers/include/cuda/std/ctime +154 -0
  1166. cuda/cccl/headers/include/cuda/std/detail/__config +45 -0
  1167. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +207 -0
  1168. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/algorithm +1721 -0
  1169. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/chrono +2509 -0
  1170. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/iosfwd +128 -0
  1171. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/stdexcept +120 -0
  1172. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/tuple +1365 -0
  1173. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/variant +2144 -0
  1174. cuda/cccl/headers/include/cuda/std/execution +29 -0
  1175. cuda/cccl/headers/include/cuda/std/expected +30 -0
  1176. cuda/cccl/headers/include/cuda/std/functional +56 -0
  1177. cuda/cccl/headers/include/cuda/std/initializer_list +44 -0
  1178. cuda/cccl/headers/include/cuda/std/inplace_vector +2170 -0
  1179. cuda/cccl/headers/include/cuda/std/iterator +70 -0
  1180. cuda/cccl/headers/include/cuda/std/latch +34 -0
  1181. cuda/cccl/headers/include/cuda/std/limits +28 -0
  1182. cuda/cccl/headers/include/cuda/std/linalg +30 -0
  1183. cuda/cccl/headers/include/cuda/std/mdspan +38 -0
  1184. cuda/cccl/headers/include/cuda/std/memory +39 -0
  1185. cuda/cccl/headers/include/cuda/std/numbers +346 -0
  1186. cuda/cccl/headers/include/cuda/std/numeric +41 -0
  1187. cuda/cccl/headers/include/cuda/std/optional +31 -0
  1188. cuda/cccl/headers/include/cuda/std/ranges +69 -0
  1189. cuda/cccl/headers/include/cuda/std/ratio +416 -0
  1190. cuda/cccl/headers/include/cuda/std/semaphore +31 -0
  1191. cuda/cccl/headers/include/cuda/std/source_location +83 -0
  1192. cuda/cccl/headers/include/cuda/std/span +628 -0
  1193. cuda/cccl/headers/include/cuda/std/string_view +925 -0
  1194. cuda/cccl/headers/include/cuda/std/tuple +26 -0
  1195. cuda/cccl/headers/include/cuda/std/type_traits +177 -0
  1196. cuda/cccl/headers/include/cuda/std/utility +70 -0
  1197. cuda/cccl/headers/include/cuda/std/variant +25 -0
  1198. cuda/cccl/headers/include/cuda/std/version +240 -0
  1199. cuda/cccl/headers/include/cuda/stream +31 -0
  1200. cuda/cccl/headers/include/cuda/stream_ref +59 -0
  1201. cuda/cccl/headers/include/cuda/type_traits +27 -0
  1202. cuda/cccl/headers/include/cuda/utility +28 -0
  1203. cuda/cccl/headers/include/cuda/version +16 -0
  1204. cuda/cccl/headers/include/cuda/warp +28 -0
  1205. cuda/cccl/headers/include/cuda/work_stealing +26 -0
  1206. cuda/cccl/headers/include/nv/detail/__preprocessor +169 -0
  1207. cuda/cccl/headers/include/nv/detail/__target_macros +718 -0
  1208. cuda/cccl/headers/include/nv/target +240 -0
  1209. cuda/cccl/headers/include/thrust/addressof.h +22 -0
  1210. cuda/cccl/headers/include/thrust/adjacent_difference.h +254 -0
  1211. cuda/cccl/headers/include/thrust/advance.h +57 -0
  1212. cuda/cccl/headers/include/thrust/allocate_unique.h +299 -0
  1213. cuda/cccl/headers/include/thrust/binary_search.h +1910 -0
  1214. cuda/cccl/headers/include/thrust/complex.h +858 -0
  1215. cuda/cccl/headers/include/thrust/copy.h +506 -0
  1216. cuda/cccl/headers/include/thrust/count.h +245 -0
  1217. cuda/cccl/headers/include/thrust/detail/adjacent_difference.inl +95 -0
  1218. cuda/cccl/headers/include/thrust/detail/alignment.h +81 -0
  1219. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +626 -0
  1220. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +192 -0
  1221. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +96 -0
  1222. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +81 -0
  1223. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +78 -0
  1224. cuda/cccl/headers/include/thrust/detail/allocator/no_throw_allocator.h +76 -0
  1225. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +115 -0
  1226. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +116 -0
  1227. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.h +77 -0
  1228. cuda/cccl/headers/include/thrust/detail/allocator_aware_execution_policy.h +99 -0
  1229. cuda/cccl/headers/include/thrust/detail/binary_search.inl +525 -0
  1230. cuda/cccl/headers/include/thrust/detail/caching_allocator.h +47 -0
  1231. cuda/cccl/headers/include/thrust/detail/complex/arithmetic.h +255 -0
  1232. cuda/cccl/headers/include/thrust/detail/complex/c99math.h +64 -0
  1233. cuda/cccl/headers/include/thrust/detail/complex/catrig.h +875 -0
  1234. cuda/cccl/headers/include/thrust/detail/complex/catrigf.h +589 -0
  1235. cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +233 -0
  1236. cuda/cccl/headers/include/thrust/detail/complex/ccoshf.h +161 -0
  1237. cuda/cccl/headers/include/thrust/detail/complex/cexp.h +195 -0
  1238. cuda/cccl/headers/include/thrust/detail/complex/cexpf.h +173 -0
  1239. cuda/cccl/headers/include/thrust/detail/complex/clog.h +223 -0
  1240. cuda/cccl/headers/include/thrust/detail/complex/clogf.h +210 -0
  1241. cuda/cccl/headers/include/thrust/detail/complex/complex.inl +263 -0
  1242. cuda/cccl/headers/include/thrust/detail/complex/cpow.h +50 -0
  1243. cuda/cccl/headers/include/thrust/detail/complex/cproj.h +81 -0
  1244. cuda/cccl/headers/include/thrust/detail/complex/csinh.h +228 -0
  1245. cuda/cccl/headers/include/thrust/detail/complex/csinhf.h +168 -0
  1246. cuda/cccl/headers/include/thrust/detail/complex/csqrt.h +178 -0
  1247. cuda/cccl/headers/include/thrust/detail/complex/csqrtf.h +174 -0
  1248. cuda/cccl/headers/include/thrust/detail/complex/ctanh.h +208 -0
  1249. cuda/cccl/headers/include/thrust/detail/complex/ctanhf.h +133 -0
  1250. cuda/cccl/headers/include/thrust/detail/complex/math_private.h +138 -0
  1251. cuda/cccl/headers/include/thrust/detail/complex/stream.h +73 -0
  1252. cuda/cccl/headers/include/thrust/detail/config/compiler.h +38 -0
  1253. cuda/cccl/headers/include/thrust/detail/config/config.h +43 -0
  1254. cuda/cccl/headers/include/thrust/detail/config/cpp_dialect.h +78 -0
  1255. cuda/cccl/headers/include/thrust/detail/config/device_system.h +55 -0
  1256. cuda/cccl/headers/include/thrust/detail/config/host_system.h +48 -0
  1257. cuda/cccl/headers/include/thrust/detail/config/memory_resource.h +41 -0
  1258. cuda/cccl/headers/include/thrust/detail/config/namespace.h +162 -0
  1259. cuda/cccl/headers/include/thrust/detail/config/simple_defines.h +48 -0
  1260. cuda/cccl/headers/include/thrust/detail/config.h +36 -0
  1261. cuda/cccl/headers/include/thrust/detail/contiguous_storage.h +228 -0
  1262. cuda/cccl/headers/include/thrust/detail/contiguous_storage.inl +273 -0
  1263. cuda/cccl/headers/include/thrust/detail/copy.h +72 -0
  1264. cuda/cccl/headers/include/thrust/detail/copy.inl +129 -0
  1265. cuda/cccl/headers/include/thrust/detail/copy_if.h +62 -0
  1266. cuda/cccl/headers/include/thrust/detail/copy_if.inl +102 -0
  1267. cuda/cccl/headers/include/thrust/detail/count.h +55 -0
  1268. cuda/cccl/headers/include/thrust/detail/count.inl +89 -0
  1269. cuda/cccl/headers/include/thrust/detail/device_ptr.inl +48 -0
  1270. cuda/cccl/headers/include/thrust/detail/equal.inl +93 -0
  1271. cuda/cccl/headers/include/thrust/detail/event_error.h +160 -0
  1272. cuda/cccl/headers/include/thrust/detail/execute_with_allocator.h +81 -0
  1273. cuda/cccl/headers/include/thrust/detail/execute_with_allocator_fwd.h +61 -0
  1274. cuda/cccl/headers/include/thrust/detail/execution_policy.h +120 -0
  1275. cuda/cccl/headers/include/thrust/detail/extrema.inl +184 -0
  1276. cuda/cccl/headers/include/thrust/detail/fill.inl +86 -0
  1277. cuda/cccl/headers/include/thrust/detail/find.inl +113 -0
  1278. cuda/cccl/headers/include/thrust/detail/for_each.inl +84 -0
  1279. cuda/cccl/headers/include/thrust/detail/function.h +49 -0
  1280. cuda/cccl/headers/include/thrust/detail/functional/actor.h +214 -0
  1281. cuda/cccl/headers/include/thrust/detail/functional/operators.h +386 -0
  1282. cuda/cccl/headers/include/thrust/detail/gather.inl +173 -0
  1283. cuda/cccl/headers/include/thrust/detail/generate.inl +86 -0
  1284. cuda/cccl/headers/include/thrust/detail/get_iterator_value.h +62 -0
  1285. cuda/cccl/headers/include/thrust/detail/inner_product.inl +118 -0
  1286. cuda/cccl/headers/include/thrust/detail/internal_functional.h +328 -0
  1287. cuda/cccl/headers/include/thrust/detail/logical.inl +113 -0
  1288. cuda/cccl/headers/include/thrust/detail/malloc_and_free.h +77 -0
  1289. cuda/cccl/headers/include/thrust/detail/malloc_and_free_fwd.h +45 -0
  1290. cuda/cccl/headers/include/thrust/detail/memory_algorithms.h +209 -0
  1291. cuda/cccl/headers/include/thrust/detail/merge.inl +276 -0
  1292. cuda/cccl/headers/include/thrust/detail/mismatch.inl +94 -0
  1293. cuda/cccl/headers/include/thrust/detail/overlapped_copy.h +124 -0
  1294. cuda/cccl/headers/include/thrust/detail/partition.inl +378 -0
  1295. cuda/cccl/headers/include/thrust/detail/pointer.h +309 -0
  1296. cuda/cccl/headers/include/thrust/detail/preprocessor.h +652 -0
  1297. cuda/cccl/headers/include/thrust/detail/random_bijection.h +177 -0
  1298. cuda/cccl/headers/include/thrust/detail/range/head_flags.h +116 -0
  1299. cuda/cccl/headers/include/thrust/detail/range/tail_flags.h +130 -0
  1300. cuda/cccl/headers/include/thrust/detail/raw_pointer_cast.h +52 -0
  1301. cuda/cccl/headers/include/thrust/detail/raw_reference_cast.h +192 -0
  1302. cuda/cccl/headers/include/thrust/detail/reduce.inl +377 -0
  1303. cuda/cccl/headers/include/thrust/detail/reference.h +494 -0
  1304. cuda/cccl/headers/include/thrust/detail/reference_forward_declaration.h +35 -0
  1305. cuda/cccl/headers/include/thrust/detail/remove.inl +213 -0
  1306. cuda/cccl/headers/include/thrust/detail/replace.inl +231 -0
  1307. cuda/cccl/headers/include/thrust/detail/reverse.inl +88 -0
  1308. cuda/cccl/headers/include/thrust/detail/scan.inl +518 -0
  1309. cuda/cccl/headers/include/thrust/detail/scatter.inl +157 -0
  1310. cuda/cccl/headers/include/thrust/detail/seq.h +66 -0
  1311. cuda/cccl/headers/include/thrust/detail/sequence.inl +109 -0
  1312. cuda/cccl/headers/include/thrust/detail/set_operations.inl +981 -0
  1313. cuda/cccl/headers/include/thrust/detail/shuffle.inl +86 -0
  1314. cuda/cccl/headers/include/thrust/detail/sort.inl +373 -0
  1315. cuda/cccl/headers/include/thrust/detail/static_assert.h +58 -0
  1316. cuda/cccl/headers/include/thrust/detail/static_map.h +167 -0
  1317. cuda/cccl/headers/include/thrust/detail/swap_ranges.inl +65 -0
  1318. cuda/cccl/headers/include/thrust/detail/tabulate.inl +62 -0
  1319. cuda/cccl/headers/include/thrust/detail/temporary_array.h +153 -0
  1320. cuda/cccl/headers/include/thrust/detail/temporary_array.inl +120 -0
  1321. cuda/cccl/headers/include/thrust/detail/temporary_buffer.h +81 -0
  1322. cuda/cccl/headers/include/thrust/detail/transform_reduce.inl +69 -0
  1323. cuda/cccl/headers/include/thrust/detail/transform_scan.inl +161 -0
  1324. cuda/cccl/headers/include/thrust/detail/trivial_sequence.h +130 -0
  1325. cuda/cccl/headers/include/thrust/detail/tuple_meta_transform.h +61 -0
  1326. cuda/cccl/headers/include/thrust/detail/type_deduction.h +62 -0
  1327. cuda/cccl/headers/include/thrust/detail/type_traits/has_member_function.h +47 -0
  1328. cuda/cccl/headers/include/thrust/detail/type_traits/has_nested_type.h +43 -0
  1329. cuda/cccl/headers/include/thrust/detail/type_traits/is_call_possible.h +167 -0
  1330. cuda/cccl/headers/include/thrust/detail/type_traits/is_commutative.h +69 -0
  1331. cuda/cccl/headers/include/thrust/detail/type_traits/is_metafunction_defined.h +39 -0
  1332. cuda/cccl/headers/include/thrust/detail/type_traits/is_thrust_pointer.h +59 -0
  1333. cuda/cccl/headers/include/thrust/detail/type_traits/iterator/is_output_iterator.h +46 -0
  1334. cuda/cccl/headers/include/thrust/detail/type_traits/minimum_type.h +89 -0
  1335. cuda/cccl/headers/include/thrust/detail/type_traits/pointer_traits.h +332 -0
  1336. cuda/cccl/headers/include/thrust/detail/type_traits.h +136 -0
  1337. cuda/cccl/headers/include/thrust/detail/uninitialized_copy.inl +90 -0
  1338. cuda/cccl/headers/include/thrust/detail/uninitialized_fill.inl +86 -0
  1339. cuda/cccl/headers/include/thrust/detail/unique.inl +373 -0
  1340. cuda/cccl/headers/include/thrust/detail/use_default.h +34 -0
  1341. cuda/cccl/headers/include/thrust/detail/vector_base.h +613 -0
  1342. cuda/cccl/headers/include/thrust/detail/vector_base.inl +1210 -0
  1343. cuda/cccl/headers/include/thrust/device_allocator.h +134 -0
  1344. cuda/cccl/headers/include/thrust/device_delete.h +74 -0
  1345. cuda/cccl/headers/include/thrust/device_free.h +85 -0
  1346. cuda/cccl/headers/include/thrust/device_make_unique.h +56 -0
  1347. cuda/cccl/headers/include/thrust/device_malloc.h +84 -0
  1348. cuda/cccl/headers/include/thrust/device_malloc_allocator.h +190 -0
  1349. cuda/cccl/headers/include/thrust/device_new.h +112 -0
  1350. cuda/cccl/headers/include/thrust/device_new_allocator.h +179 -0
  1351. cuda/cccl/headers/include/thrust/device_ptr.h +196 -0
  1352. cuda/cccl/headers/include/thrust/device_reference.h +983 -0
  1353. cuda/cccl/headers/include/thrust/device_vector.h +576 -0
  1354. cuda/cccl/headers/include/thrust/distance.h +43 -0
  1355. cuda/cccl/headers/include/thrust/equal.h +247 -0
  1356. cuda/cccl/headers/include/thrust/execution_policy.h +251 -0
  1357. cuda/cccl/headers/include/thrust/extrema.h +657 -0
  1358. cuda/cccl/headers/include/thrust/fill.h +200 -0
  1359. cuda/cccl/headers/include/thrust/find.h +382 -0
  1360. cuda/cccl/headers/include/thrust/for_each.h +261 -0
  1361. cuda/cccl/headers/include/thrust/functional.h +395 -0
  1362. cuda/cccl/headers/include/thrust/gather.h +464 -0
  1363. cuda/cccl/headers/include/thrust/generate.h +193 -0
  1364. cuda/cccl/headers/include/thrust/host_vector.h +576 -0
  1365. cuda/cccl/headers/include/thrust/inner_product.h +264 -0
  1366. cuda/cccl/headers/include/thrust/iterator/constant_iterator.h +221 -0
  1367. cuda/cccl/headers/include/thrust/iterator/counting_iterator.h +335 -0
  1368. cuda/cccl/headers/include/thrust/iterator/detail/any_assign.h +48 -0
  1369. cuda/cccl/headers/include/thrust/iterator/detail/any_system_tag.h +43 -0
  1370. cuda/cccl/headers/include/thrust/iterator/detail/device_system_tag.h +38 -0
  1371. cuda/cccl/headers/include/thrust/iterator/detail/host_system_tag.h +38 -0
  1372. cuda/cccl/headers/include/thrust/iterator/detail/iterator_adaptor_base.h +81 -0
  1373. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_system.h +60 -0
  1374. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_traversal.h +65 -0
  1375. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h +57 -0
  1376. cuda/cccl/headers/include/thrust/iterator/detail/iterator_facade_category.h +182 -0
  1377. cuda/cccl/headers/include/thrust/iterator/detail/minimum_system.h +58 -0
  1378. cuda/cccl/headers/include/thrust/iterator/detail/normal_iterator.h +69 -0
  1379. cuda/cccl/headers/include/thrust/iterator/detail/retag.h +104 -0
  1380. cuda/cccl/headers/include/thrust/iterator/detail/tagged_iterator.h +81 -0
  1381. cuda/cccl/headers/include/thrust/iterator/detail/tuple_of_iterator_references.h +174 -0
  1382. cuda/cccl/headers/include/thrust/iterator/discard_iterator.h +163 -0
  1383. cuda/cccl/headers/include/thrust/iterator/iterator_adaptor.h +251 -0
  1384. cuda/cccl/headers/include/thrust/iterator/iterator_categories.h +211 -0
  1385. cuda/cccl/headers/include/thrust/iterator/iterator_facade.h +659 -0
  1386. cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +334 -0
  1387. cuda/cccl/headers/include/thrust/iterator/iterator_traversal_tags.h +64 -0
  1388. cuda/cccl/headers/include/thrust/iterator/offset_iterator.h +194 -0
  1389. cuda/cccl/headers/include/thrust/iterator/permutation_iterator.h +204 -0
  1390. cuda/cccl/headers/include/thrust/iterator/retag.h +72 -0
  1391. cuda/cccl/headers/include/thrust/iterator/reverse_iterator.h +51 -0
  1392. cuda/cccl/headers/include/thrust/iterator/shuffle_iterator.h +185 -0
  1393. cuda/cccl/headers/include/thrust/iterator/strided_iterator.h +152 -0
  1394. cuda/cccl/headers/include/thrust/iterator/tabulate_output_iterator.h +152 -0
  1395. cuda/cccl/headers/include/thrust/iterator/transform_input_output_iterator.h +226 -0
  1396. cuda/cccl/headers/include/thrust/iterator/transform_iterator.h +351 -0
  1397. cuda/cccl/headers/include/thrust/iterator/transform_output_iterator.h +190 -0
  1398. cuda/cccl/headers/include/thrust/iterator/zip_iterator.h +359 -0
  1399. cuda/cccl/headers/include/thrust/logical.h +290 -0
  1400. cuda/cccl/headers/include/thrust/memory.h +299 -0
  1401. cuda/cccl/headers/include/thrust/merge.h +725 -0
  1402. cuda/cccl/headers/include/thrust/mismatch.h +261 -0
  1403. cuda/cccl/headers/include/thrust/mr/allocator.h +229 -0
  1404. cuda/cccl/headers/include/thrust/mr/device_memory_resource.h +41 -0
  1405. cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +528 -0
  1406. cuda/cccl/headers/include/thrust/mr/disjoint_sync_pool.h +118 -0
  1407. cuda/cccl/headers/include/thrust/mr/disjoint_tls_pool.h +67 -0
  1408. cuda/cccl/headers/include/thrust/mr/fancy_pointer_resource.h +67 -0
  1409. cuda/cccl/headers/include/thrust/mr/host_memory_resource.h +38 -0
  1410. cuda/cccl/headers/include/thrust/mr/memory_resource.h +217 -0
  1411. cuda/cccl/headers/include/thrust/mr/new.h +100 -0
  1412. cuda/cccl/headers/include/thrust/mr/polymorphic_adaptor.h +63 -0
  1413. cuda/cccl/headers/include/thrust/mr/pool.h +528 -0
  1414. cuda/cccl/headers/include/thrust/mr/pool_options.h +174 -0
  1415. cuda/cccl/headers/include/thrust/mr/sync_pool.h +114 -0
  1416. cuda/cccl/headers/include/thrust/mr/tls_pool.h +64 -0
  1417. cuda/cccl/headers/include/thrust/mr/universal_memory_resource.h +29 -0
  1418. cuda/cccl/headers/include/thrust/mr/validator.h +56 -0
  1419. cuda/cccl/headers/include/thrust/pair.h +99 -0
  1420. cuda/cccl/headers/include/thrust/partition.h +1391 -0
  1421. cuda/cccl/headers/include/thrust/per_device_resource.h +98 -0
  1422. cuda/cccl/headers/include/thrust/random/detail/discard_block_engine.inl +184 -0
  1423. cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine.inl +155 -0
  1424. cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine_discard.h +104 -0
  1425. cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine.inl +151 -0
  1426. cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h +53 -0
  1427. cuda/cccl/headers/include/thrust/random/detail/mod.h +101 -0
  1428. cuda/cccl/headers/include/thrust/random/detail/normal_distribution.inl +187 -0
  1429. cuda/cccl/headers/include/thrust/random/detail/normal_distribution_base.h +160 -0
  1430. cuda/cccl/headers/include/thrust/random/detail/random_core_access.h +63 -0
  1431. cuda/cccl/headers/include/thrust/random/detail/subtract_with_carry_engine.inl +201 -0
  1432. cuda/cccl/headers/include/thrust/random/detail/uniform_int_distribution.inl +198 -0
  1433. cuda/cccl/headers/include/thrust/random/detail/uniform_real_distribution.inl +200 -0
  1434. cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine.inl +183 -0
  1435. cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine_max.h +187 -0
  1436. cuda/cccl/headers/include/thrust/random/discard_block_engine.h +240 -0
  1437. cuda/cccl/headers/include/thrust/random/linear_congruential_engine.h +289 -0
  1438. cuda/cccl/headers/include/thrust/random/linear_feedback_shift_engine.h +217 -0
  1439. cuda/cccl/headers/include/thrust/random/normal_distribution.h +257 -0
  1440. cuda/cccl/headers/include/thrust/random/subtract_with_carry_engine.h +247 -0
  1441. cuda/cccl/headers/include/thrust/random/uniform_int_distribution.h +261 -0
  1442. cuda/cccl/headers/include/thrust/random/uniform_real_distribution.h +258 -0
  1443. cuda/cccl/headers/include/thrust/random/xor_combine_engine.h +255 -0
  1444. cuda/cccl/headers/include/thrust/random.h +120 -0
  1445. cuda/cccl/headers/include/thrust/reduce.h +1113 -0
  1446. cuda/cccl/headers/include/thrust/remove.h +768 -0
  1447. cuda/cccl/headers/include/thrust/replace.h +826 -0
  1448. cuda/cccl/headers/include/thrust/reverse.h +215 -0
  1449. cuda/cccl/headers/include/thrust/scan.h +1671 -0
  1450. cuda/cccl/headers/include/thrust/scatter.h +446 -0
  1451. cuda/cccl/headers/include/thrust/sequence.h +277 -0
  1452. cuda/cccl/headers/include/thrust/set_operations.h +3026 -0
  1453. cuda/cccl/headers/include/thrust/shuffle.h +182 -0
  1454. cuda/cccl/headers/include/thrust/sort.h +1320 -0
  1455. cuda/cccl/headers/include/thrust/swap.h +147 -0
  1456. cuda/cccl/headers/include/thrust/system/cpp/detail/adjacent_difference.h +30 -0
  1457. cuda/cccl/headers/include/thrust/system/cpp/detail/assign_value.h +30 -0
  1458. cuda/cccl/headers/include/thrust/system/cpp/detail/binary_search.h +32 -0
  1459. cuda/cccl/headers/include/thrust/system/cpp/detail/copy.h +30 -0
  1460. cuda/cccl/headers/include/thrust/system/cpp/detail/copy_if.h +30 -0
  1461. cuda/cccl/headers/include/thrust/system/cpp/detail/count.h +29 -0
  1462. cuda/cccl/headers/include/thrust/system/cpp/detail/equal.h +29 -0
  1463. cuda/cccl/headers/include/thrust/system/cpp/detail/execution_policy.h +109 -0
  1464. cuda/cccl/headers/include/thrust/system/cpp/detail/extrema.h +30 -0
  1465. cuda/cccl/headers/include/thrust/system/cpp/detail/fill.h +29 -0
  1466. cuda/cccl/headers/include/thrust/system/cpp/detail/find.h +30 -0
  1467. cuda/cccl/headers/include/thrust/system/cpp/detail/for_each.h +30 -0
  1468. cuda/cccl/headers/include/thrust/system/cpp/detail/gather.h +29 -0
  1469. cuda/cccl/headers/include/thrust/system/cpp/detail/generate.h +29 -0
  1470. cuda/cccl/headers/include/thrust/system/cpp/detail/get_value.h +30 -0
  1471. cuda/cccl/headers/include/thrust/system/cpp/detail/inner_product.h +29 -0
  1472. cuda/cccl/headers/include/thrust/system/cpp/detail/iter_swap.h +30 -0
  1473. cuda/cccl/headers/include/thrust/system/cpp/detail/logical.h +29 -0
  1474. cuda/cccl/headers/include/thrust/system/cpp/detail/malloc_and_free.h +30 -0
  1475. cuda/cccl/headers/include/thrust/system/cpp/detail/memory.inl +60 -0
  1476. cuda/cccl/headers/include/thrust/system/cpp/detail/merge.h +30 -0
  1477. cuda/cccl/headers/include/thrust/system/cpp/detail/mismatch.h +29 -0
  1478. cuda/cccl/headers/include/thrust/system/cpp/detail/partition.h +30 -0
  1479. cuda/cccl/headers/include/thrust/system/cpp/detail/per_device_resource.h +29 -0
  1480. cuda/cccl/headers/include/thrust/system/cpp/detail/reduce.h +30 -0
  1481. cuda/cccl/headers/include/thrust/system/cpp/detail/reduce_by_key.h +30 -0
  1482. cuda/cccl/headers/include/thrust/system/cpp/detail/remove.h +30 -0
  1483. cuda/cccl/headers/include/thrust/system/cpp/detail/replace.h +29 -0
  1484. cuda/cccl/headers/include/thrust/system/cpp/detail/reverse.h +29 -0
  1485. cuda/cccl/headers/include/thrust/system/cpp/detail/scan.h +30 -0
  1486. cuda/cccl/headers/include/thrust/system/cpp/detail/scan_by_key.h +30 -0
  1487. cuda/cccl/headers/include/thrust/system/cpp/detail/scatter.h +29 -0
  1488. cuda/cccl/headers/include/thrust/system/cpp/detail/sequence.h +29 -0
  1489. cuda/cccl/headers/include/thrust/system/cpp/detail/set_operations.h +30 -0
  1490. cuda/cccl/headers/include/thrust/system/cpp/detail/sort.h +30 -0
  1491. cuda/cccl/headers/include/thrust/system/cpp/detail/swap_ranges.h +29 -0
  1492. cuda/cccl/headers/include/thrust/system/cpp/detail/tabulate.h +29 -0
  1493. cuda/cccl/headers/include/thrust/system/cpp/detail/temporary_buffer.h +29 -0
  1494. cuda/cccl/headers/include/thrust/system/cpp/detail/transform.h +29 -0
  1495. cuda/cccl/headers/include/thrust/system/cpp/detail/transform_reduce.h +29 -0
  1496. cuda/cccl/headers/include/thrust/system/cpp/detail/transform_scan.h +29 -0
  1497. cuda/cccl/headers/include/thrust/system/cpp/detail/uninitialized_copy.h +29 -0
  1498. cuda/cccl/headers/include/thrust/system/cpp/detail/uninitialized_fill.h +29 -0
  1499. cuda/cccl/headers/include/thrust/system/cpp/detail/unique.h +30 -0
  1500. cuda/cccl/headers/include/thrust/system/cpp/detail/unique_by_key.h +30 -0
  1501. cuda/cccl/headers/include/thrust/system/cpp/execution_policy.h +63 -0
  1502. cuda/cccl/headers/include/thrust/system/cpp/memory.h +106 -0
  1503. cuda/cccl/headers/include/thrust/system/cpp/memory_resource.h +72 -0
  1504. cuda/cccl/headers/include/thrust/system/cpp/pointer.h +120 -0
  1505. cuda/cccl/headers/include/thrust/system/cpp/vector.h +96 -0
  1506. cuda/cccl/headers/include/thrust/system/cuda/config.h +126 -0
  1507. cuda/cccl/headers/include/thrust/system/cuda/detail/adjacent_difference.h +219 -0
  1508. cuda/cccl/headers/include/thrust/system/cuda/detail/assign_value.h +124 -0
  1509. cuda/cccl/headers/include/thrust/system/cuda/detail/binary_search.h +29 -0
  1510. cuda/cccl/headers/include/thrust/system/cuda/detail/cdp_dispatch.h +72 -0
  1511. cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +273 -0
  1512. cuda/cccl/headers/include/thrust/system/cuda/detail/copy_if.h +255 -0
  1513. cuda/cccl/headers/include/thrust/system/cuda/detail/core/agent_launcher.h +289 -0
  1514. cuda/cccl/headers/include/thrust/system/cuda/detail/core/triple_chevron_launch.h +191 -0
  1515. cuda/cccl/headers/include/thrust/system/cuda/detail/core/util.h +593 -0
  1516. cuda/cccl/headers/include/thrust/system/cuda/detail/count.h +75 -0
  1517. cuda/cccl/headers/include/thrust/system/cuda/detail/cross_system.h +243 -0
  1518. cuda/cccl/headers/include/thrust/system/cuda/detail/dispatch.h +233 -0
  1519. cuda/cccl/headers/include/thrust/system/cuda/detail/equal.h +64 -0
  1520. cuda/cccl/headers/include/thrust/system/cuda/detail/error.inl +96 -0
  1521. cuda/cccl/headers/include/thrust/system/cuda/detail/execution_policy.h +264 -0
  1522. cuda/cccl/headers/include/thrust/system/cuda/detail/extrema.h +476 -0
  1523. cuda/cccl/headers/include/thrust/system/cuda/detail/fill.h +100 -0
  1524. cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +170 -0
  1525. cuda/cccl/headers/include/thrust/system/cuda/detail/for_each.h +83 -0
  1526. cuda/cccl/headers/include/thrust/system/cuda/detail/gather.h +91 -0
  1527. cuda/cccl/headers/include/thrust/system/cuda/detail/generate.h +60 -0
  1528. cuda/cccl/headers/include/thrust/system/cuda/detail/get_value.h +65 -0
  1529. cuda/cccl/headers/include/thrust/system/cuda/detail/inner_product.h +75 -0
  1530. cuda/cccl/headers/include/thrust/system/cuda/detail/iter_swap.h +80 -0
  1531. cuda/cccl/headers/include/thrust/system/cuda/detail/logical.h +29 -0
  1532. cuda/cccl/headers/include/thrust/system/cuda/detail/make_unsigned_special.h +61 -0
  1533. cuda/cccl/headers/include/thrust/system/cuda/detail/malloc_and_free.h +121 -0
  1534. cuda/cccl/headers/include/thrust/system/cuda/detail/memory.inl +57 -0
  1535. cuda/cccl/headers/include/thrust/system/cuda/detail/merge.h +228 -0
  1536. cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +223 -0
  1537. cuda/cccl/headers/include/thrust/system/cuda/detail/parallel_for.h +81 -0
  1538. cuda/cccl/headers/include/thrust/system/cuda/detail/partition.h +405 -0
  1539. cuda/cccl/headers/include/thrust/system/cuda/detail/per_device_resource.h +72 -0
  1540. cuda/cccl/headers/include/thrust/system/cuda/detail/reduce.h +785 -0
  1541. cuda/cccl/headers/include/thrust/system/cuda/detail/reduce_by_key.h +1001 -0
  1542. cuda/cccl/headers/include/thrust/system/cuda/detail/remove.h +107 -0
  1543. cuda/cccl/headers/include/thrust/system/cuda/detail/replace.h +122 -0
  1544. cuda/cccl/headers/include/thrust/system/cuda/detail/reverse.h +87 -0
  1545. cuda/cccl/headers/include/thrust/system/cuda/detail/scan.h +341 -0
  1546. cuda/cccl/headers/include/thrust/system/cuda/detail/scan_by_key.h +414 -0
  1547. cuda/cccl/headers/include/thrust/system/cuda/detail/scatter.h +91 -0
  1548. cuda/cccl/headers/include/thrust/system/cuda/detail/sequence.h +29 -0
  1549. cuda/cccl/headers/include/thrust/system/cuda/detail/set_operations.h +1734 -0
  1550. cuda/cccl/headers/include/thrust/system/cuda/detail/sort.h +469 -0
  1551. cuda/cccl/headers/include/thrust/system/cuda/detail/swap_ranges.h +98 -0
  1552. cuda/cccl/headers/include/thrust/system/cuda/detail/tabulate.h +61 -0
  1553. cuda/cccl/headers/include/thrust/system/cuda/detail/temporary_buffer.h +132 -0
  1554. cuda/cccl/headers/include/thrust/system/cuda/detail/terminate.h +53 -0
  1555. cuda/cccl/headers/include/thrust/system/cuda/detail/transform.h +429 -0
  1556. cuda/cccl/headers/include/thrust/system/cuda/detail/transform_reduce.h +143 -0
  1557. cuda/cccl/headers/include/thrust/system/cuda/detail/transform_scan.h +119 -0
  1558. cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_copy.h +117 -0
  1559. cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_fill.h +105 -0
  1560. cuda/cccl/headers/include/thrust/system/cuda/detail/unique.h +289 -0
  1561. cuda/cccl/headers/include/thrust/system/cuda/detail/unique_by_key.h +310 -0
  1562. cuda/cccl/headers/include/thrust/system/cuda/detail/util.h +253 -0
  1563. cuda/cccl/headers/include/thrust/system/cuda/error.h +168 -0
  1564. cuda/cccl/headers/include/thrust/system/cuda/execution_policy.h +15 -0
  1565. cuda/cccl/headers/include/thrust/system/cuda/memory.h +122 -0
  1566. cuda/cccl/headers/include/thrust/system/cuda/memory_resource.h +122 -0
  1567. cuda/cccl/headers/include/thrust/system/cuda/pointer.h +160 -0
  1568. cuda/cccl/headers/include/thrust/system/cuda/vector.h +108 -0
  1569. cuda/cccl/headers/include/thrust/system/detail/adl/adjacent_difference.h +51 -0
  1570. cuda/cccl/headers/include/thrust/system/detail/adl/assign_value.h +51 -0
  1571. cuda/cccl/headers/include/thrust/system/detail/adl/binary_search.h +51 -0
  1572. cuda/cccl/headers/include/thrust/system/detail/adl/copy.h +51 -0
  1573. cuda/cccl/headers/include/thrust/system/detail/adl/copy_if.h +52 -0
  1574. cuda/cccl/headers/include/thrust/system/detail/adl/count.h +51 -0
  1575. cuda/cccl/headers/include/thrust/system/detail/adl/equal.h +51 -0
  1576. cuda/cccl/headers/include/thrust/system/detail/adl/extrema.h +51 -0
  1577. cuda/cccl/headers/include/thrust/system/detail/adl/fill.h +51 -0
  1578. cuda/cccl/headers/include/thrust/system/detail/adl/find.h +51 -0
  1579. cuda/cccl/headers/include/thrust/system/detail/adl/for_each.h +51 -0
  1580. cuda/cccl/headers/include/thrust/system/detail/adl/gather.h +51 -0
  1581. cuda/cccl/headers/include/thrust/system/detail/adl/generate.h +51 -0
  1582. cuda/cccl/headers/include/thrust/system/detail/adl/get_value.h +51 -0
  1583. cuda/cccl/headers/include/thrust/system/detail/adl/inner_product.h +51 -0
  1584. cuda/cccl/headers/include/thrust/system/detail/adl/iter_swap.h +51 -0
  1585. cuda/cccl/headers/include/thrust/system/detail/adl/logical.h +51 -0
  1586. cuda/cccl/headers/include/thrust/system/detail/adl/malloc_and_free.h +51 -0
  1587. cuda/cccl/headers/include/thrust/system/detail/adl/merge.h +51 -0
  1588. cuda/cccl/headers/include/thrust/system/detail/adl/mismatch.h +51 -0
  1589. cuda/cccl/headers/include/thrust/system/detail/adl/partition.h +51 -0
  1590. cuda/cccl/headers/include/thrust/system/detail/adl/per_device_resource.h +51 -0
  1591. cuda/cccl/headers/include/thrust/system/detail/adl/reduce.h +51 -0
  1592. cuda/cccl/headers/include/thrust/system/detail/adl/reduce_by_key.h +51 -0
  1593. cuda/cccl/headers/include/thrust/system/detail/adl/remove.h +51 -0
  1594. cuda/cccl/headers/include/thrust/system/detail/adl/replace.h +51 -0
  1595. cuda/cccl/headers/include/thrust/system/detail/adl/reverse.h +51 -0
  1596. cuda/cccl/headers/include/thrust/system/detail/adl/scan.h +51 -0
  1597. cuda/cccl/headers/include/thrust/system/detail/adl/scan_by_key.h +51 -0
  1598. cuda/cccl/headers/include/thrust/system/detail/adl/scatter.h +51 -0
  1599. cuda/cccl/headers/include/thrust/system/detail/adl/sequence.h +51 -0
  1600. cuda/cccl/headers/include/thrust/system/detail/adl/set_operations.h +51 -0
  1601. cuda/cccl/headers/include/thrust/system/detail/adl/sort.h +51 -0
  1602. cuda/cccl/headers/include/thrust/system/detail/adl/swap_ranges.h +51 -0
  1603. cuda/cccl/headers/include/thrust/system/detail/adl/tabulate.h +51 -0
  1604. cuda/cccl/headers/include/thrust/system/detail/adl/temporary_buffer.h +51 -0
  1605. cuda/cccl/headers/include/thrust/system/detail/adl/transform.h +51 -0
  1606. cuda/cccl/headers/include/thrust/system/detail/adl/transform_reduce.h +51 -0
  1607. cuda/cccl/headers/include/thrust/system/detail/adl/transform_scan.h +51 -0
  1608. cuda/cccl/headers/include/thrust/system/detail/adl/uninitialized_copy.h +51 -0
  1609. cuda/cccl/headers/include/thrust/system/detail/adl/uninitialized_fill.h +51 -0
  1610. cuda/cccl/headers/include/thrust/system/detail/adl/unique.h +51 -0
  1611. cuda/cccl/headers/include/thrust/system/detail/adl/unique_by_key.h +51 -0
  1612. cuda/cccl/headers/include/thrust/system/detail/bad_alloc.h +61 -0
  1613. cuda/cccl/headers/include/thrust/system/detail/errno.h +120 -0
  1614. cuda/cccl/headers/include/thrust/system/detail/error_category.inl +302 -0
  1615. cuda/cccl/headers/include/thrust/system/detail/error_code.inl +173 -0
  1616. cuda/cccl/headers/include/thrust/system/detail/error_condition.inl +121 -0
  1617. cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.h +53 -0
  1618. cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.inl +79 -0
  1619. cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.h +161 -0
  1620. cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.inl +384 -0
  1621. cuda/cccl/headers/include/thrust/system/detail/generic/copy.h +45 -0
  1622. cuda/cccl/headers/include/thrust/system/detail/generic/copy.inl +64 -0
  1623. cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.h +58 -0
  1624. cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.inl +146 -0
  1625. cuda/cccl/headers/include/thrust/system/detail/generic/count.h +48 -0
  1626. cuda/cccl/headers/include/thrust/system/detail/generic/count.inl +84 -0
  1627. cuda/cccl/headers/include/thrust/system/detail/generic/equal.h +49 -0
  1628. cuda/cccl/headers/include/thrust/system/detail/generic/equal.inl +60 -0
  1629. cuda/cccl/headers/include/thrust/system/detail/generic/extrema.h +66 -0
  1630. cuda/cccl/headers/include/thrust/system/detail/generic/extrema.inl +252 -0
  1631. cuda/cccl/headers/include/thrust/system/detail/generic/fill.h +54 -0
  1632. cuda/cccl/headers/include/thrust/system/detail/generic/find.h +49 -0
  1633. cuda/cccl/headers/include/thrust/system/detail/generic/find.inl +137 -0
  1634. cuda/cccl/headers/include/thrust/system/detail/generic/for_each.h +58 -0
  1635. cuda/cccl/headers/include/thrust/system/detail/generic/gather.h +73 -0
  1636. cuda/cccl/headers/include/thrust/system/detail/generic/gather.inl +96 -0
  1637. cuda/cccl/headers/include/thrust/system/detail/generic/generate.h +45 -0
  1638. cuda/cccl/headers/include/thrust/system/detail/generic/generate.inl +63 -0
  1639. cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.h +60 -0
  1640. cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.inl +72 -0
  1641. cuda/cccl/headers/include/thrust/system/detail/generic/logical.h +59 -0
  1642. cuda/cccl/headers/include/thrust/system/detail/generic/memory.h +64 -0
  1643. cuda/cccl/headers/include/thrust/system/detail/generic/memory.inl +86 -0
  1644. cuda/cccl/headers/include/thrust/system/detail/generic/merge.h +99 -0
  1645. cuda/cccl/headers/include/thrust/system/detail/generic/merge.inl +148 -0
  1646. cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.h +49 -0
  1647. cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.inl +68 -0
  1648. cuda/cccl/headers/include/thrust/system/detail/generic/partition.h +129 -0
  1649. cuda/cccl/headers/include/thrust/system/detail/generic/partition.inl +207 -0
  1650. cuda/cccl/headers/include/thrust/system/detail/generic/per_device_resource.h +43 -0
  1651. cuda/cccl/headers/include/thrust/system/detail/generic/reduce.h +71 -0
  1652. cuda/cccl/headers/include/thrust/system/detail/generic/reduce.inl +100 -0
  1653. cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.h +83 -0
  1654. cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.inl +186 -0
  1655. cuda/cccl/headers/include/thrust/system/detail/generic/remove.h +86 -0
  1656. cuda/cccl/headers/include/thrust/system/detail/generic/remove.inl +121 -0
  1657. cuda/cccl/headers/include/thrust/system/detail/generic/replace.h +95 -0
  1658. cuda/cccl/headers/include/thrust/system/detail/generic/replace.inl +175 -0
  1659. cuda/cccl/headers/include/thrust/system/detail/generic/reverse.h +48 -0
  1660. cuda/cccl/headers/include/thrust/system/detail/generic/reverse.inl +67 -0
  1661. cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.h +63 -0
  1662. cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.inl +126 -0
  1663. cuda/cccl/headers/include/thrust/system/detail/generic/scan.h +72 -0
  1664. cuda/cccl/headers/include/thrust/system/detail/generic/scan.inl +85 -0
  1665. cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.h +126 -0
  1666. cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.inl +232 -0
  1667. cuda/cccl/headers/include/thrust/system/detail/generic/scatter.h +73 -0
  1668. cuda/cccl/headers/include/thrust/system/detail/generic/scatter.inl +85 -0
  1669. cuda/cccl/headers/include/thrust/system/detail/generic/select_system.h +104 -0
  1670. cuda/cccl/headers/include/thrust/system/detail/generic/sequence.h +70 -0
  1671. cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.h +282 -0
  1672. cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.inl +476 -0
  1673. cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.h +54 -0
  1674. cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.inl +125 -0
  1675. cuda/cccl/headers/include/thrust/system/detail/generic/sort.h +113 -0
  1676. cuda/cccl/headers/include/thrust/system/detail/generic/sort.inl +175 -0
  1677. cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.h +44 -0
  1678. cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.inl +76 -0
  1679. cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.h +41 -0
  1680. cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.inl +54 -0
  1681. cuda/cccl/headers/include/thrust/system/detail/generic/tag.h +47 -0
  1682. cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.h +54 -0
  1683. cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.inl +82 -0
  1684. cuda/cccl/headers/include/thrust/system/detail/generic/transform.h +395 -0
  1685. cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.h +50 -0
  1686. cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.inl +56 -0
  1687. cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.h +80 -0
  1688. cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.inl +113 -0
  1689. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.h +45 -0
  1690. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.inl +166 -0
  1691. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.h +45 -0
  1692. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.inl +115 -0
  1693. cuda/cccl/headers/include/thrust/system/detail/generic/unique.h +71 -0
  1694. cuda/cccl/headers/include/thrust/system/detail/generic/unique.inl +113 -0
  1695. cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.h +81 -0
  1696. cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.inl +126 -0
  1697. cuda/cccl/headers/include/thrust/system/detail/internal/decompose.h +117 -0
  1698. cuda/cccl/headers/include/thrust/system/detail/sequential/adjacent_difference.h +70 -0
  1699. cuda/cccl/headers/include/thrust/system/detail/sequential/assign_value.h +42 -0
  1700. cuda/cccl/headers/include/thrust/system/detail/sequential/binary_search.h +136 -0
  1701. cuda/cccl/headers/include/thrust/system/detail/sequential/copy.h +49 -0
  1702. cuda/cccl/headers/include/thrust/system/detail/sequential/copy.inl +119 -0
  1703. cuda/cccl/headers/include/thrust/system/detail/sequential/copy_backward.h +49 -0
  1704. cuda/cccl/headers/include/thrust/system/detail/sequential/copy_if.h +71 -0
  1705. cuda/cccl/headers/include/thrust/system/detail/sequential/count.h +29 -0
  1706. cuda/cccl/headers/include/thrust/system/detail/sequential/equal.h +29 -0
  1707. cuda/cccl/headers/include/thrust/system/detail/sequential/execution_policy.h +52 -0
  1708. cuda/cccl/headers/include/thrust/system/detail/sequential/extrema.h +110 -0
  1709. cuda/cccl/headers/include/thrust/system/detail/sequential/fill.h +29 -0
  1710. cuda/cccl/headers/include/thrust/system/detail/sequential/find.h +62 -0
  1711. cuda/cccl/headers/include/thrust/system/detail/sequential/for_each.h +74 -0
  1712. cuda/cccl/headers/include/thrust/system/detail/sequential/gather.h +29 -0
  1713. cuda/cccl/headers/include/thrust/system/detail/sequential/general_copy.h +123 -0
  1714. cuda/cccl/headers/include/thrust/system/detail/sequential/generate.h +29 -0
  1715. cuda/cccl/headers/include/thrust/system/detail/sequential/get_value.h +43 -0
  1716. cuda/cccl/headers/include/thrust/system/detail/sequential/inner_product.h +29 -0
  1717. cuda/cccl/headers/include/thrust/system/detail/sequential/insertion_sort.h +141 -0
  1718. cuda/cccl/headers/include/thrust/system/detail/sequential/iter_swap.h +45 -0
  1719. cuda/cccl/headers/include/thrust/system/detail/sequential/logical.h +29 -0
  1720. cuda/cccl/headers/include/thrust/system/detail/sequential/malloc_and_free.h +50 -0
  1721. cuda/cccl/headers/include/thrust/system/detail/sequential/merge.h +75 -0
  1722. cuda/cccl/headers/include/thrust/system/detail/sequential/merge.inl +145 -0
  1723. cuda/cccl/headers/include/thrust/system/detail/sequential/mismatch.h +29 -0
  1724. cuda/cccl/headers/include/thrust/system/detail/sequential/partition.h +301 -0
  1725. cuda/cccl/headers/include/thrust/system/detail/sequential/per_device_resource.h +29 -0
  1726. cuda/cccl/headers/include/thrust/system/detail/sequential/reduce.h +64 -0
  1727. cuda/cccl/headers/include/thrust/system/detail/sequential/reduce_by_key.h +98 -0
  1728. cuda/cccl/headers/include/thrust/system/detail/sequential/remove.h +179 -0
  1729. cuda/cccl/headers/include/thrust/system/detail/sequential/replace.h +29 -0
  1730. cuda/cccl/headers/include/thrust/system/detail/sequential/reverse.h +29 -0
  1731. cuda/cccl/headers/include/thrust/system/detail/sequential/scan.h +154 -0
  1732. cuda/cccl/headers/include/thrust/system/detail/sequential/scan_by_key.h +145 -0
  1733. cuda/cccl/headers/include/thrust/system/detail/sequential/scatter.h +29 -0
  1734. cuda/cccl/headers/include/thrust/system/detail/sequential/sequence.h +29 -0
  1735. cuda/cccl/headers/include/thrust/system/detail/sequential/set_operations.h +206 -0
  1736. cuda/cccl/headers/include/thrust/system/detail/sequential/sort.h +59 -0
  1737. cuda/cccl/headers/include/thrust/system/detail/sequential/sort.inl +116 -0
  1738. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.h +55 -0
  1739. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.inl +356 -0
  1740. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.h +48 -0
  1741. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.inl +124 -0
  1742. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.h +48 -0
  1743. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.inl +586 -0
  1744. cuda/cccl/headers/include/thrust/system/detail/sequential/swap_ranges.h +29 -0
  1745. cuda/cccl/headers/include/thrust/system/detail/sequential/tabulate.h +29 -0
  1746. cuda/cccl/headers/include/thrust/system/detail/sequential/temporary_buffer.h +29 -0
  1747. cuda/cccl/headers/include/thrust/system/detail/sequential/transform.h +29 -0
  1748. cuda/cccl/headers/include/thrust/system/detail/sequential/transform_reduce.h +29 -0
  1749. cuda/cccl/headers/include/thrust/system/detail/sequential/transform_scan.h +29 -0
  1750. cuda/cccl/headers/include/thrust/system/detail/sequential/trivial_copy.h +58 -0
  1751. cuda/cccl/headers/include/thrust/system/detail/sequential/uninitialized_copy.h +29 -0
  1752. cuda/cccl/headers/include/thrust/system/detail/sequential/uninitialized_fill.h +29 -0
  1753. cuda/cccl/headers/include/thrust/system/detail/sequential/unique.h +115 -0
  1754. cuda/cccl/headers/include/thrust/system/detail/sequential/unique_by_key.h +106 -0
  1755. cuda/cccl/headers/include/thrust/system/detail/system_error.inl +108 -0
  1756. cuda/cccl/headers/include/thrust/system/error_code.h +512 -0
  1757. cuda/cccl/headers/include/thrust/system/omp/detail/adjacent_difference.h +54 -0
  1758. cuda/cccl/headers/include/thrust/system/omp/detail/assign_value.h +30 -0
  1759. cuda/cccl/headers/include/thrust/system/omp/detail/binary_search.h +77 -0
  1760. cuda/cccl/headers/include/thrust/system/omp/detail/copy.h +50 -0
  1761. cuda/cccl/headers/include/thrust/system/omp/detail/copy.inl +74 -0
  1762. cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.h +56 -0
  1763. cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.inl +59 -0
  1764. cuda/cccl/headers/include/thrust/system/omp/detail/count.h +30 -0
  1765. cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.h +50 -0
  1766. cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.inl +65 -0
  1767. cuda/cccl/headers/include/thrust/system/omp/detail/equal.h +30 -0
  1768. cuda/cccl/headers/include/thrust/system/omp/detail/execution_policy.h +127 -0
  1769. cuda/cccl/headers/include/thrust/system/omp/detail/extrema.h +66 -0
  1770. cuda/cccl/headers/include/thrust/system/omp/detail/fill.h +30 -0
  1771. cuda/cccl/headers/include/thrust/system/omp/detail/find.h +53 -0
  1772. cuda/cccl/headers/include/thrust/system/omp/detail/for_each.h +56 -0
  1773. cuda/cccl/headers/include/thrust/system/omp/detail/for_each.inl +87 -0
  1774. cuda/cccl/headers/include/thrust/system/omp/detail/gather.h +30 -0
  1775. cuda/cccl/headers/include/thrust/system/omp/detail/generate.h +30 -0
  1776. cuda/cccl/headers/include/thrust/system/omp/detail/get_value.h +30 -0
  1777. cuda/cccl/headers/include/thrust/system/omp/detail/inner_product.h +30 -0
  1778. cuda/cccl/headers/include/thrust/system/omp/detail/iter_swap.h +30 -0
  1779. cuda/cccl/headers/include/thrust/system/omp/detail/logical.h +30 -0
  1780. cuda/cccl/headers/include/thrust/system/omp/detail/malloc_and_free.h +30 -0
  1781. cuda/cccl/headers/include/thrust/system/omp/detail/memory.inl +93 -0
  1782. cuda/cccl/headers/include/thrust/system/omp/detail/merge.h +30 -0
  1783. cuda/cccl/headers/include/thrust/system/omp/detail/mismatch.h +30 -0
  1784. cuda/cccl/headers/include/thrust/system/omp/detail/partition.h +88 -0
  1785. cuda/cccl/headers/include/thrust/system/omp/detail/partition.inl +102 -0
  1786. cuda/cccl/headers/include/thrust/system/omp/detail/per_device_resource.h +29 -0
  1787. cuda/cccl/headers/include/thrust/system/omp/detail/pragma_omp.h +54 -0
  1788. cuda/cccl/headers/include/thrust/system/omp/detail/reduce.h +54 -0
  1789. cuda/cccl/headers/include/thrust/system/omp/detail/reduce.inl +78 -0
  1790. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.h +64 -0
  1791. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.inl +65 -0
  1792. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.h +59 -0
  1793. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.inl +103 -0
  1794. cuda/cccl/headers/include/thrust/system/omp/detail/remove.h +72 -0
  1795. cuda/cccl/headers/include/thrust/system/omp/detail/remove.inl +87 -0
  1796. cuda/cccl/headers/include/thrust/system/omp/detail/replace.h +30 -0
  1797. cuda/cccl/headers/include/thrust/system/omp/detail/reverse.h +30 -0
  1798. cuda/cccl/headers/include/thrust/system/omp/detail/scan.h +73 -0
  1799. cuda/cccl/headers/include/thrust/system/omp/detail/scan.inl +172 -0
  1800. cuda/cccl/headers/include/thrust/system/omp/detail/scan_by_key.h +36 -0
  1801. cuda/cccl/headers/include/thrust/system/omp/detail/scatter.h +30 -0
  1802. cuda/cccl/headers/include/thrust/system/omp/detail/sequence.h +30 -0
  1803. cuda/cccl/headers/include/thrust/system/omp/detail/set_operations.h +30 -0
  1804. cuda/cccl/headers/include/thrust/system/omp/detail/sort.h +60 -0
  1805. cuda/cccl/headers/include/thrust/system/omp/detail/sort.inl +265 -0
  1806. cuda/cccl/headers/include/thrust/system/omp/detail/swap_ranges.h +30 -0
  1807. cuda/cccl/headers/include/thrust/system/omp/detail/tabulate.h +30 -0
  1808. cuda/cccl/headers/include/thrust/system/omp/detail/temporary_buffer.h +29 -0
  1809. cuda/cccl/headers/include/thrust/system/omp/detail/transform.h +30 -0
  1810. cuda/cccl/headers/include/thrust/system/omp/detail/transform_reduce.h +30 -0
  1811. cuda/cccl/headers/include/thrust/system/omp/detail/transform_scan.h +30 -0
  1812. cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_copy.h +30 -0
  1813. cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_fill.h +30 -0
  1814. cuda/cccl/headers/include/thrust/system/omp/detail/unique.h +60 -0
  1815. cuda/cccl/headers/include/thrust/system/omp/detail/unique.inl +71 -0
  1816. cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.h +67 -0
  1817. cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.inl +75 -0
  1818. cuda/cccl/headers/include/thrust/system/omp/execution_policy.h +62 -0
  1819. cuda/cccl/headers/include/thrust/system/omp/memory.h +111 -0
  1820. cuda/cccl/headers/include/thrust/system/omp/memory_resource.h +75 -0
  1821. cuda/cccl/headers/include/thrust/system/omp/pointer.h +124 -0
  1822. cuda/cccl/headers/include/thrust/system/omp/vector.h +99 -0
  1823. cuda/cccl/headers/include/thrust/system/system_error.h +185 -0
  1824. cuda/cccl/headers/include/thrust/system/tbb/detail/adjacent_difference.h +54 -0
  1825. cuda/cccl/headers/include/thrust/system/tbb/detail/assign_value.h +30 -0
  1826. cuda/cccl/headers/include/thrust/system/tbb/detail/binary_search.h +30 -0
  1827. cuda/cccl/headers/include/thrust/system/tbb/detail/copy.h +50 -0
  1828. cuda/cccl/headers/include/thrust/system/tbb/detail/copy.inl +73 -0
  1829. cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.h +47 -0
  1830. cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.inl +136 -0
  1831. cuda/cccl/headers/include/thrust/system/tbb/detail/count.h +30 -0
  1832. cuda/cccl/headers/include/thrust/system/tbb/detail/equal.h +30 -0
  1833. cuda/cccl/headers/include/thrust/system/tbb/detail/execution_policy.h +109 -0
  1834. cuda/cccl/headers/include/thrust/system/tbb/detail/extrema.h +66 -0
  1835. cuda/cccl/headers/include/thrust/system/tbb/detail/fill.h +30 -0
  1836. cuda/cccl/headers/include/thrust/system/tbb/detail/find.h +49 -0
  1837. cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.h +51 -0
  1838. cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.inl +91 -0
  1839. cuda/cccl/headers/include/thrust/system/tbb/detail/gather.h +30 -0
  1840. cuda/cccl/headers/include/thrust/system/tbb/detail/generate.h +30 -0
  1841. cuda/cccl/headers/include/thrust/system/tbb/detail/get_value.h +30 -0
  1842. cuda/cccl/headers/include/thrust/system/tbb/detail/inner_product.h +30 -0
  1843. cuda/cccl/headers/include/thrust/system/tbb/detail/iter_swap.h +30 -0
  1844. cuda/cccl/headers/include/thrust/system/tbb/detail/logical.h +30 -0
  1845. cuda/cccl/headers/include/thrust/system/tbb/detail/malloc_and_free.h +30 -0
  1846. cuda/cccl/headers/include/thrust/system/tbb/detail/memory.inl +94 -0
  1847. cuda/cccl/headers/include/thrust/system/tbb/detail/merge.h +77 -0
  1848. cuda/cccl/headers/include/thrust/system/tbb/detail/merge.inl +327 -0
  1849. cuda/cccl/headers/include/thrust/system/tbb/detail/mismatch.h +30 -0
  1850. cuda/cccl/headers/include/thrust/system/tbb/detail/partition.h +84 -0
  1851. cuda/cccl/headers/include/thrust/system/tbb/detail/partition.inl +98 -0
  1852. cuda/cccl/headers/include/thrust/system/tbb/detail/per_device_resource.h +29 -0
  1853. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.h +54 -0
  1854. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.inl +137 -0
  1855. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.h +61 -0
  1856. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.inl +400 -0
  1857. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_intervals.h +140 -0
  1858. cuda/cccl/headers/include/thrust/system/tbb/detail/remove.h +76 -0
  1859. cuda/cccl/headers/include/thrust/system/tbb/detail/remove.inl +87 -0
  1860. cuda/cccl/headers/include/thrust/system/tbb/detail/replace.h +30 -0
  1861. cuda/cccl/headers/include/thrust/system/tbb/detail/reverse.h +30 -0
  1862. cuda/cccl/headers/include/thrust/system/tbb/detail/scan.h +59 -0
  1863. cuda/cccl/headers/include/thrust/system/tbb/detail/scan.inl +312 -0
  1864. cuda/cccl/headers/include/thrust/system/tbb/detail/scan_by_key.h +33 -0
  1865. cuda/cccl/headers/include/thrust/system/tbb/detail/scatter.h +30 -0
  1866. cuda/cccl/headers/include/thrust/system/tbb/detail/sequence.h +30 -0
  1867. cuda/cccl/headers/include/thrust/system/tbb/detail/set_operations.h +30 -0
  1868. cuda/cccl/headers/include/thrust/system/tbb/detail/sort.h +60 -0
  1869. cuda/cccl/headers/include/thrust/system/tbb/detail/sort.inl +295 -0
  1870. cuda/cccl/headers/include/thrust/system/tbb/detail/swap_ranges.h +30 -0
  1871. cuda/cccl/headers/include/thrust/system/tbb/detail/tabulate.h +30 -0
  1872. cuda/cccl/headers/include/thrust/system/tbb/detail/temporary_buffer.h +29 -0
  1873. cuda/cccl/headers/include/thrust/system/tbb/detail/transform.h +30 -0
  1874. cuda/cccl/headers/include/thrust/system/tbb/detail/transform_reduce.h +30 -0
  1875. cuda/cccl/headers/include/thrust/system/tbb/detail/transform_scan.h +30 -0
  1876. cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_copy.h +30 -0
  1877. cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_fill.h +30 -0
  1878. cuda/cccl/headers/include/thrust/system/tbb/detail/unique.h +60 -0
  1879. cuda/cccl/headers/include/thrust/system/tbb/detail/unique.inl +71 -0
  1880. cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.h +67 -0
  1881. cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.inl +75 -0
  1882. cuda/cccl/headers/include/thrust/system/tbb/execution_policy.h +62 -0
  1883. cuda/cccl/headers/include/thrust/system/tbb/memory.h +111 -0
  1884. cuda/cccl/headers/include/thrust/system/tbb/memory_resource.h +75 -0
  1885. cuda/cccl/headers/include/thrust/system/tbb/pointer.h +124 -0
  1886. cuda/cccl/headers/include/thrust/system/tbb/vector.h +99 -0
  1887. cuda/cccl/headers/include/thrust/system_error.h +57 -0
  1888. cuda/cccl/headers/include/thrust/tabulate.h +125 -0
  1889. cuda/cccl/headers/include/thrust/transform.h +1045 -0
  1890. cuda/cccl/headers/include/thrust/transform_reduce.h +190 -0
  1891. cuda/cccl/headers/include/thrust/transform_scan.h +442 -0
  1892. cuda/cccl/headers/include/thrust/tuple.h +139 -0
  1893. cuda/cccl/headers/include/thrust/type_traits/integer_sequence.h +261 -0
  1894. cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +154 -0
  1895. cuda/cccl/headers/include/thrust/type_traits/is_execution_policy.h +65 -0
  1896. cuda/cccl/headers/include/thrust/type_traits/is_operator_less_or_greater_function_object.h +184 -0
  1897. cuda/cccl/headers/include/thrust/type_traits/is_operator_plus_function_object.h +116 -0
  1898. cuda/cccl/headers/include/thrust/type_traits/is_trivially_relocatable.h +336 -0
  1899. cuda/cccl/headers/include/thrust/type_traits/logical_metafunctions.h +42 -0
  1900. cuda/cccl/headers/include/thrust/type_traits/unwrap_contiguous_iterator.h +63 -0
  1901. cuda/cccl/headers/include/thrust/uninitialized_copy.h +300 -0
  1902. cuda/cccl/headers/include/thrust/uninitialized_fill.h +268 -0
  1903. cuda/cccl/headers/include/thrust/unique.h +1088 -0
  1904. cuda/cccl/headers/include/thrust/universal_allocator.h +93 -0
  1905. cuda/cccl/headers/include/thrust/universal_ptr.h +34 -0
  1906. cuda/cccl/headers/include/thrust/universal_vector.h +71 -0
  1907. cuda/cccl/headers/include/thrust/version.h +93 -0
  1908. cuda/cccl/headers/include/thrust/zip_function.h +176 -0
  1909. cuda/cccl/headers/include_paths.py +51 -0
  1910. cuda/cccl/parallel/__init__.py +9 -0
  1911. cuda/cccl/parallel/experimental/__init__.py +24 -0
  1912. cuda/cccl/py.typed +0 -0
  1913. cuda/compute/__init__.py +79 -0
  1914. cuda/compute/_bindings.py +79 -0
  1915. cuda/compute/_bindings.pyi +475 -0
  1916. cuda/compute/_bindings_impl.pyx +2273 -0
  1917. cuda/compute/_caching.py +71 -0
  1918. cuda/compute/_cccl_interop.py +422 -0
  1919. cuda/compute/_utils/__init__.py +0 -0
  1920. cuda/compute/_utils/protocols.py +132 -0
  1921. cuda/compute/_utils/temp_storage_buffer.py +86 -0
  1922. cuda/compute/algorithms/__init__.py +54 -0
  1923. cuda/compute/algorithms/_histogram.py +243 -0
  1924. cuda/compute/algorithms/_merge_sort.py +225 -0
  1925. cuda/compute/algorithms/_radix_sort.py +312 -0
  1926. cuda/compute/algorithms/_reduce.py +182 -0
  1927. cuda/compute/algorithms/_scan.py +331 -0
  1928. cuda/compute/algorithms/_segmented_reduce.py +257 -0
  1929. cuda/compute/algorithms/_three_way_partition.py +261 -0
  1930. cuda/compute/algorithms/_transform.py +329 -0
  1931. cuda/compute/algorithms/_unique_by_key.py +252 -0
  1932. cuda/compute/cccl/.gitkeep +0 -0
  1933. cuda/compute/cu12/_bindings_impl.cp313-win_amd64.pyd +0 -0
  1934. cuda/compute/cu12/cccl/cccl.c.parallel.dll +0 -0
  1935. cuda/compute/cu12/cccl/cccl.c.parallel.lib +0 -0
  1936. cuda/compute/cu13/_bindings_impl.cp313-win_amd64.pyd +0 -0
  1937. cuda/compute/cu13/cccl/cccl.c.parallel.dll +0 -0
  1938. cuda/compute/cu13/cccl/cccl.c.parallel.lib +0 -0
  1939. cuda/compute/iterators/__init__.py +21 -0
  1940. cuda/compute/iterators/_factories.py +219 -0
  1941. cuda/compute/iterators/_iterators.py +817 -0
  1942. cuda/compute/iterators/_zip_iterator.py +199 -0
  1943. cuda/compute/numba_utils.py +53 -0
  1944. cuda/compute/op.py +3 -0
  1945. cuda/compute/struct.py +272 -0
  1946. cuda/compute/typing.py +37 -0
  1947. cuda/coop/__init__.py +8 -0
  1948. cuda/coop/_caching.py +48 -0
  1949. cuda/coop/_common.py +275 -0
  1950. cuda/coop/_nvrtc.py +92 -0
  1951. cuda/coop/_scan_op.py +181 -0
  1952. cuda/coop/_types.py +937 -0
  1953. cuda/coop/_typing.py +107 -0
  1954. cuda/coop/block/__init__.py +39 -0
  1955. cuda/coop/block/_block_exchange.py +251 -0
  1956. cuda/coop/block/_block_load_store.py +215 -0
  1957. cuda/coop/block/_block_merge_sort.py +125 -0
  1958. cuda/coop/block/_block_radix_sort.py +214 -0
  1959. cuda/coop/block/_block_reduce.py +294 -0
  1960. cuda/coop/block/_block_scan.py +983 -0
  1961. cuda/coop/warp/__init__.py +9 -0
  1962. cuda/coop/warp/_warp_merge_sort.py +92 -0
  1963. cuda/coop/warp/_warp_reduce.py +153 -0
  1964. cuda/coop/warp/_warp_scan.py +78 -0
  1965. cuda_cccl-0.3.3.dist-info/METADATA +41 -0
  1966. cuda_cccl-0.3.3.dist-info/RECORD +1968 -0
  1967. cuda_cccl-0.3.3.dist-info/WHEEL +5 -0
  1968. cuda_cccl-0.3.3.dist-info/licenses/LICENSE +1 -0
@@ -0,0 +1,2518 @@
1
+ /******************************************************************************
2
+ * Copyright (c) 2011, Duane Merrill. All rights reserved.
3
+ * Copyright (c) 2011-2024, NVIDIA CORPORATION. All rights reserved.
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ * * Redistributions of source code must retain the above copyright
8
+ * notice, this list of conditions and the following disclaimer.
9
+ * * Redistributions in binary form must reproduce the above copyright
10
+ * notice, this list of conditions and the following disclaimer in the
11
+ * documentation and/or other materials provided with the distribution.
12
+ * * Neither the name of the NVIDIA CORPORATION nor the
13
+ * names of its contributors may be used to endorse or promote products
14
+ * derived from this software without specific prior written permission.
15
+ *
16
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
+ *
27
+ ******************************************************************************/
28
+
29
+ //! @file
30
+ //! cub::DeviceReduce provides device-wide, parallel operations for computing a reduction across a sequence of data
31
+ //! items residing within device-accessible memory.
32
+
33
+ #pragma once
34
+
35
+ #include <cub/config.cuh>
36
+
37
+ #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
38
+ # pragma GCC system_header
39
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
40
+ # pragma clang system_header
41
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
42
+ # pragma system_header
43
+ #endif // no system header
44
+
45
+ #include <cub/detail/choose_offset.cuh>
46
+ #include <cub/detail/device_memory_resource.cuh>
47
+ #include <cub/detail/temporary_storage.cuh>
48
+ #include <cub/device/dispatch/dispatch_reduce_by_key.cuh>
49
+ #include <cub/device/dispatch/dispatch_reduce_deterministic.cuh>
50
+ #include <cub/device/dispatch/dispatch_reduce_nondeterministic.cuh>
51
+ #include <cub/device/dispatch/dispatch_streaming_reduce.cuh>
52
+ #include <cub/thread/thread_operators.cuh>
53
+ #include <cub/util_type.cuh>
54
+
55
+ #include <cuda/__execution/determinism.h>
56
+ #include <cuda/__execution/require.h>
57
+ #include <cuda/__execution/tune.h>
58
+ #include <cuda/__functional/maximum.h>
59
+ #include <cuda/__functional/minimum.h>
60
+ #include <cuda/__iterator/tabulate_output_iterator.h>
61
+ #include <cuda/__memory_resource/get_memory_resource.h>
62
+ #include <cuda/__stream/get_stream.h>
63
+ #include <cuda/__stream/stream_ref.h>
64
+ #include <cuda/std/__execution/env.h>
65
+ #include <cuda/std/__functional/identity.h>
66
+ #include <cuda/std/__functional/invoke.h>
67
+ #include <cuda/std/__functional/operations.h>
68
+ #include <cuda/std/__type_traits/conditional.h>
69
+ #include <cuda/std/__type_traits/is_integral.h>
70
+ #include <cuda/std/__type_traits/is_same.h>
71
+ #include <cuda/std/cstdint>
72
+ #include <cuda/std/limits>
73
+
74
+ CUB_NAMESPACE_BEGIN
75
+
76
+ namespace detail
77
+ {
78
+
79
+ template <typename DeterminismT> // variable template: true only when DeterminismT is determinism::not_guaranteed_t
80
+ inline constexpr bool is_non_deterministic_v =
81
+ ::cuda::std::is_same_v<DeterminismT, ::cuda::execution::determinism::not_guaranteed_t>;
82
+
83
+ namespace reduce
84
+ {
85
+
86
+ struct get_tuning_query_t // empty tag type; tuning<Derived>::query accepts it as its sole argument
87
+ {};
88
+
89
+ template <class Derived> // CRTP-style base: query() casts *this to Derived, so Derived should inherit tuning<Derived>
90
+ struct tuning
91
+ {
92
+ [[nodiscard]] _CCCL_NODEBUG_API constexpr auto query(const get_tuning_query_t&) const noexcept -> Derived
93
+ {
94
+ return static_cast<const Derived&>(*this); // returned by value, i.e. a copy of the derived tuning object
95
+ }
96
+ };
97
+
98
+ struct default_tuning : tuning<default_tuning> // default tuning named by the reduce_impl overloads that do not take gpu_to_gpu_t
99
+ {
100
+ template <class AccumT, class Offset, class OpT>
101
+ using fn = policy_hub<AccumT, Offset, OpT>; // maps (accumulator, offset, operator) types to a policy_hub instantiation
102
+ };
103
+
104
+ struct default_rfa_tuning : tuning<default_rfa_tuning> // CRTP base must name this type so query() yields default_rfa_tuning
105
+ {
106
+ template <class AccumT, class Offset, class OpT>
107
+ using fn = detail::rfa::policy_hub<AccumT, Offset, OpT>; // policy hub used as the default by the gpu_to_gpu_t reduce_impl overload
108
+ };
109
+
110
+ template <typename ExtremumOutIteratorT, typename IndexOutIteratorT> // functor that splits a key/value result across two output iterators
111
+ struct unzip_and_write_arg_extremum_op
112
+ {
113
+ ExtremumOutIteratorT result_out_it; // receives reduced_result.value
114
+ IndexOutIteratorT index_out_it; // receives reduced_result.key
115
+
116
+ template <typename IndexT, typename KeyValuePairT>
117
+ _CCCL_DEVICE _CCCL_FORCEINLINE void operator()(IndexT, KeyValuePairT reduced_result) // first argument is unnamed and ignored
118
+ {
119
+ *result_out_it = reduced_result.value;
120
+ *index_out_it = reduced_result.key;
121
+ }
122
+ };
123
+ } // namespace reduce
124
+ } // namespace detail
125
+
126
+ //! @rst
127
+ //! DeviceReduce provides device-wide, parallel operations for computing
128
+ //! a reduction across a sequence of data items residing within
129
+ //! device-accessible memory.
130
+ //!
131
+ //! .. image:: ../../img/reduce_logo.png
132
+ //! :align: center
133
+ //!
134
+ //! Overview
135
+ //! ====================================
136
+ //!
137
+ //! A `reduction <http://en.wikipedia.org/wiki/Reduce_(higher-order_function)>`_
138
+ //! (or *fold*) uses a binary combining operator to compute a single aggregate
139
+ //! from a sequence of input elements.
140
+ //!
141
+ //! Usage Considerations
142
+ //! ====================================
143
+ //!
144
+ //! @cdp_class{DeviceReduce}
145
+ //!
146
+ //! Performance
147
+ //! ====================================
148
+ //!
149
+ //! @linear_performance{reduction, reduce-by-key, and run-length encode}
150
+ //!
151
+ //! @endrst
152
+ struct DeviceReduce
153
+ {
154
+ private:
155
+ template <typename TuningEnvT,
155
+ typename InputIteratorT,
156
+ typename OutputIteratorT,
157
+ typename ReductionOpT,
158
+ typename TransformOpT,
159
+ typename T,
160
+ typename NumItemsT,
161
+ ::cuda::execution::determinism::__determinism_t Determinism>
162
+ CUB_RUNTIME_FUNCTION static cudaError_t reduce_impl( // overload templated on the Determinism value; forwards to DispatchTransformReduce
163
+ void* d_temp_storage,
164
+ size_t& temp_storage_bytes,
165
+ InputIteratorT d_in,
166
+ OutputIteratorT d_out,
167
+ NumItemsT num_items,
168
+ ReductionOpT reduction_op,
169
+ TransformOpT transform_op,
170
+ T init,
171
+ ::cuda::execution::determinism::__determinism_holder_t<Determinism>, // unnamed tag: only its type matters (overload selection)
172
+ cudaStream_t stream)
173
+ {
174
+ using offset_t = detail::choose_offset_t<NumItemsT>; // num_items is cast to this type before dispatch
175
+ using reduce_tuning_t = ::cuda::std::execution::
176
+ __query_result_or_t<TuningEnvT, detail::reduce::get_tuning_query_t, detail::reduce::default_tuning>; // presumably default_tuning when TuningEnvT has no tuning query - TODO confirm
177
+
178
+ using accum_t = ::cuda::std::
179
+ __accumulator_t<ReductionOpT, ::cuda::std::invoke_result_t<TransformOpT, detail::it_value_t<InputIteratorT>>, T>; // built from the transform's result type on an input value, plus T
180
+ using policy_t = typename reduce_tuning_t::template fn<accum_t, offset_t, ReductionOpT>;
181
+
182
+ using dispatch_t =
183
+ DispatchTransformReduce<InputIteratorT, OutputIteratorT, offset_t, ReductionOpT, TransformOpT, T, accum_t, policy_t>;
184
+
185
+ return dispatch_t::Dispatch( // returns Dispatch's cudaError_t unchanged
186
+ d_temp_storage,
187
+ temp_storage_bytes,
188
+ d_in,
189
+ d_out,
190
+ static_cast<offset_t>(num_items),
191
+ reduction_op,
192
+ init,
193
+ stream,
194
+ transform_op); // note the argument order: init and stream precede transform_op
195
+ }
197
+
198
+ template <typename TuningEnvT,
199
+ typename InputIteratorT,
200
+ typename OutputIteratorT,
201
+ typename ReductionOpT,
202
+ typename TransformOpT,
203
+ typename T,
204
+ typename NumItemsT>
205
+ CUB_RUNTIME_FUNCTION static cudaError_t reduce_impl( // overload selected by the gpu_to_gpu_t tag; forwards to detail::DispatchReduceDeterministic
206
+ void* d_temp_storage,
207
+ size_t& temp_storage_bytes,
208
+ InputIteratorT d_in,
209
+ OutputIteratorT d_out,
210
+ NumItemsT num_items,
211
+ ReductionOpT, // unnamed: the operator value is not forwarded, only its type is used below
212
+ TransformOpT transform_op,
213
+ T init,
214
+ ::cuda::execution::determinism::gpu_to_gpu_t, // unnamed tag: only its type matters (overload selection)
215
+ cudaStream_t stream)
216
+ {
217
+ using offset_t = detail::choose_offset_t<NumItemsT>; // num_items is cast to this type before dispatch
218
+
219
+ using reduce_tuning_t = ::cuda::std::execution::
220
+ __query_result_or_t<TuningEnvT, detail::reduce::get_tuning_query_t, detail::reduce::default_rfa_tuning>; // presumably default_rfa_tuning when TuningEnvT has no tuning query - TODO confirm
221
+
222
+ using accum_t = ::cuda::std::
223
+ __accumulator_t<ReductionOpT, ::cuda::std::invoke_result_t<TransformOpT, detail::it_value_t<InputIteratorT>>, T>; // built from the transform's result type on an input value, plus T
224
+ using policy_t = typename reduce_tuning_t::template fn<accum_t, offset_t, ReductionOpT>;
225
+ using dispatch_t =
226
+ detail::DispatchReduceDeterministic<InputIteratorT, OutputIteratorT, offset_t, T, TransformOpT, accum_t, policy_t>; // ReductionOpT is not among the dispatcher's template arguments
227
+
228
+ return dispatch_t::Dispatch( // returns Dispatch's cudaError_t unchanged
229
+ d_temp_storage, temp_storage_bytes, d_in, d_out, static_cast<offset_t>(num_items), init, stream, transform_op);
230
+ }
231
+
232
+ template <typename TuningEnvT,
233
+ typename InputIteratorT,
234
+ typename OutputIteratorT,
235
+ typename ReductionOpT,
236
+ typename TransformOpT,
237
+ typename T,
238
+ typename NumItemsT>
239
+ CUB_RUNTIME_FUNCTION static cudaError_t reduce_impl( // overload selected by the not_guaranteed_t tag; forwards to detail::DispatchReduceNondeterministic
240
+ void* d_temp_storage,
241
+ size_t& temp_storage_bytes,
242
+ InputIteratorT d_in,
243
+ OutputIteratorT d_out,
244
+ NumItemsT num_items,
245
+ ReductionOpT reduction_op,
246
+ TransformOpT transform_op,
247
+ T init,
248
+ ::cuda::execution::determinism::not_guaranteed_t, // unnamed tag: only its type matters (overload selection)
249
+ cudaStream_t stream)
250
+ {
251
+ using offset_t = detail::choose_offset_t<NumItemsT>; // num_items is cast to this type before dispatch
252
+ using accum_t = ::cuda::std::__accumulator_t<ReductionOpT, ::cuda::std::invoke_result_t<TransformOpT, detail::it_value_t<InputIteratorT>>, T>; // accumulate over the transform's result type, matching the sibling overloads
253
+
254
+ using output_t = THRUST_NS_QUALIFIER::unwrap_contiguous_iterator_t<OutputIteratorT>; // type of the unwrapped d_out passed to Dispatch below
255
+
256
+ using reduce_tuning_t = ::cuda::std::execution::
257
+ __query_result_or_t<TuningEnvT, detail::reduce::get_tuning_query_t, detail::reduce::default_tuning>; // presumably default_tuning when TuningEnvT has no tuning query - TODO confirm
258
+ using policy_t = typename reduce_tuning_t::template fn<accum_t, offset_t, ReductionOpT>;
259
+ using dispatch_t = detail::
260
+ DispatchReduceNondeterministic<InputIteratorT, output_t, offset_t, ReductionOpT, T, accum_t, TransformOpT, policy_t>;
261
+
262
+ return dispatch_t::Dispatch( // returns Dispatch's cudaError_t unchanged
263
+ d_temp_storage,
264
+ temp_storage_bytes,
265
+ d_in,
266
+ THRUST_NS_QUALIFIER::unwrap_contiguous_iterator(d_out),
267
+ static_cast<offset_t>(num_items),
268
+ reduction_op,
269
+ init,
270
+ stream,
271
+ transform_op); // note the argument order: init and stream precede transform_op
272
+ }
273
+
274
+ public:
275
+ //! @rst
276
+ //! Computes a device-wide reduction using the specified binary ``reduction_op`` functor and initial value ``init``.
277
+ //!
278
+ //! - Does not support binary reduction operators that are non-commutative.
279
+ //! - Provides "run-to-run" determinism for pseudo-associative reduction
280
+ //! (e.g., addition of floating point types) on the same GPU device.
281
+ //! However, results for pseudo-associative reduction may be inconsistent
282
+ //! from one device to another device of a different compute-capability
283
+ //! because CUB can employ different tile-sizing for different architectures.
284
+ //! - The range ``[d_in, d_in + num_items)`` shall not overlap ``d_out``.
285
+ //! - @devicestorage
286
+ //!
287
+ //! Snippet
288
+ //! +++++++++++++++++++++++++++++++++++++++++++++
289
+ //!
290
+ //! The code snippet below illustrates a user-defined min-reduction of a
291
+ //! device vector of ``int`` data elements.
292
+ //!
293
+ //! .. code-block:: c++
294
+ //!
295
+ //! #include <cub/cub.cuh>
296
+ //! // or equivalently <cub/device/device_reduce.cuh>
297
+ //!
298
+ //! // CustomMin functor
299
+ //! struct CustomMin
300
+ //! {
301
+ //! template <typename T>
302
+ //! __device__ __forceinline__
303
+ //! T operator()(const T &a, const T &b) const {
304
+ //! return (b < a) ? b : a;
305
+ //! }
306
+ //! };
307
+ //!
308
+ //! // Declare, allocate, and initialize device-accessible pointers for
309
+ //! // input and output
310
+ //! int num_items; // e.g., 7
311
+ //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
312
+ //! int *d_out; // e.g., [-]
313
+ //! CustomMin min_op;
314
+ //! int init; // e.g., INT_MAX
315
+ //! ...
316
+ //!
317
+ //! // Determine temporary device storage requirements
318
+ //! void *d_temp_storage = nullptr;
319
+ //! size_t temp_storage_bytes = 0;
320
+ //! cub::DeviceReduce::Reduce(
321
+ //! d_temp_storage, temp_storage_bytes,
322
+ //! d_in, d_out, num_items, min_op, init);
323
+ //!
324
+ //! // Allocate temporary storage
325
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
326
+ //!
327
+ //! // Run reduction
328
+ //! cub::DeviceReduce::Reduce(
329
+ //! d_temp_storage, temp_storage_bytes,
330
+ //! d_in, d_out, num_items, min_op, init);
331
+ //!
332
+ //! // d_out <-- [0]
333
+ //!
334
+ //! @endrst
335
+ //!
336
+ //! @tparam InputIteratorT
337
+ //! **[inferred]** Random-access input iterator type for reading input items @iterator
338
+ //!
339
+ //! @tparam OutputIteratorT
340
+ //! **[inferred]** Output iterator type for recording the reduced aggregate @iterator
341
+ //!
342
+ //! @tparam ReductionOpT
343
+ //! **[inferred]** Binary reduction functor type having member `T operator()(const T &a, const T &b)`
344
+ //!
345
+ //! @tparam T
346
+ //! **[inferred]** Data element type that is convertible to the `value` type of `InputIteratorT`
347
+ //!
348
+ //! @tparam NumItemsT
349
+ //! **[inferred]** Type of num_items
350
+ //!
351
+ //! @param[in] d_temp_storage
352
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
353
+ //! required allocation size is written to `temp_storage_bytes` and no work
354
+ //! is done.
355
+ //!
356
+ //! @param[in,out] temp_storage_bytes
357
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
358
+ //!
359
+ //! @param[in] d_in
360
+ //! Pointer to the input sequence of data items
361
+ //!
362
+ //! @param[out] d_out
363
+ //! Pointer to the output aggregate
364
+ //!
365
+ //! @param[in] num_items
366
+ //! Total number of input items (i.e., length of ``d_in``)
367
+ //!
368
+ //! @param[in] reduction_op
369
+ //! Binary reduction functor
370
+ //!
371
+ //! @param[in] init
372
+ //! Initial value of the reduction
373
+ //!
374
+ //! @param[in] stream
375
+ //! @rst
376
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
377
+ //! @endrst
378
+ template <typename InputIteratorT, typename OutputIteratorT, typename ReductionOpT, typename T, typename NumItemsT>
379
+ CUB_RUNTIME_FUNCTION static cudaError_t Reduce(
380
+ void* d_temp_storage,
381
+ size_t& temp_storage_bytes,
382
+ InputIteratorT d_in,
383
+ OutputIteratorT d_out,
384
+ NumItemsT num_items,
385
+ ReductionOpT reduction_op,
386
+ T init,
387
+ cudaStream_t stream = 0)
388
+ {
389
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceReduce::Reduce");
390
+
391
+ // Signed integer type for global offsets
392
+ using OffsetT = detail::choose_offset_t<NumItemsT>;
393
+
394
+ return DispatchReduce<InputIteratorT, OutputIteratorT, OffsetT, ReductionOpT, T>::Dispatch(
395
+ d_temp_storage, temp_storage_bytes, d_in, d_out, static_cast<OffsetT>(num_items), reduction_op, init, stream);
396
+ }
397
+
398
+ //! @rst
399
+ //! Computes a device-wide reduction using the specified binary ``reduction_op`` functor and initial value ``init``.
400
+ //!
401
+ //! - Does not support binary reduction operators that are non-commutative.
402
+ //! - By default, provides "run-to-run" determinism for pseudo-associative reduction
403
+ //! (e.g., addition of floating point types) on the same GPU device.
404
+ //! However, results for pseudo-associative reduction may be inconsistent
405
+ //! from one device to another device of a different compute-capability
406
+ //! because CUB can employ different tile-sizing for different architectures.
407
+ //! To request "gpu-to-gpu" determinism, pass ``cuda::execution::require(cuda::execution::determinism::gpu_to_gpu)``
408
+ //! as the `env` parameter.
409
+ //! To request "not-guaranteed" determinism, pass
410
+ //! ``cuda::execution::require(cuda::execution::determinism::not_guaranteed)`` as the `env` parameter.
411
+ //! - The range ``[d_in, d_in + num_items)`` shall not overlap ``d_out``.
412
+ //!
413
+ //! Snippet
414
+ //! +++++++++++++++++++++++++++++++++++++++++++++
415
+ //!
416
+ //! The code snippet below illustrates a user-defined min-reduction of a
417
+ //! device vector of ``int`` data elements.
418
+ //!
419
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_reduce_env_api.cu
420
+ //! :language: c++
421
+ //! :dedent:
422
+ //! :start-after: example-begin reduce-env-determinism
423
+ //! :end-before: example-end reduce-env-determinism
424
+ //!
425
+ //! @endrst
426
+ //!
427
+ //! @tparam InputIteratorT
428
+ //! **[inferred]** Random-access input iterator type for reading input items @iterator
429
+ //!
430
+ //! @tparam OutputIteratorT
431
+ //! **[inferred]** Output iterator type for recording the reduced aggregate @iterator
432
+ //!
433
+ //! @tparam ReductionOpT
434
+ //! **[inferred]** Binary reduction functor type having member `T operator()(const T &a, const T &b)`
435
+ //!
436
+ //! @tparam T
437
+ //! **[inferred]** Data element type that is convertible to the `value` type of `InputIteratorT`
438
+ //!
439
+ //! @tparam NumItemsT
440
+ //! **[inferred]** Type of num_items
441
+ //!
442
+ //! @tparam EnvT
443
+ //! **[inferred]** Execution environment type. Default is ``cuda::std::execution::env<>``.
444
+ //!
445
+ //! @param[in] d_in
446
+ //! Pointer to the input sequence of data items
447
+ //!
448
+ //! @param[out] d_out
449
+ //! Pointer to the output aggregate
450
+ //!
451
+ //! @param[in] num_items
452
+ //! Total number of input items (i.e., length of ``d_in``)
453
+ //!
454
+ //! @param[in] reduction_op
455
+ //! Binary reduction functor
456
+ //!
457
+ //! @param[in] init
458
+ //! Initial value of the reduction
459
+ //!
460
+ //! @param[in] env
461
+ //! @rst
462
+ //! **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``.
463
+ //! @endrst
464
+ template <typename InputIteratorT,
465
+ typename OutputIteratorT,
466
+ typename ReductionOpT,
467
+ typename T,
468
+ typename NumItemsT,
469
+ typename EnvT = ::cuda::std::execution::env<>>
470
+ [[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t Reduce(
471
+ InputIteratorT d_in, OutputIteratorT d_out, NumItemsT num_items, ReductionOpT reduction_op, T init, EnvT env = {})
472
+ {
473
+ _CCCL_NVTX_RANGE_SCOPE("cub::DeviceReduce::Reduce");
474
+
475
+ static_assert(!::cuda::std::execution::__queryable_with<EnvT, ::cuda::execution::determinism::__get_determinism_t>,
476
+ "Determinism should be used inside requires to have an effect.");
477
+ using requirements_t = ::cuda::std::execution::
478
+ __query_result_or_t<EnvT, ::cuda::execution::__get_requirements_t, ::cuda::std::execution::env<>>;
479
+ using default_determinism_t =
480
+ ::cuda::std::execution::__query_result_or_t<requirements_t, //
481
+ ::cuda::execution::determinism::__get_determinism_t,
482
+ ::cuda::execution::determinism::run_to_run_t>;
483
+
484
+ using accum_t = ::cuda::std::__accumulator_t<ReductionOpT, detail::it_value_t<InputIteratorT>, T>;
485
+
486
+ constexpr auto gpu_gpu_determinism =
487
+ ::cuda::std::is_same_v<default_determinism_t, ::cuda::execution::determinism::gpu_to_gpu_t>;
488
+
489
+ // integral types are always gpu-to-gpu deterministic if reduction operator is a simple cuda binary
490
+ // operator, so fall back to run-to-run determinism
491
+ constexpr auto integral_fallback =
492
+ gpu_gpu_determinism && ::cuda::std::is_integral_v<accum_t> && (detail::is_cuda_binary_operator<ReductionOpT>);
493
+
494
+ // use gpu-to-gpu determinism only for float and double types with ::cuda::std::plus operator
495
+ constexpr auto float_double_plus =
496
+ gpu_gpu_determinism && detail::is_one_of_v<accum_t, float, double> && detail::is_cuda_std_plus_v<ReductionOpT>;
497
+
498
+ constexpr auto supported = integral_fallback || float_double_plus || !gpu_gpu_determinism;
499
+
500
+ // gpu_to_gpu determinism is only supported for integral types with cuda operators, or
501
+ // float and double types with ::cuda::std::plus operator
502
+ static_assert(supported, "gpu_to_gpu determinism is unsupported");
503
+
504
+ if constexpr (!supported)
505
+ {
506
+ return cudaErrorNotSupported;
507
+ }
508
+ else
509
+ {
510
+ constexpr auto no_determinism = detail::is_non_deterministic_v<default_determinism_t>;
511
+
512
+ // Certain conditions must be met to be able to use the non-deterministic
513
+ // kernel. The output iterator must be a contiguous iterator and the
514
+ // reduction operator must be plus (for now). Additionally, since atomics for types of
515
+ // size < 4B are emulated, they perform poorly, so we fall back to the run-to-run
516
+ // determinism.
517
+ constexpr auto is_contiguous_fallback =
518
+ !no_determinism || THRUST_NS_QUALIFIER::is_contiguous_iterator_v<OutputIteratorT>;
519
+ constexpr auto is_plus_fallback = !no_determinism || detail::is_cuda_std_plus_v<ReductionOpT>;
520
+ constexpr auto is_4b_or_greater = !no_determinism || sizeof(accum_t) >= 4;
521
+
522
+ // If the conditions for gpu-to-gpu determinism or non-deterministic
523
+ // reduction are not met, we fall back to run-to-run determinism.
524
+ using determinism_t = ::cuda::std::conditional_t<
525
+ (gpu_gpu_determinism && integral_fallback)
526
+ || (no_determinism && !(is_contiguous_fallback && is_plus_fallback && is_4b_or_greater)),
527
+ ::cuda::execution::determinism::run_to_run_t,
528
+ default_determinism_t>;
529
+
530
+ // Query relevant properties from the environment
531
+ auto stream = ::cuda::std::execution::__query_or(env, ::cuda::get_stream, ::cuda::stream_ref{cudaStream_t{}});
532
+ auto mr =
533
+ ::cuda::std::execution::__query_or(env, ::cuda::mr::__get_memory_resource, detail::device_memory_resource{});
534
+
535
+ void* d_temp_storage = nullptr;
536
+ size_t temp_storage_bytes = 0;
537
+
538
+ using tuning_t = ::cuda::std::execution::
539
+ __query_result_or_t<EnvT, ::cuda::execution::__get_tuning_t, ::cuda::std::execution::env<>>;
540
+
541
+ // Query the required temporary storage size
542
+ cudaError_t error = reduce_impl<tuning_t>(
543
+ d_temp_storage,
544
+ temp_storage_bytes,
545
+ d_in,
546
+ d_out,
547
+ num_items,
548
+ reduction_op,
549
+ ::cuda::std::identity{},
550
+ init,
551
+ determinism_t{},
552
+ stream.get());
553
+ if (error != cudaSuccess)
554
+ {
555
+ return error;
556
+ }
557
+
558
+ // TODO(gevtushenko): use uninitialized buffer when it's available
559
+ error = CubDebug(detail::temporary_storage::allocate(stream, d_temp_storage, temp_storage_bytes, mr));
560
+ if (error != cudaSuccess)
561
+ {
562
+ return error;
563
+ }
564
+
565
+ // Run the algorithm
566
+ error = reduce_impl<tuning_t>(
567
+ d_temp_storage,
568
+ temp_storage_bytes,
569
+ d_in,
570
+ d_out,
571
+ num_items,
572
+ reduction_op,
573
+ ::cuda::std::identity{},
574
+ init,
575
+ determinism_t{},
576
+ stream.get());
577
+
578
+ // Try to deallocate regardless of the error to avoid memory leaks
579
+ cudaError_t deallocate_error =
580
+ CubDebug(detail::temporary_storage::deallocate(stream, d_temp_storage, temp_storage_bytes, mr));
581
+
582
+ if (error != cudaSuccess)
583
+ {
584
+ // Reduction error takes precedence over deallocation error since it happens first
585
+ return error;
586
+ }
587
+
588
+ return deallocate_error;
589
+ }
590
+ }
591
+
592
+ //! @rst
593
+ //! Computes a device-wide sum using the addition (``+``) operator.
594
+ //!
595
+ //! - Uses ``0`` as the initial value of the reduction.
596
+ //! - Does not support ``+`` operators that are non-commutative.
597
+ //! - By default, provides "run-to-run" determinism for pseudo-associative reduction
598
+ //! (e.g., addition of floating point types) on the same GPU device.
599
+ //! However, results for pseudo-associative reduction may be inconsistent
600
+ //! from one device to another device of a different compute-capability
601
+ //! because CUB can employ different tile-sizing for different architectures.
602
+ //! To request "gpu-to-gpu" determinism, pass ``cuda::execution::require(cuda::execution::determinism::gpu_to_gpu)``
603
+ //! as the `env` parameter.
604
+ //! To request "not-guaranteed" determinism, pass
605
+ //! ``cuda::execution::require(cuda::execution::determinism::not_guaranteed)`` as the `env` parameter.
606
+ //! - The range ``[d_in, d_in + num_items)`` shall not overlap ``d_out``.
607
+ //!
608
+ //! Snippet
609
+ //! +++++++++++++++++++++++++++++++++++++++++++++
610
+ //!
611
+ //! The code snippet below illustrates the sum-reduction of a
612
+ //! device vector of ``int`` data elements.
613
+ //!
614
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_reduce_env_api.cu
615
+ //! :language: c++
616
+ //! :dedent:
617
+ //! :start-after: example-begin sum-env-determinism
618
+ //! :end-before: example-end sum-env-determinism
619
+ //!
620
+ //! @endrst
621
+ //!
622
+ //! @tparam InputIteratorT
623
+ //! **[inferred]** Random-access input iterator type for reading input items @iterator
624
+ //!
625
+ //! @tparam OutputIteratorT
626
+ //! **[inferred]** Output iterator type for recording the reduced aggregate @iterator
627
+ //!
628
+ //! @tparam NumItemsT
629
+ //! **[inferred]** Type of num_items
630
+ //!
631
+ //! @tparam EnvT
632
+ //! **[inferred]** Execution environment type. Default is `cuda::std::execution::env<>`.
633
+ //!
634
+ //! @param[in] d_in
635
+ //! Pointer to the input sequence of data items
636
+ //!
637
+ //! @param[out] d_out
638
+ //! Pointer to the output aggregate
639
+ //!
640
+ //! @param[in] num_items
641
+ //! Total number of input items (i.e., length of ``d_in``)
642
+ //!
643
+ //! @param[in] env
644
+ //! @rst
645
+ //! **[optional]** Execution environment. Default is `cuda::std::execution::env{}`.
646
+ //! @endrst
647
+ template <typename InputIteratorT,
648
+ typename OutputIteratorT,
649
+ typename NumItemsT,
650
+ typename EnvT = ::cuda::std::execution::env<>>
651
+ [[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t
652
+ Sum(InputIteratorT d_in, OutputIteratorT d_out, NumItemsT num_items, EnvT env = {})
653
+ {
654
+ _CCCL_NVTX_RANGE_SCOPE("cub::DeviceReduce::Sum");
655
+
656
+ static_assert(!::cuda::std::execution::__queryable_with<EnvT, ::cuda::execution::determinism::__get_determinism_t>,
657
+ "Determinism should be used inside requires to have an effect.");
658
+ using requirements_t = ::cuda::std::execution::
659
+ __query_result_or_t<EnvT, ::cuda::execution::__get_requirements_t, ::cuda::std::execution::env<>>;
660
+ using default_determinism_t =
661
+ ::cuda::std::execution::__query_result_or_t<requirements_t, //
662
+ ::cuda::execution::determinism::__get_determinism_t,
663
+ ::cuda::execution::determinism::run_to_run_t>;
664
+
665
+ constexpr auto no_determinism = detail::is_non_deterministic_v<default_determinism_t>;
666
+
667
+ // The output iterator must be a contiguous iterator or we fall back to
668
+ // run-to-run determinism.
669
+ constexpr auto is_contiguous_fallback =
670
+ !no_determinism || THRUST_NS_QUALIFIER::is_contiguous_iterator_v<OutputIteratorT>;
671
+
672
+ using OutputT = cub::detail::non_void_value_t<OutputIteratorT, cub::detail::it_value_t<InputIteratorT>>;
673
+
674
+ // Since atomics for types of size < 4B are emulated, they perform poorly, so we fall back to the run-to-run
675
+ // determinism.
676
+ constexpr auto is_4b_or_greater = !no_determinism || sizeof(OutputT) >= 4;
677
+
678
+ using determinism_t =
679
+ ::cuda::std::conditional_t<no_determinism && !(is_contiguous_fallback && is_4b_or_greater),
680
+ ::cuda::execution::determinism::run_to_run_t,
681
+ default_determinism_t>;
682
+
683
+ // Query relevant properties from the environment
684
+ auto stream = ::cuda::std::execution::__query_or(env, ::cuda::get_stream, ::cuda::stream_ref{cudaStream_t{}});
685
+ auto mr =
686
+ ::cuda::std::execution::__query_or(env, ::cuda::mr::__get_memory_resource, detail::device_memory_resource{});
687
+
688
+ void* d_temp_storage = nullptr;
689
+ size_t temp_storage_bytes = 0;
690
+
691
+ using tuning_t =
692
+ ::cuda::std::execution::__query_result_or_t<EnvT, ::cuda::execution::__get_tuning_t, ::cuda::std::execution::env<>>;
693
+
694
+ using InitT = OutputT;
695
+
696
+ // Query the required temporary storage size
697
+ cudaError_t error = reduce_impl<tuning_t>(
698
+ d_temp_storage,
699
+ temp_storage_bytes,
700
+ d_in,
701
+ d_out,
702
+ num_items,
703
+ ::cuda::std::plus<>{},
704
+ ::cuda::std::identity{},
705
+ InitT{}, // zero-initialize
706
+ determinism_t{},
707
+ stream.get());
708
+ if (error != cudaSuccess)
709
+ {
710
+ return error;
711
+ }
712
+
713
+ // TODO(gevtushenko): use uninitialized buffer when it's available
714
+ error = CubDebug(detail::temporary_storage::allocate(stream, d_temp_storage, temp_storage_bytes, mr));
715
+ if (error != cudaSuccess)
716
+ {
717
+ return error;
718
+ }
719
+
720
+ // Run the algorithm
721
+ error = reduce_impl<tuning_t>(
722
+ d_temp_storage,
723
+ temp_storage_bytes,
724
+ d_in,
725
+ d_out,
726
+ num_items,
727
+ ::cuda::std::plus<>{},
728
+ ::cuda::std::identity{},
729
+ InitT{}, // zero-initialize
730
+ determinism_t{},
731
+ stream.get());
732
+
733
+ // Try to deallocate regardless of the error to avoid memory leaks
734
+ cudaError_t deallocate_error =
735
+ CubDebug(detail::temporary_storage::deallocate(stream, d_temp_storage, temp_storage_bytes, mr));
736
+
737
+ if (error != cudaSuccess)
738
+ {
739
+ // Reduction error takes precedence over deallocation error since it happens first
740
+ return error;
741
+ }
742
+
743
+ return deallocate_error;
744
+ }
745
+
746
+ //! @rst
747
+ //! Computes a device-wide sum using the addition (``+``) operator.
748
+ //!
749
+ //! - Uses ``0`` as the initial value of the reduction.
750
+ //! - Does not support ``+`` operators that are non-commutative.
751
+ //! - Provides "run-to-run" determinism for pseudo-associative reduction
752
+ //! (e.g., addition of floating point types) on the same GPU device.
753
+ //! However, results for pseudo-associative reduction may be inconsistent
754
+ //! from one device to another device of a different compute-capability
755
+ //! because CUB can employ different tile-sizing for different architectures.
756
+ //! - The range ``[d_in, d_in + num_items)`` shall not overlap ``d_out``.
757
+ //! - @devicestorage
758
+ //!
759
+ //! Snippet
760
+ //! +++++++++++++++++++++++++++++++++++++++++++++
761
+ //!
762
+ //! The code snippet below illustrates the sum-reduction of a device vector
763
+ //! of ``int`` data elements.
764
+ //!
765
+ //! .. code-block:: c++
766
+ //!
767
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_reduce.cuh>
768
+ //!
769
+ //! // Declare, allocate, and initialize device-accessible pointers
770
+ //! // for input and output
771
+ //! int num_items; // e.g., 7
772
+ //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
773
+ //! int *d_out; // e.g., [-]
774
+ //! ...
775
+ //!
776
+ //! // Determine temporary device storage requirements
777
+ //! void *d_temp_storage = nullptr;
778
+ //! size_t temp_storage_bytes = 0;
779
+ //! cub::DeviceReduce::Sum(
780
+ //! d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
781
+ //!
782
+ //! // Allocate temporary storage
783
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
784
+ //!
785
+ //! // Run sum-reduction
786
+ //! cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
787
+ //!
788
+ //! // d_out <-- [38]
789
+ //!
790
+ //! @endrst
791
+ //!
792
+ //! @tparam InputIteratorT
793
+ //! **[inferred]** Random-access input iterator type for reading input items @iterator
794
+ //!
795
+ //! @tparam OutputIteratorT
796
+ //! **[inferred]** Output iterator type for recording the reduced aggregate @iterator
797
+ //!
798
+ //! @tparam NumItemsT
799
+ //! **[inferred]** Type of num_items
800
+ //!
801
+ //! @param[in] d_temp_storage
802
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
803
+ //! required allocation size is written to `temp_storage_bytes` and no work is done.
804
+ //!
805
+ //! @param[in,out] temp_storage_bytes
806
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
807
+ //!
808
+ //! @param[in] d_in
809
+ //! Pointer to the input sequence of data items
810
+ //!
811
+ //! @param[out] d_out
812
+ //! Pointer to the output aggregate
813
+ //!
814
+ //! @param[in] num_items
815
+ //! Total number of input items (i.e., length of `d_in`)
816
+ //!
817
+ //! @param[in] stream
818
+ //! @rst
819
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
820
+ //! @endrst
821
+ template <typename InputIteratorT, typename OutputIteratorT, typename NumItemsT>
822
+ CUB_RUNTIME_FUNCTION static cudaError_t
823
+ Sum(void* d_temp_storage,
824
+ size_t& temp_storage_bytes,
825
+ InputIteratorT d_in,
826
+ OutputIteratorT d_out,
827
+ NumItemsT num_items,
828
+ cudaStream_t stream = 0)
829
+ {
830
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceReduce::Sum");
831
+
832
+ // Signed integer type for global offsets
833
+ using OffsetT = detail::choose_offset_t<NumItemsT>;
834
+
835
+ // The output value type
836
+ using OutputT = cub::detail::non_void_value_t<OutputIteratorT, cub::detail::it_value_t<InputIteratorT>>;
837
+
838
+ using InitT = OutputT;
839
+
840
+ return DispatchReduce<InputIteratorT, OutputIteratorT, OffsetT, ::cuda::std::plus<>, InitT>::Dispatch(
841
+ d_temp_storage,
842
+ temp_storage_bytes,
843
+ d_in,
844
+ d_out,
845
+ static_cast<OffsetT>(num_items),
846
+ ::cuda::std::plus<>{},
847
+ InitT{}, // zero-initialize
848
+ stream);
849
+ }
850
+
851
+ //! @rst
852
+ //! Computes a device-wide minimum using the less-than (``<``) operator.
853
+ //!
854
+ //! - Uses ``cuda::std::numeric_limits<T>::max()`` as the initial value of the reduction.
855
+ //! - Does not support ``<`` operators that are non-commutative.
856
+ //! - Provides "run-to-run" determinism for pseudo-associative reduction
857
+ //! (e.g., addition of floating point types) on the same GPU device.
858
+ //! However, results for pseudo-associative reduction may be inconsistent
859
+ //! from one device to a another device of a different compute-capability
860
+ //! because CUB can employ different tile-sizing for different architectures.
861
+ //! - The range ``[d_in, d_in + num_items)`` shall not overlap ``d_out``.
862
+ //! - @devicestorage
863
+ //!
864
+ //! Snippet
865
+ //! +++++++++++++++++++++++++++++++++++++++++++++
866
+ //!
867
+ //! The code snippet below illustrates the min-reduction of a device vector of ``int`` data elements.
868
+ //!
869
+ //! .. code-block:: c++
870
+ //!
871
+ //! #include <cub/cub.cuh>
872
+ //! // or equivalently <cub/device/device_reduce.cuh>
873
+ //!
874
+ //! // Declare, allocate, and initialize device-accessible pointers
875
+ //! // for input and output
876
+ //! int num_items; // e.g., 7
877
+ //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
878
+ //! int *d_out; // e.g., [-]
879
+ //! ...
880
+ //!
881
+ //! // Determine temporary device storage requirements
882
+ //! void *d_temp_storage = nullptr;
883
+ //! size_t temp_storage_bytes = 0;
884
+ //! cub::DeviceReduce::Min(
885
+ //! d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
886
+ //!
887
+ //! // Allocate temporary storage
888
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
889
+ //!
890
+ //! // Run min-reduction
891
+ //! cub::DeviceReduce::Min(
892
+ //! d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
893
+ //!
894
+ //! // d_out <-- [0]
895
+ //!
896
+ //! @endrst
897
+ //!
898
+ //! @tparam InputIteratorT
899
+ //! **[inferred]** Random-access input iterator type for reading input items @iterator
900
+ //!
901
+ //! @tparam OutputIteratorT
902
+ //! **[inferred]** Output iterator type for recording the reduced aggregate @iterator
903
+ //!
904
+ //! @tparam NumItemsT
905
+ //! **[inferred]** Type of num_items
906
+ //!
907
+ //! @param[in] d_temp_storage
908
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
909
+ //! required allocation size is written to `temp_storage_bytes` and no work is done.
910
+ //!
911
+ //! @param[in,out] temp_storage_bytes
912
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
913
+ //!
914
+ //! @param[in] d_in
915
+ //! Pointer to the input sequence of data items
916
+ //!
917
+ //! @param[out] d_out
918
+ //! Pointer to the output aggregate
919
+ //!
920
+ //! @param[in] num_items
921
+ //! Total number of input items (i.e., length of ``d_in``)
922
+ //!
923
+ //! @param[in] stream
924
+ //! @rst
925
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
926
+ //! @endrst
927
+ template <typename InputIteratorT, typename OutputIteratorT, typename NumItemsT>
928
+ CUB_RUNTIME_FUNCTION static cudaError_t
929
+ Min(void* d_temp_storage,
930
+ size_t& temp_storage_bytes,
931
+ InputIteratorT d_in,
932
+ OutputIteratorT d_out,
933
+ NumItemsT num_items,
934
+ cudaStream_t stream = 0)
935
+ {
936
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceReduce::Min");
937
+
938
+ using OffsetT = detail::choose_offset_t<NumItemsT>; // Signed integer type for global offsets
939
+ using InputT = detail::it_value_t<InputIteratorT>;
940
+ using InitT = InputT;
941
+ using limits_t = ::cuda::std::numeric_limits<InitT>;
942
+ #ifndef CCCL_SUPPRESS_NUMERIC_LIMITS_CHECK_IN_CUB_DEVICE_REDUCE_MIN_MAX
943
+ static_assert(limits_t::is_specialized,
944
+ "cub::DeviceReduce::Min uses cuda::std::numeric_limits<InputIteratorT::value_type>::max() as initial "
945
+ "value, but cuda::std::numeric_limits is not specialized for the iterator's value type. This is "
946
+ "probably a bug and you should specialize cuda::std::numeric_limits. Define "
947
+ "CCCL_SUPPRESS_NUMERIC_LIMITS_CHECK_IN_CUB_DEVICE_REDUCE_MIN_MAX to suppress this check.");
948
+ #endif // CCCL_SUPPRESS_NUMERIC_LIMITS_CHECK_IN_CUB_DEVICE_REDUCE_MIN_MAX
949
+
950
+ return DispatchReduce<InputIteratorT, OutputIteratorT, OffsetT, ::cuda::minimum<>, InitT>::Dispatch(
951
+ d_temp_storage,
952
+ temp_storage_bytes,
953
+ d_in,
954
+ d_out,
955
+ static_cast<OffsetT>(num_items),
956
+ ::cuda::minimum<>{},
957
+ limits_t::max(),
958
+ stream);
959
+ }
960
+
961
+ //! @rst
962
+ //! Computes a device-wide minimum using the less-than (``<``) operator. The result is written to the output
963
+ //! iterator.
964
+ //!
965
+ //! - Uses ``cuda::std::numeric_limits<T>::max()`` as the initial value of the reduction.
966
+ //! - Provides determinism based on the environment's determinism requirements.
967
+ //! To request "run-to-run" determinism, pass ``cuda::execution::require(cuda::execution::determinism::run_to_run)``
968
+ //! as the `env` parameter.
969
+ //! - The range ``[d_in, d_in + num_items)`` shall not overlap ``d_out``.
970
+ //!
971
+ //! Snippet
972
+ //! +++++++++++++++++++++++++++++++++++++++++++++
973
+ //!
974
+ //! The code snippet below illustrates the min-reduction of a device vector of ``int`` data elements.
975
+ //!
976
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_reduce_env_api.cu
977
+ //! :language: c++
978
+ //! :dedent:
979
+ //! :start-after: example-begin min-env-determinism
980
+ //! :end-before: example-end min-env-determinism
981
+ //!
982
+ //! @endrst
983
+ //!
984
+ //! @tparam InputIteratorT
985
+ //! **[inferred]** Random-access input iterator type for reading input items @iterator
986
+ //!
987
+ //! @tparam OutputIteratorT
988
+ //! **[inferred]** Output iterator type for recording the reduced aggregate @iterator
989
+ //!
990
+ //! @tparam NumItemsT
991
+ //! **[inferred]** Type of num_items
992
+ //!
993
+ //! @tparam EnvT
994
+ //! **[inferred]** Execution environment type. Default is `cuda::std::execution::env<>`.
995
+ //!
996
+ //! @param[in] d_in
997
+ //! Pointer to the input sequence of data items
998
+ //!
999
+ //! @param[out] d_out
1000
+ //! Pointer to the output aggregate
1001
+ //!
1002
+ //! @param[in] num_items
1003
+ //! Total number of input items (i.e., length of ``d_in``)
1004
+ //!
1005
+ //! @param[in] env
1006
+ //! @rst
1007
+ //! **[optional]** Execution environment. Default is `cuda::std::execution::env{}`.
1008
+ //! @endrst
1009
+ template <typename InputIteratorT,
1010
+ typename OutputIteratorT,
1011
+ typename NumItemsT,
1012
+ typename EnvT = ::cuda::std::execution::env<>>
1013
+ [[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t
1014
+ Min(InputIteratorT d_in, OutputIteratorT d_out, NumItemsT num_items, EnvT env = {})
1015
+ {
1016
+ _CCCL_NVTX_RANGE_SCOPE("cub::DeviceReduce::Min");
1017
+
1018
+ static_assert(!_CUDA_STD_EXEC::__queryable_with<EnvT, _CUDA_EXEC::determinism::__get_determinism_t>,
1019
+ "Determinism should be used inside requires to have an effect.");
1020
+ using requirements_t =
1021
+ _CUDA_STD_EXEC::__query_result_or_t<EnvT, _CUDA_EXEC::__get_requirements_t, _CUDA_STD_EXEC::env<>>;
1022
+ using requested_determinism_t =
1023
+ _CUDA_STD_EXEC::__query_result_or_t<requirements_t, //
1024
+ _CUDA_EXEC::determinism::__get_determinism_t,
1025
+ _CUDA_EXEC::determinism::run_to_run_t>;
1026
+
1027
+ // Static assert to reject gpu_to_gpu determinism since it's not properly implemented
1028
+ static_assert(!::cuda::std::is_same_v<requested_determinism_t, ::cuda::execution::determinism::gpu_to_gpu_t>,
1029
+ "gpu_to_gpu determinism is not supported");
1030
+
1031
+ // TODO(NaderAlAwar): Relax this once non-deterministic implementation for min / max is available
1032
+ using determinism_t = ::cuda::execution::determinism::run_to_run_t;
1033
+
1034
+ // Query relevant properties from the environment
1035
+ auto stream = _CUDA_STD_EXEC::__query_or(env, ::cuda::get_stream, ::cuda::stream_ref{cudaStream_t{}});
1036
+ auto mr = _CUDA_STD_EXEC::__query_or(env, ::cuda::mr::__get_memory_resource, detail::device_memory_resource{});
1037
+
1038
+ void* d_temp_storage = nullptr;
1039
+ size_t temp_storage_bytes = 0;
1040
+
1041
+ using tuning_t = _CUDA_STD_EXEC::__query_result_or_t<EnvT, _CUDA_EXEC::__get_tuning_t, _CUDA_STD_EXEC::env<>>;
1042
+
1043
+ using OutputT = cub::detail::non_void_value_t<OutputIteratorT, cub::detail::it_value_t<InputIteratorT>>;
1044
+
1045
+ using InitT = OutputT;
1046
+ using limits_t = ::cuda::std::numeric_limits<InitT>;
1047
+
1048
+ // Query the required temporary storage size
1049
+ cudaError_t error = reduce_impl<tuning_t>(
1050
+ d_temp_storage,
1051
+ temp_storage_bytes,
1052
+ d_in,
1053
+ d_out,
1054
+ num_items,
1055
+ ::cuda::minimum<>{},
1056
+ ::cuda::std::identity{},
1057
+ limits_t::max(),
1058
+ determinism_t{},
1059
+ stream.get());
1060
+ if (error != cudaSuccess)
1061
+ {
1062
+ return error;
1063
+ }
1064
+
1065
+ // TODO(gevtushenko): use uninitialized buffer when it's available
1066
+ error = CubDebug(detail::temporary_storage::allocate(stream, d_temp_storage, temp_storage_bytes, mr));
1067
+ if (error != cudaSuccess)
1068
+ {
1069
+ return error;
1070
+ }
1071
+
1072
+ // Run the algorithm
1073
+ error = reduce_impl<tuning_t>(
1074
+ d_temp_storage,
1075
+ temp_storage_bytes,
1076
+ d_in,
1077
+ d_out,
1078
+ num_items,
1079
+ ::cuda::minimum<>{},
1080
+ ::cuda::std::identity{},
1081
+ limits_t::max(),
1082
+ determinism_t{},
1083
+ stream.get());
1084
+
1085
+ // Try to deallocate regardless of the error to avoid memory leaks
1086
+ cudaError_t deallocate_error =
1087
+ CubDebug(detail::temporary_storage::deallocate(stream, d_temp_storage, temp_storage_bytes, mr));
1088
+
1089
+ if (error != cudaSuccess)
1090
+ {
1091
+ // Reduction error takes precedence over deallocation error since it happens first
1092
+ return error;
1093
+ }
1094
+
1095
+ return deallocate_error;
1096
+ }
1097
+
1098
+ //! @rst
1099
+ //! Finds the first device-wide minimum using the less-than (``<``) operator and also returns the index of that item.
1100
+ //!
1101
+ //! - The minimum is written to ``d_min_out``
1102
+ //! - The offset of the returned item is written to ``d_index_out``, the offset type being written is of type
1103
+ //! ``cuda::std::int64_t``.
1104
+ //! - For zero-length inputs, ``cuda::std::numeric_limits<T>::max()`` is written to ``d_min_out`` and the index
1105
+ //! ``1`` is written to ``d_index_out``.
1106
+ //! - Does not support ``<`` operators that are non-commutative.
1107
+ //! - Provides "run-to-run" determinism for pseudo-associative reduction
1108
+ //! (e.g., addition of floating point types) on the same GPU device.
1109
+ //! However, results for pseudo-associative reduction may be inconsistent
1110
+ //! from one device to another device of a different compute-capability
1111
+ //! because CUB can employ different tile-sizing for different architectures.
1112
+ //! - The range ``[d_in, d_in + num_items)`` shall not overlap ``d_min_out`` nor ``d_index_out``.
1113
+ //! - @devicestorage
1114
+ //!
1115
+ //! Snippet
1116
+ //! +++++++++++++++++++++++++++++++++++++++++++++
1117
+ //!
1118
+ //! The code snippet below illustrates the argmin-reduction of a device vector
1119
+ //! of ``int`` data elements.
1120
+ //!
1121
+ //! .. code-block:: c++
1122
+ //!
1123
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_reduce.cuh>
1124
+ //! #include <cuda/std/cstdint>
1125
+ //!
1126
+ //! // Declare, allocate, and initialize device-accessible pointers
1127
+ //! // for input and output
1128
+ //! int num_items; // e.g., 7
1129
+ //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
1130
+ //! int *d_min_out; // memory for the minimum value
1131
+ //! cuda::std::int64_t *d_index_out; // memory for the index of the returned value
1132
+ //! ...
1133
+ //!
1134
+ //! // Determine temporary device storage requirements
1135
+ //! void *d_temp_storage = nullptr;
1136
+ //! size_t temp_storage_bytes = 0;
1137
+ //! cub::DeviceReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_min_out, d_index_out,
1138
+ //! num_items);
1139
+ //!
1140
+ //! // Allocate temporary storage
1141
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
1142
+ //!
1143
+ //! // Run argmin-reduction
1144
+ //! cub::DeviceReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_min_out, d_index_out,
1145
+ //! num_items);
1146
+ //!
1147
+ //! // d_min_out <-- 0
1148
+ //! // d_index_out <-- 5
1149
+ //!
1150
+ //! @endrst
1151
+ //!
1152
+ //! @tparam InputIteratorT
1153
+ //! **[inferred]** Random-access input iterator type for reading input items
1154
+ //! (of some type `T`) @iterator
1155
+ //!
1156
+ //! @tparam ExtremumOutIteratorT
1157
+ //! **[inferred]** Output iterator type for recording minimum value
1158
+ //!
1159
+ //! @tparam IndexOutIteratorT
1160
+ //! **[inferred]** Output iterator type for recording index of the returned value
1161
+ //!
1162
+ //! @param[in] d_temp_storage
1163
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
1164
+ //! required allocation size is written to `temp_storage_bytes` and no work is done.
1165
+ //!
1166
+ //! @param[in,out] temp_storage_bytes
1167
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
1168
+ //!
1169
+ //! @param[in] d_in
1170
+ //! Iterator to the input sequence of data items
1171
+ //!
1172
+ //! @param[out] d_min_out
1173
+ //! Iterator to which the minimum value is written
1174
+ //!
1175
+ //! @param[out] d_index_out
1176
+ //! Iterator to which the index of the returned value is written
1177
+ //!
1178
+ //! @param[in] num_items
1179
+ //! Total number of input items (i.e., length of ``d_in``)
1180
+ //!
1181
+ //! @param[in] stream
1182
+ //! @rst
1183
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
1184
+ //! @endrst
1185
+ template <typename InputIteratorT, typename ExtremumOutIteratorT, typename IndexOutIteratorT>
1186
+ CUB_RUNTIME_FUNCTION static cudaError_t ArgMin(
1187
+ void* d_temp_storage,
1188
+ size_t& temp_storage_bytes,
1189
+ InputIteratorT d_in,
1190
+ ExtremumOutIteratorT d_min_out,
1191
+ IndexOutIteratorT d_index_out,
1192
+ ::cuda::std::int64_t num_items,
1193
+ cudaStream_t stream = 0)
1194
+ {
1195
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceReduce::ArgMin");
1196
+
1197
+ // Value type obtained from the input iterator
1198
+ using InputValueT = cub::detail::it_value_t<InputIteratorT>;
1199
+
1200
+ // Offset type used within the kernel and to index within one partition
1201
+ using PerPartitionOffsetT = int;
1202
+
1203
+ // Offset type used to index within the total input in the range [d_in, d_in + num_items)
1204
+ using GlobalOffsetT = ::cuda::std::int64_t;
1205
+
1206
+ // The value type used for the extremum (presumably the value type of ExtremumOutIteratorT, falling back to
1207
+ using OutputExtremumT = detail::non_void_value_t<ExtremumOutIteratorT, InputValueT>; // InputValueT when void - confirm)
1208
+ using InitT = OutputExtremumT; // the initial value is passed to the dispatcher in the extremum type
1209
+
1210
+ // Reduction operation
1211
+ using ReduceOpT = cub::ArgMin;
1212
+
1213
+ // Initial value: numeric_limits<InputValueT>::max(), converted to the output extremum type
1214
+ OutputExtremumT initial_value{::cuda::std::numeric_limits<InputValueT>::max()};
1215
+
1216
+ // Tabulate output iterator that unzips the result and writes it to the user-provided output iterators
1217
+ auto out_it = ::cuda::make_tabulate_output_iterator(
1218
+ detail::reduce::unzip_and_write_arg_extremum_op<ExtremumOutIteratorT, IndexOutIteratorT>{d_min_out, d_index_out});
1219
+ // Delegate to the streaming arg-reduce dispatcher and return its cudaError_t unchanged
1220
+ return detail::reduce::dispatch_streaming_arg_reduce_t<
1221
+ InputIteratorT,
1222
+ decltype(out_it),
1223
+ PerPartitionOffsetT,
1224
+ GlobalOffsetT,
1225
+ ReduceOpT,
1226
+ InitT>::Dispatch(d_temp_storage,
1227
+ temp_storage_bytes,
1228
+ d_in,
1229
+ out_it,
1230
+ static_cast<GlobalOffsetT>(num_items),
1231
+ ReduceOpT{},
1232
+ initial_value,
1233
+ stream);
1234
+ }
1235
+
1236
+ //! @rst
1237
+ //! Finds the first device-wide minimum using the less-than (``<``) operator and also returns the index of that item.
1238
+ //!
1239
+ //! - The minimum is written to ``d_min_out``
1240
+ //! - The offset of the returned item is written to ``d_index_out``, the offset type being written is of type
1241
+ //! ``cuda::std::int64_t``.
1242
+ //! - For zero-length inputs, ``cuda::std::numeric_limits<T>::max()`` is written to ``d_min_out`` and the index
1243
+ //! ``1`` is written to ``d_index_out``.
1244
+ //! - Does not support ``<`` operators that are non-commutative.
1245
+ //! - Provides determinism based on the environment's determinism requirements.
1246
+ //! To request "run-to-run" determinism, pass ``cuda::execution::require(cuda::execution::determinism::run_to_run)``
1247
+ //! as the ``env`` parameter.
1248
+ //! - The range ``[d_in, d_in + num_items)`` shall not overlap ``d_min_out`` nor ``d_index_out``.
1249
+ //!
1250
+ //! Snippet
1251
+ //! +++++++++++++++++++++++++++++++++++++++++++++
1252
+ //!
1253
+ //! The code snippet below illustrates the argmin-reduction of a device vector of ``int`` data elements.
1254
+ //!
1255
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_reduce_env_api.cu
1256
+ //! :language: c++
1257
+ //! :dedent:
1258
+ //! :start-after: example-begin argmin-env-determinism
1259
+ //! :end-before: example-end argmin-env-determinism
1260
+ //!
1261
+ //! @endrst
1262
+ //!
1263
+ //! @tparam InputIteratorT
1264
+ //! **[inferred]** Random-access input iterator type for reading input items
1265
+ //! (of some type `T`) @iterator
1266
+ //!
1267
+ //! @tparam ExtremumOutIteratorT
1268
+ //! **[inferred]** Output iterator type for recording minimum value
1269
+ //!
1270
+ //! @tparam IndexOutIteratorT
1271
+ //! **[inferred]** Output iterator type for recording index of the returned value
1272
+ //!
1273
+ //! @tparam EnvT
1274
+ //! **[inferred]** Execution environment type. Default is ``cuda::std::execution::env<>``.
1275
+ //!
1276
+ //! @param[in] d_in
1277
+ //! Iterator to the input sequence of data items
1278
+ //!
1279
+ //! @param[out] d_min_out
1280
+ //! Iterator to which the minimum value is written
1281
+ //!
1282
+ //! @param[out] d_index_out
1283
+ //! Iterator to which the index of the returned value is written
1284
+ //!
1285
+ //! @param[in] num_items
1286
+ //! Total number of input items (i.e., length of ``d_in``)
1287
+ //!
1288
+ //! @param[in] env
1289
+ //! @rst
1290
+ //! **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``.
1291
+ //! @endrst
1292
+ template <typename InputIteratorT,
1293
+ typename ExtremumOutIteratorT,
1294
+ typename IndexOutIteratorT,
1295
+ typename EnvT = ::cuda::std::execution::env<>>
1296
+ [[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t
1297
+ ArgMin(InputIteratorT d_in,
1298
+ ExtremumOutIteratorT d_min_out,
1299
+ IndexOutIteratorT d_index_out,
1300
+ ::cuda::std::int64_t num_items,
1301
+ EnvT env = {})
1302
+ {
1303
+ _CCCL_NVTX_RANGE_SCOPE("cub::DeviceReduce::ArgMin");
1304
+ // A determinism property that is queryable directly on EnvT (rather than inside its requirements) fails to compile
1305
+ static_assert(!_CUDA_STD_EXEC::__queryable_with<EnvT, _CUDA_EXEC::determinism::__get_determinism_t>,
1306
+ "Determinism should be used inside requires to have an effect.");
1307
+ using requirements_t =
1308
+ _CUDA_STD_EXEC::__query_result_or_t<EnvT, _CUDA_EXEC::__get_requirements_t, _CUDA_STD_EXEC::env<>>;
1309
+ using requested_determinism_t =
1310
+ _CUDA_STD_EXEC::__query_result_or_t<requirements_t, //
1311
+ _CUDA_EXEC::determinism::__get_determinism_t,
1312
+ _CUDA_EXEC::determinism::run_to_run_t>;
1313
+
1314
+ // Static assert to reject gpu_to_gpu determinism since it's not properly implemented
1315
+ static_assert(!::cuda::std::is_same_v<requested_determinism_t, ::cuda::execution::determinism::gpu_to_gpu_t>,
1316
+ "gpu_to_gpu determinism is not supported");
1317
+
1318
+ // TODO(NaderAlAwar): Relax this once non-deterministic implementation for min / max is available
1319
+ using determinism_t = ::cuda::execution::determinism::run_to_run_t; // NOTE(review): not referenced again in this overload
1320
+
1321
+ // Query the stream and memory resource from env; the third argument of each __query_or call is the fallback value
1322
+ auto stream = _CUDA_STD_EXEC::__query_or(env, ::cuda::get_stream, ::cuda::stream_ref{cudaStream_t{}});
1323
+ auto mr = _CUDA_STD_EXEC::__query_or(env, ::cuda::mr::__get_memory_resource, detail::device_memory_resource{});
1324
+ // Temporary storage is owned by this function: sized by the first Dispatch call, allocated, used, then released
1325
+ void* d_temp_storage = nullptr;
1326
+ size_t temp_storage_bytes = 0;
1327
+
1328
+ using tuning_t = _CUDA_STD_EXEC::__query_result_or_t<EnvT, _CUDA_EXEC::__get_tuning_t, _CUDA_STD_EXEC::env<>>; // NOTE(review): unused by the Dispatch calls below
1329
+
1330
+ // Reduction operation, input value type, and the per-partition / global offset types handed to the dispatcher
1331
+ using ReduceOpT = cub::ArgMin;
1332
+ using InputValueT = cub::detail::it_value_t<InputIteratorT>;
1333
+ using PerPartitionOffsetT = int;
1334
+ using GlobalOffsetT = ::cuda::std::int64_t;
1335
+
1336
+ using OutputExtremumT = detail::non_void_value_t<ExtremumOutIteratorT, InputValueT>;
1337
+ using InitT = OutputExtremumT;
1338
+
1339
+ // Initial value: numeric_limits<InputValueT>::max(), converted to the output extremum type
1340
+ OutputExtremumT initial_value{::cuda::std::numeric_limits<InputValueT>::max()};
1341
+
1342
+ // Tabulate output iterator that unzips the result and writes it to the user-provided output iterators
1343
+ auto out_it = ::cuda::make_tabulate_output_iterator(
1344
+ detail::reduce::unzip_and_write_arg_extremum_op<ExtremumOutIteratorT, IndexOutIteratorT>{d_min_out, d_index_out});
1345
+
1346
+ // Query the required temporary storage size (d_temp_storage is still nullptr for this call)
1347
+ cudaError_t error = detail::reduce::dispatch_streaming_arg_reduce_t<
1348
+ InputIteratorT,
1349
+ decltype(out_it),
1350
+ PerPartitionOffsetT,
1351
+ GlobalOffsetT,
1352
+ ReduceOpT,
1353
+ InitT>::Dispatch(d_temp_storage,
1354
+ temp_storage_bytes,
1355
+ d_in,
1356
+ out_it,
1357
+ static_cast<GlobalOffsetT>(num_items),
1358
+ ReduceOpT{},
1359
+ initial_value,
1360
+ stream.get());
1361
+ if (error != cudaSuccess)
1362
+ {
1363
+ return error;
1364
+ }
1365
+
1366
+ // TODO(gevtushenko): use uninitialized buffer when it's available
1367
+ error = CubDebug(detail::temporary_storage::allocate(stream, d_temp_storage, temp_storage_bytes, mr));
1368
+ if (error != cudaSuccess)
1369
+ {
1370
+ return error;
1371
+ }
1372
+
1373
+ // Run the algorithm: the same Dispatch call as the size query, now with the allocated temporary storage
1374
+ error = detail::reduce::dispatch_streaming_arg_reduce_t<
1375
+ InputIteratorT,
1376
+ decltype(out_it),
1377
+ PerPartitionOffsetT,
1378
+ GlobalOffsetT,
1379
+ ReduceOpT,
1380
+ InitT>::Dispatch(d_temp_storage,
1381
+ temp_storage_bytes,
1382
+ d_in,
1383
+ out_it,
1384
+ static_cast<GlobalOffsetT>(num_items),
1385
+ ReduceOpT{},
1386
+ initial_value,
1387
+ stream.get());
1388
+
1389
+ // Try to deallocate regardless of the error to avoid memory leaks
1390
+ cudaError_t deallocate_error =
1391
+ CubDebug(detail::temporary_storage::deallocate(stream, d_temp_storage, temp_storage_bytes, mr));
1392
+
1393
+ if (error != cudaSuccess)
1394
+ {
1395
+ // Reduction error takes precedence over deallocation error since it happens first
1396
+ return error;
1397
+ }
1398
+ // The reduction succeeded, so the deallocation status is the overall result
1399
+ return deallocate_error;
1400
+ }
1401
+
1402
+ //! @rst
1403
+ //! Finds the first device-wide minimum using the less-than (``<``) operator, also returning the index of that item.
1404
+ //!
1405
+ //! - The output value type of ``d_out`` is ``cub::KeyValuePair<int, T>``
1406
+ //! (assuming the value type of ``d_in`` is ``T``)
1407
+ //!
1408
+ //! - The minimum is written to ``d_out.value`` and its offset in the input array is written to ``d_out.key``.
1409
+ //! - The ``{1, cuda::std::numeric_limits<T>::max()}`` tuple is produced for zero-length inputs
1410
+ //!
1411
+ //! - Does not support ``<`` operators that are non-commutative.
1412
+ //! - Provides "run-to-run" determinism for pseudo-associative reduction
1413
+ //! (e.g., addition of floating point types) on the same GPU device.
1414
+ //! However, results for pseudo-associative reduction may be inconsistent
1415
+ //! from one device to another device of a different compute-capability
1416
+ //! because CUB can employ different tile-sizing for different architectures.
1417
+ //! - The range ``[d_in, d_in + num_items)`` shall not overlap ``d_out``.
1418
+ //! - @devicestorage
1419
+ //!
1420
+ //! Snippet
1421
+ //! +++++++++++++++++++++++++++++++++++++++++++++
1422
+ //!
1423
+ //! The code snippet below illustrates the argmin-reduction of a device vector
1424
+ //! of ``int`` data elements.
1425
+ //!
1426
+ //! .. code-block:: c++
1427
+ //!
1428
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_reduce.cuh>
1429
+ //!
1430
+ //! // Declare, allocate, and initialize device-accessible pointers
1431
+ //! // for input and output
1432
+ //! int num_items; // e.g., 7
1433
+ //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
1434
+ //! KeyValuePair<int, int> *d_argmin; // e.g., [{-,-}]
1435
+ //! ...
1436
+ //!
1437
+ //! // Determine temporary device storage requirements
1438
+ //! void *d_temp_storage = nullptr;
1439
+ //! size_t temp_storage_bytes = 0;
1440
+ //! cub::DeviceReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_argmin, num_items);
1441
+ //!
1442
+ //! // Allocate temporary storage
1443
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
1444
+ //!
1445
+ //! // Run argmin-reduction
1446
+ //! cub::DeviceReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_argmin, num_items);
1447
+ //!
1448
+ //! // d_argmin <-- [{5, 0}]
1449
+ //!
1450
+ //! @endrst
1451
+ //!
1452
+ //! @tparam InputIteratorT
1453
+ //! **[inferred]** Random-access input iterator type for reading input items
1454
+ //! (of some type `T`) @iterator
1455
+ //!
1456
+ //! @tparam OutputIteratorT
1457
+ //! **[inferred]** Output iterator type for recording the reduced aggregate
1458
+ //! (having value type ``cub::KeyValuePair<int, T>``) @iterator
1459
+ //!
1460
+ //! @param[in] d_temp_storage
1461
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
1462
+ //! required allocation size is written to `temp_storage_bytes` and no work is done.
1463
+ //!
1464
+ //! @param[in,out] temp_storage_bytes
1465
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
1466
+ //!
1467
+ //! @param[in] d_in
1468
+ //! Pointer to the input sequence of data items
1469
+ //!
1470
+ //! @param[out] d_out
1471
+ //! Pointer to the output aggregate
1472
+ //!
1473
+ //! @param[in] num_items
1474
+ //! Total number of input items (i.e., length of ``d_in``)
1475
+ //!
1476
+ //! @param[in] stream
1477
+ //! @rst
1478
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
1479
+ //! @endrst
1480
+ template <typename InputIteratorT, typename OutputIteratorT>
1481
+ CCCL_DEPRECATED_BECAUSE("CUB has superseded this interface in favor of the ArgMin interface that takes two separate "
1482
+ "iterators: one iterator to which the extremum is written and another iterator to which the "
1483
+ "index of the found extremum is written. ") CUB_RUNTIME_FUNCTION static cudaError_t
1484
+ ArgMin(void* d_temp_storage,
1485
+ size_t& temp_storage_bytes,
1486
+ InputIteratorT d_in,
1487
+ OutputIteratorT d_out,
1488
+ int num_items,
1489
+ cudaStream_t stream = 0)
1490
+ {
1491
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceReduce::ArgMin");
1492
+
1493
+ // Signed integer type for global offsets (same type as the num_items parameter of this overload)
1494
+ using OffsetT = int;
1495
+
1496
+ // Value type obtained from the input iterator
1497
+ using InputValueT = cub::detail::it_value_t<InputIteratorT>;
1498
+
1499
+ // The output tuple type, with KeyValuePair<OffsetT, InputValueT> as the second argument of non_void_value_t
1500
+ using OutputTupleT = cub::detail::non_void_value_t<OutputIteratorT, KeyValuePair<OffsetT, InputValueT>>;
1501
+
1502
+ using AccumT = OutputTupleT; // the accumulator type is the output tuple type itself
1503
+
1504
+ using InitT = detail::reduce::empty_problem_init_t<AccumT>; // NOTE(review): presumably an init value used only for empty inputs - confirm
1505
+
1506
+ // The output value type
1507
+ using OutputValueT = typename OutputTupleT::Value;
1508
+
1509
+ // Wrapped input iterator to produce index-value <OffsetT, InputT> tuples
1510
+ using ArgIndexInputIteratorT = ArgIndexInputIterator<InputIteratorT, OffsetT, OutputValueT>;
1511
+
1512
+ ArgIndexInputIteratorT d_indexed_in(d_in);
1513
+
1514
+ // Initial value: an AccumT built from (1, numeric_limits<InputValueT>::max()), wrapped in InitT
1515
+ InitT initial_value{AccumT(1, ::cuda::std::numeric_limits<InputValueT>::max())};
1516
+ // Reduce the index-augmented input with cub::ArgMin and return the dispatcher's cudaError_t unchanged
1517
+ return DispatchReduce<ArgIndexInputIteratorT, OutputIteratorT, OffsetT, cub::ArgMin, InitT, AccumT>::Dispatch(
1518
+ d_temp_storage, temp_storage_bytes, d_indexed_in, d_out, num_items, cub::ArgMin(), initial_value, stream);
1519
+ }
1520
+
1521
+ //! @rst
1522
+ //! Computes a device-wide maximum using the greater-than (``>``) operator.
1523
+ //!
1524
+ //! - Uses ``cuda::std::numeric_limits<T>::lowest()`` as the initial value of the reduction.
1525
+ //! - Does not support ``>`` operators that are non-commutative.
1526
+ //! - Provides "run-to-run" determinism for pseudo-associative reduction
1527
+ //! (e.g., addition of floating point types) on the same GPU device.
1528
+ //! However, results for pseudo-associative reduction may be inconsistent
1529
+ //! from one device to another device of a different compute-capability
1530
+ //! because CUB can employ different tile-sizing for different architectures.
1531
+ //! - The range ``[d_in, d_in + num_items)`` shall not overlap ``d_out``.
1532
+ //! - @devicestorage
1533
+ //!
1534
+ //! Snippet
1535
+ //! +++++++++++++++++++++++++++++++++++++++++++++
1536
+ //!
1537
+ //! The code snippet below illustrates the max-reduction of a device vector of ``int`` data elements.
1538
+ //!
1539
+ //! .. code-block:: c++
1540
+ //!
1541
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_reduce.cuh>
1542
+ //!
1543
+ //! // Declare, allocate, and initialize device-accessible pointers
1544
+ //! // for input and output
1545
+ //! int num_items; // e.g., 7
1546
+ //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
1547
+ //! int *d_max; // e.g., [-]
1548
+ //! ...
1549
+ //!
1550
+ //! // Determine temporary device storage requirements
1551
+ //! void *d_temp_storage = nullptr;
1552
+ //! size_t temp_storage_bytes = 0;
1553
+ //! cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_max, num_items);
1554
+ //!
1555
+ //! // Allocate temporary storage
1556
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
1557
+ //!
1558
+ //! // Run max-reduction
1559
+ //! cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_max, num_items);
1560
+ //!
1561
+ //! // d_max <-- [9]
1562
+ //!
1563
+ //! @endrst
1564
+ //!
1565
+ //! @tparam InputIteratorT
1566
+ //! **[inferred]** Random-access input iterator type for reading input items @iterator
1567
+ //!
1568
+ //! @tparam OutputIteratorT
1569
+ //! **[inferred]** Output iterator type for recording the reduced aggregate @iterator
1570
+ //!
1571
+ //! @tparam NumItemsT
1572
+ //! **[inferred]** Type of num_items
1573
+ //!
1574
+ //! @param[in] d_temp_storage
1575
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
1576
+ //! required allocation size is written to `temp_storage_bytes` and no work is done.
1577
+ //!
1578
+ //! @param[in,out] temp_storage_bytes
1579
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
1580
+ //!
1581
+ //! @param[in] d_in
1582
+ //! Pointer to the input sequence of data items
1583
+ //!
1584
+ //! @param[out] d_out
1585
+ //! Pointer to the output aggregate
1586
+ //!
1587
+ //! @param[in] num_items
1588
+ //! Total number of input items (i.e., length of ``d_in``)
1589
+ //!
1590
+ //! @param[in] stream
1591
+ //! @rst
1592
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
1593
+ //! @endrst
1594
+ template <typename InputIteratorT, typename OutputIteratorT, typename NumItemsT>
1595
+ CUB_RUNTIME_FUNCTION static cudaError_t
1596
+ Max(void* d_temp_storage,
1597
+ size_t& temp_storage_bytes,
1598
+ InputIteratorT d_in,
1599
+ OutputIteratorT d_out,
1600
+ NumItemsT num_items,
1601
+ cudaStream_t stream = 0)
1602
+ {
1603
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceReduce::Max");
1604
+
1605
+ // Offset type for global offsets, selected from NumItemsT by detail::choose_offset_t
1606
+ using OffsetT = detail::choose_offset_t<NumItemsT>;
1607
+ using InputT = detail::it_value_t<InputIteratorT>;
1608
+ using InitT = InputT; // the initial value has the input iterator's value type
1609
+ using limits_t = ::cuda::std::numeric_limits<InitT>; // supplies lowest() below; checked for specialization next
1610
+ #ifndef CCCL_SUPPRESS_NUMERIC_LIMITS_CHECK_IN_CUB_DEVICE_REDUCE_MIN_MAX
1611
+ static_assert(limits_t::is_specialized,
1612
+ "cub::DeviceReduce::Max uses cuda::std::numeric_limits<InputIteratorT::value_type>::lowest() as "
1613
+ "initial value, but cuda::std::numeric_limits is not specialized for the iterator's value type. This "
1614
+ "is probably a bug and you should specialize cuda::std::numeric_limits. Define "
1615
+ "CCCL_SUPPRESS_NUMERIC_LIMITS_CHECK_IN_CUB_DEVICE_REDUCE_MIN_MAX to suppress this check.");
1616
+ #endif // CCCL_SUPPRESS_NUMERIC_LIMITS_CHECK_IN_CUB_DEVICE_REDUCE_MIN_MAX
1617
+ // Reduce with cuda::maximum<> starting from limits_t::lowest(); the dispatcher's cudaError_t is returned unchanged
1618
+ return DispatchReduce<InputIteratorT, OutputIteratorT, OffsetT, ::cuda::maximum<>, InitT>::Dispatch(
1619
+ d_temp_storage,
1620
+ temp_storage_bytes,
1621
+ d_in,
1622
+ d_out,
1623
+ static_cast<OffsetT>(num_items),
1624
+ ::cuda::maximum<>{},
1625
+ limits_t::lowest(),
1626
+ stream);
1627
+ }
1628
+
1629
+ //! @rst
1630
+ //! Computes a device-wide maximum using the greater-than (``>``) operator. The result is written to the output
1631
+ //! iterator.
1632
+ //!
1633
+ //! - Uses ``cuda::std::numeric_limits<T>::lowest()`` as the initial value of the reduction.
1634
+ //! - Provides determinism based on the environment's determinism requirements.
1635
+ //! To request "run-to-run" determinism, pass ``cuda::execution::require(cuda::execution::determinism::run_to_run)``
1636
+ //! as the ``env`` parameter.
1637
+ //! - The range ``[d_in, d_in + num_items)`` shall not overlap ``d_out``.
1638
+ //!
1639
+ //! Snippet
1640
+ //! +++++++++++++++++++++++++++++++++++++++++++++
1641
+ //!
1642
+ //! The code snippet below illustrates the max-reduction of a device vector of ``int`` data elements.
1643
+ //!
1644
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_reduce_env_api.cu
1645
+ //! :language: c++
1646
+ //! :dedent:
1647
+ //! :start-after: example-begin max-env-determinism
1648
+ //! :end-before: example-end max-env-determinism
1649
+ //!
1650
+ //! @endrst
1651
+ //!
1652
+ //! @tparam InputIteratorT
1653
+ //! **[inferred]** Random-access input iterator type for reading input items @iterator
1654
+ //!
1655
+ //! @tparam OutputIteratorT
1656
+ //! **[inferred]** Output iterator type for recording the reduced aggregate @iterator
1657
+ //!
1658
+ //! @tparam NumItemsT
1659
+ //! **[inferred]** Type of num_items
1660
+ //!
1661
+ //! @tparam EnvT
1662
+ //! **[inferred]** Execution environment type. Default is `cuda::std::execution::env<>`.
1663
+ //!
1664
+ //! @param[in] d_in
1665
+ //! Pointer to the input sequence of data items
1666
+ //!
1667
+ //! @param[out] d_out
1668
+ //! Pointer to the output aggregate
1669
+ //!
1670
+ //! @param[in] num_items
1671
+ //! Total number of input items (i.e., length of ``d_in``)
1672
+ //!
1673
+ //! @param[in] env
1674
+ //! @rst
1675
+ //! **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``.
1676
+ //! @endrst
1677
+ template <typename InputIteratorT,
1678
+ typename OutputIteratorT,
1679
+ typename NumItemsT,
1680
+ typename EnvT = ::cuda::std::execution::env<>>
1681
+ [[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t
1682
+ Max(InputIteratorT d_in, OutputIteratorT d_out, NumItemsT num_items, EnvT env = {})
1683
+ {
1684
+ _CCCL_NVTX_RANGE_SCOPE("cub::DeviceReduce::Max");
1685
+ // A determinism property that is queryable directly on EnvT (rather than inside its requirements) fails to compile
1686
+ static_assert(!_CUDA_STD_EXEC::__queryable_with<EnvT, _CUDA_EXEC::determinism::__get_determinism_t>,
1687
+ "Determinism should be used inside requires to have an effect.");
1688
+ using requirements_t =
1689
+ _CUDA_STD_EXEC::__query_result_or_t<EnvT, _CUDA_EXEC::__get_requirements_t, _CUDA_STD_EXEC::env<>>;
1690
+ using requested_determinism_t =
1691
+ _CUDA_STD_EXEC::__query_result_or_t<requirements_t, //
1692
+ _CUDA_EXEC::determinism::__get_determinism_t,
1693
+ _CUDA_EXEC::determinism::run_to_run_t>;
1694
+
1695
+ // Static assert to reject gpu_to_gpu determinism since it's not properly implemented
1696
+ static_assert(!::cuda::std::is_same_v<requested_determinism_t, ::cuda::execution::determinism::gpu_to_gpu_t>,
1697
+ "gpu_to_gpu determinism is not supported");
1698
+
1699
+ // TODO(NaderAlAwar): Relax this once non-deterministic implementation for min / max is available
1700
+ using determinism_t = ::cuda::execution::determinism::run_to_run_t; // always run_to_run, whatever was requested
1701
+
1702
+ // Query the stream and memory resource from env; the third argument of each __query_or call is the fallback value
1703
+ auto stream = _CUDA_STD_EXEC::__query_or(env, ::cuda::get_stream, ::cuda::stream_ref{cudaStream_t{}});
1704
+ auto mr = _CUDA_STD_EXEC::__query_or(env, ::cuda::mr::__get_memory_resource, detail::device_memory_resource{});
1705
+ // Temporary storage is owned by this function: sized by the first reduce_impl call, allocated, used, then released
1706
+ void* d_temp_storage = nullptr;
1707
+ size_t temp_storage_bytes = 0;
1708
+ // Tuning type taken from env (env<> as the fallback type); passed to reduce_impl as its template argument
1709
+ using tuning_t = _CUDA_STD_EXEC::__query_result_or_t<EnvT, _CUDA_EXEC::__get_tuning_t, _CUDA_STD_EXEC::env<>>;
1710
+
1711
+ using OutputT = cub::detail::non_void_value_t<OutputIteratorT, cub::detail::it_value_t<InputIteratorT>>;
1712
+ // NOTE(review): the temp-storage Max overload uses the input value type and static_asserts is_specialized; this one does neither
1713
+ using InitT = OutputT;
1714
+ using limits_t = ::cuda::std::numeric_limits<InitT>;
1715
+
1716
+ // Query the required temporary storage size (d_temp_storage is still nullptr for this call)
1717
+ cudaError_t error = reduce_impl<tuning_t>(
1718
+ d_temp_storage,
1719
+ temp_storage_bytes,
1720
+ d_in,
1721
+ d_out,
1722
+ num_items,
1723
+ ::cuda::maximum<>{},
1724
+ ::cuda::std::identity{},
1725
+ limits_t::lowest(),
1726
+ determinism_t{},
1727
+ stream.get());
1728
+ if (error != cudaSuccess)
1729
+ {
1730
+ return error;
1731
+ }
1732
+
1733
+ // TODO(gevtushenko): use uninitialized buffer when it's available
1734
+ error = CubDebug(detail::temporary_storage::allocate(stream, d_temp_storage, temp_storage_bytes, mr));
1735
+ if (error != cudaSuccess)
1736
+ {
1737
+ return error;
1738
+ }
1739
+
1740
+ // Run the algorithm: the same reduce_impl call as the size query, now with the allocated temporary storage
1741
+ error = reduce_impl<tuning_t>(
1742
+ d_temp_storage,
1743
+ temp_storage_bytes,
1744
+ d_in,
1745
+ d_out,
1746
+ num_items,
1747
+ ::cuda::maximum<>{},
1748
+ ::cuda::std::identity{},
1749
+ limits_t::lowest(),
1750
+ determinism_t{},
1751
+ stream.get());
1752
+
1753
+ // Try to deallocate regardless of the error to avoid memory leaks
1754
+ cudaError_t deallocate_error =
1755
+ CubDebug(detail::temporary_storage::deallocate(stream, d_temp_storage, temp_storage_bytes, mr));
1756
+
1757
+ if (error != cudaSuccess)
1758
+ {
1759
+ // Reduction error takes precedence over deallocation error since it happens first
1760
+ return error;
1761
+ }
1762
+ // The reduction succeeded, so the deallocation status is the overall result
1763
+ return deallocate_error;
1764
+ }
1765
+
1766
+ //! @rst
1767
+ //! Finds the first device-wide maximum using the greater-than (``>``) operator and also returns the index of that
1768
+ //! item.
1769
+ //!
1770
+ //! - The maximum is written to ``d_max_out``
1771
+ //! - The offset of the returned item is written to ``d_index_out``, the offset type being written is of type
1772
+ //! ``cuda::std::int64_t``.
1773
+ //! - For zero-length inputs, ``cuda::std::numeric_limits<T>::max()`` is written to ``d_max_out`` and the index
1774
+ //! ``1`` is written to ``d_index_out``.
1775
+ //! - Does not support ``>`` operators that are non-commutative.
1776
+ //! - Provides "run-to-run" determinism for pseudo-associative reduction
1777
+ //! (e.g., addition of floating point types) on the same GPU device.
1778
+ //! However, results for pseudo-associative reduction may be inconsistent
1779
+ //! from one device to another device of a different compute-capability
1780
+ //! because CUB can employ different tile-sizing for different architectures.
1781
+ //! - The range ``[d_in, d_in + num_items)`` shall not overlap ``d_max_out`` nor ``d_index_out``.
1782
+ //! - @devicestorage
1783
+ //!
1784
+ //! Snippet
1785
+ //! +++++++++++++++++++++++++++++++++++++++++++++
1786
+ //!
1787
+ //! The code snippet below illustrates the argmax-reduction of a device vector
1788
+ //! of ``int`` data elements.
1789
+ //!
1790
+ //! .. code-block:: c++
1791
+ //!
1792
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_reduce.cuh>
1793
+ //! #include <cuda/std/cstdint>
1794
+ //!
1795
+ //! // Declare, allocate, and initialize device-accessible pointers
1796
+ //! // for input and output
1797
+ //! int num_items; // e.g., 7
1798
+ //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
1799
+ //! int *d_max_out; // memory for the maximum value
1800
+ //! cuda::std::int64_t *d_index_out; // memory for the index of the returned value
1801
+ //! ...
1802
+ //!
1803
+ //! // Determine temporary device storage requirements
1804
+ //! void *d_temp_storage = nullptr;
1805
+ //! size_t temp_storage_bytes = 0;
1806
+ //! cub::DeviceReduce::ArgMax(
1807
+ //! d_temp_storage, temp_storage_bytes, d_in, d_max_out, d_index_out, num_items);
1808
+ //!
1809
+ //! // Allocate temporary storage
1810
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
1811
+ //!
1812
+ //! // Run argmax-reduction
1813
+ //! cub::DeviceReduce::ArgMax(
1814
+ //! d_temp_storage, temp_storage_bytes, d_in, d_max_out, d_index_out, num_items);
1815
+ //!
1816
+ //! // d_max_out <-- 9
1817
+ //! // d_index_out <-- 6
1818
+ //!
1819
+ //! @endrst
1820
+ //!
1821
+ //! @tparam InputIteratorT
1822
+ //! **[inferred]** Random-access input iterator type for reading input items (of some type `T`) @iterator
1823
+ //!
1824
+ //! @tparam ExtremumOutIteratorT
1825
+ //! **[inferred]** Output iterator type for recording maximum value
1826
+ //!
1827
+ //! @tparam IndexOutIteratorT
1828
+ //! **[inferred]** Output iterator type for recording index of the returned value
1829
+ //!
1830
+ //! @param[in] d_temp_storage
1831
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
1832
+ //! required allocation size is written to `temp_storage_bytes` and no work is done.
1833
+ //!
1834
+ //! @param[in,out] temp_storage_bytes
1835
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
1836
+ //!
1837
+ //! @param[in] d_in
1838
+ //! Pointer to the input sequence of data items
1839
+ //!
1840
+ //! @param[out] d_max_out
1841
+ //! Iterator to which the maximum value is written
1842
+ //!
1843
+ //! @param[out] d_index_out
1844
+ //! Iterator to which the index of the returned value is written
1845
+ //!
1846
+ //! @param[in] num_items
1847
+ //! Total number of input items (i.e., length of ``d_in``)
1848
+ //!
1849
+ //! @param[in] stream
1850
+ //! @rst
1851
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
1852
+ //! @endrst
1853
+ template <typename InputIteratorT, typename ExtremumOutIteratorT, typename IndexOutIteratorT>
1854
+ CUB_RUNTIME_FUNCTION static cudaError_t ArgMax(
1855
+ void* d_temp_storage,
1856
+ size_t& temp_storage_bytes,
1857
+ InputIteratorT d_in,
1858
+ ExtremumOutIteratorT d_max_out,
1859
+ IndexOutIteratorT d_index_out,
1860
+ ::cuda::std::int64_t num_items,
1861
+ cudaStream_t stream = 0)
1862
+ {
1863
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceReduce::ArgMax");
1864
+
1865
+ // The input type
1866
+ using InputValueT = cub::detail::it_value_t<InputIteratorT>;
1867
+
1868
+ // Offset type used within the kernel and to index within one partition
1869
+ using PerPartitionOffsetT = int;
1870
+
1871
+ // Offset type used to index within the total input in the range [d_in, d_in + num_items)
1872
+ using GlobalOffsetT = ::cuda::std::int64_t;
1873
+
1874
+ // The value type used for the extremum
1875
+ using OutputExtremumT = detail::non_void_value_t<ExtremumOutIteratorT, InputValueT>;
1876
+ using InitT = OutputExtremumT;
1877
+
1878
+ // Reduction operation
1879
+ using ReduceOpT = cub::ArgMax;
1880
+
1881
+ // Initial value
1882
+ OutputExtremumT initial_value{::cuda::std::numeric_limits<InputValueT>::lowest()};
1883
+
1884
+ // Tabulate output iterator that unzips the result and writes it to the user-provided output iterators
1885
+ auto out_it = ::cuda::make_tabulate_output_iterator(
1886
+ detail::reduce::unzip_and_write_arg_extremum_op<ExtremumOutIteratorT, IndexOutIteratorT>{d_max_out, d_index_out});
1887
+
1888
+ return detail::reduce::dispatch_streaming_arg_reduce_t<
1889
+ InputIteratorT,
1890
+ decltype(out_it),
1891
+ PerPartitionOffsetT,
1892
+ GlobalOffsetT,
1893
+ ReduceOpT,
1894
+ InitT>::Dispatch(d_temp_storage,
1895
+ temp_storage_bytes,
1896
+ d_in,
1897
+ out_it,
1898
+ static_cast<GlobalOffsetT>(num_items),
1899
+ ReduceOpT{},
1900
+ initial_value,
1901
+ stream);
1902
+ }
1903
+
1904
+ //! @rst
1905
+ //! Finds the first device-wide maximum using the greater-than (``>``)
1906
+ //! operator, also returning the index of that item
1907
+ //!
1908
+ //! - The output value type of ``d_out`` is ``cub::KeyValuePair<int, T>``
1909
+ //! (assuming the value type of ``d_in`` is ``T``)
1910
+ //!
1911
+ //! - The maximum is written to ``d_out.value`` and its offset in the input
1912
+ //! array is written to ``d_out.key``.
1913
+ //! - The ``{1, cuda::std::numeric_limits<T>::lowest()}`` tuple is produced for zero-length inputs
1914
+ //!
1915
+ //! - Does not support ``>`` operators that are non-commutative.
1916
+ //! - Provides "run-to-run" determinism for pseudo-associative reduction
1917
+ //! (e.g., addition of floating point types) on the same GPU device.
1918
+ //! However, results for pseudo-associative reduction may be inconsistent
1919
+ //! from one device to another device of a different compute-capability
1920
+ //! because CUB can employ different tile-sizing for different architectures.
1921
+ //! - The range ``[d_in, d_in + num_items)`` shall not overlap ``d_out``.
1922
+ //! - @devicestorage
1923
+ //!
1924
+ //! Snippet
1925
+ //! +++++++++++++++++++++++++++++++++++++++++++++
1926
+ //!
1927
+ //! The code snippet below illustrates the argmax-reduction of a device vector
1928
+ //! of `int` data elements.
1929
+ //!
1930
+ //! .. code-block:: c++
1931
+ //!
1932
+ //! #include <cub/cub.cuh>
1933
+ //! // or equivalently <cub/device/device_reduce.cuh>
1934
+ //!
1935
+ //! // Declare, allocate, and initialize device-accessible pointers
1936
+ //! // for input and output
1937
+ //! int num_items; // e.g., 7
1938
+ //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
1939
+ //! KeyValuePair<int, int> *d_argmax; // e.g., [{-,-}]
1940
+ //! ...
1941
+ //!
1942
+ //! // Determine temporary device storage requirements
1943
+ //! void *d_temp_storage = nullptr;
1944
+ //! size_t temp_storage_bytes = 0;
1945
+ //! cub::DeviceReduce::ArgMax(
1946
+ //! d_temp_storage, temp_storage_bytes, d_in, d_argmax, num_items);
1947
+ //!
1948
+ //! // Allocate temporary storage
1949
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
1950
+ //!
1951
+ //! // Run argmax-reduction
1952
+ //! cub::DeviceReduce::ArgMax(
1953
+ //! d_temp_storage, temp_storage_bytes, d_in, d_argmax, num_items);
1954
+ //!
1955
+ //! // d_argmax <-- [{6, 9}]
1956
+ //!
1957
+ //! @endrst
1958
+ //!
1959
+ //! @tparam InputIteratorT
1960
+ //! **[inferred]** Random-access input iterator type for reading input items (of some type `T`) @iterator
1961
+ //!
1962
+ //! @tparam OutputIteratorT
1963
+ //! **[inferred]** Output iterator type for recording the reduced aggregate
1964
+ //! (having value type `cub::KeyValuePair<int, T>`) @iterator
1965
+ //!
1966
+ //! @param[in] d_temp_storage
1967
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
1968
+ //! required allocation size is written to `temp_storage_bytes` and no work is done.
1969
+ //!
1970
+ //! @param[in,out] temp_storage_bytes
1971
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
1972
+ //!
1973
+ //! @param[in] d_in
1974
+ //! Pointer to the input sequence of data items
1975
+ //!
1976
+ //! @param[out] d_out
1977
+ //! Pointer to the output aggregate
1978
+ //!
1979
+ //! @param[in] num_items
1980
+ //! Total number of input items (i.e., length of ``d_in``)
1981
+ //!
1982
+ //! @param[in] stream
1983
+ //! @rst
1984
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
1985
+ //! @endrst
1986
+ template <typename InputIteratorT, typename OutputIteratorT>
1987
+ CCCL_DEPRECATED_BECAUSE("CUB has superseded this interface in favor of the ArgMax interface that takes two separate "
1988
+ "iterators: one iterator to which the extremum is written and another iterator to which the "
1989
+ "index of the found extremum is written. ") CUB_RUNTIME_FUNCTION static cudaError_t
1990
+ ArgMax(void* d_temp_storage,
1991
+ size_t& temp_storage_bytes,
1992
+ InputIteratorT d_in,
1993
+ OutputIteratorT d_out,
1994
+ int num_items,
1995
+ cudaStream_t stream = 0)
1996
+ {
1997
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceReduce::ArgMax");
1998
+
1999
+ // Signed integer type for global offsets
2000
+ using OffsetT = int;
2001
+
2002
+ // The input type
2003
+ using InputValueT = cub::detail::it_value_t<InputIteratorT>;
2004
+
2005
+ // The output tuple type
2006
+ using OutputTupleT = cub::detail::non_void_value_t<OutputIteratorT, KeyValuePair<OffsetT, InputValueT>>;
2007
+
2008
+ using AccumT = OutputTupleT;
2009
+
2010
+ // The output value type
2011
+ using OutputValueT = typename OutputTupleT::Value;
2012
+
2013
+ using InitT = detail::reduce::empty_problem_init_t<AccumT>;
2014
+
2015
+ // Wrapped input iterator to produce index-value <OffsetT, InputT> tuples
2016
+ using ArgIndexInputIteratorT = ArgIndexInputIterator<InputIteratorT, OffsetT, OutputValueT>;
2017
+
2018
+ ArgIndexInputIteratorT d_indexed_in(d_in);
2019
+
2020
+ // Initial value
2021
+ InitT initial_value{AccumT(1, ::cuda::std::numeric_limits<InputValueT>::lowest())};
2022
+
2023
+ return DispatchReduce<ArgIndexInputIteratorT, OutputIteratorT, OffsetT, cub::ArgMax, InitT, AccumT>::Dispatch(
2024
+ d_temp_storage, temp_storage_bytes, d_indexed_in, d_out, num_items, cub::ArgMax(), initial_value, stream);
2025
+ }
2026
+
2027
+ //! @rst
2028
+ //! Finds the first device-wide maximum using the greater-than (``>``) operator and also returns the index of that
2029
+ //! item.
2030
+ //!
2031
+ //! - The maximum is written to ``d_max_out``
2032
+ //! - The offset of the returned item is written to ``d_index_out``, the offset type being written is of type
2033
+ //! ``cuda::std::int64_t``.
2034
+ //! - For zero-length inputs, ``cuda::std::numeric_limits<T>::lowest()`` is written to ``d_max_out`` and the index
2035
+ //! ``1`` is written to ``d_index_out``.
2036
+ //! - Does not support ``>`` operators that are non-commutative.
2037
+ //! - Provides determinism based on the environment's determinism requirements.
2038
+ //! To request "run-to-run" determinism, pass ``cuda::execution::require(cuda::execution::determinism::run_to_run)``
2039
+ //! as the `env` parameter.
2040
+ //! - The range ``[d_in, d_in + num_items)`` shall not overlap ``d_max_out`` nor ``d_index_out``.
2041
+ //!
2042
+ //! Snippet
2043
+ //! +++++++++++++++++++++++++++++++++++++++++++++
2044
+ //!
2045
+ //! The code snippet below illustrates the argmax-reduction of a device vector of ``int`` data elements.
2046
+ //!
2047
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_reduce_env_api.cu
2048
+ //! :language: c++
2049
+ //! :dedent:
2050
+ //! :start-after: example-begin argmax-env-determinism
2051
+ //! :end-before: example-end argmax-env-determinism
2052
+ //!
2053
+ //! @endrst
2054
+ //!
2055
+ //! @tparam InputIteratorT
2056
+ //! **[inferred]** Random-access input iterator type for reading input items
2057
+ //! (of some type `T`) @iterator
2058
+ //!
2059
+ //! @tparam ExtremumOutIteratorT
2060
+ //! **[inferred]** Output iterator type for recording maximum value
2061
+ //!
2062
+ //! @tparam IndexOutIteratorT
2063
+ //! **[inferred]** Output iterator type for recording index of the returned value
2064
+ //!
2065
+ //! @tparam EnvT
2066
+ //! **[inferred]** Execution environment type. Default is ``cuda::std::execution::env<>``.
2067
+ //!
2068
+ //! @param[in] d_in
2069
+ //! Iterator to the input sequence of data items
2070
+ //!
2071
+ //! @param[out] d_max_out
2072
+ //! Iterator to which the maximum value is written
2073
+ //!
2074
+ //! @param[out] d_index_out
2075
+ //! Iterator to which the index of the returned value is written
2076
+ //!
2077
+ //! @param[in] num_items
2078
+ //! Total number of input items (i.e., length of ``d_in``)
2079
+ //!
2080
+ //! @param[in] env
2081
+ //! @rst
2082
+ //! **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``.
2083
+ //! @endrst
2084
+ template <typename InputIteratorT,
2085
+ typename ExtremumOutIteratorT,
2086
+ typename IndexOutIteratorT,
2087
+ typename EnvT = ::cuda::std::execution::env<>>
2088
+ [[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t
2089
+ ArgMax(InputIteratorT d_in,
2090
+ ExtremumOutIteratorT d_max_out,
2091
+ IndexOutIteratorT d_index_out,
2092
+ ::cuda::std::int64_t num_items,
2093
+ EnvT env = {})
2094
+ {
2095
+ _CCCL_NVTX_RANGE_SCOPE("cub::DeviceReduce::ArgMax");
2096
+
2097
+ static_assert(!_CUDA_STD_EXEC::__queryable_with<EnvT, _CUDA_EXEC::determinism::__get_determinism_t>,
2098
+ "Determinism should be used inside requires to have an effect.");
2099
+ using requirements_t =
2100
+ _CUDA_STD_EXEC::__query_result_or_t<EnvT, _CUDA_EXEC::__get_requirements_t, _CUDA_STD_EXEC::env<>>;
2101
+ using requested_determinism_t =
2102
+ _CUDA_STD_EXEC::__query_result_or_t<requirements_t, //
2103
+ _CUDA_EXEC::determinism::__get_determinism_t,
2104
+ _CUDA_EXEC::determinism::run_to_run_t>;
2105
+
2106
+ // Static assert to reject gpu_to_gpu determinism since it's not properly implemented
2107
+ static_assert(!::cuda::std::is_same_v<requested_determinism_t, ::cuda::execution::determinism::gpu_to_gpu_t>,
2108
+ "gpu_to_gpu determinism is not supported");
2109
+
2110
+ // TODO(NaderAlAwar): Relax this once non-deterministic implementation for min / max is available
2111
+ using determinism_t = ::cuda::execution::determinism::run_to_run_t;
2112
+
2113
+ // Query relevant properties from the environment
2114
+ auto stream = _CUDA_STD_EXEC::__query_or(env, ::cuda::get_stream, ::cuda::stream_ref{cudaStream_t{}});
2115
+ auto mr = _CUDA_STD_EXEC::__query_or(env, ::cuda::mr::__get_memory_resource, detail::device_memory_resource{});
2116
+
2117
+ void* d_temp_storage = nullptr;
2118
+ size_t temp_storage_bytes = 0;
2119
+
2120
+ using tuning_t = _CUDA_STD_EXEC::__query_result_or_t<EnvT, _CUDA_EXEC::__get_tuning_t, _CUDA_STD_EXEC::env<>>;
2121
+
2122
+ // Reduction operation
2123
+ using ReduceOpT = cub::ArgMax;
2124
+ using InputValueT = cub::detail::it_value_t<InputIteratorT>;
2125
+ using PerPartitionOffsetT = int;
2126
+ using GlobalOffsetT = ::cuda::std::int64_t;
2127
+
2128
+ using OutputExtremumT = detail::non_void_value_t<ExtremumOutIteratorT, InputValueT>;
2129
+ using InitT = OutputExtremumT;
2130
+
2131
+ // Initial value
2132
+ OutputExtremumT initial_value{::cuda::std::numeric_limits<InputValueT>::max()};
2133
+
2134
+ // Tabulate output iterator that unzips the result and writes it to the user-provided output iterators
2135
+ auto out_it = ::cuda::make_tabulate_output_iterator(
2136
+ detail::reduce::unzip_and_write_arg_extremum_op<ExtremumOutIteratorT, IndexOutIteratorT>{d_max_out, d_index_out});
2137
+
2138
+ // Query the required temporary storage size
2139
+ cudaError_t error = detail::reduce::dispatch_streaming_arg_reduce_t<
2140
+ InputIteratorT,
2141
+ decltype(out_it),
2142
+ PerPartitionOffsetT,
2143
+ GlobalOffsetT,
2144
+ ReduceOpT,
2145
+ InitT>::Dispatch(d_temp_storage,
2146
+ temp_storage_bytes,
2147
+ d_in,
2148
+ out_it,
2149
+ static_cast<GlobalOffsetT>(num_items),
2150
+ ReduceOpT{},
2151
+ initial_value,
2152
+ stream.get());
2153
+ if (error != cudaSuccess)
2154
+ {
2155
+ return error;
2156
+ }
2157
+
2158
+ // TODO(gevtushenko): use uninitialized buffer when it's available
2159
+ error = CubDebug(detail::temporary_storage::allocate(stream, d_temp_storage, temp_storage_bytes, mr));
2160
+ if (error != cudaSuccess)
2161
+ {
2162
+ return error;
2163
+ }
2164
+
2165
+ // Run the algorithm
2166
+ error = detail::reduce::dispatch_streaming_arg_reduce_t<
2167
+ InputIteratorT,
2168
+ decltype(out_it),
2169
+ PerPartitionOffsetT,
2170
+ GlobalOffsetT,
2171
+ ReduceOpT,
2172
+ InitT>::Dispatch(d_temp_storage,
2173
+ temp_storage_bytes,
2174
+ d_in,
2175
+ out_it,
2176
+ static_cast<GlobalOffsetT>(num_items),
2177
+ ReduceOpT{},
2178
+ initial_value,
2179
+ stream.get());
2180
+
2181
+ // Try to deallocate regardless of the error to avoid memory leaks
2182
+ cudaError_t deallocate_error =
2183
+ CubDebug(detail::temporary_storage::deallocate(stream, d_temp_storage, temp_storage_bytes, mr));
2184
+
2185
+ if (error != cudaSuccess)
2186
+ {
2187
+ // Reduction error takes precedence over deallocation error since it happens first
2188
+ return error;
2189
+ }
2190
+
2191
+ return deallocate_error;
2192
+ }
2193
+
2194
+ //! @rst
2195
+ //! Fuses transform and reduce operations
2196
+ //!
2197
+ //! - Does not support binary reduction operators that are non-commutative.
2198
+ //! - Provides "run-to-run" determinism for pseudo-associative reduction
2199
+ //! (e.g., addition of floating point types) on the same GPU device.
2200
+ //! However, results for pseudo-associative reduction may be inconsistent
2201
+ //! from one device to another device of a different compute-capability
2202
+ //! because CUB can employ different tile-sizing for different architectures.
2203
+ //! - The range ``[d_in, d_in + num_items)`` shall not overlap ``d_out``.
2204
+ //! - @devicestorage
2205
+ //!
2206
+ //! Snippet
2207
+ //! +++++++++++++++++++++++++++++++++++++++++++++
2208
+ //!
2209
+ //! The code snippet below illustrates a transform-reduction (sum of squares plus ``init``) of a
2210
+ //! device vector of `int` data elements.
2211
+ //!
2212
+ //! .. code-block:: c++
2213
+ //!
2214
+ //! #include <cub/cub.cuh>
2215
+ //! // or equivalently <cub/device/device_reduce.cuh>
2216
+ //!
2217
+ //! thrust::device_vector<int> in = { 1, 2, 3, 4 };
2218
+ //! thrust::device_vector<int> out(1);
2219
+ //!
2220
+ //! size_t temp_storage_bytes = 0;
2221
+ //! uint8_t *d_temp_storage = nullptr;
2222
+ //!
2223
+ //! const int init = 42;
2224
+ //!
2225
+ //! cub::DeviceReduce::TransformReduce(
2226
+ //! d_temp_storage,
2227
+ //! temp_storage_bytes,
2228
+ //! in.begin(),
2229
+ //! out.begin(),
2230
+ //! in.size(),
2231
+ //! cuda::std::plus<>{},
2232
+ //! square_t{},
2233
+ //! init);
2234
+ //!
2235
+ //! thrust::device_vector<uint8_t> temp_storage(temp_storage_bytes);
2236
+ //! d_temp_storage = temp_storage.data().get();
2237
+ //!
2238
+ //! cub::DeviceReduce::TransformReduce(
2239
+ //! d_temp_storage,
2240
+ //! temp_storage_bytes,
2241
+ //! in.begin(),
2242
+ //! out.begin(),
2243
+ //! in.size(),
2244
+ //! cuda::std::plus<>{},
2245
+ //! square_t{},
2246
+ //! init);
2247
+ //!
2248
+ //! // out[0] <-- 72
2249
+ //!
2250
+ //! @endrst
2251
+ //!
2252
+ //! @tparam InputIteratorT
2253
+ //! **[inferred]** Random-access input iterator type for reading input items @iterator
2254
+ //!
2255
+ //! @tparam OutputIteratorT
2256
+ //! **[inferred]** Output iterator type for recording the reduced aggregate @iterator
2257
+ //!
2258
+ //! @tparam ReductionOpT
2259
+ //! **[inferred]** Binary reduction functor type having member `T operator()(const T &a, const T &b)`
2260
+ //!
2261
+ //! @tparam TransformOpT
2262
+ //! **[inferred]** Unary transform functor type having member `auto operator()(const T &a)`
2263
+ //!
2264
+ //! @tparam T
2265
+ //! **[inferred]** Data element type that is convertible to the `value` type of `InputIteratorT`
2266
+ //!
2267
+ //! @tparam NumItemsT
2268
+ //! **[inferred]** Type of num_items
2269
+ //!
2270
+ //! @param[in] d_temp_storage
2271
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
2272
+ //! required allocation size is written to `temp_storage_bytes` and no work is done.
2273
+ //!
2274
+ //! @param[in,out] temp_storage_bytes
2275
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
2276
+ //!
2277
+ //! @param[in] d_in
2278
+ //! Pointer to the input sequence of data items
2279
+ //!
2280
+ //! @param[out] d_out
2281
+ //! Pointer to the output aggregate
2282
+ //!
2283
+ //! @param[in] num_items
2284
+ //! Total number of input items (i.e., length of ``d_in``)
2285
+ //!
2286
+ //! @param[in] reduction_op
2287
+ //! Binary reduction functor
2288
+ //!
2289
+ //! @param[in] transform_op
2290
+ //! Unary transform functor
2291
+ //!
2292
+ //! @param[in] init
2293
+ //! Initial value of the reduction
2294
+ //!
2295
+ //! @param[in] stream
2296
+ //! @rst
2297
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
2298
+ //! @endrst
2299
+ template <typename InputIteratorT,
2300
+ typename OutputIteratorT,
2301
+ typename ReductionOpT,
2302
+ typename TransformOpT,
2303
+ typename T,
2304
+ typename NumItemsT>
2305
+ CUB_RUNTIME_FUNCTION static cudaError_t TransformReduce(
2306
+ void* d_temp_storage,
2307
+ size_t& temp_storage_bytes,
2308
+ InputIteratorT d_in,
2309
+ OutputIteratorT d_out,
2310
+ NumItemsT num_items,
2311
+ ReductionOpT reduction_op,
2312
+ TransformOpT transform_op,
2313
+ T init,
2314
+ cudaStream_t stream = 0)
2315
+ {
2316
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceReduce::TransformReduce");
2317
+
2318
+ using OffsetT = detail::choose_offset_t<NumItemsT>;
2319
+
2320
+ return DispatchTransformReduce<InputIteratorT, OutputIteratorT, OffsetT, ReductionOpT, TransformOpT, T>::Dispatch(
2321
+ d_temp_storage,
2322
+ temp_storage_bytes,
2323
+ d_in,
2324
+ d_out,
2325
+ static_cast<OffsetT>(num_items),
2326
+ reduction_op,
2327
+ init,
2328
+ stream,
2329
+ transform_op);
2330
+ }
2331
+
2332
+ //! @rst
2333
+ //! Reduces segments of values, where segments are demarcated by corresponding runs of identical keys.
2334
+ //!
2335
+ //! This operation computes segmented reductions within ``d_values_in`` using the specified binary ``reduction_op``
2336
+ //! functor. The segments are identified by "runs" of corresponding keys in `d_keys_in`, where runs are maximal
2337
+ //! ranges of consecutive, identical keys. For the *i*\ :sup:`th` run encountered, the first key of the run and
2338
+ //! the corresponding value aggregate of that run are written to ``d_unique_out[i]`` and ``d_aggregates_out[i]``,
2339
+ //! respectively. The total number of runs encountered is written to ``d_num_runs_out``.
2340
+ //!
2341
+ //! - The ``==`` equality operator is used to determine whether keys are equivalent
2342
+ //! - Provides "run-to-run" determinism for pseudo-associative reduction
2343
+ //! (e.g., addition of floating point types) on the same GPU device.
2344
+ //! However, results for pseudo-associative reduction may be inconsistent
2345
+ //! from one device to another device of a different compute-capability
2346
+ //! because CUB can employ different tile-sizing for different architectures.
2347
+ //! - Let ``out`` be any of
2348
+ //! ``[d_unique_out, d_unique_out + *d_num_runs_out)``
2349
+ //! ``[d_aggregates_out, d_aggregates_out + *d_num_runs_out)``
2350
+ //! ``d_num_runs_out``. The ranges represented by ``out`` shall not overlap
2351
+ //! ``[d_keys_in, d_keys_in + num_items)``,
2352
+ //! ``[d_values_in, d_values_in + num_items)`` nor ``out`` in any way.
2353
+ //! - @devicestorage
2354
+ //!
2355
+ //! Snippet
2356
+ //! +++++++++++++++++++++++++++++++++++++++++++++
2357
+ //!
2358
+ //! The code snippet below illustrates the segmented reduction of ``int`` values grouped by runs of
2359
+ //! associated ``int`` keys.
2360
+ //!
2361
+ //! .. code-block:: c++
2362
+ //!
2363
+ //! #include <cub/cub.cuh>
2364
+ //! // or equivalently <cub/device/device_reduce.cuh>
2365
+ //!
2366
+ //! // CustomMin functor
2367
+ //! struct CustomMin
2368
+ //! {
2369
+ //! template <typename T>
2370
+ //! __device__ __forceinline__
2371
+ //! T operator()(const T &a, const T &b) const {
2372
+ //! return (b < a) ? b : a;
2373
+ //! }
2374
+ //! };
2375
+ //!
2376
+ //! // Declare, allocate, and initialize device-accessible pointers
2377
+ //! // for input and output
2378
+ //! int num_items; // e.g., 8
2379
+ //! int *d_keys_in; // e.g., [0, 2, 2, 9, 5, 5, 5, 8]
2380
+ //! int *d_values_in; // e.g., [0, 7, 1, 6, 2, 5, 3, 4]
2381
+ //! int *d_unique_out; // e.g., [-, -, -, -, -, -, -, -]
2382
+ //! int *d_aggregates_out; // e.g., [-, -, -, -, -, -, -, -]
2383
+ //! int *d_num_runs_out; // e.g., [-]
2384
+ //! CustomMin reduction_op;
2385
+ //! ...
2386
+ //!
2387
+ //! // Determine temporary device storage requirements
2388
+ //! void *d_temp_storage = nullptr;
2389
+ //! size_t temp_storage_bytes = 0;
2390
+ //! cub::DeviceReduce::ReduceByKey(
2391
+ //! d_temp_storage, temp_storage_bytes,
2392
+ //! d_keys_in, d_unique_out, d_values_in,
2393
+ //! d_aggregates_out, d_num_runs_out, reduction_op, num_items);
2394
+ //!
2395
+ //! // Allocate temporary storage
2396
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
2397
+ //!
2398
+ //! // Run reduce-by-key
2399
+ //! cub::DeviceReduce::ReduceByKey(
2400
+ //! d_temp_storage, temp_storage_bytes,
2401
+ //! d_keys_in, d_unique_out, d_values_in,
2402
+ //! d_aggregates_out, d_num_runs_out, reduction_op, num_items);
2403
+ //!
2404
+ //! // d_unique_out <-- [0, 2, 9, 5, 8]
2405
+ //! // d_aggregates_out <-- [0, 1, 6, 2, 4]
2406
+ //! // d_num_runs_out <-- [5]
2407
+ //!
2408
+ //! @endrst
2409
+ //!
2410
+ //! @tparam KeysInputIteratorT
2411
+ //! **[inferred]** Random-access input iterator type for reading input keys @iterator
2412
+ //!
2413
+ //! @tparam UniqueOutputIteratorT
2414
+ //! **[inferred]** Random-access output iterator type for writing unique output keys @iterator
2415
+ //!
2416
+ //! @tparam ValuesInputIteratorT
2417
+ //! **[inferred]** Random-access input iterator type for reading input values @iterator
2418
+ //!
2419
+ //! @tparam AggregatesOutputIteratorT
2420
+ //! **[inferred]** Random-access output iterator type for writing output value aggregates @iterator
2421
+ //!
2422
+ //! @tparam NumRunsOutputIteratorT
2423
+ //! **[inferred]** Output iterator type for recording the number of runs encountered @iterator
2424
+ //!
2425
+ //! @tparam ReductionOpT
2426
+ //! **[inferred]** Binary reduction functor type having member `T operator()(const T &a, const T &b)`
2427
+ //!
2428
+ //! @tparam NumItemsT
2429
+ //! **[inferred]** Type of num_items
2430
+ //!
2431
+ //! @param[in] d_temp_storage
2432
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
2433
+ //! required allocation size is written to `temp_storage_bytes` and no work is done.
2434
+ //!
2435
+ //! @param[in,out] temp_storage_bytes
2436
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
2437
+ //!
2438
+ //! @param[in] d_keys_in
2439
+ //! Pointer to the input sequence of keys
2440
+ //!
2441
+ //! @param[out] d_unique_out
2442
+ //! Pointer to the output sequence of unique keys (one key per run)
2443
+ //!
2444
+ //! @param[in] d_values_in
2445
+ //! Pointer to the input sequence of corresponding values
2446
+ //!
2447
+ //! @param[out] d_aggregates_out
2448
+ //! Pointer to the output sequence of value aggregates
2449
+ //! (one aggregate per run)
2450
+ //!
2451
+ //! @param[out] d_num_runs_out
2452
+ //! Pointer to total number of runs encountered
2453
+ //! (i.e., the length of ``d_unique_out``)
2454
+ //!
2455
+ //! @param[in] reduction_op
2456
+ //! Binary reduction functor
2457
+ //!
2458
+ //! @param[in] num_items
2459
+ //! Total number of associated key+value pairs
2460
+ //! (i.e., the length of ``d_keys_in`` and ``d_values_in``)
2461
+ //!
2462
+ //! @param[in] stream
2463
+ //! @rst
2464
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
2465
+ //! @endrst
2466
+ template <typename KeysInputIteratorT,
2467
+ typename UniqueOutputIteratorT,
2468
+ typename ValuesInputIteratorT,
2469
+ typename AggregatesOutputIteratorT,
2470
+ typename NumRunsOutputIteratorT,
2471
+ typename ReductionOpT,
2472
+ typename NumItemsT>
2473
+ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t ReduceByKey(
2474
+ void* d_temp_storage,
2475
+ size_t& temp_storage_bytes,
2476
+ KeysInputIteratorT d_keys_in,
2477
+ UniqueOutputIteratorT d_unique_out,
2478
+ ValuesInputIteratorT d_values_in,
2479
+ AggregatesOutputIteratorT d_aggregates_out,
2480
+ NumRunsOutputIteratorT d_num_runs_out,
2481
+ ReductionOpT reduction_op,
2482
+ NumItemsT num_items,
2483
+ cudaStream_t stream = 0)
2484
+ {
2485
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceReduce::ReduceByKey");
2486
+
2487
+ // Signed integer type for global offsets
2488
+ using OffsetT = detail::choose_offset_t<NumItemsT>;
2489
+
2490
+ // FlagT iterator type (not used)
2491
+
2492
+ // Selection op (not used)
2493
+
2494
+ // Default == operator
2495
+ using EqualityOp = ::cuda::std::equal_to<>;
2496
+
2497
+ return DispatchReduceByKey<
2498
+ KeysInputIteratorT,
2499
+ UniqueOutputIteratorT,
2500
+ ValuesInputIteratorT,
2501
+ AggregatesOutputIteratorT,
2502
+ NumRunsOutputIteratorT,
2503
+ EqualityOp,
2504
+ ReductionOpT,
2505
+ OffsetT>::Dispatch(d_temp_storage,
2506
+ temp_storage_bytes,
2507
+ d_keys_in,
2508
+ d_unique_out,
2509
+ d_values_in,
2510
+ d_aggregates_out,
2511
+ d_num_runs_out,
2512
+ EqualityOp(),
2513
+ reduction_op,
2514
+ static_cast<OffsetT>(num_items),
2515
+ stream);
2516
+ }
2517
+ };
2518
+ CUB_NAMESPACE_END