cuda-cccl 0.1.3.2.0.dev438__cp310-cp310-manylinux_2_24_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cuda-cccl might be problematic. Click here for more details.

Files changed (1962)
  1. cuda/cccl/__init__.py +27 -0
  2. cuda/cccl/_cuda_version_utils.py +24 -0
  3. cuda/cccl/cooperative/__init__.py +3 -0
  4. cuda/cccl/cooperative/experimental/__init__.py +8 -0
  5. cuda/cccl/cooperative/experimental/_caching.py +48 -0
  6. cuda/cccl/cooperative/experimental/_common.py +275 -0
  7. cuda/cccl/cooperative/experimental/_nvrtc.py +91 -0
  8. cuda/cccl/cooperative/experimental/_scan_op.py +181 -0
  9. cuda/cccl/cooperative/experimental/_types.py +937 -0
  10. cuda/cccl/cooperative/experimental/_typing.py +107 -0
  11. cuda/cccl/cooperative/experimental/block/__init__.py +39 -0
  12. cuda/cccl/cooperative/experimental/block/_block_exchange.py +251 -0
  13. cuda/cccl/cooperative/experimental/block/_block_load_store.py +215 -0
  14. cuda/cccl/cooperative/experimental/block/_block_merge_sort.py +125 -0
  15. cuda/cccl/cooperative/experimental/block/_block_radix_sort.py +214 -0
  16. cuda/cccl/cooperative/experimental/block/_block_reduce.py +294 -0
  17. cuda/cccl/cooperative/experimental/block/_block_scan.py +983 -0
  18. cuda/cccl/cooperative/experimental/warp/__init__.py +9 -0
  19. cuda/cccl/cooperative/experimental/warp/_warp_merge_sort.py +92 -0
  20. cuda/cccl/cooperative/experimental/warp/_warp_reduce.py +153 -0
  21. cuda/cccl/cooperative/experimental/warp/_warp_scan.py +78 -0
  22. cuda/cccl/headers/__init__.py +7 -0
  23. cuda/cccl/headers/include/__init__.py +1 -0
  24. cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +262 -0
  25. cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +1185 -0
  26. cuda/cccl/headers/include/cub/agent/agent_for.cuh +84 -0
  27. cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +927 -0
  28. cuda/cccl/headers/include/cub/agent/agent_merge.cuh +232 -0
  29. cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +730 -0
  30. cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +766 -0
  31. cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +289 -0
  32. cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +706 -0
  33. cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +558 -0
  34. cuda/cccl/headers/include/cub/agent/agent_reduce.cuh +619 -0
  35. cuda/cccl/headers/include/cub/agent/agent_reduce_by_key.cuh +806 -0
  36. cuda/cccl/headers/include/cub/agent/agent_rle.cuh +1127 -0
  37. cuda/cccl/headers/include/cub/agent/agent_scan.cuh +585 -0
  38. cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +477 -0
  39. cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +292 -0
  40. cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +1120 -0
  41. cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +341 -0
  42. cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +609 -0
  43. cuda/cccl/headers/include/cub/agent/agent_topk.cuh +764 -0
  44. cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +614 -0
  45. cuda/cccl/headers/include/cub/agent/single_pass_scan_operators.cuh +1424 -0
  46. cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +965 -0
  47. cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +1217 -0
  48. cuda/cccl/headers/include/cub/block/block_exchange.cuh +1308 -0
  49. cuda/cccl/headers/include/cub/block/block_histogram.cuh +420 -0
  50. cuda/cccl/headers/include/cub/block/block_load.cuh +1260 -0
  51. cuda/cccl/headers/include/cub/block/block_merge_sort.cuh +800 -0
  52. cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +1220 -0
  53. cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +2194 -0
  54. cuda/cccl/headers/include/cub/block/block_raking_layout.cuh +150 -0
  55. cuda/cccl/headers/include/cub/block/block_reduce.cuh +666 -0
  56. cuda/cccl/headers/include/cub/block/block_run_length_decode.cuh +434 -0
  57. cuda/cccl/headers/include/cub/block/block_scan.cuh +2584 -0
  58. cuda/cccl/headers/include/cub/block/block_shuffle.cuh +346 -0
  59. cuda/cccl/headers/include/cub/block/block_store.cuh +1246 -0
  60. cuda/cccl/headers/include/cub/block/radix_rank_sort_operations.cuh +624 -0
  61. cuda/cccl/headers/include/cub/block/specializations/block_histogram_atomic.cuh +86 -0
  62. cuda/cccl/headers/include/cub/block/specializations/block_histogram_sort.cuh +240 -0
  63. cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking.cuh +252 -0
  64. cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking_commutative_only.cuh +238 -0
  65. cuda/cccl/headers/include/cub/block/specializations/block_reduce_warp_reductions.cuh +281 -0
  66. cuda/cccl/headers/include/cub/block/specializations/block_scan_raking.cuh +790 -0
  67. cuda/cccl/headers/include/cub/block/specializations/block_scan_warp_scans.cuh +538 -0
  68. cuda/cccl/headers/include/cub/config.cuh +53 -0
  69. cuda/cccl/headers/include/cub/cub.cuh +120 -0
  70. cuda/cccl/headers/include/cub/detail/array_utils.cuh +78 -0
  71. cuda/cccl/headers/include/cub/detail/choose_offset.cuh +161 -0
  72. cuda/cccl/headers/include/cub/detail/detect_cuda_runtime.cuh +74 -0
  73. cuda/cccl/headers/include/cub/detail/device_double_buffer.cuh +96 -0
  74. cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +61 -0
  75. cuda/cccl/headers/include/cub/detail/fast_modulo_division.cuh +253 -0
  76. cuda/cccl/headers/include/cub/detail/integer_utils.cuh +88 -0
  77. cuda/cccl/headers/include/cub/detail/launcher/cuda_driver.cuh +142 -0
  78. cuda/cccl/headers/include/cub/detail/launcher/cuda_runtime.cuh +100 -0
  79. cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +118 -0
  80. cuda/cccl/headers/include/cub/detail/ptx-json/README.md +71 -0
  81. cuda/cccl/headers/include/cub/detail/ptx-json/array.h +68 -0
  82. cuda/cccl/headers/include/cub/detail/ptx-json/json.h +62 -0
  83. cuda/cccl/headers/include/cub/detail/ptx-json/object.h +100 -0
  84. cuda/cccl/headers/include/cub/detail/ptx-json/string.h +53 -0
  85. cuda/cccl/headers/include/cub/detail/ptx-json/value.h +95 -0
  86. cuda/cccl/headers/include/cub/detail/ptx-json-parser.h +63 -0
  87. cuda/cccl/headers/include/cub/detail/rfa.cuh +731 -0
  88. cuda/cccl/headers/include/cub/detail/strong_load.cuh +189 -0
  89. cuda/cccl/headers/include/cub/detail/strong_store.cuh +220 -0
  90. cuda/cccl/headers/include/cub/detail/temporary_storage.cuh +384 -0
  91. cuda/cccl/headers/include/cub/detail/type_traits.cuh +187 -0
  92. cuda/cccl/headers/include/cub/detail/uninitialized_copy.cuh +73 -0
  93. cuda/cccl/headers/include/cub/detail/unsafe_bitcast.cuh +56 -0
  94. cuda/cccl/headers/include/cub/device/device_adjacent_difference.cuh +596 -0
  95. cuda/cccl/headers/include/cub/device/device_copy.cuh +187 -0
  96. cuda/cccl/headers/include/cub/device/device_for.cuh +985 -0
  97. cuda/cccl/headers/include/cub/device/device_histogram.cuh +1509 -0
  98. cuda/cccl/headers/include/cub/device/device_memcpy.cuh +195 -0
  99. cuda/cccl/headers/include/cub/device/device_merge.cuh +203 -0
  100. cuda/cccl/headers/include/cub/device/device_merge_sort.cuh +979 -0
  101. cuda/cccl/headers/include/cub/device/device_partition.cuh +664 -0
  102. cuda/cccl/headers/include/cub/device/device_radix_sort.cuh +3437 -0
  103. cuda/cccl/headers/include/cub/device/device_reduce.cuh +2519 -0
  104. cuda/cccl/headers/include/cub/device/device_run_length_encode.cuh +370 -0
  105. cuda/cccl/headers/include/cub/device/device_scan.cuh +2205 -0
  106. cuda/cccl/headers/include/cub/device/device_segmented_radix_sort.cuh +1496 -0
  107. cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +1520 -0
  108. cuda/cccl/headers/include/cub/device/device_segmented_sort.cuh +2811 -0
  109. cuda/cccl/headers/include/cub/device/device_select.cuh +1228 -0
  110. cuda/cccl/headers/include/cub/device/device_topk.cuh +511 -0
  111. cuda/cccl/headers/include/cub/device/device_transform.cuh +637 -0
  112. cuda/cccl/headers/include/cub/device/dispatch/dispatch_adjacent_difference.cuh +315 -0
  113. cuda/cccl/headers/include/cub/device/dispatch/dispatch_advance_iterators.cuh +111 -0
  114. cuda/cccl/headers/include/cub/device/dispatch/dispatch_batch_memcpy.cuh +719 -0
  115. cuda/cccl/headers/include/cub/device/dispatch/dispatch_common.cuh +43 -0
  116. cuda/cccl/headers/include/cub/device/dispatch/dispatch_for.cuh +198 -0
  117. cuda/cccl/headers/include/cub/device/dispatch/dispatch_histogram.cuh +1046 -0
  118. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +304 -0
  119. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge_sort.cuh +474 -0
  120. cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +1753 -0
  121. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +1327 -0
  122. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_by_key.cuh +655 -0
  123. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +536 -0
  124. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +314 -0
  125. cuda/cccl/headers/include/cub/device/dispatch/dispatch_rle.cuh +615 -0
  126. cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan.cuh +500 -0
  127. cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan_by_key.cuh +602 -0
  128. cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +917 -0
  129. cuda/cccl/headers/include/cub/device/dispatch/dispatch_select_if.cuh +842 -0
  130. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +342 -0
  131. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +441 -0
  132. cuda/cccl/headers/include/cub/device/dispatch/dispatch_three_way_partition.cuh +389 -0
  133. cuda/cccl/headers/include/cub/device/dispatch/dispatch_topk.cuh +629 -0
  134. cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +561 -0
  135. cuda/cccl/headers/include/cub/device/dispatch/dispatch_unique_by_key.cuh +545 -0
  136. cuda/cccl/headers/include/cub/device/dispatch/kernels/for_each.cuh +226 -0
  137. cuda/cccl/headers/include/cub/device/dispatch/kernels/histogram.cuh +505 -0
  138. cuda/cccl/headers/include/cub/device/dispatch/kernels/merge_sort.cuh +334 -0
  139. cuda/cccl/headers/include/cub/device/dispatch/kernels/radix_sort.cuh +803 -0
  140. cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +578 -0
  141. cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +192 -0
  142. cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +324 -0
  143. cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_sort.cuh +475 -0
  144. cuda/cccl/headers/include/cub/device/dispatch/kernels/three_way_partition.cuh +201 -0
  145. cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +1009 -0
  146. cuda/cccl/headers/include/cub/device/dispatch/kernels/unique_by_key.cuh +176 -0
  147. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +70 -0
  148. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +121 -0
  149. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +63 -0
  150. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +278 -0
  151. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +79 -0
  152. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +118 -0
  153. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +1068 -0
  154. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce.cuh +493 -0
  155. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +945 -0
  156. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +676 -0
  157. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +621 -0
  158. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +1013 -0
  159. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +249 -0
  160. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_select_if.cuh +1588 -0
  161. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +443 -0
  162. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_topk.cuh +85 -0
  163. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +454 -0
  164. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +874 -0
  165. cuda/cccl/headers/include/cub/grid/grid_even_share.cuh +227 -0
  166. cuda/cccl/headers/include/cub/grid/grid_mapping.cuh +106 -0
  167. cuda/cccl/headers/include/cub/grid/grid_queue.cuh +202 -0
  168. cuda/cccl/headers/include/cub/iterator/arg_index_input_iterator.cuh +254 -0
  169. cuda/cccl/headers/include/cub/iterator/cache_modified_input_iterator.cuh +259 -0
  170. cuda/cccl/headers/include/cub/iterator/cache_modified_output_iterator.cuh +250 -0
  171. cuda/cccl/headers/include/cub/iterator/tex_obj_input_iterator.cuh +320 -0
  172. cuda/cccl/headers/include/cub/thread/thread_load.cuh +349 -0
  173. cuda/cccl/headers/include/cub/thread/thread_operators.cuh +688 -0
  174. cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +541 -0
  175. cuda/cccl/headers/include/cub/thread/thread_scan.cuh +498 -0
  176. cuda/cccl/headers/include/cub/thread/thread_search.cuh +199 -0
  177. cuda/cccl/headers/include/cub/thread/thread_simd.cuh +458 -0
  178. cuda/cccl/headers/include/cub/thread/thread_sort.cuh +102 -0
  179. cuda/cccl/headers/include/cub/thread/thread_store.cuh +365 -0
  180. cuda/cccl/headers/include/cub/util_allocator.cuh +921 -0
  181. cuda/cccl/headers/include/cub/util_arch.cuh +167 -0
  182. cuda/cccl/headers/include/cub/util_cpp_dialect.cuh +95 -0
  183. cuda/cccl/headers/include/cub/util_debug.cuh +207 -0
  184. cuda/cccl/headers/include/cub/util_device.cuh +784 -0
  185. cuda/cccl/headers/include/cub/util_macro.cuh +97 -0
  186. cuda/cccl/headers/include/cub/util_math.cuh +118 -0
  187. cuda/cccl/headers/include/cub/util_namespace.cuh +176 -0
  188. cuda/cccl/headers/include/cub/util_policy_wrapper_t.cuh +55 -0
  189. cuda/cccl/headers/include/cub/util_ptx.cuh +513 -0
  190. cuda/cccl/headers/include/cub/util_temporary_storage.cuh +122 -0
  191. cuda/cccl/headers/include/cub/util_type.cuh +1120 -0
  192. cuda/cccl/headers/include/cub/util_vsmem.cuh +253 -0
  193. cuda/cccl/headers/include/cub/version.cuh +89 -0
  194. cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_shfl.cuh +329 -0
  195. cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_smem.cuh +177 -0
  196. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +736 -0
  197. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +407 -0
  198. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +952 -0
  199. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_smem.cuh +715 -0
  200. cuda/cccl/headers/include/cub/warp/warp_exchange.cuh +405 -0
  201. cuda/cccl/headers/include/cub/warp/warp_load.cuh +614 -0
  202. cuda/cccl/headers/include/cub/warp/warp_merge_sort.cuh +169 -0
  203. cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +824 -0
  204. cuda/cccl/headers/include/cub/warp/warp_scan.cuh +1886 -0
  205. cuda/cccl/headers/include/cub/warp/warp_store.cuh +520 -0
  206. cuda/cccl/headers/include/cub/warp/warp_utils.cuh +61 -0
  207. cuda/cccl/headers/include/cuda/__algorithm/common.h +68 -0
  208. cuda/cccl/headers/include/cuda/__algorithm/copy.h +196 -0
  209. cuda/cccl/headers/include/cuda/__algorithm/fill.h +107 -0
  210. cuda/cccl/headers/include/cuda/__annotated_ptr/access_property.h +165 -0
  211. cuda/cccl/headers/include/cuda/__annotated_ptr/access_property_encoding.h +172 -0
  212. cuda/cccl/headers/include/cuda/__annotated_ptr/annotated_ptr.h +217 -0
  213. cuda/cccl/headers/include/cuda/__annotated_ptr/annotated_ptr_base.h +100 -0
  214. cuda/cccl/headers/include/cuda/__annotated_ptr/apply_access_property.h +83 -0
  215. cuda/cccl/headers/include/cuda/__annotated_ptr/associate_access_property.h +128 -0
  216. cuda/cccl/headers/include/cuda/__annotated_ptr/createpolicy.h +210 -0
  217. cuda/cccl/headers/include/cuda/__atomic/atomic.h +145 -0
  218. cuda/cccl/headers/include/cuda/__barrier/async_contract_fulfillment.h +39 -0
  219. cuda/cccl/headers/include/cuda/__barrier/barrier.h +65 -0
  220. cuda/cccl/headers/include/cuda/__barrier/barrier_arrive_tx.h +102 -0
  221. cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +468 -0
  222. cuda/cccl/headers/include/cuda/__barrier/barrier_expect_tx.h +74 -0
  223. cuda/cccl/headers/include/cuda/__barrier/barrier_native_handle.h +45 -0
  224. cuda/cccl/headers/include/cuda/__barrier/barrier_thread_scope.h +60 -0
  225. cuda/cccl/headers/include/cuda/__bit/bit_reverse.h +171 -0
  226. cuda/cccl/headers/include/cuda/__bit/bitfield.h +122 -0
  227. cuda/cccl/headers/include/cuda/__bit/bitmask.h +90 -0
  228. cuda/cccl/headers/include/cuda/__cccl_config +36 -0
  229. cuda/cccl/headers/include/cuda/__cmath/ceil_div.h +124 -0
  230. cuda/cccl/headers/include/cuda/__cmath/fast_modulo_division.h +249 -0
  231. cuda/cccl/headers/include/cuda/__cmath/ilog.h +195 -0
  232. cuda/cccl/headers/include/cuda/__cmath/ipow.h +107 -0
  233. cuda/cccl/headers/include/cuda/__cmath/isqrt.h +80 -0
  234. cuda/cccl/headers/include/cuda/__cmath/neg.h +47 -0
  235. cuda/cccl/headers/include/cuda/__cmath/pow2.h +74 -0
  236. cuda/cccl/headers/include/cuda/__cmath/round_down.h +102 -0
  237. cuda/cccl/headers/include/cuda/__cmath/round_up.h +104 -0
  238. cuda/cccl/headers/include/cuda/__cmath/uabs.h +57 -0
  239. cuda/cccl/headers/include/cuda/__complex/complex.h +238 -0
  240. cuda/cccl/headers/include/cuda/__complex/get_real_imag.h +93 -0
  241. cuda/cccl/headers/include/cuda/__complex/traits.h +64 -0
  242. cuda/cccl/headers/include/cuda/__complex_ +28 -0
  243. cuda/cccl/headers/include/cuda/__device/all_devices.h +240 -0
  244. cuda/cccl/headers/include/cuda/__device/arch_traits.h +613 -0
  245. cuda/cccl/headers/include/cuda/__device/attributes.h +721 -0
  246. cuda/cccl/headers/include/cuda/__device/device_ref.h +185 -0
  247. cuda/cccl/headers/include/cuda/__device/physical_device.h +168 -0
  248. cuda/cccl/headers/include/cuda/__driver/driver_api.h +541 -0
  249. cuda/cccl/headers/include/cuda/__event/event.h +171 -0
  250. cuda/cccl/headers/include/cuda/__event/event_ref.h +158 -0
  251. cuda/cccl/headers/include/cuda/__event/timed_event.h +118 -0
  252. cuda/cccl/headers/include/cuda/__execution/determinism.h +91 -0
  253. cuda/cccl/headers/include/cuda/__execution/output_ordering.h +89 -0
  254. cuda/cccl/headers/include/cuda/__execution/require.h +75 -0
  255. cuda/cccl/headers/include/cuda/__execution/tune.h +70 -0
  256. cuda/cccl/headers/include/cuda/__functional/address_stability.h +131 -0
  257. cuda/cccl/headers/include/cuda/__functional/for_each_canceled.h +321 -0
  258. cuda/cccl/headers/include/cuda/__functional/maximum.h +58 -0
  259. cuda/cccl/headers/include/cuda/__functional/minimum.h +58 -0
  260. cuda/cccl/headers/include/cuda/__functional/proclaim_return_type.h +108 -0
  261. cuda/cccl/headers/include/cuda/__fwd/barrier.h +38 -0
  262. cuda/cccl/headers/include/cuda/__fwd/barrier_native_handle.h +42 -0
  263. cuda/cccl/headers/include/cuda/__fwd/complex.h +48 -0
  264. cuda/cccl/headers/include/cuda/__fwd/get_stream.h +38 -0
  265. cuda/cccl/headers/include/cuda/__fwd/pipeline.h +37 -0
  266. cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +49 -0
  267. cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +300 -0
  268. cuda/cccl/headers/include/cuda/__iterator/counting_iterator.h +483 -0
  269. cuda/cccl/headers/include/cuda/__iterator/discard_iterator.h +324 -0
  270. cuda/cccl/headers/include/cuda/__iterator/permutation_iterator.h +456 -0
  271. cuda/cccl/headers/include/cuda/__iterator/shuffle_iterator.h +334 -0
  272. cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +386 -0
  273. cuda/cccl/headers/include/cuda/__iterator/tabulate_output_iterator.h +344 -0
  274. cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +498 -0
  275. cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +501 -0
  276. cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +461 -0
  277. cuda/cccl/headers/include/cuda/__iterator/zip_function.h +112 -0
  278. cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +673 -0
  279. cuda/cccl/headers/include/cuda/__latch/latch.h +44 -0
  280. cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +462 -0
  281. cuda/cccl/headers/include/cuda/__mdspan/host_device_mdspan.h +63 -0
  282. cuda/cccl/headers/include/cuda/__mdspan/restrict_accessor.h +122 -0
  283. cuda/cccl/headers/include/cuda/__mdspan/restrict_mdspan.h +51 -0
  284. cuda/cccl/headers/include/cuda/__memcpy_async/check_preconditions.h +79 -0
  285. cuda/cccl/headers/include/cuda/__memcpy_async/completion_mechanism.h +47 -0
  286. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_bulk_shared_global.h +60 -0
  287. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_fallback.h +72 -0
  288. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_shared_global.h +148 -0
  289. cuda/cccl/headers/include/cuda/__memcpy_async/dispatch_memcpy_async.h +165 -0
  290. cuda/cccl/headers/include/cuda/__memcpy_async/is_local_smem_barrier.h +53 -0
  291. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async.h +179 -0
  292. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_barrier.h +99 -0
  293. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_tx.h +104 -0
  294. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_completion.h +170 -0
  295. cuda/cccl/headers/include/cuda/__memcpy_async/try_get_barrier_handle.h +59 -0
  296. cuda/cccl/headers/include/cuda/__memory/address_space.h +227 -0
  297. cuda/cccl/headers/include/cuda/__memory/align_down.h +56 -0
  298. cuda/cccl/headers/include/cuda/__memory/align_up.h +56 -0
  299. cuda/cccl/headers/include/cuda/__memory/aligned_size.h +61 -0
  300. cuda/cccl/headers/include/cuda/__memory/check_address.h +111 -0
  301. cuda/cccl/headers/include/cuda/__memory/discard_memory.h +64 -0
  302. cuda/cccl/headers/include/cuda/__memory/get_device_address.h +58 -0
  303. cuda/cccl/headers/include/cuda/__memory/is_aligned.h +47 -0
  304. cuda/cccl/headers/include/cuda/__memory/ptr_rebind.h +75 -0
  305. cuda/cccl/headers/include/cuda/__memory_resource/get_memory_resource.h +82 -0
  306. cuda/cccl/headers/include/cuda/__memory_resource/get_property.h +153 -0
  307. cuda/cccl/headers/include/cuda/__memory_resource/properties.h +69 -0
  308. cuda/cccl/headers/include/cuda/__memory_resource/resource.h +125 -0
  309. cuda/cccl/headers/include/cuda/__memory_resource/resource_ref.h +654 -0
  310. cuda/cccl/headers/include/cuda/__numeric/add_overflow.h +306 -0
  311. cuda/cccl/headers/include/cuda/__numeric/narrow.h +108 -0
  312. cuda/cccl/headers/include/cuda/__numeric/overflow_cast.h +59 -0
  313. cuda/cccl/headers/include/cuda/__numeric/overflow_result.h +43 -0
  314. cuda/cccl/headers/include/cuda/__nvtx/nvtx.h +120 -0
  315. cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +2982 -0
  316. cuda/cccl/headers/include/cuda/__ptx/instructions/barrier_cluster.h +43 -0
  317. cuda/cccl/headers/include/cuda/__ptx/instructions/bfind.h +41 -0
  318. cuda/cccl/headers/include/cuda/__ptx/instructions/bmsk.h +41 -0
  319. cuda/cccl/headers/include/cuda/__ptx/instructions/clusterlaunchcontrol.h +41 -0
  320. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk.h +44 -0
  321. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_commit_group.h +43 -0
  322. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h +45 -0
  323. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_wait_group.h +43 -0
  324. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_mbarrier_arrive.h +42 -0
  325. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_reduce_async_bulk.h +60 -0
  326. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_reduce_async_bulk_tensor.h +43 -0
  327. cuda/cccl/headers/include/cuda/__ptx/instructions/elect_sync.h +41 -0
  328. cuda/cccl/headers/include/cuda/__ptx/instructions/exit.h +41 -0
  329. cuda/cccl/headers/include/cuda/__ptx/instructions/fence.h +49 -0
  330. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/barrier_cluster.h +115 -0
  331. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/bfind.h +190 -0
  332. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/bmsk.h +54 -0
  333. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/clusterlaunchcontrol.h +242 -0
  334. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk.h +197 -0
  335. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.h +25 -0
  336. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.h +54 -0
  337. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.h +997 -0
  338. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_gather_scatter.h +318 -0
  339. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.h +671 -0
  340. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.h +46 -0
  341. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive.h +26 -0
  342. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive_noinc.h +26 -0
  343. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.h +1470 -0
  344. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.h +132 -0
  345. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.h +132 -0
  346. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.h +601 -0
  347. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/elect_sync.h +36 -0
  348. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/exit.h +25 -0
  349. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence.h +208 -0
  350. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.h +31 -0
  351. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_alias.h +25 -0
  352. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_async.h +58 -0
  353. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_async_generic_sync_restrict.h +64 -0
  354. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.h +102 -0
  355. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_sync_restrict.h +64 -0
  356. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/get_sreg.h +949 -0
  357. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/getctarank.h +32 -0
  358. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/ld.h +5542 -0
  359. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive.h +399 -0
  360. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.h +184 -0
  361. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.h +34 -0
  362. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_expect_tx.h +102 -0
  363. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_init.h +27 -0
  364. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.h +143 -0
  365. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.h +144 -0
  366. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.h +286 -0
  367. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.h +290 -0
  368. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_ld_reduce.h +2202 -0
  369. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_red.h +1362 -0
  370. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_st.h +236 -0
  371. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/prmt.h +230 -0
  372. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/red_async.h +460 -0
  373. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/shl.h +96 -0
  374. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/shr.h +168 -0
  375. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st.h +1490 -0
  376. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st_async.h +123 -0
  377. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st_bulk.h +31 -0
  378. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_alloc.h +132 -0
  379. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_commit.h +99 -0
  380. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_cp.h +765 -0
  381. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_fence.h +58 -0
  382. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_ld.h +4927 -0
  383. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_mma.h +4291 -0
  384. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_mma_ws.h +7110 -0
  385. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_shift.h +42 -0
  386. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_st.h +5063 -0
  387. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_wait.h +56 -0
  388. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.h +71 -0
  389. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tensormap_replace.h +1030 -0
  390. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/trap.h +25 -0
  391. cuda/cccl/headers/include/cuda/__ptx/instructions/get_sreg.h +43 -0
  392. cuda/cccl/headers/include/cuda/__ptx/instructions/getctarank.h +43 -0
  393. cuda/cccl/headers/include/cuda/__ptx/instructions/ld.h +41 -0
  394. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_arrive.h +45 -0
  395. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_expect_tx.h +41 -0
  396. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_init.h +43 -0
  397. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_wait.h +46 -0
  398. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_ld_reduce.h +41 -0
  399. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_red.h +41 -0
  400. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_st.h +41 -0
  401. cuda/cccl/headers/include/cuda/__ptx/instructions/prmt.h +41 -0
  402. cuda/cccl/headers/include/cuda/__ptx/instructions/red_async.h +43 -0
  403. cuda/cccl/headers/include/cuda/__ptx/instructions/shfl_sync.h +244 -0
  404. cuda/cccl/headers/include/cuda/__ptx/instructions/shl.h +41 -0
  405. cuda/cccl/headers/include/cuda/__ptx/instructions/shr.h +41 -0
  406. cuda/cccl/headers/include/cuda/__ptx/instructions/st.h +41 -0
  407. cuda/cccl/headers/include/cuda/__ptx/instructions/st_async.h +43 -0
  408. cuda/cccl/headers/include/cuda/__ptx/instructions/st_bulk.h +41 -0
  409. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_alloc.h +41 -0
  410. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_commit.h +41 -0
  411. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_cp.h +41 -0
  412. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_fence.h +41 -0
  413. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_ld.h +41 -0
  414. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_mma.h +41 -0
  415. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_mma_ws.h +41 -0
  416. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_shift.h +41 -0
  417. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_st.h +41 -0
  418. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_wait.h +41 -0
  419. cuda/cccl/headers/include/cuda/__ptx/instructions/tensormap_cp_fenceproxy.h +43 -0
  420. cuda/cccl/headers/include/cuda/__ptx/instructions/tensormap_replace.h +43 -0
  421. cuda/cccl/headers/include/cuda/__ptx/instructions/trap.h +41 -0
  422. cuda/cccl/headers/include/cuda/__ptx/ptx_dot_variants.h +230 -0
  423. cuda/cccl/headers/include/cuda/__ptx/ptx_helper_functions.h +176 -0
  424. cuda/cccl/headers/include/cuda/__random/feistel_bijection.h +105 -0
  425. cuda/cccl/headers/include/cuda/__random/random_bijection.h +88 -0
  426. cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +97 -0
  427. cuda/cccl/headers/include/cuda/__runtime/types.h +41 -0
  428. cuda/cccl/headers/include/cuda/__semaphore/counting_semaphore.h +53 -0
  429. cuda/cccl/headers/include/cuda/__stream/get_stream.h +110 -0
  430. cuda/cccl/headers/include/cuda/__stream/stream.h +142 -0
  431. cuda/cccl/headers/include/cuda/__stream/stream_ref.h +296 -0
  432. cuda/cccl/headers/include/cuda/__type_traits/is_floating_point.h +47 -0
  433. cuda/cccl/headers/include/cuda/__type_traits/is_specialization_of.h +37 -0
  434. cuda/cccl/headers/include/cuda/__utility/__basic_any/access.h +88 -0
  435. cuda/cccl/headers/include/cuda/__utility/__basic_any/any_cast.h +83 -0
  436. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_base.h +148 -0
  437. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_from.h +96 -0
  438. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_fwd.h +128 -0
  439. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_ptr.h +304 -0
  440. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_ref.h +337 -0
  441. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_value.h +590 -0
  442. cuda/cccl/headers/include/cuda/__utility/__basic_any/conversions.h +169 -0
  443. cuda/cccl/headers/include/cuda/__utility/__basic_any/dynamic_any_cast.h +107 -0
  444. cuda/cccl/headers/include/cuda/__utility/__basic_any/interfaces.h +359 -0
  445. cuda/cccl/headers/include/cuda/__utility/__basic_any/iset.h +142 -0
  446. cuda/cccl/headers/include/cuda/__utility/__basic_any/overrides.h +64 -0
  447. cuda/cccl/headers/include/cuda/__utility/__basic_any/rtti.h +257 -0
  448. cuda/cccl/headers/include/cuda/__utility/__basic_any/semiregular.h +322 -0
  449. cuda/cccl/headers/include/cuda/__utility/__basic_any/storage.h +79 -0
  450. cuda/cccl/headers/include/cuda/__utility/__basic_any/tagged_ptr.h +58 -0
  451. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtcall.h +162 -0
  452. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_functions.h +184 -0
  453. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_ptrs.h +80 -0
  454. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_tables.h +155 -0
  455. cuda/cccl/headers/include/cuda/__utility/basic_any.h +507 -0
  456. cuda/cccl/headers/include/cuda/__utility/immovable.h +50 -0
  457. cuda/cccl/headers/include/cuda/__utility/inherit.h +36 -0
  458. cuda/cccl/headers/include/cuda/__utility/no_init.h +29 -0
  459. cuda/cccl/headers/include/cuda/__utility/static_for.h +79 -0
  460. cuda/cccl/headers/include/cuda/__warp/lane_mask.h +326 -0
  461. cuda/cccl/headers/include/cuda/__warp/warp_match_all.h +65 -0
  462. cuda/cccl/headers/include/cuda/__warp/warp_shuffle.h +251 -0
  463. cuda/cccl/headers/include/cuda/access_property +26 -0
  464. cuda/cccl/headers/include/cuda/algorithm +27 -0
  465. cuda/cccl/headers/include/cuda/annotated_ptr +29 -0
  466. cuda/cccl/headers/include/cuda/atomic +27 -0
  467. cuda/cccl/headers/include/cuda/barrier +267 -0
  468. cuda/cccl/headers/include/cuda/bit +29 -0
  469. cuda/cccl/headers/include/cuda/cmath +36 -0
  470. cuda/cccl/headers/include/cuda/devices +20 -0
  471. cuda/cccl/headers/include/cuda/discard_memory +32 -0
  472. cuda/cccl/headers/include/cuda/functional +32 -0
  473. cuda/cccl/headers/include/cuda/iterator +38 -0
  474. cuda/cccl/headers/include/cuda/latch +27 -0
  475. cuda/cccl/headers/include/cuda/mdspan +28 -0
  476. cuda/cccl/headers/include/cuda/memory +34 -0
  477. cuda/cccl/headers/include/cuda/memory_resource +35 -0
  478. cuda/cccl/headers/include/cuda/numeric +29 -0
  479. cuda/cccl/headers/include/cuda/pipeline +579 -0
  480. cuda/cccl/headers/include/cuda/ptx +128 -0
  481. cuda/cccl/headers/include/cuda/semaphore +31 -0
  482. cuda/cccl/headers/include/cuda/std/__algorithm/adjacent_find.h +59 -0
  483. cuda/cccl/headers/include/cuda/std/__algorithm/all_of.h +45 -0
  484. cuda/cccl/headers/include/cuda/std/__algorithm/any_of.h +45 -0
  485. cuda/cccl/headers/include/cuda/std/__algorithm/binary_search.h +53 -0
  486. cuda/cccl/headers/include/cuda/std/__algorithm/clamp.h +48 -0
  487. cuda/cccl/headers/include/cuda/std/__algorithm/comp.h +58 -0
  488. cuda/cccl/headers/include/cuda/std/__algorithm/comp_ref_type.h +85 -0
  489. cuda/cccl/headers/include/cuda/std/__algorithm/copy.h +142 -0
  490. cuda/cccl/headers/include/cuda/std/__algorithm/copy_backward.h +80 -0
  491. cuda/cccl/headers/include/cuda/std/__algorithm/copy_if.h +47 -0
  492. cuda/cccl/headers/include/cuda/std/__algorithm/copy_n.h +73 -0
  493. cuda/cccl/headers/include/cuda/std/__algorithm/count.h +49 -0
  494. cuda/cccl/headers/include/cuda/std/__algorithm/count_if.h +49 -0
  495. cuda/cccl/headers/include/cuda/std/__algorithm/equal.h +128 -0
  496. cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +101 -0
  497. cuda/cccl/headers/include/cuda/std/__algorithm/fill.h +58 -0
  498. cuda/cccl/headers/include/cuda/std/__algorithm/fill_n.h +51 -0
  499. cuda/cccl/headers/include/cuda/std/__algorithm/find.h +62 -0
  500. cuda/cccl/headers/include/cuda/std/__algorithm/find_end.h +225 -0
  501. cuda/cccl/headers/include/cuda/std/__algorithm/find_first_of.h +73 -0
  502. cuda/cccl/headers/include/cuda/std/__algorithm/find_if.h +46 -0
  503. cuda/cccl/headers/include/cuda/std/__algorithm/find_if_not.h +46 -0
  504. cuda/cccl/headers/include/cuda/std/__algorithm/for_each.h +42 -0
  505. cuda/cccl/headers/include/cuda/std/__algorithm/for_each_n.h +48 -0
  506. cuda/cccl/headers/include/cuda/std/__algorithm/generate.h +41 -0
  507. cuda/cccl/headers/include/cuda/std/__algorithm/generate_n.h +46 -0
  508. cuda/cccl/headers/include/cuda/std/__algorithm/half_positive.h +49 -0
  509. cuda/cccl/headers/include/cuda/std/__algorithm/in_fun_result.h +55 -0
  510. cuda/cccl/headers/include/cuda/std/__algorithm/includes.h +92 -0
  511. cuda/cccl/headers/include/cuda/std/__algorithm/is_heap.h +50 -0
  512. cuda/cccl/headers/include/cuda/std/__algorithm/is_heap_until.h +83 -0
  513. cuda/cccl/headers/include/cuda/std/__algorithm/is_partitioned.h +57 -0
  514. cuda/cccl/headers/include/cuda/std/__algorithm/is_permutation.h +252 -0
  515. cuda/cccl/headers/include/cuda/std/__algorithm/is_sorted.h +49 -0
  516. cuda/cccl/headers/include/cuda/std/__algorithm/is_sorted_until.h +68 -0
  517. cuda/cccl/headers/include/cuda/std/__algorithm/iter_swap.h +82 -0
  518. cuda/cccl/headers/include/cuda/std/__algorithm/iterator_operations.h +185 -0
  519. cuda/cccl/headers/include/cuda/std/__algorithm/lexicographical_compare.h +68 -0
  520. cuda/cccl/headers/include/cuda/std/__algorithm/lower_bound.h +82 -0
  521. cuda/cccl/headers/include/cuda/std/__algorithm/make_heap.h +70 -0
  522. cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +96 -0
  523. cuda/cccl/headers/include/cuda/std/__algorithm/max.h +62 -0
  524. cuda/cccl/headers/include/cuda/std/__algorithm/max_element.h +67 -0
  525. cuda/cccl/headers/include/cuda/std/__algorithm/merge.h +89 -0
  526. cuda/cccl/headers/include/cuda/std/__algorithm/min.h +62 -0
  527. cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +87 -0
  528. cuda/cccl/headers/include/cuda/std/__algorithm/minmax.h +66 -0
  529. cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +140 -0
  530. cuda/cccl/headers/include/cuda/std/__algorithm/mismatch.h +83 -0
  531. cuda/cccl/headers/include/cuda/std/__algorithm/move.h +86 -0
  532. cuda/cccl/headers/include/cuda/std/__algorithm/move_backward.h +84 -0
  533. cuda/cccl/headers/include/cuda/std/__algorithm/next_permutation.h +88 -0
  534. cuda/cccl/headers/include/cuda/std/__algorithm/none_of.h +45 -0
  535. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort.h +102 -0
  536. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort_copy.h +122 -0
  537. cuda/cccl/headers/include/cuda/std/__algorithm/partition.h +120 -0
  538. cuda/cccl/headers/include/cuda/std/__algorithm/partition_copy.h +59 -0
  539. cuda/cccl/headers/include/cuda/std/__algorithm/partition_point.h +61 -0
  540. cuda/cccl/headers/include/cuda/std/__algorithm/pop_heap.h +93 -0
  541. cuda/cccl/headers/include/cuda/std/__algorithm/prev_permutation.h +88 -0
  542. cuda/cccl/headers/include/cuda/std/__algorithm/push_heap.h +100 -0
  543. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_for_each.h +84 -0
  544. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_for_each_n.h +68 -0
  545. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_iterator_concept.h +65 -0
  546. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_min.h +98 -0
  547. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_min_element.h +68 -0
  548. cuda/cccl/headers/include/cuda/std/__algorithm/remove.h +55 -0
  549. cuda/cccl/headers/include/cuda/std/__algorithm/remove_copy.h +47 -0
  550. cuda/cccl/headers/include/cuda/std/__algorithm/remove_copy_if.h +47 -0
  551. cuda/cccl/headers/include/cuda/std/__algorithm/remove_if.h +56 -0
  552. cuda/cccl/headers/include/cuda/std/__algorithm/replace.h +45 -0
  553. cuda/cccl/headers/include/cuda/std/__algorithm/replace_copy.h +54 -0
  554. cuda/cccl/headers/include/cuda/std/__algorithm/replace_copy_if.h +50 -0
  555. cuda/cccl/headers/include/cuda/std/__algorithm/replace_if.h +45 -0
  556. cuda/cccl/headers/include/cuda/std/__algorithm/reverse.h +81 -0
  557. cuda/cccl/headers/include/cuda/std/__algorithm/reverse_copy.h +43 -0
  558. cuda/cccl/headers/include/cuda/std/__algorithm/rotate.h +261 -0
  559. cuda/cccl/headers/include/cuda/std/__algorithm/rotate_copy.h +40 -0
  560. cuda/cccl/headers/include/cuda/std/__algorithm/search.h +185 -0
  561. cuda/cccl/headers/include/cuda/std/__algorithm/search_n.h +163 -0
  562. cuda/cccl/headers/include/cuda/std/__algorithm/set_difference.h +95 -0
  563. cuda/cccl/headers/include/cuda/std/__algorithm/set_intersection.h +122 -0
  564. cuda/cccl/headers/include/cuda/std/__algorithm/set_symmetric_difference.h +134 -0
  565. cuda/cccl/headers/include/cuda/std/__algorithm/set_union.h +128 -0
  566. cuda/cccl/headers/include/cuda/std/__algorithm/shift_left.h +84 -0
  567. cuda/cccl/headers/include/cuda/std/__algorithm/shift_right.h +144 -0
  568. cuda/cccl/headers/include/cuda/std/__algorithm/sift_down.h +139 -0
  569. cuda/cccl/headers/include/cuda/std/__algorithm/sort_heap.h +70 -0
  570. cuda/cccl/headers/include/cuda/std/__algorithm/swap_ranges.h +78 -0
  571. cuda/cccl/headers/include/cuda/std/__algorithm/transform.h +59 -0
  572. cuda/cccl/headers/include/cuda/std/__algorithm/unique.h +76 -0
  573. cuda/cccl/headers/include/cuda/std/__algorithm/unique_copy.h +155 -0
  574. cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_iter.h +95 -0
  575. cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_range.h +126 -0
  576. cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +83 -0
  577. cuda/cccl/headers/include/cuda/std/__algorithm_ +26 -0
  578. cuda/cccl/headers/include/cuda/std/__atomic/api/common.h +192 -0
  579. cuda/cccl/headers/include/cuda/std/__atomic/api/owned.h +136 -0
  580. cuda/cccl/headers/include/cuda/std/__atomic/api/reference.h +118 -0
  581. cuda/cccl/headers/include/cuda/std/__atomic/functions/common.h +58 -0
  582. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_local.h +208 -0
  583. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_derived.h +401 -0
  584. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated.h +3971 -0
  585. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h +177 -0
  586. cuda/cccl/headers/include/cuda/std/__atomic/functions/host.h +211 -0
  587. cuda/cccl/headers/include/cuda/std/__atomic/functions.h +33 -0
  588. cuda/cccl/headers/include/cuda/std/__atomic/order.h +159 -0
  589. cuda/cccl/headers/include/cuda/std/__atomic/platform/msvc_to_builtins.h +654 -0
  590. cuda/cccl/headers/include/cuda/std/__atomic/platform.h +93 -0
  591. cuda/cccl/headers/include/cuda/std/__atomic/scopes.h +105 -0
  592. cuda/cccl/headers/include/cuda/std/__atomic/types/base.h +249 -0
  593. cuda/cccl/headers/include/cuda/std/__atomic/types/common.h +104 -0
  594. cuda/cccl/headers/include/cuda/std/__atomic/types/locked.h +225 -0
  595. cuda/cccl/headers/include/cuda/std/__atomic/types/reference.h +72 -0
  596. cuda/cccl/headers/include/cuda/std/__atomic/types/small.h +228 -0
  597. cuda/cccl/headers/include/cuda/std/__atomic/types.h +52 -0
  598. cuda/cccl/headers/include/cuda/std/__atomic/wait/notify_wait.h +95 -0
  599. cuda/cccl/headers/include/cuda/std/__atomic/wait/polling.h +65 -0
  600. cuda/cccl/headers/include/cuda/std/__barrier/barrier.h +227 -0
  601. cuda/cccl/headers/include/cuda/std/__barrier/empty_completion.h +37 -0
  602. cuda/cccl/headers/include/cuda/std/__barrier/poll_tester.h +82 -0
  603. cuda/cccl/headers/include/cuda/std/__bit/bit_cast.h +76 -0
  604. cuda/cccl/headers/include/cuda/std/__bit/byteswap.h +185 -0
  605. cuda/cccl/headers/include/cuda/std/__bit/countl.h +167 -0
  606. cuda/cccl/headers/include/cuda/std/__bit/countr.h +185 -0
  607. cuda/cccl/headers/include/cuda/std/__bit/endian.h +39 -0
  608. cuda/cccl/headers/include/cuda/std/__bit/has_single_bit.h +43 -0
  609. cuda/cccl/headers/include/cuda/std/__bit/integral.h +126 -0
  610. cuda/cccl/headers/include/cuda/std/__bit/popcount.h +154 -0
  611. cuda/cccl/headers/include/cuda/std/__bit/reference.h +1272 -0
  612. cuda/cccl/headers/include/cuda/std/__bit/rotate.h +94 -0
  613. cuda/cccl/headers/include/cuda/std/__cccl/architecture.h +78 -0
  614. cuda/cccl/headers/include/cuda/std/__cccl/assert.h +161 -0
  615. cuda/cccl/headers/include/cuda/std/__cccl/attributes.h +206 -0
  616. cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +676 -0
  617. cuda/cccl/headers/include/cuda/std/__cccl/compiler.h +217 -0
  618. cuda/cccl/headers/include/cuda/std/__cccl/cuda_capabilities.h +51 -0
  619. cuda/cccl/headers/include/cuda/std/__cccl/cuda_toolkit.h +56 -0
  620. cuda/cccl/headers/include/cuda/std/__cccl/deprecated.h +88 -0
  621. cuda/cccl/headers/include/cuda/std/__cccl/diagnostic.h +131 -0
  622. cuda/cccl/headers/include/cuda/std/__cccl/dialect.h +123 -0
  623. cuda/cccl/headers/include/cuda/std/__cccl/epilogue.h +344 -0
  624. cuda/cccl/headers/include/cuda/std/__cccl/exceptions.h +79 -0
  625. cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +68 -0
  626. cuda/cccl/headers/include/cuda/std/__cccl/extended_data_types.h +160 -0
  627. cuda/cccl/headers/include/cuda/std/__cccl/is_non_narrowing_convertible.h +73 -0
  628. cuda/cccl/headers/include/cuda/std/__cccl/os.h +54 -0
  629. cuda/cccl/headers/include/cuda/std/__cccl/preprocessor.h +1284 -0
  630. cuda/cccl/headers/include/cuda/std/__cccl/prologue.h +281 -0
  631. cuda/cccl/headers/include/cuda/std/__cccl/ptx_isa.h +253 -0
  632. cuda/cccl/headers/include/cuda/std/__cccl/rtti.h +72 -0
  633. cuda/cccl/headers/include/cuda/std/__cccl/sequence_access.h +87 -0
  634. cuda/cccl/headers/include/cuda/std/__cccl/system_header.h +38 -0
  635. cuda/cccl/headers/include/cuda/std/__cccl/unreachable.h +31 -0
  636. cuda/cccl/headers/include/cuda/std/__cccl/version.h +26 -0
  637. cuda/cccl/headers/include/cuda/std/__cccl/visibility.h +171 -0
  638. cuda/cccl/headers/include/cuda/std/__charconv/chars_format.h +81 -0
  639. cuda/cccl/headers/include/cuda/std/__charconv/from_chars.h +154 -0
  640. cuda/cccl/headers/include/cuda/std/__charconv/from_chars_result.h +56 -0
  641. cuda/cccl/headers/include/cuda/std/__charconv/to_chars.h +148 -0
  642. cuda/cccl/headers/include/cuda/std/__charconv/to_chars_result.h +56 -0
  643. cuda/cccl/headers/include/cuda/std/__charconv_ +31 -0
  644. cuda/cccl/headers/include/cuda/std/__chrono/calendar.h +54 -0
  645. cuda/cccl/headers/include/cuda/std/__chrono/day.h +162 -0
  646. cuda/cccl/headers/include/cuda/std/__chrono/duration.h +503 -0
  647. cuda/cccl/headers/include/cuda/std/__chrono/file_clock.h +55 -0
  648. cuda/cccl/headers/include/cuda/std/__chrono/high_resolution_clock.h +46 -0
  649. cuda/cccl/headers/include/cuda/std/__chrono/month.h +187 -0
  650. cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +60 -0
  651. cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +80 -0
  652. cuda/cccl/headers/include/cuda/std/__chrono/time_point.h +258 -0
  653. cuda/cccl/headers/include/cuda/std/__chrono/year.h +186 -0
  654. cuda/cccl/headers/include/cuda/std/__cmath/abs.h +127 -0
  655. cuda/cccl/headers/include/cuda/std/__cmath/copysign.h +88 -0
  656. cuda/cccl/headers/include/cuda/std/__cmath/error_functions.h +200 -0
  657. cuda/cccl/headers/include/cuda/std/__cmath/exponential_functions.h +784 -0
  658. cuda/cccl/headers/include/cuda/std/__cmath/fdim.h +118 -0
  659. cuda/cccl/headers/include/cuda/std/__cmath/fma.h +125 -0
  660. cuda/cccl/headers/include/cuda/std/__cmath/fpclassify.h +231 -0
  661. cuda/cccl/headers/include/cuda/std/__cmath/gamma.h +205 -0
  662. cuda/cccl/headers/include/cuda/std/__cmath/hyperbolic_functions.h +286 -0
  663. cuda/cccl/headers/include/cuda/std/__cmath/hypot.h +221 -0
  664. cuda/cccl/headers/include/cuda/std/__cmath/inverse_hyperbolic_functions.h +286 -0
  665. cuda/cccl/headers/include/cuda/std/__cmath/inverse_trigonometric_functions.h +371 -0
  666. cuda/cccl/headers/include/cuda/std/__cmath/isfinite.h +167 -0
  667. cuda/cccl/headers/include/cuda/std/__cmath/isinf.h +205 -0
  668. cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +180 -0
  669. cuda/cccl/headers/include/cuda/std/__cmath/isnormal.h +138 -0
  670. cuda/cccl/headers/include/cuda/std/__cmath/lerp.h +101 -0
  671. cuda/cccl/headers/include/cuda/std/__cmath/logarithms.h +534 -0
  672. cuda/cccl/headers/include/cuda/std/__cmath/min_max.h +260 -0
  673. cuda/cccl/headers/include/cuda/std/__cmath/modulo.h +208 -0
  674. cuda/cccl/headers/include/cuda/std/__cmath/nan.h +54 -0
  675. cuda/cccl/headers/include/cuda/std/__cmath/remainder.h +206 -0
  676. cuda/cccl/headers/include/cuda/std/__cmath/roots.h +199 -0
  677. cuda/cccl/headers/include/cuda/std/__cmath/rounding_functions.h +984 -0
  678. cuda/cccl/headers/include/cuda/std/__cmath/signbit.h +56 -0
  679. cuda/cccl/headers/include/cuda/std/__cmath/traits.h +238 -0
  680. cuda/cccl/headers/include/cuda/std/__cmath/trigonometric_functions.h +328 -0
  681. cuda/cccl/headers/include/cuda/std/__complex/arg.h +84 -0
  682. cuda/cccl/headers/include/cuda/std/__complex/complex.h +674 -0
  683. cuda/cccl/headers/include/cuda/std/__complex/exponential_functions.h +411 -0
  684. cuda/cccl/headers/include/cuda/std/__complex/hyperbolic_functions.h +117 -0
  685. cuda/cccl/headers/include/cuda/std/__complex/inverse_hyperbolic_functions.h +216 -0
  686. cuda/cccl/headers/include/cuda/std/__complex/inverse_trigonometric_functions.h +131 -0
  687. cuda/cccl/headers/include/cuda/std/__complex/literals.h +106 -0
  688. cuda/cccl/headers/include/cuda/std/__complex/logarithms.h +303 -0
  689. cuda/cccl/headers/include/cuda/std/__complex/math.h +159 -0
  690. cuda/cccl/headers/include/cuda/std/__complex/nvbf16.h +322 -0
  691. cuda/cccl/headers/include/cuda/std/__complex/nvfp16.h +321 -0
  692. cuda/cccl/headers/include/cuda/std/__complex/roots.h +214 -0
  693. cuda/cccl/headers/include/cuda/std/__complex/trigonometric_functions.h +61 -0
  694. cuda/cccl/headers/include/cuda/std/__complex/tuple.h +107 -0
  695. cuda/cccl/headers/include/cuda/std/__complex/vector_support.h +130 -0
  696. cuda/cccl/headers/include/cuda/std/__concepts/arithmetic.h +56 -0
  697. cuda/cccl/headers/include/cuda/std/__concepts/assignable.h +64 -0
  698. cuda/cccl/headers/include/cuda/std/__concepts/boolean_testable.h +63 -0
  699. cuda/cccl/headers/include/cuda/std/__concepts/class_or_enum.h +45 -0
  700. cuda/cccl/headers/include/cuda/std/__concepts/common_reference_with.h +69 -0
  701. cuda/cccl/headers/include/cuda/std/__concepts/common_with.h +82 -0
  702. cuda/cccl/headers/include/cuda/std/__concepts/concept_macros.h +341 -0
  703. cuda/cccl/headers/include/cuda/std/__concepts/constructible.h +174 -0
  704. cuda/cccl/headers/include/cuda/std/__concepts/convertible_to.h +70 -0
  705. cuda/cccl/headers/include/cuda/std/__concepts/copyable.h +60 -0
  706. cuda/cccl/headers/include/cuda/std/__concepts/derived_from.h +56 -0
  707. cuda/cccl/headers/include/cuda/std/__concepts/destructible.h +76 -0
  708. cuda/cccl/headers/include/cuda/std/__concepts/different_from.h +38 -0
  709. cuda/cccl/headers/include/cuda/std/__concepts/equality_comparable.h +100 -0
  710. cuda/cccl/headers/include/cuda/std/__concepts/invocable.h +80 -0
  711. cuda/cccl/headers/include/cuda/std/__concepts/movable.h +58 -0
  712. cuda/cccl/headers/include/cuda/std/__concepts/predicate.h +54 -0
  713. cuda/cccl/headers/include/cuda/std/__concepts/regular.h +54 -0
  714. cuda/cccl/headers/include/cuda/std/__concepts/relation.h +77 -0
  715. cuda/cccl/headers/include/cuda/std/__concepts/same_as.h +39 -0
  716. cuda/cccl/headers/include/cuda/std/__concepts/semiregular.h +54 -0
  717. cuda/cccl/headers/include/cuda/std/__concepts/swappable.h +206 -0
  718. cuda/cccl/headers/include/cuda/std/__concepts/totally_ordered.h +101 -0
  719. cuda/cccl/headers/include/cuda/std/__cstddef/byte.h +113 -0
  720. cuda/cccl/headers/include/cuda/std/__cstddef/types.h +52 -0
  721. cuda/cccl/headers/include/cuda/std/__cstdlib/abs.h +57 -0
  722. cuda/cccl/headers/include/cuda/std/__cstdlib/aligned_alloc.h +66 -0
  723. cuda/cccl/headers/include/cuda/std/__cstdlib/div.h +96 -0
  724. cuda/cccl/headers/include/cuda/std/__cstdlib/malloc.h +69 -0
  725. cuda/cccl/headers/include/cuda/std/__cstring/memcpy.h +61 -0
  726. cuda/cccl/headers/include/cuda/std/__cstring/memset.h +46 -0
  727. cuda/cccl/headers/include/cuda/std/__cuda/api_wrapper.h +62 -0
  728. cuda/cccl/headers/include/cuda/std/__cuda/ensure_current_device.h +72 -0
  729. cuda/cccl/headers/include/cuda/std/__exception/cuda_error.h +146 -0
  730. cuda/cccl/headers/include/cuda/std/__exception/terminate.h +73 -0
  731. cuda/cccl/headers/include/cuda/std/__execution/env.h +455 -0
  732. cuda/cccl/headers/include/cuda/std/__execution/policy.h +88 -0
  733. cuda/cccl/headers/include/cuda/std/__expected/bad_expected_access.h +127 -0
  734. cuda/cccl/headers/include/cuda/std/__expected/expected.h +1956 -0
  735. cuda/cccl/headers/include/cuda/std/__expected/expected_base.h +1050 -0
  736. cuda/cccl/headers/include/cuda/std/__expected/unexpect.h +37 -0
  737. cuda/cccl/headers/include/cuda/std/__expected/unexpected.h +172 -0
  738. cuda/cccl/headers/include/cuda/std/__floating_point/arithmetic.h +56 -0
  739. cuda/cccl/headers/include/cuda/std/__floating_point/cast.h +809 -0
  740. cuda/cccl/headers/include/cuda/std/__floating_point/cccl_fp.h +125 -0
  741. cuda/cccl/headers/include/cuda/std/__floating_point/common_type.h +48 -0
  742. cuda/cccl/headers/include/cuda/std/__floating_point/constants.h +376 -0
  743. cuda/cccl/headers/include/cuda/std/__floating_point/conversion_rank_order.h +124 -0
  744. cuda/cccl/headers/include/cuda/std/__floating_point/cuda_fp_types.h +113 -0
  745. cuda/cccl/headers/include/cuda/std/__floating_point/decompose.h +69 -0
  746. cuda/cccl/headers/include/cuda/std/__floating_point/format.h +162 -0
  747. cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +40 -0
  748. cuda/cccl/headers/include/cuda/std/__floating_point/mask.h +78 -0
  749. cuda/cccl/headers/include/cuda/std/__floating_point/native_type.h +81 -0
  750. cuda/cccl/headers/include/cuda/std/__floating_point/overflow_handler.h +139 -0
  751. cuda/cccl/headers/include/cuda/std/__floating_point/properties.h +229 -0
  752. cuda/cccl/headers/include/cuda/std/__floating_point/storage.h +248 -0
  753. cuda/cccl/headers/include/cuda/std/__floating_point/traits.h +172 -0
  754. cuda/cccl/headers/include/cuda/std/__format/buffer.h +48 -0
  755. cuda/cccl/headers/include/cuda/std/__format/concepts.h +69 -0
  756. cuda/cccl/headers/include/cuda/std/__format/format_arg.h +282 -0
  757. cuda/cccl/headers/include/cuda/std/__format/format_arg_store.h +279 -0
  758. cuda/cccl/headers/include/cuda/std/__format/format_args.h +122 -0
  759. cuda/cccl/headers/include/cuda/std/__format/format_context.h +92 -0
  760. cuda/cccl/headers/include/cuda/std/__format/format_error.h +76 -0
  761. cuda/cccl/headers/include/cuda/std/__format/format_integral.h +237 -0
  762. cuda/cccl/headers/include/cuda/std/__format/format_parse_context.h +124 -0
  763. cuda/cccl/headers/include/cuda/std/__format/format_spec_parser.h +1230 -0
  764. cuda/cccl/headers/include/cuda/std/__format/formatter.h +59 -0
  765. cuda/cccl/headers/include/cuda/std/__format/formatters/bool.h +101 -0
  766. cuda/cccl/headers/include/cuda/std/__format/formatters/char.h +124 -0
  767. cuda/cccl/headers/include/cuda/std/__format/formatters/fp.h +101 -0
  768. cuda/cccl/headers/include/cuda/std/__format/formatters/int.h +174 -0
  769. cuda/cccl/headers/include/cuda/std/__format/formatters/ptr.h +104 -0
  770. cuda/cccl/headers/include/cuda/std/__format/formatters/str.h +178 -0
  771. cuda/cccl/headers/include/cuda/std/__format/output_utils.h +272 -0
  772. cuda/cccl/headers/include/cuda/std/__format/parse_arg_id.h +138 -0
  773. cuda/cccl/headers/include/cuda/std/__format_ +45 -0
  774. cuda/cccl/headers/include/cuda/std/__functional/binary_function.h +63 -0
  775. cuda/cccl/headers/include/cuda/std/__functional/binary_negate.h +65 -0
  776. cuda/cccl/headers/include/cuda/std/__functional/bind.h +337 -0
  777. cuda/cccl/headers/include/cuda/std/__functional/bind_back.h +80 -0
  778. cuda/cccl/headers/include/cuda/std/__functional/bind_front.h +73 -0
  779. cuda/cccl/headers/include/cuda/std/__functional/binder1st.h +74 -0
  780. cuda/cccl/headers/include/cuda/std/__functional/binder2nd.h +74 -0
  781. cuda/cccl/headers/include/cuda/std/__functional/compose.h +68 -0
  782. cuda/cccl/headers/include/cuda/std/__functional/default_searcher.h +75 -0
  783. cuda/cccl/headers/include/cuda/std/__functional/function.h +1278 -0
  784. cuda/cccl/headers/include/cuda/std/__functional/hash.h +649 -0
  785. cuda/cccl/headers/include/cuda/std/__functional/identity.h +57 -0
  786. cuda/cccl/headers/include/cuda/std/__functional/invoke.h +560 -0
  787. cuda/cccl/headers/include/cuda/std/__functional/is_transparent.h +41 -0
  788. cuda/cccl/headers/include/cuda/std/__functional/mem_fn.h +67 -0
  789. cuda/cccl/headers/include/cuda/std/__functional/mem_fun_ref.h +211 -0
  790. cuda/cccl/headers/include/cuda/std/__functional/not_fn.h +120 -0
  791. cuda/cccl/headers/include/cuda/std/__functional/operations.h +534 -0
  792. cuda/cccl/headers/include/cuda/std/__functional/perfect_forward.h +128 -0
  793. cuda/cccl/headers/include/cuda/std/__functional/pointer_to_binary_function.h +64 -0
  794. cuda/cccl/headers/include/cuda/std/__functional/pointer_to_unary_function.h +63 -0
  795. cuda/cccl/headers/include/cuda/std/__functional/ranges_operations.h +113 -0
  796. cuda/cccl/headers/include/cuda/std/__functional/reference_wrapper.h +113 -0
  797. cuda/cccl/headers/include/cuda/std/__functional/unary_function.h +62 -0
  798. cuda/cccl/headers/include/cuda/std/__functional/unary_negate.h +65 -0
  799. cuda/cccl/headers/include/cuda/std/__functional/unwrap_ref.h +56 -0
  800. cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +268 -0
  801. cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +35 -0
  802. cuda/cccl/headers/include/cuda/std/__fwd/array.h +42 -0
  803. cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +49 -0
  804. cuda/cccl/headers/include/cuda/std/__fwd/complex.h +66 -0
  805. cuda/cccl/headers/include/cuda/std/__fwd/format.h +84 -0
  806. cuda/cccl/headers/include/cuda/std/__fwd/fp.h +37 -0
  807. cuda/cccl/headers/include/cuda/std/__fwd/get.h +123 -0
  808. cuda/cccl/headers/include/cuda/std/__fwd/hash.h +34 -0
  809. cuda/cccl/headers/include/cuda/std/__fwd/iterator.h +43 -0
  810. cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +90 -0
  811. cuda/cccl/headers/include/cuda/std/__fwd/memory_resource.h +37 -0
  812. cuda/cccl/headers/include/cuda/std/__fwd/optional.h +39 -0
  813. cuda/cccl/headers/include/cuda/std/__fwd/pair.h +34 -0
  814. cuda/cccl/headers/include/cuda/std/__fwd/reference_wrapper.h +34 -0
  815. cuda/cccl/headers/include/cuda/std/__fwd/span.h +45 -0
  816. cuda/cccl/headers/include/cuda/std/__fwd/string.h +83 -0
  817. cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +59 -0
  818. cuda/cccl/headers/include/cuda/std/__fwd/subrange.h +55 -0
  819. cuda/cccl/headers/include/cuda/std/__fwd/tuple.h +34 -0
  820. cuda/cccl/headers/include/cuda/std/__internal/cpp_dialect.h +44 -0
  821. cuda/cccl/headers/include/cuda/std/__internal/features.h +77 -0
  822. cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +122 -0
  823. cuda/cccl/headers/include/cuda/std/__iterator/access.h +128 -0
  824. cuda/cccl/headers/include/cuda/std/__iterator/advance.h +228 -0
  825. cuda/cccl/headers/include/cuda/std/__iterator/back_insert_iterator.h +163 -0
  826. cuda/cccl/headers/include/cuda/std/__iterator/bounded_iter.h +253 -0
  827. cuda/cccl/headers/include/cuda/std/__iterator/concepts.h +645 -0
  828. cuda/cccl/headers/include/cuda/std/__iterator/counted_iterator.h +464 -0
  829. cuda/cccl/headers/include/cuda/std/__iterator/data.h +61 -0
  830. cuda/cccl/headers/include/cuda/std/__iterator/default_sentinel.h +36 -0
  831. cuda/cccl/headers/include/cuda/std/__iterator/distance.h +126 -0
  832. cuda/cccl/headers/include/cuda/std/__iterator/empty.h +53 -0
  833. cuda/cccl/headers/include/cuda/std/__iterator/erase_if_container.h +53 -0
  834. cuda/cccl/headers/include/cuda/std/__iterator/front_insert_iterator.h +99 -0
  835. cuda/cccl/headers/include/cuda/std/__iterator/incrementable_traits.h +143 -0
  836. cuda/cccl/headers/include/cuda/std/__iterator/indirectly_comparable.h +55 -0
  837. cuda/cccl/headers/include/cuda/std/__iterator/insert_iterator.h +107 -0
  838. cuda/cccl/headers/include/cuda/std/__iterator/istream_iterator.h +146 -0
  839. cuda/cccl/headers/include/cuda/std/__iterator/istreambuf_iterator.h +161 -0
  840. cuda/cccl/headers/include/cuda/std/__iterator/iter_move.h +161 -0
  841. cuda/cccl/headers/include/cuda/std/__iterator/iter_swap.h +163 -0
  842. cuda/cccl/headers/include/cuda/std/__iterator/iterator.h +44 -0
  843. cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +847 -0
  844. cuda/cccl/headers/include/cuda/std/__iterator/mergeable.h +72 -0
  845. cuda/cccl/headers/include/cuda/std/__iterator/move_iterator.h +432 -0
  846. cuda/cccl/headers/include/cuda/std/__iterator/move_sentinel.h +73 -0
  847. cuda/cccl/headers/include/cuda/std/__iterator/next.h +101 -0
  848. cuda/cccl/headers/include/cuda/std/__iterator/ostream_iterator.h +95 -0
  849. cuda/cccl/headers/include/cuda/std/__iterator/ostreambuf_iterator.h +100 -0
  850. cuda/cccl/headers/include/cuda/std/__iterator/permutable.h +54 -0
  851. cuda/cccl/headers/include/cuda/std/__iterator/prev.h +90 -0
  852. cuda/cccl/headers/include/cuda/std/__iterator/projected.h +61 -0
  853. cuda/cccl/headers/include/cuda/std/__iterator/readable_traits.h +156 -0
  854. cuda/cccl/headers/include/cuda/std/__iterator/reverse_access.h +142 -0
  855. cuda/cccl/headers/include/cuda/std/__iterator/reverse_iterator.h +371 -0
  856. cuda/cccl/headers/include/cuda/std/__iterator/size.h +69 -0
  857. cuda/cccl/headers/include/cuda/std/__iterator/sortable.h +55 -0
  858. cuda/cccl/headers/include/cuda/std/__iterator/unreachable_sentinel.h +84 -0
  859. cuda/cccl/headers/include/cuda/std/__iterator/wrap_iter.h +245 -0
  860. cuda/cccl/headers/include/cuda/std/__latch/latch.h +88 -0
  861. cuda/cccl/headers/include/cuda/std/__limits/numeric_limits.h +617 -0
  862. cuda/cccl/headers/include/cuda/std/__limits/numeric_limits_ext.h +753 -0
  863. cuda/cccl/headers/include/cuda/std/__linalg/conj_if_needed.h +78 -0
  864. cuda/cccl/headers/include/cuda/std/__linalg/conjugate_transposed.h +54 -0
  865. cuda/cccl/headers/include/cuda/std/__linalg/conjugated.h +139 -0
  866. cuda/cccl/headers/include/cuda/std/__linalg/scaled.h +132 -0
  867. cuda/cccl/headers/include/cuda/std/__linalg/transposed.h +321 -0
  868. cuda/cccl/headers/include/cuda/std/__mdspan/aligned_accessor.h +97 -0
  869. cuda/cccl/headers/include/cuda/std/__mdspan/concepts.h +144 -0
  870. cuda/cccl/headers/include/cuda/std/__mdspan/default_accessor.h +73 -0
  871. cuda/cccl/headers/include/cuda/std/__mdspan/empty_base.h +352 -0
  872. cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +758 -0
  873. cuda/cccl/headers/include/cuda/std/__mdspan/layout_left.h +314 -0
  874. cuda/cccl/headers/include/cuda/std/__mdspan/layout_right.h +307 -0
  875. cuda/cccl/headers/include/cuda/std/__mdspan/layout_stride.h +605 -0
  876. cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +497 -0
  877. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_extents.h +193 -0
  878. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_helper.h +189 -0
  879. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_mapping.h +344 -0
  880. cuda/cccl/headers/include/cuda/std/__memory/addressof.h +67 -0
  881. cuda/cccl/headers/include/cuda/std/__memory/align.h +67 -0
  882. cuda/cccl/headers/include/cuda/std/__memory/allocate_at_least.h +81 -0
  883. cuda/cccl/headers/include/cuda/std/__memory/allocation_guard.h +100 -0
  884. cuda/cccl/headers/include/cuda/std/__memory/allocator.h +320 -0
  885. cuda/cccl/headers/include/cuda/std/__memory/allocator_arg_t.h +84 -0
  886. cuda/cccl/headers/include/cuda/std/__memory/allocator_destructor.h +59 -0
  887. cuda/cccl/headers/include/cuda/std/__memory/allocator_traits.h +532 -0
  888. cuda/cccl/headers/include/cuda/std/__memory/assume_aligned.h +60 -0
  889. cuda/cccl/headers/include/cuda/std/__memory/builtin_new_allocator.h +87 -0
  890. cuda/cccl/headers/include/cuda/std/__memory/compressed_pair.h +225 -0
  891. cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +248 -0
  892. cuda/cccl/headers/include/cuda/std/__memory/destruct_n.h +91 -0
  893. cuda/cccl/headers/include/cuda/std/__memory/is_sufficiently_aligned.h +46 -0
  894. cuda/cccl/headers/include/cuda/std/__memory/pointer_traits.h +246 -0
  895. cuda/cccl/headers/include/cuda/std/__memory/runtime_assume_aligned.h +62 -0
  896. cuda/cccl/headers/include/cuda/std/__memory/temporary_buffer.h +92 -0
  897. cuda/cccl/headers/include/cuda/std/__memory/uninitialized_algorithms.h +678 -0
  898. cuda/cccl/headers/include/cuda/std/__memory/unique_ptr.h +765 -0
  899. cuda/cccl/headers/include/cuda/std/__memory/uses_allocator.h +54 -0
  900. cuda/cccl/headers/include/cuda/std/__memory/voidify.h +41 -0
  901. cuda/cccl/headers/include/cuda/std/__memory_ +34 -0
  902. cuda/cccl/headers/include/cuda/std/__new/allocate.h +126 -0
  903. cuda/cccl/headers/include/cuda/std/__new/bad_alloc.h +57 -0
  904. cuda/cccl/headers/include/cuda/std/__new/launder.h +53 -0
  905. cuda/cccl/headers/include/cuda/std/__new_ +29 -0
  906. cuda/cccl/headers/include/cuda/std/__numeric/accumulate.h +56 -0
  907. cuda/cccl/headers/include/cuda/std/__numeric/adjacent_difference.h +72 -0
  908. cuda/cccl/headers/include/cuda/std/__numeric/exclusive_scan.h +66 -0
  909. cuda/cccl/headers/include/cuda/std/__numeric/gcd_lcm.h +78 -0
  910. cuda/cccl/headers/include/cuda/std/__numeric/inclusive_scan.h +73 -0
  911. cuda/cccl/headers/include/cuda/std/__numeric/inner_product.h +62 -0
  912. cuda/cccl/headers/include/cuda/std/__numeric/iota.h +42 -0
  913. cuda/cccl/headers/include/cuda/std/__numeric/midpoint.h +97 -0
  914. cuda/cccl/headers/include/cuda/std/__numeric/partial_sum.h +69 -0
  915. cuda/cccl/headers/include/cuda/std/__numeric/reduce.h +60 -0
  916. cuda/cccl/headers/include/cuda/std/__numeric/transform_exclusive_scan.h +51 -0
  917. cuda/cccl/headers/include/cuda/std/__numeric/transform_inclusive_scan.h +65 -0
  918. cuda/cccl/headers/include/cuda/std/__numeric/transform_reduce.h +72 -0
  919. cuda/cccl/headers/include/cuda/std/__optional/bad_optional_access.h +74 -0
  920. cuda/cccl/headers/include/cuda/std/__optional/hash.h +53 -0
  921. cuda/cccl/headers/include/cuda/std/__optional/make_optional.h +61 -0
  922. cuda/cccl/headers/include/cuda/std/__optional/nullopt.h +43 -0
  923. cuda/cccl/headers/include/cuda/std/__optional/optional.h +859 -0
  924. cuda/cccl/headers/include/cuda/std/__optional/optional_base.h +432 -0
  925. cuda/cccl/headers/include/cuda/std/__optional/optional_ref.h +324 -0
  926. cuda/cccl/headers/include/cuda/std/__random/generate_canonical.h +56 -0
  927. cuda/cccl/headers/include/cuda/std/__random/is_seed_sequence.h +39 -0
  928. cuda/cccl/headers/include/cuda/std/__random/is_valid.h +106 -0
  929. cuda/cccl/headers/include/cuda/std/__random/linear_congruential_engine.h +398 -0
  930. cuda/cccl/headers/include/cuda/std/__random/uniform_int_distribution.h +335 -0
  931. cuda/cccl/headers/include/cuda/std/__random/uniform_real_distribution.h +183 -0
  932. cuda/cccl/headers/include/cuda/std/__random_ +29 -0
  933. cuda/cccl/headers/include/cuda/std/__ranges/access.h +303 -0
  934. cuda/cccl/headers/include/cuda/std/__ranges/all.h +98 -0
  935. cuda/cccl/headers/include/cuda/std/__ranges/concepts.h +314 -0
  936. cuda/cccl/headers/include/cuda/std/__ranges/counted.h +90 -0
  937. cuda/cccl/headers/include/cuda/std/__ranges/dangling.h +54 -0
  938. cuda/cccl/headers/include/cuda/std/__ranges/data.h +136 -0
  939. cuda/cccl/headers/include/cuda/std/__ranges/empty.h +109 -0
  940. cuda/cccl/headers/include/cuda/std/__ranges/empty_view.h +77 -0
  941. cuda/cccl/headers/include/cuda/std/__ranges/enable_borrowed_range.h +41 -0
  942. cuda/cccl/headers/include/cuda/std/__ranges/enable_view.h +78 -0
  943. cuda/cccl/headers/include/cuda/std/__ranges/from_range.h +36 -0
  944. cuda/cccl/headers/include/cuda/std/__ranges/iota_view.h +266 -0
  945. cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +410 -0
  946. cuda/cccl/headers/include/cuda/std/__ranges/owning_view.h +161 -0
  947. cuda/cccl/headers/include/cuda/std/__ranges/range_adaptor.h +110 -0
  948. cuda/cccl/headers/include/cuda/std/__ranges/rbegin.h +175 -0
  949. cuda/cccl/headers/include/cuda/std/__ranges/ref_view.h +121 -0
  950. cuda/cccl/headers/include/cuda/std/__ranges/rend.h +182 -0
  951. cuda/cccl/headers/include/cuda/std/__ranges/repeat_view.h +345 -0
  952. cuda/cccl/headers/include/cuda/std/__ranges/single_view.h +155 -0
  953. cuda/cccl/headers/include/cuda/std/__ranges/size.h +201 -0
  954. cuda/cccl/headers/include/cuda/std/__ranges/subrange.h +513 -0
  955. cuda/cccl/headers/include/cuda/std/__ranges/take_view.h +476 -0
  956. cuda/cccl/headers/include/cuda/std/__ranges/take_while_view.h +259 -0
  957. cuda/cccl/headers/include/cuda/std/__ranges/transform_view.h +522 -0
  958. cuda/cccl/headers/include/cuda/std/__ranges/unwrap_end.h +53 -0
  959. cuda/cccl/headers/include/cuda/std/__ranges/view_interface.h +183 -0
  960. cuda/cccl/headers/include/cuda/std/__ranges/views.h +38 -0
  961. cuda/cccl/headers/include/cuda/std/__semaphore/atomic_semaphore.h +234 -0
  962. cuda/cccl/headers/include/cuda/std/__semaphore/counting_semaphore.h +51 -0
  963. cuda/cccl/headers/include/cuda/std/__string/char_traits.h +191 -0
  964. cuda/cccl/headers/include/cuda/std/__string/constexpr_c_functions.h +581 -0
  965. cuda/cccl/headers/include/cuda/std/__string/helper_functions.h +296 -0
  966. cuda/cccl/headers/include/cuda/std/__string/string_view.h +244 -0
  967. cuda/cccl/headers/include/cuda/std/__string_ +29 -0
  968. cuda/cccl/headers/include/cuda/std/__system_error/errc.h +51 -0
  969. cuda/cccl/headers/include/cuda/std/__system_error_ +26 -0
  970. cuda/cccl/headers/include/cuda/std/__thread/threading_support.h +106 -0
  971. cuda/cccl/headers/include/cuda/std/__thread/threading_support_cuda.h +47 -0
  972. cuda/cccl/headers/include/cuda/std/__thread/threading_support_external.h +41 -0
  973. cuda/cccl/headers/include/cuda/std/__thread/threading_support_pthread.h +143 -0
  974. cuda/cccl/headers/include/cuda/std/__thread/threading_support_win32.h +87 -0
  975. cuda/cccl/headers/include/cuda/std/__tuple_dir/ignore.h +51 -0
  976. cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +98 -0
  977. cuda/cccl/headers/include/cuda/std/__tuple_dir/sfinae_helpers.h +260 -0
  978. cuda/cccl/headers/include/cuda/std/__tuple_dir/structured_bindings.h +218 -0
  979. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_element.h +70 -0
  980. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_indices.h +44 -0
  981. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +80 -0
  982. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +64 -0
  983. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_size.h +79 -0
  984. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_types.h +35 -0
  985. cuda/cccl/headers/include/cuda/std/__tuple_dir/vector_types.h +290 -0
  986. cuda/cccl/headers/include/cuda/std/__type_traits/add_const.h +40 -0
  987. cuda/cccl/headers/include/cuda/std/__type_traits/add_cv.h +40 -0
  988. cuda/cccl/headers/include/cuda/std/__type_traits/add_lvalue_reference.h +62 -0
  989. cuda/cccl/headers/include/cuda/std/__type_traits/add_pointer.h +65 -0
  990. cuda/cccl/headers/include/cuda/std/__type_traits/add_rvalue_reference.h +62 -0
  991. cuda/cccl/headers/include/cuda/std/__type_traits/add_volatile.h +40 -0
  992. cuda/cccl/headers/include/cuda/std/__type_traits/aligned_storage.h +149 -0
  993. cuda/cccl/headers/include/cuda/std/__type_traits/aligned_union.h +62 -0
  994. cuda/cccl/headers/include/cuda/std/__type_traits/alignment_of.h +41 -0
  995. cuda/cccl/headers/include/cuda/std/__type_traits/always_false.h +35 -0
  996. cuda/cccl/headers/include/cuda/std/__type_traits/can_extract_key.h +68 -0
  997. cuda/cccl/headers/include/cuda/std/__type_traits/common_reference.h +262 -0
  998. cuda/cccl/headers/include/cuda/std/__type_traits/common_type.h +173 -0
  999. cuda/cccl/headers/include/cuda/std/__type_traits/conditional.h +65 -0
  1000. cuda/cccl/headers/include/cuda/std/__type_traits/conjunction.h +67 -0
  1001. cuda/cccl/headers/include/cuda/std/__type_traits/copy_cv.h +50 -0
  1002. cuda/cccl/headers/include/cuda/std/__type_traits/copy_cvref.h +148 -0
  1003. cuda/cccl/headers/include/cuda/std/__type_traits/decay.h +83 -0
  1004. cuda/cccl/headers/include/cuda/std/__type_traits/dependent_type.h +35 -0
  1005. cuda/cccl/headers/include/cuda/std/__type_traits/disjunction.h +77 -0
  1006. cuda/cccl/headers/include/cuda/std/__type_traits/enable_if.h +43 -0
  1007. cuda/cccl/headers/include/cuda/std/__type_traits/extent.h +68 -0
  1008. cuda/cccl/headers/include/cuda/std/__type_traits/fold.h +47 -0
  1009. cuda/cccl/headers/include/cuda/std/__type_traits/has_unique_object_representation.h +46 -0
  1010. cuda/cccl/headers/include/cuda/std/__type_traits/has_virtual_destructor.h +42 -0
  1011. cuda/cccl/headers/include/cuda/std/__type_traits/integral_constant.h +62 -0
  1012. cuda/cccl/headers/include/cuda/std/__type_traits/is_abstract.h +42 -0
  1013. cuda/cccl/headers/include/cuda/std/__type_traits/is_aggregate.h +42 -0
  1014. cuda/cccl/headers/include/cuda/std/__type_traits/is_allocator.h +46 -0
  1015. cuda/cccl/headers/include/cuda/std/__type_traits/is_arithmetic.h +42 -0
  1016. cuda/cccl/headers/include/cuda/std/__type_traits/is_array.h +62 -0
  1017. cuda/cccl/headers/include/cuda/std/__type_traits/is_assignable.h +78 -0
  1018. cuda/cccl/headers/include/cuda/std/__type_traits/is_base_of.h +42 -0
  1019. cuda/cccl/headers/include/cuda/std/__type_traits/is_bounded_array.h +44 -0
  1020. cuda/cccl/headers/include/cuda/std/__type_traits/is_callable.h +60 -0
  1021. cuda/cccl/headers/include/cuda/std/__type_traits/is_char_like_type.h +38 -0
  1022. cuda/cccl/headers/include/cuda/std/__type_traits/is_class.h +42 -0
  1023. cuda/cccl/headers/include/cuda/std/__type_traits/is_compound.h +58 -0
  1024. cuda/cccl/headers/include/cuda/std/__type_traits/is_const.h +56 -0
  1025. cuda/cccl/headers/include/cuda/std/__type_traits/is_constant_evaluated.h +51 -0
  1026. cuda/cccl/headers/include/cuda/std/__type_traits/is_constructible.h +174 -0
  1027. cuda/cccl/headers/include/cuda/std/__type_traits/is_convertible.h +211 -0
  1028. cuda/cccl/headers/include/cuda/std/__type_traits/is_copy_assignable.h +43 -0
  1029. cuda/cccl/headers/include/cuda/std/__type_traits/is_copy_constructible.h +43 -0
  1030. cuda/cccl/headers/include/cuda/std/__type_traits/is_core_convertible.h +47 -0
  1031. cuda/cccl/headers/include/cuda/std/__type_traits/is_corresponding_member.h +42 -0
  1032. cuda/cccl/headers/include/cuda/std/__type_traits/is_default_constructible.h +40 -0
  1033. cuda/cccl/headers/include/cuda/std/__type_traits/is_destructible.h +115 -0
  1034. cuda/cccl/headers/include/cuda/std/__type_traits/is_empty.h +42 -0
  1035. cuda/cccl/headers/include/cuda/std/__type_traits/is_enum.h +42 -0
  1036. cuda/cccl/headers/include/cuda/std/__type_traits/is_execution_policy.h +81 -0
  1037. cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_arithmetic.h +38 -0
  1038. cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_floating_point.h +79 -0
  1039. cuda/cccl/headers/include/cuda/std/__type_traits/is_final.h +42 -0
  1040. cuda/cccl/headers/include/cuda/std/__type_traits/is_floating_point.h +53 -0
  1041. cuda/cccl/headers/include/cuda/std/__type_traits/is_function.h +61 -0
  1042. cuda/cccl/headers/include/cuda/std/__type_traits/is_fundamental.h +56 -0
  1043. cuda/cccl/headers/include/cuda/std/__type_traits/is_implicitly_default_constructible.h +57 -0
  1044. cuda/cccl/headers/include/cuda/std/__type_traits/is_integer.h +45 -0
  1045. cuda/cccl/headers/include/cuda/std/__type_traits/is_integral.h +123 -0
  1046. cuda/cccl/headers/include/cuda/std/__type_traits/is_layout_compatible.h +45 -0
  1047. cuda/cccl/headers/include/cuda/std/__type_traits/is_literal_type.h +42 -0
  1048. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_function_pointer.h +79 -0
  1049. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_object_pointer.h +57 -0
  1050. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_pointer.h +57 -0
  1051. cuda/cccl/headers/include/cuda/std/__type_traits/is_move_assignable.h +43 -0
  1052. cuda/cccl/headers/include/cuda/std/__type_traits/is_move_constructible.h +42 -0
  1053. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_assignable.h +70 -0
  1054. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_constructible.h +84 -0
  1055. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_convertible.h +59 -0
  1056. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_copy_assignable.h +60 -0
  1057. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_copy_constructible.h +43 -0
  1058. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_default_constructible.h +54 -0
  1059. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_destructible.h +82 -0
  1060. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_move_assignable.h +60 -0
  1061. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_move_constructible.h +42 -0
  1062. cuda/cccl/headers/include/cuda/std/__type_traits/is_null_pointer.h +43 -0
  1063. cuda/cccl/headers/include/cuda/std/__type_traits/is_object.h +57 -0
  1064. cuda/cccl/headers/include/cuda/std/__type_traits/is_one_of.h +37 -0
  1065. cuda/cccl/headers/include/cuda/std/__type_traits/is_pod.h +42 -0
  1066. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer.h +60 -0
  1067. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer_interconvertible_base_of.h +84 -0
  1068. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer_interconvertible_with_class.h +42 -0
  1069. cuda/cccl/headers/include/cuda/std/__type_traits/is_polymorphic.h +42 -0
  1070. cuda/cccl/headers/include/cuda/std/__type_traits/is_primary_template.h +119 -0
  1071. cuda/cccl/headers/include/cuda/std/__type_traits/is_reference.h +95 -0
  1072. cuda/cccl/headers/include/cuda/std/__type_traits/is_reference_wrapper.h +50 -0
  1073. cuda/cccl/headers/include/cuda/std/__type_traits/is_referenceable.h +55 -0
  1074. cuda/cccl/headers/include/cuda/std/__type_traits/is_same.h +88 -0
  1075. cuda/cccl/headers/include/cuda/std/__type_traits/is_scalar.h +60 -0
  1076. cuda/cccl/headers/include/cuda/std/__type_traits/is_scoped_enum.h +49 -0
  1077. cuda/cccl/headers/include/cuda/std/__type_traits/is_signed.h +65 -0
  1078. cuda/cccl/headers/include/cuda/std/__type_traits/is_signed_integer.h +59 -0
  1079. cuda/cccl/headers/include/cuda/std/__type_traits/is_standard_layout.h +42 -0
  1080. cuda/cccl/headers/include/cuda/std/__type_traits/is_swappable.h +202 -0
  1081. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivial.h +42 -0
  1082. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_assignable.h +43 -0
  1083. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_constructible.h +43 -0
  1084. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_assignable.h +46 -0
  1085. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_constructible.h +45 -0
  1086. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copyable.h +42 -0
  1087. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_default_constructible.h +42 -0
  1088. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_destructible.h +58 -0
  1089. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_assignable.h +45 -0
  1090. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_constructible.h +44 -0
  1091. cuda/cccl/headers/include/cuda/std/__type_traits/is_unbounded_array.h +43 -0
  1092. cuda/cccl/headers/include/cuda/std/__type_traits/is_union.h +42 -0
  1093. cuda/cccl/headers/include/cuda/std/__type_traits/is_unsigned.h +66 -0
  1094. cuda/cccl/headers/include/cuda/std/__type_traits/is_unsigned_integer.h +59 -0
  1095. cuda/cccl/headers/include/cuda/std/__type_traits/is_valid_expansion.h +41 -0
  1096. cuda/cccl/headers/include/cuda/std/__type_traits/is_void.h +55 -0
  1097. cuda/cccl/headers/include/cuda/std/__type_traits/is_volatile.h +56 -0
  1098. cuda/cccl/headers/include/cuda/std/__type_traits/lazy.h +35 -0
  1099. cuda/cccl/headers/include/cuda/std/__type_traits/make_const_lvalue_ref.h +36 -0
  1100. cuda/cccl/headers/include/cuda/std/__type_traits/make_nbit_int.h +107 -0
  1101. cuda/cccl/headers/include/cuda/std/__type_traits/make_signed.h +140 -0
  1102. cuda/cccl/headers/include/cuda/std/__type_traits/make_unsigned.h +151 -0
  1103. cuda/cccl/headers/include/cuda/std/__type_traits/maybe_const.h +36 -0
  1104. cuda/cccl/headers/include/cuda/std/__type_traits/nat.h +39 -0
  1105. cuda/cccl/headers/include/cuda/std/__type_traits/negation.h +44 -0
  1106. cuda/cccl/headers/include/cuda/std/__type_traits/num_bits.h +122 -0
  1107. cuda/cccl/headers/include/cuda/std/__type_traits/promote.h +162 -0
  1108. cuda/cccl/headers/include/cuda/std/__type_traits/rank.h +60 -0
  1109. cuda/cccl/headers/include/cuda/std/__type_traits/reference_constructs_from_temporary.h +57 -0
  1110. cuda/cccl/headers/include/cuda/std/__type_traits/reference_converts_from_temporary.h +56 -0
  1111. cuda/cccl/headers/include/cuda/std/__type_traits/remove_all_extents.h +66 -0
  1112. cuda/cccl/headers/include/cuda/std/__type_traits/remove_const.h +59 -0
  1113. cuda/cccl/headers/include/cuda/std/__type_traits/remove_const_ref.h +37 -0
  1114. cuda/cccl/headers/include/cuda/std/__type_traits/remove_cv.h +57 -0
  1115. cuda/cccl/headers/include/cuda/std/__type_traits/remove_cvref.h +57 -0
  1116. cuda/cccl/headers/include/cuda/std/__type_traits/remove_extent.h +65 -0
  1117. cuda/cccl/headers/include/cuda/std/__type_traits/remove_pointer.h +73 -0
  1118. cuda/cccl/headers/include/cuda/std/__type_traits/remove_reference.h +72 -0
  1119. cuda/cccl/headers/include/cuda/std/__type_traits/remove_volatile.h +58 -0
  1120. cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +47 -0
  1121. cuda/cccl/headers/include/cuda/std/__type_traits/type_identity.h +40 -0
  1122. cuda/cccl/headers/include/cuda/std/__type_traits/type_list.h +1067 -0
  1123. cuda/cccl/headers/include/cuda/std/__type_traits/type_set.h +131 -0
  1124. cuda/cccl/headers/include/cuda/std/__type_traits/underlying_type.h +52 -0
  1125. cuda/cccl/headers/include/cuda/std/__type_traits/void_t.h +34 -0
  1126. cuda/cccl/headers/include/cuda/std/__utility/as_const.h +52 -0
  1127. cuda/cccl/headers/include/cuda/std/__utility/auto_cast.h +34 -0
  1128. cuda/cccl/headers/include/cuda/std/__utility/cmp.h +116 -0
  1129. cuda/cccl/headers/include/cuda/std/__utility/convert_to_integral.h +101 -0
  1130. cuda/cccl/headers/include/cuda/std/__utility/declval.h +76 -0
  1131. cuda/cccl/headers/include/cuda/std/__utility/exception_guard.h +161 -0
  1132. cuda/cccl/headers/include/cuda/std/__utility/exchange.h +46 -0
  1133. cuda/cccl/headers/include/cuda/std/__utility/forward.h +59 -0
  1134. cuda/cccl/headers/include/cuda/std/__utility/forward_like.h +55 -0
  1135. cuda/cccl/headers/include/cuda/std/__utility/in_place.h +106 -0
  1136. cuda/cccl/headers/include/cuda/std/__utility/integer_sequence.h +251 -0
  1137. cuda/cccl/headers/include/cuda/std/__utility/monostate.h +99 -0
  1138. cuda/cccl/headers/include/cuda/std/__utility/move.h +74 -0
  1139. cuda/cccl/headers/include/cuda/std/__utility/pair.h +796 -0
  1140. cuda/cccl/headers/include/cuda/std/__utility/piecewise_construct.h +37 -0
  1141. cuda/cccl/headers/include/cuda/std/__utility/pod_tuple.h +527 -0
  1142. cuda/cccl/headers/include/cuda/std/__utility/priority_tag.h +40 -0
  1143. cuda/cccl/headers/include/cuda/std/__utility/rel_ops.h +63 -0
  1144. cuda/cccl/headers/include/cuda/std/__utility/swap.h +64 -0
  1145. cuda/cccl/headers/include/cuda/std/__utility/to_underlying.h +40 -0
  1146. cuda/cccl/headers/include/cuda/std/__utility/typeid.h +421 -0
  1147. cuda/cccl/headers/include/cuda/std/__utility/undefined.h +34 -0
  1148. cuda/cccl/headers/include/cuda/std/__utility/unreachable.h +37 -0
  1149. cuda/cccl/headers/include/cuda/std/array +518 -0
  1150. cuda/cccl/headers/include/cuda/std/atomic +810 -0
  1151. cuda/cccl/headers/include/cuda/std/barrier +42 -0
  1152. cuda/cccl/headers/include/cuda/std/bit +35 -0
  1153. cuda/cccl/headers/include/cuda/std/bitset +994 -0
  1154. cuda/cccl/headers/include/cuda/std/cassert +28 -0
  1155. cuda/cccl/headers/include/cuda/std/ccomplex +15 -0
  1156. cuda/cccl/headers/include/cuda/std/cfloat +59 -0
  1157. cuda/cccl/headers/include/cuda/std/chrono +26 -0
  1158. cuda/cccl/headers/include/cuda/std/climits +61 -0
  1159. cuda/cccl/headers/include/cuda/std/cmath +87 -0
  1160. cuda/cccl/headers/include/cuda/std/complex +50 -0
  1161. cuda/cccl/headers/include/cuda/std/concepts +48 -0
  1162. cuda/cccl/headers/include/cuda/std/cstddef +28 -0
  1163. cuda/cccl/headers/include/cuda/std/cstdint +178 -0
  1164. cuda/cccl/headers/include/cuda/std/cstdlib +30 -0
  1165. cuda/cccl/headers/include/cuda/std/cstring +110 -0
  1166. cuda/cccl/headers/include/cuda/std/ctime +154 -0
  1167. cuda/cccl/headers/include/cuda/std/detail/__config +45 -0
  1168. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +204 -0
  1169. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/algorithm +1721 -0
  1170. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/chrono +2509 -0
  1171. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/iosfwd +128 -0
  1172. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/stdexcept +120 -0
  1173. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/tuple +1365 -0
  1174. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/variant +2142 -0
  1175. cuda/cccl/headers/include/cuda/std/execution +29 -0
  1176. cuda/cccl/headers/include/cuda/std/expected +30 -0
  1177. cuda/cccl/headers/include/cuda/std/functional +56 -0
  1178. cuda/cccl/headers/include/cuda/std/initializer_list +36 -0
  1179. cuda/cccl/headers/include/cuda/std/inplace_vector +2170 -0
  1180. cuda/cccl/headers/include/cuda/std/iterator +70 -0
  1181. cuda/cccl/headers/include/cuda/std/latch +34 -0
  1182. cuda/cccl/headers/include/cuda/std/limits +28 -0
  1183. cuda/cccl/headers/include/cuda/std/linalg +30 -0
  1184. cuda/cccl/headers/include/cuda/std/mdspan +38 -0
  1185. cuda/cccl/headers/include/cuda/std/memory +39 -0
  1186. cuda/cccl/headers/include/cuda/std/numbers +341 -0
  1187. cuda/cccl/headers/include/cuda/std/numeric +41 -0
  1188. cuda/cccl/headers/include/cuda/std/optional +31 -0
  1189. cuda/cccl/headers/include/cuda/std/ranges +69 -0
  1190. cuda/cccl/headers/include/cuda/std/ratio +416 -0
  1191. cuda/cccl/headers/include/cuda/std/semaphore +31 -0
  1192. cuda/cccl/headers/include/cuda/std/source_location +83 -0
  1193. cuda/cccl/headers/include/cuda/std/span +628 -0
  1194. cuda/cccl/headers/include/cuda/std/string_view +799 -0
  1195. cuda/cccl/headers/include/cuda/std/tuple +26 -0
  1196. cuda/cccl/headers/include/cuda/std/type_traits +177 -0
  1197. cuda/cccl/headers/include/cuda/std/utility +70 -0
  1198. cuda/cccl/headers/include/cuda/std/variant +25 -0
  1199. cuda/cccl/headers/include/cuda/std/version +243 -0
  1200. cuda/cccl/headers/include/cuda/stream +31 -0
  1201. cuda/cccl/headers/include/cuda/stream_ref +54 -0
  1202. cuda/cccl/headers/include/cuda/type_traits +27 -0
  1203. cuda/cccl/headers/include/cuda/utility +27 -0
  1204. cuda/cccl/headers/include/cuda/version +16 -0
  1205. cuda/cccl/headers/include/cuda/warp +28 -0
  1206. cuda/cccl/headers/include/cuda/work_stealing +26 -0
  1207. cuda/cccl/headers/include/nv/detail/__preprocessor +169 -0
  1208. cuda/cccl/headers/include/nv/detail/__target_macros +718 -0
  1209. cuda/cccl/headers/include/nv/target +235 -0
  1210. cuda/cccl/headers/include/thrust/addressof.h +22 -0
  1211. cuda/cccl/headers/include/thrust/adjacent_difference.h +254 -0
  1212. cuda/cccl/headers/include/thrust/advance.h +57 -0
  1213. cuda/cccl/headers/include/thrust/allocate_unique.h +299 -0
  1214. cuda/cccl/headers/include/thrust/binary_search.h +1910 -0
  1215. cuda/cccl/headers/include/thrust/complex.h +858 -0
  1216. cuda/cccl/headers/include/thrust/copy.h +506 -0
  1217. cuda/cccl/headers/include/thrust/count.h +245 -0
  1218. cuda/cccl/headers/include/thrust/detail/adjacent_difference.inl +95 -0
  1219. cuda/cccl/headers/include/thrust/detail/algorithm_wrapper.h +37 -0
  1220. cuda/cccl/headers/include/thrust/detail/alignment.h +81 -0
  1221. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +350 -0
  1222. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.inl +371 -0
  1223. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +45 -0
  1224. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.inl +242 -0
  1225. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +39 -0
  1226. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.inl +137 -0
  1227. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +39 -0
  1228. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.inl +99 -0
  1229. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +53 -0
  1230. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.inl +68 -0
  1231. cuda/cccl/headers/include/thrust/detail/allocator/no_throw_allocator.h +76 -0
  1232. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +102 -0
  1233. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.inl +86 -0
  1234. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +79 -0
  1235. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.inl +79 -0
  1236. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.h +39 -0
  1237. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.inl +98 -0
  1238. cuda/cccl/headers/include/thrust/detail/allocator_aware_execution_policy.h +99 -0
  1239. cuda/cccl/headers/include/thrust/detail/binary_search.inl +525 -0
  1240. cuda/cccl/headers/include/thrust/detail/caching_allocator.h +47 -0
  1241. cuda/cccl/headers/include/thrust/detail/complex/arithmetic.h +255 -0
  1242. cuda/cccl/headers/include/thrust/detail/complex/c99math.h +64 -0
  1243. cuda/cccl/headers/include/thrust/detail/complex/catrig.h +875 -0
  1244. cuda/cccl/headers/include/thrust/detail/complex/catrigf.h +589 -0
  1245. cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +233 -0
  1246. cuda/cccl/headers/include/thrust/detail/complex/ccoshf.h +161 -0
  1247. cuda/cccl/headers/include/thrust/detail/complex/cexp.h +195 -0
  1248. cuda/cccl/headers/include/thrust/detail/complex/cexpf.h +173 -0
  1249. cuda/cccl/headers/include/thrust/detail/complex/clog.h +223 -0
  1250. cuda/cccl/headers/include/thrust/detail/complex/clogf.h +210 -0
  1251. cuda/cccl/headers/include/thrust/detail/complex/complex.inl +263 -0
  1252. cuda/cccl/headers/include/thrust/detail/complex/cpow.h +50 -0
  1253. cuda/cccl/headers/include/thrust/detail/complex/cproj.h +81 -0
  1254. cuda/cccl/headers/include/thrust/detail/complex/csinh.h +228 -0
  1255. cuda/cccl/headers/include/thrust/detail/complex/csinhf.h +168 -0
  1256. cuda/cccl/headers/include/thrust/detail/complex/csqrt.h +178 -0
  1257. cuda/cccl/headers/include/thrust/detail/complex/csqrtf.h +174 -0
  1258. cuda/cccl/headers/include/thrust/detail/complex/ctanh.h +208 -0
  1259. cuda/cccl/headers/include/thrust/detail/complex/ctanhf.h +133 -0
  1260. cuda/cccl/headers/include/thrust/detail/complex/math_private.h +138 -0
  1261. cuda/cccl/headers/include/thrust/detail/complex/stream.h +73 -0
  1262. cuda/cccl/headers/include/thrust/detail/config/compiler.h +38 -0
  1263. cuda/cccl/headers/include/thrust/detail/config/config.h +43 -0
  1264. cuda/cccl/headers/include/thrust/detail/config/cpp_dialect.h +78 -0
  1265. cuda/cccl/headers/include/thrust/detail/config/device_system.h +55 -0
  1266. cuda/cccl/headers/include/thrust/detail/config/host_system.h +48 -0
  1267. cuda/cccl/headers/include/thrust/detail/config/memory_resource.h +41 -0
  1268. cuda/cccl/headers/include/thrust/detail/config/namespace.h +162 -0
  1269. cuda/cccl/headers/include/thrust/detail/config/simple_defines.h +48 -0
  1270. cuda/cccl/headers/include/thrust/detail/config.h +36 -0
  1271. cuda/cccl/headers/include/thrust/detail/contiguous_storage.h +228 -0
  1272. cuda/cccl/headers/include/thrust/detail/contiguous_storage.inl +273 -0
  1273. cuda/cccl/headers/include/thrust/detail/copy.h +72 -0
  1274. cuda/cccl/headers/include/thrust/detail/copy.inl +129 -0
  1275. cuda/cccl/headers/include/thrust/detail/copy_if.h +62 -0
  1276. cuda/cccl/headers/include/thrust/detail/copy_if.inl +102 -0
  1277. cuda/cccl/headers/include/thrust/detail/count.h +55 -0
  1278. cuda/cccl/headers/include/thrust/detail/count.inl +89 -0
  1279. cuda/cccl/headers/include/thrust/detail/device_delete.inl +52 -0
  1280. cuda/cccl/headers/include/thrust/detail/device_free.inl +47 -0
  1281. cuda/cccl/headers/include/thrust/detail/device_new.inl +61 -0
  1282. cuda/cccl/headers/include/thrust/detail/device_ptr.inl +48 -0
  1283. cuda/cccl/headers/include/thrust/detail/equal.inl +93 -0
  1284. cuda/cccl/headers/include/thrust/detail/event_error.h +160 -0
  1285. cuda/cccl/headers/include/thrust/detail/execute_with_allocator.h +80 -0
  1286. cuda/cccl/headers/include/thrust/detail/execute_with_allocator_fwd.h +61 -0
  1287. cuda/cccl/headers/include/thrust/detail/execution_policy.h +120 -0
  1288. cuda/cccl/headers/include/thrust/detail/extrema.inl +184 -0
  1289. cuda/cccl/headers/include/thrust/detail/fill.inl +86 -0
  1290. cuda/cccl/headers/include/thrust/detail/find.inl +113 -0
  1291. cuda/cccl/headers/include/thrust/detail/for_each.inl +84 -0
  1292. cuda/cccl/headers/include/thrust/detail/function.h +49 -0
  1293. cuda/cccl/headers/include/thrust/detail/functional/actor.h +214 -0
  1294. cuda/cccl/headers/include/thrust/detail/functional/operators.h +386 -0
  1295. cuda/cccl/headers/include/thrust/detail/gather.inl +173 -0
  1296. cuda/cccl/headers/include/thrust/detail/generate.inl +86 -0
  1297. cuda/cccl/headers/include/thrust/detail/get_iterator_value.h +62 -0
  1298. cuda/cccl/headers/include/thrust/detail/inner_product.inl +118 -0
  1299. cuda/cccl/headers/include/thrust/detail/integer_math.h +130 -0
  1300. cuda/cccl/headers/include/thrust/detail/internal_functional.h +328 -0
  1301. cuda/cccl/headers/include/thrust/detail/logical.inl +113 -0
  1302. cuda/cccl/headers/include/thrust/detail/malloc_and_free.h +77 -0
  1303. cuda/cccl/headers/include/thrust/detail/malloc_and_free_fwd.h +45 -0
  1304. cuda/cccl/headers/include/thrust/detail/memory_algorithms.h +209 -0
  1305. cuda/cccl/headers/include/thrust/detail/memory_wrapper.h +40 -0
  1306. cuda/cccl/headers/include/thrust/detail/merge.inl +276 -0
  1307. cuda/cccl/headers/include/thrust/detail/mismatch.inl +94 -0
  1308. cuda/cccl/headers/include/thrust/detail/numeric_wrapper.h +37 -0
  1309. cuda/cccl/headers/include/thrust/detail/overlapped_copy.h +124 -0
  1310. cuda/cccl/headers/include/thrust/detail/partition.inl +378 -0
  1311. cuda/cccl/headers/include/thrust/detail/pointer.h +309 -0
  1312. cuda/cccl/headers/include/thrust/detail/preprocessor.h +652 -0
  1313. cuda/cccl/headers/include/thrust/detail/random_bijection.h +177 -0
  1314. cuda/cccl/headers/include/thrust/detail/range/head_flags.h +116 -0
  1315. cuda/cccl/headers/include/thrust/detail/range/tail_flags.h +130 -0
  1316. cuda/cccl/headers/include/thrust/detail/raw_pointer_cast.h +52 -0
  1317. cuda/cccl/headers/include/thrust/detail/raw_reference_cast.h +192 -0
  1318. cuda/cccl/headers/include/thrust/detail/reduce.inl +377 -0
  1319. cuda/cccl/headers/include/thrust/detail/reference.h +494 -0
  1320. cuda/cccl/headers/include/thrust/detail/reference_forward_declaration.h +35 -0
  1321. cuda/cccl/headers/include/thrust/detail/remove.inl +213 -0
  1322. cuda/cccl/headers/include/thrust/detail/replace.inl +231 -0
  1323. cuda/cccl/headers/include/thrust/detail/reverse.inl +88 -0
  1324. cuda/cccl/headers/include/thrust/detail/scan.inl +518 -0
  1325. cuda/cccl/headers/include/thrust/detail/scatter.inl +157 -0
  1326. cuda/cccl/headers/include/thrust/detail/seq.h +66 -0
  1327. cuda/cccl/headers/include/thrust/detail/sequence.inl +109 -0
  1328. cuda/cccl/headers/include/thrust/detail/set_operations.inl +981 -0
  1329. cuda/cccl/headers/include/thrust/detail/shuffle.inl +86 -0
  1330. cuda/cccl/headers/include/thrust/detail/sort.inl +373 -0
  1331. cuda/cccl/headers/include/thrust/detail/static_assert.h +58 -0
  1332. cuda/cccl/headers/include/thrust/detail/static_map.h +167 -0
  1333. cuda/cccl/headers/include/thrust/detail/swap_ranges.inl +65 -0
  1334. cuda/cccl/headers/include/thrust/detail/tabulate.inl +62 -0
  1335. cuda/cccl/headers/include/thrust/detail/temporary_array.h +153 -0
  1336. cuda/cccl/headers/include/thrust/detail/temporary_array.inl +120 -0
  1337. cuda/cccl/headers/include/thrust/detail/temporary_buffer.h +81 -0
  1338. cuda/cccl/headers/include/thrust/detail/transform_reduce.inl +69 -0
  1339. cuda/cccl/headers/include/thrust/detail/transform_scan.inl +161 -0
  1340. cuda/cccl/headers/include/thrust/detail/trivial_sequence.h +130 -0
  1341. cuda/cccl/headers/include/thrust/detail/tuple_meta_transform.h +61 -0
  1342. cuda/cccl/headers/include/thrust/detail/type_deduction.h +62 -0
  1343. cuda/cccl/headers/include/thrust/detail/type_traits/has_member_function.h +47 -0
  1344. cuda/cccl/headers/include/thrust/detail/type_traits/has_nested_type.h +43 -0
  1345. cuda/cccl/headers/include/thrust/detail/type_traits/is_call_possible.h +167 -0
  1346. cuda/cccl/headers/include/thrust/detail/type_traits/is_commutative.h +69 -0
  1347. cuda/cccl/headers/include/thrust/detail/type_traits/is_metafunction_defined.h +39 -0
  1348. cuda/cccl/headers/include/thrust/detail/type_traits/is_thrust_pointer.h +59 -0
  1349. cuda/cccl/headers/include/thrust/detail/type_traits/iterator/is_output_iterator.h +46 -0
  1350. cuda/cccl/headers/include/thrust/detail/type_traits/minimum_type.h +89 -0
  1351. cuda/cccl/headers/include/thrust/detail/type_traits/pointer_traits.h +332 -0
  1352. cuda/cccl/headers/include/thrust/detail/type_traits.h +136 -0
  1353. cuda/cccl/headers/include/thrust/detail/uninitialized_copy.inl +90 -0
  1354. cuda/cccl/headers/include/thrust/detail/uninitialized_fill.inl +86 -0
  1355. cuda/cccl/headers/include/thrust/detail/unique.inl +373 -0
  1356. cuda/cccl/headers/include/thrust/detail/use_default.h +34 -0
  1357. cuda/cccl/headers/include/thrust/detail/vector_base.h +613 -0
  1358. cuda/cccl/headers/include/thrust/detail/vector_base.inl +1210 -0
  1359. cuda/cccl/headers/include/thrust/device_allocator.h +134 -0
  1360. cuda/cccl/headers/include/thrust/device_delete.h +59 -0
  1361. cuda/cccl/headers/include/thrust/device_free.h +72 -0
  1362. cuda/cccl/headers/include/thrust/device_make_unique.h +56 -0
  1363. cuda/cccl/headers/include/thrust/device_malloc.h +84 -0
  1364. cuda/cccl/headers/include/thrust/device_malloc_allocator.h +190 -0
  1365. cuda/cccl/headers/include/thrust/device_new.h +91 -0
  1366. cuda/cccl/headers/include/thrust/device_new_allocator.h +179 -0
  1367. cuda/cccl/headers/include/thrust/device_ptr.h +196 -0
  1368. cuda/cccl/headers/include/thrust/device_reference.h +983 -0
  1369. cuda/cccl/headers/include/thrust/device_vector.h +576 -0
  1370. cuda/cccl/headers/include/thrust/distance.h +43 -0
  1371. cuda/cccl/headers/include/thrust/equal.h +247 -0
  1372. cuda/cccl/headers/include/thrust/execution_policy.h +251 -0
  1373. cuda/cccl/headers/include/thrust/extrema.h +657 -0
  1374. cuda/cccl/headers/include/thrust/fill.h +200 -0
  1375. cuda/cccl/headers/include/thrust/find.h +382 -0
  1376. cuda/cccl/headers/include/thrust/for_each.h +261 -0
  1377. cuda/cccl/headers/include/thrust/functional.h +395 -0
  1378. cuda/cccl/headers/include/thrust/gather.h +464 -0
  1379. cuda/cccl/headers/include/thrust/generate.h +193 -0
  1380. cuda/cccl/headers/include/thrust/host_vector.h +576 -0
  1381. cuda/cccl/headers/include/thrust/inner_product.h +264 -0
  1382. cuda/cccl/headers/include/thrust/iterator/constant_iterator.h +221 -0
  1383. cuda/cccl/headers/include/thrust/iterator/counting_iterator.h +335 -0
  1384. cuda/cccl/headers/include/thrust/iterator/detail/any_assign.h +48 -0
  1385. cuda/cccl/headers/include/thrust/iterator/detail/any_system_tag.h +43 -0
  1386. cuda/cccl/headers/include/thrust/iterator/detail/device_system_tag.h +38 -0
  1387. cuda/cccl/headers/include/thrust/iterator/detail/host_system_tag.h +38 -0
  1388. cuda/cccl/headers/include/thrust/iterator/detail/iterator_adaptor_base.h +81 -0
  1389. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_system.h +60 -0
  1390. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_traversal.h +65 -0
  1391. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h +57 -0
  1392. cuda/cccl/headers/include/thrust/iterator/detail/iterator_facade_category.h +182 -0
  1393. cuda/cccl/headers/include/thrust/iterator/detail/minimum_system.h +58 -0
  1394. cuda/cccl/headers/include/thrust/iterator/detail/normal_iterator.h +69 -0
  1395. cuda/cccl/headers/include/thrust/iterator/detail/retag.h +104 -0
  1396. cuda/cccl/headers/include/thrust/iterator/detail/tagged_iterator.h +81 -0
  1397. cuda/cccl/headers/include/thrust/iterator/detail/tuple_of_iterator_references.h +174 -0
  1398. cuda/cccl/headers/include/thrust/iterator/discard_iterator.h +163 -0
  1399. cuda/cccl/headers/include/thrust/iterator/iterator_adaptor.h +251 -0
  1400. cuda/cccl/headers/include/thrust/iterator/iterator_categories.h +211 -0
  1401. cuda/cccl/headers/include/thrust/iterator/iterator_facade.h +659 -0
  1402. cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +323 -0
  1403. cuda/cccl/headers/include/thrust/iterator/iterator_traversal_tags.h +64 -0
  1404. cuda/cccl/headers/include/thrust/iterator/offset_iterator.h +194 -0
  1405. cuda/cccl/headers/include/thrust/iterator/permutation_iterator.h +204 -0
  1406. cuda/cccl/headers/include/thrust/iterator/retag.h +72 -0
  1407. cuda/cccl/headers/include/thrust/iterator/reverse_iterator.h +51 -0
  1408. cuda/cccl/headers/include/thrust/iterator/shuffle_iterator.h +185 -0
  1409. cuda/cccl/headers/include/thrust/iterator/strided_iterator.h +152 -0
  1410. cuda/cccl/headers/include/thrust/iterator/tabulate_output_iterator.h +149 -0
  1411. cuda/cccl/headers/include/thrust/iterator/transform_input_output_iterator.h +226 -0
  1412. cuda/cccl/headers/include/thrust/iterator/transform_iterator.h +351 -0
  1413. cuda/cccl/headers/include/thrust/iterator/transform_output_iterator.h +190 -0
  1414. cuda/cccl/headers/include/thrust/iterator/zip_iterator.h +359 -0
  1415. cuda/cccl/headers/include/thrust/logical.h +290 -0
  1416. cuda/cccl/headers/include/thrust/memory.h +299 -0
  1417. cuda/cccl/headers/include/thrust/merge.h +725 -0
  1418. cuda/cccl/headers/include/thrust/mismatch.h +261 -0
  1419. cuda/cccl/headers/include/thrust/mr/allocator.h +229 -0
  1420. cuda/cccl/headers/include/thrust/mr/device_memory_resource.h +41 -0
  1421. cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +526 -0
  1422. cuda/cccl/headers/include/thrust/mr/disjoint_sync_pool.h +118 -0
  1423. cuda/cccl/headers/include/thrust/mr/disjoint_tls_pool.h +67 -0
  1424. cuda/cccl/headers/include/thrust/mr/fancy_pointer_resource.h +67 -0
  1425. cuda/cccl/headers/include/thrust/mr/host_memory_resource.h +38 -0
  1426. cuda/cccl/headers/include/thrust/mr/memory_resource.h +217 -0
  1427. cuda/cccl/headers/include/thrust/mr/new.h +100 -0
  1428. cuda/cccl/headers/include/thrust/mr/polymorphic_adaptor.h +63 -0
  1429. cuda/cccl/headers/include/thrust/mr/pool.h +526 -0
  1430. cuda/cccl/headers/include/thrust/mr/pool_options.h +174 -0
  1431. cuda/cccl/headers/include/thrust/mr/sync_pool.h +114 -0
  1432. cuda/cccl/headers/include/thrust/mr/tls_pool.h +64 -0
  1433. cuda/cccl/headers/include/thrust/mr/universal_memory_resource.h +29 -0
  1434. cuda/cccl/headers/include/thrust/mr/validator.h +56 -0
  1435. cuda/cccl/headers/include/thrust/pair.h +99 -0
  1436. cuda/cccl/headers/include/thrust/partition.h +1391 -0
  1437. cuda/cccl/headers/include/thrust/per_device_resource.h +98 -0
  1438. cuda/cccl/headers/include/thrust/random/detail/discard_block_engine.inl +184 -0
  1439. cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine.inl +155 -0
  1440. cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine_discard.h +104 -0
  1441. cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine.inl +151 -0
  1442. cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h +53 -0
  1443. cuda/cccl/headers/include/thrust/random/detail/mod.h +101 -0
  1444. cuda/cccl/headers/include/thrust/random/detail/normal_distribution.inl +187 -0
  1445. cuda/cccl/headers/include/thrust/random/detail/normal_distribution_base.h +160 -0
  1446. cuda/cccl/headers/include/thrust/random/detail/random_core_access.h +63 -0
  1447. cuda/cccl/headers/include/thrust/random/detail/subtract_with_carry_engine.inl +201 -0
  1448. cuda/cccl/headers/include/thrust/random/detail/uniform_int_distribution.inl +198 -0
  1449. cuda/cccl/headers/include/thrust/random/detail/uniform_real_distribution.inl +200 -0
  1450. cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine.inl +183 -0
  1451. cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine_max.h +187 -0
  1452. cuda/cccl/headers/include/thrust/random/discard_block_engine.h +240 -0
  1453. cuda/cccl/headers/include/thrust/random/linear_congruential_engine.h +289 -0
  1454. cuda/cccl/headers/include/thrust/random/linear_feedback_shift_engine.h +217 -0
  1455. cuda/cccl/headers/include/thrust/random/normal_distribution.h +257 -0
  1456. cuda/cccl/headers/include/thrust/random/subtract_with_carry_engine.h +247 -0
  1457. cuda/cccl/headers/include/thrust/random/uniform_int_distribution.h +261 -0
  1458. cuda/cccl/headers/include/thrust/random/uniform_real_distribution.h +258 -0
  1459. cuda/cccl/headers/include/thrust/random/xor_combine_engine.h +255 -0
  1460. cuda/cccl/headers/include/thrust/random.h +120 -0
  1461. cuda/cccl/headers/include/thrust/reduce.h +1113 -0
  1462. cuda/cccl/headers/include/thrust/remove.h +768 -0
  1463. cuda/cccl/headers/include/thrust/replace.h +826 -0
  1464. cuda/cccl/headers/include/thrust/reverse.h +215 -0
  1465. cuda/cccl/headers/include/thrust/scan.h +1671 -0
  1466. cuda/cccl/headers/include/thrust/scatter.h +446 -0
  1467. cuda/cccl/headers/include/thrust/sequence.h +277 -0
  1468. cuda/cccl/headers/include/thrust/set_operations.h +3026 -0
  1469. cuda/cccl/headers/include/thrust/shuffle.h +182 -0
  1470. cuda/cccl/headers/include/thrust/sort.h +1320 -0
  1471. cuda/cccl/headers/include/thrust/swap.h +147 -0
  1472. cuda/cccl/headers/include/thrust/system/cpp/detail/adjacent_difference.h +30 -0
  1473. cuda/cccl/headers/include/thrust/system/cpp/detail/assign_value.h +30 -0
  1474. cuda/cccl/headers/include/thrust/system/cpp/detail/binary_search.h +32 -0
  1475. cuda/cccl/headers/include/thrust/system/cpp/detail/copy.h +30 -0
  1476. cuda/cccl/headers/include/thrust/system/cpp/detail/copy_if.h +30 -0
  1477. cuda/cccl/headers/include/thrust/system/cpp/detail/count.h +29 -0
  1478. cuda/cccl/headers/include/thrust/system/cpp/detail/equal.h +29 -0
  1479. cuda/cccl/headers/include/thrust/system/cpp/detail/execution_policy.h +109 -0
  1480. cuda/cccl/headers/include/thrust/system/cpp/detail/extrema.h +30 -0
  1481. cuda/cccl/headers/include/thrust/system/cpp/detail/fill.h +29 -0
  1482. cuda/cccl/headers/include/thrust/system/cpp/detail/find.h +30 -0
  1483. cuda/cccl/headers/include/thrust/system/cpp/detail/for_each.h +30 -0
  1484. cuda/cccl/headers/include/thrust/system/cpp/detail/gather.h +29 -0
  1485. cuda/cccl/headers/include/thrust/system/cpp/detail/generate.h +29 -0
  1486. cuda/cccl/headers/include/thrust/system/cpp/detail/get_value.h +30 -0
  1487. cuda/cccl/headers/include/thrust/system/cpp/detail/inner_product.h +29 -0
  1488. cuda/cccl/headers/include/thrust/system/cpp/detail/iter_swap.h +30 -0
  1489. cuda/cccl/headers/include/thrust/system/cpp/detail/logical.h +29 -0
  1490. cuda/cccl/headers/include/thrust/system/cpp/detail/malloc_and_free.h +30 -0
  1491. cuda/cccl/headers/include/thrust/system/cpp/detail/memory.inl +60 -0
  1492. cuda/cccl/headers/include/thrust/system/cpp/detail/merge.h +30 -0
  1493. cuda/cccl/headers/include/thrust/system/cpp/detail/mismatch.h +29 -0
  1494. cuda/cccl/headers/include/thrust/system/cpp/detail/partition.h +30 -0
  1495. cuda/cccl/headers/include/thrust/system/cpp/detail/per_device_resource.h +29 -0
  1496. cuda/cccl/headers/include/thrust/system/cpp/detail/reduce.h +30 -0
  1497. cuda/cccl/headers/include/thrust/system/cpp/detail/reduce_by_key.h +30 -0
  1498. cuda/cccl/headers/include/thrust/system/cpp/detail/remove.h +30 -0
  1499. cuda/cccl/headers/include/thrust/system/cpp/detail/replace.h +29 -0
  1500. cuda/cccl/headers/include/thrust/system/cpp/detail/reverse.h +29 -0
  1501. cuda/cccl/headers/include/thrust/system/cpp/detail/scan.h +30 -0
  1502. cuda/cccl/headers/include/thrust/system/cpp/detail/scan_by_key.h +30 -0
  1503. cuda/cccl/headers/include/thrust/system/cpp/detail/scatter.h +29 -0
  1504. cuda/cccl/headers/include/thrust/system/cpp/detail/sequence.h +29 -0
  1505. cuda/cccl/headers/include/thrust/system/cpp/detail/set_operations.h +30 -0
  1506. cuda/cccl/headers/include/thrust/system/cpp/detail/sort.h +30 -0
  1507. cuda/cccl/headers/include/thrust/system/cpp/detail/swap_ranges.h +29 -0
  1508. cuda/cccl/headers/include/thrust/system/cpp/detail/tabulate.h +29 -0
  1509. cuda/cccl/headers/include/thrust/system/cpp/detail/temporary_buffer.h +29 -0
  1510. cuda/cccl/headers/include/thrust/system/cpp/detail/transform.h +29 -0
  1511. cuda/cccl/headers/include/thrust/system/cpp/detail/transform_reduce.h +29 -0
  1512. cuda/cccl/headers/include/thrust/system/cpp/detail/transform_scan.h +29 -0
  1513. cuda/cccl/headers/include/thrust/system/cpp/detail/uninitialized_copy.h +29 -0
  1514. cuda/cccl/headers/include/thrust/system/cpp/detail/uninitialized_fill.h +29 -0
  1515. cuda/cccl/headers/include/thrust/system/cpp/detail/unique.h +30 -0
  1516. cuda/cccl/headers/include/thrust/system/cpp/detail/unique_by_key.h +30 -0
  1517. cuda/cccl/headers/include/thrust/system/cpp/execution_policy.h +63 -0
  1518. cuda/cccl/headers/include/thrust/system/cpp/memory.h +106 -0
  1519. cuda/cccl/headers/include/thrust/system/cpp/memory_resource.h +72 -0
  1520. cuda/cccl/headers/include/thrust/system/cpp/pointer.h +120 -0
  1521. cuda/cccl/headers/include/thrust/system/cpp/vector.h +96 -0
  1522. cuda/cccl/headers/include/thrust/system/cuda/config.h +126 -0
  1523. cuda/cccl/headers/include/thrust/system/cuda/detail/adjacent_difference.h +219 -0
  1524. cuda/cccl/headers/include/thrust/system/cuda/detail/assign_value.h +124 -0
  1525. cuda/cccl/headers/include/thrust/system/cuda/detail/binary_search.h +29 -0
  1526. cuda/cccl/headers/include/thrust/system/cuda/detail/cdp_dispatch.h +72 -0
  1527. cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +240 -0
  1528. cuda/cccl/headers/include/thrust/system/cuda/detail/copy_if.h +255 -0
  1529. cuda/cccl/headers/include/thrust/system/cuda/detail/core/agent_launcher.h +289 -0
  1530. cuda/cccl/headers/include/thrust/system/cuda/detail/core/triple_chevron_launch.h +191 -0
  1531. cuda/cccl/headers/include/thrust/system/cuda/detail/core/util.h +593 -0
  1532. cuda/cccl/headers/include/thrust/system/cuda/detail/count.h +75 -0
  1533. cuda/cccl/headers/include/thrust/system/cuda/detail/cross_system.h +243 -0
  1534. cuda/cccl/headers/include/thrust/system/cuda/detail/dispatch.h +210 -0
  1535. cuda/cccl/headers/include/thrust/system/cuda/detail/equal.h +64 -0
  1536. cuda/cccl/headers/include/thrust/system/cuda/detail/error.inl +96 -0
  1537. cuda/cccl/headers/include/thrust/system/cuda/detail/execution_policy.h +264 -0
  1538. cuda/cccl/headers/include/thrust/system/cuda/detail/extrema.h +476 -0
  1539. cuda/cccl/headers/include/thrust/system/cuda/detail/fill.h +100 -0
  1540. cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +272 -0
  1541. cuda/cccl/headers/include/thrust/system/cuda/detail/for_each.h +83 -0
  1542. cuda/cccl/headers/include/thrust/system/cuda/detail/gather.h +91 -0
  1543. cuda/cccl/headers/include/thrust/system/cuda/detail/generate.h +60 -0
  1544. cuda/cccl/headers/include/thrust/system/cuda/detail/get_value.h +65 -0
  1545. cuda/cccl/headers/include/thrust/system/cuda/detail/inner_product.h +75 -0
  1546. cuda/cccl/headers/include/thrust/system/cuda/detail/iter_swap.h +80 -0
  1547. cuda/cccl/headers/include/thrust/system/cuda/detail/logical.h +29 -0
  1548. cuda/cccl/headers/include/thrust/system/cuda/detail/make_unsigned_special.h +61 -0
  1549. cuda/cccl/headers/include/thrust/system/cuda/detail/malloc_and_free.h +121 -0
  1550. cuda/cccl/headers/include/thrust/system/cuda/detail/memory.inl +57 -0
  1551. cuda/cccl/headers/include/thrust/system/cuda/detail/merge.h +228 -0
  1552. cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +217 -0
  1553. cuda/cccl/headers/include/thrust/system/cuda/detail/parallel_for.h +81 -0
  1554. cuda/cccl/headers/include/thrust/system/cuda/detail/partition.h +405 -0
  1555. cuda/cccl/headers/include/thrust/system/cuda/detail/per_device_resource.h +72 -0
  1556. cuda/cccl/headers/include/thrust/system/cuda/detail/reduce.h +785 -0
  1557. cuda/cccl/headers/include/thrust/system/cuda/detail/reduce_by_key.h +1001 -0
  1558. cuda/cccl/headers/include/thrust/system/cuda/detail/remove.h +107 -0
  1559. cuda/cccl/headers/include/thrust/system/cuda/detail/replace.h +122 -0
  1560. cuda/cccl/headers/include/thrust/system/cuda/detail/reverse.h +87 -0
  1561. cuda/cccl/headers/include/thrust/system/cuda/detail/scan.h +342 -0
  1562. cuda/cccl/headers/include/thrust/system/cuda/detail/scan_by_key.h +414 -0
  1563. cuda/cccl/headers/include/thrust/system/cuda/detail/scatter.h +91 -0
  1564. cuda/cccl/headers/include/thrust/system/cuda/detail/sequence.h +29 -0
  1565. cuda/cccl/headers/include/thrust/system/cuda/detail/set_operations.h +1734 -0
  1566. cuda/cccl/headers/include/thrust/system/cuda/detail/sort.h +470 -0
  1567. cuda/cccl/headers/include/thrust/system/cuda/detail/swap_ranges.h +98 -0
  1568. cuda/cccl/headers/include/thrust/system/cuda/detail/tabulate.h +75 -0
  1569. cuda/cccl/headers/include/thrust/system/cuda/detail/temporary_buffer.h +132 -0
  1570. cuda/cccl/headers/include/thrust/system/cuda/detail/terminate.h +53 -0
  1571. cuda/cccl/headers/include/thrust/system/cuda/detail/transform.h +429 -0
  1572. cuda/cccl/headers/include/thrust/system/cuda/detail/transform_reduce.h +143 -0
  1573. cuda/cccl/headers/include/thrust/system/cuda/detail/transform_scan.h +119 -0
  1574. cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_copy.h +117 -0
  1575. cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_fill.h +105 -0
  1576. cuda/cccl/headers/include/thrust/system/cuda/detail/unique.h +289 -0
  1577. cuda/cccl/headers/include/thrust/system/cuda/detail/unique_by_key.h +310 -0
  1578. cuda/cccl/headers/include/thrust/system/cuda/detail/util.h +253 -0
  1579. cuda/cccl/headers/include/thrust/system/cuda/error.h +168 -0
  1580. cuda/cccl/headers/include/thrust/system/cuda/execution_policy.h +15 -0
  1581. cuda/cccl/headers/include/thrust/system/cuda/memory.h +122 -0
  1582. cuda/cccl/headers/include/thrust/system/cuda/memory_resource.h +122 -0
  1583. cuda/cccl/headers/include/thrust/system/cuda/pointer.h +160 -0
  1584. cuda/cccl/headers/include/thrust/system/cuda/vector.h +108 -0
  1585. cuda/cccl/headers/include/thrust/system/detail/adl/adjacent_difference.h +51 -0
  1586. cuda/cccl/headers/include/thrust/system/detail/adl/assign_value.h +51 -0
  1587. cuda/cccl/headers/include/thrust/system/detail/adl/binary_search.h +51 -0
  1588. cuda/cccl/headers/include/thrust/system/detail/adl/copy.h +51 -0
  1589. cuda/cccl/headers/include/thrust/system/detail/adl/copy_if.h +52 -0
  1590. cuda/cccl/headers/include/thrust/system/detail/adl/count.h +51 -0
  1591. cuda/cccl/headers/include/thrust/system/detail/adl/equal.h +51 -0
  1592. cuda/cccl/headers/include/thrust/system/detail/adl/extrema.h +51 -0
  1593. cuda/cccl/headers/include/thrust/system/detail/adl/fill.h +51 -0
  1594. cuda/cccl/headers/include/thrust/system/detail/adl/find.h +51 -0
  1595. cuda/cccl/headers/include/thrust/system/detail/adl/for_each.h +51 -0
  1596. cuda/cccl/headers/include/thrust/system/detail/adl/gather.h +51 -0
  1597. cuda/cccl/headers/include/thrust/system/detail/adl/generate.h +51 -0
  1598. cuda/cccl/headers/include/thrust/system/detail/adl/get_value.h +51 -0
  1599. cuda/cccl/headers/include/thrust/system/detail/adl/inner_product.h +51 -0
  1600. cuda/cccl/headers/include/thrust/system/detail/adl/iter_swap.h +51 -0
  1601. cuda/cccl/headers/include/thrust/system/detail/adl/logical.h +51 -0
  1602. cuda/cccl/headers/include/thrust/system/detail/adl/malloc_and_free.h +51 -0
  1603. cuda/cccl/headers/include/thrust/system/detail/adl/merge.h +51 -0
  1604. cuda/cccl/headers/include/thrust/system/detail/adl/mismatch.h +51 -0
  1605. cuda/cccl/headers/include/thrust/system/detail/adl/partition.h +51 -0
  1606. cuda/cccl/headers/include/thrust/system/detail/adl/per_device_resource.h +51 -0
  1607. cuda/cccl/headers/include/thrust/system/detail/adl/reduce.h +51 -0
  1608. cuda/cccl/headers/include/thrust/system/detail/adl/reduce_by_key.h +51 -0
  1609. cuda/cccl/headers/include/thrust/system/detail/adl/remove.h +51 -0
  1610. cuda/cccl/headers/include/thrust/system/detail/adl/replace.h +51 -0
  1611. cuda/cccl/headers/include/thrust/system/detail/adl/reverse.h +51 -0
  1612. cuda/cccl/headers/include/thrust/system/detail/adl/scan.h +51 -0
  1613. cuda/cccl/headers/include/thrust/system/detail/adl/scan_by_key.h +51 -0
  1614. cuda/cccl/headers/include/thrust/system/detail/adl/scatter.h +51 -0
  1615. cuda/cccl/headers/include/thrust/system/detail/adl/sequence.h +51 -0
  1616. cuda/cccl/headers/include/thrust/system/detail/adl/set_operations.h +51 -0
  1617. cuda/cccl/headers/include/thrust/system/detail/adl/sort.h +51 -0
  1618. cuda/cccl/headers/include/thrust/system/detail/adl/swap_ranges.h +51 -0
  1619. cuda/cccl/headers/include/thrust/system/detail/adl/tabulate.h +51 -0
  1620. cuda/cccl/headers/include/thrust/system/detail/adl/temporary_buffer.h +51 -0
  1621. cuda/cccl/headers/include/thrust/system/detail/adl/transform.h +51 -0
  1622. cuda/cccl/headers/include/thrust/system/detail/adl/transform_reduce.h +51 -0
  1623. cuda/cccl/headers/include/thrust/system/detail/adl/transform_scan.h +51 -0
  1624. cuda/cccl/headers/include/thrust/system/detail/adl/uninitialized_copy.h +51 -0
  1625. cuda/cccl/headers/include/thrust/system/detail/adl/uninitialized_fill.h +51 -0
  1626. cuda/cccl/headers/include/thrust/system/detail/adl/unique.h +51 -0
  1627. cuda/cccl/headers/include/thrust/system/detail/adl/unique_by_key.h +51 -0
  1628. cuda/cccl/headers/include/thrust/system/detail/bad_alloc.h +61 -0
  1629. cuda/cccl/headers/include/thrust/system/detail/errno.h +120 -0
  1630. cuda/cccl/headers/include/thrust/system/detail/error_category.inl +302 -0
  1631. cuda/cccl/headers/include/thrust/system/detail/error_code.inl +173 -0
  1632. cuda/cccl/headers/include/thrust/system/detail/error_condition.inl +121 -0
  1633. cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.h +53 -0
  1634. cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.inl +79 -0
  1635. cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.h +161 -0
  1636. cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.inl +384 -0
  1637. cuda/cccl/headers/include/thrust/system/detail/generic/copy.h +45 -0
  1638. cuda/cccl/headers/include/thrust/system/detail/generic/copy.inl +64 -0
  1639. cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.h +58 -0
  1640. cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.inl +146 -0
  1641. cuda/cccl/headers/include/thrust/system/detail/generic/count.h +48 -0
  1642. cuda/cccl/headers/include/thrust/system/detail/generic/count.inl +84 -0
  1643. cuda/cccl/headers/include/thrust/system/detail/generic/equal.h +49 -0
  1644. cuda/cccl/headers/include/thrust/system/detail/generic/equal.inl +60 -0
  1645. cuda/cccl/headers/include/thrust/system/detail/generic/extrema.h +66 -0
  1646. cuda/cccl/headers/include/thrust/system/detail/generic/extrema.inl +252 -0
  1647. cuda/cccl/headers/include/thrust/system/detail/generic/fill.h +54 -0
  1648. cuda/cccl/headers/include/thrust/system/detail/generic/find.h +49 -0
  1649. cuda/cccl/headers/include/thrust/system/detail/generic/find.inl +137 -0
  1650. cuda/cccl/headers/include/thrust/system/detail/generic/for_each.h +58 -0
  1651. cuda/cccl/headers/include/thrust/system/detail/generic/gather.h +73 -0
  1652. cuda/cccl/headers/include/thrust/system/detail/generic/gather.inl +96 -0
  1653. cuda/cccl/headers/include/thrust/system/detail/generic/generate.h +45 -0
  1654. cuda/cccl/headers/include/thrust/system/detail/generic/generate.inl +63 -0
  1655. cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.h +60 -0
  1656. cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.inl +72 -0
  1657. cuda/cccl/headers/include/thrust/system/detail/generic/logical.h +59 -0
  1658. cuda/cccl/headers/include/thrust/system/detail/generic/memory.h +64 -0
  1659. cuda/cccl/headers/include/thrust/system/detail/generic/memory.inl +86 -0
  1660. cuda/cccl/headers/include/thrust/system/detail/generic/merge.h +99 -0
  1661. cuda/cccl/headers/include/thrust/system/detail/generic/merge.inl +148 -0
  1662. cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.h +49 -0
  1663. cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.inl +68 -0
  1664. cuda/cccl/headers/include/thrust/system/detail/generic/partition.h +129 -0
  1665. cuda/cccl/headers/include/thrust/system/detail/generic/partition.inl +207 -0
  1666. cuda/cccl/headers/include/thrust/system/detail/generic/per_device_resource.h +43 -0
  1667. cuda/cccl/headers/include/thrust/system/detail/generic/reduce.h +71 -0
  1668. cuda/cccl/headers/include/thrust/system/detail/generic/reduce.inl +100 -0
  1669. cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.h +83 -0
  1670. cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.inl +186 -0
  1671. cuda/cccl/headers/include/thrust/system/detail/generic/remove.h +86 -0
  1672. cuda/cccl/headers/include/thrust/system/detail/generic/remove.inl +121 -0
  1673. cuda/cccl/headers/include/thrust/system/detail/generic/replace.h +95 -0
  1674. cuda/cccl/headers/include/thrust/system/detail/generic/replace.inl +175 -0
  1675. cuda/cccl/headers/include/thrust/system/detail/generic/reverse.h +48 -0
  1676. cuda/cccl/headers/include/thrust/system/detail/generic/reverse.inl +67 -0
  1677. cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.h +63 -0
  1678. cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.inl +126 -0
  1679. cuda/cccl/headers/include/thrust/system/detail/generic/scan.h +72 -0
  1680. cuda/cccl/headers/include/thrust/system/detail/generic/scan.inl +85 -0
  1681. cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.h +126 -0
  1682. cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.inl +232 -0
  1683. cuda/cccl/headers/include/thrust/system/detail/generic/scatter.h +73 -0
  1684. cuda/cccl/headers/include/thrust/system/detail/generic/scatter.inl +85 -0
  1685. cuda/cccl/headers/include/thrust/system/detail/generic/select_system.h +104 -0
  1686. cuda/cccl/headers/include/thrust/system/detail/generic/sequence.h +70 -0
  1687. cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.h +282 -0
  1688. cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.inl +476 -0
  1689. cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.h +54 -0
  1690. cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.inl +125 -0
  1691. cuda/cccl/headers/include/thrust/system/detail/generic/sort.h +113 -0
  1692. cuda/cccl/headers/include/thrust/system/detail/generic/sort.inl +175 -0
  1693. cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.h +44 -0
  1694. cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.inl +76 -0
  1695. cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.h +41 -0
  1696. cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.inl +54 -0
  1697. cuda/cccl/headers/include/thrust/system/detail/generic/tag.h +47 -0
  1698. cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.h +54 -0
  1699. cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.inl +82 -0
  1700. cuda/cccl/headers/include/thrust/system/detail/generic/transform.h +395 -0
  1701. cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.h +50 -0
  1702. cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.inl +56 -0
  1703. cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.h +80 -0
  1704. cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.inl +113 -0
  1705. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.h +45 -0
  1706. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.inl +166 -0
  1707. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.h +45 -0
  1708. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.inl +115 -0
  1709. cuda/cccl/headers/include/thrust/system/detail/generic/unique.h +71 -0
  1710. cuda/cccl/headers/include/thrust/system/detail/generic/unique.inl +113 -0
  1711. cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.h +81 -0
  1712. cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.inl +126 -0
  1713. cuda/cccl/headers/include/thrust/system/detail/internal/decompose.h +117 -0
  1714. cuda/cccl/headers/include/thrust/system/detail/sequential/adjacent_difference.h +70 -0
  1715. cuda/cccl/headers/include/thrust/system/detail/sequential/assign_value.h +42 -0
  1716. cuda/cccl/headers/include/thrust/system/detail/sequential/binary_search.h +136 -0
  1717. cuda/cccl/headers/include/thrust/system/detail/sequential/copy.h +49 -0
  1718. cuda/cccl/headers/include/thrust/system/detail/sequential/copy.inl +119 -0
  1719. cuda/cccl/headers/include/thrust/system/detail/sequential/copy_backward.h +49 -0
  1720. cuda/cccl/headers/include/thrust/system/detail/sequential/copy_if.h +71 -0
  1721. cuda/cccl/headers/include/thrust/system/detail/sequential/count.h +29 -0
  1722. cuda/cccl/headers/include/thrust/system/detail/sequential/equal.h +29 -0
  1723. cuda/cccl/headers/include/thrust/system/detail/sequential/execution_policy.h +52 -0
  1724. cuda/cccl/headers/include/thrust/system/detail/sequential/extrema.h +110 -0
  1725. cuda/cccl/headers/include/thrust/system/detail/sequential/fill.h +29 -0
  1726. cuda/cccl/headers/include/thrust/system/detail/sequential/find.h +62 -0
  1727. cuda/cccl/headers/include/thrust/system/detail/sequential/for_each.h +74 -0
  1728. cuda/cccl/headers/include/thrust/system/detail/sequential/gather.h +29 -0
  1729. cuda/cccl/headers/include/thrust/system/detail/sequential/general_copy.h +123 -0
  1730. cuda/cccl/headers/include/thrust/system/detail/sequential/generate.h +29 -0
  1731. cuda/cccl/headers/include/thrust/system/detail/sequential/get_value.h +43 -0
  1732. cuda/cccl/headers/include/thrust/system/detail/sequential/inner_product.h +29 -0
  1733. cuda/cccl/headers/include/thrust/system/detail/sequential/insertion_sort.h +141 -0
  1734. cuda/cccl/headers/include/thrust/system/detail/sequential/iter_swap.h +45 -0
  1735. cuda/cccl/headers/include/thrust/system/detail/sequential/logical.h +29 -0
  1736. cuda/cccl/headers/include/thrust/system/detail/sequential/malloc_and_free.h +50 -0
  1737. cuda/cccl/headers/include/thrust/system/detail/sequential/merge.h +75 -0
  1738. cuda/cccl/headers/include/thrust/system/detail/sequential/merge.inl +145 -0
  1739. cuda/cccl/headers/include/thrust/system/detail/sequential/mismatch.h +29 -0
  1740. cuda/cccl/headers/include/thrust/system/detail/sequential/partition.h +301 -0
  1741. cuda/cccl/headers/include/thrust/system/detail/sequential/per_device_resource.h +29 -0
  1742. cuda/cccl/headers/include/thrust/system/detail/sequential/reduce.h +64 -0
  1743. cuda/cccl/headers/include/thrust/system/detail/sequential/reduce_by_key.h +98 -0
  1744. cuda/cccl/headers/include/thrust/system/detail/sequential/remove.h +179 -0
  1745. cuda/cccl/headers/include/thrust/system/detail/sequential/replace.h +29 -0
  1746. cuda/cccl/headers/include/thrust/system/detail/sequential/reverse.h +29 -0
  1747. cuda/cccl/headers/include/thrust/system/detail/sequential/scan.h +154 -0
  1748. cuda/cccl/headers/include/thrust/system/detail/sequential/scan_by_key.h +145 -0
  1749. cuda/cccl/headers/include/thrust/system/detail/sequential/scatter.h +29 -0
  1750. cuda/cccl/headers/include/thrust/system/detail/sequential/sequence.h +29 -0
  1751. cuda/cccl/headers/include/thrust/system/detail/sequential/set_operations.h +206 -0
  1752. cuda/cccl/headers/include/thrust/system/detail/sequential/sort.h +59 -0
  1753. cuda/cccl/headers/include/thrust/system/detail/sequential/sort.inl +116 -0
  1754. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.h +55 -0
  1755. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.inl +356 -0
  1756. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.h +48 -0
  1757. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.inl +124 -0
  1758. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.h +48 -0
  1759. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.inl +586 -0
  1760. cuda/cccl/headers/include/thrust/system/detail/sequential/swap_ranges.h +29 -0
  1761. cuda/cccl/headers/include/thrust/system/detail/sequential/tabulate.h +29 -0
  1762. cuda/cccl/headers/include/thrust/system/detail/sequential/temporary_buffer.h +29 -0
  1763. cuda/cccl/headers/include/thrust/system/detail/sequential/transform.h +29 -0
  1764. cuda/cccl/headers/include/thrust/system/detail/sequential/transform_reduce.h +29 -0
  1765. cuda/cccl/headers/include/thrust/system/detail/sequential/transform_scan.h +29 -0
  1766. cuda/cccl/headers/include/thrust/system/detail/sequential/trivial_copy.h +58 -0
  1767. cuda/cccl/headers/include/thrust/system/detail/sequential/uninitialized_copy.h +29 -0
  1768. cuda/cccl/headers/include/thrust/system/detail/sequential/uninitialized_fill.h +29 -0
  1769. cuda/cccl/headers/include/thrust/system/detail/sequential/unique.h +115 -0
  1770. cuda/cccl/headers/include/thrust/system/detail/sequential/unique_by_key.h +106 -0
  1771. cuda/cccl/headers/include/thrust/system/detail/system_error.inl +108 -0
  1772. cuda/cccl/headers/include/thrust/system/error_code.h +512 -0
  1773. cuda/cccl/headers/include/thrust/system/omp/detail/adjacent_difference.h +54 -0
  1774. cuda/cccl/headers/include/thrust/system/omp/detail/assign_value.h +30 -0
  1775. cuda/cccl/headers/include/thrust/system/omp/detail/binary_search.h +77 -0
  1776. cuda/cccl/headers/include/thrust/system/omp/detail/copy.h +50 -0
  1777. cuda/cccl/headers/include/thrust/system/omp/detail/copy.inl +74 -0
  1778. cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.h +56 -0
  1779. cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.inl +59 -0
  1780. cuda/cccl/headers/include/thrust/system/omp/detail/count.h +30 -0
  1781. cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.h +50 -0
  1782. cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.inl +65 -0
  1783. cuda/cccl/headers/include/thrust/system/omp/detail/equal.h +30 -0
  1784. cuda/cccl/headers/include/thrust/system/omp/detail/execution_policy.h +127 -0
  1785. cuda/cccl/headers/include/thrust/system/omp/detail/extrema.h +66 -0
  1786. cuda/cccl/headers/include/thrust/system/omp/detail/fill.h +30 -0
  1787. cuda/cccl/headers/include/thrust/system/omp/detail/find.h +53 -0
  1788. cuda/cccl/headers/include/thrust/system/omp/detail/for_each.h +56 -0
  1789. cuda/cccl/headers/include/thrust/system/omp/detail/for_each.inl +87 -0
  1790. cuda/cccl/headers/include/thrust/system/omp/detail/gather.h +30 -0
  1791. cuda/cccl/headers/include/thrust/system/omp/detail/generate.h +30 -0
  1792. cuda/cccl/headers/include/thrust/system/omp/detail/get_value.h +30 -0
  1793. cuda/cccl/headers/include/thrust/system/omp/detail/inner_product.h +30 -0
  1794. cuda/cccl/headers/include/thrust/system/omp/detail/iter_swap.h +30 -0
  1795. cuda/cccl/headers/include/thrust/system/omp/detail/logical.h +30 -0
  1796. cuda/cccl/headers/include/thrust/system/omp/detail/malloc_and_free.h +30 -0
  1797. cuda/cccl/headers/include/thrust/system/omp/detail/memory.inl +93 -0
  1798. cuda/cccl/headers/include/thrust/system/omp/detail/merge.h +30 -0
  1799. cuda/cccl/headers/include/thrust/system/omp/detail/mismatch.h +30 -0
  1800. cuda/cccl/headers/include/thrust/system/omp/detail/partition.h +88 -0
  1801. cuda/cccl/headers/include/thrust/system/omp/detail/partition.inl +102 -0
  1802. cuda/cccl/headers/include/thrust/system/omp/detail/per_device_resource.h +29 -0
  1803. cuda/cccl/headers/include/thrust/system/omp/detail/pragma_omp.h +54 -0
  1804. cuda/cccl/headers/include/thrust/system/omp/detail/reduce.h +54 -0
  1805. cuda/cccl/headers/include/thrust/system/omp/detail/reduce.inl +78 -0
  1806. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.h +64 -0
  1807. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.inl +65 -0
  1808. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.h +59 -0
  1809. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.inl +103 -0
  1810. cuda/cccl/headers/include/thrust/system/omp/detail/remove.h +72 -0
  1811. cuda/cccl/headers/include/thrust/system/omp/detail/remove.inl +87 -0
  1812. cuda/cccl/headers/include/thrust/system/omp/detail/replace.h +30 -0
  1813. cuda/cccl/headers/include/thrust/system/omp/detail/reverse.h +30 -0
  1814. cuda/cccl/headers/include/thrust/system/omp/detail/scan.h +30 -0
  1815. cuda/cccl/headers/include/thrust/system/omp/detail/scan_by_key.h +30 -0
  1816. cuda/cccl/headers/include/thrust/system/omp/detail/scatter.h +30 -0
  1817. cuda/cccl/headers/include/thrust/system/omp/detail/sequence.h +30 -0
  1818. cuda/cccl/headers/include/thrust/system/omp/detail/set_operations.h +30 -0
  1819. cuda/cccl/headers/include/thrust/system/omp/detail/sort.h +60 -0
  1820. cuda/cccl/headers/include/thrust/system/omp/detail/sort.inl +265 -0
  1821. cuda/cccl/headers/include/thrust/system/omp/detail/swap_ranges.h +30 -0
  1822. cuda/cccl/headers/include/thrust/system/omp/detail/tabulate.h +30 -0
  1823. cuda/cccl/headers/include/thrust/system/omp/detail/temporary_buffer.h +29 -0
  1824. cuda/cccl/headers/include/thrust/system/omp/detail/transform.h +30 -0
  1825. cuda/cccl/headers/include/thrust/system/omp/detail/transform_reduce.h +30 -0
  1826. cuda/cccl/headers/include/thrust/system/omp/detail/transform_scan.h +30 -0
  1827. cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_copy.h +30 -0
  1828. cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_fill.h +30 -0
  1829. cuda/cccl/headers/include/thrust/system/omp/detail/unique.h +60 -0
  1830. cuda/cccl/headers/include/thrust/system/omp/detail/unique.inl +71 -0
  1831. cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.h +67 -0
  1832. cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.inl +75 -0
  1833. cuda/cccl/headers/include/thrust/system/omp/execution_policy.h +62 -0
  1834. cuda/cccl/headers/include/thrust/system/omp/memory.h +111 -0
  1835. cuda/cccl/headers/include/thrust/system/omp/memory_resource.h +75 -0
  1836. cuda/cccl/headers/include/thrust/system/omp/pointer.h +124 -0
  1837. cuda/cccl/headers/include/thrust/system/omp/vector.h +99 -0
  1838. cuda/cccl/headers/include/thrust/system/system_error.h +185 -0
  1839. cuda/cccl/headers/include/thrust/system/tbb/detail/adjacent_difference.h +54 -0
  1840. cuda/cccl/headers/include/thrust/system/tbb/detail/assign_value.h +30 -0
  1841. cuda/cccl/headers/include/thrust/system/tbb/detail/binary_search.h +30 -0
  1842. cuda/cccl/headers/include/thrust/system/tbb/detail/copy.h +50 -0
  1843. cuda/cccl/headers/include/thrust/system/tbb/detail/copy.inl +73 -0
  1844. cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.h +47 -0
  1845. cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.inl +136 -0
  1846. cuda/cccl/headers/include/thrust/system/tbb/detail/count.h +30 -0
  1847. cuda/cccl/headers/include/thrust/system/tbb/detail/equal.h +30 -0
  1848. cuda/cccl/headers/include/thrust/system/tbb/detail/execution_policy.h +109 -0
  1849. cuda/cccl/headers/include/thrust/system/tbb/detail/extrema.h +66 -0
  1850. cuda/cccl/headers/include/thrust/system/tbb/detail/fill.h +30 -0
  1851. cuda/cccl/headers/include/thrust/system/tbb/detail/find.h +49 -0
  1852. cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.h +51 -0
  1853. cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.inl +91 -0
  1854. cuda/cccl/headers/include/thrust/system/tbb/detail/gather.h +30 -0
  1855. cuda/cccl/headers/include/thrust/system/tbb/detail/generate.h +30 -0
  1856. cuda/cccl/headers/include/thrust/system/tbb/detail/get_value.h +30 -0
  1857. cuda/cccl/headers/include/thrust/system/tbb/detail/inner_product.h +30 -0
  1858. cuda/cccl/headers/include/thrust/system/tbb/detail/iter_swap.h +30 -0
  1859. cuda/cccl/headers/include/thrust/system/tbb/detail/logical.h +30 -0
  1860. cuda/cccl/headers/include/thrust/system/tbb/detail/malloc_and_free.h +30 -0
  1861. cuda/cccl/headers/include/thrust/system/tbb/detail/memory.inl +94 -0
  1862. cuda/cccl/headers/include/thrust/system/tbb/detail/merge.h +77 -0
  1863. cuda/cccl/headers/include/thrust/system/tbb/detail/merge.inl +327 -0
  1864. cuda/cccl/headers/include/thrust/system/tbb/detail/mismatch.h +30 -0
  1865. cuda/cccl/headers/include/thrust/system/tbb/detail/partition.h +84 -0
  1866. cuda/cccl/headers/include/thrust/system/tbb/detail/partition.inl +98 -0
  1867. cuda/cccl/headers/include/thrust/system/tbb/detail/per_device_resource.h +29 -0
  1868. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.h +54 -0
  1869. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.inl +137 -0
  1870. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.h +61 -0
  1871. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.inl +400 -0
  1872. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_intervals.h +140 -0
  1873. cuda/cccl/headers/include/thrust/system/tbb/detail/remove.h +76 -0
  1874. cuda/cccl/headers/include/thrust/system/tbb/detail/remove.inl +87 -0
  1875. cuda/cccl/headers/include/thrust/system/tbb/detail/replace.h +30 -0
  1876. cuda/cccl/headers/include/thrust/system/tbb/detail/reverse.h +30 -0
  1877. cuda/cccl/headers/include/thrust/system/tbb/detail/scan.h +59 -0
  1878. cuda/cccl/headers/include/thrust/system/tbb/detail/scan.inl +312 -0
  1879. cuda/cccl/headers/include/thrust/system/tbb/detail/scan_by_key.h +30 -0
  1880. cuda/cccl/headers/include/thrust/system/tbb/detail/scatter.h +30 -0
  1881. cuda/cccl/headers/include/thrust/system/tbb/detail/sequence.h +30 -0
  1882. cuda/cccl/headers/include/thrust/system/tbb/detail/set_operations.h +30 -0
  1883. cuda/cccl/headers/include/thrust/system/tbb/detail/sort.h +60 -0
  1884. cuda/cccl/headers/include/thrust/system/tbb/detail/sort.inl +295 -0
  1885. cuda/cccl/headers/include/thrust/system/tbb/detail/swap_ranges.h +30 -0
  1886. cuda/cccl/headers/include/thrust/system/tbb/detail/tabulate.h +30 -0
  1887. cuda/cccl/headers/include/thrust/system/tbb/detail/temporary_buffer.h +29 -0
  1888. cuda/cccl/headers/include/thrust/system/tbb/detail/transform.h +30 -0
  1889. cuda/cccl/headers/include/thrust/system/tbb/detail/transform_reduce.h +30 -0
  1890. cuda/cccl/headers/include/thrust/system/tbb/detail/transform_scan.h +30 -0
  1891. cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_copy.h +30 -0
  1892. cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_fill.h +30 -0
  1893. cuda/cccl/headers/include/thrust/system/tbb/detail/unique.h +60 -0
  1894. cuda/cccl/headers/include/thrust/system/tbb/detail/unique.inl +71 -0
  1895. cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.h +67 -0
  1896. cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.inl +75 -0
  1897. cuda/cccl/headers/include/thrust/system/tbb/execution_policy.h +62 -0
  1898. cuda/cccl/headers/include/thrust/system/tbb/memory.h +111 -0
  1899. cuda/cccl/headers/include/thrust/system/tbb/memory_resource.h +75 -0
  1900. cuda/cccl/headers/include/thrust/system/tbb/pointer.h +124 -0
  1901. cuda/cccl/headers/include/thrust/system/tbb/vector.h +99 -0
  1902. cuda/cccl/headers/include/thrust/system_error.h +57 -0
  1903. cuda/cccl/headers/include/thrust/tabulate.h +125 -0
  1904. cuda/cccl/headers/include/thrust/transform.h +1045 -0
  1905. cuda/cccl/headers/include/thrust/transform_reduce.h +190 -0
  1906. cuda/cccl/headers/include/thrust/transform_scan.h +442 -0
  1907. cuda/cccl/headers/include/thrust/tuple.h +139 -0
  1908. cuda/cccl/headers/include/thrust/type_traits/integer_sequence.h +261 -0
  1909. cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +154 -0
  1910. cuda/cccl/headers/include/thrust/type_traits/is_execution_policy.h +65 -0
  1911. cuda/cccl/headers/include/thrust/type_traits/is_operator_less_or_greater_function_object.h +184 -0
  1912. cuda/cccl/headers/include/thrust/type_traits/is_operator_plus_function_object.h +116 -0
  1913. cuda/cccl/headers/include/thrust/type_traits/is_trivially_relocatable.h +336 -0
  1914. cuda/cccl/headers/include/thrust/type_traits/logical_metafunctions.h +42 -0
  1915. cuda/cccl/headers/include/thrust/type_traits/unwrap_contiguous_iterator.h +96 -0
  1916. cuda/cccl/headers/include/thrust/uninitialized_copy.h +300 -0
  1917. cuda/cccl/headers/include/thrust/uninitialized_fill.h +268 -0
  1918. cuda/cccl/headers/include/thrust/unique.h +1088 -0
  1919. cuda/cccl/headers/include/thrust/universal_allocator.h +93 -0
  1920. cuda/cccl/headers/include/thrust/universal_ptr.h +34 -0
  1921. cuda/cccl/headers/include/thrust/universal_vector.h +71 -0
  1922. cuda/cccl/headers/include/thrust/version.h +93 -0
  1923. cuda/cccl/headers/include/thrust/zip_function.h +176 -0
  1924. cuda/cccl/headers/include_paths.py +51 -0
  1925. cuda/cccl/parallel/__init__.py +9 -0
  1926. cuda/cccl/parallel/experimental/.gitignore +4 -0
  1927. cuda/cccl/parallel/experimental/__init__.py +73 -0
  1928. cuda/cccl/parallel/experimental/_bindings.py +79 -0
  1929. cuda/cccl/parallel/experimental/_bindings.pyi +405 -0
  1930. cuda/cccl/parallel/experimental/_bindings_impl.pyx +1984 -0
  1931. cuda/cccl/parallel/experimental/_caching.py +71 -0
  1932. cuda/cccl/parallel/experimental/_cccl_interop.py +422 -0
  1933. cuda/cccl/parallel/experimental/_utils/__init__.py +0 -0
  1934. cuda/cccl/parallel/experimental/_utils/protocols.py +132 -0
  1935. cuda/cccl/parallel/experimental/_utils/temp_storage_buffer.py +86 -0
  1936. cuda/cccl/parallel/experimental/algorithms/__init__.py +50 -0
  1937. cuda/cccl/parallel/experimental/algorithms/_histogram.py +243 -0
  1938. cuda/cccl/parallel/experimental/algorithms/_merge_sort.py +225 -0
  1939. cuda/cccl/parallel/experimental/algorithms/_radix_sort.py +312 -0
  1940. cuda/cccl/parallel/experimental/algorithms/_reduce.py +184 -0
  1941. cuda/cccl/parallel/experimental/algorithms/_scan.py +261 -0
  1942. cuda/cccl/parallel/experimental/algorithms/_segmented_reduce.py +257 -0
  1943. cuda/cccl/parallel/experimental/algorithms/_transform.py +308 -0
  1944. cuda/cccl/parallel/experimental/algorithms/_unique_by_key.py +252 -0
  1945. cuda/cccl/parallel/experimental/cccl/.gitkeep +0 -0
  1946. cuda/cccl/parallel/experimental/cu12/_bindings_impl.cpython-310-aarch64-linux-gnu.so +0 -0
  1947. cuda/cccl/parallel/experimental/cu12/cccl/libcccl.c.parallel.so +0 -0
  1948. cuda/cccl/parallel/experimental/cu13/_bindings_impl.cpython-310-aarch64-linux-gnu.so +0 -0
  1949. cuda/cccl/parallel/experimental/cu13/cccl/libcccl.c.parallel.so +0 -0
  1950. cuda/cccl/parallel/experimental/iterators/__init__.py +19 -0
  1951. cuda/cccl/parallel/experimental/iterators/_factories.py +191 -0
  1952. cuda/cccl/parallel/experimental/iterators/_iterators.py +612 -0
  1953. cuda/cccl/parallel/experimental/iterators/_zip_iterator.py +199 -0
  1954. cuda/cccl/parallel/experimental/numba_utils.py +53 -0
  1955. cuda/cccl/parallel/experimental/op.py +3 -0
  1956. cuda/cccl/parallel/experimental/struct.py +272 -0
  1957. cuda/cccl/parallel/experimental/typing.py +35 -0
  1958. cuda/cccl/py.typed +0 -0
  1959. cuda_cccl-0.1.3.2.0.dev438.dist-info/METADATA +42 -0
  1960. cuda_cccl-0.1.3.2.0.dev438.dist-info/RECORD +1962 -0
  1961. cuda_cccl-0.1.3.2.0.dev438.dist-info/WHEEL +5 -0
  1962. cuda_cccl-0.1.3.2.0.dev438.dist-info/licenses/LICENSE +1 -0
@@ -0,0 +1,2519 @@
1
+ /******************************************************************************
2
+ * Copyright (c) 2011, Duane Merrill. All rights reserved.
3
+ * Copyright (c) 2011-2024, NVIDIA CORPORATION. All rights reserved.
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ * * Redistributions of source code must retain the above copyright
8
+ * notice, this list of conditions and the following disclaimer.
9
+ * * Redistributions in binary form must reproduce the above copyright
10
+ * notice, this list of conditions and the following disclaimer in the
11
+ * documentation and/or other materials provided with the distribution.
12
+ * * Neither the name of the NVIDIA CORPORATION nor the
13
+ * names of its contributors may be used to endorse or promote products
14
+ * derived from this software without specific prior written permission.
15
+ *
16
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
+ *
27
+ ******************************************************************************/
28
+
29
+ //! @file
30
+ //! cub::DeviceReduce provides device-wide, parallel operations for computing a reduction across a sequence of data
31
+ //! items residing within device-accessible memory.
32
+
33
+ #pragma once
34
+
35
+ #include <cub/config.cuh>
36
+
37
+ #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
38
+ # pragma GCC system_header
39
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
40
+ # pragma clang system_header
41
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
42
+ # pragma system_header
43
+ #endif // no system header
44
+
45
+ #include <cub/detail/choose_offset.cuh>
46
+ #include <cub/detail/device_memory_resource.cuh>
47
+ #include <cub/detail/temporary_storage.cuh>
48
+ #include <cub/device/dispatch/dispatch_reduce_by_key.cuh>
49
+ #include <cub/device/dispatch/dispatch_reduce_deterministic.cuh>
50
+ #include <cub/device/dispatch/dispatch_reduce_nondeterministic.cuh>
51
+ #include <cub/device/dispatch/dispatch_streaming_reduce.cuh>
52
+ #include <cub/thread/thread_operators.cuh>
53
+ #include <cub/util_type.cuh>
54
+
55
+ #include <thrust/iterator/tabulate_output_iterator.h>
56
+
57
+ #include <cuda/__execution/determinism.h>
58
+ #include <cuda/__execution/require.h>
59
+ #include <cuda/__execution/tune.h>
60
+ #include <cuda/__functional/maximum.h>
61
+ #include <cuda/__functional/minimum.h>
62
+ #include <cuda/__memory_resource/get_memory_resource.h>
63
+ #include <cuda/__stream/get_stream.h>
64
+ #include <cuda/std/__execution/env.h>
65
+ #include <cuda/std/__functional/identity.h>
66
+ #include <cuda/std/__functional/invoke.h>
67
+ #include <cuda/std/__functional/operations.h>
68
+ #include <cuda/std/__type_traits/conditional.h>
69
+ #include <cuda/std/__type_traits/is_integral.h>
70
+ #include <cuda/std/__type_traits/is_same.h>
71
+ #include <cuda/std/cstdint>
72
+ #include <cuda/std/limits>
73
+ #include <cuda/stream_ref>
74
+
75
+ CUB_NAMESPACE_BEGIN
76
+
77
+ namespace detail
78
+ {
79
+
80
+ template <typename DeterminismT>
81
+ inline constexpr bool is_non_deterministic_v =
82
+ ::cuda::std::is_same_v<DeterminismT, ::cuda::execution::determinism::not_guaranteed_t>;
83
+
84
+ namespace reduce
85
+ {
86
+
87
+ struct get_tuning_query_t
88
+ {};
89
+
90
+ template <class Derived>
91
+ struct tuning
92
+ {
93
+ [[nodiscard]] _CCCL_NODEBUG_API constexpr auto query(const get_tuning_query_t&) const noexcept -> Derived
94
+ {
95
+ return static_cast<const Derived&>(*this);
96
+ }
97
+ };
98
+
99
+ struct default_tuning : tuning<default_tuning>
100
+ {
101
+ template <class AccumT, class Offset, class OpT>
102
+ using fn = policy_hub<AccumT, Offset, OpT>;
103
+ };
104
+
105
+ struct default_rfa_tuning : tuning<default_tuning>
106
+ {
107
+ template <class AccumT, class Offset, class OpT>
108
+ using fn = detail::rfa::policy_hub<AccumT, Offset, OpT>;
109
+ };
110
+
111
+ template <typename ExtremumOutIteratorT, typename IndexOutIteratorT>
112
+ struct unzip_and_write_arg_extremum_op
113
+ {
114
+ ExtremumOutIteratorT result_out_it;
115
+ IndexOutIteratorT index_out_it;
116
+
117
+ template <typename IndexT, typename KeyValuePairT>
118
+ _CCCL_DEVICE _CCCL_FORCEINLINE void operator()(IndexT, KeyValuePairT reduced_result)
119
+ {
120
+ *result_out_it = reduced_result.value;
121
+ *index_out_it = reduced_result.key;
122
+ }
123
+ };
124
+ } // namespace reduce
125
+ } // namespace detail
126
+
127
+ //! @rst
128
+ //! DeviceReduce provides device-wide, parallel operations for computing
129
+ //! a reduction across a sequence of data items residing within
130
+ //! device-accessible memory.
131
+ //!
132
+ //! .. image:: ../../img/reduce_logo.png
133
+ //! :align: center
134
+ //!
135
+ //! Overview
136
+ //! ====================================
137
+ //!
138
+ //! A `reduction <http://en.wikipedia.org/wiki/Reduce_(higher-order_function)>`_
139
+ //! (or *fold*) uses a binary combining operator to compute a single aggregate
140
+ //! from a sequence of input elements.
141
+ //!
142
+ //! Usage Considerations
143
+ //! ====================================
144
+ //!
145
+ //! @cdp_class{DeviceReduce}
146
+ //!
147
+ //! Performance
148
+ //! ====================================
149
+ //!
150
+ //! @linear_performance{reduction, reduce-by-key, and run-length encode}
151
+ //!
152
+ //! @endrst
153
+ struct DeviceReduce
154
+ {
155
+ private:
156
+ template <typename TuningEnvT,
157
+ typename InputIteratorT,
158
+ typename OutputIteratorT,
159
+ typename ReductionOpT,
160
+ typename TransformOpT,
161
+ typename T,
162
+ typename NumItemsT,
163
+ ::cuda::execution::determinism::__determinism_t Determinism>
164
+ CUB_RUNTIME_FUNCTION static cudaError_t reduce_impl(
165
+ void* d_temp_storage,
166
+ size_t& temp_storage_bytes,
167
+ InputIteratorT d_in,
168
+ OutputIteratorT d_out,
169
+ NumItemsT num_items,
170
+ ReductionOpT reduction_op,
171
+ TransformOpT transform_op,
172
+ T init,
173
+ ::cuda::execution::determinism::__determinism_holder_t<Determinism>,
174
+ cudaStream_t stream)
175
+ {
176
+ using offset_t = detail::choose_offset_t<NumItemsT>;
177
+ using reduce_tuning_t = ::cuda::std::execution::
178
+ __query_result_or_t<TuningEnvT, detail::reduce::get_tuning_query_t, detail::reduce::default_tuning>;
179
+
180
+ using accum_t = ::cuda::std::
181
+ __accumulator_t<ReductionOpT, ::cuda::std::invoke_result_t<TransformOpT, detail::it_value_t<InputIteratorT>>, T>;
182
+ using policy_t = typename reduce_tuning_t::template fn<accum_t, offset_t, ReductionOpT>;
183
+
184
+ using dispatch_t =
185
+ DispatchTransformReduce<InputIteratorT, OutputIteratorT, offset_t, ReductionOpT, TransformOpT, T, accum_t, policy_t>;
186
+
187
+ return dispatch_t::Dispatch(
188
+ d_temp_storage,
189
+ temp_storage_bytes,
190
+ d_in,
191
+ d_out,
192
+ static_cast<offset_t>(num_items),
193
+ reduction_op,
194
+ init,
195
+ stream,
196
+ transform_op);
197
+ }
198
+
199
+ template <typename TuningEnvT,
200
+ typename InputIteratorT,
201
+ typename OutputIteratorT,
202
+ typename ReductionOpT,
203
+ typename TransformOpT,
204
+ typename T,
205
+ typename NumItemsT>
206
+ CUB_RUNTIME_FUNCTION static cudaError_t reduce_impl(
207
+ void* d_temp_storage,
208
+ size_t& temp_storage_bytes,
209
+ InputIteratorT d_in,
210
+ OutputIteratorT d_out,
211
+ NumItemsT num_items,
212
+ ReductionOpT,
213
+ TransformOpT transform_op,
214
+ T init,
215
+ ::cuda::execution::determinism::gpu_to_gpu_t,
216
+ cudaStream_t stream)
217
+ {
218
+ using offset_t = detail::choose_offset_t<NumItemsT>;
219
+
220
+ using reduce_tuning_t = ::cuda::std::execution::
221
+ __query_result_or_t<TuningEnvT, detail::reduce::get_tuning_query_t, detail::reduce::default_rfa_tuning>;
222
+
223
+ using accum_t = ::cuda::std::
224
+ __accumulator_t<ReductionOpT, ::cuda::std::invoke_result_t<TransformOpT, detail::it_value_t<InputIteratorT>>, T>;
225
+ using policy_t = typename reduce_tuning_t::template fn<accum_t, offset_t, ReductionOpT>;
226
+ using dispatch_t =
227
+ detail::DispatchReduceDeterministic<InputIteratorT, OutputIteratorT, offset_t, T, TransformOpT, accum_t, policy_t>;
228
+
229
+ return dispatch_t::Dispatch(
230
+ d_temp_storage, temp_storage_bytes, d_in, d_out, static_cast<offset_t>(num_items), init, stream, transform_op);
231
+ }
232
+
233
+ template <typename TuningEnvT,
234
+ typename InputIteratorT,
235
+ typename OutputIteratorT,
236
+ typename ReductionOpT,
237
+ typename TransformOpT,
238
+ typename T,
239
+ typename NumItemsT>
240
+ CUB_RUNTIME_FUNCTION static cudaError_t reduce_impl(
241
+ void* d_temp_storage,
242
+ size_t& temp_storage_bytes,
243
+ InputIteratorT d_in,
244
+ OutputIteratorT d_out,
245
+ NumItemsT num_items,
246
+ ReductionOpT reduction_op,
247
+ TransformOpT transform_op,
248
+ T init,
249
+ ::cuda::execution::determinism::not_guaranteed_t,
250
+ cudaStream_t stream)
251
+ {
252
+ using offset_t = detail::choose_offset_t<NumItemsT>;
253
+ using accum_t = ::cuda::std::__accumulator_t<ReductionOpT, detail::it_value_t<InputIteratorT>, T>;
254
+
255
+ using output_t = THRUST_NS_QUALIFIER::unwrap_contiguous_iterator_t<OutputIteratorT>;
256
+
257
+ using reduce_tuning_t = ::cuda::std::execution::
258
+ __query_result_or_t<TuningEnvT, detail::reduce::get_tuning_query_t, detail::reduce::default_tuning>;
259
+ using policy_t = typename reduce_tuning_t::template fn<accum_t, offset_t, ReductionOpT>;
260
+ using dispatch_t = detail::
261
+ DispatchReduceNondeterministic<InputIteratorT, output_t, offset_t, ReductionOpT, T, accum_t, TransformOpT, policy_t>;
262
+
263
+ return dispatch_t::Dispatch(
264
+ d_temp_storage,
265
+ temp_storage_bytes,
266
+ d_in,
267
+ THRUST_NS_QUALIFIER::unwrap_contiguous_iterator(d_out),
268
+ static_cast<offset_t>(num_items),
269
+ reduction_op,
270
+ init,
271
+ stream,
272
+ transform_op);
273
+ }
274
+
275
+ public:
276
+ //! @rst
277
+ //! Computes a device-wide reduction using the specified binary ``reduction_op`` functor and initial value ``init``.
278
+ //!
279
+ //! - Does not support binary reduction operators that are non-commutative.
280
+ //! - Provides "run-to-run" determinism for pseudo-associative reduction
281
+ //! (e.g., addition of floating point types) on the same GPU device.
282
+ //! However, results for pseudo-associative reduction may be inconsistent
283
+ //! from one device to another device of a different compute-capability
284
+ //! because CUB can employ different tile-sizing for different architectures.
285
+ //! - The range ``[d_in, d_in + num_items)`` shall not overlap ``d_out``.
286
+ //! - @devicestorage
287
+ //!
288
+ //! Snippet
289
+ //! +++++++++++++++++++++++++++++++++++++++++++++
290
+ //!
291
+ //! The code snippet below illustrates a user-defined min-reduction of a
292
+ //! device vector of ``int`` data elements.
293
+ //!
294
+ //! .. code-block:: c++
295
+ //!
296
+ //! #include <cub/cub.cuh>
297
+ //! // or equivalently <cub/device/device_reduce.cuh>
298
+ //!
299
+ //! // CustomMin functor
300
+ //! struct CustomMin
301
+ //! {
302
+ //! template <typename T>
303
+ //! __device__ __forceinline__
304
+ //! T operator()(const T &a, const T &b) const {
305
+ //! return (b < a) ? b : a;
306
+ //! }
307
+ //! };
308
+ //!
309
+ //! // Declare, allocate, and initialize device-accessible pointers for
310
+ //! // input and output
311
+ //! int num_items; // e.g., 7
312
+ //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
313
+ //! int *d_out; // e.g., [-]
314
+ //! CustomMin min_op;
315
+ //! int init; // e.g., INT_MAX
316
+ //! ...
317
+ //!
318
+ //! // Determine temporary device storage requirements
319
+ //! void *d_temp_storage = nullptr;
320
+ //! size_t temp_storage_bytes = 0;
321
+ //! cub::DeviceReduce::Reduce(
322
+ //! d_temp_storage, temp_storage_bytes,
323
+ //! d_in, d_out, num_items, min_op, init);
324
+ //!
325
+ //! // Allocate temporary storage
326
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
327
+ //!
328
+ //! // Run reduction
329
+ //! cub::DeviceReduce::Reduce(
330
+ //! d_temp_storage, temp_storage_bytes,
331
+ //! d_in, d_out, num_items, min_op, init);
332
+ //!
333
+ //! // d_out <-- [0]
334
+ //!
335
+ //! @endrst
336
+ //!
337
+ //! @tparam InputIteratorT
338
+ //! **[inferred]** Random-access input iterator type for reading input items @iterator
339
+ //!
340
+ //! @tparam OutputIteratorT
341
+ //! **[inferred]** Output iterator type for recording the reduced aggregate @iterator
342
+ //!
343
+ //! @tparam ReductionOpT
344
+ //! **[inferred]** Binary reduction functor type having member `T operator()(const T &a, const T &b)`
345
+ //!
346
+ //! @tparam T
347
+ //! **[inferred]** Data element type that is convertible to the `value` type of `InputIteratorT`
348
+ //!
349
+ //! @tparam NumItemsT
350
+ //! **[inferred]** Type of num_items
351
+ //!
352
+ //! @param[in] d_temp_storage
353
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
354
+ //! required allocation size is written to `temp_storage_bytes` and no work
355
+ //! is done.
356
+ //!
357
+ //! @param[in,out] temp_storage_bytes
358
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
359
+ //!
360
+ //! @param[in] d_in
361
+ //! Pointer to the input sequence of data items
362
+ //!
363
+ //! @param[out] d_out
364
+ //! Pointer to the output aggregate
365
+ //!
366
+ //! @param[in] num_items
367
+ //! Total number of input items (i.e., length of ``d_in``)
368
+ //!
369
+ //! @param[in] reduction_op
370
+ //! Binary reduction functor
371
+ //!
372
+ //! @param[in] init
373
+ //! Initial value of the reduction
374
+ //!
375
+ //! @param[in] stream
376
+ //! @rst
377
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
378
+ //! @endrst
379
+ template <typename InputIteratorT, typename OutputIteratorT, typename ReductionOpT, typename T, typename NumItemsT>
380
+ CUB_RUNTIME_FUNCTION static cudaError_t Reduce(
381
+ void* d_temp_storage,
382
+ size_t& temp_storage_bytes,
383
+ InputIteratorT d_in,
384
+ OutputIteratorT d_out,
385
+ NumItemsT num_items,
386
+ ReductionOpT reduction_op,
387
+ T init,
388
+ cudaStream_t stream = 0)
389
+ {
390
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceReduce::Reduce");
391
+
392
+ // Signed integer type for global offsets
393
+ using OffsetT = detail::choose_offset_t<NumItemsT>;
394
+
395
+ return DispatchReduce<InputIteratorT, OutputIteratorT, OffsetT, ReductionOpT, T>::Dispatch(
396
+ d_temp_storage, temp_storage_bytes, d_in, d_out, static_cast<OffsetT>(num_items), reduction_op, init, stream);
397
+ }
398
+
399
+ //! @rst
400
+ //! Computes a device-wide reduction using the specified binary ``reduction_op`` functor and initial value ``init``.
401
+ //!
402
+ //! - Does not support binary reduction operators that are non-commutative.
403
+ //! - By default, provides "run-to-run" determinism for pseudo-associative reduction
404
+ //! (e.g., addition of floating point types) on the same GPU device.
405
+ //! However, results for pseudo-associative reduction may be inconsistent
406
+ //! from one device to another device of a different compute-capability
407
+ //! because CUB can employ different tile-sizing for different architectures.
408
+ //! To request "gpu-to-gpu" determinism, pass ``cuda::execution::require(cuda::execution::determinism::gpu_to_gpu)``
409
+ //! as the `env` parameter.
410
+ //! To request "not-guaranteed" determinism, pass
411
+ //! ``cuda::execution::require(cuda::execution::determinism::not_guaranteed)`` as the `env` parameter.
412
+ //! - The range ``[d_in, d_in + num_items)`` shall not overlap ``d_out``.
413
+ //!
414
+ //! Snippet
415
+ //! +++++++++++++++++++++++++++++++++++++++++++++
416
+ //!
417
+ //! The code snippet below illustrates a user-defined min-reduction of a
418
+ //! device vector of ``int`` data elements.
419
+ //!
420
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_reduce_env_api.cu
421
+ //! :language: c++
422
+ //! :dedent:
423
+ //! :start-after: example-begin reduce-env-determinism
424
+ //! :end-before: example-end reduce-env-determinism
425
+ //!
426
+ //! @endrst
427
+ //!
428
+ //! @tparam InputIteratorT
429
+ //! **[inferred]** Random-access input iterator type for reading input items @iterator
430
+ //!
431
+ //! @tparam OutputIteratorT
432
+ //! **[inferred]** Output iterator type for recording the reduced aggregate @iterator
433
+ //!
434
+ //! @tparam ReductionOpT
435
+ //! **[inferred]** Binary reduction functor type having member `T operator()(const T &a, const T &b)`
436
+ //!
437
+ //! @tparam T
438
+ //! **[inferred]** Data element type that is convertible to the `value` type of `InputIteratorT`
439
+ //!
440
+ //! @tparam NumItemsT
441
+ //! **[inferred]** Type of num_items
442
+ //!
443
+ //! @tparam EnvT
444
+ //! **[inferred]** Execution environment type. Default is ``cuda::std::execution::env<>``.
445
+ //!
446
+ //! @param[in] d_in
447
+ //! Pointer to the input sequence of data items
448
+ //!
449
+ //! @param[out] d_out
450
+ //! Pointer to the output aggregate
451
+ //!
452
+ //! @param[in] num_items
453
+ //! Total number of input items (i.e., length of ``d_in``)
454
+ //!
455
+ //! @param[in] reduction_op
456
+ //! Binary reduction functor
457
+ //!
458
+ //! @param[in] init
459
+ //! Initial value of the reduction
460
+ //!
461
+ //! @param[in] env
462
+ //! @rst
463
+ //! **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``.
464
+ //! @endrst
465
+ template <typename InputIteratorT,
466
+ typename OutputIteratorT,
467
+ typename ReductionOpT,
468
+ typename T,
469
+ typename NumItemsT,
470
+ typename EnvT = ::cuda::std::execution::env<>>
471
+ [[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t Reduce(
472
+ InputIteratorT d_in, OutputIteratorT d_out, NumItemsT num_items, ReductionOpT reduction_op, T init, EnvT env = {})
473
+ {
474
+ _CCCL_NVTX_RANGE_SCOPE("cub::DeviceReduce::Reduce");
475
+
476
+ static_assert(!::cuda::std::execution::__queryable_with<EnvT, ::cuda::execution::determinism::__get_determinism_t>,
477
+ "Determinism should be used inside requires to have an effect.");
478
+ using requirements_t = ::cuda::std::execution::
479
+ __query_result_or_t<EnvT, ::cuda::execution::__get_requirements_t, ::cuda::std::execution::env<>>;
480
+ using default_determinism_t =
481
+ ::cuda::std::execution::__query_result_or_t<requirements_t, //
482
+ ::cuda::execution::determinism::__get_determinism_t,
483
+ ::cuda::execution::determinism::run_to_run_t>;
484
+
485
+ using accum_t = ::cuda::std::__accumulator_t<ReductionOpT, detail::it_value_t<InputIteratorT>, T>;
486
+
487
+ constexpr auto gpu_gpu_determinism =
488
+ ::cuda::std::is_same_v<default_determinism_t, ::cuda::execution::determinism::gpu_to_gpu_t>;
489
+
490
+ // integral types are always gpu-to-gpu deterministic if reduction operator is a simple cuda binary
491
+ // operator, so fall back to run-to-run determinism
492
+ constexpr auto integral_fallback =
493
+ gpu_gpu_determinism && ::cuda::std::is_integral_v<accum_t> && (detail::is_cuda_binary_operator<ReductionOpT>);
494
+
495
+ // use gpu-to-gpu determinism only for float and double types with ::cuda::std::plus operator
496
+ constexpr auto float_double_plus =
497
+ gpu_gpu_determinism && detail::is_one_of_v<accum_t, float, double> && detail::is_cuda_std_plus_v<ReductionOpT>;
498
+
499
+ constexpr auto supported = integral_fallback || float_double_plus || !gpu_gpu_determinism;
500
+
501
+ // gpu_to_gpu determinism is only supported for integral types with cuda operators, or
502
+ // float and double types with ::cuda::std::plus operator
503
+ static_assert(supported, "gpu_to_gpu determinism is unsupported");
504
+
505
+ if constexpr (!supported)
506
+ {
507
+ return cudaErrorNotSupported;
508
+ }
509
+ else
510
+ {
511
+ constexpr auto no_determinism = detail::is_non_deterministic_v<default_determinism_t>;
512
+
513
+ // Certain conditions must be met to be able to use the non-deterministic
514
+ // kernel. The output iterator must be a contiguous iterator and the
515
+ // reduction operator must be plus (for now). Additionally, since atomics for types of
516
+ // size < 4B are emulated, they perform poorly, so we fall back to the run-to-run
517
+ // determinism.
518
+ constexpr auto is_contiguous_fallback =
519
+ !no_determinism || THRUST_NS_QUALIFIER::is_contiguous_iterator_v<OutputIteratorT>;
520
+ constexpr auto is_plus_fallback = !no_determinism || detail::is_cuda_std_plus_v<ReductionOpT>;
521
+ constexpr auto is_4b_or_greater = !no_determinism || sizeof(accum_t) >= 4;
522
+
523
+ // If the conditions for gpu-to-gpu determinism or non-deterministic
524
+ // reduction are not met, we fall back to run-to-run determinism.
525
+ using determinism_t = ::cuda::std::conditional_t<
526
+ (gpu_gpu_determinism && integral_fallback)
527
+ || (no_determinism && !(is_contiguous_fallback && is_plus_fallback && is_4b_or_greater)),
528
+ ::cuda::execution::determinism::run_to_run_t,
529
+ default_determinism_t>;
530
+
531
+ // Query relevant properties from the environment
532
+ auto stream = ::cuda::std::execution::__query_or(env, ::cuda::get_stream, ::cuda::stream_ref{cudaStream_t{}});
533
+ auto mr =
534
+ ::cuda::std::execution::__query_or(env, ::cuda::mr::__get_memory_resource, detail::device_memory_resource{});
535
+
536
+ void* d_temp_storage = nullptr;
537
+ size_t temp_storage_bytes = 0;
538
+
539
+ using tuning_t = ::cuda::std::execution::
540
+ __query_result_or_t<EnvT, ::cuda::execution::__get_tuning_t, ::cuda::std::execution::env<>>;
541
+
542
+ // Query the required temporary storage size
543
+ cudaError_t error = reduce_impl<tuning_t>(
544
+ d_temp_storage,
545
+ temp_storage_bytes,
546
+ d_in,
547
+ d_out,
548
+ num_items,
549
+ reduction_op,
550
+ ::cuda::std::identity{},
551
+ init,
552
+ determinism_t{},
553
+ stream.get());
554
+ if (error != cudaSuccess)
555
+ {
556
+ return error;
557
+ }
558
+
559
+ // TODO(gevtushenko): use uninitialized buffer when it's available
560
+ error = CubDebug(detail::temporary_storage::allocate(stream, d_temp_storage, temp_storage_bytes, mr));
561
+ if (error != cudaSuccess)
562
+ {
563
+ return error;
564
+ }
565
+
566
+ // Run the algorithm
567
+ error = reduce_impl<tuning_t>(
568
+ d_temp_storage,
569
+ temp_storage_bytes,
570
+ d_in,
571
+ d_out,
572
+ num_items,
573
+ reduction_op,
574
+ ::cuda::std::identity{},
575
+ init,
576
+ determinism_t{},
577
+ stream.get());
578
+
579
+ // Try to deallocate regardless of the error to avoid memory leaks
580
+ cudaError_t deallocate_error =
581
+ CubDebug(detail::temporary_storage::deallocate(stream, d_temp_storage, temp_storage_bytes, mr));
582
+
583
+ if (error != cudaSuccess)
584
+ {
585
+ // Reduction error takes precedence over deallocation error since it happens first
586
+ return error;
587
+ }
588
+
589
+ return deallocate_error;
590
+ }
591
+ }
592
+
593
+ //! @rst
594
+ //! Computes a device-wide sum using the addition (``+``) operator.
595
+ //!
596
+ //! - Uses ``0`` as the initial value of the reduction.
597
+ //! - Does not support ``+`` operators that are non-commutative.
598
+ //! - By default, provides "run-to-run" determinism for pseudo-associative reduction
599
+ //! (e.g., addition of floating point types) on the same GPU device.
600
+ //! However, results for pseudo-associative reduction may be inconsistent
601
+ //! from one device to another device of a different compute-capability
602
+ //! because CUB can employ different tile-sizing for different architectures.
603
+ //! To request "gpu-to-gpu" determinism, pass ``cuda::execution::require(cuda::execution::determinism::gpu_to_gpu)``
604
+ //! as the `env` parameter.
605
+ //! To request "not-guaranteed" determinism, pass
606
+ //! ``cuda::execution::require(cuda::execution::determinism::not_guaranteed)`` as the `env` parameter.
607
+ //! - The range ``[d_in, d_in + num_items)`` shall not overlap ``d_out``.
608
+ //!
609
+ //! Snippet
610
+ //! +++++++++++++++++++++++++++++++++++++++++++++
611
+ //!
612
+ //! The code snippet below illustrates the sum-reduction of a
613
+ //! device vector of ``int`` data elements.
614
+ //!
615
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_reduce_env_api.cu
616
+ //! :language: c++
617
+ //! :dedent:
618
+ //! :start-after: example-begin sum-env-determinism
619
+ //! :end-before: example-end sum-env-determinism
620
+ //!
621
+ //! @endrst
622
+ //!
623
+ //! @tparam InputIteratorT
624
+ //! **[inferred]** Random-access input iterator type for reading input items @iterator
625
+ //!
626
+ //! @tparam OutputIteratorT
627
+ //! **[inferred]** Output iterator type for recording the reduced aggregate @iterator
628
+ //!
629
+ //! @tparam NumItemsT
630
+ //! **[inferred]** Type of num_items
631
+ //!
632
+ //! @tparam EnvT
633
+ //! **[inferred]** Execution environment type. Default is `cuda::std::execution::env<>`.
634
+ //!
635
+ //! @param[in] d_in
636
+ //! Pointer to the input sequence of data items
637
+ //!
638
+ //! @param[out] d_out
639
+ //! Pointer to the output aggregate
640
+ //!
641
+ //! @param[in] num_items
642
+ //! Total number of input items (i.e., length of ``d_in``)
643
+ //!
644
+ //! @param[in] env
645
+ //! @rst
646
+ //! **[optional]** Execution environment. Default is `cuda::std::execution::env{}`.
647
+ //! @endrst
648
+ template <typename InputIteratorT,
649
+ typename OutputIteratorT,
650
+ typename NumItemsT,
651
+ typename EnvT = ::cuda::std::execution::env<>>
652
+ [[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t
653
+ Sum(InputIteratorT d_in, OutputIteratorT d_out, NumItemsT num_items, EnvT env = {})
654
+ {
655
+ _CCCL_NVTX_RANGE_SCOPE("cub::DeviceReduce::Sum");
656
+
657
+ static_assert(!::cuda::std::execution::__queryable_with<EnvT, ::cuda::execution::determinism::__get_determinism_t>,
658
+ "Determinism should be used inside requires to have an effect.");
659
+ using requirements_t = ::cuda::std::execution::
660
+ __query_result_or_t<EnvT, ::cuda::execution::__get_requirements_t, ::cuda::std::execution::env<>>;
661
+ using default_determinism_t =
662
+ ::cuda::std::execution::__query_result_or_t<requirements_t, //
663
+ ::cuda::execution::determinism::__get_determinism_t,
664
+ ::cuda::execution::determinism::run_to_run_t>;
665
+
666
+ constexpr auto no_determinism = detail::is_non_deterministic_v<default_determinism_t>;
667
+
668
+ // The output iterator must be a contiguous iterator or we fall back to
669
+ // run-to-run determinism.
670
+ constexpr auto is_contiguous_fallback =
671
+ !no_determinism || THRUST_NS_QUALIFIER::is_contiguous_iterator_v<OutputIteratorT>;
672
+
673
+ using OutputT = cub::detail::non_void_value_t<OutputIteratorT, cub::detail::it_value_t<InputIteratorT>>;
674
+
675
+ // Since atomics for types of size < 4B are emulated, they perform poorly, so we fall back to the run-to-run
676
+ // determinism.
677
+ constexpr auto is_4b_or_greater = !no_determinism || sizeof(OutputT) >= 4;
678
+
679
+ using determinism_t =
680
+ ::cuda::std::conditional_t<no_determinism && !(is_contiguous_fallback && is_4b_or_greater),
681
+ ::cuda::execution::determinism::run_to_run_t,
682
+ default_determinism_t>;
683
+
684
+ // Query relevant properties from the environment
685
+ auto stream = ::cuda::std::execution::__query_or(env, ::cuda::get_stream, ::cuda::stream_ref{cudaStream_t{}});
686
+ auto mr =
687
+ ::cuda::std::execution::__query_or(env, ::cuda::mr::__get_memory_resource, detail::device_memory_resource{});
688
+
689
+ void* d_temp_storage = nullptr;
690
+ size_t temp_storage_bytes = 0;
691
+
692
+ using tuning_t =
693
+ ::cuda::std::execution::__query_result_or_t<EnvT, ::cuda::execution::__get_tuning_t, ::cuda::std::execution::env<>>;
694
+
695
+ using InitT = OutputT;
696
+
697
+ // Query the required temporary storage size
698
+ cudaError_t error = reduce_impl<tuning_t>(
699
+ d_temp_storage,
700
+ temp_storage_bytes,
701
+ d_in,
702
+ d_out,
703
+ num_items,
704
+ ::cuda::std::plus<>{},
705
+ ::cuda::std::identity{},
706
+ InitT{}, // zero-initialize
707
+ determinism_t{},
708
+ stream.get());
709
+ if (error != cudaSuccess)
710
+ {
711
+ return error;
712
+ }
713
+
714
+ // TODO(gevtushenko): use uninitialized buffer when it's available
715
+ error = CubDebug(detail::temporary_storage::allocate(stream, d_temp_storage, temp_storage_bytes, mr));
716
+ if (error != cudaSuccess)
717
+ {
718
+ return error;
719
+ }
720
+
721
+ // Run the algorithm
722
+ error = reduce_impl<tuning_t>(
723
+ d_temp_storage,
724
+ temp_storage_bytes,
725
+ d_in,
726
+ d_out,
727
+ num_items,
728
+ ::cuda::std::plus<>{},
729
+ ::cuda::std::identity{},
730
+ InitT{}, // zero-initialize
731
+ determinism_t{},
732
+ stream.get());
733
+
734
+ // Try to deallocate regardless of the error to avoid memory leaks
735
+ cudaError_t deallocate_error =
736
+ CubDebug(detail::temporary_storage::deallocate(stream, d_temp_storage, temp_storage_bytes, mr));
737
+
738
+ if (error != cudaSuccess)
739
+ {
740
+ // Reduction error takes precedence over deallocation error since it happens first
741
+ return error;
742
+ }
743
+
744
+ return deallocate_error;
745
+ }
746
+
747
+ //! @rst
748
+ //! Computes a device-wide sum using the addition (``+``) operator.
749
+ //!
750
+ //! - Uses ``0`` as the initial value of the reduction.
751
+ //! - Does not support ``+`` operators that are non-commutative.
752
+ //! - Provides "run-to-run" determinism for pseudo-associative reduction
753
+ //! (e.g., addition of floating point types) on the same GPU device.
754
+ //! However, results for pseudo-associative reduction may be inconsistent
755
+ //! from one device to another device of a different compute-capability
756
+ //! because CUB can employ different tile-sizing for different architectures.
757
+ //! - The range ``[d_in, d_in + num_items)`` shall not overlap ``d_out``.
758
+ //! - @devicestorage
759
+ //!
760
+ //! Snippet
761
+ //! +++++++++++++++++++++++++++++++++++++++++++++
762
+ //!
763
+ //! The code snippet below illustrates the sum-reduction of a device vector
764
+ //! of ``int`` data elements.
765
+ //!
766
+ //! .. code-block:: c++
767
+ //!
768
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_reduce.cuh>
769
+ //!
770
+ //! // Declare, allocate, and initialize device-accessible pointers
771
+ //! // for input and output
772
+ //! int num_items; // e.g., 7
773
+ //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
774
+ //! int *d_out; // e.g., [-]
775
+ //! ...
776
+ //!
777
+ //! // Determine temporary device storage requirements
778
+ //! void *d_temp_storage = nullptr;
779
+ //! size_t temp_storage_bytes = 0;
780
+ //! cub::DeviceReduce::Sum(
781
+ //! d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
782
+ //!
783
+ //! // Allocate temporary storage
784
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
785
+ //!
786
+ //! // Run sum-reduction
787
+ //! cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
788
+ //!
789
+ //! // d_out <-- [38]
790
+ //!
791
+ //! @endrst
792
+ //!
793
+ //! @tparam InputIteratorT
794
+ //! **[inferred]** Random-access input iterator type for reading input items @iterator
795
+ //!
796
+ //! @tparam OutputIteratorT
797
+ //! **[inferred]** Output iterator type for recording the reduced aggregate @iterator
798
+ //!
799
+ //! @tparam NumItemsT
800
+ //! **[inferred]** Type of num_items
801
+ //!
802
+ //! @param[in] d_temp_storage
803
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
804
+ //! required allocation size is written to `temp_storage_bytes` and no work is done.
805
+ //!
806
+ //! @param[in,out] temp_storage_bytes
807
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
808
+ //!
809
+ //! @param[in] d_in
810
+ //! Pointer to the input sequence of data items
811
+ //!
812
+ //! @param[out] d_out
813
+ //! Pointer to the output aggregate
814
+ //!
815
+ //! @param[in] num_items
816
+ //! Total number of input items (i.e., length of `d_in`)
817
+ //!
818
+ //! @param[in] stream
819
+ //! @rst
820
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
821
+ //! @endrst
822
+ template <typename InputIteratorT, typename OutputIteratorT, typename NumItemsT>
823
+ CUB_RUNTIME_FUNCTION static cudaError_t
824
+ Sum(void* d_temp_storage,
825
+ size_t& temp_storage_bytes,
826
+ InputIteratorT d_in,
827
+ OutputIteratorT d_out,
828
+ NumItemsT num_items,
829
+ cudaStream_t stream = 0)
830
+ {
831
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceReduce::Sum");
832
+
833
+ // Signed integer type for global offsets
834
+ using OffsetT = detail::choose_offset_t<NumItemsT>;
835
+
836
+ // The output value type
837
+ using OutputT = cub::detail::non_void_value_t<OutputIteratorT, cub::detail::it_value_t<InputIteratorT>>;
838
+
839
+ using InitT = OutputT;
840
+
841
+ return DispatchReduce<InputIteratorT, OutputIteratorT, OffsetT, ::cuda::std::plus<>, InitT>::Dispatch(
842
+ d_temp_storage,
843
+ temp_storage_bytes,
844
+ d_in,
845
+ d_out,
846
+ static_cast<OffsetT>(num_items),
847
+ ::cuda::std::plus<>{},
848
+ InitT{}, // zero-initialize
849
+ stream);
850
+ }
851
+
852
+ //! @rst
853
+ //! Computes a device-wide minimum using the less-than (``<``) operator.
854
+ //!
855
+ //! - Uses ``cuda::std::numeric_limits<T>::max()`` as the initial value of the reduction.
856
+ //! - Does not support ``<`` operators that are non-commutative.
857
+ //! - Provides "run-to-run" determinism for pseudo-associative reduction
858
+ //! (e.g., addition of floating point types) on the same GPU device.
859
+ //! However, results for pseudo-associative reduction may be inconsistent
860
+ //! from one device to another device of a different compute-capability
861
+ //! because CUB can employ different tile-sizing for different architectures.
862
+ //! - The range ``[d_in, d_in + num_items)`` shall not overlap ``d_out``.
863
+ //! - @devicestorage
864
+ //!
865
+ //! Snippet
866
+ //! +++++++++++++++++++++++++++++++++++++++++++++
867
+ //!
868
+ //! The code snippet below illustrates the min-reduction of a device vector of ``int`` data elements.
869
+ //!
870
+ //! .. code-block:: c++
871
+ //!
872
+ //! #include <cub/cub.cuh>
873
+ //! // or equivalently <cub/device/device_reduce.cuh>
874
+ //!
875
+ //! // Declare, allocate, and initialize device-accessible pointers
876
+ //! // for input and output
877
+ //! int num_items; // e.g., 7
878
+ //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
879
+ //! int *d_out; // e.g., [-]
880
+ //! ...
881
+ //!
882
+ //! // Determine temporary device storage requirements
883
+ //! void *d_temp_storage = nullptr;
884
+ //! size_t temp_storage_bytes = 0;
885
+ //! cub::DeviceReduce::Min(
886
+ //! d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
887
+ //!
888
+ //! // Allocate temporary storage
889
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
890
+ //!
891
+ //! // Run min-reduction
892
+ //! cub::DeviceReduce::Min(
893
+ //! d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
894
+ //!
895
+ //! // d_out <-- [0]
896
+ //!
897
+ //! @endrst
898
+ //!
899
+ //! @tparam InputIteratorT
900
+ //! **[inferred]** Random-access input iterator type for reading input items @iterator
901
+ //!
902
+ //! @tparam OutputIteratorT
903
+ //! **[inferred]** Output iterator type for recording the reduced aggregate @iterator
904
+ //!
905
+ //! @tparam NumItemsT
906
+ //! **[inferred]** Type of num_items
907
+ //!
908
+ //! @param[in] d_temp_storage
909
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
910
+ //! required allocation size is written to `temp_storage_bytes` and no work is done.
911
+ //!
912
+ //! @param[in,out] temp_storage_bytes
913
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
914
+ //!
915
+ //! @param[in] d_in
916
+ //! Pointer to the input sequence of data items
917
+ //!
918
+ //! @param[out] d_out
919
+ //! Pointer to the output aggregate
920
+ //!
921
+ //! @param[in] num_items
922
+ //! Total number of input items (i.e., length of ``d_in``)
923
+ //!
924
+ //! @param[in] stream
925
+ //! @rst
926
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
927
+ //! @endrst
928
+ template <typename InputIteratorT, typename OutputIteratorT, typename NumItemsT>
929
+ CUB_RUNTIME_FUNCTION static cudaError_t
930
+ Min(void* d_temp_storage,
931
+ size_t& temp_storage_bytes,
932
+ InputIteratorT d_in,
933
+ OutputIteratorT d_out,
934
+ NumItemsT num_items,
935
+ cudaStream_t stream = 0)
936
+ {
937
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceReduce::Min");
938
+
939
+ using OffsetT = detail::choose_offset_t<NumItemsT>; // Signed integer type for global offsets
940
+ using InputT = detail::it_value_t<InputIteratorT>;
941
+ using InitT = InputT;
942
+ using limits_t = ::cuda::std::numeric_limits<InitT>;
943
+ #ifndef CCCL_SUPPRESS_NUMERIC_LIMITS_CHECK_IN_CUB_DEVICE_REDUCE_MIN_MAX
944
+ static_assert(limits_t::is_specialized,
945
+ "cub::DeviceReduce::Min uses cuda::std::numeric_limits<InputIteratorT::value_type>::max() as initial "
946
+ "value, but cuda::std::numeric_limits is not specialized for the iterator's value type. This is "
947
+ "probably a bug and you should specialize cuda::std::numeric_limits. Define "
948
+ "CCCL_SUPPRESS_NUMERIC_LIMITS_CHECK_IN_CUB_DEVICE_REDUCE_MIN_MAX to suppress this check.");
949
+ #endif // CCCL_SUPPRESS_NUMERIC_LIMITS_CHECK_IN_CUB_DEVICE_REDUCE_MIN_MAX
950
+
951
+ return DispatchReduce<InputIteratorT, OutputIteratorT, OffsetT, ::cuda::minimum<>, InitT>::Dispatch(
952
+ d_temp_storage,
953
+ temp_storage_bytes,
954
+ d_in,
955
+ d_out,
956
+ static_cast<OffsetT>(num_items),
957
+ ::cuda::minimum<>{},
958
+ limits_t::max(),
959
+ stream);
960
+ }
961
+
962
+ //! @rst
963
+ //! Computes a device-wide minimum using the less-than (``<``) operator. The result is written to the output
964
+ //! iterator.
965
+ //!
966
+ //! - Uses ``cuda::std::numeric_limits<T>::max()`` as the initial value of the reduction.
967
+ //! - Provides determinism based on the environment's determinism requirements.
968
+ //! To request "run-to-run" determinism, pass ``cuda::execution::require(cuda::execution::determinism::run_to_run)``
969
+ //! as the `env` parameter.
970
+ //! - The range ``[d_in, d_in + num_items)`` shall not overlap ``d_out``.
971
+ //!
972
+ //! Snippet
973
+ //! +++++++++++++++++++++++++++++++++++++++++++++
974
+ //!
975
+ //! The code snippet below illustrates the min-reduction of a device vector of ``int`` data elements.
976
+ //!
977
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_reduce_env_api.cu
978
+ //! :language: c++
979
+ //! :dedent:
980
+ //! :start-after: example-begin min-env-determinism
981
+ //! :end-before: example-end min-env-determinism
982
+ //!
983
+ //! @endrst
984
+ //!
985
+ //! @tparam InputIteratorT
986
+ //! **[inferred]** Random-access input iterator type for reading input items @iterator
987
+ //!
988
+ //! @tparam OutputIteratorT
989
+ //! **[inferred]** Output iterator type for recording the reduced aggregate @iterator
990
+ //!
991
+ //! @tparam NumItemsT
992
+ //! **[inferred]** Type of num_items
993
+ //!
994
+ //! @tparam EnvT
995
+ //! **[inferred]** Execution environment type. Default is `cuda::std::execution::env<>`.
996
+ //!
997
+ //! @param[in] d_in
998
+ //! Pointer to the input sequence of data items
999
+ //!
1000
+ //! @param[out] d_out
1001
+ //! Pointer to the output aggregate
1002
+ //!
1003
+ //! @param[in] num_items
1004
+ //! Total number of input items (i.e., length of ``d_in``)
1005
+ //!
1006
+ //! @param[in] env
1007
+ //! @rst
1008
+ //! **[optional]** Execution environment. Default is `cuda::std::execution::env{}`.
1009
+ //! @endrst
1010
template <typename InputIteratorT,
          typename OutputIteratorT,
          typename NumItemsT,
          typename EnvT = ::cuda::std::execution::env<>>
[[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t
Min(InputIteratorT d_in, OutputIteratorT d_out, NumItemsT num_items, EnvT env = {})
{
  _CCCL_NVTX_RANGE_SCOPE("cub::DeviceReduce::Min");

  // A determinism property placed directly on the environment is rejected at compile time;
  // it is only looked up inside the environment's requirements (see below).
  static_assert(!_CUDA_STD_EXEC::__queryable_with<EnvT, _CUDA_EXEC::determinism::__get_determinism_t>,
                "Determinism should be used inside requires to have an effect.");

  // Requirements carried by the environment (an empty env when none are present) ...
  using env_requirements_t =
    _CUDA_STD_EXEC::__query_result_or_t<EnvT, _CUDA_EXEC::__get_requirements_t, _CUDA_STD_EXEC::env<>>;
  // ... and the determinism level they ask for, with run_to_run as the fallback.
  using requested_level_t =
    _CUDA_STD_EXEC::__query_result_or_t<env_requirements_t,
                                        _CUDA_EXEC::determinism::__get_determinism_t,
                                        _CUDA_EXEC::determinism::run_to_run_t>;

  // gpu_to_gpu determinism is refused up front because it is not properly implemented
  static_assert(!::cuda::std::is_same_v<requested_level_t, ::cuda::execution::determinism::gpu_to_gpu_t>,
                "gpu_to_gpu determinism is not supported");

  // TODO(NaderAlAwar): Relax this once non-deterministic implementation for min / max is available
  // Until then the reduction is always dispatched as run_to_run, whatever level was requested.
  using applied_level_t = ::cuda::execution::determinism::run_to_run_t;

  // Tuning taken from the environment (an empty env when none is present)
  using tuning_t = _CUDA_STD_EXEC::__query_result_or_t<EnvT, _CUDA_EXEC::__get_tuning_t, _CUDA_STD_EXEC::env<>>;

  // Type of the initial value: the output iterator's value type, or the input value type when that is void
  using init_value_t = cub::detail::non_void_value_t<OutputIteratorT, cub::detail::it_value_t<InputIteratorT>>;
  using init_limits_t = ::cuda::std::numeric_limits<init_value_t>;

  // Stream and memory resource come from the environment, each with a default fallback
  auto env_stream   = _CUDA_STD_EXEC::__query_or(env, ::cuda::get_stream, ::cuda::stream_ref{cudaStream_t{}});
  auto env_resource = _CUDA_STD_EXEC::__query_or(env, ::cuda::mr::__get_memory_resource, detail::device_memory_resource{});

  void* scratch        = nullptr;
  size_t scratch_bytes = 0;

  // Pass 1: with a null scratch pointer this only reports the required temporary storage size
  cudaError_t status = reduce_impl<tuning_t>(
    scratch,
    scratch_bytes,
    d_in,
    d_out,
    num_items,
    ::cuda::minimum<>{},
    ::cuda::std::identity{},
    init_limits_t::max(),
    applied_level_t{},
    env_stream.get());
  if (cudaSuccess != status)
  {
    return status;
  }

  // TODO(gevtushenko): use uninitialized buffer when it's available
  status = CubDebug(detail::temporary_storage::allocate(env_stream, scratch, scratch_bytes, env_resource));
  if (cudaSuccess != status)
  {
    return status;
  }

  // Pass 2: the actual min-reduction, seeded with numeric_limits::max()
  status = reduce_impl<tuning_t>(
    scratch,
    scratch_bytes,
    d_in,
    d_out,
    num_items,
    ::cuda::minimum<>{},
    ::cuda::std::identity{},
    init_limits_t::max(),
    applied_level_t{},
    env_stream.get());

  // The scratch buffer is released even when the reduction failed, so that it cannot leak
  const cudaError_t release_status =
    CubDebug(detail::temporary_storage::deallocate(env_stream, scratch, scratch_bytes, env_resource));

  // A reduction failure happened first and therefore wins over a deallocation failure
  return (cudaSuccess != status) ? status : release_status;
}
1098
+
1099
+ //! @rst
1100
+ //! Finds the first device-wide minimum using the less-than (``<``) operator and also returns the index of that item.
1101
+ //!
1102
+ //! - The minimum is written to ``d_min_out``
1103
+ //! - The offset of the returned item is written to ``d_index_out``, the offset type being written is of type
1104
+ //! ``cuda::std::int64_t``.
1105
+ //! - For zero-length inputs, ``cuda::std::numeric_limits<T>::max()`` is written to ``d_min_out`` and the index
1106
+ //! ``1`` is written to ``d_index_out``.
1107
+ //! - Does not support ``<`` operators that are non-commutative.
1108
+ //! - Provides "run-to-run" determinism for pseudo-associative reduction
1109
+ //! (e.g., addition of floating point types) on the same GPU device.
1110
+ //! However, results for pseudo-associative reduction may be inconsistent
1111
+ //! from one device to another device of a different compute-capability
1112
+ //! because CUB can employ different tile-sizing for different architectures.
1113
+ //! - The range ``[d_in, d_in + num_items)`` shall not overlap ``d_min_out`` nor ``d_index_out``.
1114
+ //! - @devicestorage
1115
+ //!
1116
+ //! Snippet
1117
+ //! +++++++++++++++++++++++++++++++++++++++++++++
1118
+ //!
1119
+ //! The code snippet below illustrates the argmin-reduction of a device vector
1120
+ //! of ``int`` data elements.
1121
+ //!
1122
+ //! .. code-block:: c++
1123
+ //!
1124
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_reduce.cuh>
1125
+ //! #include <cuda/std/cstdint>
1126
+ //!
1127
+ //! // Declare, allocate, and initialize device-accessible pointers
1128
+ //! // for input and output
1129
+ //! int num_items; // e.g., 7
1130
+ //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
1131
+ //! int *d_min_out; // memory for the minimum value
1132
+ //! cuda::std::int64_t *d_index_out; // memory for the index of the returned value
1133
+ //! ...
1134
+ //!
1135
+ //! // Determine temporary device storage requirements
1136
+ //! void *d_temp_storage = nullptr;
1137
+ //! size_t temp_storage_bytes = 0;
1138
+ //! cub::DeviceReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_min_out, d_index_out,
1139
+ //! num_items);
1140
+ //!
1141
+ //! // Allocate temporary storage
1142
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
1143
+ //!
1144
+ //! // Run argmin-reduction
1145
+ //! cub::DeviceReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_min_out, d_index_out,
1146
+ //! num_items);
1147
+ //!
1148
+ //! // d_min_out <-- 0
1149
+ //! // d_index_out <-- 5
1150
+ //!
1151
+ //! @endrst
1152
+ //!
1153
+ //! @tparam InputIteratorT
1154
+ //! **[inferred]** Random-access input iterator type for reading input items
1155
+ //! (of some type `T`) @iterator
1156
+ //!
1157
+ //! @tparam ExtremumOutIteratorT
1158
+ //! **[inferred]** Output iterator type for recording minimum value
1159
+ //!
1160
+ //! @tparam IndexOutIteratorT
1161
+ //! **[inferred]** Output iterator type for recording index of the returned value
1162
+ //!
1163
+ //! @param[in] d_temp_storage
1164
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
1165
+ //! required allocation size is written to `temp_storage_bytes` and no work is done.
1166
+ //!
1167
+ //! @param[in,out] temp_storage_bytes
1168
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
1169
+ //!
1170
+ //! @param[in] d_in
1171
+ //! Iterator to the input sequence of data items
1172
+ //!
1173
+ //! @param[out] d_min_out
1174
+ //! Iterator to which the minimum value is written
1175
+ //!
1176
+ //! @param[out] d_index_out
1177
+ //! Iterator to which the index of the returned value is written
1178
+ //!
1179
+ //! @param[in] num_items
1180
+ //! Total number of input items (i.e., length of ``d_in``)
1181
+ //!
1182
+ //! @param[in] stream
1183
+ //! @rst
1184
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
1185
+ //! @endrst
1186
template <typename InputIteratorT, typename ExtremumOutIteratorT, typename IndexOutIteratorT>
CUB_RUNTIME_FUNCTION static cudaError_t ArgMin(
  void* d_temp_storage,
  size_t& temp_storage_bytes,
  InputIteratorT d_in,
  ExtremumOutIteratorT d_min_out,
  IndexOutIteratorT d_index_out,
  ::cuda::std::int64_t num_items,
  cudaStream_t stream = 0)
{
  _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceReduce::ArgMin");

  // Value type read from the input iterator
  using in_value_t = cub::detail::it_value_t<InputIteratorT>;

  // Extremum type: the extremum iterator's value type, or the input value type when that is void
  using extremum_t = detail::non_void_value_t<ExtremumOutIteratorT, in_value_t>;

  // Offsets: `int` inside the kernel / within one partition, 64-bit across [d_in, d_in + num_items)
  using partition_offset_t = int;
  using global_offset_t    = ::cuda::std::int64_t;

  // Reduction operator
  using reduce_op_t = cub::ArgMin;

  // The reduction starts from the largest representable input value
  extremum_t seed{::cuda::std::numeric_limits<in_value_t>::max()};

  // Tabulate output iterator that unzips the result and writes it to the user-provided output iterators
  auto unzip_out = THRUST_NS_QUALIFIER::make_tabulate_output_iterator(
    detail::reduce::unzip_and_write_arg_extremum_op<ExtremumOutIteratorT, IndexOutIteratorT>{d_min_out, d_index_out});

  // Streaming arg-reduce dispatcher, named once for readability
  using dispatch_t = detail::reduce::dispatch_streaming_arg_reduce_t<
    InputIteratorT,
    decltype(unzip_out),
    partition_offset_t,
    global_offset_t,
    reduce_op_t,
    extremum_t>;

  return dispatch_t::Dispatch(
    d_temp_storage,
    temp_storage_bytes,
    d_in,
    unzip_out,
    static_cast<global_offset_t>(num_items),
    reduce_op_t{},
    seed,
    stream);
}
1236
+
1237
+ //! @rst
1238
+ //! Finds the first device-wide minimum using the less-than (``<``) operator and also returns the index of that item.
1239
+ //!
1240
+ //! - The minimum is written to ``d_min_out``
1241
+ //! - The offset of the returned item is written to ``d_index_out``, the offset type being written is of type
1242
+ //! ``cuda::std::int64_t``.
1243
+ //! - For zero-length inputs, ``cuda::std::numeric_limits<T>::max()`` is written to ``d_min_out`` and the index
1244
+ //! ``1`` is written to ``d_index_out``.
1245
+ //! - Does not support ``<`` operators that are non-commutative.
1246
+ //! - Provides determinism based on the environment's determinism requirements.
1247
+ //! To request "run-to-run" determinism, pass ``cuda::execution::require(cuda::execution::determinism::run_to_run)``
1248
+ //! as the `env` parameter.
1249
+ //! - The range ``[d_in, d_in + num_items)`` shall not overlap ``d_min_out`` nor ``d_index_out``.
1250
+ //!
1251
+ //! Snippet
1252
+ //! +++++++++++++++++++++++++++++++++++++++++++++
1253
+ //!
1254
+ //! The code snippet below illustrates the argmin-reduction of a device vector of ``int`` data elements.
1255
+ //!
1256
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_reduce_env_api.cu
1257
+ //! :language: c++
1258
+ //! :dedent:
1259
+ //! :start-after: example-begin argmin-env-determinism
1260
+ //! :end-before: example-end argmin-env-determinism
1261
+ //!
1262
+ //! @endrst
1263
+ //!
1264
+ //! @tparam InputIteratorT
1265
+ //! **[inferred]** Random-access input iterator type for reading input items
1266
+ //! (of some type `T`) @iterator
1267
+ //!
1268
+ //! @tparam ExtremumOutIteratorT
1269
+ //! **[inferred]** Output iterator type for recording minimum value
1270
+ //!
1271
+ //! @tparam IndexOutIteratorT
1272
+ //! **[inferred]** Output iterator type for recording index of the returned value
1273
+ //!
1274
+ //! @tparam EnvT
1275
+ //! **[inferred]** Execution environment type. Default is ``cuda::std::execution::env<>``.
1276
+ //!
1277
+ //! @param[in] d_in
1278
+ //! Iterator to the input sequence of data items
1279
+ //!
1280
+ //! @param[out] d_min_out
1281
+ //! Iterator to which the minimum value is written
1282
+ //!
1283
+ //! @param[out] d_index_out
1284
+ //! Iterator to which the index of the returned value is written
1285
+ //!
1286
+ //! @param[in] num_items
1287
+ //! Total number of input items (i.e., length of ``d_in``)
1288
+ //!
1289
+ //! @param[in] env
1290
+ //! @rst
1291
+ //! **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``.
1292
+ //! @endrst
1293
template <typename InputIteratorT,
          typename ExtremumOutIteratorT,
          typename IndexOutIteratorT,
          typename EnvT = ::cuda::std::execution::env<>>
[[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t
ArgMin(InputIteratorT d_in,
       ExtremumOutIteratorT d_min_out,
       IndexOutIteratorT d_index_out,
       ::cuda::std::int64_t num_items,
       EnvT env = {})
{
  _CCCL_NVTX_RANGE_SCOPE("cub::DeviceReduce::ArgMin");

  // A determinism property placed directly on the environment is rejected at compile time;
  // it is only looked up inside the environment's requirements.
  static_assert(!_CUDA_STD_EXEC::__queryable_with<EnvT, _CUDA_EXEC::determinism::__get_determinism_t>,
                "Determinism should be used inside requires to have an effect.");
  using requirements_t =
    _CUDA_STD_EXEC::__query_result_or_t<EnvT, _CUDA_EXEC::__get_requirements_t, _CUDA_STD_EXEC::env<>>;
  using requested_determinism_t =
    _CUDA_STD_EXEC::__query_result_or_t<requirements_t, //
                                        _CUDA_EXEC::determinism::__get_determinism_t,
                                        _CUDA_EXEC::determinism::run_to_run_t>;

  // Static assert to reject gpu_to_gpu determinism since it's not properly implemented
  static_assert(!::cuda::std::is_same_v<requested_determinism_t, ::cuda::execution::determinism::gpu_to_gpu_t>,
                "gpu_to_gpu determinism is not supported");

  // TODO(NaderAlAwar): Relax this once non-deterministic implementation for min / max is available
  // NOTE(review): the requested determinism is only validated above; neither a determinism level
  // nor a tuning from `env` is forwarded to the dispatcher below. The aliases that used to name
  // them here were never referenced and have been dropped.

  // Query relevant properties from the environment (stream and memory resource, each with a default)
  auto stream = _CUDA_STD_EXEC::__query_or(env, ::cuda::get_stream, ::cuda::stream_ref{cudaStream_t{}});
  auto mr     = _CUDA_STD_EXEC::__query_or(env, ::cuda::mr::__get_memory_resource, detail::device_memory_resource{});

  void* d_temp_storage      = nullptr;
  size_t temp_storage_bytes = 0;

  // Reduction operation
  using ReduceOpT = cub::ArgMin;

  // The input type
  using InputValueT = cub::detail::it_value_t<InputIteratorT>;

  // Offset type used within the kernel and to index within one partition
  using PerPartitionOffsetT = int;

  // Offset type used to index within the total input in the range [d_in, d_in + num_items)
  using GlobalOffsetT = ::cuda::std::int64_t;

  // The value type used for the extremum
  using OutputExtremumT = detail::non_void_value_t<ExtremumOutIteratorT, InputValueT>;
  using InitT           = OutputExtremumT;

  // Initial value
  OutputExtremumT initial_value{::cuda::std::numeric_limits<InputValueT>::max()};

  // Tabulate output iterator that unzips the result and writes it to the user-provided output iterators
  auto out_it = THRUST_NS_QUALIFIER::make_tabulate_output_iterator(
    detail::reduce::unzip_and_write_arg_extremum_op<ExtremumOutIteratorT, IndexOutIteratorT>{d_min_out, d_index_out});

  // Dispatcher shared by the size query and the actual run, so both are guaranteed to use the same type
  using DispatchT = detail::reduce::dispatch_streaming_arg_reduce_t<
    InputIteratorT,
    decltype(out_it),
    PerPartitionOffsetT,
    GlobalOffsetT,
    ReduceOpT,
    InitT>;

  // Query the required temporary storage size (d_temp_storage is still nullptr here)
  cudaError_t error = DispatchT::Dispatch(
    d_temp_storage,
    temp_storage_bytes,
    d_in,
    out_it,
    static_cast<GlobalOffsetT>(num_items),
    ReduceOpT{},
    initial_value,
    stream.get());
  if (error != cudaSuccess)
  {
    return error;
  }

  // TODO(gevtushenko): use uninitialized buffer when it's available
  error = CubDebug(detail::temporary_storage::allocate(stream, d_temp_storage, temp_storage_bytes, mr));
  if (error != cudaSuccess)
  {
    return error;
  }

  // Run the algorithm
  error = DispatchT::Dispatch(
    d_temp_storage,
    temp_storage_bytes,
    d_in,
    out_it,
    static_cast<GlobalOffsetT>(num_items),
    ReduceOpT{},
    initial_value,
    stream.get());

  // Try to deallocate regardless of the error to avoid memory leaks
  cudaError_t deallocate_error =
    CubDebug(detail::temporary_storage::deallocate(stream, d_temp_storage, temp_storage_bytes, mr));

  if (error != cudaSuccess)
  {
    // Reduction error takes precedence over deallocation error since it happens first
    return error;
  }

  return deallocate_error;
}
1402
+
1403
+ //! @rst
1404
+ //! Finds the first device-wide minimum using the less-than (``<``) operator, also returning the index of that item.
1405
+ //!
1406
+ //! - The output value type of ``d_out`` is ``cub::KeyValuePair<int, T>``
1407
+ //! (assuming the value type of ``d_in`` is ``T``)
1408
+ //!
1409
+ //! - The minimum is written to ``d_out.value`` and its offset in the input array is written to ``d_out.key``.
1410
+ //! - The ``{1, cuda::std::numeric_limits<T>::max()}`` tuple is produced for zero-length inputs
1411
+ //!
1412
+ //! - Does not support ``<`` operators that are non-commutative.
1413
+ //! - Provides "run-to-run" determinism for pseudo-associative reduction
1414
+ //! (e.g., addition of floating point types) on the same GPU device.
1415
+ //! However, results for pseudo-associative reduction may be inconsistent
1416
+ //! from one device to another device of a different compute-capability
1417
+ //! because CUB can employ different tile-sizing for different architectures.
1418
+ //! - The range ``[d_in, d_in + num_items)`` shall not overlap ``d_out``.
1419
+ //! - @devicestorage
1420
+ //!
1421
+ //! Snippet
1422
+ //! +++++++++++++++++++++++++++++++++++++++++++++
1423
+ //!
1424
+ //! The code snippet below illustrates the argmin-reduction of a device vector
1425
+ //! of ``int`` data elements.
1426
+ //!
1427
+ //! .. code-block:: c++
1428
+ //!
1429
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_reduce.cuh>
1430
+ //!
1431
+ //! // Declare, allocate, and initialize device-accessible pointers
1432
+ //! // for input and output
1433
+ //! int num_items; // e.g., 7
1434
+ //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
1435
+ //! KeyValuePair<int, int> *d_argmin; // e.g., [{-,-}]
1436
+ //! ...
1437
+ //!
1438
+ //! // Determine temporary device storage requirements
1439
+ //! void *d_temp_storage = nullptr;
1440
+ //! size_t temp_storage_bytes = 0;
1441
+ //! cub::DeviceReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_argmin, num_items);
1442
+ //!
1443
+ //! // Allocate temporary storage
1444
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
1445
+ //!
1446
+ //! // Run argmin-reduction
1447
+ //! cub::DeviceReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_argmin, num_items);
1448
+ //!
1449
+ //! // d_argmin <-- [{5, 0}]
1450
+ //!
1451
+ //! @endrst
1452
+ //!
1453
+ //! @tparam InputIteratorT
1454
+ //! **[inferred]** Random-access input iterator type for reading input items
1455
+ //! (of some type `T`) @iterator
1456
+ //!
1457
+ //! @tparam OutputIteratorT
1458
+ //! **[inferred]** Output iterator type for recording the reduced aggregate
1459
+ //! (having value type ``cub::KeyValuePair<int, T>``) @iterator
1460
+ //!
1461
+ //! @param[in] d_temp_storage
1462
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
1463
+ //! required allocation size is written to `temp_storage_bytes` and no work is done.
1464
+ //!
1465
+ //! @param[in,out] temp_storage_bytes
1466
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
1467
+ //!
1468
+ //! @param[in] d_in
1469
+ //! Pointer to the input sequence of data items
1470
+ //!
1471
+ //! @param[out] d_out
1472
+ //! Pointer to the output aggregate
1473
+ //!
1474
+ //! @param[in] num_items
1475
+ //! Total number of input items (i.e., length of ``d_in``)
1476
+ //!
1477
+ //! @param[in] stream
1478
+ //! @rst
1479
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
1480
+ //! @endrst
1481
template <typename InputIteratorT, typename OutputIteratorT>
CCCL_DEPRECATED_BECAUSE("CUB has superseded this interface in favor of the ArgMin interface that takes two separate "
                        "iterators: one iterator to which the extremum is written and another iterator to which the "
                        "index of the found extremum is written. ") CUB_RUNTIME_FUNCTION static cudaError_t
ArgMin(void* d_temp_storage,
       size_t& temp_storage_bytes,
       InputIteratorT d_in,
       OutputIteratorT d_out,
       int num_items,
       cudaStream_t stream = 0)
{
  _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceReduce::ArgMin");

  // Global offsets are plain signed `int` in this overload
  using offset_t = int;

  // Value type read from the input iterator
  using in_value_t = cub::detail::it_value_t<InputIteratorT>;

  // Index/value pair: the output iterator's value type, or KeyValuePair<offset, input value> when that is void.
  // The same pair type is used for accumulation.
  using pair_t = cub::detail::non_void_value_t<OutputIteratorT, KeyValuePair<offset_t, in_value_t>>;

  // Initial-value wrapper around the accumulator type
  using init_t = detail::reduce::empty_problem_init_t<pair_t>;

  // Value member type of the pair
  using pair_value_t = typename pair_t::Value;

  // Input wrapper that produces index-value tuples from the raw input
  using indexed_input_t = ArgIndexInputIterator<InputIteratorT, offset_t, pair_value_t>;

  // Reduction operator
  using reduce_op_t = cub::ArgMin;

  using dispatch_t = DispatchReduce<indexed_input_t, OutputIteratorT, offset_t, reduce_op_t, init_t, pair_t>;

  indexed_input_t indexed_input(d_in);

  // Initial value: the pair {1, numeric_limits<input value>::max()}
  init_t seed{pair_t(1, ::cuda::std::numeric_limits<in_value_t>::max())};

  return dispatch_t::Dispatch(
    d_temp_storage, temp_storage_bytes, indexed_input, d_out, num_items, reduce_op_t(), seed, stream);
}
1521
+
1522
+ //! @rst
1523
+ //! Computes a device-wide maximum using the greater-than (``>``) operator.
1524
+ //!
1525
+ //! - Uses ``cuda::std::numeric_limits<T>::lowest()`` as the initial value of the reduction.
1526
+ //! - Does not support ``>`` operators that are non-commutative.
1527
+ //! - Provides "run-to-run" determinism for pseudo-associative reduction
1528
+ //! (e.g., addition of floating point types) on the same GPU device.
1529
+ //! However, results for pseudo-associative reduction may be inconsistent
1530
+ //! from one device to another device of a different compute-capability
1531
+ //! because CUB can employ different tile-sizing for different architectures.
1532
+ //! - The range ``[d_in, d_in + num_items)`` shall not overlap ``d_out``.
1533
+ //! - @devicestorage
1534
+ //!
1535
+ //! Snippet
1536
+ //! +++++++++++++++++++++++++++++++++++++++++++++
1537
+ //!
1538
+ //! The code snippet below illustrates the max-reduction of a device vector of ``int`` data elements.
1539
+ //!
1540
+ //! .. code-block:: c++
1541
+ //!
1542
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_reduce.cuh>
1543
+ //!
1544
+ //! // Declare, allocate, and initialize device-accessible pointers
1545
+ //! // for input and output
1546
+ //! int num_items; // e.g., 7
1547
+ //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
1548
+ //! int *d_max; // e.g., [-]
1549
+ //! ...
1550
+ //!
1551
+ //! // Determine temporary device storage requirements
1552
+ //! void *d_temp_storage = nullptr;
1553
+ //! size_t temp_storage_bytes = 0;
1554
+ //! cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_max, num_items);
1555
+ //!
1556
+ //! // Allocate temporary storage
1557
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
1558
+ //!
1559
+ //! // Run max-reduction
1560
+ //! cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_max, num_items);
1561
+ //!
1562
+ //! // d_max <-- [9]
1563
+ //!
1564
+ //! @endrst
1565
+ //!
1566
+ //! @tparam InputIteratorT
1567
+ //! **[inferred]** Random-access input iterator type for reading input items @iterator
1568
+ //!
1569
+ //! @tparam OutputIteratorT
1570
+ //! **[inferred]** Output iterator type for recording the reduced aggregate @iterator
1571
+ //!
1572
+ //! @tparam NumItemsT
1573
+ //! **[inferred]** Type of num_items
1574
+ //!
1575
+ //! @param[in] d_temp_storage
1576
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
1577
+ //! required allocation size is written to `temp_storage_bytes` and no work is done.
1578
+ //!
1579
+ //! @param[in,out] temp_storage_bytes
1580
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
1581
+ //!
1582
+ //! @param[in] d_in
1583
+ //! Pointer to the input sequence of data items
1584
+ //!
1585
+ //! @param[out] d_out
1586
+ //! Pointer to the output aggregate
1587
+ //!
1588
+ //! @param[in] num_items
1589
+ //! Total number of input items (i.e., length of ``d_in``)
1590
+ //!
1591
+ //! @param[in] stream
1592
+ //! @rst
1593
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
1594
+ //! @endrst
1595
template <typename InputIteratorT, typename OutputIteratorT, typename NumItemsT>
CUB_RUNTIME_FUNCTION static cudaError_t
Max(void* d_temp_storage,
    size_t& temp_storage_bytes,
    InputIteratorT d_in,
    OutputIteratorT d_out,
    NumItemsT num_items,
    cudaStream_t stream = 0)
{
  _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceReduce::Max");

  // Offset type for global offsets, selected from the caller's NumItemsT
  using offset_t = detail::choose_offset_t<NumItemsT>;

  // The initial value has the input iterator's value type
  using init_value_t  = detail::it_value_t<InputIteratorT>;
  using init_limits_t = ::cuda::std::numeric_limits<init_value_t>;

  // The reduction is seeded with numeric_limits::lowest(), so the limits must be specialized
  // for the value type unless the user explicitly opts out of this check.
#ifndef CCCL_SUPPRESS_NUMERIC_LIMITS_CHECK_IN_CUB_DEVICE_REDUCE_MIN_MAX
  static_assert(init_limits_t::is_specialized,
                "cub::DeviceReduce::Max uses cuda::std::numeric_limits<InputIteratorT::value_type>::lowest() as "
                "initial value, but cuda::std::numeric_limits is not specialized for the iterator's value type. This "
                "is probably a bug and you should specialize cuda::std::numeric_limits. Define "
                "CCCL_SUPPRESS_NUMERIC_LIMITS_CHECK_IN_CUB_DEVICE_REDUCE_MIN_MAX to suppress this check.");
#endif // CCCL_SUPPRESS_NUMERIC_LIMITS_CHECK_IN_CUB_DEVICE_REDUCE_MIN_MAX

  using dispatch_t = DispatchReduce<InputIteratorT, OutputIteratorT, offset_t, ::cuda::maximum<>, init_value_t>;

  return dispatch_t::Dispatch(
    d_temp_storage,
    temp_storage_bytes,
    d_in,
    d_out,
    static_cast<offset_t>(num_items),
    ::cuda::maximum<>{},
    init_limits_t::lowest(),
    stream);
}
1629
+
1630
+ //! @rst
1631
+ //! Computes a device-wide maximum using the greater-than (``>``) operator. The result is written to the output
1632
+ //! iterator.
1633
+ //!
1634
+ //! - Uses ``cuda::std::numeric_limits<T>::lowest()`` as the initial value of the reduction.
1635
+ //! - Provides determinism based on the environment's determinism requirements.
1636
+ //! To request "run-to-run" determinism, pass ``cuda::execution::require(cuda::execution::determinism::run_to_run)``
1637
+ //! as the `env` parameter.
1638
+ //! - The range ``[d_in, d_in + num_items)`` shall not overlap ``d_out``.
1639
+ //!
1640
+ //! Snippet
1641
+ //! +++++++++++++++++++++++++++++++++++++++++++++
1642
+ //!
1643
+ //! The code snippet below illustrates the max-reduction of a device vector of ``int`` data elements.
1644
+ //!
1645
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_reduce_env_api.cu
1646
+ //! :language: c++
1647
+ //! :dedent:
1648
+ //! :start-after: example-begin max-env-determinism
1649
+ //! :end-before: example-end max-env-determinism
1650
+ //!
1651
+ //! @endrst
1652
+ //!
1653
+ //! @tparam InputIteratorT
1654
+ //! **[inferred]** Random-access input iterator type for reading input items @iterator
1655
+ //!
1656
+ //! @tparam OutputIteratorT
1657
+ //! **[inferred]** Output iterator type for recording the reduced aggregate @iterator
1658
+ //!
1659
+ //! @tparam NumItemsT
1660
+ //! **[inferred]** Type of num_items
1661
+ //!
1662
+ //! @tparam EnvT
1663
+ //! **[inferred]** Execution environment type. Default is `cuda::std::execution::env<>`.
1664
+ //!
1665
+ //! @param[in] d_in
1666
+ //! Pointer to the input sequence of data items
1667
+ //!
1668
+ //! @param[out] d_out
1669
+ //! Pointer to the output aggregate
1670
+ //!
1671
+ //! @param[in] num_items
1672
+ //! Total number of input items (i.e., length of ``d_in``)
1673
+ //!
1674
+ //! @param[in] env
1675
+ //! @rst
1676
+ //! **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``.
1677
+ //! @endrst
1678
template <typename InputIteratorT,
          typename OutputIteratorT,
          typename NumItemsT,
          typename EnvT = ::cuda::std::execution::env<>>
[[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t
Max(InputIteratorT d_in, OutputIteratorT d_out, NumItemsT num_items, EnvT env = {})
{
  _CCCL_NVTX_RANGE_SCOPE("cub::DeviceReduce::Max");

  // A determinism property placed directly on the environment is rejected at compile time;
  // it is only looked up inside the environment's requirements (see below).
  static_assert(!_CUDA_STD_EXEC::__queryable_with<EnvT, _CUDA_EXEC::determinism::__get_determinism_t>,
                "Determinism should be used inside requires to have an effect.");

  // Requirements carried by the environment (an empty env when none are present) ...
  using env_requirements_t =
    _CUDA_STD_EXEC::__query_result_or_t<EnvT, _CUDA_EXEC::__get_requirements_t, _CUDA_STD_EXEC::env<>>;
  // ... and the determinism level they ask for, with run_to_run as the fallback.
  using requested_level_t =
    _CUDA_STD_EXEC::__query_result_or_t<env_requirements_t,
                                        _CUDA_EXEC::determinism::__get_determinism_t,
                                        _CUDA_EXEC::determinism::run_to_run_t>;

  // gpu_to_gpu determinism is refused up front because it is not properly implemented
  static_assert(!::cuda::std::is_same_v<requested_level_t, ::cuda::execution::determinism::gpu_to_gpu_t>,
                "gpu_to_gpu determinism is not supported");

  // TODO(NaderAlAwar): Relax this once non-deterministic implementation for min / max is available
  // Until then the reduction is always dispatched as run_to_run, whatever level was requested.
  using applied_level_t = ::cuda::execution::determinism::run_to_run_t;

  // Tuning taken from the environment (an empty env when none is present)
  using tuning_t = _CUDA_STD_EXEC::__query_result_or_t<EnvT, _CUDA_EXEC::__get_tuning_t, _CUDA_STD_EXEC::env<>>;

  // Type of the initial value: the output iterator's value type, or the input value type when that is void
  using init_value_t = cub::detail::non_void_value_t<OutputIteratorT, cub::detail::it_value_t<InputIteratorT>>;
  using init_limits_t = ::cuda::std::numeric_limits<init_value_t>;

  // Stream and memory resource come from the environment, each with a default fallback
  auto env_stream   = _CUDA_STD_EXEC::__query_or(env, ::cuda::get_stream, ::cuda::stream_ref{cudaStream_t{}});
  auto env_resource = _CUDA_STD_EXEC::__query_or(env, ::cuda::mr::__get_memory_resource, detail::device_memory_resource{});

  void* scratch        = nullptr;
  size_t scratch_bytes = 0;

  // Pass 1: with a null scratch pointer this only reports the required temporary storage size
  cudaError_t status = reduce_impl<tuning_t>(
    scratch,
    scratch_bytes,
    d_in,
    d_out,
    num_items,
    ::cuda::maximum<>{},
    ::cuda::std::identity{},
    init_limits_t::lowest(),
    applied_level_t{},
    env_stream.get());
  if (cudaSuccess != status)
  {
    return status;
  }

  // TODO(gevtushenko): use uninitialized buffer when it's available
  status = CubDebug(detail::temporary_storage::allocate(env_stream, scratch, scratch_bytes, env_resource));
  if (cudaSuccess != status)
  {
    return status;
  }

  // Pass 2: the actual max-reduction, seeded with numeric_limits::lowest()
  status = reduce_impl<tuning_t>(
    scratch,
    scratch_bytes,
    d_in,
    d_out,
    num_items,
    ::cuda::maximum<>{},
    ::cuda::std::identity{},
    init_limits_t::lowest(),
    applied_level_t{},
    env_stream.get());

  // The scratch buffer is released even when the reduction failed, so that it cannot leak
  const cudaError_t release_status =
    CubDebug(detail::temporary_storage::deallocate(env_stream, scratch, scratch_bytes, env_resource));

  // A reduction failure happened first and therefore wins over a deallocation failure
  return (cudaSuccess != status) ? status : release_status;
}
1766
+
1767
+ //! @rst
1768
+ //! Finds the first device-wide maximum using the greater-than (``>``) operator and also returns the index of that
1769
+ //! item.
1770
+ //!
1771
+ //! - The maximum is written to ``d_max_out``
1772
+ //! - The offset of the returned item is written to ``d_index_out``, the offset type being written is of type
1773
+ //! ``cuda::std::int64_t``.
1774
+ //! - For zero-length inputs, ``cuda::std::numeric_limits<T>::max()`` is written to ``d_max_out`` and the index
1775
+ //! ``1`` is written to ``d_index_out``.
1776
+ //! - Does not support ``>`` operators that are non-commutative.
1777
+ //! - Provides "run-to-run" determinism for pseudo-associative reduction
1778
+ //! (e.g., addition of floating point types) on the same GPU device.
1779
+ //! However, results for pseudo-associative reduction may be inconsistent
1780
+ //! from one device to another device of a different compute-capability
1781
+ //! because CUB can employ different tile-sizing for different architectures.
1782
+ //! - The range ``[d_in, d_in + num_items)`` shall not overlap ``d_max_out`` nor ``d_index_out``.
1783
+ //! - @devicestorage
1784
+ //!
1785
+ //! Snippet
1786
+ //! +++++++++++++++++++++++++++++++++++++++++++++
1787
+ //!
1788
+ //! The code snippet below illustrates the argmax-reduction of a device vector
1789
+ //! of `int` data elements.
1790
+ //!
1791
+ //! .. code-block:: c++
1792
+ //!
1793
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_reduce.cuh>
1794
+ //! #include <cuda/std/cstdint>
1795
+ //!
1796
+ //! // Declare, allocate, and initialize device-accessible pointers
1797
+ //! // for input and output
1798
+ //! int num_items; // e.g., 7
1799
+ //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
1800
+ //! int *d_max_out; // memory for the maximum value
1801
+ //! cuda::std::int64_t *d_index_out; // memory for the index of the returned value
1802
+ //! ...
1803
+ //!
1804
+ //! // Determine temporary device storage requirements
1805
+ //! void *d_temp_storage = nullptr;
1806
+ //! size_t temp_storage_bytes = 0;
1807
+ //! cub::DeviceReduce::ArgMax(
1808
+ //! d_temp_storage, temp_storage_bytes, d_in, d_max_out, d_index_out, num_items);
1809
+ //!
1810
+ //! // Allocate temporary storage
1811
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
1812
+ //!
1813
+ //! // Run argmax-reduction
1814
+ //! cub::DeviceReduce::ArgMax(
1815
+ //! d_temp_storage, temp_storage_bytes, d_in, d_max_out, d_index_out, num_items);
1816
+ //!
1817
+ //! // d_max_out <-- 9
1818
+ //! // d_index_out <-- 6
1819
+ //!
1820
+ //! @endrst
1821
+ //!
1822
+ //! @tparam InputIteratorT
1823
+ //! **[inferred]** Random-access input iterator type for reading input items (of some type `T`) @iterator
1824
+ //!
1825
+ //! @tparam ExtremumOutIteratorT
1826
+ //! **[inferred]** Output iterator type for recording maximum value
1827
+ //!
1828
+ //! @tparam IndexOutIteratorT
1829
+ //! **[inferred]** Output iterator type for recording index of the returned value
1830
+ //!
1831
+ //! @param[in] d_temp_storage
1832
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
1833
+ //! required allocation size is written to `temp_storage_bytes` and no work is done.
1834
+ //!
1835
+ //! @param[in,out] temp_storage_bytes
1836
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
1837
+ //!
1838
+ //! @param[in] d_in
1839
+ //! Pointer to the input sequence of data items
1840
+ //!
1841
+ //! @param[out] d_max_out
1842
+ //! Iterator to which the maximum value is written
1843
+ //!
1844
+ //! @param[out] d_index_out
1845
+ //! Iterator to which the index of the returned value is written
1846
+ //!
1847
+ //! @param[in] num_items
1848
+ //! Total number of input items (i.e., length of ``d_in``)
1849
+ //!
1850
+ //! @param[in] stream
1851
+ //! @rst
1852
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
1853
+ //! @endrst
1854
+ template <typename InputIteratorT, typename ExtremumOutIteratorT, typename IndexOutIteratorT>
1855
+ CUB_RUNTIME_FUNCTION static cudaError_t ArgMax(
1856
+ void* d_temp_storage,
1857
+ size_t& temp_storage_bytes,
1858
+ InputIteratorT d_in,
1859
+ ExtremumOutIteratorT d_max_out,
1860
+ IndexOutIteratorT d_index_out,
1861
+ ::cuda::std::int64_t num_items,
1862
+ cudaStream_t stream = 0)
1863
+ {
1864
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceReduce::ArgMax");
1865
+
1866
+ // The input type
1867
+ using InputValueT = cub::detail::it_value_t<InputIteratorT>;
1868
+
1869
+ // Offset type used within the kernel and to index within one partition
1870
+ using PerPartitionOffsetT = int;
1871
+
1872
+ // Offset type used to index within the total input in the range [d_in, d_in + num_items)
1873
+ using GlobalOffsetT = ::cuda::std::int64_t;
1874
+
1875
+ // The value type used for the extremum
1876
+ using OutputExtremumT = detail::non_void_value_t<ExtremumOutIteratorT, InputValueT>;
1877
+ using InitT = OutputExtremumT;
1878
+
1879
+ // Reduction operation
1880
+ using ReduceOpT = cub::ArgMax;
1881
+
1882
+ // Initial value
1883
+ OutputExtremumT initial_value{::cuda::std::numeric_limits<InputValueT>::lowest()};
1884
+
1885
+ // Tabulate output iterator that unzips the result and writes it to the user-provided output iterators
1886
+ auto out_it = THRUST_NS_QUALIFIER::make_tabulate_output_iterator(
1887
+ detail::reduce::unzip_and_write_arg_extremum_op<ExtremumOutIteratorT, IndexOutIteratorT>{d_max_out, d_index_out});
1888
+
1889
+ return detail::reduce::dispatch_streaming_arg_reduce_t<
1890
+ InputIteratorT,
1891
+ decltype(out_it),
1892
+ PerPartitionOffsetT,
1893
+ GlobalOffsetT,
1894
+ ReduceOpT,
1895
+ InitT>::Dispatch(d_temp_storage,
1896
+ temp_storage_bytes,
1897
+ d_in,
1898
+ out_it,
1899
+ static_cast<GlobalOffsetT>(num_items),
1900
+ ReduceOpT{},
1901
+ initial_value,
1902
+ stream);
1903
+ }
1904
+
1905
+ //! @rst
1906
+ //! Finds the first device-wide maximum using the greater-than (``>``)
1907
+ //! operator, also returning the index of that item
1908
+ //!
1909
+ //! - The output value type of ``d_out`` is ``cub::KeyValuePair<int, T>``
1910
+ //! (assuming the value type of ``d_in`` is ``T``)
1911
+ //!
1912
+ //! - The maximum is written to ``d_out.value`` and its offset in the input
1913
+ //! array is written to ``d_out.key``.
1914
+ //! - The ``{1, cuda::std::numeric_limits<T>::lowest()}`` tuple is produced for zero-length inputs
1915
+ //!
1916
+ //! - Does not support ``>`` operators that are non-commutative.
1917
+ //! - Provides "run-to-run" determinism for pseudo-associative reduction
1918
+ //! (e.g., addition of floating point types) on the same GPU device.
1919
+ //! However, results for pseudo-associative reduction may be inconsistent
1920
+ //! from one device to another device of a different compute-capability
1921
+ //! because CUB can employ different tile-sizing for different architectures.
1922
+ //! - The range ``[d_in, d_in + num_items)`` shall not overlap ``d_out``.
1923
+ //! - @devicestorage
1924
+ //!
1925
+ //! Snippet
1926
+ //! +++++++++++++++++++++++++++++++++++++++++++++
1927
+ //!
1928
+ //! The code snippet below illustrates the argmax-reduction of a device vector
1929
+ //! of `int` data elements.
1930
+ //!
1931
+ //! .. code-block:: c++
1932
+ //!
1933
+ //! #include <cub/cub.cuh>
1934
+ //! // or equivalently <cub/device/device_reduce.cuh>
1935
+ //!
1936
+ //! // Declare, allocate, and initialize device-accessible pointers
1937
+ //! // for input and output
1938
+ //! int num_items; // e.g., 7
1939
+ //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
1940
+ //! KeyValuePair<int, int> *d_argmax; // e.g., [{-,-}]
1941
+ //! ...
1942
+ //!
1943
+ //! // Determine temporary device storage requirements
1944
+ //! void *d_temp_storage = nullptr;
1945
+ //! size_t temp_storage_bytes = 0;
1946
+ //! cub::DeviceReduce::ArgMax(
1947
+ //! d_temp_storage, temp_storage_bytes, d_in, d_argmax, num_items);
1948
+ //!
1949
+ //! // Allocate temporary storage
1950
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
1951
+ //!
1952
+ //! // Run argmax-reduction
1953
+ //! cub::DeviceReduce::ArgMax(
1954
+ //! d_temp_storage, temp_storage_bytes, d_in, d_argmax, num_items);
1955
+ //!
1956
+ //! // d_argmax <-- [{6, 9}]
1957
+ //!
1958
+ //! @endrst
1959
+ //!
1960
+ //! @tparam InputIteratorT
1961
+ //! **[inferred]** Random-access input iterator type for reading input items (of some type `T`) @iterator
1962
+ //!
1963
+ //! @tparam OutputIteratorT
1964
+ //! **[inferred]** Output iterator type for recording the reduced aggregate
1965
+ //! (having value type `cub::KeyValuePair<int, T>`) @iterator
1966
+ //!
1967
+ //! @param[in] d_temp_storage
1968
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
1969
+ //! required allocation size is written to `temp_storage_bytes` and no work is done.
1970
+ //!
1971
+ //! @param[in,out] temp_storage_bytes
1972
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
1973
+ //!
1974
+ //! @param[in] d_in
1975
+ //! Pointer to the input sequence of data items
1976
+ //!
1977
+ //! @param[out] d_out
1978
+ //! Pointer to the output aggregate
1979
+ //!
1980
+ //! @param[in] num_items
1981
+ //! Total number of input items (i.e., length of ``d_in``)
1982
+ //!
1983
+ //! @param[in] stream
1984
+ //! @rst
1985
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
1986
+ //! @endrst
1987
+ template <typename InputIteratorT, typename OutputIteratorT>
1988
+ CCCL_DEPRECATED_BECAUSE("CUB has superseded this interface in favor of the ArgMax interface that takes two separate "
1989
+ "iterators: one iterator to which the extremum is written and another iterator to which the "
1990
+ "index of the found extremum is written. ") CUB_RUNTIME_FUNCTION static cudaError_t
1991
+ ArgMax(void* d_temp_storage,
1992
+ size_t& temp_storage_bytes,
1993
+ InputIteratorT d_in,
1994
+ OutputIteratorT d_out,
1995
+ int num_items,
1996
+ cudaStream_t stream = 0)
1997
+ {
1998
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceReduce::ArgMax");
1999
+
2000
+ // Signed integer type for global offsets
2001
+ using OffsetT = int;
2002
+
2003
+ // The input type
2004
+ using InputValueT = cub::detail::it_value_t<InputIteratorT>;
2005
+
2006
+ // The output tuple type
2007
+ using OutputTupleT = cub::detail::non_void_value_t<OutputIteratorT, KeyValuePair<OffsetT, InputValueT>>;
2008
+
2009
+ using AccumT = OutputTupleT;
2010
+
2011
+ // The output value type
2012
+ using OutputValueT = typename OutputTupleT::Value;
2013
+
2014
+ using InitT = detail::reduce::empty_problem_init_t<AccumT>;
2015
+
2016
+ // Wrapped input iterator to produce index-value <OffsetT, InputT> tuples
2017
+ using ArgIndexInputIteratorT = ArgIndexInputIterator<InputIteratorT, OffsetT, OutputValueT>;
2018
+
2019
+ ArgIndexInputIteratorT d_indexed_in(d_in);
2020
+
2021
+ // Initial value
2022
+ InitT initial_value{AccumT(1, ::cuda::std::numeric_limits<InputValueT>::lowest())};
2023
+
2024
+ return DispatchReduce<ArgIndexInputIteratorT, OutputIteratorT, OffsetT, cub::ArgMax, InitT, AccumT>::Dispatch(
2025
+ d_temp_storage, temp_storage_bytes, d_indexed_in, d_out, num_items, cub::ArgMax(), initial_value, stream);
2026
+ }
2027
+
2028
+ //! @rst
2029
+ //! Finds the first device-wide maximum using the greater-than (``>``) operator and also returns the index of that
2030
+ //! item.
2031
+ //!
2032
+ //! - The maximum is written to ``d_max_out``
2033
+ //! - The offset of the returned item is written to ``d_index_out``, the offset type being written is of type
2034
+ //! ``cuda::std::int64_t``.
2035
+ //! - For zero-length inputs, ``cuda::std::numeric_limits<T>::lowest()`` is written to ``d_max_out`` and the index
2036
+ //! ``1`` is written to ``d_index_out``.
2037
+ //! - Does not support ``>`` operators that are non-commutative.
2038
+ //! - Provides determinism based on the environment's determinism requirements.
2039
+ //! To request "run-to-run" determinism, pass ``cuda::execution::require(cuda::execution::determinism::run_to_run)``
2040
+ //! as the `env` parameter.
2041
+ //! - The range ``[d_in, d_in + num_items)`` shall not overlap ``d_max_out`` nor ``d_index_out``.
2042
+ //!
2043
+ //! Snippet
2044
+ //! +++++++++++++++++++++++++++++++++++++++++++++
2045
+ //!
2046
+ //! The code snippet below illustrates the argmax-reduction of a device vector of ``int`` data elements.
2047
+ //!
2048
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_reduce_env_api.cu
2049
+ //! :language: c++
2050
+ //! :dedent:
2051
+ //! :start-after: example-begin argmax-env-determinism
2052
+ //! :end-before: example-end argmax-env-determinism
2053
+ //!
2054
+ //! @endrst
2055
+ //!
2056
+ //! @tparam InputIteratorT
2057
+ //! **[inferred]** Random-access input iterator type for reading input items
2058
+ //! (of some type `T`) @iterator
2059
+ //!
2060
+ //! @tparam ExtremumOutIteratorT
2061
+ //! **[inferred]** Output iterator type for recording maximum value
2062
+ //!
2063
+ //! @tparam IndexOutIteratorT
2064
+ //! **[inferred]** Output iterator type for recording index of the returned value
2065
+ //!
2066
+ //! @tparam EnvT
2067
+ //! **[inferred]** Execution environment type. Default is ``cuda::std::execution::env<>``.
2068
+ //!
2069
+ //! @param[in] d_in
2070
+ //! Iterator to the input sequence of data items
2071
+ //!
2072
+ //! @param[out] d_max_out
2073
+ //! Iterator to which the maximum value is written
2074
+ //!
2075
+ //! @param[out] d_index_out
2076
+ //! Iterator to which the index of the returned value is written
2077
+ //!
2078
+ //! @param[in] num_items
2079
+ //! Total number of input items (i.e., length of ``d_in``)
2080
+ //!
2081
+ //! @param[in] env
2082
+ //! @rst
2083
+ //! **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``.
2084
+ //! @endrst
2085
+ template <typename InputIteratorT,
2086
+ typename ExtremumOutIteratorT,
2087
+ typename IndexOutIteratorT,
2088
+ typename EnvT = ::cuda::std::execution::env<>>
2089
+ [[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t
2090
+ ArgMax(InputIteratorT d_in,
2091
+ ExtremumOutIteratorT d_max_out,
2092
+ IndexOutIteratorT d_index_out,
2093
+ ::cuda::std::int64_t num_items,
2094
+ EnvT env = {})
2095
+ {
2096
+ _CCCL_NVTX_RANGE_SCOPE("cub::DeviceReduce::ArgMax");
2097
+
2098
+ static_assert(!_CUDA_STD_EXEC::__queryable_with<EnvT, _CUDA_EXEC::determinism::__get_determinism_t>,
2099
+ "Determinism should be used inside requires to have an effect.");
2100
+ using requirements_t =
2101
+ _CUDA_STD_EXEC::__query_result_or_t<EnvT, _CUDA_EXEC::__get_requirements_t, _CUDA_STD_EXEC::env<>>;
2102
+ using requested_determinism_t =
2103
+ _CUDA_STD_EXEC::__query_result_or_t<requirements_t, //
2104
+ _CUDA_EXEC::determinism::__get_determinism_t,
2105
+ _CUDA_EXEC::determinism::run_to_run_t>;
2106
+
2107
+ // Static assert to reject gpu_to_gpu determinism since it's not properly implemented
2108
+ static_assert(!::cuda::std::is_same_v<requested_determinism_t, ::cuda::execution::determinism::gpu_to_gpu_t>,
2109
+ "gpu_to_gpu determinism is not supported");
2110
+
2111
+ // TODO(NaderAlAwar): Relax this once non-deterministic implementation for min / max is available
2112
+ using determinism_t = ::cuda::execution::determinism::run_to_run_t;
2113
+
2114
+ // Query relevant properties from the environment
2115
+ auto stream = _CUDA_STD_EXEC::__query_or(env, ::cuda::get_stream, ::cuda::stream_ref{cudaStream_t{}});
2116
+ auto mr = _CUDA_STD_EXEC::__query_or(env, ::cuda::mr::__get_memory_resource, detail::device_memory_resource{});
2117
+
2118
+ void* d_temp_storage = nullptr;
2119
+ size_t temp_storage_bytes = 0;
2120
+
2121
+ using tuning_t = _CUDA_STD_EXEC::__query_result_or_t<EnvT, _CUDA_EXEC::__get_tuning_t, _CUDA_STD_EXEC::env<>>;
2122
+
2123
+ // Reduction operation
2124
+ using ReduceOpT = cub::ArgMax;
2125
+ using InputValueT = cub::detail::it_value_t<InputIteratorT>;
2126
+ using PerPartitionOffsetT = int;
2127
+ using GlobalOffsetT = ::cuda::std::int64_t;
2128
+
2129
+ using OutputExtremumT = detail::non_void_value_t<ExtremumOutIteratorT, InputValueT>;
2130
+ using InitT = OutputExtremumT;
2131
+
2132
+ // Initial value: documented as the value written to d_max_out for zero-length inputs, so it must be lowest() for a max-reduction
2133
+ OutputExtremumT initial_value{::cuda::std::numeric_limits<InputValueT>::lowest()};
2134
+
2135
+ // Tabulate output iterator that unzips the result and writes it to the user-provided output iterators
2136
+ auto out_it = THRUST_NS_QUALIFIER::make_tabulate_output_iterator(
2137
+ detail::reduce::unzip_and_write_arg_extremum_op<ExtremumOutIteratorT, IndexOutIteratorT>{d_max_out, d_index_out});
2138
+
2139
+ // Query the required temporary storage size
2140
+ cudaError_t error = detail::reduce::dispatch_streaming_arg_reduce_t<
2141
+ InputIteratorT,
2142
+ decltype(out_it),
2143
+ PerPartitionOffsetT,
2144
+ GlobalOffsetT,
2145
+ ReduceOpT,
2146
+ InitT>::Dispatch(d_temp_storage,
2147
+ temp_storage_bytes,
2148
+ d_in,
2149
+ out_it,
2150
+ static_cast<GlobalOffsetT>(num_items),
2151
+ ReduceOpT{},
2152
+ initial_value,
2153
+ stream.get());
2154
+ if (error != cudaSuccess)
2155
+ {
2156
+ return error;
2157
+ }
2158
+
2159
+ // TODO(gevtushenko): use uninitialized buffer when it's available
2160
+ error = CubDebug(detail::temporary_storage::allocate(stream, d_temp_storage, temp_storage_bytes, mr));
2161
+ if (error != cudaSuccess)
2162
+ {
2163
+ return error;
2164
+ }
2165
+
2166
+ // Run the algorithm
2167
+ error = detail::reduce::dispatch_streaming_arg_reduce_t<
2168
+ InputIteratorT,
2169
+ decltype(out_it),
2170
+ PerPartitionOffsetT,
2171
+ GlobalOffsetT,
2172
+ ReduceOpT,
2173
+ InitT>::Dispatch(d_temp_storage,
2174
+ temp_storage_bytes,
2175
+ d_in,
2176
+ out_it,
2177
+ static_cast<GlobalOffsetT>(num_items),
2178
+ ReduceOpT{},
2179
+ initial_value,
2180
+ stream.get());
2181
+
2182
+ // Try to deallocate regardless of the error to avoid memory leaks
2183
+ cudaError_t deallocate_error =
2184
+ CubDebug(detail::temporary_storage::deallocate(stream, d_temp_storage, temp_storage_bytes, mr));
2185
+
2186
+ if (error != cudaSuccess)
2187
+ {
2188
+ // Reduction error takes precedence over deallocation error since it happens first
2189
+ return error;
2190
+ }
2191
+
2192
+ return deallocate_error;
2193
+ }
2194
+
2195
+ //! @rst
2196
+ //! Fuses transform and reduce operations
2197
+ //!
2198
+ //! - Does not support binary reduction operators that are non-commutative.
2199
+ //! - Provides "run-to-run" determinism for pseudo-associative reduction
2200
+ //! (e.g., addition of floating point types) on the same GPU device.
2201
+ //! However, results for pseudo-associative reduction may be inconsistent
2202
+ //! from one device to another device of a different compute-capability
2203
+ //! because CUB can employ different tile-sizing for different architectures.
2204
+ //! - The range ``[d_in, d_in + num_items)`` shall not overlap ``d_out``.
2205
+ //! - @devicestorage
2206
+ //!
2207
+ //! Snippet
2208
+ //! +++++++++++++++++++++++++++++++++++++++++++++
2209
+ //!
2210
+ //! The code snippet below illustrates a fused transform-reduction (sum of squares) of a
2211
+ //! device vector of `int` data elements.
2212
+ //!
2213
+ //! .. code-block:: c++
2214
+ //!
2215
+ //! #include <cub/cub.cuh>
2216
+ //! // or equivalently <cub/device/device_reduce.cuh>
2217
+ //!
2218
+ //! thrust::device_vector<int> in = { 1, 2, 3, 4 };
2219
+ //! thrust::device_vector<int> out(1);
2220
+ //!
2221
+ //! size_t temp_storage_bytes = 0;
2222
+ //! uint8_t *d_temp_storage = nullptr;
2223
+ //!
2224
+ //! const int init = 42;
2225
+ //!
2226
+ //! cub::DeviceReduce::TransformReduce(
2227
+ //! d_temp_storage,
2228
+ //! temp_storage_bytes,
2229
+ //! in.begin(),
2230
+ //! out.begin(),
2231
+ //! in.size(),
2232
+ //! cuda::std::plus<>{},
2233
+ //! square_t{},
2234
+ //! init);
2235
+ //!
2236
+ //! thrust::device_vector<uint8_t> temp_storage(temp_storage_bytes);
2237
+ //! d_temp_storage = temp_storage.data().get();
2238
+ //!
2239
+ //! cub::DeviceReduce::TransformReduce(
2240
+ //! d_temp_storage,
2241
+ //! temp_storage_bytes,
2242
+ //! in.begin(),
2243
+ //! out.begin(),
2244
+ //! in.size(),
2245
+ //! cuda::std::plus<>{},
2246
+ //! square_t{},
2247
+ //! init);
2248
+ //!
2249
+ //! // out[0] <-- 72
2250
+ //!
2251
+ //! @endrst
2252
+ //!
2253
+ //! @tparam InputIteratorT
2254
+ //! **[inferred]** Random-access input iterator type for reading input items @iterator
2255
+ //!
2256
+ //! @tparam OutputIteratorT
2257
+ //! **[inferred]** Output iterator type for recording the reduced aggregate @iterator
2258
+ //!
2259
+ //! @tparam ReductionOpT
2260
+ //! **[inferred]** Binary reduction functor type having member `T operator()(const T &a, const T &b)`
2261
+ //!
2262
+ //! @tparam TransformOpT
2263
+ //! **[inferred]** Unary transform functor type having member `auto operator()(const T &a)`
2264
+ //!
2265
+ //! @tparam T
2266
+ //! **[inferred]** Data element type that is convertible to the `value` type of `InputIteratorT`
2267
+ //!
2268
+ //! @tparam NumItemsT
2269
+ //! **[inferred]** Type of num_items
2270
+ //!
2271
+ //! @param[in] d_temp_storage
2272
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
2273
+ //! required allocation size is written to `temp_storage_bytes` and no work is done.
2274
+ //!
2275
+ //! @param[in,out] temp_storage_bytes
2276
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
2277
+ //!
2278
+ //! @param[in] d_in
2279
+ //! Pointer to the input sequence of data items
2280
+ //!
2281
+ //! @param[out] d_out
2282
+ //! Pointer to the output aggregate
2283
+ //!
2284
+ //! @param[in] num_items
2285
+ //! Total number of input items (i.e., length of ``d_in``)
2286
+ //!
2287
+ //! @param[in] reduction_op
2288
+ //! Binary reduction functor
2289
+ //!
2290
+ //! @param[in] transform_op
2291
+ //! Unary transform functor
2292
+ //!
2293
+ //! @param[in] init
2294
+ //! Initial value of the reduction
2295
+ //!
2296
+ //! @param[in] stream
2297
+ //! @rst
2298
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
2299
+ //! @endrst
2300
+ template <typename InputIteratorT,
2301
+ typename OutputIteratorT,
2302
+ typename ReductionOpT,
2303
+ typename TransformOpT,
2304
+ typename T,
2305
+ typename NumItemsT>
2306
+ CUB_RUNTIME_FUNCTION static cudaError_t TransformReduce(
2307
+ void* d_temp_storage,
2308
+ size_t& temp_storage_bytes,
2309
+ InputIteratorT d_in,
2310
+ OutputIteratorT d_out,
2311
+ NumItemsT num_items,
2312
+ ReductionOpT reduction_op,
2313
+ TransformOpT transform_op,
2314
+ T init,
2315
+ cudaStream_t stream = 0)
2316
+ {
2317
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceReduce::TransformReduce");
2318
+
2319
+ using OffsetT = detail::choose_offset_t<NumItemsT>;
2320
+
2321
+ return DispatchTransformReduce<InputIteratorT, OutputIteratorT, OffsetT, ReductionOpT, TransformOpT, T>::Dispatch(
2322
+ d_temp_storage,
2323
+ temp_storage_bytes,
2324
+ d_in,
2325
+ d_out,
2326
+ static_cast<OffsetT>(num_items),
2327
+ reduction_op,
2328
+ init,
2329
+ stream,
2330
+ transform_op);
2331
+ }
2332
+
2333
+ //! @rst
2334
+ //! Reduces segments of values, where segments are demarcated by corresponding runs of identical keys.
2335
+ //!
2336
+ //! This operation computes segmented reductions within ``d_values_in`` using the specified binary ``reduction_op``
2337
+ //! functor. The segments are identified by "runs" of corresponding keys in `d_keys_in`, where runs are maximal
2338
+ //! ranges of consecutive, identical keys. For the *i*\ :sup:`th` run encountered, the first key of the run and
2339
+ //! the corresponding value aggregate of that run are written to ``d_unique_out[i]`` and ``d_aggregates_out[i]``,
2340
+ //! respectively. The total number of runs encountered is written to ``d_num_runs_out``.
2341
+ //!
2342
+ //! - The ``==`` equality operator is used to determine whether keys are equivalent
2343
+ //! - Provides "run-to-run" determinism for pseudo-associative reduction
2344
+ //! (e.g., addition of floating point types) on the same GPU device.
2345
+ //! However, results for pseudo-associative reduction may be inconsistent
2346
+ //! from one device to another device of a different compute-capability
2347
+ //! because CUB can employ different tile-sizing for different architectures.
2348
+ //! - Let ``out`` be any of
2349
+ //! ``[d_unique_out, d_unique_out + *d_num_runs_out)``
2350
+ //! ``[d_aggregates_out, d_aggregates_out + *d_num_runs_out)``
2351
+ //! ``d_num_runs_out``. The ranges represented by ``out`` shall not overlap
2352
+ //! ``[d_keys_in, d_keys_in + num_items)``,
2353
+ //! ``[d_values_in, d_values_in + num_items)`` nor ``out`` in any way.
2354
+ //! - @devicestorage
2355
+ //!
2356
+ //! Snippet
2357
+ //! +++++++++++++++++++++++++++++++++++++++++++++
2358
+ //!
2359
+ //! The code snippet below illustrates the segmented reduction of ``int`` values grouped by runs of
2360
+ //! associated ``int`` keys.
2361
+ //!
2362
+ //! .. code-block:: c++
2363
+ //!
2364
+ //! #include <cub/cub.cuh>
2365
+ //! // or equivalently <cub/device/device_reduce.cuh>
2366
+ //!
2367
+ //! // CustomMin functor
2368
+ //! struct CustomMin
2369
+ //! {
2370
+ //! template <typename T>
2371
+ //! __device__ __forceinline__
2372
+ //! T operator()(const T &a, const T &b) const {
2373
+ //! return (b < a) ? b : a;
2374
+ //! }
2375
+ //! };
2376
+ //!
2377
+ //! // Declare, allocate, and initialize device-accessible pointers
2378
+ //! // for input and output
2379
+ //! int num_items; // e.g., 8
2380
+ //! int *d_keys_in; // e.g., [0, 2, 2, 9, 5, 5, 5, 8]
2381
+ //! int *d_values_in; // e.g., [0, 7, 1, 6, 2, 5, 3, 4]
2382
+ //! int *d_unique_out; // e.g., [-, -, -, -, -, -, -, -]
2383
+ //! int *d_aggregates_out; // e.g., [-, -, -, -, -, -, -, -]
2384
+ //! int *d_num_runs_out; // e.g., [-]
2385
+ //! CustomMin reduction_op;
2386
+ //! ...
2387
+ //!
2388
+ //! // Determine temporary device storage requirements
2389
+ //! void *d_temp_storage = nullptr;
2390
+ //! size_t temp_storage_bytes = 0;
2391
+ //! cub::DeviceReduce::ReduceByKey(
2392
+ //! d_temp_storage, temp_storage_bytes,
2393
+ //! d_keys_in, d_unique_out, d_values_in,
2394
+ //! d_aggregates_out, d_num_runs_out, reduction_op, num_items);
2395
+ //!
2396
+ //! // Allocate temporary storage
2397
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
2398
+ //!
2399
+ //! // Run reduce-by-key
2400
+ //! cub::DeviceReduce::ReduceByKey(
2401
+ //! d_temp_storage, temp_storage_bytes,
2402
+ //! d_keys_in, d_unique_out, d_values_in,
2403
+ //! d_aggregates_out, d_num_runs_out, reduction_op, num_items);
2404
+ //!
2405
+ //! // d_unique_out <-- [0, 2, 9, 5, 8]
2406
+ //! // d_aggregates_out <-- [0, 1, 6, 2, 4]
2407
+ //! // d_num_runs_out <-- [5]
2408
+ //!
2409
+ //! @endrst
2410
+ //!
2411
+ //! @tparam KeysInputIteratorT
2412
+ //! **[inferred]** Random-access input iterator type for reading input keys @iterator
2413
+ //!
2414
+ //! @tparam UniqueOutputIteratorT
2415
+ //! **[inferred]** Random-access output iterator type for writing unique output keys @iterator
2416
+ //!
2417
+ //! @tparam ValuesInputIteratorT
2418
+ //! **[inferred]** Random-access input iterator type for reading input values @iterator
2419
+ //!
2420
+ //! @tparam AggregatesOutputIteratorT
2421
+ //! **[inferred]** Random-access output iterator type for writing output value aggregates @iterator
2422
+ //!
2423
+ //! @tparam NumRunsOutputIteratorT
2424
+ //! **[inferred]** Output iterator type for recording the number of runs encountered @iterator
2425
+ //!
2426
+ //! @tparam ReductionOpT
2427
+ //! **[inferred]** Binary reduction functor type having member `T operator()(const T &a, const T &b)`
2428
+ //!
2429
+ //! @tparam NumItemsT
2430
+ //! **[inferred]** Type of num_items
2431
+ //!
2432
+ //! @param[in] d_temp_storage
2433
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
2434
+ //! required allocation size is written to `temp_storage_bytes` and no work is done.
2435
+ //!
2436
+ //! @param[in,out] temp_storage_bytes
2437
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
2438
+ //!
2439
+ //! @param[in] d_keys_in
2440
+ //! Pointer to the input sequence of keys
2441
+ //!
2442
+ //! @param[out] d_unique_out
2443
+ //! Pointer to the output sequence of unique keys (one key per run)
2444
+ //!
2445
+ //! @param[in] d_values_in
2446
+ //! Pointer to the input sequence of corresponding values
2447
+ //!
2448
+ //! @param[out] d_aggregates_out
2449
+ //! Pointer to the output sequence of value aggregates
2450
+ //! (one aggregate per run)
2451
+ //!
2452
+ //! @param[out] d_num_runs_out
2453
+ //! Pointer to total number of runs encountered
2454
+ //! (i.e., the length of ``d_unique_out``)
2455
+ //!
2456
+ //! @param[in] reduction_op
2457
+ //! Binary reduction functor
2458
+ //!
2459
+ //! @param[in] num_items
2460
+ //! Total number of associated key+value pairs
2461
+ //! (i.e., the length of ``d_keys_in`` and ``d_values_in``)
2462
+ //!
2463
+ //! @param[in] stream
2464
+ //! @rst
2465
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
2466
+ //! @endrst
2467
+ template <typename KeysInputIteratorT,
2468
+ typename UniqueOutputIteratorT,
2469
+ typename ValuesInputIteratorT,
2470
+ typename AggregatesOutputIteratorT,
2471
+ typename NumRunsOutputIteratorT,
2472
+ typename ReductionOpT,
2473
+ typename NumItemsT>
2474
+ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t ReduceByKey(
2475
+ void* d_temp_storage,
2476
+ size_t& temp_storage_bytes,
2477
+ KeysInputIteratorT d_keys_in,
2478
+ UniqueOutputIteratorT d_unique_out,
2479
+ ValuesInputIteratorT d_values_in,
2480
+ AggregatesOutputIteratorT d_aggregates_out,
2481
+ NumRunsOutputIteratorT d_num_runs_out,
2482
+ ReductionOpT reduction_op,
2483
+ NumItemsT num_items,
2484
+ cudaStream_t stream = 0)
2485
+ {
2486
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceReduce::ReduceByKey");
2487
+
2488
+ // Signed integer type for global offsets
2489
+ using OffsetT = detail::choose_offset_t<NumItemsT>;
2490
+
2491
+ // FlagT iterator type (not used)
2492
+
2493
+ // Selection op (not used)
2494
+
2495
+ // Default == operator
2496
+ using EqualityOp = ::cuda::std::equal_to<>;
2497
+
2498
+ return DispatchReduceByKey<
2499
+ KeysInputIteratorT,
2500
+ UniqueOutputIteratorT,
2501
+ ValuesInputIteratorT,
2502
+ AggregatesOutputIteratorT,
2503
+ NumRunsOutputIteratorT,
2504
+ EqualityOp,
2505
+ ReductionOpT,
2506
+ OffsetT>::Dispatch(d_temp_storage,
2507
+ temp_storage_bytes,
2508
+ d_keys_in,
2509
+ d_unique_out,
2510
+ d_values_in,
2511
+ d_aggregates_out,
2512
+ d_num_runs_out,
2513
+ EqualityOp(),
2514
+ reduction_op,
2515
+ static_cast<OffsetT>(num_items),
2516
+ stream);
2517
+ }
2518
+ };
2519
+ CUB_NAMESPACE_END