cuda-cccl 0.4.3__cp312-cp312-manylinux_2_26_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2024):
  1. cuda/cccl/__init__.py +27 -0
  2. cuda/cccl/_cuda_version_utils.py +24 -0
  3. cuda/cccl/cooperative/__init__.py +9 -0
  4. cuda/cccl/cooperative/experimental/__init__.py +24 -0
  5. cuda/cccl/headers/__init__.py +7 -0
  6. cuda/cccl/headers/include/__init__.py +1 -0
  7. cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +233 -0
  8. cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +1158 -0
  9. cuda/cccl/headers/include/cub/agent/agent_for.cuh +55 -0
  10. cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +699 -0
  11. cuda/cccl/headers/include/cub/agent/agent_merge.cuh +365 -0
  12. cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +721 -0
  13. cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +756 -0
  14. cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +277 -0
  15. cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +715 -0
  16. cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +546 -0
  17. cuda/cccl/headers/include/cub/agent/agent_reduce.cuh +592 -0
  18. cuda/cccl/headers/include/cub/agent/agent_reduce_by_key.cuh +780 -0
  19. cuda/cccl/headers/include/cub/agent/agent_rle.cuh +1092 -0
  20. cuda/cccl/headers/include/cub/agent/agent_scan.cuh +564 -0
  21. cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +448 -0
  22. cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +263 -0
  23. cuda/cccl/headers/include/cub/agent/agent_segmented_scan.cuh +292 -0
  24. cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +1090 -0
  25. cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +346 -0
  26. cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +584 -0
  27. cuda/cccl/headers/include/cub/agent/agent_topk.cuh +762 -0
  28. cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +599 -0
  29. cuda/cccl/headers/include/cub/agent/single_pass_scan_operators.cuh +1384 -0
  30. cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +939 -0
  31. cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +1200 -0
  32. cuda/cccl/headers/include/cub/block/block_exchange.cuh +1279 -0
  33. cuda/cccl/headers/include/cub/block/block_histogram.cuh +396 -0
  34. cuda/cccl/headers/include/cub/block/block_load.cuh +1269 -0
  35. cuda/cccl/headers/include/cub/block/block_load_to_shared.cuh +437 -0
  36. cuda/cccl/headers/include/cub/block/block_merge_sort.cuh +771 -0
  37. cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +1215 -0
  38. cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +2129 -0
  39. cuda/cccl/headers/include/cub/block/block_raking_layout.cuh +124 -0
  40. cuda/cccl/headers/include/cub/block/block_reduce.cuh +661 -0
  41. cuda/cccl/headers/include/cub/block/block_run_length_decode.cuh +406 -0
  42. cuda/cccl/headers/include/cub/block/block_scan.cuh +2168 -0
  43. cuda/cccl/headers/include/cub/block/block_shuffle.cuh +319 -0
  44. cuda/cccl/headers/include/cub/block/block_store.cuh +1238 -0
  45. cuda/cccl/headers/include/cub/block/radix_rank_sort_operations.cuh +597 -0
  46. cuda/cccl/headers/include/cub/block/specializations/block_histogram_atomic.cuh +62 -0
  47. cuda/cccl/headers/include/cub/block/specializations/block_histogram_sort.cuh +209 -0
  48. cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking.cuh +230 -0
  49. cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking_commutative_only.cuh +207 -0
  50. cuda/cccl/headers/include/cub/block/specializations/block_reduce_warp_reductions.cuh +257 -0
  51. cuda/cccl/headers/include/cub/block/specializations/block_scan_raking.cuh +766 -0
  52. cuda/cccl/headers/include/cub/block/specializations/block_scan_warp_scans.cuh +514 -0
  53. cuda/cccl/headers/include/cub/config.cuh +29 -0
  54. cuda/cccl/headers/include/cub/cub.cuh +96 -0
  55. cuda/cccl/headers/include/cub/detail/array_utils.cuh +54 -0
  56. cuda/cccl/headers/include/cub/detail/choose_offset.cuh +135 -0
  57. cuda/cccl/headers/include/cub/detail/detect_cuda_runtime.cuh +50 -0
  58. cuda/cccl/headers/include/cub/detail/device_double_buffer.cuh +94 -0
  59. cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +60 -0
  60. cuda/cccl/headers/include/cub/detail/env_dispatch.cuh +87 -0
  61. cuda/cccl/headers/include/cub/detail/fast_modulo_division.cuh +227 -0
  62. cuda/cccl/headers/include/cub/detail/integer_utils.cuh +87 -0
  63. cuda/cccl/headers/include/cub/detail/launcher/cuda_driver.cuh +149 -0
  64. cuda/cccl/headers/include/cub/detail/launcher/cuda_runtime.cuh +103 -0
  65. cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +112 -0
  66. cuda/cccl/headers/include/cub/detail/ptx-json/array.cuh +41 -0
  67. cuda/cccl/headers/include/cub/detail/ptx-json/json.cuh +39 -0
  68. cuda/cccl/headers/include/cub/detail/ptx-json/object.cuh +71 -0
  69. cuda/cccl/headers/include/cub/detail/ptx-json/string.cuh +79 -0
  70. cuda/cccl/headers/include/cub/detail/ptx-json/value.cuh +95 -0
  71. cuda/cccl/headers/include/cub/detail/ptx-json-parser.cuh +39 -0
  72. cuda/cccl/headers/include/cub/detail/rfa.cuh +706 -0
  73. cuda/cccl/headers/include/cub/detail/strong_load.cuh +163 -0
  74. cuda/cccl/headers/include/cub/detail/strong_store.cuh +194 -0
  75. cuda/cccl/headers/include/cub/detail/temporary_storage.cuh +377 -0
  76. cuda/cccl/headers/include/cub/detail/type_traits.cuh +185 -0
  77. cuda/cccl/headers/include/cub/detail/uninitialized_copy.cuh +48 -0
  78. cuda/cccl/headers/include/cub/detail/unsafe_bitcast.cuh +33 -0
  79. cuda/cccl/headers/include/cub/device/device_adjacent_difference.cuh +572 -0
  80. cuda/cccl/headers/include/cub/device/device_copy.cuh +276 -0
  81. cuda/cccl/headers/include/cub/device/device_for.cuh +1061 -0
  82. cuda/cccl/headers/include/cub/device/device_histogram.cuh +1485 -0
  83. cuda/cccl/headers/include/cub/device/device_memcpy.cuh +171 -0
  84. cuda/cccl/headers/include/cub/device/device_merge.cuh +203 -0
  85. cuda/cccl/headers/include/cub/device/device_merge_sort.cuh +955 -0
  86. cuda/cccl/headers/include/cub/device/device_partition.cuh +644 -0
  87. cuda/cccl/headers/include/cub/device/device_radix_sort.cuh +3413 -0
  88. cuda/cccl/headers/include/cub/device/device_reduce.cuh +2303 -0
  89. cuda/cccl/headers/include/cub/device/device_run_length_encode.cuh +346 -0
  90. cuda/cccl/headers/include/cub/device/device_scan.cuh +2152 -0
  91. cuda/cccl/headers/include/cub/device/device_segmented_radix_sort.cuh +1472 -0
  92. cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +1635 -0
  93. cuda/cccl/headers/include/cub/device/device_segmented_scan.cuh +1398 -0
  94. cuda/cccl/headers/include/cub/device/device_segmented_sort.cuh +2787 -0
  95. cuda/cccl/headers/include/cub/device/device_select.cuh +1204 -0
  96. cuda/cccl/headers/include/cub/device/device_topk.cuh +521 -0
  97. cuda/cccl/headers/include/cub/device/device_transform.cuh +666 -0
  98. cuda/cccl/headers/include/cub/device/dispatch/dispatch_adjacent_difference.cuh +289 -0
  99. cuda/cccl/headers/include/cub/device/dispatch/dispatch_batch_memcpy.cuh +694 -0
  100. cuda/cccl/headers/include/cub/device/dispatch/dispatch_common.cuh +50 -0
  101. cuda/cccl/headers/include/cub/device/dispatch/dispatch_copy_mdspan.cuh +77 -0
  102. cuda/cccl/headers/include/cub/device/dispatch/dispatch_fixed_size_segmented_reduce.cuh +349 -0
  103. cuda/cccl/headers/include/cub/device/dispatch/dispatch_for.cuh +160 -0
  104. cuda/cccl/headers/include/cub/device/dispatch/dispatch_histogram.cuh +1849 -0
  105. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +317 -0
  106. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge_sort.cuh +429 -0
  107. cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +1066 -0
  108. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +830 -0
  109. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_by_key.cuh +629 -0
  110. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +479 -0
  111. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +256 -0
  112. cuda/cccl/headers/include/cub/device/dispatch/dispatch_rle.cuh +603 -0
  113. cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan.cuh +447 -0
  114. cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan_by_key.cuh +545 -0
  115. cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_radix_sort.cuh +638 -0
  116. cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_reduce.cuh +410 -0
  117. cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_scan.cuh +278 -0
  118. cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +899 -0
  119. cuda/cccl/headers/include/cub/device/dispatch/dispatch_select_if.cuh +831 -0
  120. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +321 -0
  121. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +454 -0
  122. cuda/cccl/headers/include/cub/device/dispatch/dispatch_three_way_partition.cuh +364 -0
  123. cuda/cccl/headers/include/cub/device/dispatch/dispatch_topk.cuh +626 -0
  124. cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +527 -0
  125. cuda/cccl/headers/include/cub/device/dispatch/dispatch_unique_by_key.cuh +472 -0
  126. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_for_each.cuh +259 -0
  127. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_histogram.cuh +669 -0
  128. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_merge_sort.cuh +332 -0
  129. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_radix_sort.cuh +553 -0
  130. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_reduce.cuh +584 -0
  131. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_scan.cuh +178 -0
  132. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_segmented_radix_sort.cuh +262 -0
  133. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_segmented_reduce.cuh +295 -0
  134. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_segmented_scan.cuh +77 -0
  135. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_segmented_sort.cuh +521 -0
  136. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_three_way_partition.cuh +200 -0
  137. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_transform.cuh +1049 -0
  138. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_unique_by_key.cuh +175 -0
  139. cuda/cccl/headers/include/cub/device/dispatch/tuning/common.cuh +97 -0
  140. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +43 -0
  141. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +94 -0
  142. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +34 -0
  143. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +268 -0
  144. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +108 -0
  145. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +100 -0
  146. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +1045 -0
  147. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce.cuh +681 -0
  148. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +918 -0
  149. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +647 -0
  150. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +571 -0
  151. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +986 -0
  152. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_scan.cuh +108 -0
  153. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +373 -0
  154. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_select_if.cuh +1563 -0
  155. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +415 -0
  156. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_topk.cuh +84 -0
  157. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +476 -0
  158. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +858 -0
  159. cuda/cccl/headers/include/cub/grid/grid_even_share.cuh +203 -0
  160. cuda/cccl/headers/include/cub/grid/grid_mapping.cuh +82 -0
  161. cuda/cccl/headers/include/cub/grid/grid_queue.cuh +175 -0
  162. cuda/cccl/headers/include/cub/iterator/arg_index_input_iterator.cuh +230 -0
  163. cuda/cccl/headers/include/cub/iterator/cache_modified_input_iterator.cuh +235 -0
  164. cuda/cccl/headers/include/cub/iterator/cache_modified_output_iterator.cuh +226 -0
  165. cuda/cccl/headers/include/cub/iterator/tex_obj_input_iterator.cuh +293 -0
  166. cuda/cccl/headers/include/cub/thread/thread_load.cuh +353 -0
  167. cuda/cccl/headers/include/cub/thread/thread_operators.cuh +664 -0
  168. cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +525 -0
  169. cuda/cccl/headers/include/cub/thread/thread_scan.cuh +472 -0
  170. cuda/cccl/headers/include/cub/thread/thread_search.cuh +214 -0
  171. cuda/cccl/headers/include/cub/thread/thread_simd.cuh +456 -0
  172. cuda/cccl/headers/include/cub/thread/thread_sort.cuh +78 -0
  173. cuda/cccl/headers/include/cub/thread/thread_store.cuh +341 -0
  174. cuda/cccl/headers/include/cub/util_allocator.cuh +897 -0
  175. cuda/cccl/headers/include/cub/util_arch.cuh +176 -0
  176. cuda/cccl/headers/include/cub/util_cpp_dialect.cuh +71 -0
  177. cuda/cccl/headers/include/cub/util_debug.cuh +183 -0
  178. cuda/cccl/headers/include/cub/util_device.cuh +838 -0
  179. cuda/cccl/headers/include/cub/util_macro.cuh +73 -0
  180. cuda/cccl/headers/include/cub/util_math.cuh +92 -0
  181. cuda/cccl/headers/include/cub/util_namespace.cuh +152 -0
  182. cuda/cccl/headers/include/cub/util_policy_wrapper_t.cuh +31 -0
  183. cuda/cccl/headers/include/cub/util_ptx.cuh +483 -0
  184. cuda/cccl/headers/include/cub/util_temporary_storage.cuh +93 -0
  185. cuda/cccl/headers/include/cub/util_type.cuh +1084 -0
  186. cuda/cccl/headers/include/cub/util_vsmem.cuh +227 -0
  187. cuda/cccl/headers/include/cub/version.cuh +65 -0
  188. cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_shfl.cuh +304 -0
  189. cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_smem.cuh +152 -0
  190. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +567 -0
  191. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +378 -0
  192. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +922 -0
  193. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_smem.cuh +691 -0
  194. cuda/cccl/headers/include/cub/warp/warp_exchange.cuh +381 -0
  195. cuda/cccl/headers/include/cub/warp/warp_load.cuh +591 -0
  196. cuda/cccl/headers/include/cub/warp/warp_merge_sort.cuh +145 -0
  197. cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +810 -0
  198. cuda/cccl/headers/include/cub/warp/warp_scan.cuh +1863 -0
  199. cuda/cccl/headers/include/cub/warp/warp_store.cuh +498 -0
  200. cuda/cccl/headers/include/cub/warp/warp_utils.cuh +59 -0
  201. cuda/cccl/headers/include/cuda/__algorithm/common.h +68 -0
  202. cuda/cccl/headers/include/cuda/__algorithm/copy.h +199 -0
  203. cuda/cccl/headers/include/cuda/__algorithm/fill.h +110 -0
  204. cuda/cccl/headers/include/cuda/__annotated_ptr/access_property.h +165 -0
  205. cuda/cccl/headers/include/cuda/__annotated_ptr/access_property_encoding.h +171 -0
  206. cuda/cccl/headers/include/cuda/__annotated_ptr/annotated_ptr.h +216 -0
  207. cuda/cccl/headers/include/cuda/__annotated_ptr/annotated_ptr_base.h +100 -0
  208. cuda/cccl/headers/include/cuda/__annotated_ptr/apply_access_property.h +83 -0
  209. cuda/cccl/headers/include/cuda/__annotated_ptr/associate_access_property.h +128 -0
  210. cuda/cccl/headers/include/cuda/__annotated_ptr/createpolicy.h +210 -0
  211. cuda/cccl/headers/include/cuda/__atomic/atomic.h +145 -0
  212. cuda/cccl/headers/include/cuda/__barrier/async_contract_fulfillment.h +39 -0
  213. cuda/cccl/headers/include/cuda/__barrier/barrier.h +65 -0
  214. cuda/cccl/headers/include/cuda/__barrier/barrier_arrive_tx.h +101 -0
  215. cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +528 -0
  216. cuda/cccl/headers/include/cuda/__barrier/barrier_expect_tx.h +73 -0
  217. cuda/cccl/headers/include/cuda/__barrier/barrier_thread_scope.h +60 -0
  218. cuda/cccl/headers/include/cuda/__bit/bit_reverse.h +198 -0
  219. cuda/cccl/headers/include/cuda/__bit/bitfield.h +121 -0
  220. cuda/cccl/headers/include/cuda/__bit/bitmask.h +89 -0
  221. cuda/cccl/headers/include/cuda/__cccl_config +38 -0
  222. cuda/cccl/headers/include/cuda/__cmath/ceil_div.h +123 -0
  223. cuda/cccl/headers/include/cuda/__cmath/fast_modulo_division.h +178 -0
  224. cuda/cccl/headers/include/cuda/__cmath/ilog.h +194 -0
  225. cuda/cccl/headers/include/cuda/__cmath/ipow.h +111 -0
  226. cuda/cccl/headers/include/cuda/__cmath/isqrt.h +80 -0
  227. cuda/cccl/headers/include/cuda/__cmath/mul_hi.h +145 -0
  228. cuda/cccl/headers/include/cuda/__cmath/neg.h +47 -0
  229. cuda/cccl/headers/include/cuda/__cmath/pow2.h +74 -0
  230. cuda/cccl/headers/include/cuda/__cmath/round_down.h +102 -0
  231. cuda/cccl/headers/include/cuda/__cmath/round_up.h +104 -0
  232. cuda/cccl/headers/include/cuda/__cmath/sincos.h +134 -0
  233. cuda/cccl/headers/include/cuda/__cmath/uabs.h +57 -0
  234. cuda/cccl/headers/include/cuda/__complex/complex.h +238 -0
  235. cuda/cccl/headers/include/cuda/__complex/get_real_imag.h +89 -0
  236. cuda/cccl/headers/include/cuda/__complex/traits.h +64 -0
  237. cuda/cccl/headers/include/cuda/__complex_ +28 -0
  238. cuda/cccl/headers/include/cuda/__container/buffer.h +891 -0
  239. cuda/cccl/headers/include/cuda/__container/heterogeneous_iterator.h +436 -0
  240. cuda/cccl/headers/include/cuda/__container/uninitialized_async_buffer.h +416 -0
  241. cuda/cccl/headers/include/cuda/__device/all_devices.h +140 -0
  242. cuda/cccl/headers/include/cuda/__device/arch_id.h +194 -0
  243. cuda/cccl/headers/include/cuda/__device/arch_traits.h +553 -0
  244. cuda/cccl/headers/include/cuda/__device/attributes.h +772 -0
  245. cuda/cccl/headers/include/cuda/__device/compute_capability.h +172 -0
  246. cuda/cccl/headers/include/cuda/__device/device_ref.h +168 -0
  247. cuda/cccl/headers/include/cuda/__device/physical_device.h +178 -0
  248. cuda/cccl/headers/include/cuda/__driver/driver_api.h +1041 -0
  249. cuda/cccl/headers/include/cuda/__event/event.h +171 -0
  250. cuda/cccl/headers/include/cuda/__event/event_ref.h +157 -0
  251. cuda/cccl/headers/include/cuda/__event/timed_event.h +120 -0
  252. cuda/cccl/headers/include/cuda/__execution/determinism.h +89 -0
  253. cuda/cccl/headers/include/cuda/__execution/output_ordering.h +87 -0
  254. cuda/cccl/headers/include/cuda/__execution/policy.h +53 -0
  255. cuda/cccl/headers/include/cuda/__execution/require.h +75 -0
  256. cuda/cccl/headers/include/cuda/__execution/tune.h +70 -0
  257. cuda/cccl/headers/include/cuda/__functional/address_stability.h +131 -0
  258. cuda/cccl/headers/include/cuda/__functional/for_each_canceled.h +321 -0
  259. cuda/cccl/headers/include/cuda/__functional/maximum.h +77 -0
  260. cuda/cccl/headers/include/cuda/__functional/minimum.h +77 -0
  261. cuda/cccl/headers/include/cuda/__functional/minimum_maximum_common.h +52 -0
  262. cuda/cccl/headers/include/cuda/__functional/proclaim_return_type.h +106 -0
  263. cuda/cccl/headers/include/cuda/__fwd/barrier.h +38 -0
  264. cuda/cccl/headers/include/cuda/__fwd/complex.h +48 -0
  265. cuda/cccl/headers/include/cuda/__fwd/devices.h +44 -0
  266. cuda/cccl/headers/include/cuda/__fwd/execution_policy.h +47 -0
  267. cuda/cccl/headers/include/cuda/__fwd/get_stream.h +38 -0
  268. cuda/cccl/headers/include/cuda/__fwd/pipeline.h +37 -0
  269. cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +58 -0
  270. cuda/cccl/headers/include/cuda/__hierarchy/dimensions.h +162 -0
  271. cuda/cccl/headers/include/cuda/__hierarchy/hierarchy_dimensions.h +986 -0
  272. cuda/cccl/headers/include/cuda/__hierarchy/hierarchy_levels.h +494 -0
  273. cuda/cccl/headers/include/cuda/__hierarchy/level_dimensions.h +225 -0
  274. cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +315 -0
  275. cuda/cccl/headers/include/cuda/__iterator/counting_iterator.h +490 -0
  276. cuda/cccl/headers/include/cuda/__iterator/discard_iterator.h +324 -0
  277. cuda/cccl/headers/include/cuda/__iterator/permutation_iterator.h +456 -0
  278. cuda/cccl/headers/include/cuda/__iterator/shuffle_iterator.h +334 -0
  279. cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +418 -0
  280. cuda/cccl/headers/include/cuda/__iterator/tabulate_output_iterator.h +367 -0
  281. cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +528 -0
  282. cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +527 -0
  283. cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +486 -0
  284. cuda/cccl/headers/include/cuda/__iterator/zip_common.h +147 -0
  285. cuda/cccl/headers/include/cuda/__iterator/zip_function.h +114 -0
  286. cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +555 -0
  287. cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +589 -0
  288. cuda/cccl/headers/include/cuda/__latch/latch.h +44 -0
  289. cuda/cccl/headers/include/cuda/__launch/configuration.h +754 -0
  290. cuda/cccl/headers/include/cuda/__launch/host_launch.h +115 -0
  291. cuda/cccl/headers/include/cuda/__launch/launch.h +334 -0
  292. cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +531 -0
  293. cuda/cccl/headers/include/cuda/__mdspan/host_device_mdspan.h +239 -0
  294. cuda/cccl/headers/include/cuda/__mdspan/restrict_accessor.h +152 -0
  295. cuda/cccl/headers/include/cuda/__mdspan/restrict_mdspan.h +118 -0
  296. cuda/cccl/headers/include/cuda/__mdspan/shared_memory_accessor.h +208 -0
  297. cuda/cccl/headers/include/cuda/__mdspan/shared_memory_mdspan.h +129 -0
  298. cuda/cccl/headers/include/cuda/__memcpy_async/check_preconditions.h +79 -0
  299. cuda/cccl/headers/include/cuda/__memcpy_async/completion_mechanism.h +47 -0
  300. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_bulk_shared_global.h +77 -0
  301. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_fallback.h +72 -0
  302. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_shared_global.h +148 -0
  303. cuda/cccl/headers/include/cuda/__memcpy_async/dispatch_memcpy_async.h +165 -0
  304. cuda/cccl/headers/include/cuda/__memcpy_async/elect_one.h +52 -0
  305. cuda/cccl/headers/include/cuda/__memcpy_async/is_local_smem_barrier.h +53 -0
  306. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async.h +179 -0
  307. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_barrier.h +99 -0
  308. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_tx.h +102 -0
  309. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_completion.h +164 -0
  310. cuda/cccl/headers/include/cuda/__memcpy_async/try_get_barrier_handle.h +57 -0
  311. cuda/cccl/headers/include/cuda/__memory/address_space.h +256 -0
  312. cuda/cccl/headers/include/cuda/__memory/align_down.h +77 -0
  313. cuda/cccl/headers/include/cuda/__memory/align_up.h +77 -0
  314. cuda/cccl/headers/include/cuda/__memory/aligned_size.h +61 -0
  315. cuda/cccl/headers/include/cuda/__memory/check_address.h +111 -0
  316. cuda/cccl/headers/include/cuda/__memory/discard_memory.h +64 -0
  317. cuda/cccl/headers/include/cuda/__memory/get_device_address.h +58 -0
  318. cuda/cccl/headers/include/cuda/__memory/is_aligned.h +60 -0
  319. cuda/cccl/headers/include/cuda/__memory/is_pointer_accessible.h +278 -0
  320. cuda/cccl/headers/include/cuda/__memory/ptr_in_range.h +92 -0
  321. cuda/cccl/headers/include/cuda/__memory/ptr_rebind.h +75 -0
  322. cuda/cccl/headers/include/cuda/__memory/ranges_overlap.h +125 -0
  323. cuda/cccl/headers/include/cuda/__memory_pool/device_memory_pool.h +166 -0
  324. cuda/cccl/headers/include/cuda/__memory_pool/managed_memory_pool.h +161 -0
  325. cuda/cccl/headers/include/cuda/__memory_pool/memory_pool_base.h +644 -0
  326. cuda/cccl/headers/include/cuda/__memory_pool/pinned_memory_pool.h +218 -0
  327. cuda/cccl/headers/include/cuda/__memory_resource/any_resource.h +882 -0
  328. cuda/cccl/headers/include/cuda/__memory_resource/get_memory_resource.h +82 -0
  329. cuda/cccl/headers/include/cuda/__memory_resource/get_property.h +153 -0
  330. cuda/cccl/headers/include/cuda/__memory_resource/legacy_managed_memory_resource.h +148 -0
  331. cuda/cccl/headers/include/cuda/__memory_resource/legacy_pinned_memory_resource.h +141 -0
  332. cuda/cccl/headers/include/cuda/__memory_resource/properties.h +130 -0
  333. cuda/cccl/headers/include/cuda/__memory_resource/resource.h +135 -0
  334. cuda/cccl/headers/include/cuda/__memory_resource/shared_resource.h +261 -0
  335. cuda/cccl/headers/include/cuda/__memory_resource/synchronous_resource_adapter.h +136 -0
  336. cuda/cccl/headers/include/cuda/__numeric/add_overflow.h +331 -0
  337. cuda/cccl/headers/include/cuda/__numeric/div_overflow.h +150 -0
  338. cuda/cccl/headers/include/cuda/__numeric/narrow.h +108 -0
  339. cuda/cccl/headers/include/cuda/__numeric/overflow_cast.h +59 -0
  340. cuda/cccl/headers/include/cuda/__numeric/overflow_result.h +43 -0
  341. cuda/cccl/headers/include/cuda/__numeric/sub_overflow.h +359 -0
  342. cuda/cccl/headers/include/cuda/__nvtx/nvtx.h +120 -0
  343. cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +2977 -0
  344. cuda/cccl/headers/include/cuda/__ptx/instructions/barrier_cluster.h +43 -0
  345. cuda/cccl/headers/include/cuda/__ptx/instructions/bfind.h +41 -0
  346. cuda/cccl/headers/include/cuda/__ptx/instructions/bmsk.h +41 -0
  347. cuda/cccl/headers/include/cuda/__ptx/instructions/clusterlaunchcontrol.h +41 -0
  348. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk.h +44 -0
  349. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_commit_group.h +43 -0
  350. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h +45 -0
  351. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_wait_group.h +43 -0
  352. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_mbarrier_arrive.h +42 -0
  353. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_reduce_async_bulk.h +60 -0
  354. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_reduce_async_bulk_tensor.h +43 -0
  355. cuda/cccl/headers/include/cuda/__ptx/instructions/elect_sync.h +41 -0
  356. cuda/cccl/headers/include/cuda/__ptx/instructions/exit.h +41 -0
  357. cuda/cccl/headers/include/cuda/__ptx/instructions/fence.h +49 -0
  358. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/barrier_cluster.h +115 -0
  359. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/bfind.h +190 -0
  360. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/bmsk.h +54 -0
  361. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/clusterlaunchcontrol.h +240 -0
  362. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk.h +245 -0
  363. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.h +25 -0
  364. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.h +52 -0
  365. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.h +977 -0
  366. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_gather_scatter.h +302 -0
  367. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.h +631 -0
  368. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.h +46 -0
  369. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive.h +26 -0
  370. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive_noinc.h +26 -0
  371. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.h +1470 -0
  372. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.h +132 -0
  373. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.h +132 -0
  374. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.h +601 -0
  375. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/elect_sync.h +36 -0
  376. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/exit.h +25 -0
  377. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence.h +208 -0
  378. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.h +31 -0
  379. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_alias.h +25 -0
  380. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_async.h +58 -0
  381. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_async_generic_sync_restrict.h +64 -0
  382. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.h +102 -0
  383. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_sync_restrict.h +64 -0
  384. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/get_sreg.h +949 -0
  385. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/getctarank.h +32 -0
  386. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/ld.h +5542 -0
  387. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive.h +399 -0
  388. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.h +184 -0
  389. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.h +34 -0
  390. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_expect_tx.h +102 -0
  391. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_init.h +27 -0
  392. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_inval.h +26 -0
  393. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.h +143 -0
  394. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.h +144 -0
  395. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.h +286 -0
  396. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.h +290 -0
  397. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_ld_reduce.h +2202 -0
  398. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_red.h +1362 -0
  399. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_st.h +236 -0
  400. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/prmt.h +230 -0
  401. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/red_async.h +460 -0
  402. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/setmaxnreg.h +58 -0
  403. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/shl.h +96 -0
  404. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/shr.h +168 -0
  405. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st.h +1490 -0
  406. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st_async.h +123 -0
  407. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st_bulk.h +31 -0
  408. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_alloc.h +120 -0
  409. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_commit.h +91 -0
  410. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_cp.h +693 -0
  411. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_fence.h +50 -0
  412. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_ld.h +11437 -0
  413. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_mma.h +6513 -0
  414. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_mma_ws.h +6726 -0
  415. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_shift.h +40 -0
  416. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_st.h +4767 -0
  417. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_wait.h +48 -0
  418. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.h +71 -0
  419. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tensormap_replace.h +886 -0
  420. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/trap.h +25 -0
  421. cuda/cccl/headers/include/cuda/__ptx/instructions/get_sreg.h +43 -0
  422. cuda/cccl/headers/include/cuda/__ptx/instructions/getctarank.h +43 -0
  423. cuda/cccl/headers/include/cuda/__ptx/instructions/ld.h +41 -0
  424. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_arrive.h +45 -0
  425. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_expect_tx.h +41 -0
  426. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_init.h +43 -0
  427. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_inval.h +41 -0
  428. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_wait.h +46 -0
  429. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_ld_reduce.h +41 -0
  430. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_red.h +41 -0
  431. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_st.h +41 -0
  432. cuda/cccl/headers/include/cuda/__ptx/instructions/prmt.h +41 -0
  433. cuda/cccl/headers/include/cuda/__ptx/instructions/red_async.h +43 -0
  434. cuda/cccl/headers/include/cuda/__ptx/instructions/setmaxnreg.h +41 -0
  435. cuda/cccl/headers/include/cuda/__ptx/instructions/shfl_sync.h +244 -0
  436. cuda/cccl/headers/include/cuda/__ptx/instructions/shl.h +41 -0
  437. cuda/cccl/headers/include/cuda/__ptx/instructions/shr.h +41 -0
  438. cuda/cccl/headers/include/cuda/__ptx/instructions/st.h +41 -0
  439. cuda/cccl/headers/include/cuda/__ptx/instructions/st_async.h +43 -0
  440. cuda/cccl/headers/include/cuda/__ptx/instructions/st_bulk.h +41 -0
  441. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_alloc.h +41 -0
  442. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_commit.h +41 -0
  443. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_cp.h +41 -0
  444. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_fence.h +41 -0
  445. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_ld.h +41 -0
  446. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_mma.h +41 -0
  447. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_mma_ws.h +41 -0
  448. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_shift.h +41 -0
  449. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_st.h +41 -0
  450. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_wait.h +41 -0
  451. cuda/cccl/headers/include/cuda/__ptx/instructions/tensormap_cp_fenceproxy.h +43 -0
  452. cuda/cccl/headers/include/cuda/__ptx/instructions/tensormap_replace.h +43 -0
  453. cuda/cccl/headers/include/cuda/__ptx/instructions/trap.h +41 -0
  454. cuda/cccl/headers/include/cuda/__ptx/pragmas/enable_smem_spilling.h +47 -0
  455. cuda/cccl/headers/include/cuda/__ptx/ptx_dot_variants.h +230 -0
  456. cuda/cccl/headers/include/cuda/__ptx/ptx_helper_functions.h +178 -0
  457. cuda/cccl/headers/include/cuda/__random/feistel_bijection.h +105 -0
  458. cuda/cccl/headers/include/cuda/__random/pcg_engine.h +398 -0
  459. cuda/cccl/headers/include/cuda/__random/random_bijection.h +88 -0
  460. cuda/cccl/headers/include/cuda/__runtime/api_wrapper.h +62 -0
  461. cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +99 -0
  462. cuda/cccl/headers/include/cuda/__runtime/types.h +41 -0
  463. cuda/cccl/headers/include/cuda/__semaphore/counting_semaphore.h +53 -0
  464. cuda/cccl/headers/include/cuda/__stream/get_stream.h +109 -0
  465. cuda/cccl/headers/include/cuda/__stream/internal_streams.h +49 -0
  466. cuda/cccl/headers/include/cuda/__stream/invalid_stream.h +47 -0
  467. cuda/cccl/headers/include/cuda/__stream/launch_transform.h +193 -0
  468. cuda/cccl/headers/include/cuda/__stream/stream.h +145 -0
  469. cuda/cccl/headers/include/cuda/__stream/stream_ref.h +356 -0
  470. cuda/cccl/headers/include/cuda/__tma/make_tma_descriptor.h +657 -0
  471. cuda/cccl/headers/include/cuda/__type_traits/is_floating_point.h +47 -0
  472. cuda/cccl/headers/include/cuda/__type_traits/is_instantiable_with.h +47 -0
  473. cuda/cccl/headers/include/cuda/__type_traits/is_specialization_of.h +37 -0
  474. cuda/cccl/headers/include/cuda/__type_traits/vector_type.h +355 -0
  475. cuda/cccl/headers/include/cuda/__utility/__basic_any/access.h +88 -0
  476. cuda/cccl/headers/include/cuda/__utility/__basic_any/any_cast.h +83 -0
  477. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_base.h +148 -0
  478. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_from.h +96 -0
  479. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_fwd.h +128 -0
  480. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_ptr.h +304 -0
  481. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_ref.h +337 -0
  482. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_value.h +611 -0
  483. cuda/cccl/headers/include/cuda/__utility/__basic_any/conversions.h +170 -0
  484. cuda/cccl/headers/include/cuda/__utility/__basic_any/dynamic_any_cast.h +147 -0
  485. cuda/cccl/headers/include/cuda/__utility/__basic_any/interfaces.h +359 -0
  486. cuda/cccl/headers/include/cuda/__utility/__basic_any/iset.h +142 -0
  487. cuda/cccl/headers/include/cuda/__utility/__basic_any/overrides.h +64 -0
  488. cuda/cccl/headers/include/cuda/__utility/__basic_any/rtti.h +256 -0
  489. cuda/cccl/headers/include/cuda/__utility/__basic_any/semiregular.h +323 -0
  490. cuda/cccl/headers/include/cuda/__utility/__basic_any/storage.h +79 -0
  491. cuda/cccl/headers/include/cuda/__utility/__basic_any/tagged_ptr.h +58 -0
  492. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtcall.h +162 -0
  493. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_functions.h +183 -0
  494. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_ptrs.h +80 -0
  495. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_tables.h +155 -0
  496. cuda/cccl/headers/include/cuda/__utility/basic_any.h +507 -0
  497. cuda/cccl/headers/include/cuda/__utility/immovable.h +50 -0
  498. cuda/cccl/headers/include/cuda/__utility/in_range.h +65 -0
  499. cuda/cccl/headers/include/cuda/__utility/inherit.h +36 -0
  500. cuda/cccl/headers/include/cuda/__utility/no_init.h +29 -0
  501. cuda/cccl/headers/include/cuda/__utility/static_for.h +79 -0
  502. cuda/cccl/headers/include/cuda/__warp/lane_mask.h +326 -0
  503. cuda/cccl/headers/include/cuda/__warp/warp_match_all.h +65 -0
  504. cuda/cccl/headers/include/cuda/__warp/warp_shuffle.h +251 -0
  505. cuda/cccl/headers/include/cuda/access_property +26 -0
  506. cuda/cccl/headers/include/cuda/algorithm +28 -0
  507. cuda/cccl/headers/include/cuda/annotated_ptr +29 -0
  508. cuda/cccl/headers/include/cuda/atomic +27 -0
  509. cuda/cccl/headers/include/cuda/barrier +293 -0
  510. cuda/cccl/headers/include/cuda/bit +29 -0
  511. cuda/cccl/headers/include/cuda/buffer +27 -0
  512. cuda/cccl/headers/include/cuda/cmath +38 -0
  513. cuda/cccl/headers/include/cuda/devices +33 -0
  514. cuda/cccl/headers/include/cuda/discard_memory +32 -0
  515. cuda/cccl/headers/include/cuda/functional +32 -0
  516. cuda/cccl/headers/include/cuda/hierarchy +28 -0
  517. cuda/cccl/headers/include/cuda/iterator +39 -0
  518. cuda/cccl/headers/include/cuda/latch +27 -0
  519. cuda/cccl/headers/include/cuda/launch +28 -0
  520. cuda/cccl/headers/include/cuda/mdspan +29 -0
  521. cuda/cccl/headers/include/cuda/memory +37 -0
  522. cuda/cccl/headers/include/cuda/memory_pool +27 -0
  523. cuda/cccl/headers/include/cuda/memory_resource +41 -0
  524. cuda/cccl/headers/include/cuda/numeric +31 -0
  525. cuda/cccl/headers/include/cuda/pipeline +580 -0
  526. cuda/cccl/headers/include/cuda/ptx +131 -0
  527. cuda/cccl/headers/include/cuda/semaphore +31 -0
  528. cuda/cccl/headers/include/cuda/std/__algorithm/adjacent_find.h +59 -0
  529. cuda/cccl/headers/include/cuda/std/__algorithm/all_of.h +45 -0
  530. cuda/cccl/headers/include/cuda/std/__algorithm/any_of.h +45 -0
  531. cuda/cccl/headers/include/cuda/std/__algorithm/binary_search.h +53 -0
  532. cuda/cccl/headers/include/cuda/std/__algorithm/clamp.h +48 -0
  533. cuda/cccl/headers/include/cuda/std/__algorithm/comp.h +58 -0
  534. cuda/cccl/headers/include/cuda/std/__algorithm/comp_ref_type.h +85 -0
  535. cuda/cccl/headers/include/cuda/std/__algorithm/copy.h +143 -0
  536. cuda/cccl/headers/include/cuda/std/__algorithm/copy_backward.h +80 -0
  537. cuda/cccl/headers/include/cuda/std/__algorithm/copy_if.h +47 -0
  538. cuda/cccl/headers/include/cuda/std/__algorithm/copy_n.h +73 -0
  539. cuda/cccl/headers/include/cuda/std/__algorithm/count.h +49 -0
  540. cuda/cccl/headers/include/cuda/std/__algorithm/count_if.h +49 -0
  541. cuda/cccl/headers/include/cuda/std/__algorithm/equal.h +128 -0
  542. cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +101 -0
  543. cuda/cccl/headers/include/cuda/std/__algorithm/fill.h +58 -0
  544. cuda/cccl/headers/include/cuda/std/__algorithm/fill_n.h +51 -0
  545. cuda/cccl/headers/include/cuda/std/__algorithm/find.h +62 -0
  546. cuda/cccl/headers/include/cuda/std/__algorithm/find_end.h +225 -0
  547. cuda/cccl/headers/include/cuda/std/__algorithm/find_first_of.h +73 -0
  548. cuda/cccl/headers/include/cuda/std/__algorithm/find_if.h +46 -0
  549. cuda/cccl/headers/include/cuda/std/__algorithm/find_if_not.h +46 -0
  550. cuda/cccl/headers/include/cuda/std/__algorithm/for_each.h +42 -0
  551. cuda/cccl/headers/include/cuda/std/__algorithm/for_each_n.h +48 -0
  552. cuda/cccl/headers/include/cuda/std/__algorithm/generate.h +41 -0
  553. cuda/cccl/headers/include/cuda/std/__algorithm/generate_n.h +46 -0
  554. cuda/cccl/headers/include/cuda/std/__algorithm/half_positive.h +49 -0
  555. cuda/cccl/headers/include/cuda/std/__algorithm/in_fun_result.h +55 -0
  556. cuda/cccl/headers/include/cuda/std/__algorithm/includes.h +90 -0
  557. cuda/cccl/headers/include/cuda/std/__algorithm/inplace_merge.h +293 -0
  558. cuda/cccl/headers/include/cuda/std/__algorithm/is_heap.h +50 -0
  559. cuda/cccl/headers/include/cuda/std/__algorithm/is_heap_until.h +83 -0
  560. cuda/cccl/headers/include/cuda/std/__algorithm/is_partitioned.h +57 -0
  561. cuda/cccl/headers/include/cuda/std/__algorithm/is_permutation.h +252 -0
  562. cuda/cccl/headers/include/cuda/std/__algorithm/is_sorted.h +49 -0
  563. cuda/cccl/headers/include/cuda/std/__algorithm/is_sorted_until.h +68 -0
  564. cuda/cccl/headers/include/cuda/std/__algorithm/iter_swap.h +82 -0
  565. cuda/cccl/headers/include/cuda/std/__algorithm/iterator_operations.h +185 -0
  566. cuda/cccl/headers/include/cuda/std/__algorithm/lexicographical_compare.h +68 -0
  567. cuda/cccl/headers/include/cuda/std/__algorithm/lower_bound.h +82 -0
  568. cuda/cccl/headers/include/cuda/std/__algorithm/make_heap.h +70 -0
  569. cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +88 -0
  570. cuda/cccl/headers/include/cuda/std/__algorithm/max.h +62 -0
  571. cuda/cccl/headers/include/cuda/std/__algorithm/max_element.h +67 -0
  572. cuda/cccl/headers/include/cuda/std/__algorithm/merge.h +89 -0
  573. cuda/cccl/headers/include/cuda/std/__algorithm/min.h +62 -0
  574. cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +87 -0
  575. cuda/cccl/headers/include/cuda/std/__algorithm/minmax.h +66 -0
  576. cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +139 -0
  577. cuda/cccl/headers/include/cuda/std/__algorithm/mismatch.h +83 -0
  578. cuda/cccl/headers/include/cuda/std/__algorithm/move.h +91 -0
  579. cuda/cccl/headers/include/cuda/std/__algorithm/move_backward.h +84 -0
  580. cuda/cccl/headers/include/cuda/std/__algorithm/next_permutation.h +88 -0
  581. cuda/cccl/headers/include/cuda/std/__algorithm/none_of.h +45 -0
  582. cuda/cccl/headers/include/cuda/std/__algorithm/nth_element.h +309 -0
  583. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort.h +102 -0
  584. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort_copy.h +122 -0
  585. cuda/cccl/headers/include/cuda/std/__algorithm/partition.h +120 -0
  586. cuda/cccl/headers/include/cuda/std/__algorithm/partition_copy.h +59 -0
  587. cuda/cccl/headers/include/cuda/std/__algorithm/partition_point.h +61 -0
  588. cuda/cccl/headers/include/cuda/std/__algorithm/pop_heap.h +93 -0
  589. cuda/cccl/headers/include/cuda/std/__algorithm/prev_permutation.h +88 -0
  590. cuda/cccl/headers/include/cuda/std/__algorithm/push_heap.h +100 -0
  591. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_find_if.h +78 -0
  592. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_find_if_not.h +85 -0
  593. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_for_each.h +84 -0
  594. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_for_each_n.h +68 -0
  595. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_iterator_concept.h +65 -0
  596. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_min.h +97 -0
  597. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_min_element.h +68 -0
  598. cuda/cccl/headers/include/cuda/std/__algorithm/remove.h +55 -0
  599. cuda/cccl/headers/include/cuda/std/__algorithm/remove_copy.h +47 -0
  600. cuda/cccl/headers/include/cuda/std/__algorithm/remove_copy_if.h +47 -0
  601. cuda/cccl/headers/include/cuda/std/__algorithm/remove_if.h +56 -0
  602. cuda/cccl/headers/include/cuda/std/__algorithm/replace.h +45 -0
  603. cuda/cccl/headers/include/cuda/std/__algorithm/replace_copy.h +54 -0
  604. cuda/cccl/headers/include/cuda/std/__algorithm/replace_copy_if.h +50 -0
  605. cuda/cccl/headers/include/cuda/std/__algorithm/replace_if.h +45 -0
  606. cuda/cccl/headers/include/cuda/std/__algorithm/reverse.h +81 -0
  607. cuda/cccl/headers/include/cuda/std/__algorithm/reverse_copy.h +43 -0
  608. cuda/cccl/headers/include/cuda/std/__algorithm/rotate.h +261 -0
  609. cuda/cccl/headers/include/cuda/std/__algorithm/rotate_copy.h +40 -0
  610. cuda/cccl/headers/include/cuda/std/__algorithm/sample.h +116 -0
  611. cuda/cccl/headers/include/cuda/std/__algorithm/search.h +185 -0
  612. cuda/cccl/headers/include/cuda/std/__algorithm/search_n.h +163 -0
  613. cuda/cccl/headers/include/cuda/std/__algorithm/set_difference.h +95 -0
  614. cuda/cccl/headers/include/cuda/std/__algorithm/set_intersection.h +122 -0
  615. cuda/cccl/headers/include/cuda/std/__algorithm/set_symmetric_difference.h +134 -0
  616. cuda/cccl/headers/include/cuda/std/__algorithm/set_union.h +128 -0
  617. cuda/cccl/headers/include/cuda/std/__algorithm/shift_left.h +84 -0
  618. cuda/cccl/headers/include/cuda/std/__algorithm/shift_right.h +144 -0
  619. cuda/cccl/headers/include/cuda/std/__algorithm/shuffle.h +71 -0
  620. cuda/cccl/headers/include/cuda/std/__algorithm/sift_down.h +139 -0
  621. cuda/cccl/headers/include/cuda/std/__algorithm/sort.h +1097 -0
  622. cuda/cccl/headers/include/cuda/std/__algorithm/sort_heap.h +70 -0
  623. cuda/cccl/headers/include/cuda/std/__algorithm/stable_partition.h +359 -0
  624. cuda/cccl/headers/include/cuda/std/__algorithm/stable_sort.h +321 -0
  625. cuda/cccl/headers/include/cuda/std/__algorithm/swap_ranges.h +78 -0
  626. cuda/cccl/headers/include/cuda/std/__algorithm/transform.h +59 -0
  627. cuda/cccl/headers/include/cuda/std/__algorithm/unique.h +76 -0
  628. cuda/cccl/headers/include/cuda/std/__algorithm/unique_copy.h +153 -0
  629. cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_iter.h +95 -0
  630. cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_range.h +126 -0
  631. cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +83 -0
  632. cuda/cccl/headers/include/cuda/std/__atomic/api/common.h +192 -0
  633. cuda/cccl/headers/include/cuda/std/__atomic/api/owned.h +136 -0
  634. cuda/cccl/headers/include/cuda/std/__atomic/api/reference.h +118 -0
  635. cuda/cccl/headers/include/cuda/std/__atomic/functions/common.h +58 -0
  636. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_local.h +208 -0
  637. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_derived.h +458 -0
  638. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated.h +4436 -0
  639. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h +184 -0
  640. cuda/cccl/headers/include/cuda/std/__atomic/functions/host.h +242 -0
  641. cuda/cccl/headers/include/cuda/std/__atomic/functions.h +33 -0
  642. cuda/cccl/headers/include/cuda/std/__atomic/order.h +158 -0
  643. cuda/cccl/headers/include/cuda/std/__atomic/platform/msvc_to_builtins.h +654 -0
  644. cuda/cccl/headers/include/cuda/std/__atomic/platform.h +93 -0
  645. cuda/cccl/headers/include/cuda/std/__atomic/scopes.h +105 -0
  646. cuda/cccl/headers/include/cuda/std/__atomic/types/base.h +242 -0
  647. cuda/cccl/headers/include/cuda/std/__atomic/types/common.h +103 -0
  648. cuda/cccl/headers/include/cuda/std/__atomic/types/locked.h +225 -0
  649. cuda/cccl/headers/include/cuda/std/__atomic/types/reference.h +72 -0
  650. cuda/cccl/headers/include/cuda/std/__atomic/types/small.h +238 -0
  651. cuda/cccl/headers/include/cuda/std/__atomic/types.h +51 -0
  652. cuda/cccl/headers/include/cuda/std/__atomic/wait/notify_wait.h +95 -0
  653. cuda/cccl/headers/include/cuda/std/__atomic/wait/polling.h +65 -0
  654. cuda/cccl/headers/include/cuda/std/__barrier/barrier.h +227 -0
  655. cuda/cccl/headers/include/cuda/std/__barrier/empty_completion.h +37 -0
  656. cuda/cccl/headers/include/cuda/std/__barrier/poll_tester.h +82 -0
  657. cuda/cccl/headers/include/cuda/std/__bit/bit_cast.h +81 -0
  658. cuda/cccl/headers/include/cuda/std/__bit/blsr.h +51 -0
  659. cuda/cccl/headers/include/cuda/std/__bit/byteswap.h +209 -0
  660. cuda/cccl/headers/include/cuda/std/__bit/countl.h +191 -0
  661. cuda/cccl/headers/include/cuda/std/__bit/countr.h +202 -0
  662. cuda/cccl/headers/include/cuda/std/__bit/endian.h +39 -0
  663. cuda/cccl/headers/include/cuda/std/__bit/has_single_bit.h +43 -0
  664. cuda/cccl/headers/include/cuda/std/__bit/integral.h +125 -0
  665. cuda/cccl/headers/include/cuda/std/__bit/popcount.h +172 -0
  666. cuda/cccl/headers/include/cuda/std/__bit/reference.h +1272 -0
  667. cuda/cccl/headers/include/cuda/std/__bit/rotate.h +185 -0
  668. cuda/cccl/headers/include/cuda/std/__cccl/algorithm_wrapper.h +36 -0
  669. cuda/cccl/headers/include/cuda/std/__cccl/architecture.h +78 -0
  670. cuda/cccl/headers/include/cuda/std/__cccl/assert.h +161 -0
  671. cuda/cccl/headers/include/cuda/std/__cccl/attributes.h +206 -0
  672. cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +494 -0
  673. cuda/cccl/headers/include/cuda/std/__cccl/compiler.h +213 -0
  674. cuda/cccl/headers/include/cuda/std/__cccl/cuda_capabilities.h +51 -0
  675. cuda/cccl/headers/include/cuda/std/__cccl/cuda_toolkit.h +56 -0
  676. cuda/cccl/headers/include/cuda/std/__cccl/deprecated.h +87 -0
  677. cuda/cccl/headers/include/cuda/std/__cccl/diagnostic.h +131 -0
  678. cuda/cccl/headers/include/cuda/std/__cccl/dialect.h +197 -0
  679. cuda/cccl/headers/include/cuda/std/__cccl/epilogue.h +355 -0
  680. cuda/cccl/headers/include/cuda/std/__cccl/exceptions.h +36 -0
  681. cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +74 -0
  682. cuda/cccl/headers/include/cuda/std/__cccl/extended_data_types.h +139 -0
  683. cuda/cccl/headers/include/cuda/std/__cccl/host_std_lib.h +59 -0
  684. cuda/cccl/headers/include/cuda/std/__cccl/is_non_narrowing_convertible.h +71 -0
  685. cuda/cccl/headers/include/cuda/std/__cccl/memory_wrapper.h +36 -0
  686. cuda/cccl/headers/include/cuda/std/__cccl/numeric_wrapper.h +36 -0
  687. cuda/cccl/headers/include/cuda/std/__cccl/os.h +54 -0
  688. cuda/cccl/headers/include/cuda/std/__cccl/preprocessor.h +1288 -0
  689. cuda/cccl/headers/include/cuda/std/__cccl/prologue.h +312 -0
  690. cuda/cccl/headers/include/cuda/std/__cccl/ptx_isa.h +363 -0
  691. cuda/cccl/headers/include/cuda/std/__cccl/rtti.h +72 -0
  692. cuda/cccl/headers/include/cuda/std/__cccl/sequence_access.h +87 -0
  693. cuda/cccl/headers/include/cuda/std/__cccl/system_header.h +38 -0
  694. cuda/cccl/headers/include/cuda/std/__cccl/unreachable.h +31 -0
  695. cuda/cccl/headers/include/cuda/std/__cccl/version.h +26 -0
  696. cuda/cccl/headers/include/cuda/std/__cccl/visibility.h +171 -0
  697. cuda/cccl/headers/include/cuda/std/__charconv/chars_format.h +81 -0
  698. cuda/cccl/headers/include/cuda/std/__charconv/from_chars.h +171 -0
  699. cuda/cccl/headers/include/cuda/std/__charconv/from_chars_result.h +56 -0
  700. cuda/cccl/headers/include/cuda/std/__charconv/to_chars.h +192 -0
  701. cuda/cccl/headers/include/cuda/std/__charconv/to_chars_result.h +56 -0
  702. cuda/cccl/headers/include/cuda/std/__chrono/calendar.h +52 -0
  703. cuda/cccl/headers/include/cuda/std/__chrono/day.h +160 -0
  704. cuda/cccl/headers/include/cuda/std/__chrono/duration.h +499 -0
  705. cuda/cccl/headers/include/cuda/std/__chrono/file_clock.h +53 -0
  706. cuda/cccl/headers/include/cuda/std/__chrono/high_resolution_clock.h +44 -0
  707. cuda/cccl/headers/include/cuda/std/__chrono/month.h +185 -0
  708. cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +58 -0
  709. cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +78 -0
  710. cuda/cccl/headers/include/cuda/std/__chrono/time_point.h +255 -0
  711. cuda/cccl/headers/include/cuda/std/__chrono/year.h +184 -0
  712. cuda/cccl/headers/include/cuda/std/__cmath/abs.h +127 -0
  713. cuda/cccl/headers/include/cuda/std/__cmath/copysign.h +88 -0
  714. cuda/cccl/headers/include/cuda/std/__cmath/error_functions.h +204 -0
  715. cuda/cccl/headers/include/cuda/std/__cmath/exponential_functions.h +783 -0
  716. cuda/cccl/headers/include/cuda/std/__cmath/fdim.h +122 -0
  717. cuda/cccl/headers/include/cuda/std/__cmath/fma.h +129 -0
  718. cuda/cccl/headers/include/cuda/std/__cmath/fpclassify.h +230 -0
  719. cuda/cccl/headers/include/cuda/std/__cmath/gamma.h +204 -0
  720. cuda/cccl/headers/include/cuda/std/__cmath/hyperbolic_functions.h +285 -0
  721. cuda/cccl/headers/include/cuda/std/__cmath/hypot.h +220 -0
  722. cuda/cccl/headers/include/cuda/std/__cmath/inverse_hyperbolic_functions.h +285 -0
  723. cuda/cccl/headers/include/cuda/std/__cmath/inverse_trigonometric_functions.h +370 -0
  724. cuda/cccl/headers/include/cuda/std/__cmath/isfinite.h +166 -0
  725. cuda/cccl/headers/include/cuda/std/__cmath/isinf.h +203 -0
  726. cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +184 -0
  727. cuda/cccl/headers/include/cuda/std/__cmath/isnormal.h +138 -0
  728. cuda/cccl/headers/include/cuda/std/__cmath/lerp.h +101 -0
  729. cuda/cccl/headers/include/cuda/std/__cmath/logarithms.h +533 -0
  730. cuda/cccl/headers/include/cuda/std/__cmath/min_max.h +287 -0
  731. cuda/cccl/headers/include/cuda/std/__cmath/modulo.h +208 -0
  732. cuda/cccl/headers/include/cuda/std/__cmath/nan.h +54 -0
  733. cuda/cccl/headers/include/cuda/std/__cmath/remainder.h +210 -0
  734. cuda/cccl/headers/include/cuda/std/__cmath/roots.h +198 -0
  735. cuda/cccl/headers/include/cuda/std/__cmath/rounding_functions.h +983 -0
  736. cuda/cccl/headers/include/cuda/std/__cmath/signbit.h +56 -0
  737. cuda/cccl/headers/include/cuda/std/__cmath/traits.h +242 -0
  738. cuda/cccl/headers/include/cuda/std/__cmath/trigonometric_functions.h +327 -0
  739. cuda/cccl/headers/include/cuda/std/__complex/arg.h +84 -0
  740. cuda/cccl/headers/include/cuda/std/__complex/complex.h +669 -0
  741. cuda/cccl/headers/include/cuda/std/__complex/exponential_functions.h +403 -0
  742. cuda/cccl/headers/include/cuda/std/__complex/hyperbolic_functions.h +119 -0
  743. cuda/cccl/headers/include/cuda/std/__complex/inverse_hyperbolic_functions.h +522 -0
  744. cuda/cccl/headers/include/cuda/std/__complex/inverse_trigonometric_functions.h +131 -0
  745. cuda/cccl/headers/include/cuda/std/__complex/literals.h +86 -0
  746. cuda/cccl/headers/include/cuda/std/__complex/logarithms.h +302 -0
  747. cuda/cccl/headers/include/cuda/std/__complex/math.h +161 -0
  748. cuda/cccl/headers/include/cuda/std/__complex/nvbf16.h +323 -0
  749. cuda/cccl/headers/include/cuda/std/__complex/nvfp16.h +322 -0
  750. cuda/cccl/headers/include/cuda/std/__complex/roots.h +214 -0
  751. cuda/cccl/headers/include/cuda/std/__complex/trigonometric_functions.h +61 -0
  752. cuda/cccl/headers/include/cuda/std/__complex/tuple.h +107 -0
  753. cuda/cccl/headers/include/cuda/std/__complex/vector_support.h +130 -0
  754. cuda/cccl/headers/include/cuda/std/__concepts/arithmetic.h +56 -0
  755. cuda/cccl/headers/include/cuda/std/__concepts/assignable.h +64 -0
  756. cuda/cccl/headers/include/cuda/std/__concepts/boolean_testable.h +63 -0
  757. cuda/cccl/headers/include/cuda/std/__concepts/class_or_enum.h +45 -0
  758. cuda/cccl/headers/include/cuda/std/__concepts/common_reference_with.h +69 -0
  759. cuda/cccl/headers/include/cuda/std/__concepts/common_with.h +82 -0
  760. cuda/cccl/headers/include/cuda/std/__concepts/concept_macros.h +385 -0
  761. cuda/cccl/headers/include/cuda/std/__concepts/constructible.h +174 -0
  762. cuda/cccl/headers/include/cuda/std/__concepts/convertible_to.h +70 -0
  763. cuda/cccl/headers/include/cuda/std/__concepts/copyable.h +60 -0
  764. cuda/cccl/headers/include/cuda/std/__concepts/derived_from.h +56 -0
  765. cuda/cccl/headers/include/cuda/std/__concepts/destructible.h +76 -0
  766. cuda/cccl/headers/include/cuda/std/__concepts/different_from.h +38 -0
  767. cuda/cccl/headers/include/cuda/std/__concepts/equality_comparable.h +98 -0
  768. cuda/cccl/headers/include/cuda/std/__concepts/invocable.h +80 -0
  769. cuda/cccl/headers/include/cuda/std/__concepts/movable.h +58 -0
  770. cuda/cccl/headers/include/cuda/std/__concepts/predicate.h +54 -0
  771. cuda/cccl/headers/include/cuda/std/__concepts/regular.h +54 -0
  772. cuda/cccl/headers/include/cuda/std/__concepts/relation.h +77 -0
  773. cuda/cccl/headers/include/cuda/std/__concepts/same_as.h +39 -0
  774. cuda/cccl/headers/include/cuda/std/__concepts/semiregular.h +54 -0
  775. cuda/cccl/headers/include/cuda/std/__concepts/swappable.h +206 -0
  776. cuda/cccl/headers/include/cuda/std/__concepts/totally_ordered.h +101 -0
  777. cuda/cccl/headers/include/cuda/std/__cstddef/byte.h +113 -0
  778. cuda/cccl/headers/include/cuda/std/__cstddef/types.h +52 -0
  779. cuda/cccl/headers/include/cuda/std/__cstdlib/abs.h +57 -0
  780. cuda/cccl/headers/include/cuda/std/__cstdlib/aligned_alloc.h +66 -0
  781. cuda/cccl/headers/include/cuda/std/__cstdlib/div.h +96 -0
  782. cuda/cccl/headers/include/cuda/std/__cstdlib/malloc.h +70 -0
  783. cuda/cccl/headers/include/cuda/std/__cstring/memcpy.h +61 -0
  784. cuda/cccl/headers/include/cuda/std/__cstring/memset.h +46 -0
  785. cuda/cccl/headers/include/cuda/std/__exception/cuda_error.h +110 -0
  786. cuda/cccl/headers/include/cuda/std/__exception/exception_macros.h +108 -0
  787. cuda/cccl/headers/include/cuda/std/__exception/format_error.h +62 -0
  788. cuda/cccl/headers/include/cuda/std/__exception/msg_storage.h +41 -0
  789. cuda/cccl/headers/include/cuda/std/__exception/terminate.h +74 -0
  790. cuda/cccl/headers/include/cuda/std/__exception/throw_error.h +120 -0
  791. cuda/cccl/headers/include/cuda/std/__execution/env.h +455 -0
  792. cuda/cccl/headers/include/cuda/std/__execution/policy.h +90 -0
  793. cuda/cccl/headers/include/cuda/std/__expected/bad_expected_access.h +127 -0
  794. cuda/cccl/headers/include/cuda/std/__expected/expected.h +1941 -0
  795. cuda/cccl/headers/include/cuda/std/__expected/expected_base.h +1051 -0
  796. cuda/cccl/headers/include/cuda/std/__expected/unexpect.h +37 -0
  797. cuda/cccl/headers/include/cuda/std/__expected/unexpected.h +164 -0
  798. cuda/cccl/headers/include/cuda/std/__floating_point/arithmetic.h +56 -0
  799. cuda/cccl/headers/include/cuda/std/__floating_point/cast.h +812 -0
  800. cuda/cccl/headers/include/cuda/std/__floating_point/cccl_fp.h +125 -0
  801. cuda/cccl/headers/include/cuda/std/__floating_point/common_type.h +48 -0
  802. cuda/cccl/headers/include/cuda/std/__floating_point/constants.h +375 -0
  803. cuda/cccl/headers/include/cuda/std/__floating_point/conversion_rank_order.h +124 -0
  804. cuda/cccl/headers/include/cuda/std/__floating_point/cuda_fp_types.h +116 -0
  805. cuda/cccl/headers/include/cuda/std/__floating_point/decompose.h +126 -0
  806. cuda/cccl/headers/include/cuda/std/__floating_point/format.h +162 -0
  807. cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +40 -0
  808. cuda/cccl/headers/include/cuda/std/__floating_point/mask.h +78 -0
  809. cuda/cccl/headers/include/cuda/std/__floating_point/native_type.h +81 -0
  810. cuda/cccl/headers/include/cuda/std/__floating_point/overflow_handler.h +139 -0
  811. cuda/cccl/headers/include/cuda/std/__floating_point/properties.h +229 -0
  812. cuda/cccl/headers/include/cuda/std/__floating_point/storage.h +248 -0
  813. cuda/cccl/headers/include/cuda/std/__floating_point/traits.h +172 -0
  814. cuda/cccl/headers/include/cuda/std/__format/buffer.h +48 -0
  815. cuda/cccl/headers/include/cuda/std/__format/concepts.h +69 -0
  816. cuda/cccl/headers/include/cuda/std/__format/format_arg.h +282 -0
  817. cuda/cccl/headers/include/cuda/std/__format/format_arg_store.h +279 -0
  818. cuda/cccl/headers/include/cuda/std/__format/format_args.h +122 -0
  819. cuda/cccl/headers/include/cuda/std/__format/format_context.h +93 -0
  820. cuda/cccl/headers/include/cuda/std/__format/format_error.h +76 -0
  821. cuda/cccl/headers/include/cuda/std/__format/format_integral.h +237 -0
  822. cuda/cccl/headers/include/cuda/std/__format/format_parse_context.h +124 -0
  823. cuda/cccl/headers/include/cuda/std/__format/format_spec_parser.h +1265 -0
  824. cuda/cccl/headers/include/cuda/std/__format/formatter.h +59 -0
  825. cuda/cccl/headers/include/cuda/std/__format/formatters/bool.h +101 -0
  826. cuda/cccl/headers/include/cuda/std/__format/formatters/char.h +124 -0
  827. cuda/cccl/headers/include/cuda/std/__format/formatters/fp.h +101 -0
  828. cuda/cccl/headers/include/cuda/std/__format/formatters/int.h +174 -0
  829. cuda/cccl/headers/include/cuda/std/__format/formatters/ptr.h +104 -0
  830. cuda/cccl/headers/include/cuda/std/__format/formatters/str.h +178 -0
  831. cuda/cccl/headers/include/cuda/std/__format/output_utils.h +272 -0
  832. cuda/cccl/headers/include/cuda/std/__format/parse_arg_id.h +138 -0
  833. cuda/cccl/headers/include/cuda/std/__format_ +45 -0
  834. cuda/cccl/headers/include/cuda/std/__functional/binary_function.h +63 -0
  835. cuda/cccl/headers/include/cuda/std/__functional/binary_negate.h +65 -0
  836. cuda/cccl/headers/include/cuda/std/__functional/bind.h +334 -0
  837. cuda/cccl/headers/include/cuda/std/__functional/bind_back.h +81 -0
  838. cuda/cccl/headers/include/cuda/std/__functional/bind_front.h +74 -0
  839. cuda/cccl/headers/include/cuda/std/__functional/binder1st.h +74 -0
  840. cuda/cccl/headers/include/cuda/std/__functional/binder2nd.h +74 -0
  841. cuda/cccl/headers/include/cuda/std/__functional/compose.h +69 -0
  842. cuda/cccl/headers/include/cuda/std/__functional/default_searcher.h +76 -0
  843. cuda/cccl/headers/include/cuda/std/__functional/function.h +1271 -0
  844. cuda/cccl/headers/include/cuda/std/__functional/hash.h +649 -0
  845. cuda/cccl/headers/include/cuda/std/__functional/identity.h +57 -0
  846. cuda/cccl/headers/include/cuda/std/__functional/invoke.h +298 -0
  847. cuda/cccl/headers/include/cuda/std/__functional/is_transparent.h +41 -0
  848. cuda/cccl/headers/include/cuda/std/__functional/mem_fn.h +66 -0
  849. cuda/cccl/headers/include/cuda/std/__functional/mem_fun_ref.h +211 -0
  850. cuda/cccl/headers/include/cuda/std/__functional/not_fn.h +120 -0
  851. cuda/cccl/headers/include/cuda/std/__functional/operations.h +535 -0
  852. cuda/cccl/headers/include/cuda/std/__functional/perfect_forward.h +128 -0
  853. cuda/cccl/headers/include/cuda/std/__functional/pointer_to_binary_function.h +64 -0
  854. cuda/cccl/headers/include/cuda/std/__functional/pointer_to_unary_function.h +63 -0
  855. cuda/cccl/headers/include/cuda/std/__functional/ranges_operations.h +113 -0
  856. cuda/cccl/headers/include/cuda/std/__functional/reference_wrapper.h +114 -0
  857. cuda/cccl/headers/include/cuda/std/__functional/unary_function.h +62 -0
  858. cuda/cccl/headers/include/cuda/std/__functional/unary_negate.h +65 -0
  859. cuda/cccl/headers/include/cuda/std/__functional/unwrap_ref.h +56 -0
  860. cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +262 -0
  861. cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +53 -0
  862. cuda/cccl/headers/include/cuda/std/__fwd/array.h +42 -0
  863. cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +74 -0
  864. cuda/cccl/headers/include/cuda/std/__fwd/complex.h +75 -0
  865. cuda/cccl/headers/include/cuda/std/__fwd/execution_policy.h +73 -0
  866. cuda/cccl/headers/include/cuda/std/__fwd/expected.h +46 -0
  867. cuda/cccl/headers/include/cuda/std/__fwd/format.h +84 -0
  868. cuda/cccl/headers/include/cuda/std/__fwd/fp.h +37 -0
  869. cuda/cccl/headers/include/cuda/std/__fwd/get.h +122 -0
  870. cuda/cccl/headers/include/cuda/std/__fwd/hash.h +34 -0
  871. cuda/cccl/headers/include/cuda/std/__fwd/ios.h +123 -0
  872. cuda/cccl/headers/include/cuda/std/__fwd/iterator.h +43 -0
  873. cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +122 -0
  874. cuda/cccl/headers/include/cuda/std/__fwd/memory_resource.h +37 -0
  875. cuda/cccl/headers/include/cuda/std/__fwd/optional.h +39 -0
  876. cuda/cccl/headers/include/cuda/std/__fwd/pair.h +47 -0
  877. cuda/cccl/headers/include/cuda/std/__fwd/reference_wrapper.h +39 -0
  878. cuda/cccl/headers/include/cuda/std/__fwd/span.h +45 -0
  879. cuda/cccl/headers/include/cuda/std/__fwd/string.h +112 -0
  880. cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +91 -0
  881. cuda/cccl/headers/include/cuda/std/__fwd/subrange.h +55 -0
  882. cuda/cccl/headers/include/cuda/std/__fwd/tuple.h +37 -0
  883. cuda/cccl/headers/include/cuda/std/__fwd/unexpected.h +40 -0
  884. cuda/cccl/headers/include/cuda/std/__fwd/variant.h +51 -0
  885. cuda/cccl/headers/include/cuda/std/__internal/atomic.h +55 -0
  886. cuda/cccl/headers/include/cuda/std/__internal/cpp_dialect.h +44 -0
  887. cuda/cccl/headers/include/cuda/std/__internal/features.h +104 -0
  888. cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +132 -0
  889. cuda/cccl/headers/include/cuda/std/__internal/pstl_config.h +32 -0
  890. cuda/cccl/headers/include/cuda/std/__internal/thread_api.h +58 -0
  891. cuda/cccl/headers/include/cuda/std/__internal/version.h +52 -0
  892. cuda/cccl/headers/include/cuda/std/__iterator/access.h +128 -0
  893. cuda/cccl/headers/include/cuda/std/__iterator/advance.h +227 -0
  894. cuda/cccl/headers/include/cuda/std/__iterator/back_insert_iterator.h +164 -0
  895. cuda/cccl/headers/include/cuda/std/__iterator/bounded_iter.h +253 -0
  896. cuda/cccl/headers/include/cuda/std/__iterator/concepts.h +645 -0
  897. cuda/cccl/headers/include/cuda/std/__iterator/counted_iterator.h +465 -0
  898. cuda/cccl/headers/include/cuda/std/__iterator/data.h +61 -0
  899. cuda/cccl/headers/include/cuda/std/__iterator/default_sentinel.h +36 -0
  900. cuda/cccl/headers/include/cuda/std/__iterator/distance.h +124 -0
  901. cuda/cccl/headers/include/cuda/std/__iterator/empty.h +53 -0
  902. cuda/cccl/headers/include/cuda/std/__iterator/erase_if_container.h +53 -0
  903. cuda/cccl/headers/include/cuda/std/__iterator/front_insert_iterator.h +100 -0
  904. cuda/cccl/headers/include/cuda/std/__iterator/incrementable_traits.h +143 -0
  905. cuda/cccl/headers/include/cuda/std/__iterator/indirectly_comparable.h +55 -0
  906. cuda/cccl/headers/include/cuda/std/__iterator/insert_iterator.h +107 -0
  907. cuda/cccl/headers/include/cuda/std/__iterator/istream_iterator.h +146 -0
  908. cuda/cccl/headers/include/cuda/std/__iterator/istreambuf_iterator.h +161 -0
  909. cuda/cccl/headers/include/cuda/std/__iterator/iter_move.h +161 -0
  910. cuda/cccl/headers/include/cuda/std/__iterator/iter_swap.h +181 -0
  911. cuda/cccl/headers/include/cuda/std/__iterator/iterator.h +44 -0
  912. cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +847 -0
  913. cuda/cccl/headers/include/cuda/std/__iterator/mergeable.h +72 -0
  914. cuda/cccl/headers/include/cuda/std/__iterator/move_iterator.h +432 -0
  915. cuda/cccl/headers/include/cuda/std/__iterator/move_sentinel.h +73 -0
  916. cuda/cccl/headers/include/cuda/std/__iterator/next.h +101 -0
  917. cuda/cccl/headers/include/cuda/std/__iterator/ostream_iterator.h +95 -0
  918. cuda/cccl/headers/include/cuda/std/__iterator/ostreambuf_iterator.h +100 -0
  919. cuda/cccl/headers/include/cuda/std/__iterator/permutable.h +54 -0
  920. cuda/cccl/headers/include/cuda/std/__iterator/prev.h +90 -0
  921. cuda/cccl/headers/include/cuda/std/__iterator/projected.h +61 -0
  922. cuda/cccl/headers/include/cuda/std/__iterator/readable_traits.h +156 -0
  923. cuda/cccl/headers/include/cuda/std/__iterator/reverse_access.h +142 -0
  924. cuda/cccl/headers/include/cuda/std/__iterator/reverse_iterator.h +366 -0
  925. cuda/cccl/headers/include/cuda/std/__iterator/size.h +69 -0
  926. cuda/cccl/headers/include/cuda/std/__iterator/sortable.h +55 -0
  927. cuda/cccl/headers/include/cuda/std/__iterator/unreachable_sentinel.h +84 -0
  928. cuda/cccl/headers/include/cuda/std/__iterator/wrap_iter.h +245 -0
  929. cuda/cccl/headers/include/cuda/std/__latch/latch.h +88 -0
  930. cuda/cccl/headers/include/cuda/std/__limits/numeric_limits.h +603 -0
  931. cuda/cccl/headers/include/cuda/std/__limits/numeric_limits_ext.h +753 -0
  932. cuda/cccl/headers/include/cuda/std/__linalg/conj_if_needed.h +76 -0
  933. cuda/cccl/headers/include/cuda/std/__linalg/conjugate_transposed.h +52 -0
  934. cuda/cccl/headers/include/cuda/std/__linalg/conjugated.h +136 -0
  935. cuda/cccl/headers/include/cuda/std/__linalg/scaled.h +128 -0
  936. cuda/cccl/headers/include/cuda/std/__linalg/transposed.h +315 -0
  937. cuda/cccl/headers/include/cuda/std/__mdspan/aligned_accessor.h +97 -0
  938. cuda/cccl/headers/include/cuda/std/__mdspan/concepts.h +137 -0
  939. cuda/cccl/headers/include/cuda/std/__mdspan/default_accessor.h +73 -0
  940. cuda/cccl/headers/include/cuda/std/__mdspan/empty_base.h +348 -0
  941. cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +749 -0
  942. cuda/cccl/headers/include/cuda/std/__mdspan/layout_left.h +314 -0
  943. cuda/cccl/headers/include/cuda/std/__mdspan/layout_right.h +307 -0
  944. cuda/cccl/headers/include/cuda/std/__mdspan/layout_stride.h +598 -0
  945. cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +515 -0
  946. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_extents.h +190 -0
  947. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_helper.h +187 -0
  948. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_mapping.h +339 -0
  949. cuda/cccl/headers/include/cuda/std/__memory/addressof.h +90 -0
  950. cuda/cccl/headers/include/cuda/std/__memory/align.h +67 -0
  951. cuda/cccl/headers/include/cuda/std/__memory/allocate_at_least.h +82 -0
  952. cuda/cccl/headers/include/cuda/std/__memory/allocation_guard.h +100 -0
  953. cuda/cccl/headers/include/cuda/std/__memory/allocator.h +327 -0
  954. cuda/cccl/headers/include/cuda/std/__memory/allocator_arg_t.h +85 -0
  955. cuda/cccl/headers/include/cuda/std/__memory/allocator_destructor.h +59 -0
  956. cuda/cccl/headers/include/cuda/std/__memory/allocator_traits.h +517 -0
  957. cuda/cccl/headers/include/cuda/std/__memory/assume_aligned.h +59 -0
  958. cuda/cccl/headers/include/cuda/std/__memory/builtin_new_allocator.h +87 -0
  959. cuda/cccl/headers/include/cuda/std/__memory/compressed_pair.h +225 -0
  960. cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +241 -0
  961. cuda/cccl/headers/include/cuda/std/__memory/destruct_n.h +91 -0
  962. cuda/cccl/headers/include/cuda/std/__memory/is_sufficiently_aligned.h +51 -0
  963. cuda/cccl/headers/include/cuda/std/__memory/pointer_traits.h +246 -0
  964. cuda/cccl/headers/include/cuda/std/__memory/runtime_assume_aligned.h +62 -0
  965. cuda/cccl/headers/include/cuda/std/__memory/temporary_buffer.h +99 -0
  966. cuda/cccl/headers/include/cuda/std/__memory/uninitialized_algorithms.h +678 -0
  967. cuda/cccl/headers/include/cuda/std/__memory/unique_ptr.h +765 -0
  968. cuda/cccl/headers/include/cuda/std/__memory/uses_allocator.h +59 -0
  969. cuda/cccl/headers/include/cuda/std/__memory/voidify.h +41 -0
  970. cuda/cccl/headers/include/cuda/std/__memory_ +34 -0
  971. cuda/cccl/headers/include/cuda/std/__new/allocate.h +131 -0
  972. cuda/cccl/headers/include/cuda/std/__new/bad_alloc.h +57 -0
  973. cuda/cccl/headers/include/cuda/std/__new/device_new.h +30 -0
  974. cuda/cccl/headers/include/cuda/std/__new/launder.h +53 -0
  975. cuda/cccl/headers/include/cuda/std/__new_ +30 -0
  976. cuda/cccl/headers/include/cuda/std/__numeric/accumulate.h +56 -0
  977. cuda/cccl/headers/include/cuda/std/__numeric/adjacent_difference.h +72 -0
  978. cuda/cccl/headers/include/cuda/std/__numeric/exclusive_scan.h +66 -0
  979. cuda/cccl/headers/include/cuda/std/__numeric/gcd_lcm.h +78 -0
  980. cuda/cccl/headers/include/cuda/std/__numeric/inclusive_scan.h +73 -0
  981. cuda/cccl/headers/include/cuda/std/__numeric/inner_product.h +62 -0
  982. cuda/cccl/headers/include/cuda/std/__numeric/iota.h +42 -0
  983. cuda/cccl/headers/include/cuda/std/__numeric/midpoint.h +97 -0
  984. cuda/cccl/headers/include/cuda/std/__numeric/partial_sum.h +69 -0
  985. cuda/cccl/headers/include/cuda/std/__numeric/reduce.h +60 -0
  986. cuda/cccl/headers/include/cuda/std/__numeric/transform_exclusive_scan.h +51 -0
  987. cuda/cccl/headers/include/cuda/std/__numeric/transform_inclusive_scan.h +65 -0
  988. cuda/cccl/headers/include/cuda/std/__numeric/transform_reduce.h +72 -0
  989. cuda/cccl/headers/include/cuda/std/__optional/bad_optional_access.h +74 -0
  990. cuda/cccl/headers/include/cuda/std/__optional/hash.h +53 -0
  991. cuda/cccl/headers/include/cuda/std/__optional/make_optional.h +61 -0
  992. cuda/cccl/headers/include/cuda/std/__optional/nullopt.h +43 -0
  993. cuda/cccl/headers/include/cuda/std/__optional/optional.h +861 -0
  994. cuda/cccl/headers/include/cuda/std/__optional/optional_base.h +439 -0
  995. cuda/cccl/headers/include/cuda/std/__optional/optional_ref.h +324 -0
  996. cuda/cccl/headers/include/cuda/std/__pstl/cuda/for_each_n.h +97 -0
  997. cuda/cccl/headers/include/cuda/std/__pstl/dispatch.h +123 -0
  998. cuda/cccl/headers/include/cuda/std/__pstl/for_each.h +71 -0
  999. cuda/cccl/headers/include/cuda/std/__pstl/for_each_n.h +68 -0
  1000. cuda/cccl/headers/include/cuda/std/__random/bernoulli_distribution.h +173 -0
  1001. cuda/cccl/headers/include/cuda/std/__random/binomial_distribution.h +254 -0
  1002. cuda/cccl/headers/include/cuda/std/__random/cauchy_distribution.h +192 -0
  1003. cuda/cccl/headers/include/cuda/std/__random/chi_squared_distribution.h +179 -0
  1004. cuda/cccl/headers/include/cuda/std/__random/exponential_distribution.h +187 -0
  1005. cuda/cccl/headers/include/cuda/std/__random/extreme_value_distribution.h +196 -0
  1006. cuda/cccl/headers/include/cuda/std/__random/fisher_f_distribution.h +196 -0
  1007. cuda/cccl/headers/include/cuda/std/__random/gamma_distribution.h +257 -0
  1008. cuda/cccl/headers/include/cuda/std/__random/generate_canonical.h +56 -0
  1009. cuda/cccl/headers/include/cuda/std/__random/geometric_distribution.h +179 -0
  1010. cuda/cccl/headers/include/cuda/std/__random/is_seed_sequence.h +39 -0
  1011. cuda/cccl/headers/include/cuda/std/__random/is_valid.h +70 -0
  1012. cuda/cccl/headers/include/cuda/std/__random/linear_congruential_engine.h +398 -0
  1013. cuda/cccl/headers/include/cuda/std/__random/lognormal_distribution.h +174 -0
  1014. cuda/cccl/headers/include/cuda/std/__random/negative_binomial_distribution.h +212 -0
  1015. cuda/cccl/headers/include/cuda/std/__random/normal_distribution.h +232 -0
  1016. cuda/cccl/headers/include/cuda/std/__random/philox_engine.h +562 -0
  1017. cuda/cccl/headers/include/cuda/std/__random/poisson_distribution.h +338 -0
  1018. cuda/cccl/headers/include/cuda/std/__random/seed_seq.h +204 -0
  1019. cuda/cccl/headers/include/cuda/std/__random/student_t_distribution.h +186 -0
  1020. cuda/cccl/headers/include/cuda/std/__random/uniform_int_distribution.h +341 -0
  1021. cuda/cccl/headers/include/cuda/std/__random/uniform_real_distribution.h +192 -0
  1022. cuda/cccl/headers/include/cuda/std/__random/weibull_distribution.h +189 -0
  1023. cuda/cccl/headers/include/cuda/std/__random_ +47 -0
  1024. cuda/cccl/headers/include/cuda/std/__ranges/access.h +303 -0
  1025. cuda/cccl/headers/include/cuda/std/__ranges/all.h +98 -0
  1026. cuda/cccl/headers/include/cuda/std/__ranges/compressed_movable_box.h +889 -0
  1027. cuda/cccl/headers/include/cuda/std/__ranges/concepts.h +302 -0
  1028. cuda/cccl/headers/include/cuda/std/__ranges/counted.h +90 -0
  1029. cuda/cccl/headers/include/cuda/std/__ranges/dangling.h +54 -0
  1030. cuda/cccl/headers/include/cuda/std/__ranges/data.h +136 -0
  1031. cuda/cccl/headers/include/cuda/std/__ranges/drop_view.h +389 -0
  1032. cuda/cccl/headers/include/cuda/std/__ranges/empty.h +109 -0
  1033. cuda/cccl/headers/include/cuda/std/__ranges/empty_view.h +77 -0
  1034. cuda/cccl/headers/include/cuda/std/__ranges/enable_borrowed_range.h +41 -0
  1035. cuda/cccl/headers/include/cuda/std/__ranges/enable_view.h +78 -0
  1036. cuda/cccl/headers/include/cuda/std/__ranges/from_range.h +36 -0
  1037. cuda/cccl/headers/include/cuda/std/__ranges/iota_view.h +264 -0
  1038. cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +409 -0
  1039. cuda/cccl/headers/include/cuda/std/__ranges/non_propagating_cache.h +210 -0
  1040. cuda/cccl/headers/include/cuda/std/__ranges/owning_view.h +163 -0
  1041. cuda/cccl/headers/include/cuda/std/__ranges/range_adaptor.h +111 -0
  1042. cuda/cccl/headers/include/cuda/std/__ranges/rbegin.h +175 -0
  1043. cuda/cccl/headers/include/cuda/std/__ranges/ref_view.h +121 -0
  1044. cuda/cccl/headers/include/cuda/std/__ranges/rend.h +182 -0
  1045. cuda/cccl/headers/include/cuda/std/__ranges/repeat_view.h +346 -0
  1046. cuda/cccl/headers/include/cuda/std/__ranges/single_view.h +155 -0
  1047. cuda/cccl/headers/include/cuda/std/__ranges/size.h +201 -0
  1048. cuda/cccl/headers/include/cuda/std/__ranges/subrange.h +510 -0
  1049. cuda/cccl/headers/include/cuda/std/__ranges/take_view.h +472 -0
  1050. cuda/cccl/headers/include/cuda/std/__ranges/take_while_view.h +259 -0
  1051. cuda/cccl/headers/include/cuda/std/__ranges/transform_view.h +522 -0
  1052. cuda/cccl/headers/include/cuda/std/__ranges/unwrap_end.h +52 -0
  1053. cuda/cccl/headers/include/cuda/std/__ranges/view_interface.h +183 -0
  1054. cuda/cccl/headers/include/cuda/std/__ranges/views.h +38 -0
  1055. cuda/cccl/headers/include/cuda/std/__semaphore/atomic_semaphore.h +234 -0
  1056. cuda/cccl/headers/include/cuda/std/__semaphore/counting_semaphore.h +51 -0
  1057. cuda/cccl/headers/include/cuda/std/__string/char_traits.h +190 -0
  1058. cuda/cccl/headers/include/cuda/std/__string/constexpr_c_functions.h +580 -0
  1059. cuda/cccl/headers/include/cuda/std/__string/helper_functions.h +296 -0
  1060. cuda/cccl/headers/include/cuda/std/__string/string_view.h +244 -0
  1061. cuda/cccl/headers/include/cuda/std/__string_ +29 -0
  1062. cuda/cccl/headers/include/cuda/std/__system_error/errc.h +51 -0
  1063. cuda/cccl/headers/include/cuda/std/__system_error_ +26 -0
  1064. cuda/cccl/headers/include/cuda/std/__thread/threading_support.h +106 -0
  1065. cuda/cccl/headers/include/cuda/std/__thread/threading_support_cuda.h +47 -0
  1066. cuda/cccl/headers/include/cuda/std/__thread/threading_support_external.h +41 -0
  1067. cuda/cccl/headers/include/cuda/std/__thread/threading_support_pthread.h +143 -0
  1068. cuda/cccl/headers/include/cuda/std/__thread/threading_support_win32.h +87 -0
  1069. cuda/cccl/headers/include/cuda/std/__tuple_dir/apply.h +82 -0
  1070. cuda/cccl/headers/include/cuda/std/__tuple_dir/get.h +122 -0
  1071. cuda/cccl/headers/include/cuda/std/__tuple_dir/ignore.h +51 -0
  1072. cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +120 -0
  1073. cuda/cccl/headers/include/cuda/std/__tuple_dir/sfinae_helpers.h +100 -0
  1074. cuda/cccl/headers/include/cuda/std/__tuple_dir/structured_bindings.h +212 -0
  1075. cuda/cccl/headers/include/cuda/std/__tuple_dir/tie.h +55 -0
  1076. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple.h +457 -0
  1077. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_cat.h +158 -0
  1078. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_constraints.h +286 -0
  1079. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_element.h +77 -0
  1080. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_indices.h +44 -0
  1081. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_leaf.h +452 -0
  1082. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +83 -0
  1083. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +68 -0
  1084. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_size.h +79 -0
  1085. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_types.h +35 -0
  1086. cuda/cccl/headers/include/cuda/std/__tuple_dir/vector_types.h +290 -0
  1087. cuda/cccl/headers/include/cuda/std/__type_traits/add_const.h +40 -0
  1088. cuda/cccl/headers/include/cuda/std/__type_traits/add_cv.h +40 -0
  1089. cuda/cccl/headers/include/cuda/std/__type_traits/add_lvalue_reference.h +62 -0
  1090. cuda/cccl/headers/include/cuda/std/__type_traits/add_pointer.h +65 -0
  1091. cuda/cccl/headers/include/cuda/std/__type_traits/add_rvalue_reference.h +62 -0
  1092. cuda/cccl/headers/include/cuda/std/__type_traits/add_volatile.h +40 -0
  1093. cuda/cccl/headers/include/cuda/std/__type_traits/aligned_storage.h +155 -0
  1094. cuda/cccl/headers/include/cuda/std/__type_traits/aligned_union.h +62 -0
  1095. cuda/cccl/headers/include/cuda/std/__type_traits/alignment_of.h +41 -0
  1096. cuda/cccl/headers/include/cuda/std/__type_traits/always_false.h +35 -0
  1097. cuda/cccl/headers/include/cuda/std/__type_traits/can_extract_key.h +68 -0
  1098. cuda/cccl/headers/include/cuda/std/__type_traits/common_reference.h +262 -0
  1099. cuda/cccl/headers/include/cuda/std/__type_traits/common_type.h +173 -0
  1100. cuda/cccl/headers/include/cuda/std/__type_traits/conditional.h +65 -0
  1101. cuda/cccl/headers/include/cuda/std/__type_traits/conjunction.h +67 -0
  1102. cuda/cccl/headers/include/cuda/std/__type_traits/copy_cv.h +50 -0
  1103. cuda/cccl/headers/include/cuda/std/__type_traits/copy_cvref.h +148 -0
  1104. cuda/cccl/headers/include/cuda/std/__type_traits/decay.h +83 -0
  1105. cuda/cccl/headers/include/cuda/std/__type_traits/dependent_type.h +35 -0
  1106. cuda/cccl/headers/include/cuda/std/__type_traits/disjunction.h +77 -0
  1107. cuda/cccl/headers/include/cuda/std/__type_traits/enable_if.h +43 -0
  1108. cuda/cccl/headers/include/cuda/std/__type_traits/extent.h +68 -0
  1109. cuda/cccl/headers/include/cuda/std/__type_traits/fold.h +47 -0
  1110. cuda/cccl/headers/include/cuda/std/__type_traits/has_unique_object_representation.h +46 -0
  1111. cuda/cccl/headers/include/cuda/std/__type_traits/has_virtual_destructor.h +42 -0
  1112. cuda/cccl/headers/include/cuda/std/__type_traits/integral_constant.h +62 -0
  1113. cuda/cccl/headers/include/cuda/std/__type_traits/is_abstract.h +42 -0
  1114. cuda/cccl/headers/include/cuda/std/__type_traits/is_aggregate.h +42 -0
  1115. cuda/cccl/headers/include/cuda/std/__type_traits/is_allocator.h +46 -0
  1116. cuda/cccl/headers/include/cuda/std/__type_traits/is_arithmetic.h +42 -0
  1117. cuda/cccl/headers/include/cuda/std/__type_traits/is_array.h +62 -0
  1118. cuda/cccl/headers/include/cuda/std/__type_traits/is_assignable.h +78 -0
  1119. cuda/cccl/headers/include/cuda/std/__type_traits/is_base_of.h +42 -0
  1120. cuda/cccl/headers/include/cuda/std/__type_traits/is_bounded_array.h +44 -0
  1121. cuda/cccl/headers/include/cuda/std/__type_traits/is_callable.h +60 -0
  1122. cuda/cccl/headers/include/cuda/std/__type_traits/is_char_like_type.h +38 -0
  1123. cuda/cccl/headers/include/cuda/std/__type_traits/is_class.h +42 -0
  1124. cuda/cccl/headers/include/cuda/std/__type_traits/is_comparable.h +78 -0
  1125. cuda/cccl/headers/include/cuda/std/__type_traits/is_compound.h +58 -0
  1126. cuda/cccl/headers/include/cuda/std/__type_traits/is_const.h +56 -0
  1127. cuda/cccl/headers/include/cuda/std/__type_traits/is_constant_evaluated.h +49 -0
  1128. cuda/cccl/headers/include/cuda/std/__type_traits/is_constructible.h +174 -0
  1129. cuda/cccl/headers/include/cuda/std/__type_traits/is_convertible.h +211 -0
  1130. cuda/cccl/headers/include/cuda/std/__type_traits/is_copy_assignable.h +43 -0
  1131. cuda/cccl/headers/include/cuda/std/__type_traits/is_copy_constructible.h +43 -0
  1132. cuda/cccl/headers/include/cuda/std/__type_traits/is_core_convertible.h +47 -0
  1133. cuda/cccl/headers/include/cuda/std/__type_traits/is_corresponding_member.h +42 -0
  1134. cuda/cccl/headers/include/cuda/std/__type_traits/is_default_constructible.h +40 -0
  1135. cuda/cccl/headers/include/cuda/std/__type_traits/is_destructible.h +115 -0
  1136. cuda/cccl/headers/include/cuda/std/__type_traits/is_empty.h +42 -0
  1137. cuda/cccl/headers/include/cuda/std/__type_traits/is_enum.h +42 -0
  1138. cuda/cccl/headers/include/cuda/std/__type_traits/is_execution_policy.h +63 -0
  1139. cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_arithmetic.h +38 -0
  1140. cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_floating_point.h +79 -0
  1141. cuda/cccl/headers/include/cuda/std/__type_traits/is_final.h +42 -0
  1142. cuda/cccl/headers/include/cuda/std/__type_traits/is_floating_point.h +53 -0
  1143. cuda/cccl/headers/include/cuda/std/__type_traits/is_fully_bounded_array.h +47 -0
  1144. cuda/cccl/headers/include/cuda/std/__type_traits/is_function.h +61 -0
  1145. cuda/cccl/headers/include/cuda/std/__type_traits/is_fundamental.h +56 -0
  1146. cuda/cccl/headers/include/cuda/std/__type_traits/is_implicitly_default_constructible.h +57 -0
  1147. cuda/cccl/headers/include/cuda/std/__type_traits/is_integer.h +45 -0
  1148. cuda/cccl/headers/include/cuda/std/__type_traits/is_integral.h +123 -0
  1149. cuda/cccl/headers/include/cuda/std/__type_traits/is_layout_compatible.h +45 -0
  1150. cuda/cccl/headers/include/cuda/std/__type_traits/is_literal_type.h +42 -0
  1151. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_function_pointer.h +79 -0
  1152. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_object_pointer.h +57 -0
  1153. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_pointer.h +57 -0
  1154. cuda/cccl/headers/include/cuda/std/__type_traits/is_move_assignable.h +43 -0
  1155. cuda/cccl/headers/include/cuda/std/__type_traits/is_move_constructible.h +42 -0
  1156. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_assignable.h +70 -0
  1157. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_constructible.h +72 -0
  1158. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_convertible.h +59 -0
  1159. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_copy_assignable.h +60 -0
  1160. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_copy_constructible.h +43 -0
  1161. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_default_constructible.h +54 -0
  1162. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_destructible.h +82 -0
  1163. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_move_assignable.h +60 -0
  1164. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_move_constructible.h +42 -0
  1165. cuda/cccl/headers/include/cuda/std/__type_traits/is_null_pointer.h +43 -0
  1166. cuda/cccl/headers/include/cuda/std/__type_traits/is_object.h +57 -0
  1167. cuda/cccl/headers/include/cuda/std/__type_traits/is_one_of.h +37 -0
  1168. cuda/cccl/headers/include/cuda/std/__type_traits/is_pod.h +42 -0
  1169. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer.h +60 -0
  1170. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer_interconvertible_base_of.h +84 -0
  1171. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer_interconvertible_with_class.h +42 -0
  1172. cuda/cccl/headers/include/cuda/std/__type_traits/is_polymorphic.h +42 -0
  1173. cuda/cccl/headers/include/cuda/std/__type_traits/is_primary_template.h +121 -0
  1174. cuda/cccl/headers/include/cuda/std/__type_traits/is_reference.h +95 -0
  1175. cuda/cccl/headers/include/cuda/std/__type_traits/is_referenceable.h +55 -0
  1176. cuda/cccl/headers/include/cuda/std/__type_traits/is_same.h +88 -0
  1177. cuda/cccl/headers/include/cuda/std/__type_traits/is_scalar.h +60 -0
  1178. cuda/cccl/headers/include/cuda/std/__type_traits/is_scoped_enum.h +49 -0
  1179. cuda/cccl/headers/include/cuda/std/__type_traits/is_signed.h +65 -0
  1180. cuda/cccl/headers/include/cuda/std/__type_traits/is_signed_integer.h +59 -0
  1181. cuda/cccl/headers/include/cuda/std/__type_traits/is_standard_layout.h +42 -0
  1182. cuda/cccl/headers/include/cuda/std/__type_traits/is_swappable.h +200 -0
  1183. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivial.h +42 -0
  1184. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_assignable.h +43 -0
  1185. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_constructible.h +43 -0
  1186. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_assignable.h +46 -0
  1187. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_constructible.h +45 -0
  1188. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copyable.h +42 -0
  1189. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_default_constructible.h +42 -0
  1190. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_destructible.h +58 -0
  1191. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_assignable.h +45 -0
  1192. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_constructible.h +44 -0
  1193. cuda/cccl/headers/include/cuda/std/__type_traits/is_unbounded_array.h +43 -0
  1194. cuda/cccl/headers/include/cuda/std/__type_traits/is_union.h +42 -0
  1195. cuda/cccl/headers/include/cuda/std/__type_traits/is_unsigned.h +66 -0
  1196. cuda/cccl/headers/include/cuda/std/__type_traits/is_unsigned_integer.h +59 -0
  1197. cuda/cccl/headers/include/cuda/std/__type_traits/is_valid_expansion.h +41 -0
  1198. cuda/cccl/headers/include/cuda/std/__type_traits/is_void.h +55 -0
  1199. cuda/cccl/headers/include/cuda/std/__type_traits/is_volatile.h +56 -0
  1200. cuda/cccl/headers/include/cuda/std/__type_traits/lazy.h +35 -0
  1201. cuda/cccl/headers/include/cuda/std/__type_traits/make_const_lvalue_ref.h +36 -0
  1202. cuda/cccl/headers/include/cuda/std/__type_traits/make_nbit_int.h +107 -0
  1203. cuda/cccl/headers/include/cuda/std/__type_traits/make_signed.h +140 -0
  1204. cuda/cccl/headers/include/cuda/std/__type_traits/make_unsigned.h +151 -0
  1205. cuda/cccl/headers/include/cuda/std/__type_traits/maybe_const.h +36 -0
  1206. cuda/cccl/headers/include/cuda/std/__type_traits/nat.h +39 -0
  1207. cuda/cccl/headers/include/cuda/std/__type_traits/negation.h +44 -0
  1208. cuda/cccl/headers/include/cuda/std/__type_traits/num_bits.h +122 -0
  1209. cuda/cccl/headers/include/cuda/std/__type_traits/promote.h +163 -0
  1210. cuda/cccl/headers/include/cuda/std/__type_traits/rank.h +60 -0
  1211. cuda/cccl/headers/include/cuda/std/__type_traits/reference_constructs_from_temporary.h +57 -0
  1212. cuda/cccl/headers/include/cuda/std/__type_traits/reference_converts_from_temporary.h +56 -0
  1213. cuda/cccl/headers/include/cuda/std/__type_traits/remove_all_extents.h +66 -0
  1214. cuda/cccl/headers/include/cuda/std/__type_traits/remove_const.h +59 -0
  1215. cuda/cccl/headers/include/cuda/std/__type_traits/remove_const_ref.h +37 -0
  1216. cuda/cccl/headers/include/cuda/std/__type_traits/remove_cv.h +57 -0
  1217. cuda/cccl/headers/include/cuda/std/__type_traits/remove_cvref.h +57 -0
  1218. cuda/cccl/headers/include/cuda/std/__type_traits/remove_extent.h +65 -0
  1219. cuda/cccl/headers/include/cuda/std/__type_traits/remove_pointer.h +73 -0
  1220. cuda/cccl/headers/include/cuda/std/__type_traits/remove_reference.h +72 -0
  1221. cuda/cccl/headers/include/cuda/std/__type_traits/remove_volatile.h +58 -0
  1222. cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +47 -0
  1223. cuda/cccl/headers/include/cuda/std/__type_traits/type_identity.h +40 -0
  1224. cuda/cccl/headers/include/cuda/std/__type_traits/type_list.h +1067 -0
  1225. cuda/cccl/headers/include/cuda/std/__type_traits/type_set.h +131 -0
  1226. cuda/cccl/headers/include/cuda/std/__type_traits/underlying_type.h +52 -0
  1227. cuda/cccl/headers/include/cuda/std/__type_traits/void_t.h +34 -0
  1228. cuda/cccl/headers/include/cuda/std/__utility/as_const.h +73 -0
  1229. cuda/cccl/headers/include/cuda/std/__utility/auto_cast.h +34 -0
  1230. cuda/cccl/headers/include/cuda/std/__utility/cmp.h +114 -0
  1231. cuda/cccl/headers/include/cuda/std/__utility/convert_to_integral.h +101 -0
  1232. cuda/cccl/headers/include/cuda/std/__utility/ctad_support.h +27 -0
  1233. cuda/cccl/headers/include/cuda/std/__utility/declval.h +76 -0
  1234. cuda/cccl/headers/include/cuda/std/__utility/delegate_constructors.h +51 -0
  1235. cuda/cccl/headers/include/cuda/std/__utility/exception_guard.h +162 -0
  1236. cuda/cccl/headers/include/cuda/std/__utility/exchange.h +46 -0
  1237. cuda/cccl/headers/include/cuda/std/__utility/forward.h +82 -0
  1238. cuda/cccl/headers/include/cuda/std/__utility/forward_like.h +82 -0
  1239. cuda/cccl/headers/include/cuda/std/__utility/in_place.h +86 -0
  1240. cuda/cccl/headers/include/cuda/std/__utility/integer_sequence.h +249 -0
  1241. cuda/cccl/headers/include/cuda/std/__utility/monostate.h +99 -0
  1242. cuda/cccl/headers/include/cuda/std/__utility/move.h +126 -0
  1243. cuda/cccl/headers/include/cuda/std/__utility/pair.h +791 -0
  1244. cuda/cccl/headers/include/cuda/std/__utility/piecewise_construct.h +37 -0
  1245. cuda/cccl/headers/include/cuda/std/__utility/pod_tuple.h +425 -0
  1246. cuda/cccl/headers/include/cuda/std/__utility/priority_tag.h +40 -0
  1247. cuda/cccl/headers/include/cuda/std/__utility/rel_ops.h +61 -0
  1248. cuda/cccl/headers/include/cuda/std/__utility/swap.h +64 -0
  1249. cuda/cccl/headers/include/cuda/std/__utility/to_underlying.h +40 -0
  1250. cuda/cccl/headers/include/cuda/std/__utility/typeid.h +421 -0
  1251. cuda/cccl/headers/include/cuda/std/__utility/undefined.h +34 -0
  1252. cuda/cccl/headers/include/cuda/std/__utility/unreachable.h +37 -0
  1253. cuda/cccl/headers/include/cuda/std/__variant/bad_variant_access.h +74 -0
  1254. cuda/cccl/headers/include/cuda/std/__variant/comparison.h +207 -0
  1255. cuda/cccl/headers/include/cuda/std/__variant/get.h +192 -0
  1256. cuda/cccl/headers/include/cuda/std/__variant/hash.h +82 -0
  1257. cuda/cccl/headers/include/cuda/std/__variant/sfinae_helpers.h +89 -0
  1258. cuda/cccl/headers/include/cuda/std/__variant/variant.h +250 -0
  1259. cuda/cccl/headers/include/cuda/std/__variant/variant_access.h +70 -0
  1260. cuda/cccl/headers/include/cuda/std/__variant/variant_base.h +683 -0
  1261. cuda/cccl/headers/include/cuda/std/__variant/variant_constraints.h +135 -0
  1262. cuda/cccl/headers/include/cuda/std/__variant/variant_match.h +126 -0
  1263. cuda/cccl/headers/include/cuda/std/__variant/variant_traits.h +184 -0
  1264. cuda/cccl/headers/include/cuda/std/__variant/variant_visit.h +225 -0
  1265. cuda/cccl/headers/include/cuda/std/__variant/visit.h +148 -0
  1266. cuda/cccl/headers/include/cuda/std/algorithm +138 -0
  1267. cuda/cccl/headers/include/cuda/std/array +519 -0
  1268. cuda/cccl/headers/include/cuda/std/atomic +810 -0
  1269. cuda/cccl/headers/include/cuda/std/barrier +42 -0
  1270. cuda/cccl/headers/include/cuda/std/bit +35 -0
  1271. cuda/cccl/headers/include/cuda/std/bitset +986 -0
  1272. cuda/cccl/headers/include/cuda/std/cassert +28 -0
  1273. cuda/cccl/headers/include/cuda/std/ccomplex +15 -0
  1274. cuda/cccl/headers/include/cuda/std/cfloat +59 -0
  1275. cuda/cccl/headers/include/cuda/std/charconv +31 -0
  1276. cuda/cccl/headers/include/cuda/std/chrono +26 -0
  1277. cuda/cccl/headers/include/cuda/std/climits +61 -0
  1278. cuda/cccl/headers/include/cuda/std/cmath +87 -0
  1279. cuda/cccl/headers/include/cuda/std/complex +50 -0
  1280. cuda/cccl/headers/include/cuda/std/concepts +48 -0
  1281. cuda/cccl/headers/include/cuda/std/cstddef +28 -0
  1282. cuda/cccl/headers/include/cuda/std/cstdint +178 -0
  1283. cuda/cccl/headers/include/cuda/std/cstdlib +31 -0
  1284. cuda/cccl/headers/include/cuda/std/cstring +110 -0
  1285. cuda/cccl/headers/include/cuda/std/ctime +155 -0
  1286. cuda/cccl/headers/include/cuda/std/detail/__config +22 -0
  1287. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/chrono +2506 -0
  1288. cuda/cccl/headers/include/cuda/std/execution +29 -0
  1289. cuda/cccl/headers/include/cuda/std/expected +30 -0
  1290. cuda/cccl/headers/include/cuda/std/functional +56 -0
  1291. cuda/cccl/headers/include/cuda/std/initializer_list +44 -0
  1292. cuda/cccl/headers/include/cuda/std/inplace_vector +2171 -0
  1293. cuda/cccl/headers/include/cuda/std/iterator +70 -0
  1294. cuda/cccl/headers/include/cuda/std/latch +34 -0
  1295. cuda/cccl/headers/include/cuda/std/limits +28 -0
  1296. cuda/cccl/headers/include/cuda/std/linalg +30 -0
  1297. cuda/cccl/headers/include/cuda/std/mdspan +38 -0
  1298. cuda/cccl/headers/include/cuda/std/memory +40 -0
  1299. cuda/cccl/headers/include/cuda/std/numbers +344 -0
  1300. cuda/cccl/headers/include/cuda/std/numeric +41 -0
  1301. cuda/cccl/headers/include/cuda/std/optional +31 -0
  1302. cuda/cccl/headers/include/cuda/std/ranges +70 -0
  1303. cuda/cccl/headers/include/cuda/std/ratio +416 -0
  1304. cuda/cccl/headers/include/cuda/std/semaphore +31 -0
  1305. cuda/cccl/headers/include/cuda/std/source_location +107 -0
  1306. cuda/cccl/headers/include/cuda/std/span +599 -0
  1307. cuda/cccl/headers/include/cuda/std/string_view +924 -0
  1308. cuda/cccl/headers/include/cuda/std/tuple +43 -0
  1309. cuda/cccl/headers/include/cuda/std/type_traits +176 -0
  1310. cuda/cccl/headers/include/cuda/std/utility +70 -0
  1311. cuda/cccl/headers/include/cuda/std/variant +32 -0
  1312. cuda/cccl/headers/include/cuda/std/version +240 -0
  1313. cuda/cccl/headers/include/cuda/stream +32 -0
  1314. cuda/cccl/headers/include/cuda/stream_ref +59 -0
  1315. cuda/cccl/headers/include/cuda/tma +25 -0
  1316. cuda/cccl/headers/include/cuda/type_traits +27 -0
  1317. cuda/cccl/headers/include/cuda/utility +28 -0
  1318. cuda/cccl/headers/include/cuda/version +16 -0
  1319. cuda/cccl/headers/include/cuda/warp +28 -0
  1320. cuda/cccl/headers/include/cuda/work_stealing +26 -0
  1321. cuda/cccl/headers/include/nv/detail/__preprocessor +169 -0
  1322. cuda/cccl/headers/include/nv/detail/__target_macros +739 -0
  1323. cuda/cccl/headers/include/nv/target +241 -0
  1324. cuda/cccl/headers/include/thrust/addressof.h +22 -0
  1325. cuda/cccl/headers/include/thrust/adjacent_difference.h +254 -0
  1326. cuda/cccl/headers/include/thrust/advance.h +60 -0
  1327. cuda/cccl/headers/include/thrust/allocate_unique.h +301 -0
  1328. cuda/cccl/headers/include/thrust/binary_search.h +1911 -0
  1329. cuda/cccl/headers/include/thrust/complex.h +859 -0
  1330. cuda/cccl/headers/include/thrust/copy.h +506 -0
  1331. cuda/cccl/headers/include/thrust/count.h +245 -0
  1332. cuda/cccl/headers/include/thrust/detail/adjacent_difference.inl +108 -0
  1333. cuda/cccl/headers/include/thrust/detail/alignment.h +81 -0
  1334. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +629 -0
  1335. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +193 -0
  1336. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +95 -0
  1337. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +79 -0
  1338. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +76 -0
  1339. cuda/cccl/headers/include/thrust/detail/allocator/no_throw_allocator.h +74 -0
  1340. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +113 -0
  1341. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +114 -0
  1342. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.h +77 -0
  1343. cuda/cccl/headers/include/thrust/detail/allocator_aware_execution_policy.h +96 -0
  1344. cuda/cccl/headers/include/thrust/detail/binary_search.inl +537 -0
  1345. cuda/cccl/headers/include/thrust/detail/caching_allocator.h +47 -0
  1346. cuda/cccl/headers/include/thrust/detail/complex/arithmetic.h +210 -0
  1347. cuda/cccl/headers/include/thrust/detail/complex/catrig.h +877 -0
  1348. cuda/cccl/headers/include/thrust/detail/complex/catrigf.h +591 -0
  1349. cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +234 -0
  1350. cuda/cccl/headers/include/thrust/detail/complex/ccoshf.h +162 -0
  1351. cuda/cccl/headers/include/thrust/detail/complex/cexp.h +194 -0
  1352. cuda/cccl/headers/include/thrust/detail/complex/cexpf.h +170 -0
  1353. cuda/cccl/headers/include/thrust/detail/complex/clog.h +222 -0
  1354. cuda/cccl/headers/include/thrust/detail/complex/clogf.h +209 -0
  1355. cuda/cccl/headers/include/thrust/detail/complex/complex.inl +263 -0
  1356. cuda/cccl/headers/include/thrust/detail/complex/cpow.h +53 -0
  1357. cuda/cccl/headers/include/thrust/detail/complex/cproj.h +75 -0
  1358. cuda/cccl/headers/include/thrust/detail/complex/csinh.h +228 -0
  1359. cuda/cccl/headers/include/thrust/detail/complex/csinhf.h +169 -0
  1360. cuda/cccl/headers/include/thrust/detail/complex/csqrt.h +178 -0
  1361. cuda/cccl/headers/include/thrust/detail/complex/csqrtf.h +174 -0
  1362. cuda/cccl/headers/include/thrust/detail/complex/ctanh.h +212 -0
  1363. cuda/cccl/headers/include/thrust/detail/complex/ctanhf.h +136 -0
  1364. cuda/cccl/headers/include/thrust/detail/complex/math_private.h +130 -0
  1365. cuda/cccl/headers/include/thrust/detail/complex/stream.h +73 -0
  1366. cuda/cccl/headers/include/thrust/detail/config/compiler.h +38 -0
  1367. cuda/cccl/headers/include/thrust/detail/config/config.h +43 -0
  1368. cuda/cccl/headers/include/thrust/detail/config/cpp_dialect.h +78 -0
  1369. cuda/cccl/headers/include/thrust/detail/config/device_system.h +57 -0
  1370. cuda/cccl/headers/include/thrust/detail/config/host_system.h +50 -0
  1371. cuda/cccl/headers/include/thrust/detail/config/memory_resource.h +41 -0
  1372. cuda/cccl/headers/include/thrust/detail/config/namespace.h +164 -0
  1373. cuda/cccl/headers/include/thrust/detail/config/simple_defines.h +48 -0
  1374. cuda/cccl/headers/include/thrust/detail/config.h +36 -0
  1375. cuda/cccl/headers/include/thrust/detail/contiguous_storage.h +227 -0
  1376. cuda/cccl/headers/include/thrust/detail/contiguous_storage.inl +272 -0
  1377. cuda/cccl/headers/include/thrust/detail/copy.h +70 -0
  1378. cuda/cccl/headers/include/thrust/detail/copy.inl +146 -0
  1379. cuda/cccl/headers/include/thrust/detail/copy_if.h +62 -0
  1380. cuda/cccl/headers/include/thrust/detail/copy_if.inl +114 -0
  1381. cuda/cccl/headers/include/thrust/detail/count.h +55 -0
  1382. cuda/cccl/headers/include/thrust/detail/count.inl +101 -0
  1383. cuda/cccl/headers/include/thrust/detail/device_ptr.inl +48 -0
  1384. cuda/cccl/headers/include/thrust/detail/equal.inl +105 -0
  1385. cuda/cccl/headers/include/thrust/detail/event_error.h +160 -0
  1386. cuda/cccl/headers/include/thrust/detail/execute_with_allocator.h +79 -0
  1387. cuda/cccl/headers/include/thrust/detail/execute_with_allocator_fwd.h +61 -0
  1388. cuda/cccl/headers/include/thrust/detail/execution_policy.h +120 -0
  1389. cuda/cccl/headers/include/thrust/detail/extrema.inl +196 -0
  1390. cuda/cccl/headers/include/thrust/detail/fill.inl +97 -0
  1391. cuda/cccl/headers/include/thrust/detail/find.inl +125 -0
  1392. cuda/cccl/headers/include/thrust/detail/for_each.inl +96 -0
  1393. cuda/cccl/headers/include/thrust/detail/function.h +49 -0
  1394. cuda/cccl/headers/include/thrust/detail/functional/actor.h +213 -0
  1395. cuda/cccl/headers/include/thrust/detail/functional/operators.h +384 -0
  1396. cuda/cccl/headers/include/thrust/detail/gather.inl +185 -0
  1397. cuda/cccl/headers/include/thrust/detail/generate.inl +97 -0
  1398. cuda/cccl/headers/include/thrust/detail/get_iterator_value.h +60 -0
  1399. cuda/cccl/headers/include/thrust/detail/inner_product.inl +130 -0
  1400. cuda/cccl/headers/include/thrust/detail/internal_functional.h +335 -0
  1401. cuda/cccl/headers/include/thrust/detail/logical.inl +125 -0
  1402. cuda/cccl/headers/include/thrust/detail/malloc_and_free.h +89 -0
  1403. cuda/cccl/headers/include/thrust/detail/malloc_and_free_fwd.h +45 -0
  1404. cuda/cccl/headers/include/thrust/detail/memory_algorithms.h +208 -0
  1405. cuda/cccl/headers/include/thrust/detail/merge.inl +288 -0
  1406. cuda/cccl/headers/include/thrust/detail/mismatch.inl +107 -0
  1407. cuda/cccl/headers/include/thrust/detail/nvtx_policy.h +41 -0
  1408. cuda/cccl/headers/include/thrust/detail/overlapped_copy.h +120 -0
  1409. cuda/cccl/headers/include/thrust/detail/partition.inl +390 -0
  1410. cuda/cccl/headers/include/thrust/detail/pointer.h +313 -0
  1411. cuda/cccl/headers/include/thrust/detail/preprocessor.h +652 -0
  1412. cuda/cccl/headers/include/thrust/detail/random_bijection.h +177 -0
  1413. cuda/cccl/headers/include/thrust/detail/range/head_flags.h +114 -0
  1414. cuda/cccl/headers/include/thrust/detail/range/tail_flags.h +128 -0
  1415. cuda/cccl/headers/include/thrust/detail/raw_pointer_cast.h +52 -0
  1416. cuda/cccl/headers/include/thrust/detail/raw_reference_cast.h +191 -0
  1417. cuda/cccl/headers/include/thrust/detail/reduce.inl +396 -0
  1418. cuda/cccl/headers/include/thrust/detail/reference.h +521 -0
  1419. cuda/cccl/headers/include/thrust/detail/reference_forward_declaration.h +35 -0
  1420. cuda/cccl/headers/include/thrust/detail/remove.inl +225 -0
  1421. cuda/cccl/headers/include/thrust/detail/replace.inl +243 -0
  1422. cuda/cccl/headers/include/thrust/detail/reverse.inl +100 -0
  1423. cuda/cccl/headers/include/thrust/detail/scan.inl +536 -0
  1424. cuda/cccl/headers/include/thrust/detail/scatter.inl +169 -0
  1425. cuda/cccl/headers/include/thrust/detail/seq.h +66 -0
  1426. cuda/cccl/headers/include/thrust/detail/sequence.inl +121 -0
  1427. cuda/cccl/headers/include/thrust/detail/set_operations.inl +993 -0
  1428. cuda/cccl/headers/include/thrust/detail/shuffle.inl +86 -0
  1429. cuda/cccl/headers/include/thrust/detail/sort.inl +385 -0
  1430. cuda/cccl/headers/include/thrust/detail/static_assert.h +56 -0
  1431. cuda/cccl/headers/include/thrust/detail/static_map.h +164 -0
  1432. cuda/cccl/headers/include/thrust/detail/swap_ranges.inl +77 -0
  1433. cuda/cccl/headers/include/thrust/detail/tabulate.inl +74 -0
  1434. cuda/cccl/headers/include/thrust/detail/temporary_array.h +150 -0
  1435. cuda/cccl/headers/include/thrust/detail/temporary_array.inl +121 -0
  1436. cuda/cccl/headers/include/thrust/detail/temporary_buffer.h +93 -0
  1437. cuda/cccl/headers/include/thrust/detail/transform_reduce.inl +81 -0
  1438. cuda/cccl/headers/include/thrust/detail/transform_scan.inl +173 -0
  1439. cuda/cccl/headers/include/thrust/detail/trivial_sequence.h +128 -0
  1440. cuda/cccl/headers/include/thrust/detail/tuple_meta_transform.h +59 -0
  1441. cuda/cccl/headers/include/thrust/detail/type_deduction.h +61 -0
  1442. cuda/cccl/headers/include/thrust/detail/type_traits/has_member_function.h +50 -0
  1443. cuda/cccl/headers/include/thrust/detail/type_traits/has_nested_type.h +43 -0
  1444. cuda/cccl/headers/include/thrust/detail/type_traits/is_call_possible.h +162 -0
  1445. cuda/cccl/headers/include/thrust/detail/type_traits/is_commutative.h +67 -0
  1446. cuda/cccl/headers/include/thrust/detail/type_traits/is_metafunction_defined.h +39 -0
  1447. cuda/cccl/headers/include/thrust/detail/type_traits/is_thrust_pointer.h +55 -0
  1448. cuda/cccl/headers/include/thrust/detail/type_traits/iterator/is_output_iterator.h +48 -0
  1449. cuda/cccl/headers/include/thrust/detail/type_traits/minimum_type.h +91 -0
  1450. cuda/cccl/headers/include/thrust/detail/type_traits/pointer_traits.h +332 -0
  1451. cuda/cccl/headers/include/thrust/detail/type_traits.h +143 -0
  1452. cuda/cccl/headers/include/thrust/detail/uninitialized_copy.inl +102 -0
  1453. cuda/cccl/headers/include/thrust/detail/uninitialized_fill.inl +97 -0
  1454. cuda/cccl/headers/include/thrust/detail/unique.inl +391 -0
  1455. cuda/cccl/headers/include/thrust/detail/use_default.h +34 -0
  1456. cuda/cccl/headers/include/thrust/detail/vector_base.h +613 -0
  1457. cuda/cccl/headers/include/thrust/detail/vector_base.inl +1216 -0
  1458. cuda/cccl/headers/include/thrust/device_allocator.h +134 -0
  1459. cuda/cccl/headers/include/thrust/device_delete.h +74 -0
  1460. cuda/cccl/headers/include/thrust/device_free.h +85 -0
  1461. cuda/cccl/headers/include/thrust/device_make_unique.h +56 -0
  1462. cuda/cccl/headers/include/thrust/device_malloc.h +84 -0
  1463. cuda/cccl/headers/include/thrust/device_malloc_allocator.h +190 -0
  1464. cuda/cccl/headers/include/thrust/device_new.h +112 -0
  1465. cuda/cccl/headers/include/thrust/device_new_allocator.h +179 -0
  1466. cuda/cccl/headers/include/thrust/device_ptr.h +196 -0
  1467. cuda/cccl/headers/include/thrust/device_reference.h +983 -0
  1468. cuda/cccl/headers/include/thrust/device_vector.h +576 -0
  1469. cuda/cccl/headers/include/thrust/distance.h +44 -0
  1470. cuda/cccl/headers/include/thrust/equal.h +247 -0
  1471. cuda/cccl/headers/include/thrust/execution_policy.h +252 -0
  1472. cuda/cccl/headers/include/thrust/extrema.h +657 -0
  1473. cuda/cccl/headers/include/thrust/fill.h +200 -0
  1474. cuda/cccl/headers/include/thrust/find.h +382 -0
  1475. cuda/cccl/headers/include/thrust/for_each.h +261 -0
  1476. cuda/cccl/headers/include/thrust/functional.h +399 -0
  1477. cuda/cccl/headers/include/thrust/gather.h +464 -0
  1478. cuda/cccl/headers/include/thrust/generate.h +193 -0
  1479. cuda/cccl/headers/include/thrust/host_vector.h +576 -0
  1480. cuda/cccl/headers/include/thrust/inner_product.h +264 -0
  1481. cuda/cccl/headers/include/thrust/iterator/constant_iterator.h +221 -0
  1482. cuda/cccl/headers/include/thrust/iterator/counting_iterator.h +338 -0
  1483. cuda/cccl/headers/include/thrust/iterator/detail/any_assign.h +48 -0
  1484. cuda/cccl/headers/include/thrust/iterator/detail/any_system_tag.h +43 -0
  1485. cuda/cccl/headers/include/thrust/iterator/detail/device_system_tag.h +43 -0
  1486. cuda/cccl/headers/include/thrust/iterator/detail/host_system_tag.h +42 -0
  1487. cuda/cccl/headers/include/thrust/iterator/detail/iterator_adaptor_base.h +83 -0
  1488. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_system.h +60 -0
  1489. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_traversal.h +65 -0
  1490. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h +56 -0
  1491. cuda/cccl/headers/include/thrust/iterator/detail/iterator_facade_category.h +184 -0
  1492. cuda/cccl/headers/include/thrust/iterator/detail/minimum_system.h +55 -0
  1493. cuda/cccl/headers/include/thrust/iterator/detail/normal_iterator.h +69 -0
  1494. cuda/cccl/headers/include/thrust/iterator/detail/retag.h +104 -0
  1495. cuda/cccl/headers/include/thrust/iterator/detail/tagged_iterator.h +80 -0
  1496. cuda/cccl/headers/include/thrust/iterator/detail/tuple_of_iterator_references.h +171 -0
  1497. cuda/cccl/headers/include/thrust/iterator/discard_iterator.h +163 -0
  1498. cuda/cccl/headers/include/thrust/iterator/iterator_adaptor.h +251 -0
  1499. cuda/cccl/headers/include/thrust/iterator/iterator_categories.h +211 -0
  1500. cuda/cccl/headers/include/thrust/iterator/iterator_facade.h +659 -0
  1501. cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +334 -0
  1502. cuda/cccl/headers/include/thrust/iterator/iterator_traversal_tags.h +64 -0
  1503. cuda/cccl/headers/include/thrust/iterator/offset_iterator.h +194 -0
  1504. cuda/cccl/headers/include/thrust/iterator/permutation_iterator.h +204 -0
  1505. cuda/cccl/headers/include/thrust/iterator/retag.h +72 -0
  1506. cuda/cccl/headers/include/thrust/iterator/reverse_iterator.h +51 -0
  1507. cuda/cccl/headers/include/thrust/iterator/shuffle_iterator.h +190 -0
  1508. cuda/cccl/headers/include/thrust/iterator/strided_iterator.h +152 -0
  1509. cuda/cccl/headers/include/thrust/iterator/tabulate_output_iterator.h +152 -0
  1510. cuda/cccl/headers/include/thrust/iterator/transform_input_output_iterator.h +225 -0
  1511. cuda/cccl/headers/include/thrust/iterator/transform_iterator.h +353 -0
  1512. cuda/cccl/headers/include/thrust/iterator/transform_output_iterator.h +190 -0
  1513. cuda/cccl/headers/include/thrust/iterator/zip_iterator.h +365 -0
  1514. cuda/cccl/headers/include/thrust/logical.h +290 -0
  1515. cuda/cccl/headers/include/thrust/memory.h +299 -0
  1516. cuda/cccl/headers/include/thrust/merge.h +726 -0
  1517. cuda/cccl/headers/include/thrust/mismatch.h +262 -0
  1518. cuda/cccl/headers/include/thrust/mr/allocator.h +227 -0
  1519. cuda/cccl/headers/include/thrust/mr/device_memory_resource.h +46 -0
  1520. cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +526 -0
  1521. cuda/cccl/headers/include/thrust/mr/disjoint_sync_pool.h +116 -0
  1522. cuda/cccl/headers/include/thrust/mr/disjoint_tls_pool.h +65 -0
  1523. cuda/cccl/headers/include/thrust/mr/fancy_pointer_resource.h +65 -0
  1524. cuda/cccl/headers/include/thrust/mr/host_memory_resource.h +42 -0
  1525. cuda/cccl/headers/include/thrust/mr/memory_resource.h +215 -0
  1526. cuda/cccl/headers/include/thrust/mr/new.h +98 -0
  1527. cuda/cccl/headers/include/thrust/mr/polymorphic_adaptor.h +61 -0
  1528. cuda/cccl/headers/include/thrust/mr/pool.h +526 -0
  1529. cuda/cccl/headers/include/thrust/mr/pool_options.h +172 -0
  1530. cuda/cccl/headers/include/thrust/mr/sync_pool.h +112 -0
  1531. cuda/cccl/headers/include/thrust/mr/tls_pool.h +62 -0
  1532. cuda/cccl/headers/include/thrust/mr/universal_memory_resource.h +29 -0
  1533. cuda/cccl/headers/include/thrust/mr/validator.h +54 -0
  1534. cuda/cccl/headers/include/thrust/pair.h +102 -0
  1535. cuda/cccl/headers/include/thrust/partition.h +1392 -0
  1536. cuda/cccl/headers/include/thrust/per_device_resource.h +110 -0
  1537. cuda/cccl/headers/include/thrust/random/detail/discard_block_engine.inl +182 -0
  1538. cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine.inl +153 -0
  1539. cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine_discard.h +97 -0
  1540. cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine.inl +149 -0
  1541. cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h +46 -0
  1542. cuda/cccl/headers/include/thrust/random/detail/mod.h +94 -0
  1543. cuda/cccl/headers/include/thrust/random/detail/normal_distribution.inl +185 -0
  1544. cuda/cccl/headers/include/thrust/random/detail/normal_distribution_base.h +157 -0
  1545. cuda/cccl/headers/include/thrust/random/detail/random_core_access.h +56 -0
  1546. cuda/cccl/headers/include/thrust/random/detail/subtract_with_carry_engine.inl +199 -0
  1547. cuda/cccl/headers/include/thrust/random/detail/uniform_int_distribution.inl +196 -0
  1548. cuda/cccl/headers/include/thrust/random/detail/uniform_real_distribution.inl +198 -0
  1549. cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine.inl +181 -0
  1550. cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine_max.h +186 -0
  1551. cuda/cccl/headers/include/thrust/random/discard_block_engine.h +238 -0
  1552. cuda/cccl/headers/include/thrust/random/linear_congruential_engine.h +287 -0
  1553. cuda/cccl/headers/include/thrust/random/linear_feedback_shift_engine.h +215 -0
  1554. cuda/cccl/headers/include/thrust/random/normal_distribution.h +256 -0
  1555. cuda/cccl/headers/include/thrust/random/subtract_with_carry_engine.h +245 -0
  1556. cuda/cccl/headers/include/thrust/random/uniform_int_distribution.h +259 -0
  1557. cuda/cccl/headers/include/thrust/random/uniform_real_distribution.h +257 -0
  1558. cuda/cccl/headers/include/thrust/random/xor_combine_engine.h +253 -0
  1559. cuda/cccl/headers/include/thrust/random.h +118 -0
  1560. cuda/cccl/headers/include/thrust/reduce.h +1114 -0
  1561. cuda/cccl/headers/include/thrust/remove.h +768 -0
  1562. cuda/cccl/headers/include/thrust/replace.h +826 -0
  1563. cuda/cccl/headers/include/thrust/reverse.h +215 -0
  1564. cuda/cccl/headers/include/thrust/scan.h +1671 -0
  1565. cuda/cccl/headers/include/thrust/scatter.h +446 -0
  1566. cuda/cccl/headers/include/thrust/sequence.h +277 -0
  1567. cuda/cccl/headers/include/thrust/set_operations.h +3027 -0
  1568. cuda/cccl/headers/include/thrust/shuffle.h +182 -0
  1569. cuda/cccl/headers/include/thrust/sort.h +1320 -0
  1570. cuda/cccl/headers/include/thrust/swap.h +147 -0
  1571. cuda/cccl/headers/include/thrust/system/cpp/detail/adjacent_difference.h +30 -0
  1572. cuda/cccl/headers/include/thrust/system/cpp/detail/assign_value.h +30 -0
  1573. cuda/cccl/headers/include/thrust/system/cpp/detail/binary_search.h +32 -0
  1574. cuda/cccl/headers/include/thrust/system/cpp/detail/copy.h +30 -0
  1575. cuda/cccl/headers/include/thrust/system/cpp/detail/copy_if.h +30 -0
  1576. cuda/cccl/headers/include/thrust/system/cpp/detail/count.h +29 -0
  1577. cuda/cccl/headers/include/thrust/system/cpp/detail/equal.h +29 -0
  1578. cuda/cccl/headers/include/thrust/system/cpp/detail/execution_policy.h +113 -0
  1579. cuda/cccl/headers/include/thrust/system/cpp/detail/extrema.h +30 -0
  1580. cuda/cccl/headers/include/thrust/system/cpp/detail/fill.h +29 -0
  1581. cuda/cccl/headers/include/thrust/system/cpp/detail/find.h +30 -0
  1582. cuda/cccl/headers/include/thrust/system/cpp/detail/for_each.h +30 -0
  1583. cuda/cccl/headers/include/thrust/system/cpp/detail/gather.h +29 -0
  1584. cuda/cccl/headers/include/thrust/system/cpp/detail/generate.h +29 -0
  1585. cuda/cccl/headers/include/thrust/system/cpp/detail/get_value.h +30 -0
  1586. cuda/cccl/headers/include/thrust/system/cpp/detail/inner_product.h +29 -0
  1587. cuda/cccl/headers/include/thrust/system/cpp/detail/iter_swap.h +30 -0
  1588. cuda/cccl/headers/include/thrust/system/cpp/detail/logical.h +29 -0
  1589. cuda/cccl/headers/include/thrust/system/cpp/detail/malloc_and_free.h +30 -0
  1590. cuda/cccl/headers/include/thrust/system/cpp/detail/memory.inl +55 -0
  1591. cuda/cccl/headers/include/thrust/system/cpp/detail/merge.h +30 -0
  1592. cuda/cccl/headers/include/thrust/system/cpp/detail/mismatch.h +29 -0
  1593. cuda/cccl/headers/include/thrust/system/cpp/detail/partition.h +30 -0
  1594. cuda/cccl/headers/include/thrust/system/cpp/detail/per_device_resource.h +29 -0
  1595. cuda/cccl/headers/include/thrust/system/cpp/detail/reduce.h +30 -0
  1596. cuda/cccl/headers/include/thrust/system/cpp/detail/reduce_by_key.h +30 -0
  1597. cuda/cccl/headers/include/thrust/system/cpp/detail/remove.h +30 -0
  1598. cuda/cccl/headers/include/thrust/system/cpp/detail/replace.h +29 -0
  1599. cuda/cccl/headers/include/thrust/system/cpp/detail/reverse.h +29 -0
  1600. cuda/cccl/headers/include/thrust/system/cpp/detail/scan.h +30 -0
  1601. cuda/cccl/headers/include/thrust/system/cpp/detail/scan_by_key.h +30 -0
  1602. cuda/cccl/headers/include/thrust/system/cpp/detail/scatter.h +29 -0
  1603. cuda/cccl/headers/include/thrust/system/cpp/detail/sequence.h +29 -0
  1604. cuda/cccl/headers/include/thrust/system/cpp/detail/set_operations.h +30 -0
  1605. cuda/cccl/headers/include/thrust/system/cpp/detail/sort.h +30 -0
  1606. cuda/cccl/headers/include/thrust/system/cpp/detail/swap_ranges.h +29 -0
  1607. cuda/cccl/headers/include/thrust/system/cpp/detail/tabulate.h +29 -0
  1608. cuda/cccl/headers/include/thrust/system/cpp/detail/temporary_buffer.h +29 -0
  1609. cuda/cccl/headers/include/thrust/system/cpp/detail/transform.h +29 -0
  1610. cuda/cccl/headers/include/thrust/system/cpp/detail/transform_reduce.h +29 -0
  1611. cuda/cccl/headers/include/thrust/system/cpp/detail/transform_scan.h +29 -0
  1612. cuda/cccl/headers/include/thrust/system/cpp/detail/uninitialized_copy.h +29 -0
  1613. cuda/cccl/headers/include/thrust/system/cpp/detail/uninitialized_fill.h +29 -0
  1614. cuda/cccl/headers/include/thrust/system/cpp/detail/unique.h +30 -0
  1615. cuda/cccl/headers/include/thrust/system/cpp/detail/unique_by_key.h +30 -0
  1616. cuda/cccl/headers/include/thrust/system/cpp/execution_policy.h +63 -0
  1617. cuda/cccl/headers/include/thrust/system/cpp/memory.h +105 -0
  1618. cuda/cccl/headers/include/thrust/system/cpp/memory_resource.h +70 -0
  1619. cuda/cccl/headers/include/thrust/system/cpp/pointer.h +118 -0
  1620. cuda/cccl/headers/include/thrust/system/cpp/vector.h +95 -0
  1621. cuda/cccl/headers/include/thrust/system/cuda/config.h +126 -0
  1622. cuda/cccl/headers/include/thrust/system/cuda/detail/adjacent_difference.h +218 -0
  1623. cuda/cccl/headers/include/thrust/system/cuda/detail/assign_value.h +124 -0
  1624. cuda/cccl/headers/include/thrust/system/cuda/detail/binary_search.h +29 -0
  1625. cuda/cccl/headers/include/thrust/system/cuda/detail/cdp_dispatch.h +72 -0
  1626. cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +272 -0
  1627. cuda/cccl/headers/include/thrust/system/cuda/detail/copy_if.h +251 -0
  1628. cuda/cccl/headers/include/thrust/system/cuda/detail/core/agent_launcher.h +280 -0
  1629. cuda/cccl/headers/include/thrust/system/cuda/detail/core/triple_chevron_launch.h +162 -0
  1630. cuda/cccl/headers/include/thrust/system/cuda/detail/core/util.h +578 -0
  1631. cuda/cccl/headers/include/thrust/system/cuda/detail/count.h +75 -0
  1632. cuda/cccl/headers/include/thrust/system/cuda/detail/cross_system.h +241 -0
  1633. cuda/cccl/headers/include/thrust/system/cuda/detail/dispatch.h +230 -0
  1634. cuda/cccl/headers/include/thrust/system/cuda/detail/equal.h +62 -0
  1635. cuda/cccl/headers/include/thrust/system/cuda/detail/error.inl +87 -0
  1636. cuda/cccl/headers/include/thrust/system/cuda/detail/execution_policy.h +266 -0
  1637. cuda/cccl/headers/include/thrust/system/cuda/detail/extrema.h +473 -0
  1638. cuda/cccl/headers/include/thrust/system/cuda/detail/fill.h +99 -0
  1639. cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +165 -0
  1640. cuda/cccl/headers/include/thrust/system/cuda/detail/for_each.h +83 -0
  1641. cuda/cccl/headers/include/thrust/system/cuda/detail/gather.h +89 -0
  1642. cuda/cccl/headers/include/thrust/system/cuda/detail/generate.h +59 -0
  1643. cuda/cccl/headers/include/thrust/system/cuda/detail/get_value.h +65 -0
  1644. cuda/cccl/headers/include/thrust/system/cuda/detail/inner_product.h +77 -0
  1645. cuda/cccl/headers/include/thrust/system/cuda/detail/iter_swap.h +79 -0
  1646. cuda/cccl/headers/include/thrust/system/cuda/detail/logical.h +29 -0
  1647. cuda/cccl/headers/include/thrust/system/cuda/detail/make_unsigned_special.h +55 -0
  1648. cuda/cccl/headers/include/thrust/system/cuda/detail/malloc_and_free.h +119 -0
  1649. cuda/cccl/headers/include/thrust/system/cuda/detail/memory.inl +55 -0
  1650. cuda/cccl/headers/include/thrust/system/cuda/detail/merge.h +205 -0
  1651. cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +88 -0
  1652. cuda/cccl/headers/include/thrust/system/cuda/detail/parallel_for.h +79 -0
  1653. cuda/cccl/headers/include/thrust/system/cuda/detail/partition.h +401 -0
  1654. cuda/cccl/headers/include/thrust/system/cuda/detail/per_device_resource.h +70 -0
  1655. cuda/cccl/headers/include/thrust/system/cuda/detail/reduce.h +774 -0
  1656. cuda/cccl/headers/include/thrust/system/cuda/detail/reduce_by_key.h +994 -0
  1657. cuda/cccl/headers/include/thrust/system/cuda/detail/remove.h +105 -0
  1658. cuda/cccl/headers/include/thrust/system/cuda/detail/replace.h +121 -0
  1659. cuda/cccl/headers/include/thrust/system/cuda/detail/reverse.h +83 -0
  1660. cuda/cccl/headers/include/thrust/system/cuda/detail/scan.h +340 -0
  1661. cuda/cccl/headers/include/thrust/system/cuda/detail/scan_by_key.h +412 -0
  1662. cuda/cccl/headers/include/thrust/system/cuda/detail/scatter.h +90 -0
  1663. cuda/cccl/headers/include/thrust/system/cuda/detail/sequence.h +29 -0
  1664. cuda/cccl/headers/include/thrust/system/cuda/detail/set_operations.h +1722 -0
  1665. cuda/cccl/headers/include/thrust/system/cuda/detail/sort.h +473 -0
  1666. cuda/cccl/headers/include/thrust/system/cuda/detail/swap_ranges.h +99 -0
  1667. cuda/cccl/headers/include/thrust/system/cuda/detail/tabulate.h +62 -0
  1668. cuda/cccl/headers/include/thrust/system/cuda/detail/temporary_buffer.h +132 -0
  1669. cuda/cccl/headers/include/thrust/system/cuda/detail/terminate.h +53 -0
  1670. cuda/cccl/headers/include/thrust/system/cuda/detail/transform.h +428 -0
  1671. cuda/cccl/headers/include/thrust/system/cuda/detail/transform_reduce.h +139 -0
  1672. cuda/cccl/headers/include/thrust/system/cuda/detail/transform_scan.h +117 -0
  1673. cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_copy.h +114 -0
  1674. cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_fill.h +102 -0
  1675. cuda/cccl/headers/include/thrust/system/cuda/detail/unique.h +288 -0
  1676. cuda/cccl/headers/include/thrust/system/cuda/detail/unique_by_key.h +307 -0
  1677. cuda/cccl/headers/include/thrust/system/cuda/detail/util.h +253 -0
  1678. cuda/cccl/headers/include/thrust/system/cuda/error.h +159 -0
  1679. cuda/cccl/headers/include/thrust/system/cuda/execution_policy.h +15 -0
  1680. cuda/cccl/headers/include/thrust/system/cuda/memory.h +118 -0
  1681. cuda/cccl/headers/include/thrust/system/cuda/memory_resource.h +115 -0
  1682. cuda/cccl/headers/include/thrust/system/cuda/pointer.h +155 -0
  1683. cuda/cccl/headers/include/thrust/system/cuda/vector.h +104 -0
  1684. cuda/cccl/headers/include/thrust/system/detail/bad_alloc.h +59 -0
  1685. cuda/cccl/headers/include/thrust/system/detail/errno.h +118 -0
  1686. cuda/cccl/headers/include/thrust/system/detail/error_category.inl +298 -0
  1687. cuda/cccl/headers/include/thrust/system/detail/error_code.inl +171 -0
  1688. cuda/cccl/headers/include/thrust/system/detail/error_condition.inl +119 -0
  1689. cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.h +51 -0
  1690. cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.inl +77 -0
  1691. cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.h +159 -0
  1692. cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.inl +370 -0
  1693. cuda/cccl/headers/include/thrust/system/detail/generic/copy.h +43 -0
  1694. cuda/cccl/headers/include/thrust/system/detail/generic/copy.inl +62 -0
  1695. cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.h +56 -0
  1696. cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.inl +145 -0
  1697. cuda/cccl/headers/include/thrust/system/detail/generic/count.h +46 -0
  1698. cuda/cccl/headers/include/thrust/system/detail/generic/count.inl +82 -0
  1699. cuda/cccl/headers/include/thrust/system/detail/generic/equal.h +47 -0
  1700. cuda/cccl/headers/include/thrust/system/detail/generic/equal.inl +58 -0
  1701. cuda/cccl/headers/include/thrust/system/detail/generic/extrema.h +65 -0
  1702. cuda/cccl/headers/include/thrust/system/detail/generic/extrema.inl +246 -0
  1703. cuda/cccl/headers/include/thrust/system/detail/generic/fill.h +52 -0
  1704. cuda/cccl/headers/include/thrust/system/detail/generic/find.h +47 -0
  1705. cuda/cccl/headers/include/thrust/system/detail/generic/find.inl +135 -0
  1706. cuda/cccl/headers/include/thrust/system/detail/generic/for_each.h +56 -0
  1707. cuda/cccl/headers/include/thrust/system/detail/generic/gather.h +71 -0
  1708. cuda/cccl/headers/include/thrust/system/detail/generic/gather.inl +94 -0
  1709. cuda/cccl/headers/include/thrust/system/detail/generic/generate.h +43 -0
  1710. cuda/cccl/headers/include/thrust/system/detail/generic/generate.inl +64 -0
  1711. cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.h +58 -0
  1712. cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.inl +70 -0
  1713. cuda/cccl/headers/include/thrust/system/detail/generic/logical.h +57 -0
  1714. cuda/cccl/headers/include/thrust/system/detail/generic/memory.h +61 -0
  1715. cuda/cccl/headers/include/thrust/system/detail/generic/memory.inl +83 -0
  1716. cuda/cccl/headers/include/thrust/system/detail/generic/merge.h +97 -0
  1717. cuda/cccl/headers/include/thrust/system/detail/generic/merge.inl +146 -0
  1718. cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.h +47 -0
  1719. cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.inl +67 -0
  1720. cuda/cccl/headers/include/thrust/system/detail/generic/partition.h +127 -0
  1721. cuda/cccl/headers/include/thrust/system/detail/generic/partition.inl +208 -0
  1722. cuda/cccl/headers/include/thrust/system/detail/generic/per_device_resource.h +41 -0
  1723. cuda/cccl/headers/include/thrust/system/detail/generic/reduce.h +69 -0
  1724. cuda/cccl/headers/include/thrust/system/detail/generic/reduce.inl +98 -0
  1725. cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.h +81 -0
  1726. cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.inl +183 -0
  1727. cuda/cccl/headers/include/thrust/system/detail/generic/remove.h +84 -0
  1728. cuda/cccl/headers/include/thrust/system/detail/generic/remove.inl +119 -0
  1729. cuda/cccl/headers/include/thrust/system/detail/generic/replace.h +93 -0
  1730. cuda/cccl/headers/include/thrust/system/detail/generic/replace.inl +172 -0
  1731. cuda/cccl/headers/include/thrust/system/detail/generic/reverse.h +46 -0
  1732. cuda/cccl/headers/include/thrust/system/detail/generic/reverse.inl +65 -0
  1733. cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.h +61 -0
  1734. cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.inl +125 -0
  1735. cuda/cccl/headers/include/thrust/system/detail/generic/scan.h +86 -0
  1736. cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.h +124 -0
  1737. cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.inl +229 -0
  1738. cuda/cccl/headers/include/thrust/system/detail/generic/scatter.h +71 -0
  1739. cuda/cccl/headers/include/thrust/system/detail/generic/scatter.inl +83 -0
  1740. cuda/cccl/headers/include/thrust/system/detail/generic/select_system.h +105 -0
  1741. cuda/cccl/headers/include/thrust/system/detail/generic/sequence.h +70 -0
  1742. cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.h +281 -0
  1743. cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.inl +474 -0
  1744. cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.h +52 -0
  1745. cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.inl +123 -0
  1746. cuda/cccl/headers/include/thrust/system/detail/generic/sort.h +111 -0
  1747. cuda/cccl/headers/include/thrust/system/detail/generic/sort.inl +176 -0
  1748. cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.h +42 -0
  1749. cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.inl +73 -0
  1750. cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.h +39 -0
  1751. cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.inl +54 -0
  1752. cuda/cccl/headers/include/thrust/system/detail/generic/tag.h +45 -0
  1753. cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.h +53 -0
  1754. cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.inl +81 -0
  1755. cuda/cccl/headers/include/thrust/system/detail/generic/transform.h +393 -0
  1756. cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.h +48 -0
  1757. cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.inl +54 -0
  1758. cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.h +78 -0
  1759. cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.inl +111 -0
  1760. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.h +43 -0
  1761. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.inl +164 -0
  1762. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.h +43 -0
  1763. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.inl +112 -0
  1764. cuda/cccl/headers/include/thrust/system/detail/generic/unique.h +69 -0
  1765. cuda/cccl/headers/include/thrust/system/detail/generic/unique.inl +112 -0
  1766. cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.h +80 -0
  1767. cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.inl +124 -0
  1768. cuda/cccl/headers/include/thrust/system/detail/internal/decompose.h +115 -0
  1769. cuda/cccl/headers/include/thrust/system/detail/sequential/adjacent_difference.h +68 -0
  1770. cuda/cccl/headers/include/thrust/system/detail/sequential/assign_value.h +40 -0
  1771. cuda/cccl/headers/include/thrust/system/detail/sequential/binary_search.h +136 -0
  1772. cuda/cccl/headers/include/thrust/system/detail/sequential/copy.h +120 -0
  1773. cuda/cccl/headers/include/thrust/system/detail/sequential/copy_backward.h +47 -0
  1774. cuda/cccl/headers/include/thrust/system/detail/sequential/copy_if.h +69 -0
  1775. cuda/cccl/headers/include/thrust/system/detail/sequential/count.h +29 -0
  1776. cuda/cccl/headers/include/thrust/system/detail/sequential/equal.h +29 -0
  1777. cuda/cccl/headers/include/thrust/system/detail/sequential/execution_policy.h +52 -0
  1778. cuda/cccl/headers/include/thrust/system/detail/sequential/extrema.h +109 -0
  1779. cuda/cccl/headers/include/thrust/system/detail/sequential/fill.h +29 -0
  1780. cuda/cccl/headers/include/thrust/system/detail/sequential/find.h +60 -0
  1781. cuda/cccl/headers/include/thrust/system/detail/sequential/for_each.h +72 -0
  1782. cuda/cccl/headers/include/thrust/system/detail/sequential/gather.h +29 -0
  1783. cuda/cccl/headers/include/thrust/system/detail/sequential/general_copy.h +120 -0
  1784. cuda/cccl/headers/include/thrust/system/detail/sequential/generate.h +29 -0
  1785. cuda/cccl/headers/include/thrust/system/detail/sequential/get_value.h +41 -0
  1786. cuda/cccl/headers/include/thrust/system/detail/sequential/inner_product.h +29 -0
  1787. cuda/cccl/headers/include/thrust/system/detail/sequential/insertion_sort.h +139 -0
  1788. cuda/cccl/headers/include/thrust/system/detail/sequential/iter_swap.h +43 -0
  1789. cuda/cccl/headers/include/thrust/system/detail/sequential/logical.h +29 -0
  1790. cuda/cccl/headers/include/thrust/system/detail/sequential/malloc_and_free.h +48 -0
  1791. cuda/cccl/headers/include/thrust/system/detail/sequential/merge.h +147 -0
  1792. cuda/cccl/headers/include/thrust/system/detail/sequential/mismatch.h +29 -0
  1793. cuda/cccl/headers/include/thrust/system/detail/sequential/partition.h +298 -0
  1794. cuda/cccl/headers/include/thrust/system/detail/sequential/per_device_resource.h +29 -0
  1795. cuda/cccl/headers/include/thrust/system/detail/sequential/reduce.h +62 -0
  1796. cuda/cccl/headers/include/thrust/system/detail/sequential/reduce_by_key.h +97 -0
  1797. cuda/cccl/headers/include/thrust/system/detail/sequential/remove.h +177 -0
  1798. cuda/cccl/headers/include/thrust/system/detail/sequential/replace.h +29 -0
  1799. cuda/cccl/headers/include/thrust/system/detail/sequential/reverse.h +29 -0
  1800. cuda/cccl/headers/include/thrust/system/detail/sequential/scan.h +152 -0
  1801. cuda/cccl/headers/include/thrust/system/detail/sequential/scan_by_key.h +143 -0
  1802. cuda/cccl/headers/include/thrust/system/detail/sequential/scatter.h +29 -0
  1803. cuda/cccl/headers/include/thrust/system/detail/sequential/sequence.h +29 -0
  1804. cuda/cccl/headers/include/thrust/system/detail/sequential/set_operations.h +204 -0
  1805. cuda/cccl/headers/include/thrust/system/detail/sequential/sort.h +120 -0
  1806. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.h +353 -0
  1807. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.h +121 -0
  1808. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.h +584 -0
  1809. cuda/cccl/headers/include/thrust/system/detail/sequential/swap_ranges.h +29 -0
  1810. cuda/cccl/headers/include/thrust/system/detail/sequential/tabulate.h +29 -0
  1811. cuda/cccl/headers/include/thrust/system/detail/sequential/temporary_buffer.h +29 -0
  1812. cuda/cccl/headers/include/thrust/system/detail/sequential/transform.h +29 -0
  1813. cuda/cccl/headers/include/thrust/system/detail/sequential/transform_reduce.h +29 -0
  1814. cuda/cccl/headers/include/thrust/system/detail/sequential/transform_scan.h +29 -0
  1815. cuda/cccl/headers/include/thrust/system/detail/sequential/trivial_copy.h +56 -0
  1816. cuda/cccl/headers/include/thrust/system/detail/sequential/uninitialized_copy.h +29 -0
  1817. cuda/cccl/headers/include/thrust/system/detail/sequential/uninitialized_fill.h +29 -0
  1818. cuda/cccl/headers/include/thrust/system/detail/sequential/unique.h +112 -0
  1819. cuda/cccl/headers/include/thrust/system/detail/sequential/unique_by_key.h +105 -0
  1820. cuda/cccl/headers/include/thrust/system/detail/system_error.inl +106 -0
  1821. cuda/cccl/headers/include/thrust/system/error_code.h +508 -0
  1822. cuda/cccl/headers/include/thrust/system/omp/detail/adjacent_difference.h +34 -0
  1823. cuda/cccl/headers/include/thrust/system/omp/detail/assign_value.h +17 -0
  1824. cuda/cccl/headers/include/thrust/system/omp/detail/binary_search.h +57 -0
  1825. cuda/cccl/headers/include/thrust/system/omp/detail/copy.h +61 -0
  1826. cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.h +39 -0
  1827. cuda/cccl/headers/include/thrust/system/omp/detail/count.h +17 -0
  1828. cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.h +48 -0
  1829. cuda/cccl/headers/include/thrust/system/omp/detail/equal.h +17 -0
  1830. cuda/cccl/headers/include/thrust/system/omp/detail/execution_policy.h +132 -0
  1831. cuda/cccl/headers/include/thrust/system/omp/detail/extrema.h +46 -0
  1832. cuda/cccl/headers/include/thrust/system/omp/detail/fill.h +17 -0
  1833. cuda/cccl/headers/include/thrust/system/omp/detail/find.h +33 -0
  1834. cuda/cccl/headers/include/thrust/system/omp/detail/for_each.h +74 -0
  1835. cuda/cccl/headers/include/thrust/system/omp/detail/gather.h +17 -0
  1836. cuda/cccl/headers/include/thrust/system/omp/detail/generate.h +17 -0
  1837. cuda/cccl/headers/include/thrust/system/omp/detail/get_value.h +17 -0
  1838. cuda/cccl/headers/include/thrust/system/omp/detail/inner_product.h +17 -0
  1839. cuda/cccl/headers/include/thrust/system/omp/detail/iter_swap.h +17 -0
  1840. cuda/cccl/headers/include/thrust/system/omp/detail/logical.h +17 -0
  1841. cuda/cccl/headers/include/thrust/system/omp/detail/malloc_and_free.h +17 -0
  1842. cuda/cccl/headers/include/thrust/system/omp/detail/merge.h +17 -0
  1843. cuda/cccl/headers/include/thrust/system/omp/detail/mismatch.h +17 -0
  1844. cuda/cccl/headers/include/thrust/system/omp/detail/partition.h +84 -0
  1845. cuda/cccl/headers/include/thrust/system/omp/detail/per_device_resource.h +16 -0
  1846. cuda/cccl/headers/include/thrust/system/omp/detail/pragma_omp.h +30 -0
  1847. cuda/cccl/headers/include/thrust/system/omp/detail/reduce.h +63 -0
  1848. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.h +48 -0
  1849. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.h +87 -0
  1850. cuda/cccl/headers/include/thrust/system/omp/detail/remove.h +67 -0
  1851. cuda/cccl/headers/include/thrust/system/omp/detail/replace.h +17 -0
  1852. cuda/cccl/headers/include/thrust/system/omp/detail/reverse.h +17 -0
  1853. cuda/cccl/headers/include/thrust/system/omp/detail/scan.h +216 -0
  1854. cuda/cccl/headers/include/thrust/system/omp/detail/scan_by_key.h +23 -0
  1855. cuda/cccl/headers/include/thrust/system/omp/detail/scatter.h +17 -0
  1856. cuda/cccl/headers/include/thrust/system/omp/detail/sequence.h +17 -0
  1857. cuda/cccl/headers/include/thrust/system/omp/detail/set_operations.h +17 -0
  1858. cuda/cccl/headers/include/thrust/system/omp/detail/sort.h +245 -0
  1859. cuda/cccl/headers/include/thrust/system/omp/detail/swap_ranges.h +17 -0
  1860. cuda/cccl/headers/include/thrust/system/omp/detail/tabulate.h +17 -0
  1861. cuda/cccl/headers/include/thrust/system/omp/detail/temporary_buffer.h +16 -0
  1862. cuda/cccl/headers/include/thrust/system/omp/detail/transform.h +17 -0
  1863. cuda/cccl/headers/include/thrust/system/omp/detail/transform_reduce.h +17 -0
  1864. cuda/cccl/headers/include/thrust/system/omp/detail/transform_scan.h +17 -0
  1865. cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_copy.h +17 -0
  1866. cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_fill.h +17 -0
  1867. cuda/cccl/headers/include/thrust/system/omp/detail/unique.h +52 -0
  1868. cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.h +56 -0
  1869. cuda/cccl/headers/include/thrust/system/omp/execution_policy.h +62 -0
  1870. cuda/cccl/headers/include/thrust/system/omp/memory.h +153 -0
  1871. cuda/cccl/headers/include/thrust/system/omp/memory_resource.h +71 -0
  1872. cuda/cccl/headers/include/thrust/system/omp/pointer.h +120 -0
  1873. cuda/cccl/headers/include/thrust/system/omp/vector.h +96 -0
  1874. cuda/cccl/headers/include/thrust/system/system_error.h +183 -0
  1875. cuda/cccl/headers/include/thrust/system/tbb/detail/adjacent_difference.h +33 -0
  1876. cuda/cccl/headers/include/thrust/system/tbb/detail/assign_value.h +17 -0
  1877. cuda/cccl/headers/include/thrust/system/tbb/detail/binary_search.h +17 -0
  1878. cuda/cccl/headers/include/thrust/system/tbb/detail/copy.h +59 -0
  1879. cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.h +117 -0
  1880. cuda/cccl/headers/include/thrust/system/tbb/detail/count.h +17 -0
  1881. cuda/cccl/headers/include/thrust/system/tbb/detail/equal.h +17 -0
  1882. cuda/cccl/headers/include/thrust/system/tbb/detail/execution_policy.h +113 -0
  1883. cuda/cccl/headers/include/thrust/system/tbb/detail/extrema.h +45 -0
  1884. cuda/cccl/headers/include/thrust/system/tbb/detail/fill.h +17 -0
  1885. cuda/cccl/headers/include/thrust/system/tbb/detail/find.h +28 -0
  1886. cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.h +72 -0
  1887. cuda/cccl/headers/include/thrust/system/tbb/detail/gather.h +17 -0
  1888. cuda/cccl/headers/include/thrust/system/tbb/detail/generate.h +17 -0
  1889. cuda/cccl/headers/include/thrust/system/tbb/detail/get_value.h +17 -0
  1890. cuda/cccl/headers/include/thrust/system/tbb/detail/inner_product.h +17 -0
  1891. cuda/cccl/headers/include/thrust/system/tbb/detail/iter_swap.h +17 -0
  1892. cuda/cccl/headers/include/thrust/system/tbb/detail/logical.h +17 -0
  1893. cuda/cccl/headers/include/thrust/system/tbb/detail/malloc_and_free.h +17 -0
  1894. cuda/cccl/headers/include/thrust/system/tbb/detail/merge.h +302 -0
  1895. cuda/cccl/headers/include/thrust/system/tbb/detail/mismatch.h +17 -0
  1896. cuda/cccl/headers/include/thrust/system/tbb/detail/partition.h +79 -0
  1897. cuda/cccl/headers/include/thrust/system/tbb/detail/per_device_resource.h +16 -0
  1898. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.h +121 -0
  1899. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.h +378 -0
  1900. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_intervals.h +118 -0
  1901. cuda/cccl/headers/include/thrust/system/tbb/detail/remove.h +66 -0
  1902. cuda/cccl/headers/include/thrust/system/tbb/detail/replace.h +17 -0
  1903. cuda/cccl/headers/include/thrust/system/tbb/detail/reverse.h +17 -0
  1904. cuda/cccl/headers/include/thrust/system/tbb/detail/scan.h +294 -0
  1905. cuda/cccl/headers/include/thrust/system/tbb/detail/scan_by_key.h +20 -0
  1906. cuda/cccl/headers/include/thrust/system/tbb/detail/scatter.h +17 -0
  1907. cuda/cccl/headers/include/thrust/system/tbb/detail/sequence.h +17 -0
  1908. cuda/cccl/headers/include/thrust/system/tbb/detail/set_operations.h +17 -0
  1909. cuda/cccl/headers/include/thrust/system/tbb/detail/sort.h +273 -0
  1910. cuda/cccl/headers/include/thrust/system/tbb/detail/swap_ranges.h +17 -0
  1911. cuda/cccl/headers/include/thrust/system/tbb/detail/tabulate.h +17 -0
  1912. cuda/cccl/headers/include/thrust/system/tbb/detail/temporary_buffer.h +16 -0
  1913. cuda/cccl/headers/include/thrust/system/tbb/detail/transform.h +17 -0
  1914. cuda/cccl/headers/include/thrust/system/tbb/detail/transform_reduce.h +17 -0
  1915. cuda/cccl/headers/include/thrust/system/tbb/detail/transform_scan.h +17 -0
  1916. cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_copy.h +17 -0
  1917. cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_fill.h +17 -0
  1918. cuda/cccl/headers/include/thrust/system/tbb/detail/unique.h +49 -0
  1919. cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.h +55 -0
  1920. cuda/cccl/headers/include/thrust/system/tbb/execution_policy.h +62 -0
  1921. cuda/cccl/headers/include/thrust/system/tbb/memory.h +139 -0
  1922. cuda/cccl/headers/include/thrust/system/tbb/memory_resource.h +57 -0
  1923. cuda/cccl/headers/include/thrust/system/tbb/pointer.h +106 -0
  1924. cuda/cccl/headers/include/thrust/system/tbb/vector.h +82 -0
  1925. cuda/cccl/headers/include/thrust/system_error.h +57 -0
  1926. cuda/cccl/headers/include/thrust/tabulate.h +125 -0
  1927. cuda/cccl/headers/include/thrust/transform.h +1056 -0
  1928. cuda/cccl/headers/include/thrust/transform_reduce.h +190 -0
  1929. cuda/cccl/headers/include/thrust/transform_scan.h +442 -0
  1930. cuda/cccl/headers/include/thrust/tuple.h +139 -0
  1931. cuda/cccl/headers/include/thrust/type_traits/integer_sequence.h +256 -0
  1932. cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +156 -0
  1933. cuda/cccl/headers/include/thrust/type_traits/is_execution_policy.h +65 -0
  1934. cuda/cccl/headers/include/thrust/type_traits/is_operator_less_or_greater_function_object.h +180 -0
  1935. cuda/cccl/headers/include/thrust/type_traits/is_operator_plus_function_object.h +112 -0
  1936. cuda/cccl/headers/include/thrust/type_traits/is_trivially_relocatable.h +333 -0
  1937. cuda/cccl/headers/include/thrust/type_traits/logical_metafunctions.h +42 -0
  1938. cuda/cccl/headers/include/thrust/type_traits/unwrap_contiguous_iterator.h +63 -0
  1939. cuda/cccl/headers/include/thrust/uninitialized_copy.h +300 -0
  1940. cuda/cccl/headers/include/thrust/uninitialized_fill.h +268 -0
  1941. cuda/cccl/headers/include/thrust/unique.h +1089 -0
  1942. cuda/cccl/headers/include/thrust/universal_allocator.h +101 -0
  1943. cuda/cccl/headers/include/thrust/universal_ptr.h +34 -0
  1944. cuda/cccl/headers/include/thrust/universal_vector.h +80 -0
  1945. cuda/cccl/headers/include/thrust/version.h +93 -0
  1946. cuda/cccl/headers/include/thrust/zip_function.h +149 -0
  1947. cuda/cccl/headers/include_paths.py +51 -0
  1948. cuda/cccl/headers/lib/cmake/cccl/cccl-config-version.cmake +25 -0
  1949. cuda/cccl/headers/lib/cmake/cccl/cccl-config.cmake +143 -0
  1950. cuda/cccl/headers/lib/cmake/cub/cub-config-version.cmake +29 -0
  1951. cuda/cccl/headers/lib/cmake/cub/cub-config.cmake +172 -0
  1952. cuda/cccl/headers/lib/cmake/cub/cub-header-search.cmake +15 -0
  1953. cuda/cccl/headers/lib/cmake/libcudacxx/libcudacxx-config-version.cmake +37 -0
  1954. cuda/cccl/headers/lib/cmake/libcudacxx/libcudacxx-config.cmake +297 -0
  1955. cuda/cccl/headers/lib/cmake/libcudacxx/libcudacxx-header-search.cmake +15 -0
  1956. cuda/cccl/headers/lib/cmake/thrust/FindTBB.cmake +498 -0
  1957. cuda/cccl/headers/lib/cmake/thrust/README.md +258 -0
  1958. cuda/cccl/headers/lib/cmake/thrust/thrust-config-version.cmake +37 -0
  1959. cuda/cccl/headers/lib/cmake/thrust/thrust-config.cmake +983 -0
  1960. cuda/cccl/headers/lib/cmake/thrust/thrust-header-search.cmake +15 -0
  1961. cuda/cccl/parallel/__init__.py +9 -0
  1962. cuda/cccl/parallel/experimental/__init__.py +24 -0
  1963. cuda/cccl/py.typed +0 -0
  1964. cuda/compute/__init__.py +91 -0
  1965. cuda/compute/_bindings.py +79 -0
  1966. cuda/compute/_bindings.pyi +516 -0
  1967. cuda/compute/_bindings_impl.pyx +2470 -0
  1968. cuda/compute/_caching.py +83 -0
  1969. cuda/compute/_cccl_interop.py +354 -0
  1970. cuda/compute/_odr_helpers.py +238 -0
  1971. cuda/compute/_utils/__init__.py +0 -0
  1972. cuda/compute/_utils/protocols.py +145 -0
  1973. cuda/compute/_utils/temp_storage_buffer.py +87 -0
  1974. cuda/compute/algorithms/__init__.py +62 -0
  1975. cuda/compute/algorithms/_histogram.py +243 -0
  1976. cuda/compute/algorithms/_reduce.py +205 -0
  1977. cuda/compute/algorithms/_scan.py +344 -0
  1978. cuda/compute/algorithms/_segmented_reduce.py +265 -0
  1979. cuda/compute/algorithms/_select.py +196 -0
  1980. cuda/compute/algorithms/_sort/__init__.py +23 -0
  1981. cuda/compute/algorithms/_sort/_merge_sort.py +235 -0
  1982. cuda/compute/algorithms/_sort/_radix_sort.py +263 -0
  1983. cuda/compute/algorithms/_sort/_segmented_sort.py +288 -0
  1984. cuda/compute/algorithms/_sort/_sort_common.py +52 -0
  1985. cuda/compute/algorithms/_three_way_partition.py +292 -0
  1986. cuda/compute/algorithms/_transform.py +317 -0
  1987. cuda/compute/algorithms/_unique_by_key.py +259 -0
  1988. cuda/compute/cccl/.gitkeep +0 -0
  1989. cuda/compute/cu12/_bindings_impl.cpython-312-x86_64-linux-gnu.so +0 -0
  1990. cuda/compute/cu12/cccl/libcccl.c.parallel.so +0 -0
  1991. cuda/compute/cu13/_bindings_impl.cpython-312-x86_64-linux-gnu.so +0 -0
  1992. cuda/compute/cu13/cccl/libcccl.c.parallel.so +0 -0
  1993. cuda/compute/determinism.py +3 -0
  1994. cuda/compute/iterators/__init__.py +23 -0
  1995. cuda/compute/iterators/_factories.py +251 -0
  1996. cuda/compute/iterators/_iterators.py +680 -0
  1997. cuda/compute/iterators/_permutation_iterator.py +266 -0
  1998. cuda/compute/iterators/_zip_iterator.py +268 -0
  1999. cuda/compute/numba_utils.py +54 -0
  2000. cuda/compute/op.py +140 -0
  2001. cuda/compute/struct.py +520 -0
  2002. cuda/compute/typing.py +36 -0
  2003. cuda/coop/__init__.py +8 -0
  2004. cuda/coop/_caching.py +48 -0
  2005. cuda/coop/_common.py +275 -0
  2006. cuda/coop/_nvrtc.py +92 -0
  2007. cuda/coop/_scan_op.py +181 -0
  2008. cuda/coop/_types.py +937 -0
  2009. cuda/coop/_typing.py +107 -0
  2010. cuda/coop/block/__init__.py +39 -0
  2011. cuda/coop/block/_block_exchange.py +251 -0
  2012. cuda/coop/block/_block_load_store.py +215 -0
  2013. cuda/coop/block/_block_merge_sort.py +125 -0
  2014. cuda/coop/block/_block_radix_sort.py +214 -0
  2015. cuda/coop/block/_block_reduce.py +294 -0
  2016. cuda/coop/block/_block_scan.py +983 -0
  2017. cuda/coop/warp/__init__.py +9 -0
  2018. cuda/coop/warp/_warp_merge_sort.py +92 -0
  2019. cuda/coop/warp/_warp_reduce.py +153 -0
  2020. cuda/coop/warp/_warp_scan.py +78 -0
  2021. cuda_cccl-0.4.3.dist-info/METADATA +84 -0
  2022. cuda_cccl-0.4.3.dist-info/RECORD +2024 -0
  2023. cuda_cccl-0.4.3.dist-info/WHEEL +5 -0
  2024. cuda_cccl-0.4.3.dist-info/licenses/LICENSE +1 -0
@@ -0,0 +1,2303 @@
1
+ // SPDX-FileCopyrightText: Copyright (c) 2011, Duane Merrill. All rights reserved.
2
+ // SPDX-FileCopyrightText: Copyright (c) 2011-2025, NVIDIA CORPORATION. All rights reserved.
3
+ // SPDX-License-Identifier: BSD-3
4
+
5
+ //! @file
6
+ //! cub::DeviceReduce provides device-wide, parallel operations for computing a reduction across a sequence of data
7
+ //! items residing within device-accessible memory.
8
+
9
+ #pragma once
10
+
11
+ #include <cub/config.cuh>
12
+
13
+ #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
14
+ # pragma GCC system_header
15
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
16
+ # pragma clang system_header
17
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
18
+ # pragma system_header
19
+ #endif // no system header
20
+
21
+ #include <cub/detail/choose_offset.cuh>
22
+ #include <cub/detail/device_memory_resource.cuh>
23
+ #include <cub/detail/env_dispatch.cuh>
24
+ #include <cub/detail/temporary_storage.cuh>
25
+ #include <cub/device/dispatch/dispatch_reduce_by_key.cuh>
26
+ #include <cub/device/dispatch/dispatch_reduce_deterministic.cuh>
27
+ #include <cub/device/dispatch/dispatch_reduce_nondeterministic.cuh>
28
+ #include <cub/device/dispatch/dispatch_streaming_reduce.cuh>
29
+ #include <cub/thread/thread_operators.cuh>
30
+ #include <cub/util_type.cuh>
31
+
32
+ #include <cuda/__execution/determinism.h>
33
+ #include <cuda/__execution/require.h>
34
+ #include <cuda/__execution/tune.h>
35
+ #include <cuda/__functional/maximum.h>
36
+ #include <cuda/__functional/minimum.h>
37
+ #include <cuda/__iterator/tabulate_output_iterator.h>
38
+ #include <cuda/__memory_resource/get_memory_resource.h>
39
+ #include <cuda/__stream/get_stream.h>
40
+ #include <cuda/__stream/stream_ref.h>
41
+ #include <cuda/std/__execution/env.h>
42
+ #include <cuda/std/__functional/identity.h>
43
+ #include <cuda/std/__functional/invoke.h>
44
+ #include <cuda/std/__functional/operations.h>
45
+ #include <cuda/std/__type_traits/conditional.h>
46
+ #include <cuda/std/__type_traits/is_integral.h>
47
+ #include <cuda/std/__type_traits/is_same.h>
48
+ #include <cuda/std/cstdint>
49
+ #include <cuda/std/limits>
50
+
51
+ CUB_NAMESPACE_BEGIN
52
+
53
+ namespace detail
54
+ {
55
+ template <typename DeterminismT> // compile-time flag: true only when DeterminismT is exactly determinism::not_guaranteed_t
56
+ inline constexpr bool is_non_deterministic_v =
57
+ ::cuda::std::is_same_v<DeterminismT, ::cuda::execution::determinism::not_guaranteed_t>;
58
+
59
+ namespace reduce
60
+ {
61
+ struct get_tuning_query_t // empty tag type: the query key accepted by tuning::query and passed to __query_result_or_t in DeviceReduce::reduce_impl
62
+ {};
63
+
64
+ template <class Derived> // CRTP base: Derived is the concrete tuning type that inherits from tuning<Derived>
64
+ struct tuning
65
+ {
66
+ [[nodiscard]] _CCCL_NODEBUG_API constexpr auto query(const get_tuning_query_t&) const noexcept -> Derived // answers the tuning query with a by-value Derived
67
+ {
68
+ return static_cast<const Derived&>(*this); // view *this as the derived tuning; the by-value return type makes a copy
69
+ }
70
+ };
72
+
73
+ struct default_rfa_tuning : tuning<default_rfa_tuning> // tuning that selects detail::rfa::policy_hub; fallback tuning of the gpu_to_gpu_t reduce_impl overload
74
+ {
75
+ template <class AccumT, class Offset, class OpT>
76
+ using fn = detail::rfa::policy_hub<AccumT, Offset, OpT>; // maps (accumulator, offset, operator) types to the policy hub type
77
+ };
78
+
79
+ template <typename ExtremumOutIteratorT, typename IndexOutIteratorT> // functor that splits one key/value result across two output iterators
80
+ struct unzip_and_write_arg_extremum_op
81
+ {
82
+ ExtremumOutIteratorT result_out_it; // destination written with the pair's .value
83
+ IndexOutIteratorT index_out_it; // destination written with the pair's .key
84
+
85
+ template <typename IndexT, typename KeyValuePairT>
86
+ _CCCL_DEVICE _CCCL_FORCEINLINE void operator()(IndexT, KeyValuePairT reduced_result) // first (index) argument is unnamed and ignored
87
+ {
88
+ *result_out_it = reduced_result.value; // value half of the pair
89
+ *index_out_it = reduced_result.key; // key half of the pair
90
+ }
91
+ };
92
+ } // namespace reduce
93
+ } // namespace detail
94
+
95
+ //! @rst
96
+ //! DeviceReduce provides device-wide, parallel operations for computing
97
+ //! a reduction across a sequence of data items residing within
98
+ //! device-accessible memory.
99
+ //!
100
+ //! .. image:: ../../img/reduce_logo.png
101
+ //! :align: center
102
+ //!
103
+ //! Overview
104
+ //! ====================================
105
+ //!
106
+ //! A `reduction <http://en.wikipedia.org/wiki/Reduce_(higher-order_function)>`_
107
+ //! (or *fold*) uses a binary combining operator to compute a single aggregate
108
+ //! from a sequence of input elements.
109
+ //!
110
+ //! Usage Considerations
111
+ //! ====================================
112
+ //!
113
+ //! @cdp_class{DeviceReduce}
114
+ //!
115
+ //! Performance
116
+ //! ====================================
117
+ //!
118
+ //! @linear_performance{reduction, reduce-by-key, and run-length encode}
119
+ //!
120
+ //! @endrst
121
+ struct DeviceReduce
122
+ {
123
+ private:
124
+ template <typename TuningEnvT, // reduce_impl overload for a generic determinism holder: forwards to detail::reduce::dispatch
124
+ typename InputIteratorT,
125
+ typename OutputIteratorT,
126
+ typename ReductionOpT,
127
+ typename TransformOpT,
128
+ typename T,
129
+ typename NumItemsT,
130
+ ::cuda::execution::determinism::__determinism_t Determinism>
131
+ CUB_RUNTIME_FUNCTION static cudaError_t reduce_impl(
132
+ void* d_temp_storage,
133
+ size_t& temp_storage_bytes,
134
+ InputIteratorT d_in,
135
+ OutputIteratorT d_out,
136
+ NumItemsT num_items,
137
+ ReductionOpT reduction_op,
138
+ TransformOpT transform_op,
139
+ T init,
140
+ ::cuda::execution::determinism::__determinism_holder_t<Determinism>, // unnamed tag: used only for overload selection, never read in the body
141
+ cudaStream_t stream)
142
+ {
143
+ using offset_t = detail::choose_offset_t<NumItemsT>; // offset type derived from the caller's NumItemsT
144
+ using accum_t = ::cuda::std:: // accumulator type computed from the transform's result type on input values, plus T
145
+ __accumulator_t<ReductionOpT, ::cuda::std::invoke_result_t<TransformOpT, detail::it_value_t<InputIteratorT>>, T>;
146
+ using reduce_tuning_t = ::cuda::std::execution::__query_result_or_t< // tuning queried from TuningEnvT, else the arch_policies_from_types fallback
147
+ TuningEnvT,
148
+ detail::reduce::get_tuning_query_t,
149
+ detail::reduce::arch_policies_from_types<accum_t, offset_t, ReductionOpT>>;
150
+
151
+ return detail::reduce::dispatch<accum_t>( // returns the dispatch's cudaError_t unchanged
152
+ d_temp_storage,
153
+ temp_storage_bytes,
154
+ d_in,
155
+ d_out,
156
+ static_cast<offset_t>(num_items), // num_items is converted to offset_t before dispatch
157
+ reduction_op,
158
+ init,
159
+ stream,
160
+ transform_op,
161
+ reduce_tuning_t{}); // tuning is passed as a default-constructed object
162
+ }
164
+
165
+ template <typename TuningEnvT, // reduce_impl overload selected by the gpu_to_gpu_t tag: forwards to detail::rfa::dispatch_t::Dispatch
165
+ typename InputIteratorT,
166
+ typename OutputIteratorT,
167
+ typename ReductionOpT,
168
+ typename TransformOpT,
169
+ typename T,
170
+ typename NumItemsT>
171
+ CUB_RUNTIME_FUNCTION static cudaError_t reduce_impl(
172
+ void* d_temp_storage,
173
+ size_t& temp_storage_bytes,
174
+ InputIteratorT d_in,
175
+ OutputIteratorT d_out,
176
+ NumItemsT num_items,
177
+ ReductionOpT, // unnamed: the operator object is not used here; only its type feeds accum_t and policy_t
178
+ TransformOpT transform_op,
179
+ T init,
180
+ ::cuda::execution::determinism::gpu_to_gpu_t, // unnamed tag: used only for overload selection
181
+ cudaStream_t stream)
182
+ {
183
+ using offset_t = detail::choose_offset_t<NumItemsT>; // offset type derived from the caller's NumItemsT
184
+
185
+ using reduce_tuning_t = ::cuda::std::execution:: // tuning queried from TuningEnvT, else default_rfa_tuning
186
+ __query_result_or_t<TuningEnvT, detail::reduce::get_tuning_query_t, detail::reduce::default_rfa_tuning>;
187
+
188
+ using accum_t = ::cuda::std:: // accumulator type computed from the transform's result type on input values, plus T
189
+ __accumulator_t<ReductionOpT, ::cuda::std::invoke_result_t<TransformOpT, detail::it_value_t<InputIteratorT>>, T>;
190
+ using policy_t = typename reduce_tuning_t::template fn<accum_t, offset_t, ReductionOpT>; // policy obtained from the tuning's nested fn template
191
+ using dispatch_t =
192
+ detail::rfa::dispatch_t<InputIteratorT, OutputIteratorT, offset_t, T, TransformOpT, accum_t, policy_t>;
193
+
194
+ return dispatch_t::Dispatch( // num_items is converted to offset_t; no reduction operator object is forwarded
195
+ d_temp_storage, temp_storage_bytes, d_in, d_out, static_cast<offset_t>(num_items), init, stream, transform_op);
196
+ }
198
+
199
+ template <typename TuningEnvT, // reduce_impl overload selected by the not_guaranteed_t tag: forwards to detail::reduce::dispatch_nondeterministic
199
+ typename InputIteratorT,
200
+ typename OutputIteratorT,
201
+ typename ReductionOpT,
202
+ typename TransformOpT,
203
+ typename T,
204
+ typename NumItemsT>
205
+ CUB_RUNTIME_FUNCTION static cudaError_t reduce_impl(
206
+ void* d_temp_storage,
207
+ size_t& temp_storage_bytes,
208
+ InputIteratorT d_in,
209
+ OutputIteratorT d_out,
210
+ NumItemsT num_items,
211
+ ReductionOpT reduction_op,
212
+ TransformOpT transform_op,
213
+ T init,
214
+ ::cuda::execution::determinism::not_guaranteed_t, // unnamed tag: used only for overload selection
215
+ cudaStream_t stream)
216
+ {
217
+ using offset_t = detail::choose_offset_t<NumItemsT>; // offset type derived from the caller's NumItemsT
218
+ using accum_t = ::cuda::std::__accumulator_t<ReductionOpT, detail::it_value_t<InputIteratorT>, T>; // NOTE(review): the other two overloads derive accum_t from invoke_result_t<TransformOpT, ...>; this one uses the raw input value type - confirm intended
219
+
220
+ using reduce_tuning_t = ::cuda::std::execution::__query_result_or_t< // tuning queried from TuningEnvT, else the arch_policies_from_types fallback
221
+ TuningEnvT,
222
+ detail::reduce::get_tuning_query_t,
223
+ detail::reduce::arch_policies_from_types<accum_t, offset_t, ReductionOpT>>;
224
+
225
+ return detail::reduce::dispatch_nondeterministic<accum_t>( // returns the dispatch's cudaError_t unchanged
226
+ d_temp_storage,
227
+ temp_storage_bytes,
228
+ d_in,
229
+ THRUST_NS_QUALIFIER::unwrap_contiguous_iterator(d_out), // only this overload unwraps d_out; NOTE(review): no thrust header appears in this file's direct includes - presumably transitive, verify
230
+ static_cast<offset_t>(num_items), // num_items is converted to offset_t before dispatch
231
+ reduction_op,
232
+ init,
233
+ stream,
234
+ transform_op,
235
+ reduce_tuning_t{}); // tuning is passed as a default-constructed object
236
+ }
238
+
239
+ public:
240
+ //! @rst
241
+ //! Computes a device-wide reduction using the specified binary ``reduction_op`` functor and initial value ``init``.
242
+ //!
243
+ //! - Does not support binary reduction operators that are non-commutative.
244
+ //! - Provides "run-to-run" determinism for pseudo-associative reduction
245
+ //! (e.g., addition of floating point types) on the same GPU device.
246
+ //! However, results for pseudo-associative reduction may be inconsistent
247
+ //! from one device to another device of a different compute-capability
248
+ //! because CUB can employ different tile-sizing for different architectures.
249
+ //! - The range ``[d_in, d_in + num_items)`` shall not overlap ``d_out``.
250
+ //! - @devicestorage
251
+ //!
252
+ //! Snippet
253
+ //! +++++++++++++++++++++++++++++++++++++++++++++
254
+ //!
255
+ //! The code snippet below illustrates a user-defined min-reduction of a
256
+ //! device vector of ``int`` data elements.
257
+ //!
258
+ //! .. code-block:: c++
259
+ //!
260
+ //! #include <cub/cub.cuh>
261
+ //! // or equivalently <cub/device/device_reduce.cuh>
262
+ //!
263
+ //! // CustomMin functor
264
+ //! struct CustomMin
265
+ //! {
266
+ //! template <typename T>
267
+ //! __device__ __forceinline__
268
+ //! T operator()(const T &a, const T &b) const {
269
+ //! return (b < a) ? b : a;
270
+ //! }
271
+ //! };
272
+ //!
273
+ //! // Declare, allocate, and initialize device-accessible pointers for
274
+ //! // input and output
275
+ //! int num_items; // e.g., 7
276
+ //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
277
+ //! int *d_out; // e.g., [-]
278
+ //! CustomMin min_op;
279
+ //! int init; // e.g., INT_MAX
280
+ //! ...
281
+ //!
282
+ //! // Determine temporary device storage requirements
283
+ //! void *d_temp_storage = nullptr;
284
+ //! size_t temp_storage_bytes = 0;
285
+ //! cub::DeviceReduce::Reduce(
286
+ //! d_temp_storage, temp_storage_bytes,
287
+ //! d_in, d_out, num_items, min_op, init);
288
+ //!
289
+ //! // Allocate temporary storage
290
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
291
+ //!
292
+ //! // Run reduction
293
+ //! cub::DeviceReduce::Reduce(
294
+ //! d_temp_storage, temp_storage_bytes,
295
+ //! d_in, d_out, num_items, min_op, init);
296
+ //!
297
+ //! // d_out <-- [0]
298
+ //!
299
+ //! @endrst
300
+ //!
301
+ //! @tparam InputIteratorT
302
+ //! **[inferred]** Random-access input iterator type for reading input items @iterator
303
+ //!
304
+ //! @tparam OutputIteratorT
305
+ //! **[inferred]** Output iterator type for recording the reduced aggregate @iterator
306
+ //!
307
+ //! @tparam ReductionOpT
308
+ //! **[inferred]** Binary reduction functor type having member `T operator()(const T &a, const T &b)`
309
+ //!
310
+ //! @tparam T
311
+ //! **[inferred]** Data element type that is convertible to the `value` type of `InputIteratorT`
312
+ //!
313
+ //! @tparam NumItemsT
314
+ //! **[inferred]** Type of num_items
315
+ //!
316
+ //! @param[in] d_temp_storage
317
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
318
+ //! required allocation size is written to `temp_storage_bytes` and no work
319
+ //! is done.
320
+ //!
321
+ //! @param[in,out] temp_storage_bytes
322
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
323
+ //!
324
+ //! @param[in] d_in
325
+ //! Pointer to the input sequence of data items
326
+ //!
327
+ //! @param[out] d_out
328
+ //! Pointer to the output aggregate
329
+ //!
330
+ //! @param[in] num_items
331
+ //! Total number of input items (i.e., length of ``d_in``)
332
+ //!
333
+ //! @param[in] reduction_op
334
+ //! Binary reduction functor
335
+ //!
336
+ //! @param[in] init
337
+ //! Initial value of the reduction
338
+ //!
339
+ //! @param[in] stream
340
+ //! @rst
341
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
342
+ //! @endrst
343
+ template <typename InputIteratorT, typename OutputIteratorT, typename ReductionOpT, typename T, typename NumItemsT>
344
+ CUB_RUNTIME_FUNCTION static cudaError_t Reduce(
345
+ void* d_temp_storage,
346
+ size_t& temp_storage_bytes,
347
+ InputIteratorT d_in,
348
+ OutputIteratorT d_out,
349
+ NumItemsT num_items,
350
+ ReductionOpT reduction_op,
351
+ T init,
352
+ cudaStream_t stream = 0)
353
+ {
354
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceReduce::Reduce"); // NOTE(review): presumably opens the NVTX range only when d_temp_storage is non-null (i.e. not for the size query) - confirm against the macro
355
+
356
+ // Offset type used for global offsets, selected from NumItemsT by detail::choose_offset_t
357
+ using OffsetT = detail::choose_offset_t<NumItemsT>;
358
+ // num_items is converted to OffsetT; every other argument is forwarded unchanged
359
+ return detail::reduce::dispatch(
360
+ d_temp_storage, temp_storage_bytes, d_in, d_out, static_cast<OffsetT>(num_items), reduction_op, init, stream);
361
+ }
362
+
363
+ //! @rst
364
+ //! Computes a device-wide reduction using the specified binary ``reduction_op`` functor and initial value ``init``.
365
+ //!
366
+ //! - Does not support binary reduction operators that are non-commutative.
367
+ //! - By default, provides "run-to-run" determinism for pseudo-associative reduction
368
+ //! (e.g., addition of floating point types) on the same GPU device.
369
+ //! However, results for pseudo-associative reduction may be inconsistent
370
+ //! from one device to another device of a different compute-capability
371
+ //! because CUB can employ different tile-sizing for different architectures.
372
+ //! To request "gpu-to-gpu" determinism, pass ``cuda::execution::require(cuda::execution::determinism::gpu_to_gpu)``
373
+ //! as the `env` parameter.
374
+ //! To request "not-guaranteed" determinism, pass
375
+ //! ``cuda::execution::require(cuda::execution::determinism::not_guaranteed)`` as the `env` parameter.
376
+ //! - The range ``[d_in, d_in + num_items)`` shall not overlap ``d_out``.
377
+ //!
378
+ //! Snippet
379
+ //! +++++++++++++++++++++++++++++++++++++++++++++
380
+ //!
381
+ //! The code snippet below illustrates a user-defined min-reduction of a
382
+ //! device vector of ``int`` data elements.
383
+ //!
384
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_reduce_env_api.cu
385
+ //! :language: c++
386
+ //! :dedent:
387
+ //! :start-after: example-begin reduce-env-determinism
388
+ //! :end-before: example-end reduce-env-determinism
389
+ //!
390
+ //! @endrst
391
+ //!
392
+ //! @tparam InputIteratorT
393
+ //! **[inferred]** Random-access input iterator type for reading input items @iterator
394
+ //!
395
+ //! @tparam OutputIteratorT
396
+ //! **[inferred]** Output iterator type for recording the reduced aggregate @iterator
397
+ //!
398
+ //! @tparam ReductionOpT
399
+ //! **[inferred]** Binary reduction functor type having member `T operator()(const T &a, const T &b)`
400
+ //!
401
+ //! @tparam T
402
+ //! **[inferred]** Data element type that is convertible to the `value` type of `InputIteratorT`
403
+ //!
404
+ //! @tparam NumItemsT
405
+ //! **[inferred]** Type of num_items
406
+ //!
407
+ //! @tparam EnvT
408
+ //! **[inferred]** Execution environment type. Default is ``cuda::std::execution::env<>``.
409
+ //!
410
+ //! @param[in] d_in
411
+ //! Pointer to the input sequence of data items
412
+ //!
413
+ //! @param[out] d_out
414
+ //! Pointer to the output aggregate
415
+ //!
416
+ //! @param[in] num_items
417
+ //! Total number of input items (i.e., length of ``d_in``)
418
+ //!
419
+ //! @param[in] reduction_op
420
+ //! Binary reduction functor
421
+ //!
422
+ //! @param[in] init
423
+ //! Initial value of the reduction
424
+ //!
425
+ //! @param[in] env
426
+ //! @rst
427
+ //! **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``.
428
+ //! @endrst
429
+ template <typename InputIteratorT,
430
+ typename OutputIteratorT,
431
+ typename ReductionOpT,
432
+ typename T,
433
+ typename NumItemsT,
434
+ typename EnvT = ::cuda::std::execution::env<>>
435
+ [[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t Reduce(
436
+ InputIteratorT d_in, OutputIteratorT d_out, NumItemsT num_items, ReductionOpT reduction_op, T init, EnvT env = {})
437
+ {
438
+ _CCCL_NVTX_RANGE_SCOPE("cub::DeviceReduce::Reduce");
439
+ // Reject environments that expose the determinism query directly; it is only read from the requirements below
440
+ static_assert(!::cuda::std::execution::__queryable_with<EnvT, ::cuda::execution::determinism::__get_determinism_t>,
441
+ "Determinism should be used inside requires to have an effect.");
442
+ using requirements_t = ::cuda::std::execution::
443
+ __query_result_or_t<EnvT, ::cuda::execution::__get_requirements_t, ::cuda::std::execution::env<>>;
444
+ using default_determinism_t =
445
+ ::cuda::std::execution::__query_result_or_t<requirements_t, //
446
+ ::cuda::execution::determinism::__get_determinism_t,
447
+ ::cuda::execution::determinism::run_to_run_t>;
448
+ // default_determinism_t: determinism taken from the requirements, with run_to_run_t supplied as the alternative type
449
+ using accum_t = ::cuda::std::__accumulator_t<ReductionOpT, detail::it_value_t<InputIteratorT>, T>;
450
+
451
+ constexpr auto gpu_gpu_determinism =
452
+ ::cuda::std::is_same_v<default_determinism_t, ::cuda::execution::determinism::gpu_to_gpu_t>;
453
+
454
+ // integral types are always gpu-to-gpu deterministic if reduction operator is a simple cuda binary
455
+ // operator, so fallback to run-to-run determinism
456
+ constexpr auto integral_fallback =
457
+ gpu_gpu_determinism && ::cuda::std::is_integral_v<accum_t> && (detail::is_cuda_binary_operator<ReductionOpT>);
458
+
459
+ // use gpu-to-gpu determinism only for float and double types with ::cuda::std::plus operator
460
+ constexpr auto float_double_plus =
461
+ gpu_gpu_determinism && detail::is_one_of_v<accum_t, float, double> && detail::is_cuda_std_plus_v<ReductionOpT>;
462
+ // float/double with cuda minimum/maximum under gpu_to_gpu: accepted by `supported`, then mapped to run_to_run_t by determinism_t below
463
+ constexpr auto float_double_min_max_fallback =
464
+ gpu_gpu_determinism
465
+ && detail::is_one_of_v<accum_t, float, double> && detail::is_cuda_minimum_maximum_v<ReductionOpT>;
466
+
467
+ constexpr auto supported =
468
+ integral_fallback || float_double_plus || float_double_min_max_fallback || !gpu_gpu_determinism;
469
+
470
+ // gpu_to_gpu determinism is only supported for integral types with cuda operators, or
471
+ // float and double types with ::cuda::std::plus or cuda minimum / maximum operators
472
+ static_assert(supported, "gpu_to_gpu determinism is unsupported");
473
+
474
+ if constexpr (!supported) // NOTE(review): the static_assert above already fails compilation when !supported, so this branch cannot be taken
475
+ {
476
+ return cudaErrorNotSupported;
477
+ }
478
+ else
479
+ {
480
+ constexpr auto no_determinism = detail::is_non_deterministic_v<default_determinism_t>;
481
+
482
+ // Certain conditions must be met to be able to use the non-deterministic
483
+ // kernel. The output iterator must be a contiguous iterator and the
484
+ // reduction operator must be plus (for now). Additionally, since atomics for types of
485
+ // size < 4B are emulated, they perform poorly, so we fall back to the run-to-run
486
+ // determinism.
487
+ constexpr auto is_contiguous_fallback = // each of the three flags below is trivially true unless non-determinism was requested
488
+ !no_determinism || THRUST_NS_QUALIFIER::is_contiguous_iterator_v<OutputIteratorT>;
489
+ constexpr auto is_plus_fallback = !no_determinism || detail::is_cuda_std_plus_v<ReductionOpT>;
490
+ constexpr auto is_4b_or_greater = !no_determinism || sizeof(accum_t) >= 4;
491
+
492
+ // If the conditions for gpu-to-gpu determinism or non-deterministic
493
+ // reduction are not met, we fall back to run-to-run determinism.
494
+ using determinism_t = ::cuda::std::conditional_t<
495
+ (gpu_gpu_determinism && (integral_fallback || float_double_min_max_fallback))
496
+ || (no_determinism && !(is_contiguous_fallback && is_plus_fallback && is_4b_or_greater)),
497
+ ::cuda::execution::determinism::run_to_run_t,
498
+ default_determinism_t>;
499
+
500
+ // Dispatch with environment: the callback receives the tuning, temporary storage, its size and the stream,
// and forwards them to the reduce_impl overload selected by the determinism_t tag (identity transform)
501
+ return detail::dispatch_with_env(env, [&]([[maybe_unused]] auto tuning, void* storage, size_t& bytes, auto stream) {
502
+ using tuning_t = decltype(tuning);
503
+ return reduce_impl<tuning_t>(
504
+ storage, bytes, d_in, d_out, num_items, reduction_op, ::cuda::std::identity{}, init, determinism_t{}, stream);
505
+ });
506
+ }
507
+ }
508
+
509
+ //! @rst
510
+ //! Computes a device-wide sum using the addition (``+``) operator.
511
+ //!
512
+ //! - Uses ``0`` as the initial value of the reduction.
513
+ //! - Does not support ``+`` operators that are non-commutative.
514
+ //! - Provides "run-to-run" determinism for pseudo-associative reduction
515
+ //! (e.g., addition of floating point types) on the same GPU device.
516
+ //! However, results for pseudo-associative reduction may be inconsistent
517
+ //! from one device to another device of a different compute-capability
518
+ //! because CUB can employ different tile-sizing for different architectures.
519
+ //! To request "gpu-to-gpu" determinism, pass ``cuda::execution::require(cuda::execution::determinism::gpu_to_gpu)``
520
+ //! as the `env` parameter.
521
+ //! To request "not-guaranteed" determinism, pass
522
+ //! ``cuda::execution::require(cuda::execution::determinism::not_guaranteed)`` as the `env` parameter.
523
+ //! - The range ``[d_in, d_in + num_items)`` shall not overlap ``d_out``.
524
+ //!
525
+ //! Snippet
526
+ //! +++++++++++++++++++++++++++++++++++++++++++++
527
+ //!
528
+ //! The code snippet below illustrates the sum-reduction of a
529
+ //! device vector of ``int`` data elements.
530
+ //!
531
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_reduce_env_api.cu
532
+ //! :language: c++
533
+ //! :dedent:
534
+ //! :start-after: example-begin sum-env-determinism
535
+ //! :end-before: example-end sum-env-determinism
536
+ //!
537
+ //! @endrst
538
+ //!
539
+ //! @tparam InputIteratorT
540
+ //! **[inferred]** Random-access input iterator type for reading input items @iterator
541
+ //!
542
+ //! @tparam OutputIteratorT
543
+ //! **[inferred]** Output iterator type for recording the reduced aggregate @iterator
544
+ //!
545
+ //! @tparam NumItemsT
546
+ //! **[inferred]** Type of num_items
547
+ //!
548
+ //! @tparam EnvT
549
+ //! **[inferred]** Execution environment type. Default is `cuda::std::execution::env<>`.
550
+ //!
551
+ //! @param[in] d_in
552
+ //! Pointer to the input sequence of data items
553
+ //!
554
+ //! @param[out] d_out
555
+ //! Pointer to the output aggregate
556
+ //!
557
+ //! @param[in] num_items
558
+ //! Total number of input items (i.e., length of ``d_in``)
559
+ //!
560
+ //! @param[in] env
561
+ //! @rst
562
+ //! **[optional]** Execution environment. Default is `cuda::std::execution::env{}`.
563
+ //! @endrst
564
+ template <typename InputIteratorT,
565
+ typename OutputIteratorT,
566
+ typename NumItemsT,
567
+ typename EnvT = ::cuda::std::execution::env<>>
568
+ [[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t
569
+ Sum(InputIteratorT d_in, OutputIteratorT d_out, NumItemsT num_items, EnvT env = {})
570
+ {
571
+ _CCCL_NVTX_RANGE_SCOPE("cub::DeviceReduce::Sum");
572
+ // Reject environments that expose the determinism query directly; it is only read from the requirements below
573
+ static_assert(!::cuda::std::execution::__queryable_with<EnvT, ::cuda::execution::determinism::__get_determinism_t>,
574
+ "Determinism should be used inside requires to have an effect.");
575
+ using requirements_t = ::cuda::std::execution::
576
+ __query_result_or_t<EnvT, ::cuda::execution::__get_requirements_t, ::cuda::std::execution::env<>>;
577
+ using default_determinism_t =
578
+ ::cuda::std::execution::__query_result_or_t<requirements_t,
579
+ ::cuda::execution::determinism::__get_determinism_t,
580
+ ::cuda::execution::determinism::run_to_run_t>;
581
+ // default_determinism_t: determinism taken from the requirements, with run_to_run_t supplied as the alternative type
582
+ constexpr auto no_determinism = detail::is_non_deterministic_v<default_determinism_t>;
583
+
584
+ // The output iterator must be a contiguous iterator or we fall back to run-to-run determinism.
585
+ constexpr auto is_contiguous_fallback =
586
+ !no_determinism || THRUST_NS_QUALIFIER::is_contiguous_iterator_v<OutputIteratorT>;
587
+
588
+ using OutputT = cub::detail::non_void_value_t<OutputIteratorT, cub::detail::it_value_t<InputIteratorT>>;
589
+
590
+ // Since atomics for types of size < 4B are emulated, they perform poorly, so we fall back to run-to-run
591
+ // determinism.
592
+ constexpr auto is_4b_or_greater = !no_determinism || sizeof(OutputT) >= 4; // NOTE(review): Reduce(env) applies this size check to accum_t, here it is applied to OutputT - confirm
593
+ // NOTE(review): a gpu_to_gpu request is forwarded as-is; the accumulator-type checks done in Reduce(env) are not repeated here - confirm
594
+ using determinism_t =
595
+ ::cuda::std::conditional_t<no_determinism && !(is_contiguous_fallback && is_4b_or_greater),
596
+ ::cuda::execution::determinism::run_to_run_t,
597
+ default_determinism_t>;
598
+
599
+ using InitT = OutputT;
600
+
601
+ // Dispatch with environment: the callback receives the tuning, temporary storage, its size and the stream,
// and forwards them to the reduce_impl overload selected by the determinism_t tag
602
+ return detail::dispatch_with_env(env, [&]([[maybe_unused]] auto tuning, void* storage, size_t& bytes, auto stream) {
603
+ using tuning_t = decltype(tuning);
604
+ return reduce_impl<tuning_t>(
605
+ storage,
606
+ bytes,
607
+ d_in,
608
+ d_out,
609
+ num_items,
610
+ ::cuda::std::plus<>{},
611
+ ::cuda::std::identity{}, // no input transformation
612
+ InitT{}, // value-initialized initial value
613
+ determinism_t{},
614
+ stream);
615
+ });
616
+ }
617
+
618
+ //! @rst
619
+ //! Computes a device-wide sum using the addition (``+``) operator.
620
+ //!
621
+ //! - Uses ``0`` as the initial value of the reduction.
622
+ //! - Does not support ``+`` operators that are non-commutative.
623
+ //! - Provides "run-to-run" determinism for pseudo-associative reduction
624
+ //! (e.g., addition of floating point types) on the same GPU device.
625
+ //! However, results for pseudo-associative reduction may be inconsistent
626
+ //! from one device to another device of a different compute-capability
627
+ //! because CUB can employ different tile-sizing for different architectures.
628
+ //! - The range ``[d_in, d_in + num_items)`` shall not overlap ``d_out``.
629
+ //! - @devicestorage
630
+ //!
631
+ //! Snippet
632
+ //! +++++++++++++++++++++++++++++++++++++++++++++
633
+ //!
634
+ //! The code snippet below illustrates the sum-reduction of a device vector
635
+ //! of ``int`` data elements.
636
+ //!
637
+ //! .. code-block:: c++
638
+ //!
639
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_reduce.cuh>
640
+ //!
641
+ //! // Declare, allocate, and initialize device-accessible pointers
642
+ //! // for input and output
643
+ //! int num_items; // e.g., 7
644
+ //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
645
+ //! int *d_out; // e.g., [-]
646
+ //! ...
647
+ //!
648
+ //! // Determine temporary device storage requirements
649
+ //! void *d_temp_storage = nullptr;
650
+ //! size_t temp_storage_bytes = 0;
651
+ //! cub::DeviceReduce::Sum(
652
+ //! d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
653
+ //!
654
+ //! // Allocate temporary storage
655
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
656
+ //!
657
+ //! // Run sum-reduction
658
+ //! cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
659
+ //!
660
+ //! // d_out <-- [38]
661
+ //!
662
+ //! @endrst
663
+ //!
664
+ //! @tparam InputIteratorT
665
+ //! **[inferred]** Random-access input iterator type for reading input items @iterator
666
+ //!
667
+ //! @tparam OutputIteratorT
668
+ //! **[inferred]** Output iterator type for recording the reduced aggregate @iterator
669
+ //!
670
+ //! @tparam NumItemsT
671
+ //! **[inferred]** Type of num_items
672
+ //!
673
+ //! @param[in] d_temp_storage
674
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
675
+ //! required allocation size is written to `temp_storage_bytes` and no work is done.
676
+ //!
677
+ //! @param[in,out] temp_storage_bytes
678
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
679
+ //!
680
+ //! @param[in] d_in
681
+ //! Pointer to the input sequence of data items
682
+ //!
683
+ //! @param[out] d_out
684
+ //! Pointer to the output aggregate
685
+ //!
686
+ //! @param[in] num_items
687
+ //! Total number of input items (i.e., length of `d_in`)
688
+ //!
689
+ //! @param[in] stream
690
+ //! @rst
691
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
692
+ //! @endrst
693
+ template <typename InputIteratorT, typename OutputIteratorT, typename NumItemsT>
694
+ CUB_RUNTIME_FUNCTION static cudaError_t
695
+ Sum(void* d_temp_storage,
696
+ size_t& temp_storage_bytes,
697
+ InputIteratorT d_in,
698
+ OutputIteratorT d_out,
699
+ NumItemsT num_items,
700
+ cudaStream_t stream = 0)
701
+ {
702
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceReduce::Sum"); // NOTE(review): presumably opens the NVTX range only when d_temp_storage is non-null - confirm against the macro
703
+
704
+ // Offset type used for global offsets, selected from NumItemsT by detail::choose_offset_t
705
+ using OffsetT = detail::choose_offset_t<NumItemsT>;
706
+
707
+ // The output value type (the input value type is passed as the alternative, presumably for output iterators whose value type is void)
708
+ using OutputT = cub::detail::non_void_value_t<OutputIteratorT, cub::detail::it_value_t<InputIteratorT>>;
709
+
710
+ using InitT = OutputT;
711
+ // num_items is converted to OffsetT; the reduction is cuda::std::plus<> starting from InitT{}
712
+ return detail::reduce::dispatch(
713
+ d_temp_storage,
714
+ temp_storage_bytes,
715
+ d_in,
716
+ d_out,
717
+ static_cast<OffsetT>(num_items),
718
+ ::cuda::std::plus<>{},
719
+ InitT{}, // value-initialized (zero for arithmetic types)
720
+ stream);
721
+ }
722
+
723
+ //! @rst
724
+ //! Computes a device-wide minimum using the less-than (``<``) operator.
725
+ //!
726
+ //! - Uses ``cuda::std::numeric_limits<T>::max()`` as the initial value of the reduction.
727
+ //! - Does not support ``<`` operators that are non-commutative.
728
+ //! - Provides "run-to-run" determinism for pseudo-associative reduction
729
+ //! (e.g., addition of floating point types) on the same GPU device.
730
+ //! However, results for pseudo-associative reduction may be inconsistent
731
+ //! from one device to another device of a different compute-capability
732
+ //! because CUB can employ different tile-sizing for different architectures.
733
+ //! - The range ``[d_in, d_in + num_items)`` shall not overlap ``d_out``.
734
+ //! - @devicestorage
735
+ //!
736
+ //! Snippet
737
+ //! +++++++++++++++++++++++++++++++++++++++++++++
738
+ //!
739
+ //! The code snippet below illustrates the min-reduction of a device vector of ``int`` data elements.
740
+ //!
741
+ //! .. code-block:: c++
742
+ //!
743
+ //! #include <cub/cub.cuh>
744
+ //! // or equivalently <cub/device/device_reduce.cuh>
745
+ //!
746
+ //! // Declare, allocate, and initialize device-accessible pointers
747
+ //! // for input and output
748
+ //! int num_items; // e.g., 7
749
+ //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
750
+ //! int *d_out; // e.g., [-]
751
+ //! ...
752
+ //!
753
+ //! // Determine temporary device storage requirements
754
+ //! void *d_temp_storage = nullptr;
755
+ //! size_t temp_storage_bytes = 0;
756
+ //! cub::DeviceReduce::Min(
757
+ //! d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
758
+ //!
759
+ //! // Allocate temporary storage
760
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
761
+ //!
762
+ //! // Run min-reduction
763
+ //! cub::DeviceReduce::Min(
764
+ //! d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
765
+ //!
766
+ //! // d_out <-- [0]
767
+ //!
768
+ //! @endrst
769
+ //!
770
+ //! @tparam InputIteratorT
771
+ //! **[inferred]** Random-access input iterator type for reading input items @iterator
772
+ //!
773
+ //! @tparam OutputIteratorT
774
+ //! **[inferred]** Output iterator type for recording the reduced aggregate @iterator
775
+ //!
776
+ //! @tparam NumItemsT
777
+ //! **[inferred]** Type of num_items
778
+ //!
779
+ //! @param[in] d_temp_storage
780
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
781
+ //! required allocation size is written to `temp_storage_bytes` and no work is done.
782
+ //!
783
+ //! @param[in,out] temp_storage_bytes
784
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
785
+ //!
786
+ //! @param[in] d_in
787
+ //! Pointer to the input sequence of data items
788
+ //!
789
+ //! @param[out] d_out
790
+ //! Pointer to the output aggregate
791
+ //!
792
+ //! @param[in] num_items
793
+ //! Total number of input items (i.e., length of ``d_in``)
794
+ //!
795
+ //! @param[in] stream
796
+ //! @rst
797
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
798
+ //! @endrst
799
+ template <typename InputIteratorT, typename OutputIteratorT, typename NumItemsT>
800
+ CUB_RUNTIME_FUNCTION static cudaError_t
801
+ Min(void* d_temp_storage,
802
+ size_t& temp_storage_bytes,
803
+ InputIteratorT d_in,
804
+ OutputIteratorT d_out,
805
+ NumItemsT num_items,
806
+ cudaStream_t stream = 0)
807
+ {
808
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceReduce::Min"); // NOTE(review): presumably opens the NVTX range only when d_temp_storage is non-null - confirm against the macro
809
+
810
+ using OffsetT = detail::choose_offset_t<NumItemsT>; // Offset type for global offsets, selected from NumItemsT
811
+ using InputT = detail::it_value_t<InputIteratorT>;
812
+ using InitT = InputT; // the initial value has the input iterator's value type
813
+ using limits_t = ::cuda::std::numeric_limits<InitT>;
814
+ #ifndef CCCL_SUPPRESS_NUMERIC_LIMITS_CHECK_IN_CUB_DEVICE_REDUCE_MIN_MAX
815
+ static_assert(limits_t::is_specialized,
816
+ "cub::DeviceReduce::Min uses cuda::std::numeric_limits<InputIteratorT::value_type>::max() as initial "
817
+ "value, but cuda::std::numeric_limits is not specialized for the iterator's value type. This is "
818
+ "probably a bug and you should specialize cuda::std::numeric_limits. Define "
819
+ "CCCL_SUPPRESS_NUMERIC_LIMITS_CHECK_IN_CUB_DEVICE_REDUCE_MIN_MAX to suppress this check.");
820
+ #endif // CCCL_SUPPRESS_NUMERIC_LIMITS_CHECK_IN_CUB_DEVICE_REDUCE_MIN_MAX
821
+ // num_items is converted to OffsetT; the reduction is cuda::minimum<> starting from limits_t::max()
822
+ return detail::reduce::dispatch(
823
+ d_temp_storage,
824
+ temp_storage_bytes,
825
+ d_in,
826
+ d_out,
827
+ static_cast<OffsetT>(num_items),
828
+ ::cuda::minimum<>{},
829
+ limits_t::max(), // initial value: cuda::std::numeric_limits<InitT>::max()
830
+ stream);
831
+ }
832
+
833
+ //! @rst
834
+ //! Computes a device-wide minimum using the less-than (``<``) operator. The result is written to the output
835
+ //! iterator.
836
+ //!
837
+ //! - Uses ``cuda::std::numeric_limits<T>::max()`` as the initial value of the reduction.
838
+ //! - Provides determinism based on the environment's determinism requirements.
839
+ //! To request "run-to-run" determinism, pass ``cuda::execution::require(cuda::execution::determinism::run_to_run)``
840
+ //! as the `env` parameter.
841
+ //! - The range ``[d_in, d_in + num_items)`` shall not overlap ``d_out``.
842
+ //!
843
+ //! Snippet
844
+ //! +++++++++++++++++++++++++++++++++++++++++++++
845
+ //!
846
+ //! The code snippet below illustrates the min-reduction of a device vector of ``int`` data elements.
847
+ //!
848
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_reduce_env_api.cu
849
+ //! :language: c++
850
+ //! :dedent:
851
+ //! :start-after: example-begin min-env-determinism
852
+ //! :end-before: example-end min-env-determinism
853
+ //!
854
+ //! @endrst
855
+ //!
856
+ //! @tparam InputIteratorT
857
+ //! **[inferred]** Random-access input iterator type for reading input items @iterator
858
+ //!
859
+ //! @tparam OutputIteratorT
860
+ //! **[inferred]** Output iterator type for recording the reduced aggregate @iterator
861
+ //!
862
+ //! @tparam NumItemsT
863
+ //! **[inferred]** Type of num_items
864
+ //!
865
+ //! @tparam EnvT
866
+ //! **[inferred]** Execution environment type. Default is `cuda::std::execution::env<>`.
867
+ //!
868
+ //! @param[in] d_in
869
+ //! Pointer to the input sequence of data items
870
+ //!
871
+ //! @param[out] d_out
872
+ //! Pointer to the output aggregate
873
+ //!
874
+ //! @param[in] num_items
875
+ //! Total number of input items (i.e., length of ``d_in``)
876
+ //!
877
+ //! @param[in] env
878
+ //! @rst
879
+ //! **[optional]** Execution environment. Default is `cuda::std::execution::env{}`.
880
+ //! @endrst
881
+ template <typename InputIteratorT,
882
+ typename OutputIteratorT,
883
+ typename NumItemsT,
884
+ typename EnvT = ::cuda::std::execution::env<>>
885
+ [[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t
886
+ Min(InputIteratorT d_in, OutputIteratorT d_out, NumItemsT num_items, EnvT env = {})
887
+ {
888
+ _CCCL_NVTX_RANGE_SCOPE("cub::DeviceReduce::Min");
889
+ // Reject environments that expose the determinism query directly; it is only read from the requirements below
890
+ static_assert(!::cuda::std::execution::__queryable_with<EnvT, ::cuda::execution::determinism::__get_determinism_t>,
891
+ "Determinism should be used inside requires to have an effect.");
892
+ using requirements_t = ::cuda::std::execution::
893
+ __query_result_or_t<EnvT, ::cuda::execution::__get_requirements_t, ::cuda::std::execution::env<>>;
894
+ using requested_determinism_t =
895
+ ::cuda::std::execution::__query_result_or_t<requirements_t, //
896
+ ::cuda::execution::determinism::__get_determinism_t,
897
+ ::cuda::execution::determinism::run_to_run_t>;
898
+
899
+ // Static assert to reject gpu_to_gpu determinism since it's not properly implemented
900
+ static_assert(!::cuda::std::is_same_v<requested_determinism_t, ::cuda::execution::determinism::gpu_to_gpu_t>,
901
+ "gpu_to_gpu determinism is not supported");
902
+
903
+ // TODO(NaderAlAwar): Relax this once non-deterministic implementation for min / max is available
904
+ using determinism_t = ::cuda::execution::determinism::run_to_run_t; // every request that passes the static_assert above (including not_guaranteed) is dispatched with the run_to_run_t tag
905
+
906
+ using OutputT = cub::detail::non_void_value_t<OutputIteratorT, cub::detail::it_value_t<InputIteratorT>>;
907
+
908
+ using InitT = OutputT; // NOTE(review): the temporary-storage Min overload uses the input value type for InitT and static_asserts limits_t::is_specialized; neither is done here - confirm
909
+ using limits_t = ::cuda::std::numeric_limits<InitT>;
910
+ // Dispatch with environment: the callback receives the tuning, temporary storage, its size and the stream,
// and forwards them to reduce_impl with cuda::minimum<>, an identity transform and limits_t::max() as initial value
911
+ return detail::dispatch_with_env(env, [&]([[maybe_unused]] auto tuning, void* storage, size_t& bytes, auto stream) {
912
+ using tuning_t = decltype(tuning);
913
+ return reduce_impl<tuning_t>(
914
+ storage,
915
+ bytes,
916
+ d_in,
917
+ d_out,
918
+ num_items,
919
+ ::cuda::minimum<>{},
920
+ ::cuda::std::identity{},
921
+ limits_t::max(),
922
+ determinism_t{},
923
+ stream);
924
+ });
925
+ }
926
+
927
+ //! @rst
928
+ //! Finds the first device-wide minimum using the less-than (``<``) operator and also returns the index of that item.
929
+ //!
930
+ //! - The minimum is written to ``d_min_out``
931
+ //! - The offset of the returned item is written to ``d_index_out``, the offset type being written is of type
932
+ //! ``cuda::std::int64_t``.
933
+ //! - For zero-length inputs, ``cuda::std::numeric_limits<T>::max()`` is written to ``d_min_out`` and the index
934
+ //! ``1`` is written to ``d_index_out``.
935
+ //! - Does not support ``<`` operators that are non-commutative.
936
+ //! - Provides "run-to-run" determinism for pseudo-associative reduction
937
+ //! (e.g., addition of floating point types) on the same GPU device.
938
+ //! However, results for pseudo-associative reduction may be inconsistent
939
+ //! from one device to another device of a different compute-capability
940
+ //! because CUB can employ different tile-sizing for different architectures.
941
+ //! - The range ``[d_in, d_in + num_items)`` shall not overlap ``d_min_out`` nor ``d_index_out``.
942
+ //! - @devicestorage
943
+ //!
944
+ //! Snippet
945
+ //! +++++++++++++++++++++++++++++++++++++++++++++
946
+ //!
947
+ //! The code snippet below illustrates the argmin-reduction of a device vector
948
+ //! of ``int`` data elements.
949
+ //!
950
+ //! .. code-block:: c++
951
+ //!
952
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_reduce.cuh>
953
+ //! #include <cuda/std/cstdint>
954
+ //!
955
+ //! // Declare, allocate, and initialize device-accessible pointers
956
+ //! // for input and output
957
+ //! int num_items; // e.g., 7
958
+ //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
959
+ //! int *d_min_out; // memory for the minimum value
960
+ //! cuda::std::int64_t *d_index_out; // memory for the index of the returned value
961
+ //! ...
962
+ //!
963
+ //! // Determine temporary device storage requirements
964
+ //! void *d_temp_storage = nullptr;
965
+ //! size_t temp_storage_bytes = 0;
966
+ //! cub::DeviceReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_min_out, d_index_out,
967
+ //! num_items);
968
+ //!
969
+ //! // Allocate temporary storage
970
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
971
+ //!
972
+ //! // Run argmin-reduction
973
+ //! cub::DeviceReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_min_out, d_index_out,
974
+ //! num_items);
975
+ //!
976
+ //! // d_min_out <-- 0
977
+ //! // d_index_out <-- 5
978
+ //!
979
+ //! @endrst
980
+ //!
981
+ //! @tparam InputIteratorT
982
+ //! **[inferred]** Random-access input iterator type for reading input items
983
+ //! (of some type `T`) @iterator
984
+ //!
985
+ //! @tparam ExtremumOutIteratorT
986
+ //! **[inferred]** Output iterator type for recording minimum value
987
+ //!
988
+ //! @tparam IndexOutIteratorT
989
+ //! **[inferred]** Output iterator type for recording index of the returned value
990
+ //!
991
+ //! @param[in] d_temp_storage
992
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
993
+ //! required allocation size is written to `temp_storage_bytes` and no work is done.
994
+ //!
995
+ //! @param[in,out] temp_storage_bytes
996
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
997
+ //!
998
+ //! @param[in] d_in
999
+ //! Iterator to the input sequence of data items
1000
+ //!
1001
+ //! @param[out] d_min_out
1002
+ //! Iterator to which the minimum value is written
1003
+ //!
1004
+ //! @param[out] d_index_out
1005
+ //! Iterator to which the index of the returned value is written
1006
+ //!
1007
+ //! @param[in] num_items
1008
+ //! Total number of input items (i.e., length of ``d_in``)
1009
+ //!
1010
+ //! @param[in] stream
1011
+ //! @rst
1012
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
1013
+ //! @endrst
1014
+ template <typename InputIteratorT, typename ExtremumOutIteratorT, typename IndexOutIteratorT>
1015
+ CUB_RUNTIME_FUNCTION static cudaError_t ArgMin(
1016
+ void* d_temp_storage,
1017
+ size_t& temp_storage_bytes,
1018
+ InputIteratorT d_in,
1019
+ ExtremumOutIteratorT d_min_out,
1020
+ IndexOutIteratorT d_index_out,
1021
+ ::cuda::std::int64_t num_items,
1022
+ cudaStream_t stream = 0)
1023
+ {
1024
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceReduce::ArgMin");
1025
+
1026
+ // The input type
1027
+ using InputValueT = cub::detail::it_value_t<InputIteratorT>;
1028
+
1029
+ // Offset type used within the kernel and to index within one partition
1030
+ using PerPartitionOffsetT = int;
1031
+
1032
+ // Offset type used to index within the total input in the range [d_in, d_in + num_items)
1033
+ using GlobalOffsetT = ::cuda::std::int64_t;
1034
+
1035
+ // The value type used for the extremum
1036
+ using OutputExtremumT = detail::non_void_value_t<ExtremumOutIteratorT, InputValueT>;
1037
+ using InitT = OutputExtremumT;
1038
+
1039
+ // Reduction operation
1040
+ using ReduceOpT = cub::ArgMin;
1041
+
1042
+ // Initial value
1043
+ OutputExtremumT initial_value{::cuda::std::numeric_limits<InputValueT>::max()};
1044
+
1045
+ // Tabulate output iterator that unzips the result and writes it to the user-provided output iterators
1046
+ auto out_it = ::cuda::make_tabulate_output_iterator(
1047
+ detail::reduce::unzip_and_write_arg_extremum_op<ExtremumOutIteratorT, IndexOutIteratorT>{d_min_out, d_index_out});
1048
+
1049
+ return detail::reduce::dispatch_streaming_arg_reduce_t<
1050
+ InputIteratorT,
1051
+ decltype(out_it),
1052
+ PerPartitionOffsetT,
1053
+ GlobalOffsetT,
1054
+ ReduceOpT,
1055
+ InitT>::Dispatch(d_temp_storage,
1056
+ temp_storage_bytes,
1057
+ d_in,
1058
+ out_it,
1059
+ static_cast<GlobalOffsetT>(num_items),
1060
+ ReduceOpT{},
1061
+ initial_value,
1062
+ stream);
1063
+ }
1064
+
1065
+ //! @rst
1066
+ //! Finds the first device-wide minimum using the less-than (``<``) operator and also returns the index of that item.
1067
+ //!
1068
+ //! - The minimum is written to ``d_min_out``
1069
+ //! - The offset of the returned item is written to ``d_index_out``, the offset type being written is of type
1070
+ //! ``cuda::std::int64_t``.
1071
+ //! - For zero-length inputs, ``cuda::std::numeric_limits<T>::max()`` is written to ``d_min_out`` and the index
1072
+ //! ``1`` is written to ``d_index_out``.
1073
+ //! - Does not support ``<`` operators that are non-commutative.
1074
+ //! - Provides determinism based on the environment's determinism requirements.
1075
+ //! To request "run-to-run" determinism, pass ``cuda::execution::require(cuda::execution::determinism::run_to_run)``
1076
+ //! as the `env` parameter.
1077
+ //! - The range ``[d_in, d_in + num_items)`` shall not overlap ``d_min_out`` nor ``d_index_out``.
1078
+ //!
1079
+ //! Snippet
1080
+ //! +++++++++++++++++++++++++++++++++++++++++++++
1081
+ //!
1082
+ //! The code snippet below illustrates the argmin-reduction of a device vector of ``int`` data elements.
1083
+ //!
1084
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_reduce_env_api.cu
1085
+ //! :language: c++
1086
+ //! :dedent:
1087
+ //! :start-after: example-begin argmin-env-determinism
1088
+ //! :end-before: example-end argmin-env-determinism
1089
+ //!
1090
+ //! @endrst
1091
+ //!
1092
+ //! @tparam InputIteratorT
1093
+ //! **[inferred]** Random-access input iterator type for reading input items
1094
+ //! (of some type `T`) @iterator
1095
+ //!
1096
+ //! @tparam ExtremumOutIteratorT
1097
+ //! **[inferred]** Output iterator type for recording minimum value
1098
+ //!
1099
+ //! @tparam IndexOutIteratorT
1100
+ //! **[inferred]** Output iterator type for recording index of the returned value
1101
+ //!
1102
+ //! @tparam EnvT
1103
+ //! **[inferred]** Execution environment type. Default is ``cuda::std::execution::env<>``.
1104
+ //!
1105
+ //! @param[in] d_in
1106
+ //! Iterator to the input sequence of data items
1107
+ //!
1108
+ //! @param[out] d_min_out
1109
+ //! Iterator to which the minimum value is written
1110
+ //!
1111
+ //! @param[out] d_index_out
1112
+ //! Iterator to which the index of the returned value is written
1113
+ //!
1114
+ //! @param[in] num_items
1115
+ //! Total number of input items (i.e., length of ``d_in``)
1116
+ //!
1117
+ //! @param[in] env
1118
+ //! @rst
1119
+ //! **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``.
1120
+ //! @endrst
1121
+ template <typename InputIteratorT,
1122
+ typename ExtremumOutIteratorT,
1123
+ typename IndexOutIteratorT,
1124
+ typename EnvT = ::cuda::std::execution::env<>>
1125
+ [[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t
1126
+ ArgMin(InputIteratorT d_in,
1127
+ ExtremumOutIteratorT d_min_out,
1128
+ IndexOutIteratorT d_index_out,
1129
+ ::cuda::std::int64_t num_items,
1130
+ EnvT env = {})
1131
+ {
1132
+ _CCCL_NVTX_RANGE_SCOPE("cub::DeviceReduce::ArgMin");
1133
+
1134
+ static_assert(!::cuda::std::execution::__queryable_with<EnvT, ::cuda::execution::determinism::__get_determinism_t>,
1135
+ "Determinism should be used inside requires to have an effect.");
1136
+ using requirements_t = ::cuda::std::execution::
1137
+ __query_result_or_t<EnvT, ::cuda::execution::__get_requirements_t, ::cuda::std::execution::env<>>;
1138
+ using requested_determinism_t =
1139
+ ::cuda::std::execution::__query_result_or_t<requirements_t, //
1140
+ ::cuda::execution::determinism::__get_determinism_t,
1141
+ ::cuda::execution::determinism::run_to_run_t>;
1142
+
1143
+ // Static assert to reject gpu_to_gpu determinism since it's not properly implemented
1144
+ static_assert(!::cuda::std::is_same_v<requested_determinism_t, ::cuda::execution::determinism::gpu_to_gpu_t>,
1145
+ "gpu_to_gpu determinism is not supported");
1146
+
1147
+ // Query relevant properties from the environment
1148
+ auto stream = ::cuda::std::execution::__query_or(env, ::cuda::get_stream, ::cuda::stream_ref{cudaStream_t{}});
1149
+ auto mr =
1150
+ ::cuda::std::execution::__query_or(env, ::cuda::mr::__get_memory_resource, detail::device_memory_resource{});
1151
+
1152
+ void* d_temp_storage = nullptr;
1153
+ size_t temp_storage_bytes = 0;
1154
+
1155
+ using tuning_t =
1156
+ ::cuda::std::execution::__query_result_or_t<EnvT, ::cuda::execution::__get_tuning_t, ::cuda::std::execution::env<>>;
1157
+
1158
+ // Reduction operation
1159
+ using ReduceOpT = cub::ArgMin;
1160
+ using InputValueT = cub::detail::it_value_t<InputIteratorT>;
1161
+ using PerPartitionOffsetT = int;
1162
+ using GlobalOffsetT = ::cuda::std::int64_t;
1163
+
1164
+ using OutputExtremumT = detail::non_void_value_t<ExtremumOutIteratorT, InputValueT>;
1165
+ using InitT = OutputExtremumT;
1166
+
1167
+ // Initial value
1168
+ OutputExtremumT initial_value{::cuda::std::numeric_limits<InputValueT>::max()};
1169
+
1170
+ // Tabulate output iterator that unzips the result and writes it to the user-provided output iterators
1171
+ auto out_it = ::cuda::make_tabulate_output_iterator(
1172
+ detail::reduce::unzip_and_write_arg_extremum_op<ExtremumOutIteratorT, IndexOutIteratorT>{d_min_out, d_index_out});
1173
+
1174
+ // Query the required temporary storage size
1175
+ cudaError_t error = detail::reduce::dispatch_streaming_arg_reduce_t<
1176
+ InputIteratorT,
1177
+ decltype(out_it),
1178
+ PerPartitionOffsetT,
1179
+ GlobalOffsetT,
1180
+ ReduceOpT,
1181
+ InitT>::Dispatch(d_temp_storage,
1182
+ temp_storage_bytes,
1183
+ d_in,
1184
+ out_it,
1185
+ static_cast<GlobalOffsetT>(num_items),
1186
+ ReduceOpT{},
1187
+ initial_value,
1188
+ stream.get());
1189
+ if (error != cudaSuccess)
1190
+ {
1191
+ return error;
1192
+ }
1193
+
1194
+ // TODO(gevtushenko): use uninitialized buffer when it's available
1195
+ error = CubDebug(detail::temporary_storage::allocate(stream, d_temp_storage, temp_storage_bytes, mr));
1196
+ if (error != cudaSuccess)
1197
+ {
1198
+ return error;
1199
+ }
1200
+
1201
+ // Run the algorithm
1202
+ error = detail::reduce::dispatch_streaming_arg_reduce_t<
1203
+ InputIteratorT,
1204
+ decltype(out_it),
1205
+ PerPartitionOffsetT,
1206
+ GlobalOffsetT,
1207
+ ReduceOpT,
1208
+ InitT>::Dispatch(d_temp_storage,
1209
+ temp_storage_bytes,
1210
+ d_in,
1211
+ out_it,
1212
+ static_cast<GlobalOffsetT>(num_items),
1213
+ ReduceOpT{},
1214
+ initial_value,
1215
+ stream.get());
1216
+
1217
+ // Try to deallocate regardless of the error to avoid memory leaks
1218
+ cudaError_t deallocate_error =
1219
+ CubDebug(detail::temporary_storage::deallocate(stream, d_temp_storage, temp_storage_bytes, mr));
1220
+
1221
+ if (error != cudaSuccess)
1222
+ {
1223
+ // Reduction error takes precedence over deallocation error since it happens first
1224
+ return error;
1225
+ }
1226
+
1227
+ return deallocate_error;
1228
+ }
1229
+
1230
+ //! @rst
1231
+ //! Finds the first device-wide minimum using the less-than (``<``) operator, also returning the index of that item.
1232
+ //!
1233
+ //! - The output value type of ``d_out`` is ``cub::KeyValuePair<int, T>``
1234
+ //! (assuming the value type of ``d_in`` is ``T``)
1235
+ //!
1236
+ //! - The minimum is written to ``d_out.value`` and its offset in the input array is written to ``d_out.key``.
1237
+ //! - The ``{1, cuda::std::numeric_limits<T>::max()}`` tuple is produced for zero-length inputs
1238
+ //!
1239
+ //! - Does not support ``<`` operators that are non-commutative.
1240
+ //! - Provides "run-to-run" determinism for pseudo-associative reduction
1241
+ //! (e.g., addition of floating point types) on the same GPU device.
1242
+ //! However, results for pseudo-associative reduction may be inconsistent
1243
+ //! from one device to another device of a different compute-capability
1244
+ //! because CUB can employ different tile-sizing for different architectures.
1245
+ //! - The range ``[d_in, d_in + num_items)`` shall not overlap ``d_out``.
1246
+ //! - @devicestorage
1247
+ //!
1248
+ //! Snippet
1249
+ //! +++++++++++++++++++++++++++++++++++++++++++++
1250
+ //!
1251
+ //! The code snippet below illustrates the argmin-reduction of a device vector
1252
+ //! of ``int`` data elements.
1253
+ //!
1254
+ //! .. code-block:: c++
1255
+ //!
1256
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_reduce.cuh>
1257
+ //!
1258
+ //! // Declare, allocate, and initialize device-accessible pointers
1259
+ //! // for input and output
1260
+ //! int num_items; // e.g., 7
1261
+ //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
1262
+ //! KeyValuePair<int, int> *d_argmin; // e.g., [{-,-}]
1263
+ //! ...
1264
+ //!
1265
+ //! // Determine temporary device storage requirements
1266
+ //! void *d_temp_storage = nullptr;
1267
+ //! size_t temp_storage_bytes = 0;
1268
+ //! cub::DeviceReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_argmin, num_items);
1269
+ //!
1270
+ //! // Allocate temporary storage
1271
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
1272
+ //!
1273
+ //! // Run argmin-reduction
1274
+ //! cub::DeviceReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_argmin, num_items);
1275
+ //!
1276
+ //! // d_argmin <-- [{5, 0}]
1277
+ //!
1278
+ //! @endrst
1279
+ //!
1280
+ //! @tparam InputIteratorT
1281
+ //! **[inferred]** Random-access input iterator type for reading input items
1282
+ //! (of some type `T`) @iterator
1283
+ //!
1284
+ //! @tparam OutputIteratorT
1285
+ //! **[inferred]** Output iterator type for recording the reduced aggregate
1286
+ //! (having value type ``cub::KeyValuePair<int, T>``) @iterator
1287
+ //!
1288
+ //! @param[in] d_temp_storage
1289
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
1290
+ //! required allocation size is written to `temp_storage_bytes` and no work is done.
1291
+ //!
1292
+ //! @param[in,out] temp_storage_bytes
1293
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
1294
+ //!
1295
+ //! @param[in] d_in
1296
+ //! Pointer to the input sequence of data items
1297
+ //!
1298
+ //! @param[out] d_out
1299
+ //! Pointer to the output aggregate
1300
+ //!
1301
+ //! @param[in] num_items
1302
+ //! Total number of input items (i.e., length of ``d_in``)
1303
+ //!
1304
+ //! @param[in] stream
1305
+ //! @rst
1306
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
1307
+ //! @endrst
1308
+ template <typename InputIteratorT, typename OutputIteratorT>
1309
+ CCCL_DEPRECATED_BECAUSE("CUB has superseded this interface in favor of the ArgMin interface that takes two separate "
1310
+ "iterators: one iterator to which the extremum is written and another iterator to which the "
1311
+ "index of the found extremum is written. ") CUB_RUNTIME_FUNCTION static cudaError_t
1312
+ ArgMin(void* d_temp_storage,
1313
+ size_t& temp_storage_bytes,
1314
+ InputIteratorT d_in,
1315
+ OutputIteratorT d_out,
1316
+ int num_items,
1317
+ cudaStream_t stream = 0)
1318
+ {
1319
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceReduce::ArgMin");
1320
+
1321
+ // Signed integer type for global offsets
1322
+ using OffsetT = int;
1323
+
1324
+ // The input type
1325
+ using InputValueT = cub::detail::it_value_t<InputIteratorT>;
1326
+
1327
+ // The output tuple type
1328
+ using OutputTupleT = cub::detail::non_void_value_t<OutputIteratorT, KeyValuePair<OffsetT, InputValueT>>;
1329
+
1330
+ using AccumT = OutputTupleT;
1331
+
1332
+ using InitT = detail::reduce::empty_problem_init_t<AccumT>;
1333
+
1334
+ // The output value type
1335
+ using OutputValueT = typename OutputTupleT::Value;
1336
+
1337
+ // Wrapped input iterator to produce index-value <OffsetT, InputT> tuples
1338
+ using ArgIndexInputIteratorT = ArgIndexInputIterator<InputIteratorT, OffsetT, OutputValueT>;
1339
+
1340
+ ArgIndexInputIteratorT d_indexed_in(d_in);
1341
+
1342
+ // Initial value
1343
+ InitT initial_value{AccumT(1, ::cuda::std::numeric_limits<InputValueT>::max())};
1344
+
1345
+ return detail::reduce::dispatch<AccumT>(
1346
+ d_temp_storage, temp_storage_bytes, d_indexed_in, d_out, OffsetT{num_items}, cub::ArgMin(), initial_value, stream);
1347
+ }
1348
+
1349
+ //! @rst
1350
+ //! Computes a device-wide maximum using the greater-than (``>``) operator.
1351
+ //!
1352
+ //! - Uses ``cuda::std::numeric_limits<T>::lowest()`` as the initial value of the reduction.
1353
+ //! - Does not support ``>`` operators that are non-commutative.
1354
+ //! - Provides "run-to-run" determinism for pseudo-associative reduction
1355
+ //! (e.g., addition of floating point types) on the same GPU device.
1356
+ //! However, results for pseudo-associative reduction may be inconsistent
1357
+ //! from one device to another device of a different compute-capability
1358
+ //! because CUB can employ different tile-sizing for different architectures.
1359
+ //! - The range ``[d_in, d_in + num_items)`` shall not overlap ``d_out``.
1360
+ //! - @devicestorage
1361
+ //!
1362
+ //! Snippet
1363
+ //! +++++++++++++++++++++++++++++++++++++++++++++
1364
+ //!
1365
+ //! The code snippet below illustrates the max-reduction of a device vector of ``int`` data elements.
1366
+ //!
1367
+ //! .. code-block:: c++
1368
+ //!
1369
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_reduce.cuh>
1370
+ //!
1371
+ //! // Declare, allocate, and initialize device-accessible pointers
1372
+ //! // for input and output
1373
+ //! int num_items; // e.g., 7
1374
+ //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
1375
+ //! int *d_max; // e.g., [-]
1376
+ //! ...
1377
+ //!
1378
+ //! // Determine temporary device storage requirements
1379
+ //! void *d_temp_storage = nullptr;
1380
+ //! size_t temp_storage_bytes = 0;
1381
+ //! cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_max, num_items);
1382
+ //!
1383
+ //! // Allocate temporary storage
1384
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
1385
+ //!
1386
+ //! // Run max-reduction
1387
+ //! cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_max, num_items);
1388
+ //!
1389
+ //! // d_max <-- [9]
1390
+ //!
1391
+ //! @endrst
1392
+ //!
1393
+ //! @tparam InputIteratorT
1394
+ //! **[inferred]** Random-access input iterator type for reading input items @iterator
1395
+ //!
1396
+ //! @tparam OutputIteratorT
1397
+ //! **[inferred]** Output iterator type for recording the reduced aggregate @iterator
1398
+ //!
1399
+ //! @tparam NumItemsT
1400
+ //! **[inferred]** Type of num_items
1401
+ //!
1402
+ //! @param[in] d_temp_storage
1403
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
1404
+ //! required allocation size is written to `temp_storage_bytes` and no work is done.
1405
+ //!
1406
+ //! @param[in,out] temp_storage_bytes
1407
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
1408
+ //!
1409
+ //! @param[in] d_in
1410
+ //! Pointer to the input sequence of data items
1411
+ //!
1412
+ //! @param[out] d_out
1413
+ //! Pointer to the output aggregate
1414
+ //!
1415
+ //! @param[in] num_items
1416
+ //! Total number of input items (i.e., length of ``d_in``)
1417
+ //!
1418
+ //! @param[in] stream
1419
+ //! @rst
1420
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
1421
+ //! @endrst
1422
+ template <typename InputIteratorT, typename OutputIteratorT, typename NumItemsT>
1423
+ CUB_RUNTIME_FUNCTION static cudaError_t
1424
+ Max(void* d_temp_storage,
1425
+ size_t& temp_storage_bytes,
1426
+ InputIteratorT d_in,
1427
+ OutputIteratorT d_out,
1428
+ NumItemsT num_items,
1429
+ cudaStream_t stream = 0)
1430
+ {
1431
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceReduce::Max");
1432
+
1433
+ // Signed integer type for global offsets
1434
+ using OffsetT = detail::choose_offset_t<NumItemsT>;
1435
+ using InputT = detail::it_value_t<InputIteratorT>;
1436
+ using InitT = InputT;
1437
+ using limits_t = ::cuda::std::numeric_limits<InitT>;
1438
+ #ifndef CCCL_SUPPRESS_NUMERIC_LIMITS_CHECK_IN_CUB_DEVICE_REDUCE_MIN_MAX
1439
+ static_assert(limits_t::is_specialized,
1440
+ "cub::DeviceReduce::Max uses cuda::std::numeric_limits<InputIteratorT::value_type>::lowest() as "
1441
+ "initial value, but cuda::std::numeric_limits is not specialized for the iterator's value type. This "
1442
+ "is probably a bug and you should specialize cuda::std::numeric_limits. Define "
1443
+ "CCCL_SUPPRESS_NUMERIC_LIMITS_CHECK_IN_CUB_DEVICE_REDUCE_MIN_MAX to suppress this check.");
1444
+ #endif // CCCL_SUPPRESS_NUMERIC_LIMITS_CHECK_IN_CUB_DEVICE_REDUCE_MIN_MAX
1445
+
1446
+ return detail::reduce::dispatch(
1447
+ d_temp_storage,
1448
+ temp_storage_bytes,
1449
+ d_in,
1450
+ d_out,
1451
+ static_cast<OffsetT>(num_items),
1452
+ ::cuda::maximum<>{},
1453
+ limits_t::lowest(),
1454
+ stream);
1455
+ }
1456
+
1457
+ //! @rst
1458
+ //! Computes a device-wide maximum using the greater-than (``>``) operator. The result is written to the output
1459
+ //! iterator.
1460
+ //!
1461
+ //! - Uses ``cuda::std::numeric_limits<T>::lowest()`` as the initial value of the reduction.
1462
+ //! - Provides determinism based on the environment's determinism requirements.
1463
+ //! To request "run-to-run" determinism, pass ``cuda::execution::require(cuda::execution::determinism::run_to_run)``
1464
+ //! as the `env` parameter.
1465
+ //! - The range ``[d_in, d_in + num_items)`` shall not overlap ``d_out``.
1466
+ //!
1467
+ //! Snippet
1468
+ //! +++++++++++++++++++++++++++++++++++++++++++++
1469
+ //!
1470
+ //! The code snippet below illustrates the max-reduction of a device vector of ``int`` data elements.
1471
+ //!
1472
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_reduce_env_api.cu
1473
+ //! :language: c++
1474
+ //! :dedent:
1475
+ //! :start-after: example-begin max-env-determinism
1476
+ //! :end-before: example-end max-env-determinism
1477
+ //!
1478
+ //! @endrst
1479
+ //!
1480
+ //! @tparam InputIteratorT
1481
+ //! **[inferred]** Random-access input iterator type for reading input items @iterator
1482
+ //!
1483
+ //! @tparam OutputIteratorT
1484
+ //! **[inferred]** Output iterator type for recording the reduced aggregate @iterator
1485
+ //!
1486
+ //! @tparam NumItemsT
1487
+ //! **[inferred]** Type of num_items
1488
+ //!
1489
+ //! @tparam EnvT
1490
+ //! **[inferred]** Execution environment type. Default is `cuda::std::execution::env<>`.
1491
+ //!
1492
+ //! @param[in] d_in
1493
+ //! Pointer to the input sequence of data items
1494
+ //!
1495
+ //! @param[out] d_out
1496
+ //! Pointer to the output aggregate
1497
+ //!
1498
+ //! @param[in] num_items
1499
+ //! Total number of input items (i.e., length of ``d_in``)
1500
+ //!
1501
+ //! @param[in] env
1502
+ //! @rst
1503
+ //! **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``.
1504
+ //! @endrst
1505
+ template <typename InputIteratorT,
1506
+ typename OutputIteratorT,
1507
+ typename NumItemsT,
1508
+ typename EnvT = ::cuda::std::execution::env<>>
1509
+ [[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t
1510
+ Max(InputIteratorT d_in, OutputIteratorT d_out, NumItemsT num_items, EnvT env = {})
1511
+ {
1512
+ _CCCL_NVTX_RANGE_SCOPE("cub::DeviceReduce::Max");
1513
+
1514
+ static_assert(!::cuda::std::execution::__queryable_with<EnvT, ::cuda::execution::determinism::__get_determinism_t>,
1515
+ "Determinism should be used inside requires to have an effect.");
1516
+ using requirements_t = ::cuda::std::execution::
1517
+ __query_result_or_t<EnvT, ::cuda::execution::__get_requirements_t, ::cuda::std::execution::env<>>;
1518
+ using requested_determinism_t =
1519
+ ::cuda::std::execution::__query_result_or_t<requirements_t, //
1520
+ ::cuda::execution::determinism::__get_determinism_t,
1521
+ ::cuda::execution::determinism::run_to_run_t>;
1522
+
1523
+ // Static assert to reject gpu_to_gpu determinism since it's not properly implemented
1524
+ static_assert(!::cuda::std::is_same_v<requested_determinism_t, ::cuda::execution::determinism::gpu_to_gpu_t>,
1525
+ "gpu_to_gpu determinism is not supported");
1526
+
1527
+ // TODO(NaderAlAwar): Relax this once non-deterministic implementation for min / max is available
1528
+ using determinism_t = ::cuda::execution::determinism::run_to_run_t;
1529
+
1530
+ using OutputT = cub::detail::non_void_value_t<OutputIteratorT, cub::detail::it_value_t<InputIteratorT>>;
1531
+
1532
+ using InitT = OutputT;
1533
+ using limits_t = ::cuda::std::numeric_limits<InitT>;
1534
+
1535
+ // Dispatch with environment - handles all boilerplate
1536
+ return detail::dispatch_with_env(env, [&]([[maybe_unused]] auto tuning, void* storage, size_t& bytes, auto stream) {
1537
+ using tuning_t = decltype(tuning);
1538
+ return reduce_impl<tuning_t>(
1539
+ storage,
1540
+ bytes,
1541
+ d_in,
1542
+ d_out,
1543
+ num_items,
1544
+ ::cuda::maximum<>{},
1545
+ ::cuda::std::identity{},
1546
+ limits_t::lowest(),
1547
+ determinism_t{},
1548
+ stream);
1549
+ });
1550
+ }
1551
+
1552
+ //! @rst
1553
+ //! Finds the first device-wide maximum using the greater-than (``>``) operator and also returns the index of that
1554
+ //! item.
1555
+ //!
1556
+ //! - The maximum is written to ``d_max_out``
1557
+ //! - The offset of the returned item is written to ``d_index_out``, the offset type being written is of type
1558
+ //! ``cuda::std::int64_t``.
1559
+ //! - For zero-length inputs, ``cuda::std::numeric_limits<T>::lowest()`` is written to ``d_max_out`` and the index
1560
+ //! ``1`` is written to ``d_index_out``.
1561
+ //! - Does not support ``>`` operators that are non-commutative.
1562
+ //! - Provides "run-to-run" determinism for pseudo-associative reduction
1563
+ //! (e.g., addition of floating point types) on the same GPU device.
1564
+ //! However, results for pseudo-associative reduction may be inconsistent
1565
+ //! from one device to another device of a different compute-capability
1566
+ //! because CUB can employ different tile-sizing for different architectures.
1567
+ //! - The range ``[d_in, d_in + num_items)`` shall not overlap ``d_max_out`` nor ``d_index_out``.
1568
+ //! - @devicestorage
1569
+ //!
1570
+ //! Snippet
1571
+ //! +++++++++++++++++++++++++++++++++++++++++++++
1572
+ //!
1573
+ //! The code snippet below illustrates the argmax-reduction of a device vector
1574
+ //! of `int` data elements.
1575
+ //!
1576
+ //! .. code-block:: c++
1577
+ //!
1578
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_reduce.cuh>
1579
+ //! #include <cuda/std/cstdint>
1580
+ //!
1581
+ //! // Declare, allocate, and initialize device-accessible pointers
1582
+ //! // for input and output
1583
+ //! int num_items; // e.g., 7
1584
+ //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
1585
+ //! int *d_max_out; // memory for the maximum value
1586
+ //! cuda::std::int64_t *d_index_out; // memory for the index of the returned value
1587
+ //! ...
1588
+ //!
1589
+ //! // Determine temporary device storage requirements
1590
+ //! void *d_temp_storage = nullptr;
1591
+ //! size_t temp_storage_bytes = 0;
1592
+ //! cub::DeviceReduce::ArgMax(
1593
+ //! d_temp_storage, temp_storage_bytes, d_in, d_max_out, d_index_out, num_items);
1594
+ //!
1595
+ //! // Allocate temporary storage
1596
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
1597
+ //!
1598
+ //! // Run argmax-reduction
1599
+ //! cub::DeviceReduce::ArgMax(
1600
+ //! d_temp_storage, temp_storage_bytes, d_in, d_max_out, d_index_out, num_items);
1601
+ //!
1602
+ //! // d_max_out <-- 9
1603
+ //! // d_index_out <-- 6
1604
+ //!
1605
+ //! @endrst
1606
+ //!
1607
+ //! @tparam InputIteratorT
1608
+ //! **[inferred]** Random-access input iterator type for reading input items (of some type `T`) @iterator
1609
+ //!
1610
+ //! @tparam ExtremumOutIteratorT
1611
+ //! **[inferred]** Output iterator type for recording maximum value
1612
+ //!
1613
+ //! @tparam IndexOutIteratorT
1614
+ //! **[inferred]** Output iterator type for recording index of the returned value
1615
+ //!
1616
+ //! @param[in] d_temp_storage
1617
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
1618
+ //! required allocation size is written to `temp_storage_bytes` and no work is done.
1619
+ //!
1620
+ //! @param[in,out] temp_storage_bytes
1621
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
1622
+ //!
1623
+ //! @param[in] d_in
1624
+ //! Pointer to the input sequence of data items
1625
+ //!
1626
+ //! @param[out] d_max_out
1627
+ //! Iterator to which the maximum value is written
1628
+ //!
1629
+ //! @param[out] d_index_out
1630
+ //! Iterator to which the index of the returned value is written
1631
+ //!
1632
+ //! @param[in] num_items
1633
+ //! Total number of input items (i.e., length of ``d_in``)
1634
+ //!
1635
+ //! @param[in] stream
1636
+ //! @rst
1637
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
1638
+ //! @endrst
1639
+ template <typename InputIteratorT, typename ExtremumOutIteratorT, typename IndexOutIteratorT>
1640
+ CUB_RUNTIME_FUNCTION static cudaError_t ArgMax(
1641
+ void* d_temp_storage,
1642
+ size_t& temp_storage_bytes,
1643
+ InputIteratorT d_in,
1644
+ ExtremumOutIteratorT d_max_out,
1645
+ IndexOutIteratorT d_index_out,
1646
+ ::cuda::std::int64_t num_items,
1647
+ cudaStream_t stream = 0)
1648
+ {
1649
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceReduce::ArgMax");
1650
+
1651
+ // The input type
1652
+ using InputValueT = cub::detail::it_value_t<InputIteratorT>;
1653
+
1654
+ // Offset type used within the kernel and to index within one partition
1655
+ using PerPartitionOffsetT = int;
1656
+
1657
+ // Offset type used to index within the total input in the range [d_in, d_in + num_items)
1658
+ using GlobalOffsetT = ::cuda::std::int64_t;
1659
+
1660
+ // The value type used for the extremum
1661
+ using OutputExtremumT = detail::non_void_value_t<ExtremumOutIteratorT, InputValueT>;
1662
+ using InitT = OutputExtremumT;
1663
+
1664
+ // Reduction operation
1665
+ using ReduceOpT = cub::ArgMax;
1666
+
1667
+ // Initial value
1668
+ OutputExtremumT initial_value{::cuda::std::numeric_limits<InputValueT>::lowest()};
1669
+
1670
+ // Tabulate output iterator that unzips the result and writes it to the user-provided output iterators
1671
+ auto out_it = ::cuda::make_tabulate_output_iterator(
1672
+ detail::reduce::unzip_and_write_arg_extremum_op<ExtremumOutIteratorT, IndexOutIteratorT>{d_max_out, d_index_out});
1673
+
1674
+ return detail::reduce::dispatch_streaming_arg_reduce_t<
1675
+ InputIteratorT,
1676
+ decltype(out_it),
1677
+ PerPartitionOffsetT,
1678
+ GlobalOffsetT,
1679
+ ReduceOpT,
1680
+ InitT>::Dispatch(d_temp_storage,
1681
+ temp_storage_bytes,
1682
+ d_in,
1683
+ out_it,
1684
+ static_cast<GlobalOffsetT>(num_items),
1685
+ ReduceOpT{},
1686
+ initial_value,
1687
+ stream);
1688
+ }
1689
+
1690
+ //! @rst
1691
+ //! Finds the first device-wide maximum using the greater-than (``>``)
1692
+ //! operator, also returning the index of that item
1693
+ //!
1694
+ //! - The output value type of ``d_out`` is ``cub::KeyValuePair<int, T>``
1695
+ //! (assuming the value type of ``d_in`` is ``T``)
1696
+ //!
1697
+ //! - The maximum is written to ``d_out.value`` and its offset in the input
1698
+ //! array is written to ``d_out.key``.
1699
+ //! - The ``{1, cuda::std::numeric_limits<T>::lowest()}`` tuple is produced for zero-length inputs
1700
+ //!
1701
+ //! - Does not support ``>`` operators that are non-commutative.
1702
+ //! - Provides "run-to-run" determinism for pseudo-associative reduction
1703
+ //! (e.g., addition of floating point types) on the same GPU device.
1704
+ //! However, results for pseudo-associative reduction may be inconsistent
1705
+ //! from one device to another device of a different compute-capability
1706
+ //! because CUB can employ different tile-sizing for different architectures.
1707
+ //! - The range ``[d_in, d_in + num_items)`` shall not overlap ``d_out``.
1708
+ //! - @devicestorage
1709
+ //!
1710
+ //! Snippet
1711
+ //! +++++++++++++++++++++++++++++++++++++++++++++
1712
+ //!
1713
+ //! The code snippet below illustrates the argmax-reduction of a device vector
1714
+ //! of `int` data elements.
1715
+ //!
1716
+ //! .. code-block:: c++
1717
+ //!
1718
+ //! #include <cub/cub.cuh>
1719
+ //! // or equivalently <cub/device/device_reduce.cuh>
1720
+ //!
1721
+ //! // Declare, allocate, and initialize device-accessible pointers
1722
+ //! // for input and output
1723
+ //! int num_items; // e.g., 7
1724
+ //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
1725
+ //! KeyValuePair<int, int> *d_argmax; // e.g., [{-,-}]
1726
+ //! ...
1727
+ //!
1728
+ //! // Determine temporary device storage requirements
1729
+ //! void *d_temp_storage = nullptr;
1730
+ //! size_t temp_storage_bytes = 0;
1731
+ //! cub::DeviceReduce::ArgMax(
1732
+ //! d_temp_storage, temp_storage_bytes, d_in, d_argmax, num_items);
1733
+ //!
1734
+ //! // Allocate temporary storage
1735
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
1736
+ //!
1737
+ //! // Run argmax-reduction
1738
+ //! cub::DeviceReduce::ArgMax(
1739
+ //! d_temp_storage, temp_storage_bytes, d_in, d_argmax, num_items);
1740
+ //!
1741
+ //! // d_argmax <-- [{6, 9}]
1742
+ //!
1743
+ //! @endrst
1744
+ //!
1745
+ //! @tparam InputIteratorT
1746
+ //! **[inferred]** Random-access input iterator type for reading input items (of some type `T`) @iterator
1747
+ //!
1748
+ //! @tparam OutputIteratorT
1749
+ //! **[inferred]** Output iterator type for recording the reduced aggregate
1750
+ //! (having value type `cub::KeyValuePair<int, T>`) @iterator
1751
+ //!
1752
+ //! @param[in] d_temp_storage
1753
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
1754
+ //! required allocation size is written to `temp_storage_bytes` and no work is done.
1755
+ //!
1756
+ //! @param[in,out] temp_storage_bytes
1757
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
1758
+ //!
1759
+ //! @param[in] d_in
1760
+ //! Pointer to the input sequence of data items
1761
+ //!
1762
+ //! @param[out] d_out
1763
+ //! Pointer to the output aggregate
1764
+ //!
1765
+ //! @param[in] num_items
1766
+ //! Total number of input items (i.e., length of ``d_in``)
1767
+ //!
1768
+ //! @param[in] stream
1769
+ //! @rst
1770
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
1771
+ //! @endrst
1772
+ template <typename InputIteratorT, typename OutputIteratorT>
1773
+ CCCL_DEPRECATED_BECAUSE("CUB has superseded this interface in favor of the ArgMax interface that takes two separate "
1774
+ "iterators: one iterator to which the extremum is written and another iterator to which the "
1775
+ "index of the found extremum is written. ") CUB_RUNTIME_FUNCTION static cudaError_t
1776
+ ArgMax(void* d_temp_storage,
1777
+ size_t& temp_storage_bytes,
1778
+ InputIteratorT d_in,
1779
+ OutputIteratorT d_out,
1780
+ int num_items,
1781
+ cudaStream_t stream = 0)
1782
+ {
1783
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceReduce::ArgMax");
1784
+
1785
+ // Offset type for global offsets; fixed to int because this legacy overload takes num_items as int
1786
+ using OffsetT = int;
1787
+
1788
+ // Value type read through the input iterator
1789
+ using InputValueT = cub::detail::it_value_t<InputIteratorT>;
1790
+
1791
+ // The output tuple type; KeyValuePair<OffsetT, InputValueT> is the alternative handed to non_void_value_t
1792
+ using OutputTupleT = cub::detail::non_void_value_t<OutputIteratorT, KeyValuePair<OffsetT, InputValueT>>;
1793
+
1794
+ using AccumT = OutputTupleT; // the reduction accumulates directly in the output tuple type
1795
+
1796
+ // The value member type of the output tuple
1797
+ using OutputValueT = typename OutputTupleT::Value;
1798
+
1799
+ using InitT = detail::reduce::empty_problem_init_t<AccumT>; // NOTE(review): presumably marks the init as the empty-input result only - confirm
1800
+
1801
+ // Wrapped input iterator to produce index-value <OffsetT, InputT> tuples
1802
+ using ArgIndexInputIteratorT = ArgIndexInputIterator<InputIteratorT, OffsetT, OutputValueT>;
1803
+
1804
+ ArgIndexInputIteratorT d_indexed_in(d_in);
1805
+
1806
+ // Initial value: key 1 paired with numeric_limits<InputValueT>::lowest(), the documented result for zero-length inputs
1807
+ InitT initial_value{AccumT(1, ::cuda::std::numeric_limits<InputValueT>::lowest())};
1808
+
1809
+ return detail::reduce::dispatch<AccumT>(
1810
+ d_temp_storage, temp_storage_bytes, d_indexed_in, d_out, OffsetT{num_items}, cub::ArgMax(), initial_value, stream);
1811
+ }
1812
+
1813
+ //! @rst
1814
+ //! Finds the first device-wide maximum using the greater-than (``>``) operator and also returns the index of that
1815
+ //! item.
1816
+ //!
1817
+ //! - The maximum is written to ``d_max_out``
1818
+ //! - The offset of the returned item is written to ``d_index_out``, the offset type being written is of type
1819
+ //! ``cuda::std::int64_t``.
1820
+ //! - For zero-length inputs, ``cuda::std::numeric_limits<T>::lowest()`` is written to ``d_max_out`` and the index
1821
+ //! ``1`` is written to ``d_index_out``.
1822
+ //! - Does not support ``>`` operators that are non-commutative.
1823
+ //! - Provides determinism based on the environment's determinism requirements.
1824
+ //! To request "run-to-run" determinism, pass ``cuda::execution::require(cuda::execution::determinism::run_to_run)``
1825
+ //! as the `env` parameter.
1826
+ //! - The range ``[d_in, d_in + num_items)`` shall not overlap ``d_max_out`` nor ``d_index_out``.
1827
+ //!
1828
+ //! Snippet
1829
+ //! +++++++++++++++++++++++++++++++++++++++++++++
1830
+ //!
1831
+ //! The code snippet below illustrates the argmax-reduction of a device vector of ``int`` data elements.
1832
+ //!
1833
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_reduce_env_api.cu
1834
+ //! :language: c++
1835
+ //! :dedent:
1836
+ //! :start-after: example-begin argmax-env-determinism
1837
+ //! :end-before: example-end argmax-env-determinism
1838
+ //!
1839
+ //! @endrst
1840
+ //!
1841
+ //! @tparam InputIteratorT
1842
+ //! **[inferred]** Random-access input iterator type for reading input items
1843
+ //! (of some type `T`) @iterator
1844
+ //!
1845
+ //! @tparam ExtremumOutIteratorT
1846
+ //! **[inferred]** Output iterator type for recording maximum value
1847
+ //!
1848
+ //! @tparam IndexOutIteratorT
1849
+ //! **[inferred]** Output iterator type for recording index of the returned value
1850
+ //!
1851
+ //! @tparam EnvT
1852
+ //! **[inferred]** Execution environment type. Default is ``cuda::std::execution::env<>``.
1853
+ //!
1854
+ //! @param[in] d_in
1855
+ //! Iterator to the input sequence of data items
1856
+ //!
1857
+ //! @param[out] d_max_out
1858
+ //! Iterator to which the maximum value is written
1859
+ //!
1860
+ //! @param[out] d_index_out
1861
+ //! Iterator to which the index of the returned value is written
1862
+ //!
1863
+ //! @param[in] num_items
1864
+ //! Total number of input items (i.e., length of ``d_in``)
1865
+ //!
1866
+ //! @param[in] env
1867
+ //! @rst
1868
+ //! **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``.
1869
+ //! @endrst
1870
+ template <typename InputIteratorT,
1871
+ typename ExtremumOutIteratorT,
1872
+ typename IndexOutIteratorT,
1873
+ typename EnvT = ::cuda::std::execution::env<>>
1874
+ [[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t
1875
+ ArgMax(InputIteratorT d_in,
1876
+ ExtremumOutIteratorT d_max_out,
1877
+ IndexOutIteratorT d_index_out,
1878
+ ::cuda::std::int64_t num_items,
1879
+ EnvT env = {})
1880
+ {
1881
+ _CCCL_NVTX_RANGE_SCOPE("cub::DeviceReduce::ArgMax");
1882
+ // Determinism must not be queryable on the environment itself; per the message below it has to be wrapped in requirements
1883
+ static_assert(!::cuda::std::execution::__queryable_with<EnvT, ::cuda::execution::determinism::__get_determinism_t>,
1884
+ "Determinism should be used inside requires to have an effect.");
1885
+ using requirements_t = ::cuda::std::execution::
1886
+ __query_result_or_t<EnvT, ::cuda::execution::__get_requirements_t, ::cuda::std::execution::env<>>;
1887
+ using requested_determinism_t =
1888
+ ::cuda::std::execution::__query_result_or_t<requirements_t, //
1889
+ ::cuda::execution::determinism::__get_determinism_t,
1890
+ ::cuda::execution::determinism::run_to_run_t>; // run_to_run_t is the alternative type supplied to the query
1891
+
1892
+ // Reject gpu_to_gpu determinism at compile time since it's not properly implemented
1893
+ static_assert(!::cuda::std::is_same_v<requested_determinism_t, ::cuda::execution::determinism::gpu_to_gpu_t>,
1894
+ "gpu_to_gpu determinism is not supported");
1895
+
1896
+ // Query the stream and memory resource from the environment, each with a default-constructed alternative
1897
+ auto stream = ::cuda::std::execution::__query_or(env, ::cuda::get_stream, ::cuda::stream_ref{cudaStream_t{}});
1898
+ auto mr =
1899
+ ::cuda::std::execution::__query_or(env, ::cuda::mr::__get_memory_resource, detail::device_memory_resource{});
1900
+
1901
+ void* d_temp_storage = nullptr; // stays nullptr for the first Dispatch call, which only queries the size
1902
+ size_t temp_storage_bytes = 0;
1903
+
1904
+ using tuning_t =
1905
+ ::cuda::std::execution::__query_result_or_t<EnvT, ::cuda::execution::__get_tuning_t, ::cuda::std::execution::env<>>; // NOTE(review): tuning_t is not referenced again in this function - confirm it is needed
1906
+
1907
+ // Reduction operation and the offset/value types handed to the streaming dispatch
1908
+ using ReduceOpT = cub::ArgMax;
1909
+ using InputValueT = cub::detail::it_value_t<InputIteratorT>;
1910
+ using PerPartitionOffsetT = int;
1911
+ using GlobalOffsetT = ::cuda::std::int64_t;
1912
+
1913
+ using OutputExtremumT = detail::non_void_value_t<ExtremumOutIteratorT, InputValueT>;
1914
+ using InitT = OutputExtremumT;
1915
+
1916
+ // Initial value: numeric_limits<InputValueT>::lowest(), the documented extremum for zero-length inputs
1917
+ OutputExtremumT initial_value{::cuda::std::numeric_limits<InputValueT>::lowest()};
1918
+
1919
+ // Tabulate output iterator that unzips the result and writes it to the user-provided output iterators
1920
+ auto out_it = ::cuda::make_tabulate_output_iterator(
1921
+ detail::reduce::unzip_and_write_arg_extremum_op<ExtremumOutIteratorT, IndexOutIteratorT>{d_max_out, d_index_out});
1922
+
1923
+ // First pass: query the required temporary storage size (d_temp_storage is still nullptr here)
1924
+ cudaError_t error = detail::reduce::dispatch_streaming_arg_reduce_t<
1925
+ InputIteratorT,
1926
+ decltype(out_it),
1927
+ PerPartitionOffsetT,
1928
+ GlobalOffsetT,
1929
+ ReduceOpT,
1930
+ InitT>::Dispatch(d_temp_storage,
1931
+ temp_storage_bytes,
1932
+ d_in,
1933
+ out_it,
1934
+ static_cast<GlobalOffsetT>(num_items),
1935
+ ReduceOpT{},
1936
+ initial_value,
1937
+ stream.get());
1938
+ if (error != cudaSuccess)
1939
+ {
1940
+ return error;
1941
+ }
1942
+
1943
+ // TODO(gevtushenko): use uninitialized buffer when it's available
1944
+ error = CubDebug(detail::temporary_storage::allocate(stream, d_temp_storage, temp_storage_bytes, mr));
1945
+ if (error != cudaSuccess)
1946
+ {
1947
+ return error;
1948
+ }
1949
+
1950
+ // Second pass: run the algorithm with the same arguments, now backed by the allocated temporary storage
1951
+ error = detail::reduce::dispatch_streaming_arg_reduce_t<
1952
+ InputIteratorT,
1953
+ decltype(out_it),
1954
+ PerPartitionOffsetT,
1955
+ GlobalOffsetT,
1956
+ ReduceOpT,
1957
+ InitT>::Dispatch(d_temp_storage,
1958
+ temp_storage_bytes,
1959
+ d_in,
1960
+ out_it,
1961
+ static_cast<GlobalOffsetT>(num_items),
1962
+ ReduceOpT{},
1963
+ initial_value,
1964
+ stream.get());
1965
+
1966
+ // Try to deallocate regardless of the error to avoid memory leaks
1967
+ cudaError_t deallocate_error =
1968
+ CubDebug(detail::temporary_storage::deallocate(stream, d_temp_storage, temp_storage_bytes, mr));
1969
+
1970
+ if (error != cudaSuccess)
1971
+ {
1972
+ // Reduction error takes precedence over deallocation error since it happens first
1973
+ return error;
1974
+ }
1975
+
1976
+ return deallocate_error; // cudaSuccess here means both the reduction and the deallocation succeeded
1977
+ }
1978
+
1979
+ //! @rst
1980
+ //! Fuses transform and reduce operations
1981
+ //!
1982
+ //! - Does not support binary reduction operators that are non-commutative.
1983
+ //! - Provides "run-to-run" determinism for pseudo-associative reduction
1984
+ //! (e.g., addition of floating point types) on the same GPU device.
1985
+ //! However, results for pseudo-associative reduction may be inconsistent
1986
+ //! from one device to another device of a different compute-capability
1987
+ //! because CUB can employ different tile-sizing for different architectures.
1988
+ //! - The range ``[d_in, d_in + num_items)`` shall not overlap ``d_out``.
1989
+ //! - @devicestorage
1990
+ //!
1991
+ //! Snippet
1992
+ //! +++++++++++++++++++++++++++++++++++++++++++++
1993
+ //!
1994
+ //! The code snippet below illustrates a fused transform (squaring) and sum-reduction of a
1995
+ //! device vector of `int` data elements, starting from the initial value `42`.
1996
+ //!
1997
+ //! .. code-block:: c++
1998
+ //!
1999
+ //! #include <cub/cub.cuh>
2000
+ //! // or equivalently <cub/device/device_reduce.cuh>
2001
+ //!
2002
+ //! thrust::device_vector<int> in = { 1, 2, 3, 4 };
2003
+ //! thrust::device_vector<int> out(1);
2004
+ //!
2005
+ //! size_t temp_storage_bytes = 0;
2006
+ //! uint8_t *d_temp_storage = nullptr;
2007
+ //!
2008
+ //! const int init = 42;
2009
+ //!
2010
+ //! cub::DeviceReduce::TransformReduce(
2011
+ //! d_temp_storage,
2012
+ //! temp_storage_bytes,
2013
+ //! in.begin(),
2014
+ //! out.begin(),
2015
+ //! in.size(),
2016
+ //! cuda::std::plus<>{},
2017
+ //! square_t{},
2018
+ //! init);
2019
+ //!
2020
+ //! thrust::device_vector<uint8_t> temp_storage(temp_storage_bytes);
2021
+ //! d_temp_storage = temp_storage.data().get();
2022
+ //!
2023
+ //! cub::DeviceReduce::TransformReduce(
2024
+ //! d_temp_storage,
2025
+ //! temp_storage_bytes,
2026
+ //! in.begin(),
2027
+ //! out.begin(),
2028
+ //! in.size(),
2029
+ //! cuda::std::plus<>{},
2030
+ //! square_t{},
2031
+ //! init);
2032
+ //!
2033
+ //! // out[0] <-- 72
2034
+ //!
2035
+ //! @endrst
2036
+ //!
2037
+ //! @tparam InputIteratorT
2038
+ //! **[inferred]** Random-access input iterator type for reading input items @iterator
2039
+ //!
2040
+ //! @tparam OutputIteratorT
2041
+ //! **[inferred]** Output iterator type for recording the reduced aggregate @iterator
2042
+ //!
2043
+ //! @tparam ReductionOpT
2044
+ //! **[inferred]** Binary reduction functor type having member `T operator()(const T &a, const T &b)`
2045
+ //!
2046
+ //! @tparam TransformOpT
2047
+ //! **[inferred]** Unary transform functor type having member `auto operator()(const T &a)`
2048
+ //!
2049
+ //! @tparam T
2050
+ //! **[inferred]** Data element type that is convertible to the `value` type of `InputIteratorT`
2051
+ //!
2052
+ //! @tparam NumItemsT
2053
+ //! **[inferred]** Type of num_items
2054
+ //!
2055
+ //! @param[in] d_temp_storage
2056
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
2057
+ //! required allocation size is written to `temp_storage_bytes` and no work is done.
2058
+ //!
2059
+ //! @param[in,out] temp_storage_bytes
2060
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
2061
+ //!
2062
+ //! @param[in] d_in
2063
+ //! Pointer to the input sequence of data items
2064
+ //!
2065
+ //! @param[out] d_out
2066
+ //! Pointer to the output aggregate
2067
+ //!
2068
+ //! @param[in] num_items
2069
+ //! Total number of input items (i.e., length of ``d_in``)
2070
+ //!
2071
+ //! @param[in] reduction_op
2072
+ //! Binary reduction functor
2073
+ //!
2074
+ //! @param[in] transform_op
2075
+ //! Unary transform functor
2076
+ //!
2077
+ //! @param[in] init
2078
+ //! Initial value of the reduction
2079
+ //!
2080
+ //! @param[in] stream
2081
+ //! @rst
2082
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
2083
+ //! @endrst
2084
+ template <typename InputIteratorT,
2085
+ typename OutputIteratorT,
2086
+ typename ReductionOpT,
2087
+ typename TransformOpT,
2088
+ typename T,
2089
+ typename NumItemsT>
2090
+ CUB_RUNTIME_FUNCTION static cudaError_t TransformReduce(
2091
+ void* d_temp_storage,
2092
+ size_t& temp_storage_bytes,
2093
+ InputIteratorT d_in,
2094
+ OutputIteratorT d_out,
2095
+ NumItemsT num_items,
2096
+ ReductionOpT reduction_op,
2097
+ TransformOpT transform_op,
2098
+ T init,
2099
+ cudaStream_t stream = 0)
2100
+ {
2101
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceReduce::TransformReduce");
2102
+ // Offset type selected from the caller's NumItemsT; num_items is cast to it before dispatch
2103
+ using OffsetT = detail::choose_offset_t<NumItemsT>;
2104
+ // Forward everything to the shared reduce dispatch; transform_op is passed as the trailing argument after the stream
2105
+ return detail::reduce::dispatch(
2106
+ d_temp_storage,
2107
+ temp_storage_bytes,
2108
+ d_in,
2109
+ d_out,
2110
+ static_cast<OffsetT>(num_items),
2111
+ reduction_op,
2112
+ init,
2113
+ stream,
2114
+ transform_op);
2115
+ }
2116
+
2117
+ //! @rst
2118
+ //! Reduces segments of values, where segments are demarcated by corresponding runs of identical keys.
2119
+ //!
2120
+ //! This operation computes segmented reductions within ``d_values_in`` using the specified binary ``reduction_op``
2121
+ //! functor. The segments are identified by "runs" of corresponding keys in `d_keys_in`, where runs are maximal
2122
+ //! ranges of consecutive, identical keys. For the *i*\ :sup:`th` run encountered, the first key of the run and
2123
+ //! the corresponding value aggregate of that run are written to ``d_unique_out[i]`` and ``d_aggregates_out[i]``,
2124
+ //! respectively. The total number of runs encountered is written to ``d_num_runs_out``.
2125
+ //!
2126
+ //! - The ``==`` equality operator is used to determine whether keys are equivalent
2127
+ //! - Provides "run-to-run" determinism for pseudo-associative reduction
2128
+ //! (e.g., addition of floating point types) on the same GPU device.
2129
+ //! However, results for pseudo-associative reduction may be inconsistent
2130
+ //! from one device to another device of a different compute-capability
2131
+ //! because CUB can employ different tile-sizing for different architectures.
2132
+ //! - Let ``out`` be any of
2133
+ //! ``[d_unique_out, d_unique_out + *d_num_runs_out)``
2134
+ //! ``[d_aggregates_out, d_aggregates_out + *d_num_runs_out)``
2135
+ //! ``d_num_runs_out``. The ranges represented by ``out`` shall not overlap
2136
+ //! ``[d_keys_in, d_keys_in + num_items)``,
2137
+ //! ``[d_values_in, d_values_in + num_items)`` nor ``out`` in any way.
2138
+ //! - @devicestorage
2139
+ //!
2140
+ //! Snippet
2141
+ //! +++++++++++++++++++++++++++++++++++++++++++++
2142
+ //!
2143
+ //! The code snippet below illustrates the segmented reduction of ``int`` values grouped by runs of
2144
+ //! associated ``int`` keys.
2145
+ //!
2146
+ //! .. code-block:: c++
2147
+ //!
2148
+ //! #include <cub/cub.cuh>
2149
+ //! // or equivalently <cub/device/device_reduce.cuh>
2150
+ //!
2151
+ //! // CustomMin functor
2152
+ //! struct CustomMin
2153
+ //! {
2154
+ //! template <typename T>
2155
+ //! __device__ __forceinline__
2156
+ //! T operator()(const T &a, const T &b) const {
2157
+ //! return (b < a) ? b : a;
2158
+ //! }
2159
+ //! };
2160
+ //!
2161
+ //! // Declare, allocate, and initialize device-accessible pointers
2162
+ //! // for input and output
2163
+ //! int num_items; // e.g., 8
2164
+ //! int *d_keys_in; // e.g., [0, 2, 2, 9, 5, 5, 5, 8]
2165
+ //! int *d_values_in; // e.g., [0, 7, 1, 6, 2, 5, 3, 4]
2166
+ //! int *d_unique_out; // e.g., [-, -, -, -, -, -, -, -]
2167
+ //! int *d_aggregates_out; // e.g., [-, -, -, -, -, -, -, -]
2168
+ //! int *d_num_runs_out; // e.g., [-]
2169
+ //! CustomMin reduction_op;
2170
+ //! ...
2171
+ //!
2172
+ //! // Determine temporary device storage requirements
2173
+ //! void *d_temp_storage = nullptr;
2174
+ //! size_t temp_storage_bytes = 0;
2175
+ //! cub::DeviceReduce::ReduceByKey(
2176
+ //! d_temp_storage, temp_storage_bytes,
2177
+ //! d_keys_in, d_unique_out, d_values_in,
2178
+ //! d_aggregates_out, d_num_runs_out, reduction_op, num_items);
2179
+ //!
2180
+ //! // Allocate temporary storage
2181
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
2182
+ //!
2183
+ //! // Run reduce-by-key
2184
+ //! cub::DeviceReduce::ReduceByKey(
2185
+ //! d_temp_storage, temp_storage_bytes,
2186
+ //! d_keys_in, d_unique_out, d_values_in,
2187
+ //! d_aggregates_out, d_num_runs_out, reduction_op, num_items);
2188
+ //!
2189
+ //! // d_unique_out <-- [0, 2, 9, 5, 8]
2190
+ //! // d_aggregates_out <-- [0, 1, 6, 2, 4]
2191
+ //! // d_num_runs_out <-- [5]
2192
+ //!
2193
+ //! @endrst
2194
+ //!
2195
+ //! @tparam KeysInputIteratorT
2196
+ //! **[inferred]** Random-access input iterator type for reading input keys @iterator
2197
+ //!
2198
+ //! @tparam UniqueOutputIteratorT
2199
+ //! **[inferred]** Random-access output iterator type for writing unique output keys @iterator
2200
+ //!
2201
+ //! @tparam ValuesInputIteratorT
2202
+ //! **[inferred]** Random-access input iterator type for reading input values @iterator
2203
+ //!
2204
+ //! @tparam AggregatesOutputIteratorT
2205
+ //! **[inferred]** Random-access output iterator type for writing output value aggregates @iterator
2206
+ //!
2207
+ //! @tparam NumRunsOutputIteratorT
2208
+ //! **[inferred]** Output iterator type for recording the number of runs encountered @iterator
2209
+ //!
2210
+ //! @tparam ReductionOpT
2211
+ //! **[inferred]** Binary reduction functor type having member `T operator()(const T &a, const T &b)`
2212
+ //!
2213
+ //! @tparam NumItemsT
2214
+ //! **[inferred]** Type of num_items
2215
+ //!
2216
+ //! @param[in] d_temp_storage
2217
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
2218
+ //! required allocation size is written to `temp_storage_bytes` and no work is done.
2219
+ //!
2220
+ //! @param[in,out] temp_storage_bytes
2221
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
2222
+ //!
2223
+ //! @param[in] d_keys_in
2224
+ //! Pointer to the input sequence of keys
2225
+ //!
2226
+ //! @param[out] d_unique_out
2227
+ //! Pointer to the output sequence of unique keys (one key per run)
2228
+ //!
2229
+ //! @param[in] d_values_in
2230
+ //! Pointer to the input sequence of corresponding values
2231
+ //!
2232
+ //! @param[out] d_aggregates_out
2233
+ //! Pointer to the output sequence of value aggregates
2234
+ //! (one aggregate per run)
2235
+ //!
2236
+ //! @param[out] d_num_runs_out
2237
+ //! Pointer to total number of runs encountered
2238
+ //! (i.e., the length of ``d_unique_out``)
2239
+ //!
2240
+ //! @param[in] reduction_op
2241
+ //! Binary reduction functor
2242
+ //!
2243
+ //! @param[in] num_items
2244
+ //! Total number of associated key+value pairs
2245
+ //! (i.e., the length of ``d_keys_in`` and ``d_values_in``)
2246
+ //!
2247
+ //! @param[in] stream
2248
+ //! @rst
2249
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
2250
+ //! @endrst
2251
+ template <typename KeysInputIteratorT,
2252
+ typename UniqueOutputIteratorT,
2253
+ typename ValuesInputIteratorT,
2254
+ typename AggregatesOutputIteratorT,
2255
+ typename NumRunsOutputIteratorT,
2256
+ typename ReductionOpT,
2257
+ typename NumItemsT>
2258
+ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t ReduceByKey(
2259
+ void* d_temp_storage,
2260
+ size_t& temp_storage_bytes,
2261
+ KeysInputIteratorT d_keys_in,
2262
+ UniqueOutputIteratorT d_unique_out,
2263
+ ValuesInputIteratorT d_values_in,
2264
+ AggregatesOutputIteratorT d_aggregates_out,
2265
+ NumRunsOutputIteratorT d_num_runs_out,
2266
+ ReductionOpT reduction_op,
2267
+ NumItemsT num_items,
2268
+ cudaStream_t stream = 0)
2269
+ {
2270
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceReduce::ReduceByKey");
2271
+
2272
+ // Offset type for global offsets, selected from NumItemsT by detail::choose_offset_t
2273
+ using OffsetT = detail::choose_offset_t<NumItemsT>;
2274
+
2275
+ // FlagT iterator type (not used; nothing is declared for it here)
2276
+
2277
+ // Selection op (not used; nothing is declared for it here)
2278
+
2279
+ // Keys are compared with the default == operator via cuda::std::equal_to<>
2280
+ using EqualityOp = ::cuda::std::equal_to<>;
2281
+
2282
+ return DispatchReduceByKey<
2283
+ KeysInputIteratorT,
2284
+ UniqueOutputIteratorT,
2285
+ ValuesInputIteratorT,
2286
+ AggregatesOutputIteratorT,
2287
+ NumRunsOutputIteratorT,
2288
+ EqualityOp,
2289
+ ReductionOpT,
2290
+ OffsetT>::Dispatch(d_temp_storage,
2291
+ temp_storage_bytes,
2292
+ d_keys_in,
2293
+ d_unique_out,
2294
+ d_values_in,
2295
+ d_aggregates_out,
2296
+ d_num_runs_out,
2297
+ EqualityOp(),
2298
+ reduction_op,
2299
+ static_cast<OffsetT>(num_items), // num_items is narrowed/widened to the chosen offset type
2300
+ stream);
2301
+ }
2302
+ };
2303
+ CUB_NAMESPACE_END