cuda-cccl 0.1.3.2.0.dev271__cp312-cp312-manylinux_2_26_x86_64.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cuda-cccl might be problematic. Click here for more details.

Files changed (1947)
  1. cuda/cccl/__init__.py +27 -0
  2. cuda/cccl/_cuda_version_utils.py +46 -0
  3. cuda/cccl/cooperative/__init__.py +3 -0
  4. cuda/cccl/cooperative/experimental/__init__.py +8 -0
  5. cuda/cccl/cooperative/experimental/_caching.py +48 -0
  6. cuda/cccl/cooperative/experimental/_common.py +273 -0
  7. cuda/cccl/cooperative/experimental/_nvrtc.py +91 -0
  8. cuda/cccl/cooperative/experimental/_scan_op.py +181 -0
  9. cuda/cccl/cooperative/experimental/_types.py +937 -0
  10. cuda/cccl/cooperative/experimental/_typing.py +107 -0
  11. cuda/cccl/cooperative/experimental/block/__init__.py +39 -0
  12. cuda/cccl/cooperative/experimental/block/_block_exchange.py +251 -0
  13. cuda/cccl/cooperative/experimental/block/_block_load_store.py +215 -0
  14. cuda/cccl/cooperative/experimental/block/_block_merge_sort.py +125 -0
  15. cuda/cccl/cooperative/experimental/block/_block_radix_sort.py +214 -0
  16. cuda/cccl/cooperative/experimental/block/_block_reduce.py +294 -0
  17. cuda/cccl/cooperative/experimental/block/_block_scan.py +983 -0
  18. cuda/cccl/cooperative/experimental/warp/__init__.py +9 -0
  19. cuda/cccl/cooperative/experimental/warp/_warp_merge_sort.py +92 -0
  20. cuda/cccl/cooperative/experimental/warp/_warp_reduce.py +153 -0
  21. cuda/cccl/cooperative/experimental/warp/_warp_scan.py +78 -0
  22. cuda/cccl/headers/__init__.py +7 -0
  23. cuda/cccl/headers/include/__init__.py +1 -0
  24. cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +262 -0
  25. cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +1181 -0
  26. cuda/cccl/headers/include/cub/agent/agent_for.cuh +84 -0
  27. cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +925 -0
  28. cuda/cccl/headers/include/cub/agent/agent_merge.cuh +226 -0
  29. cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +730 -0
  30. cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +766 -0
  31. cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +286 -0
  32. cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +704 -0
  33. cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +557 -0
  34. cuda/cccl/headers/include/cub/agent/agent_reduce.cuh +632 -0
  35. cuda/cccl/headers/include/cub/agent/agent_reduce_by_key.cuh +804 -0
  36. cuda/cccl/headers/include/cub/agent/agent_rle.cuh +1124 -0
  37. cuda/cccl/headers/include/cub/agent/agent_scan.cuh +561 -0
  38. cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +473 -0
  39. cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +292 -0
  40. cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +1114 -0
  41. cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +341 -0
  42. cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +592 -0
  43. cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +614 -0
  44. cuda/cccl/headers/include/cub/agent/single_pass_scan_operators.cuh +1342 -0
  45. cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +965 -0
  46. cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +1217 -0
  47. cuda/cccl/headers/include/cub/block/block_exchange.cuh +1306 -0
  48. cuda/cccl/headers/include/cub/block/block_histogram.cuh +420 -0
  49. cuda/cccl/headers/include/cub/block/block_load.cuh +1260 -0
  50. cuda/cccl/headers/include/cub/block/block_merge_sort.cuh +787 -0
  51. cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +1218 -0
  52. cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +2193 -0
  53. cuda/cccl/headers/include/cub/block/block_raking_layout.cuh +150 -0
  54. cuda/cccl/headers/include/cub/block/block_reduce.cuh +665 -0
  55. cuda/cccl/headers/include/cub/block/block_run_length_decode.cuh +437 -0
  56. cuda/cccl/headers/include/cub/block/block_scan.cuh +2583 -0
  57. cuda/cccl/headers/include/cub/block/block_shuffle.cuh +346 -0
  58. cuda/cccl/headers/include/cub/block/block_store.cuh +1246 -0
  59. cuda/cccl/headers/include/cub/block/radix_rank_sort_operations.cuh +620 -0
  60. cuda/cccl/headers/include/cub/block/specializations/block_histogram_atomic.cuh +86 -0
  61. cuda/cccl/headers/include/cub/block/specializations/block_histogram_sort.cuh +240 -0
  62. cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking.cuh +252 -0
  63. cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking_commutative_only.cuh +238 -0
  64. cuda/cccl/headers/include/cub/block/specializations/block_reduce_warp_reductions.cuh +281 -0
  65. cuda/cccl/headers/include/cub/block/specializations/block_scan_raking.cuh +790 -0
  66. cuda/cccl/headers/include/cub/block/specializations/block_scan_warp_scans.cuh +538 -0
  67. cuda/cccl/headers/include/cub/config.cuh +53 -0
  68. cuda/cccl/headers/include/cub/cub.cuh +112 -0
  69. cuda/cccl/headers/include/cub/detail/array_utils.cuh +77 -0
  70. cuda/cccl/headers/include/cub/detail/choose_offset.cuh +155 -0
  71. cuda/cccl/headers/include/cub/detail/detect_cuda_runtime.cuh +93 -0
  72. cuda/cccl/headers/include/cub/detail/device_double_buffer.cuh +96 -0
  73. cuda/cccl/headers/include/cub/detail/fast_modulo_division.cuh +246 -0
  74. cuda/cccl/headers/include/cub/detail/integer_utils.cuh +84 -0
  75. cuda/cccl/headers/include/cub/detail/launcher/cuda_driver.cuh +142 -0
  76. cuda/cccl/headers/include/cub/detail/launcher/cuda_runtime.cuh +100 -0
  77. cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +118 -0
  78. cuda/cccl/headers/include/cub/detail/ptx-json/README.md +71 -0
  79. cuda/cccl/headers/include/cub/detail/ptx-json/array.h +68 -0
  80. cuda/cccl/headers/include/cub/detail/ptx-json/json.h +61 -0
  81. cuda/cccl/headers/include/cub/detail/ptx-json/object.h +100 -0
  82. cuda/cccl/headers/include/cub/detail/ptx-json/string.h +71 -0
  83. cuda/cccl/headers/include/cub/detail/ptx-json/value.h +93 -0
  84. cuda/cccl/headers/include/cub/detail/ptx-json-parser.h +63 -0
  85. cuda/cccl/headers/include/cub/detail/rfa.cuh +724 -0
  86. cuda/cccl/headers/include/cub/detail/strong_load.cuh +189 -0
  87. cuda/cccl/headers/include/cub/detail/strong_store.cuh +220 -0
  88. cuda/cccl/headers/include/cub/detail/temporary_storage.cuh +384 -0
  89. cuda/cccl/headers/include/cub/detail/type_traits.cuh +179 -0
  90. cuda/cccl/headers/include/cub/detail/uninitialized_copy.cuh +72 -0
  91. cuda/cccl/headers/include/cub/detail/unsafe_bitcast.cuh +56 -0
  92. cuda/cccl/headers/include/cub/device/device_adjacent_difference.cuh +596 -0
  93. cuda/cccl/headers/include/cub/device/device_copy.cuh +187 -0
  94. cuda/cccl/headers/include/cub/device/device_for.cuh +990 -0
  95. cuda/cccl/headers/include/cub/device/device_histogram.cuh +1507 -0
  96. cuda/cccl/headers/include/cub/device/device_memcpy.cuh +195 -0
  97. cuda/cccl/headers/include/cub/device/device_merge.cuh +202 -0
  98. cuda/cccl/headers/include/cub/device/device_merge_sort.cuh +979 -0
  99. cuda/cccl/headers/include/cub/device/device_partition.cuh +664 -0
  100. cuda/cccl/headers/include/cub/device/device_radix_sort.cuh +3435 -0
  101. cuda/cccl/headers/include/cub/device/device_reduce.cuh +1898 -0
  102. cuda/cccl/headers/include/cub/device/device_run_length_encode.cuh +370 -0
  103. cuda/cccl/headers/include/cub/device/device_scan.cuh +1899 -0
  104. cuda/cccl/headers/include/cub/device/device_segmented_radix_sort.cuh +1496 -0
  105. cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +1512 -0
  106. cuda/cccl/headers/include/cub/device/device_segmented_sort.cuh +2811 -0
  107. cuda/cccl/headers/include/cub/device/device_select.cuh +1224 -0
  108. cuda/cccl/headers/include/cub/device/device_transform.cuh +545 -0
  109. cuda/cccl/headers/include/cub/device/dispatch/dispatch_adjacent_difference.cuh +314 -0
  110. cuda/cccl/headers/include/cub/device/dispatch/dispatch_advance_iterators.cuh +109 -0
  111. cuda/cccl/headers/include/cub/device/dispatch/dispatch_batch_memcpy.cuh +718 -0
  112. cuda/cccl/headers/include/cub/device/dispatch/dispatch_common.cuh +45 -0
  113. cuda/cccl/headers/include/cub/device/dispatch/dispatch_for.cuh +197 -0
  114. cuda/cccl/headers/include/cub/device/dispatch/dispatch_histogram.cuh +1042 -0
  115. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +305 -0
  116. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge_sort.cuh +473 -0
  117. cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +1749 -0
  118. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +1316 -0
  119. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_by_key.cuh +656 -0
  120. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +497 -0
  121. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +313 -0
  122. cuda/cccl/headers/include/cub/device/dispatch/dispatch_rle.cuh +612 -0
  123. cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan.cuh +497 -0
  124. cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan_by_key.cuh +598 -0
  125. cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +916 -0
  126. cuda/cccl/headers/include/cub/device/dispatch/dispatch_select_if.cuh +838 -0
  127. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +341 -0
  128. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +441 -0
  129. cuda/cccl/headers/include/cub/device/dispatch/dispatch_three_way_partition.cuh +455 -0
  130. cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +558 -0
  131. cuda/cccl/headers/include/cub/device/dispatch/dispatch_unique_by_key.cuh +543 -0
  132. cuda/cccl/headers/include/cub/device/dispatch/kernels/for_each.cuh +218 -0
  133. cuda/cccl/headers/include/cub/device/dispatch/kernels/histogram.cuh +505 -0
  134. cuda/cccl/headers/include/cub/device/dispatch/kernels/merge_sort.cuh +334 -0
  135. cuda/cccl/headers/include/cub/device/dispatch/kernels/radix_sort.cuh +799 -0
  136. cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +591 -0
  137. cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +194 -0
  138. cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +330 -0
  139. cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_sort.cuh +475 -0
  140. cuda/cccl/headers/include/cub/device/dispatch/kernels/three_way_partition.cuh +121 -0
  141. cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +987 -0
  142. cuda/cccl/headers/include/cub/device/dispatch/kernels/unique_by_key.cuh +176 -0
  143. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +70 -0
  144. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +121 -0
  145. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +63 -0
  146. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +278 -0
  147. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +91 -0
  148. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +118 -0
  149. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +1068 -0
  150. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce.cuh +493 -0
  151. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +945 -0
  152. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +675 -0
  153. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +609 -0
  154. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +1013 -0
  155. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +249 -0
  156. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_select_if.cuh +1587 -0
  157. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +407 -0
  158. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +448 -0
  159. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +874 -0
  160. cuda/cccl/headers/include/cub/grid/grid_even_share.cuh +226 -0
  161. cuda/cccl/headers/include/cub/grid/grid_mapping.cuh +106 -0
  162. cuda/cccl/headers/include/cub/grid/grid_queue.cuh +202 -0
  163. cuda/cccl/headers/include/cub/iterator/arg_index_input_iterator.cuh +256 -0
  164. cuda/cccl/headers/include/cub/iterator/cache_modified_input_iterator.cuh +260 -0
  165. cuda/cccl/headers/include/cub/iterator/cache_modified_output_iterator.cuh +252 -0
  166. cuda/cccl/headers/include/cub/iterator/tex_obj_input_iterator.cuh +322 -0
  167. cuda/cccl/headers/include/cub/thread/thread_load.cuh +347 -0
  168. cuda/cccl/headers/include/cub/thread/thread_operators.cuh +684 -0
  169. cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +547 -0
  170. cuda/cccl/headers/include/cub/thread/thread_scan.cuh +498 -0
  171. cuda/cccl/headers/include/cub/thread/thread_search.cuh +198 -0
  172. cuda/cccl/headers/include/cub/thread/thread_simd.cuh +464 -0
  173. cuda/cccl/headers/include/cub/thread/thread_sort.cuh +101 -0
  174. cuda/cccl/headers/include/cub/thread/thread_store.cuh +364 -0
  175. cuda/cccl/headers/include/cub/util_allocator.cuh +921 -0
  176. cuda/cccl/headers/include/cub/util_arch.cuh +167 -0
  177. cuda/cccl/headers/include/cub/util_cpp_dialect.cuh +95 -0
  178. cuda/cccl/headers/include/cub/util_debug.cuh +207 -0
  179. cuda/cccl/headers/include/cub/util_device.cuh +779 -0
  180. cuda/cccl/headers/include/cub/util_macro.cuh +99 -0
  181. cuda/cccl/headers/include/cub/util_math.cuh +115 -0
  182. cuda/cccl/headers/include/cub/util_namespace.cuh +176 -0
  183. cuda/cccl/headers/include/cub/util_policy_wrapper_t.cuh +55 -0
  184. cuda/cccl/headers/include/cub/util_ptx.cuh +513 -0
  185. cuda/cccl/headers/include/cub/util_temporary_storage.cuh +122 -0
  186. cuda/cccl/headers/include/cub/util_type.cuh +1136 -0
  187. cuda/cccl/headers/include/cub/util_vsmem.cuh +251 -0
  188. cuda/cccl/headers/include/cub/version.cuh +89 -0
  189. cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_shfl.cuh +329 -0
  190. cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_smem.cuh +177 -0
  191. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +729 -0
  192. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +405 -0
  193. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +950 -0
  194. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_smem.cuh +713 -0
  195. cuda/cccl/headers/include/cub/warp/warp_exchange.cuh +405 -0
  196. cuda/cccl/headers/include/cub/warp/warp_load.cuh +614 -0
  197. cuda/cccl/headers/include/cub/warp/warp_merge_sort.cuh +169 -0
  198. cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +822 -0
  199. cuda/cccl/headers/include/cub/warp/warp_scan.cuh +1885 -0
  200. cuda/cccl/headers/include/cub/warp/warp_store.cuh +520 -0
  201. cuda/cccl/headers/include/cub/warp/warp_utils.cuh +61 -0
  202. cuda/cccl/headers/include/cuda/__algorithm/common.h +68 -0
  203. cuda/cccl/headers/include/cuda/__algorithm/copy.h +143 -0
  204. cuda/cccl/headers/include/cuda/__algorithm/fill.h +107 -0
  205. cuda/cccl/headers/include/cuda/__annotated_ptr/access_property.h +165 -0
  206. cuda/cccl/headers/include/cuda/__annotated_ptr/access_property_encoding.h +172 -0
  207. cuda/cccl/headers/include/cuda/__annotated_ptr/annotated_ptr.h +217 -0
  208. cuda/cccl/headers/include/cuda/__annotated_ptr/annotated_ptr_base.h +100 -0
  209. cuda/cccl/headers/include/cuda/__annotated_ptr/apply_access_property.h +83 -0
  210. cuda/cccl/headers/include/cuda/__annotated_ptr/associate_access_property.h +128 -0
  211. cuda/cccl/headers/include/cuda/__annotated_ptr/createpolicy.h +210 -0
  212. cuda/cccl/headers/include/cuda/__atomic/atomic.h +145 -0
  213. cuda/cccl/headers/include/cuda/__barrier/async_contract_fulfillment.h +39 -0
  214. cuda/cccl/headers/include/cuda/__barrier/barrier.h +65 -0
  215. cuda/cccl/headers/include/cuda/__barrier/barrier_arrive_tx.h +102 -0
  216. cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +466 -0
  217. cuda/cccl/headers/include/cuda/__barrier/barrier_expect_tx.h +74 -0
  218. cuda/cccl/headers/include/cuda/__barrier/barrier_native_handle.h +45 -0
  219. cuda/cccl/headers/include/cuda/__barrier/barrier_thread_scope.h +60 -0
  220. cuda/cccl/headers/include/cuda/__bit/bit_reverse.h +171 -0
  221. cuda/cccl/headers/include/cuda/__bit/bitfield.h +122 -0
  222. cuda/cccl/headers/include/cuda/__bit/bitmask.h +90 -0
  223. cuda/cccl/headers/include/cuda/__cccl_config +36 -0
  224. cuda/cccl/headers/include/cuda/__cmath/ceil_div.h +124 -0
  225. cuda/cccl/headers/include/cuda/__cmath/fast_modulo_division.h +249 -0
  226. cuda/cccl/headers/include/cuda/__cmath/ilog.h +195 -0
  227. cuda/cccl/headers/include/cuda/__cmath/ipow.h +107 -0
  228. cuda/cccl/headers/include/cuda/__cmath/isqrt.h +80 -0
  229. cuda/cccl/headers/include/cuda/__cmath/neg.h +47 -0
  230. cuda/cccl/headers/include/cuda/__cmath/pow2.h +74 -0
  231. cuda/cccl/headers/include/cuda/__cmath/round_down.h +102 -0
  232. cuda/cccl/headers/include/cuda/__cmath/round_up.h +104 -0
  233. cuda/cccl/headers/include/cuda/__cmath/uabs.h +57 -0
  234. cuda/cccl/headers/include/cuda/__device/all_devices.h +240 -0
  235. cuda/cccl/headers/include/cuda/__device/arch_traits.h +613 -0
  236. cuda/cccl/headers/include/cuda/__device/attributes.h +721 -0
  237. cuda/cccl/headers/include/cuda/__device/device_ref.h +176 -0
  238. cuda/cccl/headers/include/cuda/__device/physical_device.h +168 -0
  239. cuda/cccl/headers/include/cuda/__driver/driver_api.h +503 -0
  240. cuda/cccl/headers/include/cuda/__event/event.h +171 -0
  241. cuda/cccl/headers/include/cuda/__event/event_ref.h +158 -0
  242. cuda/cccl/headers/include/cuda/__event/timed_event.h +118 -0
  243. cuda/cccl/headers/include/cuda/__execution/determinism.h +91 -0
  244. cuda/cccl/headers/include/cuda/__execution/require.h +75 -0
  245. cuda/cccl/headers/include/cuda/__execution/tune.h +70 -0
  246. cuda/cccl/headers/include/cuda/__functional/address_stability.h +131 -0
  247. cuda/cccl/headers/include/cuda/__functional/for_each_canceled.h +276 -0
  248. cuda/cccl/headers/include/cuda/__functional/maximum.h +58 -0
  249. cuda/cccl/headers/include/cuda/__functional/minimum.h +58 -0
  250. cuda/cccl/headers/include/cuda/__functional/proclaim_return_type.h +109 -0
  251. cuda/cccl/headers/include/cuda/__fwd/barrier.h +38 -0
  252. cuda/cccl/headers/include/cuda/__fwd/barrier_native_handle.h +42 -0
  253. cuda/cccl/headers/include/cuda/__fwd/get_stream.h +38 -0
  254. cuda/cccl/headers/include/cuda/__fwd/pipeline.h +37 -0
  255. cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +49 -0
  256. cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +257 -0
  257. cuda/cccl/headers/include/cuda/__iterator/counting_iterator.h +460 -0
  258. cuda/cccl/headers/include/cuda/__iterator/discard_iterator.h +314 -0
  259. cuda/cccl/headers/include/cuda/__iterator/permutation_iterator.h +424 -0
  260. cuda/cccl/headers/include/cuda/__iterator/shuffle_iterator.h +292 -0
  261. cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +321 -0
  262. cuda/cccl/headers/include/cuda/__iterator/tabulate_output_iterator.h +335 -0
  263. cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +501 -0
  264. cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +496 -0
  265. cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +452 -0
  266. cuda/cccl/headers/include/cuda/__iterator/zip_function.h +94 -0
  267. cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +539 -0
  268. cuda/cccl/headers/include/cuda/__latch/latch.h +44 -0
  269. cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +462 -0
  270. cuda/cccl/headers/include/cuda/__mdspan/host_device_mdspan.h +63 -0
  271. cuda/cccl/headers/include/cuda/__mdspan/restrict_accessor.h +122 -0
  272. cuda/cccl/headers/include/cuda/__mdspan/restrict_mdspan.h +51 -0
  273. cuda/cccl/headers/include/cuda/__memcpy_async/check_preconditions.h +79 -0
  274. cuda/cccl/headers/include/cuda/__memcpy_async/completion_mechanism.h +47 -0
  275. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_bulk_shared_global.h +60 -0
  276. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_fallback.h +72 -0
  277. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_shared_global.h +148 -0
  278. cuda/cccl/headers/include/cuda/__memcpy_async/dispatch_memcpy_async.h +165 -0
  279. cuda/cccl/headers/include/cuda/__memcpy_async/is_local_smem_barrier.h +53 -0
  280. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async.h +179 -0
  281. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_barrier.h +99 -0
  282. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_tx.h +104 -0
  283. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_completion.h +170 -0
  284. cuda/cccl/headers/include/cuda/__memcpy_async/try_get_barrier_handle.h +59 -0
  285. cuda/cccl/headers/include/cuda/__memory/address_space.h +211 -0
  286. cuda/cccl/headers/include/cuda/__memory/align_down.h +56 -0
  287. cuda/cccl/headers/include/cuda/__memory/align_up.h +56 -0
  288. cuda/cccl/headers/include/cuda/__memory/aligned_size.h +61 -0
  289. cuda/cccl/headers/include/cuda/__memory/check_address.h +106 -0
  290. cuda/cccl/headers/include/cuda/__memory/discard_memory.h +64 -0
  291. cuda/cccl/headers/include/cuda/__memory/get_device_address.h +58 -0
  292. cuda/cccl/headers/include/cuda/__memory/is_aligned.h +47 -0
  293. cuda/cccl/headers/include/cuda/__memory/ptr_rebind.h +75 -0
  294. cuda/cccl/headers/include/cuda/__memory_resource/get_memory_resource.h +82 -0
  295. cuda/cccl/headers/include/cuda/__memory_resource/get_property.h +153 -0
  296. cuda/cccl/headers/include/cuda/__memory_resource/properties.h +69 -0
  297. cuda/cccl/headers/include/cuda/__memory_resource/resource.h +125 -0
  298. cuda/cccl/headers/include/cuda/__memory_resource/resource_ref.h +654 -0
  299. cuda/cccl/headers/include/cuda/__numeric/add_overflow.h +306 -0
  300. cuda/cccl/headers/include/cuda/__numeric/narrow.h +108 -0
  301. cuda/cccl/headers/include/cuda/__numeric/overflow_cast.h +59 -0
  302. cuda/cccl/headers/include/cuda/__numeric/overflow_result.h +43 -0
  303. cuda/cccl/headers/include/cuda/__nvtx/nvtx.h +120 -0
  304. cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +2982 -0
  305. cuda/cccl/headers/include/cuda/__ptx/instructions/barrier_cluster.h +43 -0
  306. cuda/cccl/headers/include/cuda/__ptx/instructions/bfind.h +41 -0
  307. cuda/cccl/headers/include/cuda/__ptx/instructions/bmsk.h +41 -0
  308. cuda/cccl/headers/include/cuda/__ptx/instructions/clusterlaunchcontrol.h +41 -0
  309. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk.h +44 -0
  310. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_commit_group.h +43 -0
  311. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h +45 -0
  312. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_wait_group.h +43 -0
  313. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_mbarrier_arrive.h +42 -0
  314. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_reduce_async_bulk.h +60 -0
  315. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_reduce_async_bulk_tensor.h +43 -0
  316. cuda/cccl/headers/include/cuda/__ptx/instructions/elect_sync.h +41 -0
  317. cuda/cccl/headers/include/cuda/__ptx/instructions/exit.h +41 -0
  318. cuda/cccl/headers/include/cuda/__ptx/instructions/fence.h +49 -0
  319. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/barrier_cluster.h +115 -0
  320. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/bfind.h +190 -0
  321. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/bmsk.h +54 -0
  322. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/clusterlaunchcontrol.h +242 -0
  323. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk.h +197 -0
  324. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.h +25 -0
  325. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.h +54 -0
  326. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.h +997 -0
  327. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_gather_scatter.h +318 -0
  328. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.h +671 -0
  329. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.h +46 -0
  330. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive.h +26 -0
  331. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive_noinc.h +26 -0
  332. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.h +1470 -0
  333. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.h +132 -0
  334. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.h +132 -0
  335. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.h +601 -0
  336. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/elect_sync.h +36 -0
  337. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/exit.h +25 -0
  338. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence.h +208 -0
  339. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.h +31 -0
  340. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_alias.h +25 -0
  341. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_async.h +58 -0
  342. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_async_generic_sync_restrict.h +64 -0
  343. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.h +102 -0
  344. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_sync_restrict.h +64 -0
  345. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/get_sreg.h +949 -0
  346. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/getctarank.h +32 -0
  347. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/ld.h +5542 -0
  348. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive.h +399 -0
  349. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.h +184 -0
  350. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.h +34 -0
  351. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_expect_tx.h +102 -0
  352. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_init.h +27 -0
  353. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.h +143 -0
  354. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.h +144 -0
  355. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.h +286 -0
  356. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.h +290 -0
  357. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_ld_reduce.h +2202 -0
  358. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_red.h +1362 -0
  359. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_st.h +236 -0
  360. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/prmt.h +230 -0
  361. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/red_async.h +460 -0
  362. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/shl.h +96 -0
  363. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/shr.h +168 -0
  364. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st.h +1490 -0
  365. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st_async.h +123 -0
  366. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st_bulk.h +31 -0
  367. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_alloc.h +132 -0
  368. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_commit.h +99 -0
  369. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_cp.h +765 -0
  370. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_fence.h +58 -0
  371. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_ld.h +4927 -0
  372. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_mma.h +4291 -0
  373. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_mma_ws.h +7110 -0
  374. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_shift.h +42 -0
  375. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_st.h +5063 -0
  376. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_wait.h +56 -0
  377. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.h +71 -0
  378. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tensormap_replace.h +1030 -0
  379. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/trap.h +25 -0
  380. cuda/cccl/headers/include/cuda/__ptx/instructions/get_sreg.h +43 -0
  381. cuda/cccl/headers/include/cuda/__ptx/instructions/getctarank.h +43 -0
  382. cuda/cccl/headers/include/cuda/__ptx/instructions/ld.h +41 -0
  383. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_arrive.h +45 -0
  384. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_expect_tx.h +41 -0
  385. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_init.h +43 -0
  386. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_wait.h +46 -0
  387. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_ld_reduce.h +41 -0
  388. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_red.h +41 -0
  389. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_st.h +41 -0
  390. cuda/cccl/headers/include/cuda/__ptx/instructions/prmt.h +41 -0
  391. cuda/cccl/headers/include/cuda/__ptx/instructions/red_async.h +43 -0
  392. cuda/cccl/headers/include/cuda/__ptx/instructions/shfl_sync.h +244 -0
  393. cuda/cccl/headers/include/cuda/__ptx/instructions/shl.h +41 -0
  394. cuda/cccl/headers/include/cuda/__ptx/instructions/shr.h +41 -0
  395. cuda/cccl/headers/include/cuda/__ptx/instructions/st.h +41 -0
  396. cuda/cccl/headers/include/cuda/__ptx/instructions/st_async.h +43 -0
  397. cuda/cccl/headers/include/cuda/__ptx/instructions/st_bulk.h +41 -0
  398. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_alloc.h +41 -0
  399. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_commit.h +41 -0
  400. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_cp.h +41 -0
  401. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_fence.h +41 -0
  402. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_ld.h +41 -0
  403. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_mma.h +41 -0
  404. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_mma_ws.h +41 -0
  405. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_shift.h +41 -0
  406. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_st.h +41 -0
  407. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_wait.h +41 -0
  408. cuda/cccl/headers/include/cuda/__ptx/instructions/tensormap_cp_fenceproxy.h +43 -0
  409. cuda/cccl/headers/include/cuda/__ptx/instructions/tensormap_replace.h +43 -0
  410. cuda/cccl/headers/include/cuda/__ptx/instructions/trap.h +41 -0
  411. cuda/cccl/headers/include/cuda/__ptx/ptx_dot_variants.h +230 -0
  412. cuda/cccl/headers/include/cuda/__ptx/ptx_helper_functions.h +176 -0
  413. cuda/cccl/headers/include/cuda/__random/feistel_bijection.h +105 -0
  414. cuda/cccl/headers/include/cuda/__random/random_bijection.h +88 -0
  415. cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +97 -0
  416. cuda/cccl/headers/include/cuda/__semaphore/counting_semaphore.h +53 -0
  417. cuda/cccl/headers/include/cuda/__stream/get_stream.h +110 -0
  418. cuda/cccl/headers/include/cuda/__stream/stream.h +142 -0
  419. cuda/cccl/headers/include/cuda/__stream/stream_ref.h +296 -0
  420. cuda/cccl/headers/include/cuda/__type_traits/is_floating_point.h +47 -0
  421. cuda/cccl/headers/include/cuda/__type_traits/is_specialization_of.h +37 -0
  422. cuda/cccl/headers/include/cuda/__utility/__basic_any/access.h +88 -0
  423. cuda/cccl/headers/include/cuda/__utility/__basic_any/any_cast.h +83 -0
  424. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_base.h +148 -0
  425. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_from.h +96 -0
  426. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_fwd.h +128 -0
  427. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_ptr.h +304 -0
  428. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_ref.h +337 -0
  429. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_value.h +521 -0
  430. cuda/cccl/headers/include/cuda/__utility/__basic_any/conversions.h +169 -0
  431. cuda/cccl/headers/include/cuda/__utility/__basic_any/dynamic_any_cast.h +107 -0
  432. cuda/cccl/headers/include/cuda/__utility/__basic_any/interfaces.h +359 -0
  433. cuda/cccl/headers/include/cuda/__utility/__basic_any/iset.h +142 -0
  434. cuda/cccl/headers/include/cuda/__utility/__basic_any/overrides.h +64 -0
  435. cuda/cccl/headers/include/cuda/__utility/__basic_any/rtti.h +257 -0
  436. cuda/cccl/headers/include/cuda/__utility/__basic_any/semiregular.h +322 -0
  437. cuda/cccl/headers/include/cuda/__utility/__basic_any/storage.h +78 -0
  438. cuda/cccl/headers/include/cuda/__utility/__basic_any/tagged_ptr.h +58 -0
  439. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtcall.h +162 -0
  440. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_functions.h +184 -0
  441. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_ptrs.h +80 -0
  442. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_tables.h +155 -0
  443. cuda/cccl/headers/include/cuda/__utility/basic_any.h +507 -0
  444. cuda/cccl/headers/include/cuda/__utility/immovable.h +50 -0
  445. cuda/cccl/headers/include/cuda/__utility/inherit.h +36 -0
  446. cuda/cccl/headers/include/cuda/__utility/no_init.h +29 -0
  447. cuda/cccl/headers/include/cuda/__utility/static_for.h +79 -0
  448. cuda/cccl/headers/include/cuda/__warp/lane_mask.h +326 -0
  449. cuda/cccl/headers/include/cuda/__warp/warp_match_all.h +65 -0
  450. cuda/cccl/headers/include/cuda/__warp/warp_shuffle.h +251 -0
  451. cuda/cccl/headers/include/cuda/access_property +26 -0
  452. cuda/cccl/headers/include/cuda/algorithm +27 -0
  453. cuda/cccl/headers/include/cuda/annotated_ptr +29 -0
  454. cuda/cccl/headers/include/cuda/atomic +27 -0
  455. cuda/cccl/headers/include/cuda/barrier +267 -0
  456. cuda/cccl/headers/include/cuda/bit +29 -0
  457. cuda/cccl/headers/include/cuda/cmath +36 -0
  458. cuda/cccl/headers/include/cuda/devices +20 -0
  459. cuda/cccl/headers/include/cuda/discard_memory +32 -0
  460. cuda/cccl/headers/include/cuda/functional +32 -0
  461. cuda/cccl/headers/include/cuda/iterator +38 -0
  462. cuda/cccl/headers/include/cuda/latch +27 -0
  463. cuda/cccl/headers/include/cuda/mdspan +28 -0
  464. cuda/cccl/headers/include/cuda/memory +34 -0
  465. cuda/cccl/headers/include/cuda/memory_resource +35 -0
  466. cuda/cccl/headers/include/cuda/numeric +29 -0
  467. cuda/cccl/headers/include/cuda/pipeline +578 -0
  468. cuda/cccl/headers/include/cuda/ptx +128 -0
  469. cuda/cccl/headers/include/cuda/semaphore +31 -0
  470. cuda/cccl/headers/include/cuda/std/__algorithm/adjacent_find.h +59 -0
  471. cuda/cccl/headers/include/cuda/std/__algorithm/all_of.h +45 -0
  472. cuda/cccl/headers/include/cuda/std/__algorithm/any_of.h +45 -0
  473. cuda/cccl/headers/include/cuda/std/__algorithm/binary_search.h +53 -0
  474. cuda/cccl/headers/include/cuda/std/__algorithm/clamp.h +48 -0
  475. cuda/cccl/headers/include/cuda/std/__algorithm/comp.h +64 -0
  476. cuda/cccl/headers/include/cuda/std/__algorithm/comp_ref_type.h +85 -0
  477. cuda/cccl/headers/include/cuda/std/__algorithm/copy.h +142 -0
  478. cuda/cccl/headers/include/cuda/std/__algorithm/copy_backward.h +80 -0
  479. cuda/cccl/headers/include/cuda/std/__algorithm/copy_if.h +47 -0
  480. cuda/cccl/headers/include/cuda/std/__algorithm/copy_n.h +73 -0
  481. cuda/cccl/headers/include/cuda/std/__algorithm/count.h +49 -0
  482. cuda/cccl/headers/include/cuda/std/__algorithm/count_if.h +49 -0
  483. cuda/cccl/headers/include/cuda/std/__algorithm/equal.h +128 -0
  484. cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +101 -0
  485. cuda/cccl/headers/include/cuda/std/__algorithm/fill.h +58 -0
  486. cuda/cccl/headers/include/cuda/std/__algorithm/fill_n.h +51 -0
  487. cuda/cccl/headers/include/cuda/std/__algorithm/find.h +62 -0
  488. cuda/cccl/headers/include/cuda/std/__algorithm/find_end.h +225 -0
  489. cuda/cccl/headers/include/cuda/std/__algorithm/find_first_of.h +73 -0
  490. cuda/cccl/headers/include/cuda/std/__algorithm/find_if.h +46 -0
  491. cuda/cccl/headers/include/cuda/std/__algorithm/find_if_not.h +46 -0
  492. cuda/cccl/headers/include/cuda/std/__algorithm/for_each.h +42 -0
  493. cuda/cccl/headers/include/cuda/std/__algorithm/for_each_n.h +48 -0
  494. cuda/cccl/headers/include/cuda/std/__algorithm/generate.h +41 -0
  495. cuda/cccl/headers/include/cuda/std/__algorithm/generate_n.h +46 -0
  496. cuda/cccl/headers/include/cuda/std/__algorithm/half_positive.h +49 -0
  497. cuda/cccl/headers/include/cuda/std/__algorithm/in_fun_result.h +55 -0
  498. cuda/cccl/headers/include/cuda/std/__algorithm/includes.h +92 -0
  499. cuda/cccl/headers/include/cuda/std/__algorithm/is_heap.h +50 -0
  500. cuda/cccl/headers/include/cuda/std/__algorithm/is_heap_until.h +83 -0
  501. cuda/cccl/headers/include/cuda/std/__algorithm/is_partitioned.h +57 -0
  502. cuda/cccl/headers/include/cuda/std/__algorithm/is_permutation.h +252 -0
  503. cuda/cccl/headers/include/cuda/std/__algorithm/is_sorted.h +49 -0
  504. cuda/cccl/headers/include/cuda/std/__algorithm/is_sorted_until.h +68 -0
  505. cuda/cccl/headers/include/cuda/std/__algorithm/iter_swap.h +82 -0
  506. cuda/cccl/headers/include/cuda/std/__algorithm/iterator_operations.h +185 -0
  507. cuda/cccl/headers/include/cuda/std/__algorithm/lexicographical_compare.h +68 -0
  508. cuda/cccl/headers/include/cuda/std/__algorithm/lower_bound.h +82 -0
  509. cuda/cccl/headers/include/cuda/std/__algorithm/make_heap.h +70 -0
  510. cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +96 -0
  511. cuda/cccl/headers/include/cuda/std/__algorithm/max.h +62 -0
  512. cuda/cccl/headers/include/cuda/std/__algorithm/max_element.h +68 -0
  513. cuda/cccl/headers/include/cuda/std/__algorithm/merge.h +89 -0
  514. cuda/cccl/headers/include/cuda/std/__algorithm/min.h +62 -0
  515. cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +87 -0
  516. cuda/cccl/headers/include/cuda/std/__algorithm/minmax.h +66 -0
  517. cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +141 -0
  518. cuda/cccl/headers/include/cuda/std/__algorithm/mismatch.h +83 -0
  519. cuda/cccl/headers/include/cuda/std/__algorithm/move.h +86 -0
  520. cuda/cccl/headers/include/cuda/std/__algorithm/move_backward.h +84 -0
  521. cuda/cccl/headers/include/cuda/std/__algorithm/next_permutation.h +88 -0
  522. cuda/cccl/headers/include/cuda/std/__algorithm/none_of.h +45 -0
  523. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort.h +102 -0
  524. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort_copy.h +122 -0
  525. cuda/cccl/headers/include/cuda/std/__algorithm/partition.h +120 -0
  526. cuda/cccl/headers/include/cuda/std/__algorithm/partition_copy.h +59 -0
  527. cuda/cccl/headers/include/cuda/std/__algorithm/partition_point.h +61 -0
  528. cuda/cccl/headers/include/cuda/std/__algorithm/pop_heap.h +94 -0
  529. cuda/cccl/headers/include/cuda/std/__algorithm/prev_permutation.h +88 -0
  530. cuda/cccl/headers/include/cuda/std/__algorithm/push_heap.h +101 -0
  531. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_for_each.h +84 -0
  532. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_for_each_n.h +68 -0
  533. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_iterator_concept.h +65 -0
  534. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_min.h +98 -0
  535. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_min_element.h +68 -0
  536. cuda/cccl/headers/include/cuda/std/__algorithm/remove.h +55 -0
  537. cuda/cccl/headers/include/cuda/std/__algorithm/remove_copy.h +47 -0
  538. cuda/cccl/headers/include/cuda/std/__algorithm/remove_copy_if.h +47 -0
  539. cuda/cccl/headers/include/cuda/std/__algorithm/remove_if.h +56 -0
  540. cuda/cccl/headers/include/cuda/std/__algorithm/replace.h +45 -0
  541. cuda/cccl/headers/include/cuda/std/__algorithm/replace_copy.h +54 -0
  542. cuda/cccl/headers/include/cuda/std/__algorithm/replace_copy_if.h +50 -0
  543. cuda/cccl/headers/include/cuda/std/__algorithm/replace_if.h +45 -0
  544. cuda/cccl/headers/include/cuda/std/__algorithm/reverse.h +81 -0
  545. cuda/cccl/headers/include/cuda/std/__algorithm/reverse_copy.h +43 -0
  546. cuda/cccl/headers/include/cuda/std/__algorithm/rotate.h +261 -0
  547. cuda/cccl/headers/include/cuda/std/__algorithm/rotate_copy.h +40 -0
  548. cuda/cccl/headers/include/cuda/std/__algorithm/search.h +185 -0
  549. cuda/cccl/headers/include/cuda/std/__algorithm/search_n.h +163 -0
  550. cuda/cccl/headers/include/cuda/std/__algorithm/set_difference.h +95 -0
  551. cuda/cccl/headers/include/cuda/std/__algorithm/set_intersection.h +122 -0
  552. cuda/cccl/headers/include/cuda/std/__algorithm/set_symmetric_difference.h +134 -0
  553. cuda/cccl/headers/include/cuda/std/__algorithm/set_union.h +128 -0
  554. cuda/cccl/headers/include/cuda/std/__algorithm/shift_left.h +84 -0
  555. cuda/cccl/headers/include/cuda/std/__algorithm/shift_right.h +144 -0
  556. cuda/cccl/headers/include/cuda/std/__algorithm/sift_down.h +139 -0
  557. cuda/cccl/headers/include/cuda/std/__algorithm/sort_heap.h +70 -0
  558. cuda/cccl/headers/include/cuda/std/__algorithm/swap_ranges.h +78 -0
  559. cuda/cccl/headers/include/cuda/std/__algorithm/transform.h +59 -0
  560. cuda/cccl/headers/include/cuda/std/__algorithm/unique.h +76 -0
  561. cuda/cccl/headers/include/cuda/std/__algorithm/unique_copy.h +155 -0
  562. cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_iter.h +95 -0
  563. cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_range.h +126 -0
  564. cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +83 -0
  565. cuda/cccl/headers/include/cuda/std/__algorithm_ +26 -0
  566. cuda/cccl/headers/include/cuda/std/__atomic/api/common.h +192 -0
  567. cuda/cccl/headers/include/cuda/std/__atomic/api/owned.h +138 -0
  568. cuda/cccl/headers/include/cuda/std/__atomic/api/reference.h +118 -0
  569. cuda/cccl/headers/include/cuda/std/__atomic/functions/common.h +58 -0
  570. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_local.h +208 -0
  571. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_derived.h +401 -0
  572. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated.h +3971 -0
  573. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h +177 -0
  574. cuda/cccl/headers/include/cuda/std/__atomic/functions/host.h +211 -0
  575. cuda/cccl/headers/include/cuda/std/__atomic/functions.h +33 -0
  576. cuda/cccl/headers/include/cuda/std/__atomic/order.h +159 -0
  577. cuda/cccl/headers/include/cuda/std/__atomic/platform/msvc_to_builtins.h +654 -0
  578. cuda/cccl/headers/include/cuda/std/__atomic/platform.h +93 -0
  579. cuda/cccl/headers/include/cuda/std/__atomic/scopes.h +105 -0
  580. cuda/cccl/headers/include/cuda/std/__atomic/types/base.h +249 -0
  581. cuda/cccl/headers/include/cuda/std/__atomic/types/common.h +104 -0
  582. cuda/cccl/headers/include/cuda/std/__atomic/types/locked.h +225 -0
  583. cuda/cccl/headers/include/cuda/std/__atomic/types/reference.h +72 -0
  584. cuda/cccl/headers/include/cuda/std/__atomic/types/small.h +228 -0
  585. cuda/cccl/headers/include/cuda/std/__atomic/types.h +52 -0
  586. cuda/cccl/headers/include/cuda/std/__atomic/wait/notify_wait.h +95 -0
  587. cuda/cccl/headers/include/cuda/std/__atomic/wait/polling.h +65 -0
  588. cuda/cccl/headers/include/cuda/std/__barrier/barrier.h +227 -0
  589. cuda/cccl/headers/include/cuda/std/__barrier/empty_completion.h +37 -0
  590. cuda/cccl/headers/include/cuda/std/__barrier/poll_tester.h +82 -0
  591. cuda/cccl/headers/include/cuda/std/__bit/bit_cast.h +76 -0
  592. cuda/cccl/headers/include/cuda/std/__bit/byteswap.h +185 -0
  593. cuda/cccl/headers/include/cuda/std/__bit/countl.h +167 -0
  594. cuda/cccl/headers/include/cuda/std/__bit/countr.h +185 -0
  595. cuda/cccl/headers/include/cuda/std/__bit/endian.h +39 -0
  596. cuda/cccl/headers/include/cuda/std/__bit/has_single_bit.h +43 -0
  597. cuda/cccl/headers/include/cuda/std/__bit/integral.h +126 -0
  598. cuda/cccl/headers/include/cuda/std/__bit/popcount.h +154 -0
  599. cuda/cccl/headers/include/cuda/std/__bit/reference.h +1272 -0
  600. cuda/cccl/headers/include/cuda/std/__bit/rotate.h +94 -0
  601. cuda/cccl/headers/include/cuda/std/__cccl/architecture.h +78 -0
  602. cuda/cccl/headers/include/cuda/std/__cccl/assert.h +150 -0
  603. cuda/cccl/headers/include/cuda/std/__cccl/attributes.h +206 -0
  604. cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +787 -0
  605. cuda/cccl/headers/include/cuda/std/__cccl/compiler.h +217 -0
  606. cuda/cccl/headers/include/cuda/std/__cccl/cuda_capabilities.h +51 -0
  607. cuda/cccl/headers/include/cuda/std/__cccl/cuda_toolkit.h +53 -0
  608. cuda/cccl/headers/include/cuda/std/__cccl/deprecated.h +69 -0
  609. cuda/cccl/headers/include/cuda/std/__cccl/diagnostic.h +131 -0
  610. cuda/cccl/headers/include/cuda/std/__cccl/dialect.h +123 -0
  611. cuda/cccl/headers/include/cuda/std/__cccl/epilogue.h +344 -0
  612. cuda/cccl/headers/include/cuda/std/__cccl/exceptions.h +79 -0
  613. cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +68 -0
  614. cuda/cccl/headers/include/cuda/std/__cccl/extended_data_types.h +133 -0
  615. cuda/cccl/headers/include/cuda/std/__cccl/is_non_narrowing_convertible.h +73 -0
  616. cuda/cccl/headers/include/cuda/std/__cccl/os.h +48 -0
  617. cuda/cccl/headers/include/cuda/std/__cccl/preprocessor.h +1276 -0
  618. cuda/cccl/headers/include/cuda/std/__cccl/prologue.h +281 -0
  619. cuda/cccl/headers/include/cuda/std/__cccl/ptx_isa.h +253 -0
  620. cuda/cccl/headers/include/cuda/std/__cccl/rtti.h +72 -0
  621. cuda/cccl/headers/include/cuda/std/__cccl/sequence_access.h +87 -0
  622. cuda/cccl/headers/include/cuda/std/__cccl/system_header.h +38 -0
  623. cuda/cccl/headers/include/cuda/std/__cccl/unreachable.h +31 -0
  624. cuda/cccl/headers/include/cuda/std/__cccl/version.h +26 -0
  625. cuda/cccl/headers/include/cuda/std/__cccl/visibility.h +146 -0
  626. cuda/cccl/headers/include/cuda/std/__charconv/chars_format.h +81 -0
  627. cuda/cccl/headers/include/cuda/std/__charconv/from_chars.h +154 -0
  628. cuda/cccl/headers/include/cuda/std/__charconv/from_chars_result.h +56 -0
  629. cuda/cccl/headers/include/cuda/std/__charconv/to_chars.h +148 -0
  630. cuda/cccl/headers/include/cuda/std/__charconv/to_chars_result.h +56 -0
  631. cuda/cccl/headers/include/cuda/std/__charconv_ +31 -0
  632. cuda/cccl/headers/include/cuda/std/__cmath/abs.h +127 -0
  633. cuda/cccl/headers/include/cuda/std/__cmath/copysign.h +88 -0
  634. cuda/cccl/headers/include/cuda/std/__cmath/exponential_functions.h +724 -0
  635. cuda/cccl/headers/include/cuda/std/__cmath/fma.h +125 -0
  636. cuda/cccl/headers/include/cuda/std/__cmath/fpclassify.h +216 -0
  637. cuda/cccl/headers/include/cuda/std/__cmath/gamma.h +205 -0
  638. cuda/cccl/headers/include/cuda/std/__cmath/hyperbolic_functions.h +286 -0
  639. cuda/cccl/headers/include/cuda/std/__cmath/hypot.h +221 -0
  640. cuda/cccl/headers/include/cuda/std/__cmath/inverse_hyperbolic_functions.h +286 -0
  641. cuda/cccl/headers/include/cuda/std/__cmath/inverse_trigonometric_functions.h +371 -0
  642. cuda/cccl/headers/include/cuda/std/__cmath/isfinite.h +167 -0
  643. cuda/cccl/headers/include/cuda/std/__cmath/isinf.h +205 -0
  644. cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +180 -0
  645. cuda/cccl/headers/include/cuda/std/__cmath/isnormal.h +138 -0
  646. cuda/cccl/headers/include/cuda/std/__cmath/lerp.h +101 -0
  647. cuda/cccl/headers/include/cuda/std/__cmath/logarithms.h +582 -0
  648. cuda/cccl/headers/include/cuda/std/__cmath/min_max.h +260 -0
  649. cuda/cccl/headers/include/cuda/std/__cmath/modulo.h +208 -0
  650. cuda/cccl/headers/include/cuda/std/__cmath/remainder.h +206 -0
  651. cuda/cccl/headers/include/cuda/std/__cmath/roots.h +199 -0
  652. cuda/cccl/headers/include/cuda/std/__cmath/rounding_functions.h +984 -0
  653. cuda/cccl/headers/include/cuda/std/__cmath/signbit.h +56 -0
  654. cuda/cccl/headers/include/cuda/std/__cmath/traits.h +238 -0
  655. cuda/cccl/headers/include/cuda/std/__cmath/trigonometric_functions.h +328 -0
  656. cuda/cccl/headers/include/cuda/std/__complex/arg.h +84 -0
  657. cuda/cccl/headers/include/cuda/std/__complex/complex.h +676 -0
  658. cuda/cccl/headers/include/cuda/std/__complex/exponential_functions.h +411 -0
  659. cuda/cccl/headers/include/cuda/std/__complex/hyperbolic_functions.h +117 -0
  660. cuda/cccl/headers/include/cuda/std/__complex/inverse_hyperbolic_functions.h +216 -0
  661. cuda/cccl/headers/include/cuda/std/__complex/inverse_trigonometric_functions.h +131 -0
  662. cuda/cccl/headers/include/cuda/std/__complex/literals.h +106 -0
  663. cuda/cccl/headers/include/cuda/std/__complex/logarithms.h +302 -0
  664. cuda/cccl/headers/include/cuda/std/__complex/math.h +159 -0
  665. cuda/cccl/headers/include/cuda/std/__complex/nvbf16.h +322 -0
  666. cuda/cccl/headers/include/cuda/std/__complex/nvfp16.h +321 -0
  667. cuda/cccl/headers/include/cuda/std/__complex/roots.h +214 -0
  668. cuda/cccl/headers/include/cuda/std/__complex/trigonometric_functions.h +61 -0
  669. cuda/cccl/headers/include/cuda/std/__complex/tuple.h +107 -0
  670. cuda/cccl/headers/include/cuda/std/__complex/vector_support.h +130 -0
  671. cuda/cccl/headers/include/cuda/std/__concepts/arithmetic.h +56 -0
  672. cuda/cccl/headers/include/cuda/std/__concepts/assignable.h +64 -0
  673. cuda/cccl/headers/include/cuda/std/__concepts/boolean_testable.h +63 -0
  674. cuda/cccl/headers/include/cuda/std/__concepts/class_or_enum.h +45 -0
  675. cuda/cccl/headers/include/cuda/std/__concepts/common_reference_with.h +69 -0
  676. cuda/cccl/headers/include/cuda/std/__concepts/common_with.h +82 -0
  677. cuda/cccl/headers/include/cuda/std/__concepts/concept_macros.h +274 -0
  678. cuda/cccl/headers/include/cuda/std/__concepts/constructible.h +107 -0
  679. cuda/cccl/headers/include/cuda/std/__concepts/convertible_to.h +70 -0
  680. cuda/cccl/headers/include/cuda/std/__concepts/copyable.h +60 -0
  681. cuda/cccl/headers/include/cuda/std/__concepts/derived_from.h +56 -0
  682. cuda/cccl/headers/include/cuda/std/__concepts/destructible.h +76 -0
  683. cuda/cccl/headers/include/cuda/std/__concepts/different_from.h +38 -0
  684. cuda/cccl/headers/include/cuda/std/__concepts/equality_comparable.h +100 -0
  685. cuda/cccl/headers/include/cuda/std/__concepts/invocable.h +80 -0
  686. cuda/cccl/headers/include/cuda/std/__concepts/movable.h +58 -0
  687. cuda/cccl/headers/include/cuda/std/__concepts/predicate.h +54 -0
  688. cuda/cccl/headers/include/cuda/std/__concepts/regular.h +54 -0
  689. cuda/cccl/headers/include/cuda/std/__concepts/relation.h +77 -0
  690. cuda/cccl/headers/include/cuda/std/__concepts/same_as.h +42 -0
  691. cuda/cccl/headers/include/cuda/std/__concepts/semiregular.h +54 -0
  692. cuda/cccl/headers/include/cuda/std/__concepts/swappable.h +206 -0
  693. cuda/cccl/headers/include/cuda/std/__concepts/totally_ordered.h +101 -0
  694. cuda/cccl/headers/include/cuda/std/__cstddef/byte.h +113 -0
  695. cuda/cccl/headers/include/cuda/std/__cstddef/types.h +52 -0
  696. cuda/cccl/headers/include/cuda/std/__cstdlib/abs.h +57 -0
  697. cuda/cccl/headers/include/cuda/std/__cstdlib/aligned_alloc.h +66 -0
  698. cuda/cccl/headers/include/cuda/std/__cstdlib/div.h +96 -0
  699. cuda/cccl/headers/include/cuda/std/__cstdlib/malloc.h +69 -0
  700. cuda/cccl/headers/include/cuda/std/__cstring/memcpy.h +58 -0
  701. cuda/cccl/headers/include/cuda/std/__cstring/memset.h +46 -0
  702. cuda/cccl/headers/include/cuda/std/__cuda/api_wrapper.h +62 -0
  703. cuda/cccl/headers/include/cuda/std/__cuda/ensure_current_device.h +72 -0
  704. cuda/cccl/headers/include/cuda/std/__exception/cuda_error.h +146 -0
  705. cuda/cccl/headers/include/cuda/std/__exception/terminate.h +73 -0
  706. cuda/cccl/headers/include/cuda/std/__execution/env.h +455 -0
  707. cuda/cccl/headers/include/cuda/std/__execution/policy.h +88 -0
  708. cuda/cccl/headers/include/cuda/std/__expected/bad_expected_access.h +127 -0
  709. cuda/cccl/headers/include/cuda/std/__expected/expected.h +1963 -0
  710. cuda/cccl/headers/include/cuda/std/__expected/expected_base.h +1050 -0
  711. cuda/cccl/headers/include/cuda/std/__expected/unexpect.h +37 -0
  712. cuda/cccl/headers/include/cuda/std/__expected/unexpected.h +172 -0
  713. cuda/cccl/headers/include/cuda/std/__floating_point/arithmetic.h +56 -0
  714. cuda/cccl/headers/include/cuda/std/__floating_point/cast.h +809 -0
  715. cuda/cccl/headers/include/cuda/std/__floating_point/cccl_fp.h +125 -0
  716. cuda/cccl/headers/include/cuda/std/__floating_point/common_type.h +48 -0
  717. cuda/cccl/headers/include/cuda/std/__floating_point/constants.h +374 -0
  718. cuda/cccl/headers/include/cuda/std/__floating_point/conversion_rank_order.h +124 -0
  719. cuda/cccl/headers/include/cuda/std/__floating_point/cuda_fp_types.h +113 -0
  720. cuda/cccl/headers/include/cuda/std/__floating_point/format.h +162 -0
  721. cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +39 -0
  722. cuda/cccl/headers/include/cuda/std/__floating_point/mask.h +72 -0
  723. cuda/cccl/headers/include/cuda/std/__floating_point/native_type.h +81 -0
  724. cuda/cccl/headers/include/cuda/std/__floating_point/overflow_handler.h +139 -0
  725. cuda/cccl/headers/include/cuda/std/__floating_point/properties.h +229 -0
  726. cuda/cccl/headers/include/cuda/std/__floating_point/storage.h +248 -0
  727. cuda/cccl/headers/include/cuda/std/__floating_point/traits.h +172 -0
  728. cuda/cccl/headers/include/cuda/std/__format/buffer.h +48 -0
  729. cuda/cccl/headers/include/cuda/std/__format/concepts.h +69 -0
  730. cuda/cccl/headers/include/cuda/std/__format/format_arg.h +282 -0
  731. cuda/cccl/headers/include/cuda/std/__format/format_arg_store.h +279 -0
  732. cuda/cccl/headers/include/cuda/std/__format/format_args.h +122 -0
  733. cuda/cccl/headers/include/cuda/std/__format/format_context.h +92 -0
  734. cuda/cccl/headers/include/cuda/std/__format/format_error.h +76 -0
  735. cuda/cccl/headers/include/cuda/std/__format/format_integral.h +237 -0
  736. cuda/cccl/headers/include/cuda/std/__format/format_parse_context.h +124 -0
  737. cuda/cccl/headers/include/cuda/std/__format/format_spec_parser.h +1230 -0
  738. cuda/cccl/headers/include/cuda/std/__format/formatter.h +59 -0
  739. cuda/cccl/headers/include/cuda/std/__format/formatters/bool.h +101 -0
  740. cuda/cccl/headers/include/cuda/std/__format/formatters/char.h +124 -0
  741. cuda/cccl/headers/include/cuda/std/__format/formatters/fp.h +101 -0
  742. cuda/cccl/headers/include/cuda/std/__format/formatters/int.h +174 -0
  743. cuda/cccl/headers/include/cuda/std/__format/formatters/ptr.h +104 -0
  744. cuda/cccl/headers/include/cuda/std/__format/formatters/str.h +178 -0
  745. cuda/cccl/headers/include/cuda/std/__format/output_utils.h +272 -0
  746. cuda/cccl/headers/include/cuda/std/__format/parse_arg_id.h +138 -0
  747. cuda/cccl/headers/include/cuda/std/__format_ +45 -0
  748. cuda/cccl/headers/include/cuda/std/__functional/binary_function.h +63 -0
  749. cuda/cccl/headers/include/cuda/std/__functional/binary_negate.h +65 -0
  750. cuda/cccl/headers/include/cuda/std/__functional/bind.h +337 -0
  751. cuda/cccl/headers/include/cuda/std/__functional/bind_back.h +80 -0
  752. cuda/cccl/headers/include/cuda/std/__functional/bind_front.h +73 -0
  753. cuda/cccl/headers/include/cuda/std/__functional/binder1st.h +74 -0
  754. cuda/cccl/headers/include/cuda/std/__functional/binder2nd.h +74 -0
  755. cuda/cccl/headers/include/cuda/std/__functional/compose.h +68 -0
  756. cuda/cccl/headers/include/cuda/std/__functional/default_searcher.h +75 -0
  757. cuda/cccl/headers/include/cuda/std/__functional/function.h +1279 -0
  758. cuda/cccl/headers/include/cuda/std/__functional/hash.h +650 -0
  759. cuda/cccl/headers/include/cuda/std/__functional/identity.h +61 -0
  760. cuda/cccl/headers/include/cuda/std/__functional/invoke.h +560 -0
  761. cuda/cccl/headers/include/cuda/std/__functional/is_transparent.h +43 -0
  762. cuda/cccl/headers/include/cuda/std/__functional/mem_fn.h +67 -0
  763. cuda/cccl/headers/include/cuda/std/__functional/mem_fun_ref.h +213 -0
  764. cuda/cccl/headers/include/cuda/std/__functional/not_fn.h +120 -0
  765. cuda/cccl/headers/include/cuda/std/__functional/operations.h +534 -0
  766. cuda/cccl/headers/include/cuda/std/__functional/perfect_forward.h +128 -0
  767. cuda/cccl/headers/include/cuda/std/__functional/pointer_to_binary_function.h +65 -0
  768. cuda/cccl/headers/include/cuda/std/__functional/pointer_to_unary_function.h +64 -0
  769. cuda/cccl/headers/include/cuda/std/__functional/ranges_operations.h +113 -0
  770. cuda/cccl/headers/include/cuda/std/__functional/reference_wrapper.h +113 -0
  771. cuda/cccl/headers/include/cuda/std/__functional/unary_function.h +62 -0
  772. cuda/cccl/headers/include/cuda/std/__functional/unary_negate.h +65 -0
  773. cuda/cccl/headers/include/cuda/std/__functional/unwrap_ref.h +56 -0
  774. cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +277 -0
  775. cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +35 -0
  776. cuda/cccl/headers/include/cuda/std/__fwd/array.h +42 -0
  777. cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +49 -0
  778. cuda/cccl/headers/include/cuda/std/__fwd/complex.h +34 -0
  779. cuda/cccl/headers/include/cuda/std/__fwd/format.h +84 -0
  780. cuda/cccl/headers/include/cuda/std/__fwd/fp.h +37 -0
  781. cuda/cccl/headers/include/cuda/std/__fwd/get.h +123 -0
  782. cuda/cccl/headers/include/cuda/std/__fwd/hash.h +34 -0
  783. cuda/cccl/headers/include/cuda/std/__fwd/iterator.h +43 -0
  784. cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +90 -0
  785. cuda/cccl/headers/include/cuda/std/__fwd/memory_resource.h +37 -0
  786. cuda/cccl/headers/include/cuda/std/__fwd/optional.h +39 -0
  787. cuda/cccl/headers/include/cuda/std/__fwd/pair.h +34 -0
  788. cuda/cccl/headers/include/cuda/std/__fwd/reference_wrapper.h +34 -0
  789. cuda/cccl/headers/include/cuda/std/__fwd/span.h +45 -0
  790. cuda/cccl/headers/include/cuda/std/__fwd/string.h +83 -0
  791. cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +59 -0
  792. cuda/cccl/headers/include/cuda/std/__fwd/subrange.h +55 -0
  793. cuda/cccl/headers/include/cuda/std/__fwd/tuple.h +34 -0
  794. cuda/cccl/headers/include/cuda/std/__internal/cpp_dialect.h +44 -0
  795. cuda/cccl/headers/include/cuda/std/__internal/features.h +71 -0
  796. cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +122 -0
  797. cuda/cccl/headers/include/cuda/std/__iterator/access.h +128 -0
  798. cuda/cccl/headers/include/cuda/std/__iterator/advance.h +228 -0
  799. cuda/cccl/headers/include/cuda/std/__iterator/back_insert_iterator.h +163 -0
  800. cuda/cccl/headers/include/cuda/std/__iterator/bounded_iter.h +254 -0
  801. cuda/cccl/headers/include/cuda/std/__iterator/concepts.h +645 -0
  802. cuda/cccl/headers/include/cuda/std/__iterator/counted_iterator.h +464 -0
  803. cuda/cccl/headers/include/cuda/std/__iterator/data.h +61 -0
  804. cuda/cccl/headers/include/cuda/std/__iterator/default_sentinel.h +36 -0
  805. cuda/cccl/headers/include/cuda/std/__iterator/distance.h +126 -0
  806. cuda/cccl/headers/include/cuda/std/__iterator/empty.h +53 -0
  807. cuda/cccl/headers/include/cuda/std/__iterator/erase_if_container.h +53 -0
  808. cuda/cccl/headers/include/cuda/std/__iterator/front_insert_iterator.h +99 -0
  809. cuda/cccl/headers/include/cuda/std/__iterator/incrementable_traits.h +150 -0
  810. cuda/cccl/headers/include/cuda/std/__iterator/indirectly_comparable.h +55 -0
  811. cuda/cccl/headers/include/cuda/std/__iterator/insert_iterator.h +107 -0
  812. cuda/cccl/headers/include/cuda/std/__iterator/istream_iterator.h +146 -0
  813. cuda/cccl/headers/include/cuda/std/__iterator/istreambuf_iterator.h +161 -0
  814. cuda/cccl/headers/include/cuda/std/__iterator/iter_move.h +161 -0
  815. cuda/cccl/headers/include/cuda/std/__iterator/iter_swap.h +163 -0
  816. cuda/cccl/headers/include/cuda/std/__iterator/iterator.h +44 -0
  817. cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +932 -0
  818. cuda/cccl/headers/include/cuda/std/__iterator/mergeable.h +72 -0
  819. cuda/cccl/headers/include/cuda/std/__iterator/move_iterator.h +433 -0
  820. cuda/cccl/headers/include/cuda/std/__iterator/move_sentinel.h +73 -0
  821. cuda/cccl/headers/include/cuda/std/__iterator/next.h +101 -0
  822. cuda/cccl/headers/include/cuda/std/__iterator/ostream_iterator.h +95 -0
  823. cuda/cccl/headers/include/cuda/std/__iterator/ostreambuf_iterator.h +100 -0
  824. cuda/cccl/headers/include/cuda/std/__iterator/permutable.h +54 -0
  825. cuda/cccl/headers/include/cuda/std/__iterator/prev.h +91 -0
  826. cuda/cccl/headers/include/cuda/std/__iterator/projected.h +61 -0
  827. cuda/cccl/headers/include/cuda/std/__iterator/readable_traits.h +185 -0
  828. cuda/cccl/headers/include/cuda/std/__iterator/reverse_access.h +142 -0
  829. cuda/cccl/headers/include/cuda/std/__iterator/reverse_iterator.h +371 -0
  830. cuda/cccl/headers/include/cuda/std/__iterator/size.h +69 -0
  831. cuda/cccl/headers/include/cuda/std/__iterator/sortable.h +55 -0
  832. cuda/cccl/headers/include/cuda/std/__iterator/unreachable_sentinel.h +84 -0
  833. cuda/cccl/headers/include/cuda/std/__iterator/wrap_iter.h +247 -0
  834. cuda/cccl/headers/include/cuda/std/__latch/latch.h +88 -0
  835. cuda/cccl/headers/include/cuda/std/__limits/numeric_limits.h +617 -0
  836. cuda/cccl/headers/include/cuda/std/__limits/numeric_limits_ext.h +753 -0
  837. cuda/cccl/headers/include/cuda/std/__linalg/conj_if_needed.h +78 -0
  838. cuda/cccl/headers/include/cuda/std/__linalg/conjugate_transposed.h +54 -0
  839. cuda/cccl/headers/include/cuda/std/__linalg/conjugated.h +139 -0
  840. cuda/cccl/headers/include/cuda/std/__linalg/scaled.h +132 -0
  841. cuda/cccl/headers/include/cuda/std/__linalg/transposed.h +321 -0
  842. cuda/cccl/headers/include/cuda/std/__mdspan/aligned_accessor.h +97 -0
  843. cuda/cccl/headers/include/cuda/std/__mdspan/concepts.h +138 -0
  844. cuda/cccl/headers/include/cuda/std/__mdspan/default_accessor.h +73 -0
  845. cuda/cccl/headers/include/cuda/std/__mdspan/empty_base.h +352 -0
  846. cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +757 -0
  847. cuda/cccl/headers/include/cuda/std/__mdspan/layout_left.h +314 -0
  848. cuda/cccl/headers/include/cuda/std/__mdspan/layout_right.h +307 -0
  849. cuda/cccl/headers/include/cuda/std/__mdspan/layout_stride.h +605 -0
  850. cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +499 -0
  851. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_extents.h +193 -0
  852. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_helper.h +189 -0
  853. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_mapping.h +344 -0
  854. cuda/cccl/headers/include/cuda/std/__memory/addressof.h +64 -0
  855. cuda/cccl/headers/include/cuda/std/__memory/align.h +67 -0
  856. cuda/cccl/headers/include/cuda/std/__memory/allocate_at_least.h +81 -0
  857. cuda/cccl/headers/include/cuda/std/__memory/allocation_guard.h +100 -0
  858. cuda/cccl/headers/include/cuda/std/__memory/allocator.h +320 -0
  859. cuda/cccl/headers/include/cuda/std/__memory/allocator_arg_t.h +84 -0
  860. cuda/cccl/headers/include/cuda/std/__memory/allocator_destructor.h +59 -0
  861. cuda/cccl/headers/include/cuda/std/__memory/allocator_traits.h +552 -0
  862. cuda/cccl/headers/include/cuda/std/__memory/assume_aligned.h +60 -0
  863. cuda/cccl/headers/include/cuda/std/__memory/builtin_new_allocator.h +87 -0
  864. cuda/cccl/headers/include/cuda/std/__memory/compressed_pair.h +225 -0
  865. cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +248 -0
  866. cuda/cccl/headers/include/cuda/std/__memory/destruct_n.h +91 -0
  867. cuda/cccl/headers/include/cuda/std/__memory/is_sufficiently_aligned.h +43 -0
  868. cuda/cccl/headers/include/cuda/std/__memory/pointer_traits.h +261 -0
  869. cuda/cccl/headers/include/cuda/std/__memory/runtime_assume_aligned.h +62 -0
  870. cuda/cccl/headers/include/cuda/std/__memory/temporary_buffer.h +92 -0
  871. cuda/cccl/headers/include/cuda/std/__memory/uninitialized_algorithms.h +682 -0
  872. cuda/cccl/headers/include/cuda/std/__memory/unique_ptr.h +767 -0
  873. cuda/cccl/headers/include/cuda/std/__memory/uses_allocator.h +54 -0
  874. cuda/cccl/headers/include/cuda/std/__memory/voidify.h +41 -0
  875. cuda/cccl/headers/include/cuda/std/__memory_ +34 -0
  876. cuda/cccl/headers/include/cuda/std/__new/allocate.h +126 -0
  877. cuda/cccl/headers/include/cuda/std/__new/bad_alloc.h +57 -0
  878. cuda/cccl/headers/include/cuda/std/__new/launder.h +49 -0
  879. cuda/cccl/headers/include/cuda/std/__new_ +29 -0
  880. cuda/cccl/headers/include/cuda/std/__numeric/accumulate.h +56 -0
  881. cuda/cccl/headers/include/cuda/std/__numeric/adjacent_difference.h +72 -0
  882. cuda/cccl/headers/include/cuda/std/__numeric/exclusive_scan.h +66 -0
  883. cuda/cccl/headers/include/cuda/std/__numeric/gcd_lcm.h +78 -0
  884. cuda/cccl/headers/include/cuda/std/__numeric/inclusive_scan.h +73 -0
  885. cuda/cccl/headers/include/cuda/std/__numeric/inner_product.h +62 -0
  886. cuda/cccl/headers/include/cuda/std/__numeric/iota.h +42 -0
  887. cuda/cccl/headers/include/cuda/std/__numeric/midpoint.h +97 -0
  888. cuda/cccl/headers/include/cuda/std/__numeric/partial_sum.h +69 -0
  889. cuda/cccl/headers/include/cuda/std/__numeric/reduce.h +60 -0
  890. cuda/cccl/headers/include/cuda/std/__numeric/transform_exclusive_scan.h +51 -0
  891. cuda/cccl/headers/include/cuda/std/__numeric/transform_inclusive_scan.h +65 -0
  892. cuda/cccl/headers/include/cuda/std/__numeric/transform_reduce.h +72 -0
  893. cuda/cccl/headers/include/cuda/std/__optional/bad_optional_access.h +74 -0
  894. cuda/cccl/headers/include/cuda/std/__optional/hash.h +53 -0
  895. cuda/cccl/headers/include/cuda/std/__optional/make_optional.h +61 -0
  896. cuda/cccl/headers/include/cuda/std/__optional/nullopt.h +43 -0
  897. cuda/cccl/headers/include/cuda/std/__optional/optional.h +859 -0
  898. cuda/cccl/headers/include/cuda/std/__optional/optional_base.h +432 -0
  899. cuda/cccl/headers/include/cuda/std/__optional/optional_ref.h +324 -0
  900. cuda/cccl/headers/include/cuda/std/__random/generate_canonical.h +56 -0
  901. cuda/cccl/headers/include/cuda/std/__random/is_seed_sequence.h +39 -0
  902. cuda/cccl/headers/include/cuda/std/__random/is_valid.h +106 -0
  903. cuda/cccl/headers/include/cuda/std/__random/linear_congruential_engine.h +398 -0
  904. cuda/cccl/headers/include/cuda/std/__random/uniform_int_distribution.h +335 -0
  905. cuda/cccl/headers/include/cuda/std/__random/uniform_real_distribution.h +183 -0
  906. cuda/cccl/headers/include/cuda/std/__random_ +29 -0
  907. cuda/cccl/headers/include/cuda/std/__ranges/access.h +303 -0
  908. cuda/cccl/headers/include/cuda/std/__ranges/all.h +98 -0
  909. cuda/cccl/headers/include/cuda/std/__ranges/concepts.h +314 -0
  910. cuda/cccl/headers/include/cuda/std/__ranges/counted.h +90 -0
  911. cuda/cccl/headers/include/cuda/std/__ranges/dangling.h +54 -0
  912. cuda/cccl/headers/include/cuda/std/__ranges/data.h +136 -0
  913. cuda/cccl/headers/include/cuda/std/__ranges/empty.h +109 -0
  914. cuda/cccl/headers/include/cuda/std/__ranges/empty_view.h +77 -0
  915. cuda/cccl/headers/include/cuda/std/__ranges/enable_borrowed_range.h +41 -0
  916. cuda/cccl/headers/include/cuda/std/__ranges/enable_view.h +78 -0
  917. cuda/cccl/headers/include/cuda/std/__ranges/from_range.h +36 -0
  918. cuda/cccl/headers/include/cuda/std/__ranges/iota_view.h +266 -0
  919. cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +410 -0
  920. cuda/cccl/headers/include/cuda/std/__ranges/owning_view.h +161 -0
  921. cuda/cccl/headers/include/cuda/std/__ranges/range_adaptor.h +110 -0
  922. cuda/cccl/headers/include/cuda/std/__ranges/rbegin.h +175 -0
  923. cuda/cccl/headers/include/cuda/std/__ranges/ref_view.h +121 -0
  924. cuda/cccl/headers/include/cuda/std/__ranges/rend.h +182 -0
  925. cuda/cccl/headers/include/cuda/std/__ranges/repeat_view.h +345 -0
  926. cuda/cccl/headers/include/cuda/std/__ranges/single_view.h +155 -0
  927. cuda/cccl/headers/include/cuda/std/__ranges/size.h +201 -0
  928. cuda/cccl/headers/include/cuda/std/__ranges/subrange.h +513 -0
  929. cuda/cccl/headers/include/cuda/std/__ranges/take_view.h +476 -0
  930. cuda/cccl/headers/include/cuda/std/__ranges/take_while_view.h +259 -0
  931. cuda/cccl/headers/include/cuda/std/__ranges/transform_view.h +522 -0
  932. cuda/cccl/headers/include/cuda/std/__ranges/unwrap_end.h +53 -0
  933. cuda/cccl/headers/include/cuda/std/__ranges/view_interface.h +183 -0
  934. cuda/cccl/headers/include/cuda/std/__ranges/views.h +38 -0
  935. cuda/cccl/headers/include/cuda/std/__semaphore/atomic_semaphore.h +233 -0
  936. cuda/cccl/headers/include/cuda/std/__semaphore/counting_semaphore.h +51 -0
  937. cuda/cccl/headers/include/cuda/std/__string/char_traits.h +191 -0
  938. cuda/cccl/headers/include/cuda/std/__string/constexpr_c_functions.h +581 -0
  939. cuda/cccl/headers/include/cuda/std/__string/helper_functions.h +296 -0
  940. cuda/cccl/headers/include/cuda/std/__string/string_view.h +244 -0
  941. cuda/cccl/headers/include/cuda/std/__string_ +29 -0
  942. cuda/cccl/headers/include/cuda/std/__system_error/errc.h +51 -0
  943. cuda/cccl/headers/include/cuda/std/__system_error_ +26 -0
  944. cuda/cccl/headers/include/cuda/std/__thread/threading_support.h +105 -0
  945. cuda/cccl/headers/include/cuda/std/__thread/threading_support_cuda.h +47 -0
  946. cuda/cccl/headers/include/cuda/std/__thread/threading_support_external.h +41 -0
  947. cuda/cccl/headers/include/cuda/std/__thread/threading_support_pthread.h +143 -0
  948. cuda/cccl/headers/include/cuda/std/__thread/threading_support_win32.h +87 -0
  949. cuda/cccl/headers/include/cuda/std/__tuple_dir/ignore.h +51 -0
  950. cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +98 -0
  951. cuda/cccl/headers/include/cuda/std/__tuple_dir/sfinae_helpers.h +269 -0
  952. cuda/cccl/headers/include/cuda/std/__tuple_dir/structured_bindings.h +218 -0
  953. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_element.h +70 -0
  954. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_indices.h +44 -0
  955. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +90 -0
  956. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +73 -0
  957. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_size.h +79 -0
  958. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_types.h +35 -0
  959. cuda/cccl/headers/include/cuda/std/__tuple_dir/vector_types.h +291 -0
  960. cuda/cccl/headers/include/cuda/std/__type_traits/add_const.h +40 -0
  961. cuda/cccl/headers/include/cuda/std/__type_traits/add_cv.h +40 -0
  962. cuda/cccl/headers/include/cuda/std/__type_traits/add_lvalue_reference.h +62 -0
  963. cuda/cccl/headers/include/cuda/std/__type_traits/add_pointer.h +65 -0
  964. cuda/cccl/headers/include/cuda/std/__type_traits/add_rvalue_reference.h +62 -0
  965. cuda/cccl/headers/include/cuda/std/__type_traits/add_volatile.h +40 -0
  966. cuda/cccl/headers/include/cuda/std/__type_traits/aligned_storage.h +149 -0
  967. cuda/cccl/headers/include/cuda/std/__type_traits/aligned_union.h +62 -0
  968. cuda/cccl/headers/include/cuda/std/__type_traits/alignment_of.h +41 -0
  969. cuda/cccl/headers/include/cuda/std/__type_traits/always_false.h +35 -0
  970. cuda/cccl/headers/include/cuda/std/__type_traits/can_extract_key.h +69 -0
  971. cuda/cccl/headers/include/cuda/std/__type_traits/common_reference.h +262 -0
  972. cuda/cccl/headers/include/cuda/std/__type_traits/common_type.h +173 -0
  973. cuda/cccl/headers/include/cuda/std/__type_traits/conditional.h +65 -0
  974. cuda/cccl/headers/include/cuda/std/__type_traits/conjunction.h +67 -0
  975. cuda/cccl/headers/include/cuda/std/__type_traits/copy_cv.h +50 -0
  976. cuda/cccl/headers/include/cuda/std/__type_traits/copy_cvref.h +148 -0
  977. cuda/cccl/headers/include/cuda/std/__type_traits/decay.h +83 -0
  978. cuda/cccl/headers/include/cuda/std/__type_traits/dependent_type.h +35 -0
  979. cuda/cccl/headers/include/cuda/std/__type_traits/disjunction.h +77 -0
  980. cuda/cccl/headers/include/cuda/std/__type_traits/enable_if.h +43 -0
  981. cuda/cccl/headers/include/cuda/std/__type_traits/extent.h +68 -0
  982. cuda/cccl/headers/include/cuda/std/__type_traits/fold.h +47 -0
  983. cuda/cccl/headers/include/cuda/std/__type_traits/has_unique_object_representation.h +47 -0
  984. cuda/cccl/headers/include/cuda/std/__type_traits/has_virtual_destructor.h +51 -0
  985. cuda/cccl/headers/include/cuda/std/__type_traits/integral_constant.h +62 -0
  986. cuda/cccl/headers/include/cuda/std/__type_traits/is_abstract.h +40 -0
  987. cuda/cccl/headers/include/cuda/std/__type_traits/is_aggregate.h +44 -0
  988. cuda/cccl/headers/include/cuda/std/__type_traits/is_allocator.h +46 -0
  989. cuda/cccl/headers/include/cuda/std/__type_traits/is_arithmetic.h +42 -0
  990. cuda/cccl/headers/include/cuda/std/__type_traits/is_array.h +62 -0
  991. cuda/cccl/headers/include/cuda/std/__type_traits/is_assignable.h +78 -0
  992. cuda/cccl/headers/include/cuda/std/__type_traits/is_base_of.h +83 -0
  993. cuda/cccl/headers/include/cuda/std/__type_traits/is_bounded_array.h +44 -0
  994. cuda/cccl/headers/include/cuda/std/__type_traits/is_callable.h +60 -0
  995. cuda/cccl/headers/include/cuda/std/__type_traits/is_char_like_type.h +38 -0
  996. cuda/cccl/headers/include/cuda/std/__type_traits/is_class.h +68 -0
  997. cuda/cccl/headers/include/cuda/std/__type_traits/is_compound.h +54 -0
  998. cuda/cccl/headers/include/cuda/std/__type_traits/is_const.h +56 -0
  999. cuda/cccl/headers/include/cuda/std/__type_traits/is_constant_evaluated.h +51 -0
  1000. cuda/cccl/headers/include/cuda/std/__type_traits/is_constructible.h +174 -0
  1001. cuda/cccl/headers/include/cuda/std/__type_traits/is_convertible.h +214 -0
  1002. cuda/cccl/headers/include/cuda/std/__type_traits/is_copy_assignable.h +43 -0
  1003. cuda/cccl/headers/include/cuda/std/__type_traits/is_copy_constructible.h +43 -0
  1004. cuda/cccl/headers/include/cuda/std/__type_traits/is_core_convertible.h +47 -0
  1005. cuda/cccl/headers/include/cuda/std/__type_traits/is_corresponding_member.h +42 -0
  1006. cuda/cccl/headers/include/cuda/std/__type_traits/is_default_constructible.h +40 -0
  1007. cuda/cccl/headers/include/cuda/std/__type_traits/is_destructible.h +115 -0
  1008. cuda/cccl/headers/include/cuda/std/__type_traits/is_empty.h +73 -0
  1009. cuda/cccl/headers/include/cuda/std/__type_traits/is_enum.h +68 -0
  1010. cuda/cccl/headers/include/cuda/std/__type_traits/is_execution_policy.h +81 -0
  1011. cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_arithmetic.h +38 -0
  1012. cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_floating_point.h +81 -0
  1013. cuda/cccl/headers/include/cuda/std/__type_traits/is_final.h +56 -0
  1014. cuda/cccl/headers/include/cuda/std/__type_traits/is_floating_point.h +53 -0
  1015. cuda/cccl/headers/include/cuda/std/__type_traits/is_function.h +61 -0
  1016. cuda/cccl/headers/include/cuda/std/__type_traits/is_fundamental.h +56 -0
  1017. cuda/cccl/headers/include/cuda/std/__type_traits/is_implicitly_default_constructible.h +57 -0
  1018. cuda/cccl/headers/include/cuda/std/__type_traits/is_integer.h +45 -0
  1019. cuda/cccl/headers/include/cuda/std/__type_traits/is_integral.h +123 -0
  1020. cuda/cccl/headers/include/cuda/std/__type_traits/is_layout_compatible.h +45 -0
  1021. cuda/cccl/headers/include/cuda/std/__type_traits/is_literal_type.h +59 -0
  1022. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_function_pointer.h +79 -0
  1023. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_object_pointer.h +57 -0
  1024. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_pointer.h +57 -0
  1025. cuda/cccl/headers/include/cuda/std/__type_traits/is_move_assignable.h +43 -0
  1026. cuda/cccl/headers/include/cuda/std/__type_traits/is_move_constructible.h +42 -0
  1027. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_assignable.h +70 -0
  1028. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_constructible.h +84 -0
  1029. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_convertible.h +59 -0
  1030. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_copy_assignable.h +60 -0
  1031. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_copy_constructible.h +43 -0
  1032. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_default_constructible.h +54 -0
  1033. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_destructible.h +82 -0
  1034. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_move_assignable.h +60 -0
  1035. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_move_constructible.h +42 -0
  1036. cuda/cccl/headers/include/cuda/std/__type_traits/is_null_pointer.h +43 -0
  1037. cuda/cccl/headers/include/cuda/std/__type_traits/is_object.h +57 -0
  1038. cuda/cccl/headers/include/cuda/std/__type_traits/is_one_of.h +37 -0
  1039. cuda/cccl/headers/include/cuda/std/__type_traits/is_pod.h +62 -0
  1040. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer.h +60 -0
  1041. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer_interconvertible_base_of.h +84 -0
  1042. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer_interconvertible_with_class.h +42 -0
  1043. cuda/cccl/headers/include/cuda/std/__type_traits/is_polymorphic.h +63 -0
  1044. cuda/cccl/headers/include/cuda/std/__type_traits/is_primary_template.h +119 -0
  1045. cuda/cccl/headers/include/cuda/std/__type_traits/is_reference.h +95 -0
  1046. cuda/cccl/headers/include/cuda/std/__type_traits/is_reference_wrapper.h +50 -0
  1047. cuda/cccl/headers/include/cuda/std/__type_traits/is_referenceable.h +55 -0
  1048. cuda/cccl/headers/include/cuda/std/__type_traits/is_same.h +84 -0
  1049. cuda/cccl/headers/include/cuda/std/__type_traits/is_scalar.h +60 -0
  1050. cuda/cccl/headers/include/cuda/std/__type_traits/is_scoped_enum.h +49 -0
  1051. cuda/cccl/headers/include/cuda/std/__type_traits/is_signed.h +65 -0
  1052. cuda/cccl/headers/include/cuda/std/__type_traits/is_signed_integer.h +59 -0
  1053. cuda/cccl/headers/include/cuda/std/__type_traits/is_standard_layout.h +57 -0
  1054. cuda/cccl/headers/include/cuda/std/__type_traits/is_swappable.h +201 -0
  1055. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivial.h +56 -0
  1056. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_assignable.h +70 -0
  1057. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_constructible.h +82 -0
  1058. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_assignable.h +60 -0
  1059. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_constructible.h +61 -0
  1060. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copyable.h +56 -0
  1061. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_default_constructible.h +55 -0
  1062. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_destructible.h +73 -0
  1063. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_assignable.h +60 -0
  1064. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_constructible.h +58 -0
  1065. cuda/cccl/headers/include/cuda/std/__type_traits/is_unbounded_array.h +43 -0
  1066. cuda/cccl/headers/include/cuda/std/__type_traits/is_union.h +57 -0
  1067. cuda/cccl/headers/include/cuda/std/__type_traits/is_unsigned.h +66 -0
  1068. cuda/cccl/headers/include/cuda/std/__type_traits/is_unsigned_integer.h +59 -0
  1069. cuda/cccl/headers/include/cuda/std/__type_traits/is_valid_expansion.h +41 -0
  1070. cuda/cccl/headers/include/cuda/std/__type_traits/is_void.h +55 -0
  1071. cuda/cccl/headers/include/cuda/std/__type_traits/is_volatile.h +56 -0
  1072. cuda/cccl/headers/include/cuda/std/__type_traits/lazy.h +35 -0
  1073. cuda/cccl/headers/include/cuda/std/__type_traits/make_const_lvalue_ref.h +36 -0
  1074. cuda/cccl/headers/include/cuda/std/__type_traits/make_nbit_int.h +107 -0
  1075. cuda/cccl/headers/include/cuda/std/__type_traits/make_signed.h +140 -0
  1076. cuda/cccl/headers/include/cuda/std/__type_traits/make_unsigned.h +151 -0
  1077. cuda/cccl/headers/include/cuda/std/__type_traits/maybe_const.h +36 -0
  1078. cuda/cccl/headers/include/cuda/std/__type_traits/nat.h +39 -0
  1079. cuda/cccl/headers/include/cuda/std/__type_traits/negation.h +44 -0
  1080. cuda/cccl/headers/include/cuda/std/__type_traits/num_bits.h +123 -0
  1081. cuda/cccl/headers/include/cuda/std/__type_traits/promote.h +163 -0
  1082. cuda/cccl/headers/include/cuda/std/__type_traits/rank.h +60 -0
  1083. cuda/cccl/headers/include/cuda/std/__type_traits/reference_constructs_from_temporary.h +57 -0
  1084. cuda/cccl/headers/include/cuda/std/__type_traits/reference_converts_from_temporary.h +56 -0
  1085. cuda/cccl/headers/include/cuda/std/__type_traits/remove_all_extents.h +66 -0
  1086. cuda/cccl/headers/include/cuda/std/__type_traits/remove_const.h +59 -0
  1087. cuda/cccl/headers/include/cuda/std/__type_traits/remove_const_ref.h +37 -0
  1088. cuda/cccl/headers/include/cuda/std/__type_traits/remove_cv.h +57 -0
  1089. cuda/cccl/headers/include/cuda/std/__type_traits/remove_cvref.h +57 -0
  1090. cuda/cccl/headers/include/cuda/std/__type_traits/remove_extent.h +65 -0
  1091. cuda/cccl/headers/include/cuda/std/__type_traits/remove_pointer.h +73 -0
  1092. cuda/cccl/headers/include/cuda/std/__type_traits/remove_reference.h +72 -0
  1093. cuda/cccl/headers/include/cuda/std/__type_traits/remove_volatile.h +58 -0
  1094. cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +47 -0
  1095. cuda/cccl/headers/include/cuda/std/__type_traits/type_identity.h +40 -0
  1096. cuda/cccl/headers/include/cuda/std/__type_traits/type_list.h +1067 -0
  1097. cuda/cccl/headers/include/cuda/std/__type_traits/type_set.h +131 -0
  1098. cuda/cccl/headers/include/cuda/std/__type_traits/underlying_type.h +66 -0
  1099. cuda/cccl/headers/include/cuda/std/__type_traits/void_t.h +34 -0
  1100. cuda/cccl/headers/include/cuda/std/__utility/as_const.h +52 -0
  1101. cuda/cccl/headers/include/cuda/std/__utility/auto_cast.h +34 -0
  1102. cuda/cccl/headers/include/cuda/std/__utility/cmp.h +116 -0
  1103. cuda/cccl/headers/include/cuda/std/__utility/convert_to_integral.h +101 -0
  1104. cuda/cccl/headers/include/cuda/std/__utility/declval.h +63 -0
  1105. cuda/cccl/headers/include/cuda/std/__utility/exception_guard.h +161 -0
  1106. cuda/cccl/headers/include/cuda/std/__utility/exchange.h +46 -0
  1107. cuda/cccl/headers/include/cuda/std/__utility/forward.h +59 -0
  1108. cuda/cccl/headers/include/cuda/std/__utility/forward_like.h +55 -0
  1109. cuda/cccl/headers/include/cuda/std/__utility/in_place.h +77 -0
  1110. cuda/cccl/headers/include/cuda/std/__utility/integer_sequence.h +251 -0
  1111. cuda/cccl/headers/include/cuda/std/__utility/monostate.h +99 -0
  1112. cuda/cccl/headers/include/cuda/std/__utility/move.h +74 -0
  1113. cuda/cccl/headers/include/cuda/std/__utility/pair.h +797 -0
  1114. cuda/cccl/headers/include/cuda/std/__utility/piecewise_construct.h +37 -0
  1115. cuda/cccl/headers/include/cuda/std/__utility/pod_tuple.h +527 -0
  1116. cuda/cccl/headers/include/cuda/std/__utility/priority_tag.h +40 -0
  1117. cuda/cccl/headers/include/cuda/std/__utility/rel_ops.h +63 -0
  1118. cuda/cccl/headers/include/cuda/std/__utility/swap.h +64 -0
  1119. cuda/cccl/headers/include/cuda/std/__utility/to_underlying.h +40 -0
  1120. cuda/cccl/headers/include/cuda/std/__utility/typeid.h +421 -0
  1121. cuda/cccl/headers/include/cuda/std/__utility/undefined.h +34 -0
  1122. cuda/cccl/headers/include/cuda/std/__utility/unreachable.h +37 -0
  1123. cuda/cccl/headers/include/cuda/std/array +518 -0
  1124. cuda/cccl/headers/include/cuda/std/atomic +818 -0
  1125. cuda/cccl/headers/include/cuda/std/barrier +42 -0
  1126. cuda/cccl/headers/include/cuda/std/bit +35 -0
  1127. cuda/cccl/headers/include/cuda/std/bitset +994 -0
  1128. cuda/cccl/headers/include/cuda/std/cassert +28 -0
  1129. cuda/cccl/headers/include/cuda/std/ccomplex +15 -0
  1130. cuda/cccl/headers/include/cuda/std/cfloat +59 -0
  1131. cuda/cccl/headers/include/cuda/std/chrono +26 -0
  1132. cuda/cccl/headers/include/cuda/std/climits +61 -0
  1133. cuda/cccl/headers/include/cuda/std/cmath +25 -0
  1134. cuda/cccl/headers/include/cuda/std/complex +50 -0
  1135. cuda/cccl/headers/include/cuda/std/concepts +48 -0
  1136. cuda/cccl/headers/include/cuda/std/cstddef +28 -0
  1137. cuda/cccl/headers/include/cuda/std/cstdint +178 -0
  1138. cuda/cccl/headers/include/cuda/std/cstdlib +30 -0
  1139. cuda/cccl/headers/include/cuda/std/cstring +110 -0
  1140. cuda/cccl/headers/include/cuda/std/ctime +154 -0
  1141. cuda/cccl/headers/include/cuda/std/detail/__config +45 -0
  1142. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +235 -0
  1143. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/algorithm +1722 -0
  1144. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/chrono +3630 -0
  1145. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/cmath +520 -0
  1146. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/iosfwd +128 -0
  1147. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/stdexcept +120 -0
  1148. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/tuple +1365 -0
  1149. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/variant +2142 -0
  1150. cuda/cccl/headers/include/cuda/std/execution +29 -0
  1151. cuda/cccl/headers/include/cuda/std/expected +30 -0
  1152. cuda/cccl/headers/include/cuda/std/functional +56 -0
  1153. cuda/cccl/headers/include/cuda/std/initializer_list +36 -0
  1154. cuda/cccl/headers/include/cuda/std/inplace_vector +2170 -0
  1155. cuda/cccl/headers/include/cuda/std/iterator +70 -0
  1156. cuda/cccl/headers/include/cuda/std/latch +34 -0
  1157. cuda/cccl/headers/include/cuda/std/limits +28 -0
  1158. cuda/cccl/headers/include/cuda/std/linalg +30 -0
  1159. cuda/cccl/headers/include/cuda/std/mdspan +38 -0
  1160. cuda/cccl/headers/include/cuda/std/memory +39 -0
  1161. cuda/cccl/headers/include/cuda/std/numbers +342 -0
  1162. cuda/cccl/headers/include/cuda/std/numeric +41 -0
  1163. cuda/cccl/headers/include/cuda/std/optional +31 -0
  1164. cuda/cccl/headers/include/cuda/std/ranges +69 -0
  1165. cuda/cccl/headers/include/cuda/std/ratio +417 -0
  1166. cuda/cccl/headers/include/cuda/std/semaphore +31 -0
  1167. cuda/cccl/headers/include/cuda/std/source_location +83 -0
  1168. cuda/cccl/headers/include/cuda/std/span +628 -0
  1169. cuda/cccl/headers/include/cuda/std/string_view +799 -0
  1170. cuda/cccl/headers/include/cuda/std/tuple +26 -0
  1171. cuda/cccl/headers/include/cuda/std/type_traits +177 -0
  1172. cuda/cccl/headers/include/cuda/std/utility +70 -0
  1173. cuda/cccl/headers/include/cuda/std/variant +25 -0
  1174. cuda/cccl/headers/include/cuda/std/version +245 -0
  1175. cuda/cccl/headers/include/cuda/stream +31 -0
  1176. cuda/cccl/headers/include/cuda/stream_ref +54 -0
  1177. cuda/cccl/headers/include/cuda/type_traits +27 -0
  1178. cuda/cccl/headers/include/cuda/utility +27 -0
  1179. cuda/cccl/headers/include/cuda/version +16 -0
  1180. cuda/cccl/headers/include/cuda/warp +28 -0
  1181. cuda/cccl/headers/include/cuda/work_stealing +26 -0
  1182. cuda/cccl/headers/include/nv/detail/__preprocessor +169 -0
  1183. cuda/cccl/headers/include/nv/detail/__target_macros +718 -0
  1184. cuda/cccl/headers/include/nv/target +235 -0
  1185. cuda/cccl/headers/include/thrust/addressof.h +22 -0
  1186. cuda/cccl/headers/include/thrust/adjacent_difference.h +254 -0
  1187. cuda/cccl/headers/include/thrust/advance.h +59 -0
  1188. cuda/cccl/headers/include/thrust/allocate_unique.h +299 -0
  1189. cuda/cccl/headers/include/thrust/binary_search.h +1910 -0
  1190. cuda/cccl/headers/include/thrust/complex.h +858 -0
  1191. cuda/cccl/headers/include/thrust/copy.h +506 -0
  1192. cuda/cccl/headers/include/thrust/count.h +245 -0
  1193. cuda/cccl/headers/include/thrust/detail/adjacent_difference.inl +95 -0
  1194. cuda/cccl/headers/include/thrust/detail/algorithm_wrapper.h +37 -0
  1195. cuda/cccl/headers/include/thrust/detail/alignment.h +81 -0
  1196. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +350 -0
  1197. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.inl +371 -0
  1198. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +45 -0
  1199. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.inl +242 -0
  1200. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +39 -0
  1201. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.inl +137 -0
  1202. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +39 -0
  1203. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.inl +99 -0
  1204. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +53 -0
  1205. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.inl +68 -0
  1206. cuda/cccl/headers/include/thrust/detail/allocator/no_throw_allocator.h +76 -0
  1207. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +102 -0
  1208. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.inl +86 -0
  1209. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +79 -0
  1210. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.inl +79 -0
  1211. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.h +39 -0
  1212. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.inl +98 -0
  1213. cuda/cccl/headers/include/thrust/detail/allocator_aware_execution_policy.h +99 -0
  1214. cuda/cccl/headers/include/thrust/detail/binary_search.inl +525 -0
  1215. cuda/cccl/headers/include/thrust/detail/caching_allocator.h +47 -0
  1216. cuda/cccl/headers/include/thrust/detail/complex/arithmetic.h +255 -0
  1217. cuda/cccl/headers/include/thrust/detail/complex/c99math.h +64 -0
  1218. cuda/cccl/headers/include/thrust/detail/complex/catrig.h +875 -0
  1219. cuda/cccl/headers/include/thrust/detail/complex/catrigf.h +589 -0
  1220. cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +233 -0
  1221. cuda/cccl/headers/include/thrust/detail/complex/ccoshf.h +161 -0
  1222. cuda/cccl/headers/include/thrust/detail/complex/cexp.h +195 -0
  1223. cuda/cccl/headers/include/thrust/detail/complex/cexpf.h +173 -0
  1224. cuda/cccl/headers/include/thrust/detail/complex/clog.h +223 -0
  1225. cuda/cccl/headers/include/thrust/detail/complex/clogf.h +210 -0
  1226. cuda/cccl/headers/include/thrust/detail/complex/complex.inl +263 -0
  1227. cuda/cccl/headers/include/thrust/detail/complex/cpow.h +50 -0
  1228. cuda/cccl/headers/include/thrust/detail/complex/cproj.h +81 -0
  1229. cuda/cccl/headers/include/thrust/detail/complex/csinh.h +228 -0
  1230. cuda/cccl/headers/include/thrust/detail/complex/csinhf.h +168 -0
  1231. cuda/cccl/headers/include/thrust/detail/complex/csqrt.h +178 -0
  1232. cuda/cccl/headers/include/thrust/detail/complex/csqrtf.h +174 -0
  1233. cuda/cccl/headers/include/thrust/detail/complex/ctanh.h +208 -0
  1234. cuda/cccl/headers/include/thrust/detail/complex/ctanhf.h +133 -0
  1235. cuda/cccl/headers/include/thrust/detail/complex/math_private.h +138 -0
  1236. cuda/cccl/headers/include/thrust/detail/complex/stream.h +73 -0
  1237. cuda/cccl/headers/include/thrust/detail/config/compiler.h +38 -0
  1238. cuda/cccl/headers/include/thrust/detail/config/config.h +43 -0
  1239. cuda/cccl/headers/include/thrust/detail/config/cpp_dialect.h +78 -0
  1240. cuda/cccl/headers/include/thrust/detail/config/device_system.h +55 -0
  1241. cuda/cccl/headers/include/thrust/detail/config/host_system.h +48 -0
  1242. cuda/cccl/headers/include/thrust/detail/config/memory_resource.h +41 -0
  1243. cuda/cccl/headers/include/thrust/detail/config/namespace.h +162 -0
  1244. cuda/cccl/headers/include/thrust/detail/config/simple_defines.h +48 -0
  1245. cuda/cccl/headers/include/thrust/detail/config.h +36 -0
  1246. cuda/cccl/headers/include/thrust/detail/contiguous_storage.h +228 -0
  1247. cuda/cccl/headers/include/thrust/detail/contiguous_storage.inl +273 -0
  1248. cuda/cccl/headers/include/thrust/detail/copy.h +72 -0
  1249. cuda/cccl/headers/include/thrust/detail/copy.inl +129 -0
  1250. cuda/cccl/headers/include/thrust/detail/copy_if.h +62 -0
  1251. cuda/cccl/headers/include/thrust/detail/copy_if.inl +102 -0
  1252. cuda/cccl/headers/include/thrust/detail/count.h +55 -0
  1253. cuda/cccl/headers/include/thrust/detail/count.inl +89 -0
  1254. cuda/cccl/headers/include/thrust/detail/device_delete.inl +52 -0
  1255. cuda/cccl/headers/include/thrust/detail/device_free.inl +47 -0
  1256. cuda/cccl/headers/include/thrust/detail/device_new.inl +61 -0
  1257. cuda/cccl/headers/include/thrust/detail/device_ptr.inl +48 -0
  1258. cuda/cccl/headers/include/thrust/detail/equal.inl +93 -0
  1259. cuda/cccl/headers/include/thrust/detail/event_error.h +160 -0
  1260. cuda/cccl/headers/include/thrust/detail/execute_with_allocator.h +80 -0
  1261. cuda/cccl/headers/include/thrust/detail/execute_with_allocator_fwd.h +61 -0
  1262. cuda/cccl/headers/include/thrust/detail/execution_policy.h +80 -0
  1263. cuda/cccl/headers/include/thrust/detail/extrema.inl +184 -0
  1264. cuda/cccl/headers/include/thrust/detail/fill.inl +86 -0
  1265. cuda/cccl/headers/include/thrust/detail/find.inl +113 -0
  1266. cuda/cccl/headers/include/thrust/detail/for_each.inl +84 -0
  1267. cuda/cccl/headers/include/thrust/detail/function.h +49 -0
  1268. cuda/cccl/headers/include/thrust/detail/functional/actor.h +214 -0
  1269. cuda/cccl/headers/include/thrust/detail/functional/operators.h +386 -0
  1270. cuda/cccl/headers/include/thrust/detail/gather.inl +173 -0
  1271. cuda/cccl/headers/include/thrust/detail/generate.inl +86 -0
  1272. cuda/cccl/headers/include/thrust/detail/get_iterator_value.h +62 -0
  1273. cuda/cccl/headers/include/thrust/detail/inner_product.inl +118 -0
  1274. cuda/cccl/headers/include/thrust/detail/integer_math.h +130 -0
  1275. cuda/cccl/headers/include/thrust/detail/internal_functional.h +293 -0
  1276. cuda/cccl/headers/include/thrust/detail/logical.inl +113 -0
  1277. cuda/cccl/headers/include/thrust/detail/malloc_and_free.h +77 -0
  1278. cuda/cccl/headers/include/thrust/detail/malloc_and_free_fwd.h +45 -0
  1279. cuda/cccl/headers/include/thrust/detail/memory_algorithms.h +209 -0
  1280. cuda/cccl/headers/include/thrust/detail/memory_wrapper.h +40 -0
  1281. cuda/cccl/headers/include/thrust/detail/merge.inl +276 -0
  1282. cuda/cccl/headers/include/thrust/detail/mismatch.inl +94 -0
  1283. cuda/cccl/headers/include/thrust/detail/mpl/math.h +164 -0
  1284. cuda/cccl/headers/include/thrust/detail/numeric_wrapper.h +37 -0
  1285. cuda/cccl/headers/include/thrust/detail/overlapped_copy.h +124 -0
  1286. cuda/cccl/headers/include/thrust/detail/partition.inl +378 -0
  1287. cuda/cccl/headers/include/thrust/detail/pointer.h +309 -0
  1288. cuda/cccl/headers/include/thrust/detail/preprocessor.h +652 -0
  1289. cuda/cccl/headers/include/thrust/detail/random_bijection.h +177 -0
  1290. cuda/cccl/headers/include/thrust/detail/range/head_flags.h +116 -0
  1291. cuda/cccl/headers/include/thrust/detail/range/tail_flags.h +130 -0
  1292. cuda/cccl/headers/include/thrust/detail/raw_pointer_cast.h +52 -0
  1293. cuda/cccl/headers/include/thrust/detail/raw_reference_cast.h +192 -0
  1294. cuda/cccl/headers/include/thrust/detail/reduce.inl +377 -0
  1295. cuda/cccl/headers/include/thrust/detail/reference.h +500 -0
  1296. cuda/cccl/headers/include/thrust/detail/reference_forward_declaration.h +35 -0
  1297. cuda/cccl/headers/include/thrust/detail/remove.inl +213 -0
  1298. cuda/cccl/headers/include/thrust/detail/replace.inl +231 -0
  1299. cuda/cccl/headers/include/thrust/detail/reverse.inl +88 -0
  1300. cuda/cccl/headers/include/thrust/detail/scan.inl +518 -0
  1301. cuda/cccl/headers/include/thrust/detail/scatter.inl +157 -0
  1302. cuda/cccl/headers/include/thrust/detail/seq.h +54 -0
  1303. cuda/cccl/headers/include/thrust/detail/sequence.inl +109 -0
  1304. cuda/cccl/headers/include/thrust/detail/set_operations.inl +981 -0
  1305. cuda/cccl/headers/include/thrust/detail/shuffle.inl +86 -0
  1306. cuda/cccl/headers/include/thrust/detail/sort.inl +373 -0
  1307. cuda/cccl/headers/include/thrust/detail/static_assert.h +58 -0
  1308. cuda/cccl/headers/include/thrust/detail/static_map.h +167 -0
  1309. cuda/cccl/headers/include/thrust/detail/swap_ranges.inl +65 -0
  1310. cuda/cccl/headers/include/thrust/detail/tabulate.inl +62 -0
  1311. cuda/cccl/headers/include/thrust/detail/temporary_array.h +153 -0
  1312. cuda/cccl/headers/include/thrust/detail/temporary_array.inl +120 -0
  1313. cuda/cccl/headers/include/thrust/detail/temporary_buffer.h +81 -0
  1314. cuda/cccl/headers/include/thrust/detail/transform_reduce.inl +69 -0
  1315. cuda/cccl/headers/include/thrust/detail/transform_scan.inl +161 -0
  1316. cuda/cccl/headers/include/thrust/detail/trivial_sequence.h +130 -0
  1317. cuda/cccl/headers/include/thrust/detail/tuple_meta_transform.h +61 -0
  1318. cuda/cccl/headers/include/thrust/detail/type_deduction.h +62 -0
  1319. cuda/cccl/headers/include/thrust/detail/type_traits/has_member_function.h +47 -0
  1320. cuda/cccl/headers/include/thrust/detail/type_traits/has_nested_type.h +43 -0
  1321. cuda/cccl/headers/include/thrust/detail/type_traits/is_call_possible.h +167 -0
  1322. cuda/cccl/headers/include/thrust/detail/type_traits/is_commutative.h +69 -0
  1323. cuda/cccl/headers/include/thrust/detail/type_traits/is_metafunction_defined.h +39 -0
  1324. cuda/cccl/headers/include/thrust/detail/type_traits/is_thrust_pointer.h +59 -0
  1325. cuda/cccl/headers/include/thrust/detail/type_traits/iterator/is_discard_iterator.h +44 -0
  1326. cuda/cccl/headers/include/thrust/detail/type_traits/iterator/is_output_iterator.h +46 -0
  1327. cuda/cccl/headers/include/thrust/detail/type_traits/minimum_type.h +89 -0
  1328. cuda/cccl/headers/include/thrust/detail/type_traits/pointer_traits.h +332 -0
  1329. cuda/cccl/headers/include/thrust/detail/type_traits.h +136 -0
  1330. cuda/cccl/headers/include/thrust/detail/uninitialized_copy.inl +90 -0
  1331. cuda/cccl/headers/include/thrust/detail/uninitialized_fill.inl +86 -0
  1332. cuda/cccl/headers/include/thrust/detail/unique.inl +373 -0
  1333. cuda/cccl/headers/include/thrust/detail/use_default.h +34 -0
  1334. cuda/cccl/headers/include/thrust/detail/util/align.h +59 -0
  1335. cuda/cccl/headers/include/thrust/detail/vector_base.h +615 -0
  1336. cuda/cccl/headers/include/thrust/detail/vector_base.inl +1212 -0
  1337. cuda/cccl/headers/include/thrust/device_allocator.h +134 -0
  1338. cuda/cccl/headers/include/thrust/device_delete.h +59 -0
  1339. cuda/cccl/headers/include/thrust/device_free.h +72 -0
  1340. cuda/cccl/headers/include/thrust/device_make_unique.h +56 -0
  1341. cuda/cccl/headers/include/thrust/device_malloc.h +84 -0
  1342. cuda/cccl/headers/include/thrust/device_malloc_allocator.h +190 -0
  1343. cuda/cccl/headers/include/thrust/device_new.h +91 -0
  1344. cuda/cccl/headers/include/thrust/device_new_allocator.h +179 -0
  1345. cuda/cccl/headers/include/thrust/device_ptr.h +196 -0
  1346. cuda/cccl/headers/include/thrust/device_reference.h +983 -0
  1347. cuda/cccl/headers/include/thrust/device_vector.h +576 -0
  1348. cuda/cccl/headers/include/thrust/distance.h +43 -0
  1349. cuda/cccl/headers/include/thrust/equal.h +247 -0
  1350. cuda/cccl/headers/include/thrust/execution_policy.h +384 -0
  1351. cuda/cccl/headers/include/thrust/extrema.h +657 -0
  1352. cuda/cccl/headers/include/thrust/fill.h +200 -0
  1353. cuda/cccl/headers/include/thrust/find.h +382 -0
  1354. cuda/cccl/headers/include/thrust/for_each.h +261 -0
  1355. cuda/cccl/headers/include/thrust/functional.h +396 -0
  1356. cuda/cccl/headers/include/thrust/gather.h +464 -0
  1357. cuda/cccl/headers/include/thrust/generate.h +193 -0
  1358. cuda/cccl/headers/include/thrust/host_vector.h +576 -0
  1359. cuda/cccl/headers/include/thrust/inner_product.h +264 -0
  1360. cuda/cccl/headers/include/thrust/iterator/constant_iterator.h +221 -0
  1361. cuda/cccl/headers/include/thrust/iterator/counting_iterator.h +335 -0
  1362. cuda/cccl/headers/include/thrust/iterator/detail/any_assign.h +48 -0
  1363. cuda/cccl/headers/include/thrust/iterator/detail/any_system_tag.h +43 -0
  1364. cuda/cccl/headers/include/thrust/iterator/detail/device_system_tag.h +38 -0
  1365. cuda/cccl/headers/include/thrust/iterator/detail/host_system_tag.h +38 -0
  1366. cuda/cccl/headers/include/thrust/iterator/detail/iterator_adaptor_base.h +81 -0
  1367. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_system.h +51 -0
  1368. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_traversal.h +62 -0
  1369. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h +57 -0
  1370. cuda/cccl/headers/include/thrust/iterator/detail/iterator_facade_category.h +199 -0
  1371. cuda/cccl/headers/include/thrust/iterator/detail/iterator_traversal_tags.h +50 -0
  1372. cuda/cccl/headers/include/thrust/iterator/detail/minimum_system.h +53 -0
  1373. cuda/cccl/headers/include/thrust/iterator/detail/normal_iterator.h +69 -0
  1374. cuda/cccl/headers/include/thrust/iterator/detail/retag.h +104 -0
  1375. cuda/cccl/headers/include/thrust/iterator/detail/tagged_iterator.h +81 -0
  1376. cuda/cccl/headers/include/thrust/iterator/detail/tuple_of_iterator_references.h +174 -0
  1377. cuda/cccl/headers/include/thrust/iterator/discard_iterator.h +163 -0
  1378. cuda/cccl/headers/include/thrust/iterator/iterator_adaptor.h +251 -0
  1379. cuda/cccl/headers/include/thrust/iterator/iterator_categories.h +215 -0
  1380. cuda/cccl/headers/include/thrust/iterator/iterator_facade.h +660 -0
  1381. cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +311 -0
  1382. cuda/cccl/headers/include/thrust/iterator/offset_iterator.h +194 -0
  1383. cuda/cccl/headers/include/thrust/iterator/permutation_iterator.h +204 -0
  1384. cuda/cccl/headers/include/thrust/iterator/retag.h +72 -0
  1385. cuda/cccl/headers/include/thrust/iterator/reverse_iterator.h +51 -0
  1386. cuda/cccl/headers/include/thrust/iterator/shuffle_iterator.h +185 -0
  1387. cuda/cccl/headers/include/thrust/iterator/strided_iterator.h +152 -0
  1388. cuda/cccl/headers/include/thrust/iterator/tabulate_output_iterator.h +149 -0
  1389. cuda/cccl/headers/include/thrust/iterator/transform_input_output_iterator.h +226 -0
  1390. cuda/cccl/headers/include/thrust/iterator/transform_iterator.h +351 -0
  1391. cuda/cccl/headers/include/thrust/iterator/transform_output_iterator.h +190 -0
  1392. cuda/cccl/headers/include/thrust/iterator/zip_iterator.h +359 -0
  1393. cuda/cccl/headers/include/thrust/logical.h +290 -0
  1394. cuda/cccl/headers/include/thrust/memory.h +299 -0
  1395. cuda/cccl/headers/include/thrust/merge.h +725 -0
  1396. cuda/cccl/headers/include/thrust/mismatch.h +261 -0
  1397. cuda/cccl/headers/include/thrust/mr/allocator.h +229 -0
  1398. cuda/cccl/headers/include/thrust/mr/device_memory_resource.h +41 -0
  1399. cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +526 -0
  1400. cuda/cccl/headers/include/thrust/mr/disjoint_sync_pool.h +118 -0
  1401. cuda/cccl/headers/include/thrust/mr/disjoint_tls_pool.h +67 -0
  1402. cuda/cccl/headers/include/thrust/mr/fancy_pointer_resource.h +67 -0
  1403. cuda/cccl/headers/include/thrust/mr/host_memory_resource.h +38 -0
  1404. cuda/cccl/headers/include/thrust/mr/memory_resource.h +217 -0
  1405. cuda/cccl/headers/include/thrust/mr/new.h +100 -0
  1406. cuda/cccl/headers/include/thrust/mr/polymorphic_adaptor.h +63 -0
  1407. cuda/cccl/headers/include/thrust/mr/pool.h +526 -0
  1408. cuda/cccl/headers/include/thrust/mr/pool_options.h +174 -0
  1409. cuda/cccl/headers/include/thrust/mr/sync_pool.h +114 -0
  1410. cuda/cccl/headers/include/thrust/mr/tls_pool.h +64 -0
  1411. cuda/cccl/headers/include/thrust/mr/universal_memory_resource.h +29 -0
  1412. cuda/cccl/headers/include/thrust/mr/validator.h +56 -0
  1413. cuda/cccl/headers/include/thrust/pair.h +99 -0
  1414. cuda/cccl/headers/include/thrust/partition.h +1391 -0
  1415. cuda/cccl/headers/include/thrust/per_device_resource.h +98 -0
  1416. cuda/cccl/headers/include/thrust/random/detail/discard_block_engine.inl +184 -0
  1417. cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine.inl +155 -0
  1418. cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine_discard.h +104 -0
  1419. cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine.inl +151 -0
  1420. cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h +53 -0
  1421. cuda/cccl/headers/include/thrust/random/detail/mod.h +101 -0
  1422. cuda/cccl/headers/include/thrust/random/detail/normal_distribution.inl +187 -0
  1423. cuda/cccl/headers/include/thrust/random/detail/normal_distribution_base.h +160 -0
  1424. cuda/cccl/headers/include/thrust/random/detail/random_core_access.h +63 -0
  1425. cuda/cccl/headers/include/thrust/random/detail/subtract_with_carry_engine.inl +201 -0
  1426. cuda/cccl/headers/include/thrust/random/detail/uniform_int_distribution.inl +198 -0
  1427. cuda/cccl/headers/include/thrust/random/detail/uniform_real_distribution.inl +200 -0
  1428. cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine.inl +183 -0
  1429. cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine_max.h +217 -0
  1430. cuda/cccl/headers/include/thrust/random/discard_block_engine.h +240 -0
  1431. cuda/cccl/headers/include/thrust/random/linear_congruential_engine.h +289 -0
  1432. cuda/cccl/headers/include/thrust/random/linear_feedback_shift_engine.h +217 -0
  1433. cuda/cccl/headers/include/thrust/random/normal_distribution.h +257 -0
  1434. cuda/cccl/headers/include/thrust/random/subtract_with_carry_engine.h +247 -0
  1435. cuda/cccl/headers/include/thrust/random/uniform_int_distribution.h +261 -0
  1436. cuda/cccl/headers/include/thrust/random/uniform_real_distribution.h +258 -0
  1437. cuda/cccl/headers/include/thrust/random/xor_combine_engine.h +255 -0
  1438. cuda/cccl/headers/include/thrust/random.h +120 -0
  1439. cuda/cccl/headers/include/thrust/reduce.h +1113 -0
  1440. cuda/cccl/headers/include/thrust/remove.h +768 -0
  1441. cuda/cccl/headers/include/thrust/replace.h +826 -0
  1442. cuda/cccl/headers/include/thrust/reverse.h +215 -0
  1443. cuda/cccl/headers/include/thrust/scan.h +1671 -0
  1444. cuda/cccl/headers/include/thrust/scatter.h +446 -0
  1445. cuda/cccl/headers/include/thrust/sequence.h +277 -0
  1446. cuda/cccl/headers/include/thrust/set_operations.h +3026 -0
  1447. cuda/cccl/headers/include/thrust/shuffle.h +182 -0
  1448. cuda/cccl/headers/include/thrust/sort.h +1320 -0
  1449. cuda/cccl/headers/include/thrust/swap.h +147 -0
  1450. cuda/cccl/headers/include/thrust/system/cpp/detail/adjacent_difference.h +30 -0
  1451. cuda/cccl/headers/include/thrust/system/cpp/detail/assign_value.h +30 -0
  1452. cuda/cccl/headers/include/thrust/system/cpp/detail/binary_search.h +32 -0
  1453. cuda/cccl/headers/include/thrust/system/cpp/detail/copy.h +30 -0
  1454. cuda/cccl/headers/include/thrust/system/cpp/detail/copy_if.h +30 -0
  1455. cuda/cccl/headers/include/thrust/system/cpp/detail/count.h +29 -0
  1456. cuda/cccl/headers/include/thrust/system/cpp/detail/equal.h +29 -0
  1457. cuda/cccl/headers/include/thrust/system/cpp/detail/execution_policy.h +90 -0
  1458. cuda/cccl/headers/include/thrust/system/cpp/detail/extrema.h +30 -0
  1459. cuda/cccl/headers/include/thrust/system/cpp/detail/fill.h +29 -0
  1460. cuda/cccl/headers/include/thrust/system/cpp/detail/find.h +30 -0
  1461. cuda/cccl/headers/include/thrust/system/cpp/detail/for_each.h +30 -0
  1462. cuda/cccl/headers/include/thrust/system/cpp/detail/gather.h +29 -0
  1463. cuda/cccl/headers/include/thrust/system/cpp/detail/generate.h +29 -0
  1464. cuda/cccl/headers/include/thrust/system/cpp/detail/get_value.h +30 -0
  1465. cuda/cccl/headers/include/thrust/system/cpp/detail/inner_product.h +29 -0
  1466. cuda/cccl/headers/include/thrust/system/cpp/detail/iter_swap.h +30 -0
  1467. cuda/cccl/headers/include/thrust/system/cpp/detail/logical.h +29 -0
  1468. cuda/cccl/headers/include/thrust/system/cpp/detail/malloc_and_free.h +30 -0
  1469. cuda/cccl/headers/include/thrust/system/cpp/detail/memory.inl +60 -0
  1470. cuda/cccl/headers/include/thrust/system/cpp/detail/merge.h +30 -0
  1471. cuda/cccl/headers/include/thrust/system/cpp/detail/mismatch.h +29 -0
  1472. cuda/cccl/headers/include/thrust/system/cpp/detail/par.h +62 -0
  1473. cuda/cccl/headers/include/thrust/system/cpp/detail/partition.h +30 -0
  1474. cuda/cccl/headers/include/thrust/system/cpp/detail/per_device_resource.h +29 -0
  1475. cuda/cccl/headers/include/thrust/system/cpp/detail/reduce.h +30 -0
  1476. cuda/cccl/headers/include/thrust/system/cpp/detail/reduce_by_key.h +30 -0
  1477. cuda/cccl/headers/include/thrust/system/cpp/detail/remove.h +30 -0
  1478. cuda/cccl/headers/include/thrust/system/cpp/detail/replace.h +29 -0
  1479. cuda/cccl/headers/include/thrust/system/cpp/detail/reverse.h +29 -0
  1480. cuda/cccl/headers/include/thrust/system/cpp/detail/scan.h +30 -0
  1481. cuda/cccl/headers/include/thrust/system/cpp/detail/scan_by_key.h +30 -0
  1482. cuda/cccl/headers/include/thrust/system/cpp/detail/scatter.h +29 -0
  1483. cuda/cccl/headers/include/thrust/system/cpp/detail/sequence.h +29 -0
  1484. cuda/cccl/headers/include/thrust/system/cpp/detail/set_operations.h +30 -0
  1485. cuda/cccl/headers/include/thrust/system/cpp/detail/sort.h +30 -0
  1486. cuda/cccl/headers/include/thrust/system/cpp/detail/swap_ranges.h +29 -0
  1487. cuda/cccl/headers/include/thrust/system/cpp/detail/tabulate.h +29 -0
  1488. cuda/cccl/headers/include/thrust/system/cpp/detail/temporary_buffer.h +29 -0
  1489. cuda/cccl/headers/include/thrust/system/cpp/detail/transform.h +29 -0
  1490. cuda/cccl/headers/include/thrust/system/cpp/detail/transform_reduce.h +29 -0
  1491. cuda/cccl/headers/include/thrust/system/cpp/detail/transform_scan.h +29 -0
  1492. cuda/cccl/headers/include/thrust/system/cpp/detail/uninitialized_copy.h +29 -0
  1493. cuda/cccl/headers/include/thrust/system/cpp/detail/uninitialized_fill.h +29 -0
  1494. cuda/cccl/headers/include/thrust/system/cpp/detail/unique.h +30 -0
  1495. cuda/cccl/headers/include/thrust/system/cpp/detail/unique_by_key.h +30 -0
  1496. cuda/cccl/headers/include/thrust/system/cpp/execution_policy.h +158 -0
  1497. cuda/cccl/headers/include/thrust/system/cpp/memory.h +109 -0
  1498. cuda/cccl/headers/include/thrust/system/cpp/memory_resource.h +75 -0
  1499. cuda/cccl/headers/include/thrust/system/cpp/pointer.h +123 -0
  1500. cuda/cccl/headers/include/thrust/system/cpp/vector.h +99 -0
  1501. cuda/cccl/headers/include/thrust/system/cuda/config.h +123 -0
  1502. cuda/cccl/headers/include/thrust/system/cuda/detail/adjacent_difference.h +219 -0
  1503. cuda/cccl/headers/include/thrust/system/cuda/detail/assign_value.h +124 -0
  1504. cuda/cccl/headers/include/thrust/system/cuda/detail/binary_search.h +29 -0
  1505. cuda/cccl/headers/include/thrust/system/cuda/detail/cdp_dispatch.h +72 -0
  1506. cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +129 -0
  1507. cuda/cccl/headers/include/thrust/system/cuda/detail/copy_if.h +255 -0
  1508. cuda/cccl/headers/include/thrust/system/cuda/detail/core/agent_launcher.h +289 -0
  1509. cuda/cccl/headers/include/thrust/system/cuda/detail/core/triple_chevron_launch.h +191 -0
  1510. cuda/cccl/headers/include/thrust/system/cuda/detail/core/util.h +609 -0
  1511. cuda/cccl/headers/include/thrust/system/cuda/detail/count.h +75 -0
  1512. cuda/cccl/headers/include/thrust/system/cuda/detail/cross_system.h +243 -0
  1513. cuda/cccl/headers/include/thrust/system/cuda/detail/dispatch.h +210 -0
  1514. cuda/cccl/headers/include/thrust/system/cuda/detail/equal.h +64 -0
  1515. cuda/cccl/headers/include/thrust/system/cuda/detail/error.inl +96 -0
  1516. cuda/cccl/headers/include/thrust/system/cuda/detail/execution_policy.h +113 -0
  1517. cuda/cccl/headers/include/thrust/system/cuda/detail/extrema.h +476 -0
  1518. cuda/cccl/headers/include/thrust/system/cuda/detail/fill.h +82 -0
  1519. cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +272 -0
  1520. cuda/cccl/headers/include/thrust/system/cuda/detail/for_each.h +83 -0
  1521. cuda/cccl/headers/include/thrust/system/cuda/detail/gather.h +91 -0
  1522. cuda/cccl/headers/include/thrust/system/cuda/detail/generate.h +85 -0
  1523. cuda/cccl/headers/include/thrust/system/cuda/detail/get_value.h +65 -0
  1524. cuda/cccl/headers/include/thrust/system/cuda/detail/inner_product.h +75 -0
  1525. cuda/cccl/headers/include/thrust/system/cuda/detail/internal/copy_cross_system.h +204 -0
  1526. cuda/cccl/headers/include/thrust/system/cuda/detail/internal/copy_device_to_device.h +92 -0
  1527. cuda/cccl/headers/include/thrust/system/cuda/detail/iter_swap.h +69 -0
  1528. cuda/cccl/headers/include/thrust/system/cuda/detail/logical.h +29 -0
  1529. cuda/cccl/headers/include/thrust/system/cuda/detail/make_unsigned_special.h +61 -0
  1530. cuda/cccl/headers/include/thrust/system/cuda/detail/malloc_and_free.h +121 -0
  1531. cuda/cccl/headers/include/thrust/system/cuda/detail/memory.inl +57 -0
  1532. cuda/cccl/headers/include/thrust/system/cuda/detail/merge.h +228 -0
  1533. cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +217 -0
  1534. cuda/cccl/headers/include/thrust/system/cuda/detail/par.h +237 -0
  1535. cuda/cccl/headers/include/thrust/system/cuda/detail/par_to_seq.h +95 -0
  1536. cuda/cccl/headers/include/thrust/system/cuda/detail/parallel_for.h +81 -0
  1537. cuda/cccl/headers/include/thrust/system/cuda/detail/partition.h +405 -0
  1538. cuda/cccl/headers/include/thrust/system/cuda/detail/per_device_resource.h +72 -0
  1539. cuda/cccl/headers/include/thrust/system/cuda/detail/reduce.h +782 -0
  1540. cuda/cccl/headers/include/thrust/system/cuda/detail/reduce_by_key.h +1001 -0
  1541. cuda/cccl/headers/include/thrust/system/cuda/detail/remove.h +107 -0
  1542. cuda/cccl/headers/include/thrust/system/cuda/detail/replace.h +152 -0
  1543. cuda/cccl/headers/include/thrust/system/cuda/detail/reverse.h +87 -0
  1544. cuda/cccl/headers/include/thrust/system/cuda/detail/scan.h +342 -0
  1545. cuda/cccl/headers/include/thrust/system/cuda/detail/scan_by_key.h +415 -0
  1546. cuda/cccl/headers/include/thrust/system/cuda/detail/scatter.h +79 -0
  1547. cuda/cccl/headers/include/thrust/system/cuda/detail/sequence.h +29 -0
  1548. cuda/cccl/headers/include/thrust/system/cuda/detail/set_operations.h +1738 -0
  1549. cuda/cccl/headers/include/thrust/system/cuda/detail/sort.h +482 -0
  1550. cuda/cccl/headers/include/thrust/system/cuda/detail/swap_ranges.h +98 -0
  1551. cuda/cccl/headers/include/thrust/system/cuda/detail/tabulate.h +75 -0
  1552. cuda/cccl/headers/include/thrust/system/cuda/detail/temporary_buffer.h +132 -0
  1553. cuda/cccl/headers/include/thrust/system/cuda/detail/terminate.h +53 -0
  1554. cuda/cccl/headers/include/thrust/system/cuda/detail/transform.h +415 -0
  1555. cuda/cccl/headers/include/thrust/system/cuda/detail/transform_reduce.h +143 -0
  1556. cuda/cccl/headers/include/thrust/system/cuda/detail/transform_scan.h +119 -0
  1557. cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_copy.h +114 -0
  1558. cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_fill.h +91 -0
  1559. cuda/cccl/headers/include/thrust/system/cuda/detail/unique.h +289 -0
  1560. cuda/cccl/headers/include/thrust/system/cuda/detail/unique_by_key.h +311 -0
  1561. cuda/cccl/headers/include/thrust/system/cuda/detail/util.h +251 -0
  1562. cuda/cccl/headers/include/thrust/system/cuda/error.h +168 -0
  1563. cuda/cccl/headers/include/thrust/system/cuda/execution_policy.h +39 -0
  1564. cuda/cccl/headers/include/thrust/system/cuda/memory.h +122 -0
  1565. cuda/cccl/headers/include/thrust/system/cuda/memory_resource.h +122 -0
  1566. cuda/cccl/headers/include/thrust/system/cuda/pointer.h +160 -0
  1567. cuda/cccl/headers/include/thrust/system/cuda/vector.h +108 -0
  1568. cuda/cccl/headers/include/thrust/system/detail/adl/adjacent_difference.h +51 -0
  1569. cuda/cccl/headers/include/thrust/system/detail/adl/assign_value.h +51 -0
  1570. cuda/cccl/headers/include/thrust/system/detail/adl/binary_search.h +51 -0
  1571. cuda/cccl/headers/include/thrust/system/detail/adl/copy.h +51 -0
  1572. cuda/cccl/headers/include/thrust/system/detail/adl/copy_if.h +52 -0
  1573. cuda/cccl/headers/include/thrust/system/detail/adl/count.h +51 -0
  1574. cuda/cccl/headers/include/thrust/system/detail/adl/equal.h +51 -0
  1575. cuda/cccl/headers/include/thrust/system/detail/adl/extrema.h +51 -0
  1576. cuda/cccl/headers/include/thrust/system/detail/adl/fill.h +51 -0
  1577. cuda/cccl/headers/include/thrust/system/detail/adl/find.h +51 -0
  1578. cuda/cccl/headers/include/thrust/system/detail/adl/for_each.h +51 -0
  1579. cuda/cccl/headers/include/thrust/system/detail/adl/gather.h +51 -0
  1580. cuda/cccl/headers/include/thrust/system/detail/adl/generate.h +51 -0
  1581. cuda/cccl/headers/include/thrust/system/detail/adl/get_value.h +51 -0
  1582. cuda/cccl/headers/include/thrust/system/detail/adl/inner_product.h +51 -0
  1583. cuda/cccl/headers/include/thrust/system/detail/adl/iter_swap.h +51 -0
  1584. cuda/cccl/headers/include/thrust/system/detail/adl/logical.h +51 -0
  1585. cuda/cccl/headers/include/thrust/system/detail/adl/malloc_and_free.h +51 -0
  1586. cuda/cccl/headers/include/thrust/system/detail/adl/merge.h +51 -0
  1587. cuda/cccl/headers/include/thrust/system/detail/adl/mismatch.h +51 -0
  1588. cuda/cccl/headers/include/thrust/system/detail/adl/partition.h +51 -0
  1589. cuda/cccl/headers/include/thrust/system/detail/adl/per_device_resource.h +51 -0
  1590. cuda/cccl/headers/include/thrust/system/detail/adl/reduce.h +51 -0
  1591. cuda/cccl/headers/include/thrust/system/detail/adl/reduce_by_key.h +51 -0
  1592. cuda/cccl/headers/include/thrust/system/detail/adl/remove.h +51 -0
  1593. cuda/cccl/headers/include/thrust/system/detail/adl/replace.h +51 -0
  1594. cuda/cccl/headers/include/thrust/system/detail/adl/reverse.h +51 -0
  1595. cuda/cccl/headers/include/thrust/system/detail/adl/scan.h +51 -0
  1596. cuda/cccl/headers/include/thrust/system/detail/adl/scan_by_key.h +51 -0
  1597. cuda/cccl/headers/include/thrust/system/detail/adl/scatter.h +51 -0
  1598. cuda/cccl/headers/include/thrust/system/detail/adl/sequence.h +51 -0
  1599. cuda/cccl/headers/include/thrust/system/detail/adl/set_operations.h +51 -0
  1600. cuda/cccl/headers/include/thrust/system/detail/adl/sort.h +51 -0
  1601. cuda/cccl/headers/include/thrust/system/detail/adl/swap_ranges.h +51 -0
  1602. cuda/cccl/headers/include/thrust/system/detail/adl/tabulate.h +51 -0
  1603. cuda/cccl/headers/include/thrust/system/detail/adl/temporary_buffer.h +51 -0
  1604. cuda/cccl/headers/include/thrust/system/detail/adl/transform.h +51 -0
  1605. cuda/cccl/headers/include/thrust/system/detail/adl/transform_reduce.h +51 -0
  1606. cuda/cccl/headers/include/thrust/system/detail/adl/transform_scan.h +51 -0
  1607. cuda/cccl/headers/include/thrust/system/detail/adl/uninitialized_copy.h +51 -0
  1608. cuda/cccl/headers/include/thrust/system/detail/adl/uninitialized_fill.h +51 -0
  1609. cuda/cccl/headers/include/thrust/system/detail/adl/unique.h +51 -0
  1610. cuda/cccl/headers/include/thrust/system/detail/adl/unique_by_key.h +51 -0
  1611. cuda/cccl/headers/include/thrust/system/detail/bad_alloc.h +64 -0
  1612. cuda/cccl/headers/include/thrust/system/detail/errno.h +125 -0
  1613. cuda/cccl/headers/include/thrust/system/detail/error_category.inl +302 -0
  1614. cuda/cccl/headers/include/thrust/system/detail/error_code.inl +173 -0
  1615. cuda/cccl/headers/include/thrust/system/detail/error_condition.inl +121 -0
  1616. cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.h +59 -0
  1617. cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.inl +85 -0
  1618. cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.h +167 -0
  1619. cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.inl +391 -0
  1620. cuda/cccl/headers/include/thrust/system/detail/generic/copy.h +51 -0
  1621. cuda/cccl/headers/include/thrust/system/detail/generic/copy.inl +70 -0
  1622. cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.h +64 -0
  1623. cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.inl +152 -0
  1624. cuda/cccl/headers/include/thrust/system/detail/generic/count.h +54 -0
  1625. cuda/cccl/headers/include/thrust/system/detail/generic/count.inl +90 -0
  1626. cuda/cccl/headers/include/thrust/system/detail/generic/equal.h +55 -0
  1627. cuda/cccl/headers/include/thrust/system/detail/generic/equal.inl +66 -0
  1628. cuda/cccl/headers/include/thrust/system/detail/generic/extrema.h +72 -0
  1629. cuda/cccl/headers/include/thrust/system/detail/generic/extrema.inl +258 -0
  1630. cuda/cccl/headers/include/thrust/system/detail/generic/fill.h +60 -0
  1631. cuda/cccl/headers/include/thrust/system/detail/generic/find.h +55 -0
  1632. cuda/cccl/headers/include/thrust/system/detail/generic/find.inl +143 -0
  1633. cuda/cccl/headers/include/thrust/system/detail/generic/for_each.h +64 -0
  1634. cuda/cccl/headers/include/thrust/system/detail/generic/gather.h +79 -0
  1635. cuda/cccl/headers/include/thrust/system/detail/generic/gather.inl +102 -0
  1636. cuda/cccl/headers/include/thrust/system/detail/generic/generate.h +51 -0
  1637. cuda/cccl/headers/include/thrust/system/detail/generic/generate.inl +63 -0
  1638. cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.h +66 -0
  1639. cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.inl +78 -0
  1640. cuda/cccl/headers/include/thrust/system/detail/generic/logical.h +65 -0
  1641. cuda/cccl/headers/include/thrust/system/detail/generic/memory.h +70 -0
  1642. cuda/cccl/headers/include/thrust/system/detail/generic/memory.inl +92 -0
  1643. cuda/cccl/headers/include/thrust/system/detail/generic/merge.h +105 -0
  1644. cuda/cccl/headers/include/thrust/system/detail/generic/merge.inl +154 -0
  1645. cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.h +55 -0
  1646. cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.inl +74 -0
  1647. cuda/cccl/headers/include/thrust/system/detail/generic/partition.h +135 -0
  1648. cuda/cccl/headers/include/thrust/system/detail/generic/partition.inl +213 -0
  1649. cuda/cccl/headers/include/thrust/system/detail/generic/per_device_resource.h +49 -0
  1650. cuda/cccl/headers/include/thrust/system/detail/generic/reduce.h +77 -0
  1651. cuda/cccl/headers/include/thrust/system/detail/generic/reduce.inl +106 -0
  1652. cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.h +89 -0
  1653. cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.inl +192 -0
  1654. cuda/cccl/headers/include/thrust/system/detail/generic/remove.h +92 -0
  1655. cuda/cccl/headers/include/thrust/system/detail/generic/remove.inl +127 -0
  1656. cuda/cccl/headers/include/thrust/system/detail/generic/replace.h +101 -0
  1657. cuda/cccl/headers/include/thrust/system/detail/generic/replace.inl +181 -0
  1658. cuda/cccl/headers/include/thrust/system/detail/generic/reverse.h +54 -0
  1659. cuda/cccl/headers/include/thrust/system/detail/generic/reverse.inl +73 -0
  1660. cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.h +78 -0
  1661. cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.inl +141 -0
  1662. cuda/cccl/headers/include/thrust/system/detail/generic/scan.h +78 -0
  1663. cuda/cccl/headers/include/thrust/system/detail/generic/scan.inl +91 -0
  1664. cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.h +132 -0
  1665. cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.inl +238 -0
  1666. cuda/cccl/headers/include/thrust/system/detail/generic/scatter.h +79 -0
  1667. cuda/cccl/headers/include/thrust/system/detail/generic/scatter.inl +91 -0
  1668. cuda/cccl/headers/include/thrust/system/detail/generic/select_system.h +96 -0
  1669. cuda/cccl/headers/include/thrust/system/detail/generic/sequence.h +70 -0
  1670. cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.h +288 -0
  1671. cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.inl +482 -0
  1672. cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.h +60 -0
  1673. cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.inl +131 -0
  1674. cuda/cccl/headers/include/thrust/system/detail/generic/sort.h +119 -0
  1675. cuda/cccl/headers/include/thrust/system/detail/generic/sort.inl +181 -0
  1676. cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.h +50 -0
  1677. cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.inl +82 -0
  1678. cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.h +47 -0
  1679. cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.inl +60 -0
  1680. cuda/cccl/headers/include/thrust/system/detail/generic/tag.h +53 -0
  1681. cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.h +60 -0
  1682. cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.inl +88 -0
  1683. cuda/cccl/headers/include/thrust/system/detail/generic/transform.h +395 -0
  1684. cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.h +56 -0
  1685. cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.inl +62 -0
  1686. cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.h +86 -0
  1687. cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.inl +119 -0
  1688. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.h +51 -0
  1689. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.inl +172 -0
  1690. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.h +51 -0
  1691. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.inl +121 -0
  1692. cuda/cccl/headers/include/thrust/system/detail/generic/unique.h +77 -0
  1693. cuda/cccl/headers/include/thrust/system/detail/generic/unique.inl +119 -0
  1694. cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.h +87 -0
  1695. cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.inl +132 -0
  1696. cuda/cccl/headers/include/thrust/system/detail/internal/decompose.h +123 -0
  1697. cuda/cccl/headers/include/thrust/system/detail/sequential/adjacent_difference.h +76 -0
  1698. cuda/cccl/headers/include/thrust/system/detail/sequential/assign_value.h +48 -0
  1699. cuda/cccl/headers/include/thrust/system/detail/sequential/binary_search.h +142 -0
  1700. cuda/cccl/headers/include/thrust/system/detail/sequential/copy.h +55 -0
  1701. cuda/cccl/headers/include/thrust/system/detail/sequential/copy.inl +125 -0
  1702. cuda/cccl/headers/include/thrust/system/detail/sequential/copy_backward.h +55 -0
  1703. cuda/cccl/headers/include/thrust/system/detail/sequential/copy_if.h +77 -0
  1704. cuda/cccl/headers/include/thrust/system/detail/sequential/count.h +29 -0
  1705. cuda/cccl/headers/include/thrust/system/detail/sequential/equal.h +29 -0
  1706. cuda/cccl/headers/include/thrust/system/detail/sequential/execution_policy.h +78 -0
  1707. cuda/cccl/headers/include/thrust/system/detail/sequential/extrema.h +116 -0
  1708. cuda/cccl/headers/include/thrust/system/detail/sequential/fill.h +29 -0
  1709. cuda/cccl/headers/include/thrust/system/detail/sequential/find.h +68 -0
  1710. cuda/cccl/headers/include/thrust/system/detail/sequential/for_each.h +80 -0
  1711. cuda/cccl/headers/include/thrust/system/detail/sequential/gather.h +29 -0
  1712. cuda/cccl/headers/include/thrust/system/detail/sequential/general_copy.h +129 -0
  1713. cuda/cccl/headers/include/thrust/system/detail/sequential/generate.h +29 -0
  1714. cuda/cccl/headers/include/thrust/system/detail/sequential/get_value.h +49 -0
  1715. cuda/cccl/headers/include/thrust/system/detail/sequential/inner_product.h +29 -0
  1716. cuda/cccl/headers/include/thrust/system/detail/sequential/insertion_sort.h +147 -0
  1717. cuda/cccl/headers/include/thrust/system/detail/sequential/iter_swap.h +51 -0
  1718. cuda/cccl/headers/include/thrust/system/detail/sequential/logical.h +29 -0
  1719. cuda/cccl/headers/include/thrust/system/detail/sequential/malloc_and_free.h +56 -0
  1720. cuda/cccl/headers/include/thrust/system/detail/sequential/merge.h +81 -0
  1721. cuda/cccl/headers/include/thrust/system/detail/sequential/merge.inl +151 -0
  1722. cuda/cccl/headers/include/thrust/system/detail/sequential/mismatch.h +29 -0
  1723. cuda/cccl/headers/include/thrust/system/detail/sequential/partition.h +309 -0
  1724. cuda/cccl/headers/include/thrust/system/detail/sequential/per_device_resource.h +29 -0
  1725. cuda/cccl/headers/include/thrust/system/detail/sequential/reduce.h +70 -0
  1726. cuda/cccl/headers/include/thrust/system/detail/sequential/reduce_by_key.h +104 -0
  1727. cuda/cccl/headers/include/thrust/system/detail/sequential/remove.h +185 -0
  1728. cuda/cccl/headers/include/thrust/system/detail/sequential/replace.h +29 -0
  1729. cuda/cccl/headers/include/thrust/system/detail/sequential/reverse.h +29 -0
  1730. cuda/cccl/headers/include/thrust/system/detail/sequential/scan.h +160 -0
  1731. cuda/cccl/headers/include/thrust/system/detail/sequential/scan_by_key.h +151 -0
  1732. cuda/cccl/headers/include/thrust/system/detail/sequential/scatter.h +29 -0
  1733. cuda/cccl/headers/include/thrust/system/detail/sequential/sequence.h +29 -0
  1734. cuda/cccl/headers/include/thrust/system/detail/sequential/set_operations.h +212 -0
  1735. cuda/cccl/headers/include/thrust/system/detail/sequential/sort.h +65 -0
  1736. cuda/cccl/headers/include/thrust/system/detail/sequential/sort.inl +116 -0
  1737. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.h +61 -0
  1738. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.inl +362 -0
  1739. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.h +54 -0
  1740. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.inl +130 -0
  1741. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.h +54 -0
  1742. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.inl +592 -0
  1743. cuda/cccl/headers/include/thrust/system/detail/sequential/swap_ranges.h +29 -0
  1744. cuda/cccl/headers/include/thrust/system/detail/sequential/tabulate.h +29 -0
  1745. cuda/cccl/headers/include/thrust/system/detail/sequential/temporary_buffer.h +29 -0
  1746. cuda/cccl/headers/include/thrust/system/detail/sequential/transform.h +29 -0
  1747. cuda/cccl/headers/include/thrust/system/detail/sequential/transform_reduce.h +29 -0
  1748. cuda/cccl/headers/include/thrust/system/detail/sequential/transform_scan.h +29 -0
  1749. cuda/cccl/headers/include/thrust/system/detail/sequential/trivial_copy.h +64 -0
  1750. cuda/cccl/headers/include/thrust/system/detail/sequential/uninitialized_copy.h +29 -0
  1751. cuda/cccl/headers/include/thrust/system/detail/sequential/uninitialized_fill.h +29 -0
  1752. cuda/cccl/headers/include/thrust/system/detail/sequential/unique.h +121 -0
  1753. cuda/cccl/headers/include/thrust/system/detail/sequential/unique_by_key.h +112 -0
  1754. cuda/cccl/headers/include/thrust/system/detail/system_error.inl +108 -0
  1755. cuda/cccl/headers/include/thrust/system/error_code.h +512 -0
  1756. cuda/cccl/headers/include/thrust/system/omp/detail/adjacent_difference.h +54 -0
  1757. cuda/cccl/headers/include/thrust/system/omp/detail/assign_value.h +30 -0
  1758. cuda/cccl/headers/include/thrust/system/omp/detail/binary_search.h +77 -0
  1759. cuda/cccl/headers/include/thrust/system/omp/detail/copy.h +50 -0
  1760. cuda/cccl/headers/include/thrust/system/omp/detail/copy.inl +74 -0
  1761. cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.h +56 -0
  1762. cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.inl +59 -0
  1763. cuda/cccl/headers/include/thrust/system/omp/detail/count.h +30 -0
  1764. cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.h +50 -0
  1765. cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.inl +65 -0
  1766. cuda/cccl/headers/include/thrust/system/omp/detail/equal.h +30 -0
  1767. cuda/cccl/headers/include/thrust/system/omp/detail/execution_policy.h +113 -0
  1768. cuda/cccl/headers/include/thrust/system/omp/detail/extrema.h +66 -0
  1769. cuda/cccl/headers/include/thrust/system/omp/detail/fill.h +30 -0
  1770. cuda/cccl/headers/include/thrust/system/omp/detail/find.h +53 -0
  1771. cuda/cccl/headers/include/thrust/system/omp/detail/for_each.h +56 -0
  1772. cuda/cccl/headers/include/thrust/system/omp/detail/for_each.inl +87 -0
  1773. cuda/cccl/headers/include/thrust/system/omp/detail/gather.h +30 -0
  1774. cuda/cccl/headers/include/thrust/system/omp/detail/generate.h +30 -0
  1775. cuda/cccl/headers/include/thrust/system/omp/detail/get_value.h +30 -0
  1776. cuda/cccl/headers/include/thrust/system/omp/detail/inner_product.h +30 -0
  1777. cuda/cccl/headers/include/thrust/system/omp/detail/iter_swap.h +30 -0
  1778. cuda/cccl/headers/include/thrust/system/omp/detail/logical.h +30 -0
  1779. cuda/cccl/headers/include/thrust/system/omp/detail/malloc_and_free.h +30 -0
  1780. cuda/cccl/headers/include/thrust/system/omp/detail/memory.inl +93 -0
  1781. cuda/cccl/headers/include/thrust/system/omp/detail/merge.h +30 -0
  1782. cuda/cccl/headers/include/thrust/system/omp/detail/mismatch.h +30 -0
  1783. cuda/cccl/headers/include/thrust/system/omp/detail/par.h +62 -0
  1784. cuda/cccl/headers/include/thrust/system/omp/detail/partition.h +88 -0
  1785. cuda/cccl/headers/include/thrust/system/omp/detail/partition.inl +102 -0
  1786. cuda/cccl/headers/include/thrust/system/omp/detail/per_device_resource.h +29 -0
  1787. cuda/cccl/headers/include/thrust/system/omp/detail/pragma_omp.h +54 -0
  1788. cuda/cccl/headers/include/thrust/system/omp/detail/reduce.h +54 -0
  1789. cuda/cccl/headers/include/thrust/system/omp/detail/reduce.inl +78 -0
  1790. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.h +64 -0
  1791. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.inl +65 -0
  1792. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.h +59 -0
  1793. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.inl +103 -0
  1794. cuda/cccl/headers/include/thrust/system/omp/detail/remove.h +72 -0
  1795. cuda/cccl/headers/include/thrust/system/omp/detail/remove.inl +87 -0
  1796. cuda/cccl/headers/include/thrust/system/omp/detail/replace.h +30 -0
  1797. cuda/cccl/headers/include/thrust/system/omp/detail/reverse.h +30 -0
  1798. cuda/cccl/headers/include/thrust/system/omp/detail/scan.h +30 -0
  1799. cuda/cccl/headers/include/thrust/system/omp/detail/scan_by_key.h +30 -0
  1800. cuda/cccl/headers/include/thrust/system/omp/detail/scatter.h +30 -0
  1801. cuda/cccl/headers/include/thrust/system/omp/detail/sequence.h +30 -0
  1802. cuda/cccl/headers/include/thrust/system/omp/detail/set_operations.h +30 -0
  1803. cuda/cccl/headers/include/thrust/system/omp/detail/sort.h +60 -0
  1804. cuda/cccl/headers/include/thrust/system/omp/detail/sort.inl +265 -0
  1805. cuda/cccl/headers/include/thrust/system/omp/detail/swap_ranges.h +30 -0
  1806. cuda/cccl/headers/include/thrust/system/omp/detail/tabulate.h +30 -0
  1807. cuda/cccl/headers/include/thrust/system/omp/detail/temporary_buffer.h +29 -0
  1808. cuda/cccl/headers/include/thrust/system/omp/detail/transform.h +30 -0
  1809. cuda/cccl/headers/include/thrust/system/omp/detail/transform_reduce.h +30 -0
  1810. cuda/cccl/headers/include/thrust/system/omp/detail/transform_scan.h +30 -0
  1811. cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_copy.h +30 -0
  1812. cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_fill.h +30 -0
  1813. cuda/cccl/headers/include/thrust/system/omp/detail/unique.h +60 -0
  1814. cuda/cccl/headers/include/thrust/system/omp/detail/unique.inl +71 -0
  1815. cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.h +67 -0
  1816. cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.inl +75 -0
  1817. cuda/cccl/headers/include/thrust/system/omp/execution_policy.h +157 -0
  1818. cuda/cccl/headers/include/thrust/system/omp/memory.h +111 -0
  1819. cuda/cccl/headers/include/thrust/system/omp/memory_resource.h +75 -0
  1820. cuda/cccl/headers/include/thrust/system/omp/pointer.h +124 -0
  1821. cuda/cccl/headers/include/thrust/system/omp/vector.h +99 -0
  1822. cuda/cccl/headers/include/thrust/system/system_error.h +185 -0
  1823. cuda/cccl/headers/include/thrust/system/tbb/detail/adjacent_difference.h +54 -0
  1824. cuda/cccl/headers/include/thrust/system/tbb/detail/assign_value.h +30 -0
  1825. cuda/cccl/headers/include/thrust/system/tbb/detail/binary_search.h +30 -0
  1826. cuda/cccl/headers/include/thrust/system/tbb/detail/copy.h +50 -0
  1827. cuda/cccl/headers/include/thrust/system/tbb/detail/copy.inl +73 -0
  1828. cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.h +47 -0
  1829. cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.inl +136 -0
  1830. cuda/cccl/headers/include/thrust/system/tbb/detail/count.h +30 -0
  1831. cuda/cccl/headers/include/thrust/system/tbb/detail/equal.h +30 -0
  1832. cuda/cccl/headers/include/thrust/system/tbb/detail/execution_policy.h +92 -0
  1833. cuda/cccl/headers/include/thrust/system/tbb/detail/extrema.h +66 -0
  1834. cuda/cccl/headers/include/thrust/system/tbb/detail/fill.h +30 -0
  1835. cuda/cccl/headers/include/thrust/system/tbb/detail/find.h +49 -0
  1836. cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.h +51 -0
  1837. cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.inl +91 -0
  1838. cuda/cccl/headers/include/thrust/system/tbb/detail/gather.h +30 -0
  1839. cuda/cccl/headers/include/thrust/system/tbb/detail/generate.h +30 -0
  1840. cuda/cccl/headers/include/thrust/system/tbb/detail/get_value.h +30 -0
  1841. cuda/cccl/headers/include/thrust/system/tbb/detail/inner_product.h +30 -0
  1842. cuda/cccl/headers/include/thrust/system/tbb/detail/iter_swap.h +30 -0
  1843. cuda/cccl/headers/include/thrust/system/tbb/detail/logical.h +30 -0
  1844. cuda/cccl/headers/include/thrust/system/tbb/detail/malloc_and_free.h +30 -0
  1845. cuda/cccl/headers/include/thrust/system/tbb/detail/memory.inl +94 -0
  1846. cuda/cccl/headers/include/thrust/system/tbb/detail/merge.h +77 -0
  1847. cuda/cccl/headers/include/thrust/system/tbb/detail/merge.inl +327 -0
  1848. cuda/cccl/headers/include/thrust/system/tbb/detail/mismatch.h +30 -0
  1849. cuda/cccl/headers/include/thrust/system/tbb/detail/par.h +62 -0
  1850. cuda/cccl/headers/include/thrust/system/tbb/detail/partition.h +84 -0
  1851. cuda/cccl/headers/include/thrust/system/tbb/detail/partition.inl +98 -0
  1852. cuda/cccl/headers/include/thrust/system/tbb/detail/per_device_resource.h +29 -0
  1853. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.h +54 -0
  1854. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.inl +137 -0
  1855. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.h +61 -0
  1856. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.inl +400 -0
  1857. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_intervals.h +140 -0
  1858. cuda/cccl/headers/include/thrust/system/tbb/detail/remove.h +76 -0
  1859. cuda/cccl/headers/include/thrust/system/tbb/detail/remove.inl +87 -0
  1860. cuda/cccl/headers/include/thrust/system/tbb/detail/replace.h +30 -0
  1861. cuda/cccl/headers/include/thrust/system/tbb/detail/reverse.h +30 -0
  1862. cuda/cccl/headers/include/thrust/system/tbb/detail/scan.h +59 -0
  1863. cuda/cccl/headers/include/thrust/system/tbb/detail/scan.inl +312 -0
  1864. cuda/cccl/headers/include/thrust/system/tbb/detail/scan_by_key.h +30 -0
  1865. cuda/cccl/headers/include/thrust/system/tbb/detail/scatter.h +30 -0
  1866. cuda/cccl/headers/include/thrust/system/tbb/detail/sequence.h +30 -0
  1867. cuda/cccl/headers/include/thrust/system/tbb/detail/set_operations.h +30 -0
  1868. cuda/cccl/headers/include/thrust/system/tbb/detail/sort.h +60 -0
  1869. cuda/cccl/headers/include/thrust/system/tbb/detail/sort.inl +295 -0
  1870. cuda/cccl/headers/include/thrust/system/tbb/detail/swap_ranges.h +30 -0
  1871. cuda/cccl/headers/include/thrust/system/tbb/detail/tabulate.h +30 -0
  1872. cuda/cccl/headers/include/thrust/system/tbb/detail/temporary_buffer.h +29 -0
  1873. cuda/cccl/headers/include/thrust/system/tbb/detail/transform.h +30 -0
  1874. cuda/cccl/headers/include/thrust/system/tbb/detail/transform_reduce.h +30 -0
  1875. cuda/cccl/headers/include/thrust/system/tbb/detail/transform_scan.h +30 -0
  1876. cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_copy.h +30 -0
  1877. cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_fill.h +30 -0
  1878. cuda/cccl/headers/include/thrust/system/tbb/detail/unique.h +60 -0
  1879. cuda/cccl/headers/include/thrust/system/tbb/detail/unique.inl +71 -0
  1880. cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.h +67 -0
  1881. cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.inl +75 -0
  1882. cuda/cccl/headers/include/thrust/system/tbb/execution_policy.h +157 -0
  1883. cuda/cccl/headers/include/thrust/system/tbb/memory.h +111 -0
  1884. cuda/cccl/headers/include/thrust/system/tbb/memory_resource.h +75 -0
  1885. cuda/cccl/headers/include/thrust/system/tbb/pointer.h +124 -0
  1886. cuda/cccl/headers/include/thrust/system/tbb/vector.h +99 -0
  1887. cuda/cccl/headers/include/thrust/system_error.h +57 -0
  1888. cuda/cccl/headers/include/thrust/tabulate.h +125 -0
  1889. cuda/cccl/headers/include/thrust/transform.h +1045 -0
  1890. cuda/cccl/headers/include/thrust/transform_reduce.h +190 -0
  1891. cuda/cccl/headers/include/thrust/transform_scan.h +442 -0
  1892. cuda/cccl/headers/include/thrust/tuple.h +139 -0
  1893. cuda/cccl/headers/include/thrust/type_traits/integer_sequence.h +261 -0
  1894. cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +154 -0
  1895. cuda/cccl/headers/include/thrust/type_traits/is_execution_policy.h +65 -0
  1896. cuda/cccl/headers/include/thrust/type_traits/is_operator_less_or_greater_function_object.h +184 -0
  1897. cuda/cccl/headers/include/thrust/type_traits/is_operator_plus_function_object.h +116 -0
  1898. cuda/cccl/headers/include/thrust/type_traits/is_trivially_relocatable.h +336 -0
  1899. cuda/cccl/headers/include/thrust/type_traits/logical_metafunctions.h +42 -0
  1900. cuda/cccl/headers/include/thrust/type_traits/unwrap_contiguous_iterator.h +96 -0
  1901. cuda/cccl/headers/include/thrust/uninitialized_copy.h +300 -0
  1902. cuda/cccl/headers/include/thrust/uninitialized_fill.h +268 -0
  1903. cuda/cccl/headers/include/thrust/unique.h +1088 -0
  1904. cuda/cccl/headers/include/thrust/universal_allocator.h +93 -0
  1905. cuda/cccl/headers/include/thrust/universal_ptr.h +34 -0
  1906. cuda/cccl/headers/include/thrust/universal_vector.h +71 -0
  1907. cuda/cccl/headers/include/thrust/version.h +93 -0
  1908. cuda/cccl/headers/include/thrust/zip_function.h +176 -0
  1909. cuda/cccl/headers/include_paths.py +54 -0
  1910. cuda/cccl/parallel/__init__.py +9 -0
  1911. cuda/cccl/parallel/experimental/.gitignore +4 -0
  1912. cuda/cccl/parallel/experimental/__init__.py +75 -0
  1913. cuda/cccl/parallel/experimental/_bindings.py +56 -0
  1914. cuda/cccl/parallel/experimental/_bindings.pyi +405 -0
  1915. cuda/cccl/parallel/experimental/_bindings_impl.pyx +1957 -0
  1916. cuda/cccl/parallel/experimental/_caching.py +71 -0
  1917. cuda/cccl/parallel/experimental/_cccl_interop.py +396 -0
  1918. cuda/cccl/parallel/experimental/_utils/__init__.py +0 -0
  1919. cuda/cccl/parallel/experimental/_utils/protocols.py +132 -0
  1920. cuda/cccl/parallel/experimental/_utils/temp_storage_buffer.py +86 -0
  1921. cuda/cccl/parallel/experimental/algorithms/__init__.py +50 -0
  1922. cuda/cccl/parallel/experimental/algorithms/_histogram.py +243 -0
  1923. cuda/cccl/parallel/experimental/algorithms/_merge_sort.py +225 -0
  1924. cuda/cccl/parallel/experimental/algorithms/_radix_sort.py +312 -0
  1925. cuda/cccl/parallel/experimental/algorithms/_reduce.py +184 -0
  1926. cuda/cccl/parallel/experimental/algorithms/_scan.py +261 -0
  1927. cuda/cccl/parallel/experimental/algorithms/_segmented_reduce.py +257 -0
  1928. cuda/cccl/parallel/experimental/algorithms/_transform.py +308 -0
  1929. cuda/cccl/parallel/experimental/algorithms/_unique_by_key.py +252 -0
  1930. cuda/cccl/parallel/experimental/cccl/.gitkeep +0 -0
  1931. cuda/cccl/parallel/experimental/cu12/_bindings_impl.cpython-312-x86_64-linux-gnu.so +0 -0
  1932. cuda/cccl/parallel/experimental/cu12/cccl/libcccl.c.parallel.so +0 -0
  1933. cuda/cccl/parallel/experimental/cu13/_bindings_impl.cpython-312-x86_64-linux-gnu.so +0 -0
  1934. cuda/cccl/parallel/experimental/cu13/cccl/libcccl.c.parallel.so +0 -0
  1935. cuda/cccl/parallel/experimental/iterators/__init__.py +21 -0
  1936. cuda/cccl/parallel/experimental/iterators/_factories.py +214 -0
  1937. cuda/cccl/parallel/experimental/iterators/_iterators.py +627 -0
  1938. cuda/cccl/parallel/experimental/iterators/_zip_iterator.py +207 -0
  1939. cuda/cccl/parallel/experimental/numba_utils.py +6 -0
  1940. cuda/cccl/parallel/experimental/op.py +3 -0
  1941. cuda/cccl/parallel/experimental/struct.py +272 -0
  1942. cuda/cccl/parallel/experimental/typing.py +35 -0
  1943. cuda/cccl/py.typed +0 -0
  1944. cuda_cccl-0.1.3.2.0.dev271.dist-info/METADATA +40 -0
  1945. cuda_cccl-0.1.3.2.0.dev271.dist-info/RECORD +1947 -0
  1946. cuda_cccl-0.1.3.2.0.dev271.dist-info/WHEEL +5 -0
  1947. cuda_cccl-0.1.3.2.0.dev271.dist-info/licenses/LICENSE +1 -0
@@ -0,0 +1,1898 @@
1
+ /******************************************************************************
2
+ * Copyright (c) 2011, Duane Merrill. All rights reserved.
3
+ * Copyright (c) 2011-2024, NVIDIA CORPORATION. All rights reserved.
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ * * Redistributions of source code must retain the above copyright
8
+ * notice, this list of conditions and the following disclaimer.
9
+ * * Redistributions in binary form must reproduce the above copyright
10
+ * notice, this list of conditions and the following disclaimer in the
11
+ * documentation and/or other materials provided with the distribution.
12
+ * * Neither the name of the NVIDIA CORPORATION nor the
13
+ * names of its contributors may be used to endorse or promote products
14
+ * derived from this software without specific prior written permission.
15
+ *
16
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
+ *
27
+ ******************************************************************************/
28
+
29
+ //! @file
30
+ //! cub::DeviceReduce provides device-wide, parallel operations for computing a reduction across a sequence of data
31
+ //! items residing within device-accessible memory.
32
+
33
+ #pragma once
34
+
35
+ #include <cub/config.cuh>
36
+
37
+ #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
38
+ # pragma GCC system_header
39
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
40
+ # pragma clang system_header
41
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
42
+ # pragma system_header
43
+ #endif // no system header
44
+
45
+ #include <cub/detail/choose_offset.cuh>
46
+ #include <cub/detail/temporary_storage.cuh>
47
+ #include <cub/device/dispatch/dispatch_reduce_by_key.cuh>
48
+ #include <cub/device/dispatch/dispatch_reduce_deterministic.cuh>
49
+ #include <cub/device/dispatch/dispatch_reduce_nondeterministic.cuh>
50
+ #include <cub/device/dispatch/dispatch_streaming_reduce.cuh>
51
+ #include <cub/thread/thread_operators.cuh>
52
+ #include <cub/util_type.cuh>
53
+
54
+ #include <thrust/iterator/tabulate_output_iterator.h>
55
+
56
+ #include <cuda/__execution/determinism.h>
57
+ #include <cuda/__execution/require.h>
58
+ #include <cuda/__execution/tune.h>
59
+ #include <cuda/__memory_resource/get_memory_resource.h>
60
+ #include <cuda/__stream/get_stream.h>
61
+ #include <cuda/std/__execution/env.h>
62
+ #include <cuda/std/limits>
63
+ #include <cuda/std/type_traits>
64
+
65
+ CUB_NAMESPACE_BEGIN
66
+
67
+ namespace detail
68
+ {
69
+
70
+ template <typename DeterminismT>
71
+ inline constexpr bool is_non_deterministic_v =
72
+ ::cuda::std::is_same_v<DeterminismT, ::cuda::execution::determinism::not_guaranteed_t>;
73
+
74
+ namespace reduce
75
+ {
76
+
77
//! Empty tag type used as the query key for reduce tunings: `tuning::query` takes it as its argument,
//! and `DeviceReduce::reduce_impl` passes it to `__query_result_or_t` when looking up a tuning.
struct get_tuning_query_t {};
79
+
80
+ template <class Derived>
81
+ struct tuning
82
+ {
83
+ [[nodiscard]] _CCCL_NODEBUG_API constexpr auto query(const get_tuning_query_t&) const noexcept -> Derived
84
+ {
85
+ return static_cast<const Derived&>(*this);
86
+ }
87
+ };
88
+
89
+ struct default_tuning : tuning<default_tuning>
90
+ {
91
+ template <class AccumT, class Offset, class OpT>
92
+ using fn = policy_hub<AccumT, Offset, OpT>;
93
+ };
94
+
95
+ struct default_rfa_tuning : tuning<default_tuning>
96
+ {
97
+ template <class AccumT, class Offset, class OpT>
98
+ using fn = detail::rfa::policy_hub<AccumT, Offset, OpT>;
99
+ };
100
+
101
+ template <typename ExtremumOutIteratorT, typename IndexOutIteratorT>
102
+ struct unzip_and_write_arg_extremum_op
103
+ {
104
+ ExtremumOutIteratorT result_out_it;
105
+ IndexOutIteratorT index_out_it;
106
+
107
+ template <typename IndexT, typename KeyValuePairT>
108
+ _CCCL_DEVICE _CCCL_FORCEINLINE void operator()(IndexT, KeyValuePairT reduced_result)
109
+ {
110
+ *result_out_it = reduced_result.value;
111
+ *index_out_it = reduced_result.key;
112
+ }
113
+ };
114
+ } // namespace reduce
115
+
116
+ // TODO(gevtushenko): move cudax `device_memory_resource` to `cuda::__device_memory_resource` and use it here
117
+ struct device_memory_resource
118
+ {
119
+ void* allocate(size_t bytes, size_t /* alignment */)
120
+ {
121
+ void* ptr{nullptr};
122
+ _CCCL_TRY_CUDA_API(::cudaMalloc, "allocate failed to allocate with cudaMalloc", &ptr, bytes);
123
+ return ptr;
124
+ }
125
+
126
+ void deallocate(void* ptr, size_t /* bytes */)
127
+ {
128
+ _CCCL_ASSERT_CUDA_API(::cudaFree, "deallocate failed", ptr);
129
+ }
130
+
131
+ void* allocate(::cuda::stream_ref stream, size_t bytes, size_t /* alignment */)
132
+ {
133
+ return allocate(stream, bytes);
134
+ }
135
+
136
+ void* allocate(::cuda::stream_ref stream, size_t bytes)
137
+ {
138
+ void* ptr{nullptr};
139
+ _CCCL_TRY_CUDA_API(::cudaMallocAsync, "allocate failed to allocate with cudaMallocAsync", &ptr, bytes, stream.get());
140
+ return ptr;
141
+ }
142
+
143
+ void deallocate(const ::cuda::stream_ref stream, void* ptr, size_t /* bytes */)
144
+ {
145
+ _CCCL_ASSERT_CUDA_API(::cudaFreeAsync, "deallocate failed", ptr, stream.get());
146
+ }
147
+ };
148
+
149
+ } // namespace detail
150
+
151
+ //! @rst
152
+ //! DeviceReduce provides device-wide, parallel operations for computing
153
+ //! a reduction across a sequence of data items residing within
154
+ //! device-accessible memory.
155
+ //!
156
+ //! .. image:: ../../img/reduce_logo.png
157
+ //! :align: center
158
+ //!
159
+ //! Overview
160
+ //! ====================================
161
+ //!
162
+ //! A `reduction <http://en.wikipedia.org/wiki/Reduce_(higher-order_function)>`_
163
+ //! (or *fold*) uses a binary combining operator to compute a single aggregate
164
+ //! from a sequence of input elements.
165
+ //!
166
+ //! Usage Considerations
167
+ //! ====================================
168
+ //!
169
+ //! @cdp_class{DeviceReduce}
170
+ //!
171
+ //! Performance
172
+ //! ====================================
173
+ //!
174
+ //! @linear_performance{reduction, reduce-by-key, and run-length encode}
175
+ //!
176
+ //! @endrst
177
+ struct DeviceReduce
178
+ {
179
+ private:
180
+ template <typename TuningEnvT,
181
+ typename InputIteratorT,
182
+ typename OutputIteratorT,
183
+ typename ReductionOpT,
184
+ typename T,
185
+ typename NumItemsT,
186
+ ::cuda::execution::determinism::__determinism_t Determinism>
187
+ CUB_RUNTIME_FUNCTION static cudaError_t reduce_impl(
188
+ void* d_temp_storage,
189
+ size_t& temp_storage_bytes,
190
+ InputIteratorT d_in,
191
+ OutputIteratorT d_out,
192
+ NumItemsT num_items,
193
+ ReductionOpT reduction_op,
194
+ T init,
195
+ ::cuda::execution::determinism::__determinism_holder_t<Determinism>,
196
+ cudaStream_t stream)
197
+ {
198
+ using offset_t = detail::choose_offset_t<NumItemsT>;
199
+ using accum_t = ::cuda::std::__accumulator_t<ReductionOpT, detail::it_value_t<InputIteratorT>, T>;
200
+ using transform_t = ::cuda::std::identity;
201
+ using reduce_tuning_t = ::cuda::std::execution::
202
+ __query_result_or_t<TuningEnvT, detail::reduce::get_tuning_query_t, detail::reduce::default_tuning>;
203
+ using policy_t = typename reduce_tuning_t::template fn<accum_t, offset_t, ReductionOpT>;
204
+ using dispatch_t =
205
+ DispatchReduce<InputIteratorT, OutputIteratorT, offset_t, ReductionOpT, T, accum_t, transform_t, policy_t>;
206
+
207
+ return dispatch_t::Dispatch(
208
+ d_temp_storage, temp_storage_bytes, d_in, d_out, static_cast<offset_t>(num_items), reduction_op, init, stream);
209
+ }
210
+
211
+ template <typename TuningEnvT,
212
+ typename InputIteratorT,
213
+ typename OutputIteratorT,
214
+ typename ReductionOpT,
215
+ typename T,
216
+ typename NumItemsT>
217
+ CUB_RUNTIME_FUNCTION static cudaError_t reduce_impl(
218
+ void* d_temp_storage,
219
+ size_t& temp_storage_bytes,
220
+ InputIteratorT d_in,
221
+ OutputIteratorT d_out,
222
+ NumItemsT num_items,
223
+ ReductionOpT,
224
+ T init,
225
+ ::cuda::execution::determinism::gpu_to_gpu_t,
226
+ cudaStream_t stream)
227
+ {
228
+ using offset_t = detail::choose_offset_t<NumItemsT>;
229
+ using accum_t = ::cuda::std::__accumulator_t<ReductionOpT, detail::it_value_t<InputIteratorT>, T>;
230
+
231
+ using transform_t = ::cuda::std::identity;
232
+ using reduce_tuning_t = ::cuda::std::execution::
233
+ __query_result_or_t<TuningEnvT, detail::reduce::get_tuning_query_t, detail::reduce::default_rfa_tuning>;
234
+ using policy_t = typename reduce_tuning_t::template fn<accum_t, offset_t, ReductionOpT>;
235
+ using dispatch_t =
236
+ detail::DispatchReduceDeterministic<InputIteratorT, OutputIteratorT, offset_t, T, transform_t, accum_t, policy_t>;
237
+
238
+ return dispatch_t::Dispatch(
239
+ d_temp_storage, temp_storage_bytes, d_in, d_out, static_cast<offset_t>(num_items), init, stream);
240
+ }
241
+
242
+ template <typename TuningEnvT,
243
+ typename InputIteratorT,
244
+ typename OutputIteratorT,
245
+ typename ReductionOpT,
246
+ typename T,
247
+ typename NumItemsT>
248
+ CUB_RUNTIME_FUNCTION static cudaError_t reduce_impl(
249
+ void* d_temp_storage,
250
+ size_t& temp_storage_bytes,
251
+ InputIteratorT d_in,
252
+ OutputIteratorT d_out,
253
+ NumItemsT num_items,
254
+ ReductionOpT reduction_op,
255
+ T init,
256
+ ::cuda::execution::determinism::not_guaranteed_t,
257
+ cudaStream_t stream)
258
+ {
259
+ using offset_t = detail::choose_offset_t<NumItemsT>;
260
+ using accum_t = ::cuda::std::__accumulator_t<ReductionOpT, detail::it_value_t<InputIteratorT>, T>;
261
+
262
+ using output_t = THRUST_NS_QUALIFIER::unwrap_contiguous_iterator_t<OutputIteratorT>;
263
+
264
+ using transform_t = ::cuda::std::identity;
265
+ using reduce_tuning_t = ::cuda::std::execution::
266
+ __query_result_or_t<TuningEnvT, detail::reduce::get_tuning_query_t, detail::reduce::default_tuning>;
267
+ using policy_t = typename reduce_tuning_t::template fn<accum_t, offset_t, ReductionOpT>;
268
+ using dispatch_t = detail::
269
+ DispatchReduceNondeterministic<InputIteratorT, output_t, offset_t, ReductionOpT, T, accum_t, transform_t, policy_t>;
270
+
271
+ return dispatch_t::Dispatch(
272
+ d_temp_storage,
273
+ temp_storage_bytes,
274
+ d_in,
275
+ THRUST_NS_QUALIFIER::unwrap_contiguous_iterator(d_out),
276
+ static_cast<offset_t>(num_items),
277
+ reduction_op,
278
+ init,
279
+ stream);
280
+ }
281
+
282
+ public:
283
+ //! @rst
284
+ //! Computes a device-wide reduction using the specified binary ``reduction_op`` functor and initial value ``init``.
285
+ //!
286
+ //! - Does not support binary reduction operators that are non-commutative.
287
+ //! - Provides "run-to-run" determinism for pseudo-associative reduction
288
+ //! (e.g., addition of floating point types) on the same GPU device.
289
+ //! However, results for pseudo-associative reduction may be inconsistent
290
+ //! from one device to another device of a different compute-capability
291
+ //! because CUB can employ different tile-sizing for different architectures.
292
+ //! - The range ``[d_in, d_in + num_items)`` shall not overlap ``d_out``.
293
+ //! - @devicestorage
294
+ //!
295
+ //! Snippet
296
+ //! +++++++++++++++++++++++++++++++++++++++++++++
297
+ //!
298
+ //! The code snippet below illustrates a user-defined min-reduction of a
299
+ //! device vector of ``int`` data elements.
300
+ //!
301
+ //! .. code-block:: c++
302
+ //!
303
+ //! #include <cub/cub.cuh>
304
+ //! // or equivalently <cub/device/device_reduce.cuh>
305
+ //!
306
+ //! // CustomMin functor
307
+ //! struct CustomMin
308
+ //! {
309
+ //! template <typename T>
310
+ //! __device__ __forceinline__
311
+ //! T operator()(const T &a, const T &b) const {
312
+ //! return (b < a) ? b : a;
313
+ //! }
314
+ //! };
315
+ //!
316
+ //! // Declare, allocate, and initialize device-accessible pointers for
317
+ //! // input and output
318
+ //! int num_items; // e.g., 7
319
+ //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
320
+ //! int *d_out; // e.g., [-]
321
+ //! CustomMin min_op;
322
+ //! int init; // e.g., INT_MAX
323
+ //! ...
324
+ //!
325
+ //! // Determine temporary device storage requirements
326
+ //! void *d_temp_storage = nullptr;
327
+ //! size_t temp_storage_bytes = 0;
328
+ //! cub::DeviceReduce::Reduce(
329
+ //! d_temp_storage, temp_storage_bytes,
330
+ //! d_in, d_out, num_items, min_op, init);
331
+ //!
332
+ //! // Allocate temporary storage
333
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
334
+ //!
335
+ //! // Run reduction
336
+ //! cub::DeviceReduce::Reduce(
337
+ //! d_temp_storage, temp_storage_bytes,
338
+ //! d_in, d_out, num_items, min_op, init);
339
+ //!
340
+ //! // d_out <-- [0]
341
+ //!
342
+ //! @endrst
343
+ //!
344
+ //! @tparam InputIteratorT
345
+ //! **[inferred]** Random-access input iterator type for reading input items @iterator
346
+ //!
347
+ //! @tparam OutputIteratorT
348
+ //! **[inferred]** Output iterator type for recording the reduced aggregate @iterator
349
+ //!
350
+ //! @tparam ReductionOpT
351
+ //! **[inferred]** Binary reduction functor type having member `T operator()(const T &a, const T &b)`
352
+ //!
353
+ //! @tparam T
354
+ //! **[inferred]** Data element type that is convertible to the `value` type of `InputIteratorT`
355
+ //!
356
+ //! @tparam NumItemsT
357
+ //! **[inferred]** Type of num_items
358
+ //!
359
+ //! @param[in] d_temp_storage
360
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
361
+ //! required allocation size is written to `temp_storage_bytes` and no work
362
+ //! is done.
363
+ //!
364
+ //! @param[in,out] temp_storage_bytes
365
+ //! Reference to size in bytes of `d_temp_storage` allocation
366
+ //!
367
+ //! @param[in] d_in
368
+ //! Pointer to the input sequence of data items
369
+ //!
370
+ //! @param[out] d_out
371
+ //! Pointer to the output aggregate
372
+ //!
373
+ //! @param[in] num_items
374
+ //! Total number of input items (i.e., length of `d_in`)
375
+ //!
376
+ //! @param[in] reduction_op
377
+ //! Binary reduction functor
378
+ //!
379
+ //! @param[in] init
380
+ //! Initial value of the reduction
381
+ //!
382
+ //! @param[in] stream
383
+ //! @rst
384
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
385
+ //! @endrst
386
+ template <typename InputIteratorT, typename OutputIteratorT, typename ReductionOpT, typename T, typename NumItemsT>
387
+ CUB_RUNTIME_FUNCTION static cudaError_t Reduce(
388
+ void* d_temp_storage,
389
+ size_t& temp_storage_bytes,
390
+ InputIteratorT d_in,
391
+ OutputIteratorT d_out,
392
+ NumItemsT num_items,
393
+ ReductionOpT reduction_op,
394
+ T init,
395
+ cudaStream_t stream = 0)
396
+ {
397
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceReduce::Reduce"); // NOTE(review): range presumably opened only when d_temp_storage is non-null - confirm macro semantics
398
+
399
+ // Signed integer type for global offsets, chosen from NumItemsT
400
+ using OffsetT = detail::choose_offset_t<NumItemsT>;
401
+ // Forward every argument unchanged to DispatchReduce, except num_items, which is cast to OffsetT
402
+ return DispatchReduce<InputIteratorT, OutputIteratorT, OffsetT, ReductionOpT, T>::Dispatch(
403
+ d_temp_storage, temp_storage_bytes, d_in, d_out, static_cast<OffsetT>(num_items), reduction_op, init, stream);
404
+ }
405
+
406
+ //! @rst
407
+ //! Computes a device-wide reduction using the specified binary ``reduction_op`` functor and initial value ``init``.
408
+ //!
409
+ //! - Does not support binary reduction operators that are non-commutative.
410
+ //! - By default, provides "run-to-run" determinism for pseudo-associative reduction
411
+ //! (e.g., addition of floating point types) on the same GPU device.
412
+ //! However, results for pseudo-associative reduction may be inconsistent
413
+ //! from one device to another device of a different compute-capability
414
+ //! because CUB can employ different tile-sizing for different architectures.
415
+ //! To request "gpu-to-gpu" determinism, pass `cuda::execution::require(cuda::execution::determinism::gpu_to_gpu)`
416
+ //! as the `env` parameter.
417
+ //! - The range ``[d_in, d_in + num_items)`` shall not overlap ``d_out``.
418
+ //!
419
+ //! Snippet
420
+ //! +++++++++++++++++++++++++++++++++++++++++++++
421
+ //!
422
+ //! The code snippet below illustrates a user-defined min-reduction of a
423
+ //! device vector of ``int`` data elements.
424
+ //!
425
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_reduce_env_api.cu
426
+ //! :language: c++
427
+ //! :dedent:
428
+ //! :start-after: example-begin reduce-env-determinism
429
+ //! :end-before: example-end reduce-env-determinism
430
+ //!
431
+ //! @endrst
432
+ //!
433
+ //! @tparam InputIteratorT
434
+ //! **[inferred]** Random-access input iterator type for reading input items @iterator
435
+ //!
436
+ //! @tparam OutputIteratorT
437
+ //! **[inferred]** Output iterator type for recording the reduced aggregate @iterator
438
+ //!
439
+ //! @tparam ReductionOpT
440
+ //! **[inferred]** Binary reduction functor type having member `T operator()(const T &a, const T &b)`
441
+ //!
442
+ //! @tparam T
443
+ //! **[inferred]** Data element type that is convertible to the `value` type of `InputIteratorT`
444
+ //!
445
+ //! @tparam NumItemsT
446
+ //! **[inferred]** Type of num_items
447
+ //!
448
+ //! @tparam EnvT
449
+ //! **[inferred]** Execution environment type. Default is `cuda::std::execution::env<>`.
450
+ //!
451
+ //! @param[in] d_in
452
+ //! Pointer to the input sequence of data items
453
+ //!
454
+ //! @param[out] d_out
455
+ //! Pointer to the output aggregate
456
+ //!
457
+ //! @param[in] num_items
458
+ //! Total number of input items (i.e., length of `d_in`)
459
+ //!
460
+ //! @param[in] reduction_op
461
+ //! Binary reduction functor
462
+ //!
463
+ //! @param[in] init
464
+ //! Initial value of the reduction
465
+ //!
466
+ //! @param[in] env
467
+ //! @rst
468
+ //! **[optional]** Execution environment. Default is `cuda::std::execution::env{}`.
469
+ //! @endrst
470
+ template <typename InputIteratorT,
471
+ typename OutputIteratorT,
472
+ typename ReductionOpT,
473
+ typename T,
474
+ typename NumItemsT,
475
+ typename EnvT = ::cuda::std::execution::env<>>
476
+ CUB_RUNTIME_FUNCTION static cudaError_t Reduce(
477
+ InputIteratorT d_in, OutputIteratorT d_out, NumItemsT num_items, ReductionOpT reduction_op, T init, EnvT env = {})
478
+ {
479
+ _CCCL_NVTX_RANGE_SCOPE("cub::DeviceReduce::Reduce");
480
+ // Reject environments that are directly queryable for determinism; it is read from the requirements below
481
+ static_assert(!::cuda::std::execution::__queryable_with<EnvT, ::cuda::execution::determinism::__get_determinism_t>,
482
+ "Determinism should be used inside requires to have an effect.");
483
+ using requirements_t = ::cuda::std::execution::
484
+ __query_result_or_t<EnvT, ::cuda::execution::__get_requirements_t, ::cuda::std::execution::env<>>;
485
+ using default_determinism_t =
486
+ ::cuda::std::execution::__query_result_or_t<requirements_t, //
487
+ ::cuda::execution::determinism::__get_determinism_t,
488
+ ::cuda::execution::determinism::run_to_run_t>;
489
+ // Accumulator type derived from the reduction operator, the input value type and the init type
490
+ using accum_t = ::cuda::std::__accumulator_t<ReductionOpT, detail::it_value_t<InputIteratorT>, T>;
491
+ // True when the determinism taken from the requirements is gpu_to_gpu_t
492
+ constexpr auto gpu_gpu_determinism =
493
+ ::cuda::std::is_same_v<default_determinism_t, ::cuda::execution::determinism::gpu_to_gpu_t>;
494
+
495
+ // integral types are always gpu-to-gpu deterministic if reduction operator is a simple cuda binary
496
+ // operator, so fall back to run-to-run determinism
497
+ constexpr auto integral_fallback =
498
+ gpu_gpu_determinism && ::cuda::std::is_integral_v<accum_t> && (detail::is_cuda_binary_operator<ReductionOpT>);
499
+
500
+ // use gpu-to-gpu determinism only for float and double types with ::cuda::std::plus operator
501
+ constexpr auto float_double_plus =
502
+ gpu_gpu_determinism && detail::is_one_of_v<accum_t, float, double> && detail::is_cuda_std_plus_v<ReductionOpT>;
503
+ // Any request other than gpu_to_gpu counts as supported
504
+ constexpr auto supported = integral_fallback || float_double_plus || !gpu_gpu_determinism;
505
+
506
+ // gpu_to_gpu determinism is only supported for integral types with cuda operators, or
507
+ // float and double types with ::cuda::std::plus operator
508
+ static_assert(supported, "gpu_to_gpu determinism is unsupported");
509
+ // The static_assert above already rejects !supported, so the first branch is not taken in a successful build
510
+ if constexpr (!supported)
511
+ {
512
+ return cudaErrorNotSupported;
513
+ }
514
+ else
515
+ {
516
+ constexpr auto no_determinism = detail::is_non_deterministic_v<default_determinism_t>;
517
+
518
+ // Certain conditions must be met to be able to use the non-deterministic
519
+ // kernel. The output iterator must be a contiguous iterator and the
520
+ // reduction operator must be plus (for now). Additionally, since atomics for types of
521
+ // size < 4B are emulated, they perform poorly, so we fall back to the run-to-run
522
+ // determinism.
523
+ constexpr auto is_contiguous_fallback =
524
+ !no_determinism || THRUST_NS_QUALIFIER::is_contiguous_iterator_v<OutputIteratorT>;
525
+ constexpr auto is_plus_fallback = !no_determinism || detail::is_cuda_std_plus_v<ReductionOpT>;
526
+ constexpr auto is_4b_or_greater = !no_determinism || sizeof(accum_t) >= 4;
527
+
528
+ // If the conditions for gpu-to-gpu determinism or non-deterministic
529
+ // reduction are not met, we fall back to run-to-run determinism.
530
+ using determinism_t = ::cuda::std::conditional_t<
531
+ (gpu_gpu_determinism && integral_fallback)
532
+ || (no_determinism && !(is_contiguous_fallback && is_plus_fallback && is_4b_or_greater)),
533
+ ::cuda::execution::determinism::run_to_run_t,
534
+ default_determinism_t>;
535
+
536
+ // Query relevant properties from the environment
537
+ auto stream = ::cuda::std::execution::__query_or(env, ::cuda::get_stream, ::cuda::stream_ref{cudaStream_t{}});
538
+ auto mr =
539
+ ::cuda::std::execution::__query_or(env, ::cuda::mr::__get_memory_resource, detail::device_memory_resource{});
540
+ // Temporary storage is sized, allocated and released inside this overload
541
+ void* d_temp_storage = nullptr;
542
+ size_t temp_storage_bytes = 0;
543
+
544
+ using tuning_t = ::cuda::std::execution::
545
+ __query_result_or_t<EnvT, ::cuda::execution::__get_tuning_t, ::cuda::std::execution::env<>>;
546
+
547
+ // Query the required temporary storage size
548
+ cudaError_t error = reduce_impl<tuning_t>(
549
+ d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, reduction_op, init, determinism_t{}, stream.get());
550
+ if (error != cudaSuccess)
551
+ {
552
+ return error;
553
+ }
554
+
555
+ // TODO(gevtushenko): use uninitialized buffer when it's available
556
+ error = CubDebug(detail::temporary_storage::allocate(stream, d_temp_storage, temp_storage_bytes, mr));
557
+ if (error != cudaSuccess)
558
+ {
559
+ return error;
560
+ }
561
+
562
+ // Run the algorithm
563
+ error = reduce_impl<tuning_t>(
564
+ d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, reduction_op, init, determinism_t{}, stream.get());
565
+
566
+ // Try to deallocate regardless of the error to avoid memory leaks
567
+ cudaError_t deallocate_error =
568
+ CubDebug(detail::temporary_storage::deallocate(stream, d_temp_storage, temp_storage_bytes, mr));
569
+
570
+ if (error != cudaSuccess)
571
+ {
572
+ // Reduction error takes precedence over deallocation error since it happens first
573
+ return error;
574
+ }
575
+
576
+ return deallocate_error;
577
+ }
578
+ }
579
+
580
+ //! @rst
581
+ //! Computes a device-wide sum using the addition (``+``) operator.
582
+ //!
583
+ //! - Uses ``0`` as the initial value of the reduction.
584
+ //! - Does not support ``+`` operators that are non-commutative.
585
+ //! - Provides "run-to-run" determinism for pseudo-associative reduction
586
+ //! (e.g., addition of floating point types) on the same GPU device.
587
+ //! However, results for pseudo-associative reduction may be inconsistent
588
+ //! from one device to another device of a different compute-capability
589
+ //! because CUB can employ different tile-sizing for different architectures.
590
+ //! To request "gpu-to-gpu" determinism, pass `cuda::execution::require(cuda::execution::determinism::gpu_to_gpu)`
591
+ //! as the `env` parameter.
592
+ //! - The range ``[d_in, d_in + num_items)`` shall not overlap ``d_out``.
593
+ //!
594
+ //! Snippet
595
+ //! +++++++++++++++++++++++++++++++++++++++++++++
596
+ //!
597
+ //! The code snippet below illustrates a sum-reduction of a
598
+ //! device vector of ``int`` data elements.
599
+ //!
600
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_reduce_env_api.cu
601
+ //! :language: c++
602
+ //! :dedent:
603
+ //! :start-after: example-begin sum-env-determinism
604
+ //! :end-before: example-end sum-env-determinism
605
+ //!
606
+ //! @endrst
607
+ //!
608
+ //! @tparam InputIteratorT
609
+ //! **[inferred]** Random-access input iterator type for reading input items @iterator
610
+ //!
611
+ //! @tparam OutputIteratorT
612
+ //! **[inferred]** Output iterator type for recording the reduced aggregate @iterator
613
+ //!
614
+ //! @tparam NumItemsT
615
+ //! **[inferred]** Type of num_items
616
+ //!
617
+ //! @tparam EnvT
618
+ //! **[inferred]** Execution environment type. Default is `cuda::std::execution::env<>`.
619
+ //!
620
+ //! @param[in] d_in
621
+ //! Pointer to the input sequence of data items
622
+ //!
623
+ //! @param[out] d_out
624
+ //! Pointer to the output aggregate
625
+ //!
626
+ //! @param[in] num_items
627
+ //! Total number of input items (i.e., length of `d_in`)
628
+ //!
629
+ //! @param[in] env
630
+ //! @rst
631
+ //! **[optional]** Execution environment. Default is `cuda::std::execution::env{}`.
632
+ //! @endrst
633
+ template <typename InputIteratorT,
634
+ typename OutputIteratorT,
635
+ typename NumItemsT,
636
+ typename EnvT = ::cuda::std::execution::env<>>
637
+ CUB_RUNTIME_FUNCTION static cudaError_t
638
+ Sum(InputIteratorT d_in, OutputIteratorT d_out, NumItemsT num_items, EnvT env = {})
639
+ {
640
+ _CCCL_NVTX_RANGE_SCOPE("cub::DeviceReduce::Sum");
641
+ // Reject environments that are directly queryable for determinism; it is read from the requirements below
642
+ static_assert(!::cuda::std::execution::__queryable_with<EnvT, ::cuda::execution::determinism::__get_determinism_t>,
643
+ "Determinism should be used inside requires to have an effect.");
644
+ using requirements_t = ::cuda::std::execution::
645
+ __query_result_or_t<EnvT, ::cuda::execution::__get_requirements_t, ::cuda::std::execution::env<>>;
646
+ using default_determinism_t =
647
+ ::cuda::std::execution::__query_result_or_t<requirements_t, //
648
+ ::cuda::execution::determinism::__get_determinism_t,
649
+ ::cuda::execution::determinism::run_to_run_t>;
650
+ // True when the determinism taken from the requirements satisfies is_non_deterministic_v
651
+ constexpr auto no_determinism = detail::is_non_deterministic_v<default_determinism_t>;
652
+
653
+ // The output iterator must be a contiguous iterator or we fall back to
654
+ // run-to-run determinism.
655
+ constexpr auto is_contiguous_fallback =
656
+ !no_determinism || THRUST_NS_QUALIFIER::is_contiguous_iterator_v<OutputIteratorT>;
657
+ // The output value type (also used as the init type below)
658
+ using OutputT = cub::detail::non_void_value_t<OutputIteratorT, cub::detail::it_value_t<InputIteratorT>>;
659
+
660
+ // Since atomics for types of size < 4B are emulated, they perform poorly, so we fall back to the run-to-run
661
+ // determinism.
662
+ constexpr auto is_4b_or_greater = !no_determinism || sizeof(OutputT) >= 4;
663
+ // Replace a non-deterministic request with run_to_run_t when either condition above fails
664
+ using determinism_t =
665
+ ::cuda::std::conditional_t<no_determinism && !(is_contiguous_fallback && is_4b_or_greater),
666
+ ::cuda::execution::determinism::run_to_run_t,
667
+ default_determinism_t>;
668
+
669
+ // Query relevant properties from the environment
670
+ auto stream = ::cuda::std::execution::__query_or(env, ::cuda::get_stream, ::cuda::stream_ref{cudaStream_t{}});
671
+ auto mr =
672
+ ::cuda::std::execution::__query_or(env, ::cuda::mr::__get_memory_resource, detail::device_memory_resource{});
673
+ // Temporary storage is sized, allocated and released inside this overload
674
+ void* d_temp_storage = nullptr;
675
+ size_t temp_storage_bytes = 0;
676
+
677
+ using tuning_t =
678
+ ::cuda::std::execution::__query_result_or_t<EnvT, ::cuda::execution::__get_tuning_t, ::cuda::std::execution::env<>>;
679
+
680
+ using InitT = OutputT;
681
+
682
+ // Query the required temporary storage size
683
+ cudaError_t error = reduce_impl<tuning_t>(
684
+ d_temp_storage,
685
+ temp_storage_bytes,
686
+ d_in,
687
+ d_out,
688
+ num_items,
689
+ ::cuda::std::plus<>{},
690
+ InitT{}, // zero-initialize
691
+ determinism_t{},
692
+ stream.get());
693
+ if (error != cudaSuccess)
694
+ {
695
+ return error;
696
+ }
697
+
698
+ // TODO(gevtushenko): use uninitialized buffer when it's available
699
+ error = CubDebug(detail::temporary_storage::allocate(stream, d_temp_storage, temp_storage_bytes, mr));
700
+ if (error != cudaSuccess)
701
+ {
702
+ return error;
703
+ }
704
+
705
+ // Run the algorithm
706
+ error = reduce_impl<tuning_t>(
707
+ d_temp_storage,
708
+ temp_storage_bytes,
709
+ d_in,
710
+ d_out,
711
+ num_items,
712
+ ::cuda::std::plus<>{},
713
+ InitT{}, // zero-initialize
714
+ determinism_t{},
715
+ stream.get());
716
+
717
+ // Try to deallocate regardless of the error to avoid memory leaks
718
+ cudaError_t deallocate_error =
719
+ CubDebug(detail::temporary_storage::deallocate(stream, d_temp_storage, temp_storage_bytes, mr));
720
+
721
+ if (error != cudaSuccess)
722
+ {
723
+ // Reduction error takes precedence over deallocation error since it happens first
724
+ return error;
725
+ }
726
+
727
+ return deallocate_error;
728
+ }
729
+
730
+ //! @rst
731
+ //! Computes a device-wide sum using the addition (``+``) operator.
732
+ //!
733
+ //! - Uses ``0`` as the initial value of the reduction.
734
+ //! - Does not support ``+`` operators that are non-commutative.
735
+ //! - Provides "run-to-run" determinism for pseudo-associative reduction
736
+ //! (e.g., addition of floating point types) on the same GPU device.
737
+ //! However, results for pseudo-associative reduction may be inconsistent
738
+ //! from one device to another device of a different compute-capability
739
+ //! because CUB can employ different tile-sizing for different architectures.
740
+ //! - The range ``[d_in, d_in + num_items)`` shall not overlap ``d_out``.
741
+ //! - @devicestorage
742
+ //!
743
+ //! Snippet
744
+ //! +++++++++++++++++++++++++++++++++++++++++++++
745
+ //!
746
+ //! The code snippet below illustrates the sum-reduction of a device vector
747
+ //! of ``int`` data elements.
748
+ //!
749
+ //! .. code-block:: c++
750
+ //!
751
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_reduce.cuh>
752
+ //!
753
+ //! // Declare, allocate, and initialize device-accessible pointers
754
+ //! // for input and output
755
+ //! int num_items; // e.g., 7
756
+ //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
757
+ //! int *d_out; // e.g., [-]
758
+ //! ...
759
+ //!
760
+ //! // Determine temporary device storage requirements
761
+ //! void *d_temp_storage = nullptr;
762
+ //! size_t temp_storage_bytes = 0;
763
+ //! cub::DeviceReduce::Sum(
764
+ //! d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
765
+ //!
766
+ //! // Allocate temporary storage
767
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
768
+ //!
769
+ //! // Run sum-reduction
770
+ //! cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
771
+ //!
772
+ //! // d_out <-- [38]
773
+ //!
774
+ //! @endrst
775
+ //!
776
+ //! @tparam InputIteratorT
777
+ //! **[inferred]** Random-access input iterator type for reading input items @iterator
778
+ //!
779
+ //! @tparam OutputIteratorT
780
+ //! **[inferred]** Output iterator type for recording the reduced aggregate @iterator
781
+ //!
782
+ //! @tparam NumItemsT
783
+ //! **[inferred]** Type of num_items
784
+ //!
785
+ //! @param[in] d_temp_storage
786
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
787
+ //! required allocation size is written to `temp_storage_bytes` and no work is done.
788
+ //!
789
+ //! @param[in,out] temp_storage_bytes
790
+ //! Reference to size in bytes of `d_temp_storage` allocation
791
+ //!
792
+ //! @param[in] d_in
793
+ //! Pointer to the input sequence of data items
794
+ //!
795
+ //! @param[out] d_out
796
+ //! Pointer to the output aggregate
797
+ //!
798
+ //! @param[in] num_items
799
+ //! Total number of input items (i.e., length of `d_in`)
800
+ //!
801
+ //! @param[in] stream
802
+ //! @rst
803
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
804
+ //! @endrst
805
+ template <typename InputIteratorT, typename OutputIteratorT, typename NumItemsT>
806
+ CUB_RUNTIME_FUNCTION static cudaError_t
807
+ Sum(void* d_temp_storage,
808
+ size_t& temp_storage_bytes,
809
+ InputIteratorT d_in,
810
+ OutputIteratorT d_out,
811
+ NumItemsT num_items,
812
+ cudaStream_t stream = 0)
813
+ {
814
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceReduce::Sum");
815
+
816
+ // Signed integer type for global offsets, chosen from NumItemsT
817
+ using OffsetT = detail::choose_offset_t<NumItemsT>;
818
+
819
+ // The output value type
820
+ using OutputT = cub::detail::non_void_value_t<OutputIteratorT, cub::detail::it_value_t<InputIteratorT>>;
821
+ // The initial value has the output value type
822
+ using InitT = OutputT;
823
+ // Reduce with cuda::std::plus<>, seeded with a value-initialized InitT; num_items is cast to OffsetT
824
+ return DispatchReduce<InputIteratorT, OutputIteratorT, OffsetT, ::cuda::std::plus<>, InitT>::Dispatch(
825
+ d_temp_storage,
826
+ temp_storage_bytes,
827
+ d_in,
828
+ d_out,
829
+ static_cast<OffsetT>(num_items),
830
+ ::cuda::std::plus<>{},
831
+ InitT{}, // zero-initialize
832
+ stream);
833
+ }
834
+
835
+ //! @rst
836
+ //! Computes a device-wide minimum using the less-than (``<``) operator.
837
+ //!
838
+ //! - Uses ``::cuda::std::numeric_limits<T>::max()`` as the initial value of the reduction.
839
+ //! - Does not support ``<`` operators that are non-commutative.
840
+ //! - Provides "run-to-run" determinism for pseudo-associative reduction
841
+ //! (e.g., addition of floating point types) on the same GPU device.
842
+ //! However, results for pseudo-associative reduction may be inconsistent
843
+ //! from one device to another device of a different compute-capability
844
+ //! because CUB can employ different tile-sizing for different architectures.
845
+ //! - The range ``[d_in, d_in + num_items)`` shall not overlap ``d_out``.
846
+ //! - @devicestorage
847
+ //!
848
+ //! Snippet
849
+ //! +++++++++++++++++++++++++++++++++++++++++++++
850
+ //!
851
+ //! The code snippet below illustrates the min-reduction of a device vector of ``int`` data elements.
852
+ //!
853
+ //! .. code-block:: c++
854
+ //!
855
+ //! #include <cub/cub.cuh>
856
+ //! // or equivalently <cub/device/device_reduce.cuh>
857
+ //!
858
+ //! // Declare, allocate, and initialize device-accessible pointers
859
+ //! // for input and output
860
+ //! int num_items; // e.g., 7
861
+ //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
862
+ //! int *d_out; // e.g., [-]
863
+ //! ...
864
+ //!
865
+ //! // Determine temporary device storage requirements
866
+ //! void *d_temp_storage = nullptr;
867
+ //! size_t temp_storage_bytes = 0;
868
+ //! cub::DeviceReduce::Min(
869
+ //! d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
870
+ //!
871
+ //! // Allocate temporary storage
872
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
873
+ //!
874
+ //! // Run min-reduction
875
+ //! cub::DeviceReduce::Min(
876
+ //! d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
877
+ //!
878
+ //! // d_out <-- [0]
879
+ //!
880
+ //! @endrst
881
+ //!
882
+ //! @tparam InputIteratorT
883
+ //! **[inferred]** Random-access input iterator type for reading input items @iterator
884
+ //!
885
+ //! @tparam OutputIteratorT
886
+ //! **[inferred]** Output iterator type for recording the reduced aggregate @iterator
887
+ //!
888
+ //! @tparam NumItemsT
889
+ //! **[inferred]** Type of num_items
890
+ //!
891
+ //! @param[in] d_temp_storage
892
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
893
+ //! required allocation size is written to `temp_storage_bytes` and no work is done.
894
+ //!
895
+ //! @param[in,out] temp_storage_bytes
896
+ //! Reference to size in bytes of `d_temp_storage` allocation
897
+ //!
898
+ //! @param[in] d_in
899
+ //! Pointer to the input sequence of data items
900
+ //!
901
+ //! @param[out] d_out
902
+ //! Pointer to the output aggregate
903
+ //!
904
+ //! @param[in] num_items
905
+ //! Total number of input items (i.e., length of `d_in`)
906
+ //!
907
+ //! @param[in] stream
908
+ //! @rst
909
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
910
+ //! @endrst
911
+ template <typename InputIteratorT, typename OutputIteratorT, typename NumItemsT>
912
+ CUB_RUNTIME_FUNCTION static cudaError_t
913
+ Min(void* d_temp_storage,
914
+ size_t& temp_storage_bytes,
915
+ InputIteratorT d_in,
916
+ OutputIteratorT d_out,
917
+ NumItemsT num_items,
918
+ cudaStream_t stream = 0)
919
+ {
920
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceReduce::Min");
921
+ // The init type is the input iterator's value type; its numeric_limits supply the initial value
922
+ using OffsetT = detail::choose_offset_t<NumItemsT>; // Signed integer type for global offsets
923
+ using InputT = detail::it_value_t<InputIteratorT>;
924
+ using InitT = InputT;
925
+ using limits_t = ::cuda::std::numeric_limits<InitT>;
926
+ #ifndef CCCL_SUPPRESS_NUMERIC_LIMITS_CHECK_IN_CUB_DEVICE_REDUCE_MIN_MAX
927
+ static_assert(limits_t::is_specialized,
928
+ "cub::DeviceReduce::Min uses cuda::std::numeric_limits<InputIteratorT::value_type>::max() as initial "
929
+ "value, but cuda::std::numeric_limits is not specialized for the iterator's value type. This is "
930
+ "probably a bug and you should specialize cuda::std::numeric_limits. Define "
931
+ "CCCL_SUPPRESS_NUMERIC_LIMITS_CHECK_IN_CUB_DEVICE_REDUCE_MIN_MAX to suppress this check.");
932
+ #endif // CCCL_SUPPRESS_NUMERIC_LIMITS_CHECK_IN_CUB_DEVICE_REDUCE_MIN_MAX
933
+ // Reduce with cuda::minimum<>, seeded with limits_t::max(); num_items is cast to OffsetT
934
+ return DispatchReduce<InputIteratorT, OutputIteratorT, OffsetT, ::cuda::minimum<>, InitT>::Dispatch(
935
+ d_temp_storage,
936
+ temp_storage_bytes,
937
+ d_in,
938
+ d_out,
939
+ static_cast<OffsetT>(num_items),
940
+ ::cuda::minimum<>{},
941
+ limits_t::max(),
942
+ stream);
943
+ }
944
+
945
+ //! @rst
946
+ //! Finds the first device-wide minimum using the less-than (``<``) operator and also returns the index of that item.
947
+ //!
948
+ //! - The minimum is written to ``d_min_out``
949
+ //! - The offset of the returned item is written to ``d_index_out``, the offset type being written is of type
950
+ //! ``cuda::std::int64_t``.
951
+ //! - For zero-length inputs, ``cuda::std::numeric_limits<T>::max()`` is written to ``d_min_out`` and the index
952
+ //! ``1`` is written to ``d_index_out``.
953
+ //! - Does not support ``<`` operators that are non-commutative.
954
+ //! - Provides "run-to-run" determinism for pseudo-associative reduction
955
+ //! (e.g., addition of floating point types) on the same GPU device.
956
+ //! However, results for pseudo-associative reduction may be inconsistent
957
+ //! from one device to another device of a different compute-capability
958
+ //! because CUB can employ different tile-sizing for different architectures.
959
+ //! - The range ``[d_in, d_in + num_items)`` shall not overlap ``d_min_out`` nor ``d_index_out``.
960
+ //! - @devicestorage
961
+ //!
962
+ //! Snippet
963
+ //! +++++++++++++++++++++++++++++++++++++++++++++
964
+ //!
965
+ //! The code snippet below illustrates the argmin-reduction of a device vector
966
+ //! of ``int`` data elements.
967
+ //!
968
+ //! .. code-block:: c++
969
+ //!
970
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_reduce.cuh>
971
+ //! #include <cuda/std/cstdint>
972
+ //!
973
+ //! // Declare, allocate, and initialize device-accessible pointers
974
+ //! // for input and output
975
+ //! int num_items; // e.g., 7
976
+ //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
977
+ //! int *d_min_out; // memory for the minimum value
978
+ //! cuda::std::int64_t *d_index_out; // memory for the index of the returned value
979
+ //! ...
980
+ //!
981
+ //! // Determine temporary device storage requirements
982
+ //! void *d_temp_storage = nullptr;
983
+ //! size_t temp_storage_bytes = 0;
984
+ //! cub::DeviceReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_min_out, d_index_out,
985
+ //! num_items);
986
+ //!
987
+ //! // Allocate temporary storage
988
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
989
+ //!
990
+ //! // Run argmin-reduction
991
+ //! cub::DeviceReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_min_out, d_index_out,
992
+ //! num_items);
993
+ //!
994
+ //! // d_min_out <-- 0
995
+ //! // d_index_out <-- 5
996
+ //!
997
+ //! @endrst
998
+ //!
999
+ //! @tparam InputIteratorT
1000
+ //! **[inferred]** Random-access input iterator type for reading input items
1001
+ //! (of some type `T`) @iterator
1002
+ //!
1003
+ //! @tparam ExtremumOutIteratorT
1004
+ //! **[inferred]** Output iterator type for recording minimum value
1005
+ //!
1006
+ //! @tparam IndexOutIteratorT
1007
+ //! **[inferred]** Output iterator type for recording index of the returned value
1008
+ //!
1009
+ //! @param[in] d_temp_storage
1010
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
1011
+ //! required allocation size is written to `temp_storage_bytes` and no work is done.
1012
+ //!
1013
+ //! @param[in,out] temp_storage_bytes
1014
+ //! Reference to size in bytes of `d_temp_storage` allocation
1015
+ //!
1016
+ //! @param[in] d_in
1017
+ //! Iterator to the input sequence of data items
1018
+ //!
1019
+ //! @param[out] d_min_out
1020
+ //! Iterator to which the minimum value is written
1021
+ //!
1022
+ //! @param[out] d_index_out
1023
+ //! Iterator to which the index of the returned value is written
1024
+ //!
1025
+ //! @param[in] num_items
1026
+ //! Total number of input items (i.e., length of `d_in`)
1027
+ //!
1028
+ //! @param[in] stream
1029
+ //! @rst
1030
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
1031
+ //! @endrst
1032
+ template <typename InputIteratorT, typename ExtremumOutIteratorT, typename IndexOutIteratorT>
1033
+ CUB_RUNTIME_FUNCTION static cudaError_t ArgMin(
1034
+ void* d_temp_storage,
1035
+ size_t& temp_storage_bytes,
1036
+ InputIteratorT d_in,
1037
+ ExtremumOutIteratorT d_min_out,
1038
+ IndexOutIteratorT d_index_out,
1039
+ ::cuda::std::int64_t num_items,
1040
+ cudaStream_t stream = 0)
1041
+ {
1042
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceReduce::ArgMin");
1043
+
1044
+ // The value type of the input iterator
1045
+ using InputValueT = cub::detail::it_value_t<InputIteratorT>;
1046
+
1047
+ // Offset type used within the kernel and to index within one partition
1048
+ using PerPartitionOffsetT = int;
1049
+
1050
+ // Offset type used to index within the total input in the range [d_in, d_in + num_items)
1051
+ using GlobalOffsetT = ::cuda::std::int64_t;
1052
+
1053
+ // The value type used for the extremum; it is also the type of the initial value
1054
+ using OutputExtremumT = detail::non_void_value_t<ExtremumOutIteratorT, InputValueT>;
1055
+ using InitT = OutputExtremumT;
1056
+
1057
+ // Reduction operation
1058
+ using ReduceOpT = cub::ArgMin;
1059
+
1060
+ // Initial value: numeric_limits<InputValueT>::max(), brace-initialized into the extremum output type
1061
+ OutputExtremumT initial_value{::cuda::std::numeric_limits<InputValueT>::max()};
1062
+
1063
+ // Tabulate output iterator that unzips the result and writes it to the user-provided output iterators
1064
+ auto out_it = THRUST_NS_QUALIFIER::make_tabulate_output_iterator(
1065
+ detail::reduce::unzip_and_write_arg_extremum_op<ExtremumOutIteratorT, IndexOutIteratorT>{d_min_out, d_index_out});
1066
+ // Dispatch the streaming arg-reduction, writing through out_it rather than the user iterators directly
1067
+ return detail::reduce::dispatch_streaming_arg_reduce_t<
1068
+ InputIteratorT,
1069
+ decltype(out_it),
1070
+ PerPartitionOffsetT,
1071
+ GlobalOffsetT,
1072
+ ReduceOpT,
1073
+ InitT>::Dispatch(d_temp_storage,
1074
+ temp_storage_bytes,
1075
+ d_in,
1076
+ out_it,
1077
+ static_cast<GlobalOffsetT>(num_items),
1078
+ ReduceOpT{},
1079
+ initial_value,
1080
+ stream);
1081
+ }
1082
+
1083
+ //! @rst
1084
+ //! Finds the first device-wide minimum using the less-than (``<``) operator, also returning the index of that item.
1085
+ //!
1086
+ //! - The output value type of ``d_out`` is ``cub::KeyValuePair<int, T>``
1087
+ //! (assuming the value type of ``d_in`` is ``T``)
1088
+ //!
1089
+ //! - The minimum is written to ``d_out.value`` and its offset in the input array is written to ``d_out.key``.
1090
+ //! - The ``{1, ::cuda::std::numeric_limits<T>::max()}`` tuple is produced for zero-length inputs
1091
+ //!
1092
+ //! - Does not support ``<`` operators that are non-commutative.
1093
+ //! - Provides "run-to-run" determinism for pseudo-associative reduction
1094
+ //! (e.g., addition of floating point types) on the same GPU device.
1095
+ //! However, results for pseudo-associative reduction may be inconsistent
1096
+ //! from one device to another device of a different compute-capability
1097
+ //! because CUB can employ different tile-sizing for different architectures.
1098
+ //! - The range ``[d_in, d_in + num_items)`` shall not overlap ``d_out``.
1099
+ //! - @devicestorage
1100
+ //!
1101
+ //! Snippet
1102
+ //! +++++++++++++++++++++++++++++++++++++++++++++
1103
+ //!
1104
+ //! The code snippet below illustrates the argmin-reduction of a device vector
1105
+ //! of ``int`` data elements.
1106
+ //!
1107
+ //! .. code-block:: c++
1108
+ //!
1109
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_reduce.cuh>
1110
+ //!
1111
+ //! // Declare, allocate, and initialize device-accessible pointers
1112
+ //! // for input and output
1113
+ //! int num_items; // e.g., 7
1114
+ //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
1115
+ //! KeyValuePair<int, int> *d_argmin; // e.g., [{-,-}]
1116
+ //! ...
1117
+ //!
1118
+ //! // Determine temporary device storage requirements
1119
+ //! void *d_temp_storage = nullptr;
1120
+ //! size_t temp_storage_bytes = 0;
1121
+ //! cub::DeviceReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_argmin, num_items);
1122
+ //!
1123
+ //! // Allocate temporary storage
1124
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
1125
+ //!
1126
+ //! // Run argmin-reduction
1127
+ //! cub::DeviceReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_argmin, num_items);
1128
+ //!
1129
+ //! // d_argmin <-- [{5, 0}]
1130
+ //!
1131
+ //! @endrst
1132
+ //!
1133
+ //! @tparam InputIteratorT
1134
+ //! **[inferred]** Random-access input iterator type for reading input items
1135
+ //! (of some type `T`) @iterator
1136
+ //!
1137
+ //! @tparam OutputIteratorT
1138
+ //! **[inferred]** Output iterator type for recording the reduced aggregate
1139
+ //! (having value type `cub::KeyValuePair<int, T>`) @iterator
1140
+ //!
1141
+ //! @param[in] d_temp_storage
1142
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
1143
+ //! required allocation size is written to `temp_storage_bytes` and no work is done.
1144
+ //!
1145
+ //! @param[in,out] temp_storage_bytes
1146
+ //! Reference to size in bytes of `d_temp_storage` allocation
1147
+ //!
1148
+ //! @param[in] d_in
1149
+ //! Pointer to the input sequence of data items
1150
+ //!
1151
+ //! @param[out] d_out
1152
+ //! Pointer to the output aggregate
1153
+ //!
1154
+ //! @param[in] num_items
1155
+ //! Total number of input items (i.e., length of `d_in`)
1156
+ //!
1157
+ //! @param[in] stream
1158
+ //! @rst
1159
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
1160
+ //! @endrst
1161
+ template <typename InputIteratorT, typename OutputIteratorT>
1162
+ CCCL_DEPRECATED_BECAUSE("CUB has superseded this interface in favor of the ArgMin interface that takes two separate "
1163
+ "iterators: one iterator to which the extremum is written and another iterator to which the "
1164
+ "index of the found extremum is written. ")
1165
+ CUB_RUNTIME_FUNCTION static cudaError_t
1166
+ ArgMin(void* d_temp_storage,
1167
+ size_t& temp_storage_bytes,
1168
+ InputIteratorT d_in,
1169
+ OutputIteratorT d_out,
1170
+ int num_items,
1171
+ cudaStream_t stream = 0)
1172
+ { // Deprecated overload: wraps d_in in an index-attaching iterator and reduces key/value tuples with cub::ArgMin
1173
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceReduce::ArgMin"); // NOTE(review): presumably the range is only opened when d_temp_storage is non-null - confirm macro semantics
1174
+
1175
+ // Signed integer type for global offsets (int, matching the int num_items parameter)
1176
+ using OffsetT = int;
1177
+
1178
+ // The input type: value type of InputIteratorT as reported by cub::detail::it_value_t
1179
+ using InputValueT = cub::detail::it_value_t<InputIteratorT>;
1180
+
1181
+ // The output tuple type; KeyValuePair<OffsetT, InputValueT> is passed as the fallback (presumably used when the output iterator's value type is void - confirm)
1182
+ using OutputTupleT = cub::detail::non_void_value_t<OutputIteratorT, KeyValuePair<OffsetT, InputValueT>>;
1183
+
1184
+ using AccumT = OutputTupleT; // accumulate directly in the output tuple type
1185
+
1186
+ using InitT = detail::reduce::empty_problem_init_t<AccumT>; // wrapper type for the initial value (see initial_value below)
1187
+
1188
+ // The output value type: the Value member type of the output tuple
1189
+ using OutputValueT = typename OutputTupleT::Value;
1190
+
1191
+ // Wrapped input iterator to produce index-value <OffsetT, InputT> tuples
1192
+ using ArgIndexInputIteratorT = ArgIndexInputIterator<InputIteratorT, OffsetT, OutputValueT>;
1193
+
1194
+ ArgIndexInputIteratorT d_indexed_in(d_in);
1195
+
1196
+ // Initial value: the tuple {1, numeric_limits<InputValueT>::max()}, which the function documentation states is produced for zero-length inputs
1197
+ InitT initial_value{AccumT(1, ::cuda::std::numeric_limits<InputValueT>::max())};
1198
+
1199
+ return DispatchReduce<ArgIndexInputIteratorT, OutputIteratorT, OffsetT, cub::ArgMin, InitT, AccumT>::Dispatch(
1200
+ d_temp_storage, temp_storage_bytes, d_indexed_in, d_out, num_items, cub::ArgMin(), initial_value, stream);
1201
+ }
1202
+
1203
+ //! @rst
1204
+ //! Computes a device-wide maximum using the greater-than (``>``) operator.
1205
+ //!
1206
+ //! - Uses ``::cuda::std::numeric_limits<T>::lowest()`` as the initial value of the reduction.
1207
+ //! - Does not support ``>`` operators that are non-commutative.
1208
+ //! - Provides "run-to-run" determinism for pseudo-associative reduction
1209
+ //! (e.g., addition of floating point types) on the same GPU device.
1210
+ //! However, results for pseudo-associative reduction may be inconsistent
1211
+ //! from one device to another device of a different compute-capability
1212
+ //! because CUB can employ different tile-sizing for different architectures.
1213
+ //! - The range ``[d_in, d_in + num_items)`` shall not overlap ``d_out``.
1214
+ //! - @devicestorage
1215
+ //!
1216
+ //! Snippet
1217
+ //! +++++++++++++++++++++++++++++++++++++++++++++
1218
+ //!
1219
+ //! The code snippet below illustrates the max-reduction of a device vector of ``int`` data elements.
1220
+ //!
1221
+ //! .. code-block:: c++
1222
+ //!
1223
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_reduce.cuh>
1224
+ //!
1225
+ //! // Declare, allocate, and initialize device-accessible pointers
1226
+ //! // for input and output
1227
+ //! int num_items; // e.g., 7
1228
+ //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
1229
+ //! int *d_max; // e.g., [-]
1230
+ //! ...
1231
+ //!
1232
+ //! // Determine temporary device storage requirements
1233
+ //! void *d_temp_storage = nullptr;
1234
+ //! size_t temp_storage_bytes = 0;
1235
+ //! cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_max, num_items);
1236
+ //!
1237
+ //! // Allocate temporary storage
1238
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
1239
+ //!
1240
+ //! // Run max-reduction
1241
+ //! cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_max, num_items);
1242
+ //!
1243
+ //! // d_max <-- [9]
1244
+ //!
1245
+ //! @endrst
1246
+ //!
1247
+ //! @tparam InputIteratorT
1248
+ //! **[inferred]** Random-access input iterator type for reading input items @iterator
1249
+ //!
1250
+ //! @tparam OutputIteratorT
1251
+ //! **[inferred]** Output iterator type for recording the reduced aggregate @iterator
1252
+ //!
1253
+ //! @tparam NumItemsT
1254
+ //! **[inferred]** Type of num_items
1255
+ //!
1256
+ //! @param[in] d_temp_storage
1257
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
1258
+ //! required allocation size is written to `temp_storage_bytes` and no work is done.
1259
+ //!
1260
+ //! @param[in,out] temp_storage_bytes
1261
+ //! Reference to size in bytes of `d_temp_storage` allocation
1262
+ //!
1263
+ //! @param[in] d_in
1264
+ //! Pointer to the input sequence of data items
1265
+ //!
1266
+ //! @param[out] d_out
1267
+ //! Pointer to the output aggregate
1268
+ //!
1269
+ //! @param[in] num_items
1270
+ //! Total number of input items (i.e., length of `d_in`)
1271
+ //!
1272
+ //! @param[in] stream
1273
+ //! @rst
1274
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
1275
+ //! @endrst
1276
+ template <typename InputIteratorT, typename OutputIteratorT, typename NumItemsT>
1277
+ CUB_RUNTIME_FUNCTION static cudaError_t
1278
+ Max(void* d_temp_storage,
1279
+ size_t& temp_storage_bytes,
1280
+ InputIteratorT d_in,
1281
+ OutputIteratorT d_out,
1282
+ NumItemsT num_items,
1283
+ cudaStream_t stream = 0)
1284
+ { // Forwards to DispatchReduce with ::cuda::maximum<> and numeric_limits<InputT>::lowest() as the initial value
1285
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceReduce::Max"); // NOTE(review): presumably the range is only opened when d_temp_storage is non-null - confirm macro semantics
1286
+
1287
+ // Offset type for global indexing, derived from NumItemsT via detail::choose_offset_t
1288
+ using OffsetT = detail::choose_offset_t<NumItemsT>;
1289
+ using InputT = detail::it_value_t<InputIteratorT>; // value type of the input iterator
1290
+ using InitT = InputT; // the initial value has the input value type
1291
+ using limits_t = ::cuda::std::numeric_limits<InitT>; // source of the initial value (lowest())
1292
+ #ifndef CCCL_SUPPRESS_NUMERIC_LIMITS_CHECK_IN_CUB_DEVICE_REDUCE_MIN_MAX
1293
+ static_assert(limits_t::is_specialized,
1294
+ "cub::DeviceReduce::Max uses cuda::std::numeric_limits<InputIteratorT::value_type>::lowest() as "
1295
+ "initial value, but cuda::std::numeric_limits is not specialized for the iterator's value type. This "
1296
+ "is probably a bug and you should specialize cuda::std::numeric_limits. Define "
1297
+ "CCCL_SUPPRESS_NUMERIC_LIMITS_CHECK_IN_CUB_DEVICE_REDUCE_MIN_MAX to suppress this check.");
1298
+ #endif // CCCL_SUPPRESS_NUMERIC_LIMITS_CHECK_IN_CUB_DEVICE_REDUCE_MIN_MAX
1299
+
1300
+ return DispatchReduce<InputIteratorT, OutputIteratorT, OffsetT, ::cuda::maximum<>, InitT>::Dispatch(
1301
+ d_temp_storage,
1302
+ temp_storage_bytes,
1303
+ d_in,
1304
+ d_out,
1305
+ static_cast<OffsetT>(num_items), // convert the caller's NumItemsT count to the chosen offset type
1306
+ ::cuda::maximum<>{},
1307
+ limits_t::lowest(), // initial value of the reduction
1308
+ stream);
1309
+ }
1310
+
1311
+ //! @rst
1312
+ //! Finds the first device-wide maximum using the greater-than (``>``) operator and also returns the index of that
1313
+ //! item.
1314
+ //!
1315
+ //! - The maximum is written to ``d_max_out``
1316
+ //! - The offset of the returned item is written to ``d_index_out``, the offset type being written is of type
1317
+ //! ``cuda::std::int64_t``.
1318
+ //! - For zero-length inputs, ``cuda::std::numeric_limits<T>::lowest()`` is written to ``d_max_out`` and the index
1319
+ //! ``1`` is written to ``d_index_out``.
1320
+ //! - Does not support ``>`` operators that are non-commutative.
1321
+ //! - Provides "run-to-run" determinism for pseudo-associative reduction
1322
+ //! (e.g., addition of floating point types) on the same GPU device.
1323
+ //! However, results for pseudo-associative reduction may be inconsistent
1324
+ //! from one device to another device of a different compute-capability
1325
+ //! because CUB can employ different tile-sizing for different architectures.
1326
+ //! - The range ``[d_in, d_in + num_items)`` shall not overlap ``d_max_out`` nor ``d_index_out``.
1327
+ //! - @devicestorage
1328
+ //!
1329
+ //! Snippet
1330
+ //! +++++++++++++++++++++++++++++++++++++++++++++
1331
+ //!
1332
+ //! The code snippet below illustrates the argmax-reduction of a device vector
1333
+ //! of `int` data elements.
1334
+ //!
1335
+ //! .. code-block:: c++
1336
+ //!
1337
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_reduce.cuh>
1338
+ //! #include <cuda/std/cstdint>
1339
+ //!
1340
+ //! // Declare, allocate, and initialize device-accessible pointers
1341
+ //! // for input and output
1342
+ //! int num_items; // e.g., 7
1343
+ //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
1344
+ //! int *d_max_out; // memory for the maximum value
1345
+ //! cuda::std::int64_t *d_index_out; // memory for the index of the returned value
1346
+ //! ...
1347
+ //!
1348
+ //! // Determine temporary device storage requirements
1349
+ //! void *d_temp_storage = nullptr;
1350
+ //! size_t temp_storage_bytes = 0;
1351
+ //! cub::DeviceReduce::ArgMax(
1352
+ //! d_temp_storage, temp_storage_bytes, d_in, d_max_out, d_index_out, num_items);
1353
+ //!
1354
+ //! // Allocate temporary storage
1355
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
1356
+ //!
1357
+ //! // Run argmax-reduction
1358
+ //! cub::DeviceReduce::ArgMax(
1359
+ //! d_temp_storage, temp_storage_bytes, d_in, d_max_out, d_index_out, num_items);
1360
+ //!
1361
+ //! // d_max_out <-- 9
1362
+ //! // d_index_out <-- 6
1363
+ //!
1364
+ //! @endrst
1365
+ //!
1366
+ //! @tparam InputIteratorT
1367
+ //! **[inferred]** Random-access input iterator type for reading input items (of some type `T`) @iterator
1368
+ //!
1369
+ //! @tparam ExtremumOutIteratorT
1370
+ //! **[inferred]** Output iterator type for recording maximum value
1371
+ //!
1372
+ //! @tparam IndexOutIteratorT
1373
+ //! **[inferred]** Output iterator type for recording index of the returned value
1374
+ //!
1375
+ //! @param[in] d_temp_storage
1376
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
1377
+ //! required allocation size is written to `temp_storage_bytes` and no work is done.
1378
+ //!
1379
+ //! @param[in,out] temp_storage_bytes
1380
+ //! Reference to size in bytes of `d_temp_storage` allocation
1381
+ //!
1382
+ //! @param[in] d_in
1383
+ //! Pointer to the input sequence of data items
1384
+ //!
1385
+ //! @param[out] d_max_out
1386
+ //! Iterator to which the maximum value is written
1387
+ //!
1388
+ //! @param[out] d_index_out
1389
+ //! Iterator to which the index of the returned value is written
1390
+ //!
1391
+ //! @param[in] num_items
1392
+ //! Total number of input items (i.e., length of `d_in`)
1393
+ //!
1394
+ //! @param[in] stream
1395
+ //! @rst
1396
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
1397
+ //! @endrst
1398
+ template <typename InputIteratorT, typename ExtremumOutIteratorT, typename IndexOutIteratorT>
1399
+ CUB_RUNTIME_FUNCTION static cudaError_t ArgMax(
1400
+ void* d_temp_storage,
1401
+ size_t& temp_storage_bytes,
1402
+ InputIteratorT d_in,
1403
+ ExtremumOutIteratorT d_max_out,
1404
+ IndexOutIteratorT d_index_out,
1405
+ ::cuda::std::int64_t num_items,
1406
+ cudaStream_t stream = 0)
1407
+ { // Sets up the type aliases, initial value and output adapter, then forwards to the streaming arg-reduce dispatcher
1408
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceReduce::ArgMax"); // NOTE(review): presumably the range is only opened when d_temp_storage is non-null - confirm macro semantics
1409
+
1410
+ // The input type: value type of InputIteratorT as reported by cub::detail::it_value_t
1411
+ using InputValueT = cub::detail::it_value_t<InputIteratorT>;
1412
+
1413
+ // Offset type used within the kernel and to index within one partition (plain int)
1414
+ using PerPartitionOffsetT = int;
1415
+
1416
+ // Offset type used to index within the total input in the range [d_in, d_in + num_items) (64-bit)
1417
+ using GlobalOffsetT = ::cuda::std::int64_t;
1418
+
1419
+ // The value type used for the extremum; InputValueT is passed as the fallback (presumably used when the output iterator's value type is void - confirm)
1420
+ using OutputExtremumT = detail::non_void_value_t<ExtremumOutIteratorT, InputValueT>;
1421
+ using InitT = OutputExtremumT; // the initial value has the same type as the extremum
1422
+
1423
+ // Reduction operation: cub::ArgMax
1424
+ using ReduceOpT = cub::ArgMax;
1425
+
1426
+ // Initial value: numeric_limits<InputValueT>::lowest(), brace-initialized into the extremum type
1427
+ OutputExtremumT initial_value{::cuda::std::numeric_limits<InputValueT>::lowest()};
1428
+
1429
+ // Tabulate output iterator that unzips the result and writes it to the user-provided output iterators (d_max_out and d_index_out)
1430
+ auto out_it = THRUST_NS_QUALIFIER::make_tabulate_output_iterator(
1431
+ detail::reduce::unzip_and_write_arg_extremum_op<ExtremumOutIteratorT, IndexOutIteratorT>{d_max_out, d_index_out});
1432
+
1433
+ return detail::reduce::dispatch_streaming_arg_reduce_t<
1434
+ InputIteratorT,
1435
+ decltype(out_it),
1436
+ PerPartitionOffsetT,
1437
+ GlobalOffsetT,
1438
+ ReduceOpT,
1439
+ InitT>::Dispatch(d_temp_storage,
1440
+ temp_storage_bytes,
1441
+ d_in,
1442
+ out_it,
1443
+ static_cast<GlobalOffsetT>(num_items), // num_items is already int64_t, so this cast does not change the type
1444
+ ReduceOpT{},
1445
+ initial_value,
1446
+ stream);
1447
+ }
1448
+
1449
+ //! @rst
1450
+ //! Finds the first device-wide maximum using the greater-than (``>``)
1451
+ //! operator, also returning the index of that item
1452
+ //!
1453
+ //! - The output value type of ``d_out`` is ``cub::KeyValuePair<int, T>``
1454
+ //! (assuming the value type of ``d_in`` is ``T``)
1455
+ //!
1456
+ //! - The maximum is written to ``d_out.value`` and its offset in the input
1457
+ //! array is written to ``d_out.key``.
1458
+ //! - The ``{1, ::cuda::std::numeric_limits<T>::lowest()}`` tuple is produced for zero-length inputs
1459
+ //!
1460
+ //! - Does not support ``>`` operators that are non-commutative.
1461
+ //! - Provides "run-to-run" determinism for pseudo-associative reduction
1462
+ //! (e.g., addition of floating point types) on the same GPU device.
1463
+ //! However, results for pseudo-associative reduction may be inconsistent
1464
+ //! from one device to another device of a different compute-capability
1465
+ //! because CUB can employ different tile-sizing for different architectures.
1466
+ //! - The range ``[d_in, d_in + num_items)`` shall not overlap ``d_out``.
1467
+ //! - @devicestorage
1468
+ //!
1469
+ //! Snippet
1470
+ //! +++++++++++++++++++++++++++++++++++++++++++++
1471
+ //!
1472
+ //! The code snippet below illustrates the argmax-reduction of a device vector
1473
+ //! of `int` data elements.
1474
+ //!
1475
+ //! .. code-block:: c++
1476
+ //!
1477
+ //! #include <cub/cub.cuh>
1478
+ //! // or equivalently <cub/device/device_reduce.cuh>
1479
+ //!
1480
+ //! // Declare, allocate, and initialize device-accessible pointers
1481
+ //! // for input and output
1482
+ //! int num_items; // e.g., 7
1483
+ //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
1484
+ //! KeyValuePair<int, int> *d_argmax; // e.g., [{-,-}]
1485
+ //! ...
1486
+ //!
1487
+ //! // Determine temporary device storage requirements
1488
+ //! void *d_temp_storage = nullptr;
1489
+ //! size_t temp_storage_bytes = 0;
1490
+ //! cub::DeviceReduce::ArgMax(
1491
+ //! d_temp_storage, temp_storage_bytes, d_in, d_argmax, num_items);
1492
+ //!
1493
+ //! // Allocate temporary storage
1494
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
1495
+ //!
1496
+ //! // Run argmax-reduction
1497
+ //! cub::DeviceReduce::ArgMax(
1498
+ //! d_temp_storage, temp_storage_bytes, d_in, d_argmax, num_items);
1499
+ //!
1500
+ //! // d_argmax <-- [{6, 9}]
1501
+ //!
1502
+ //! @endrst
1503
+ //!
1504
+ //! @tparam InputIteratorT
1505
+ //! **[inferred]** Random-access input iterator type for reading input items (of some type `T`) @iterator
1506
+ //!
1507
+ //! @tparam OutputIteratorT
1508
+ //! **[inferred]** Output iterator type for recording the reduced aggregate
1509
+ //! (having value type `cub::KeyValuePair<int, T>`) @iterator
1510
+ //!
1511
+ //! @param[in] d_temp_storage
1512
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
1513
+ //! required allocation size is written to `temp_storage_bytes` and no work is done.
1514
+ //!
1515
+ //! @param[in,out] temp_storage_bytes
1516
+ //! Reference to size in bytes of `d_temp_storage` allocation
1517
+ //!
1518
+ //! @param[in] d_in
1519
+ //! Pointer to the input sequence of data items
1520
+ //!
1521
+ //! @param[out] d_out
1522
+ //! Pointer to the output aggregate
1523
+ //!
1524
+ //! @param[in] num_items
1525
+ //! Total number of input items (i.e., length of `d_in`)
1526
+ //!
1527
+ //! @param[in] stream
1528
+ //! @rst
1529
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
1530
+ //! @endrst
1531
+ template <typename InputIteratorT, typename OutputIteratorT>
1532
+ CCCL_DEPRECATED_BECAUSE("CUB has superseded this interface in favor of the ArgMax interface that takes two separate "
1533
+ "iterators: one iterator to which the extremum is written and another iterator to which the "
1534
+ "index of the found extremum is written. ")
1535
+ CUB_RUNTIME_FUNCTION static cudaError_t
1536
+ ArgMax(void* d_temp_storage,
1537
+ size_t& temp_storage_bytes,
1538
+ InputIteratorT d_in,
1539
+ OutputIteratorT d_out,
1540
+ int num_items,
1541
+ cudaStream_t stream = 0)
1542
+ { // Deprecated overload: wraps d_in in an index-attaching iterator and reduces key/value tuples with cub::ArgMax
1543
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceReduce::ArgMax"); // NOTE(review): presumably the range is only opened when d_temp_storage is non-null - confirm macro semantics
1544
+
1545
+ // Signed integer type for global offsets (int, matching the int num_items parameter)
1546
+ using OffsetT = int;
1547
+
1548
+ // The input type: value type of InputIteratorT as reported by cub::detail::it_value_t
1549
+ using InputValueT = cub::detail::it_value_t<InputIteratorT>;
1550
+
1551
+ // The output tuple type; KeyValuePair<OffsetT, InputValueT> is passed as the fallback (presumably used when the output iterator's value type is void - confirm)
1552
+ using OutputTupleT = cub::detail::non_void_value_t<OutputIteratorT, KeyValuePair<OffsetT, InputValueT>>;
1553
+
1554
+ using AccumT = OutputTupleT; // accumulate directly in the output tuple type
1555
+
1556
+ // The output value type: the Value member type of the output tuple
1557
+ using OutputValueT = typename OutputTupleT::Value;
1558
+
1559
+ using InitT = detail::reduce::empty_problem_init_t<AccumT>; // wrapper type for the initial value (see initial_value below)
1560
+
1561
+ // Wrapped input iterator to produce index-value <OffsetT, InputT> tuples
1562
+ using ArgIndexInputIteratorT = ArgIndexInputIterator<InputIteratorT, OffsetT, OutputValueT>;
1563
+
1564
+ ArgIndexInputIteratorT d_indexed_in(d_in);
1565
+
1566
+ // Initial value: the tuple {1, numeric_limits<InputValueT>::lowest()}, which the function documentation states is produced for zero-length inputs
1567
+ InitT initial_value{AccumT(1, ::cuda::std::numeric_limits<InputValueT>::lowest())};
1568
+
1569
+ return DispatchReduce<ArgIndexInputIteratorT, OutputIteratorT, OffsetT, cub::ArgMax, InitT, AccumT>::Dispatch(
1570
+ d_temp_storage, temp_storage_bytes, d_indexed_in, d_out, num_items, cub::ArgMax(), initial_value, stream);
1571
+ }
1572
+
1573
+ //! @rst
1574
+ //! Fuses transform and reduce operations
1575
+ //!
1576
+ //! - Does not support binary reduction operators that are non-commutative.
1577
+ //! - Provides "run-to-run" determinism for pseudo-associative reduction
1578
+ //! (e.g., addition of floating point types) on the same GPU device.
1579
+ //! However, results for pseudo-associative reduction may be inconsistent
1580
+ //! from one device to another device of a different compute-capability
1581
+ //! because CUB can employ different tile-sizing for different architectures.
1582
+ //! - The range ``[d_in, d_in + num_items)`` shall not overlap ``d_out``.
1583
+ //! - @devicestorage
1584
+ //!
1585
+ //! Snippet
1586
+ //! +++++++++++++++++++++++++++++++++++++++++++++
1587
+ //!
1588
+ //! The code snippet below illustrates a transform-reduction (sum of squares plus an initial value) of a
1589
+ //! device vector of `int` data elements.
1590
+ //!
1591
+ //! .. code-block:: c++
1592
+ //!
1593
+ //! #include <cub/cub.cuh>
1594
+ //! // or equivalently <cub/device/device_reduce.cuh>
1595
+ //!
1596
+ //! thrust::device_vector<int> in = { 1, 2, 3, 4 };
1597
+ //! thrust::device_vector<int> out(1);
1598
+ //!
1599
+ //! size_t temp_storage_bytes = 0;
1600
+ //! uint8_t *d_temp_storage = nullptr;
1601
+ //!
1602
+ //! const int init = 42;
1603
+ //!
1604
+ //! cub::DeviceReduce::TransformReduce(
1605
+ //! d_temp_storage,
1606
+ //! temp_storage_bytes,
1607
+ //! in.begin(),
1608
+ //! out.begin(),
1609
+ //! in.size(),
1610
+ //! cuda::std::plus<>{},
1611
+ //! square_t{},
1612
+ //! init);
1613
+ //!
1614
+ //! thrust::device_vector<uint8_t> temp_storage(temp_storage_bytes);
1615
+ //! d_temp_storage = temp_storage.data().get();
1616
+ //!
1617
+ //! cub::DeviceReduce::TransformReduce(
1618
+ //! d_temp_storage,
1619
+ //! temp_storage_bytes,
1620
+ //! in.begin(),
1621
+ //! out.begin(),
1622
+ //! in.size(),
1623
+ //! cuda::std::plus<>{},
1624
+ //! square_t{},
1625
+ //! init);
1626
+ //!
1627
+ //! // out[0] <-- 72
1628
+ //!
1629
+ //! @endrst
1630
+ //!
1631
+ //! @tparam InputIteratorT
1632
+ //! **[inferred]** Random-access input iterator type for reading input items @iterator
1633
+ //!
1634
+ //! @tparam OutputIteratorT
1635
+ //! **[inferred]** Output iterator type for recording the reduced aggregate @iterator
1636
+ //!
1637
+ //! @tparam ReductionOpT
1638
+ //! **[inferred]** Binary reduction functor type having member `T operator()(const T &a, const T &b)`
1639
+ //!
1640
+ //! @tparam TransformOpT
1641
+ //! **[inferred]** Unary transform functor type having member `auto operator()(const T &a)`
1642
+ //!
1643
+ //! @tparam T
1644
+ //! **[inferred]** Data element type that is convertible to the `value` type of `InputIteratorT`
1645
+ //!
1646
+ //! @tparam NumItemsT
1647
+ //! **[inferred]** Type of num_items
1648
+ //!
1649
+ //! @param[in] d_temp_storage
1650
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
1651
+ //! required allocation size is written to `temp_storage_bytes` and no work is done.
1652
+ //!
1653
+ //! @param[in,out] temp_storage_bytes
1654
+ //! Reference to size in bytes of `d_temp_storage` allocation
1655
+ //!
1656
+ //! @param[in] d_in
1657
+ //! Pointer to the input sequence of data items
1658
+ //!
1659
+ //! @param[out] d_out
1660
+ //! Pointer to the output aggregate
1661
+ //!
1662
+ //! @param[in] num_items
1663
+ //! Total number of input items (i.e., length of `d_in`)
1664
+ //!
1665
+ //! @param[in] reduction_op
1666
+ //! Binary reduction functor
1667
+ //!
1668
+ //! @param[in] transform_op
1669
+ //! Unary transform functor
1670
+ //!
1671
+ //! @param[in] init
1672
+ //! Initial value of the reduction
1673
+ //!
1674
+ //! @param[in] stream
1675
+ //! @rst
1676
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
1677
+ //! @endrst
1678
+ template <typename InputIteratorT,
1679
+ typename OutputIteratorT,
1680
+ typename ReductionOpT,
1681
+ typename TransformOpT,
1682
+ typename T,
1683
+ typename NumItemsT>
1684
+ CUB_RUNTIME_FUNCTION static cudaError_t TransformReduce(
1685
+ void* d_temp_storage,
1686
+ size_t& temp_storage_bytes,
1687
+ InputIteratorT d_in,
1688
+ OutputIteratorT d_out,
1689
+ NumItemsT num_items,
1690
+ ReductionOpT reduction_op,
1691
+ TransformOpT transform_op,
1692
+ T init,
1693
+ cudaStream_t stream = 0)
1694
+ { // Picks an offset type for num_items and forwards every argument to DispatchTransformReduce
1695
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceReduce::TransformReduce"); // NOTE(review): presumably the range is only opened when d_temp_storage is non-null - confirm macro semantics
1696
+
1697
+ using OffsetT = detail::choose_offset_t<NumItemsT>; // offset type derived from NumItemsT
1698
+
1699
+ return DispatchTransformReduce<InputIteratorT, OutputIteratorT, OffsetT, ReductionOpT, TransformOpT, T>::Dispatch(
1700
+ d_temp_storage,
1701
+ temp_storage_bytes,
1702
+ d_in,
1703
+ d_out,
1704
+ static_cast<OffsetT>(num_items), // convert the caller's NumItemsT count to the chosen offset type
1705
+ reduction_op,
1706
+ init,
1707
+ stream, // note: Dispatch is called with stream before transform_op, unlike this function's own parameter order
1708
+ transform_op);
1709
+ }
1710
+
1711
+ //! @rst
1712
+ //! Reduces segments of values, where segments are demarcated by corresponding runs of identical keys.
1713
+ //!
1714
+ //! This operation computes segmented reductions within ``d_values_in`` using the specified binary ``reduction_op``
1715
+ //! functor. The segments are identified by "runs" of corresponding keys in `d_keys_in`, where runs are maximal
1716
+ //! ranges of consecutive, identical keys. For the *i*\ :sup:`th` run encountered, the first key of the run and
1717
+ //! the corresponding value aggregate of that run are written to ``d_unique_out[i]`` and ``d_aggregates_out[i]``,
1718
+ //! respectively. The total number of runs encountered is written to ``d_num_runs_out``.
1719
+ //!
1720
+ //! - The ``==`` equality operator is used to determine whether keys are equivalent
1721
+ //! - Provides "run-to-run" determinism for pseudo-associative reduction
1722
+ //! (e.g., addition of floating point types) on the same GPU device.
1723
+ //! However, results for pseudo-associative reduction may be inconsistent
1724
+ //! from one device to another device of a different compute-capability
1725
+ //! because CUB can employ different tile-sizing for different architectures.
1726
+ //! - Let ``out`` be any of
1727
+ //! ``[d_unique_out, d_unique_out + *d_num_runs_out)``
1728
+ //! ``[d_aggregates_out, d_aggregates_out + *d_num_runs_out)``
1729
+ //! ``d_num_runs_out``. The ranges represented by ``out`` shall not overlap
1730
+ //! ``[d_keys_in, d_keys_in + num_items)``,
1731
+ //! ``[d_values_in, d_values_in + num_items)`` nor ``out`` in any way.
1732
+ //! - @devicestorage
1733
+ //!
1734
+ //! Snippet
1735
+ //! +++++++++++++++++++++++++++++++++++++++++++++
1736
+ //!
1737
+ //! The code snippet below illustrates the segmented reduction of ``int`` values grouped by runs of
1738
+ //! associated ``int`` keys.
1739
+ //!
1740
+ //! .. code-block:: c++
1741
+ //!
1742
+ //! #include <cub/cub.cuh>
1743
+ //! // or equivalently <cub/device/device_reduce.cuh>
1744
+ //!
1745
+ //! // CustomMin functor
1746
+ //! struct CustomMin
1747
+ //! {
1748
+ //! template <typename T>
1749
+ //! __device__ __forceinline__
1750
+ //! T operator()(const T &a, const T &b) const {
1751
+ //! return (b < a) ? b : a;
1752
+ //! }
1753
+ //! };
1754
+ //!
1755
+ //! // Declare, allocate, and initialize device-accessible pointers
1756
+ //! // for input and output
1757
+ //! int num_items; // e.g., 8
1758
+ //! int *d_keys_in; // e.g., [0, 2, 2, 9, 5, 5, 5, 8]
1759
+ //! int *d_values_in; // e.g., [0, 7, 1, 6, 2, 5, 3, 4]
1760
+ //! int *d_unique_out; // e.g., [-, -, -, -, -, -, -, -]
1761
+ //! int *d_aggregates_out; // e.g., [-, -, -, -, -, -, -, -]
1762
+ //! int *d_num_runs_out; // e.g., [-]
1763
+ //! CustomMin reduction_op;
1764
+ //! ...
1765
+ //!
1766
+ //! // Determine temporary device storage requirements
1767
+ //! void *d_temp_storage = nullptr;
1768
+ //! size_t temp_storage_bytes = 0;
1769
+ //! cub::DeviceReduce::ReduceByKey(
1770
+ //! d_temp_storage, temp_storage_bytes,
1771
+ //! d_keys_in, d_unique_out, d_values_in,
1772
+ //! d_aggregates_out, d_num_runs_out, reduction_op, num_items);
1773
+ //!
1774
+ //! // Allocate temporary storage
1775
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
1776
+ //!
1777
+ //! // Run reduce-by-key
1778
+ //! cub::DeviceReduce::ReduceByKey(
1779
+ //! d_temp_storage, temp_storage_bytes,
1780
+ //! d_keys_in, d_unique_out, d_values_in,
1781
+ //! d_aggregates_out, d_num_runs_out, reduction_op, num_items);
1782
+ //!
1783
+ //! // d_unique_out <-- [0, 2, 9, 5, 8]
1784
+ //! // d_aggregates_out <-- [0, 1, 6, 2, 4]
1785
+ //! // d_num_runs_out <-- [5]
1786
+ //!
1787
+ //! @endrst
1788
+ //!
1789
+ //! @tparam KeysInputIteratorT
1790
+ //! **[inferred]** Random-access input iterator type for reading input keys @iterator
1791
+ //!
1792
+ //! @tparam UniqueOutputIteratorT
1793
+ //! **[inferred]** Random-access output iterator type for writing unique output keys @iterator
1794
+ //!
1795
+ //! @tparam ValuesInputIteratorT
1796
+ //! **[inferred]** Random-access input iterator type for reading input values @iterator
1797
+ //!
1798
+ //! @tparam AggregatesOutputIteratorT
1799
+ //! **[inferred]** Random-access output iterator type for writing output value aggregates @iterator
1800
+ //!
1801
+ //! @tparam NumRunsOutputIteratorT
1802
+ //! **[inferred]** Output iterator type for recording the number of runs encountered @iterator
1803
+ //!
1804
+ //! @tparam ReductionOpT
1805
+ //! **[inferred]** Binary reduction functor type having member `T operator()(const T &a, const T &b)`
1806
+ //!
1807
+ //! @tparam NumItemsT
1808
+ //! **[inferred]** Type of num_items
1809
+ //!
1810
+ //! @param[in] d_temp_storage
1811
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
1812
+ //! required allocation size is written to `temp_storage_bytes` and no work is done.
1813
+ //!
1814
+ //! @param[in,out] temp_storage_bytes
1815
+ //! Reference to size in bytes of `d_temp_storage` allocation
1816
+ //!
1817
+ //! @param[in] d_keys_in
1818
+ //! Pointer to the input sequence of keys
1819
+ //!
1820
+ //! @param[out] d_unique_out
1821
+ //! Pointer to the output sequence of unique keys (one key per run)
1822
+ //!
1823
+ //! @param[in] d_values_in
1824
+ //! Pointer to the input sequence of corresponding values
1825
+ //!
1826
+ //! @param[out] d_aggregates_out
1827
+ //! Pointer to the output sequence of value aggregates
1828
+ //! (one aggregate per run)
1829
+ //!
1830
+ //! @param[out] d_num_runs_out
1831
+ //! Pointer to total number of runs encountered
1832
+ //! (i.e., the length of `d_unique_out`)
1833
+ //!
1834
+ //! @param[in] reduction_op
1835
+ //! Binary reduction functor
1836
+ //!
1837
+ //! @param[in] num_items
1838
+ //! Total number of associated key+value pairs
1839
+ //! (i.e., the length of `d_keys_in` and `d_values_in`)
1840
+ //!
1841
+ //! @param[in] stream
1842
+ //! @rst
1843
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
1844
+ //! @endrst
1845
+ template <typename KeysInputIteratorT,
1846
+ typename UniqueOutputIteratorT,
1847
+ typename ValuesInputIteratorT,
1848
+ typename AggregatesOutputIteratorT,
1849
+ typename NumRunsOutputIteratorT,
1850
+ typename ReductionOpT,
1851
+ typename NumItemsT>
1852
+ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t ReduceByKey(
1853
+ void* d_temp_storage,
1854
+ size_t& temp_storage_bytes,
1855
+ KeysInputIteratorT d_keys_in,
1856
+ UniqueOutputIteratorT d_unique_out,
1857
+ ValuesInputIteratorT d_values_in,
1858
+ AggregatesOutputIteratorT d_aggregates_out,
1859
+ NumRunsOutputIteratorT d_num_runs_out,
1860
+ ReductionOpT reduction_op,
1861
+ NumItemsT num_items,
1862
+ cudaStream_t stream = 0)
1863
+ {
1864
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceReduce::ReduceByKey"); // NOTE(review): presumably the NVTX range is only opened when d_temp_storage is non-null -- confirm against the macro definition
1865
+
1866
+ // Offset type for global offsets, derived from NumItemsT via detail::choose_offset_t (NOTE(review): an earlier comment called it "signed"; confirm the signedness against choose_offset_t)
1867
+ using OffsetT = detail::choose_offset_t<NumItemsT>;
1868
+
1869
+ // This function is a thin wrapper: it resolves the offset and key-equality types, then forwards every argument to DispatchReduceByKey<...>::Dispatch and returns its cudaError_t
1870
+
1871
+ // No flag iterator or selection op is defined here or passed to the dispatch below
1872
+
1873
+ // Keys are compared with the default equality functor, ::cuda::std::equal_to<>
1874
+ using EqualityOp = ::cuda::std::equal_to<>;
1875
+
1876
+ return DispatchReduceByKey<
1877
+ KeysInputIteratorT,
1878
+ UniqueOutputIteratorT,
1879
+ ValuesInputIteratorT,
1880
+ AggregatesOutputIteratorT,
1881
+ NumRunsOutputIteratorT,
1882
+ EqualityOp,
1883
+ ReductionOpT,
1884
+ OffsetT>::Dispatch(d_temp_storage,
1885
+ temp_storage_bytes,
1886
+ d_keys_in,
1887
+ d_unique_out,
1888
+ d_values_in,
1889
+ d_aggregates_out,
1890
+ d_num_runs_out,
1891
+ EqualityOp(), // default-constructed equality functor instance
1892
+ reduction_op,
1893
+ static_cast<OffsetT>(num_items), // convert the caller's item count to the dispatch offset type
1894
+ stream);
1895
+ }
1896
+ };
1897
+
1898
+ CUB_NAMESPACE_END