cuda-cccl 0.1.3.1.0.dev1678__cp311-cp311-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cuda-cccl might be problematic. Click here for more details.

Files changed (1860)
  1. cuda/cccl/__init__.py +14 -0
  2. cuda/cccl/cooperative/__init__.py +3 -0
  3. cuda/cccl/cooperative/experimental/__init__.py +8 -0
  4. cuda/cccl/cooperative/experimental/_caching.py +48 -0
  5. cuda/cccl/cooperative/experimental/_common.py +273 -0
  6. cuda/cccl/cooperative/experimental/_nvrtc.py +91 -0
  7. cuda/cccl/cooperative/experimental/_scan_op.py +181 -0
  8. cuda/cccl/cooperative/experimental/_types.py +935 -0
  9. cuda/cccl/cooperative/experimental/_typing.py +107 -0
  10. cuda/cccl/cooperative/experimental/block/__init__.py +33 -0
  11. cuda/cccl/cooperative/experimental/block/_block_load_store.py +215 -0
  12. cuda/cccl/cooperative/experimental/block/_block_merge_sort.py +125 -0
  13. cuda/cccl/cooperative/experimental/block/_block_radix_sort.py +214 -0
  14. cuda/cccl/cooperative/experimental/block/_block_reduce.py +294 -0
  15. cuda/cccl/cooperative/experimental/block/_block_scan.py +983 -0
  16. cuda/cccl/cooperative/experimental/warp/__init__.py +9 -0
  17. cuda/cccl/cooperative/experimental/warp/_warp_merge_sort.py +98 -0
  18. cuda/cccl/cooperative/experimental/warp/_warp_reduce.py +153 -0
  19. cuda/cccl/cooperative/experimental/warp/_warp_scan.py +78 -0
  20. cuda/cccl/headers/__init__.py +7 -0
  21. cuda/cccl/headers/include/__init__.py +1 -0
  22. cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +261 -0
  23. cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +1181 -0
  24. cuda/cccl/headers/include/cub/agent/agent_for.cuh +84 -0
  25. cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +925 -0
  26. cuda/cccl/headers/include/cub/agent/agent_merge.cuh +227 -0
  27. cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +753 -0
  28. cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +766 -0
  29. cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +286 -0
  30. cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +704 -0
  31. cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +557 -0
  32. cuda/cccl/headers/include/cub/agent/agent_reduce.cuh +678 -0
  33. cuda/cccl/headers/include/cub/agent/agent_reduce_by_key.cuh +804 -0
  34. cuda/cccl/headers/include/cub/agent/agent_rle.cuh +997 -0
  35. cuda/cccl/headers/include/cub/agent/agent_scan.cuh +561 -0
  36. cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +473 -0
  37. cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +292 -0
  38. cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +1032 -0
  39. cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +342 -0
  40. cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +592 -0
  41. cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +614 -0
  42. cuda/cccl/headers/include/cub/agent/single_pass_scan_operators.cuh +1346 -0
  43. cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +965 -0
  44. cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +1217 -0
  45. cuda/cccl/headers/include/cub/block/block_exchange.cuh +1306 -0
  46. cuda/cccl/headers/include/cub/block/block_histogram.cuh +420 -0
  47. cuda/cccl/headers/include/cub/block/block_load.cuh +1259 -0
  48. cuda/cccl/headers/include/cub/block/block_merge_sort.cuh +787 -0
  49. cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +1218 -0
  50. cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +2193 -0
  51. cuda/cccl/headers/include/cub/block/block_raking_layout.cuh +150 -0
  52. cuda/cccl/headers/include/cub/block/block_reduce.cuh +629 -0
  53. cuda/cccl/headers/include/cub/block/block_run_length_decode.cuh +437 -0
  54. cuda/cccl/headers/include/cub/block/block_scan.cuh +2583 -0
  55. cuda/cccl/headers/include/cub/block/block_shuffle.cuh +346 -0
  56. cuda/cccl/headers/include/cub/block/block_store.cuh +1246 -0
  57. cuda/cccl/headers/include/cub/block/radix_rank_sort_operations.cuh +620 -0
  58. cuda/cccl/headers/include/cub/block/specializations/block_histogram_atomic.cuh +86 -0
  59. cuda/cccl/headers/include/cub/block/specializations/block_histogram_sort.cuh +240 -0
  60. cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking.cuh +252 -0
  61. cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking_commutative_only.cuh +238 -0
  62. cuda/cccl/headers/include/cub/block/specializations/block_reduce_warp_reductions.cuh +259 -0
  63. cuda/cccl/headers/include/cub/block/specializations/block_scan_raking.cuh +790 -0
  64. cuda/cccl/headers/include/cub/block/specializations/block_scan_warp_scans.cuh +538 -0
  65. cuda/cccl/headers/include/cub/config.cuh +60 -0
  66. cuda/cccl/headers/include/cub/cub.cuh +112 -0
  67. cuda/cccl/headers/include/cub/detail/array_utils.cuh +77 -0
  68. cuda/cccl/headers/include/cub/detail/choose_offset.cuh +155 -0
  69. cuda/cccl/headers/include/cub/detail/detect_cuda_runtime.cuh +93 -0
  70. cuda/cccl/headers/include/cub/detail/device_double_buffer.cuh +96 -0
  71. cuda/cccl/headers/include/cub/detail/fast_modulo_division.cuh +246 -0
  72. cuda/cccl/headers/include/cub/detail/launcher/cuda_driver.cuh +120 -0
  73. cuda/cccl/headers/include/cub/detail/launcher/cuda_runtime.cuh +74 -0
  74. cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +118 -0
  75. cuda/cccl/headers/include/cub/detail/ptx-json/README.md +71 -0
  76. cuda/cccl/headers/include/cub/detail/ptx-json/array.h +68 -0
  77. cuda/cccl/headers/include/cub/detail/ptx-json/json.h +61 -0
  78. cuda/cccl/headers/include/cub/detail/ptx-json/object.h +100 -0
  79. cuda/cccl/headers/include/cub/detail/ptx-json/string.h +71 -0
  80. cuda/cccl/headers/include/cub/detail/ptx-json/value.h +93 -0
  81. cuda/cccl/headers/include/cub/detail/ptx-json-parser.h +63 -0
  82. cuda/cccl/headers/include/cub/detail/rfa.cuh +724 -0
  83. cuda/cccl/headers/include/cub/detail/strong_load.cuh +189 -0
  84. cuda/cccl/headers/include/cub/detail/strong_store.cuh +220 -0
  85. cuda/cccl/headers/include/cub/detail/temporary_storage.cuh +384 -0
  86. cuda/cccl/headers/include/cub/detail/type_traits.cuh +206 -0
  87. cuda/cccl/headers/include/cub/detail/uninitialized_copy.cuh +72 -0
  88. cuda/cccl/headers/include/cub/detail/unsafe_bitcast.cuh +56 -0
  89. cuda/cccl/headers/include/cub/device/device_adjacent_difference.cuh +596 -0
  90. cuda/cccl/headers/include/cub/device/device_copy.cuh +187 -0
  91. cuda/cccl/headers/include/cub/device/device_for.cuh +990 -0
  92. cuda/cccl/headers/include/cub/device/device_histogram.cuh +1507 -0
  93. cuda/cccl/headers/include/cub/device/device_memcpy.cuh +195 -0
  94. cuda/cccl/headers/include/cub/device/device_merge.cuh +202 -0
  95. cuda/cccl/headers/include/cub/device/device_merge_sort.cuh +979 -0
  96. cuda/cccl/headers/include/cub/device/device_partition.cuh +664 -0
  97. cuda/cccl/headers/include/cub/device/device_radix_sort.cuh +3435 -0
  98. cuda/cccl/headers/include/cub/device/device_reduce.cuh +1815 -0
  99. cuda/cccl/headers/include/cub/device/device_run_length_encode.cuh +368 -0
  100. cuda/cccl/headers/include/cub/device/device_scan.cuh +1901 -0
  101. cuda/cccl/headers/include/cub/device/device_segmented_radix_sort.cuh +1496 -0
  102. cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +1512 -0
  103. cuda/cccl/headers/include/cub/device/device_segmented_sort.cuh +2811 -0
  104. cuda/cccl/headers/include/cub/device/device_select.cuh +1224 -0
  105. cuda/cccl/headers/include/cub/device/device_transform.cuh +313 -0
  106. cuda/cccl/headers/include/cub/device/dispatch/dispatch_adjacent_difference.cuh +314 -0
  107. cuda/cccl/headers/include/cub/device/dispatch/dispatch_advance_iterators.cuh +109 -0
  108. cuda/cccl/headers/include/cub/device/dispatch/dispatch_batch_memcpy.cuh +718 -0
  109. cuda/cccl/headers/include/cub/device/dispatch/dispatch_common.cuh +45 -0
  110. cuda/cccl/headers/include/cub/device/dispatch/dispatch_for.cuh +197 -0
  111. cuda/cccl/headers/include/cub/device/dispatch/dispatch_histogram.cuh +1051 -0
  112. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +305 -0
  113. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge_sort.cuh +473 -0
  114. cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +1748 -0
  115. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +1316 -0
  116. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_by_key.cuh +625 -0
  117. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +497 -0
  118. cuda/cccl/headers/include/cub/device/dispatch/dispatch_rle.cuh +548 -0
  119. cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan.cuh +497 -0
  120. cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan_by_key.cuh +598 -0
  121. cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +1374 -0
  122. cuda/cccl/headers/include/cub/device/dispatch/dispatch_select_if.cuh +838 -0
  123. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +341 -0
  124. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +439 -0
  125. cuda/cccl/headers/include/cub/device/dispatch/dispatch_three_way_partition.cuh +552 -0
  126. cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +467 -0
  127. cuda/cccl/headers/include/cub/device/dispatch/dispatch_unique_by_key.cuh +543 -0
  128. cuda/cccl/headers/include/cub/device/dispatch/kernels/for_each.cuh +218 -0
  129. cuda/cccl/headers/include/cub/device/dispatch/kernels/histogram.cuh +505 -0
  130. cuda/cccl/headers/include/cub/device/dispatch/kernels/merge_sort.cuh +338 -0
  131. cuda/cccl/headers/include/cub/device/dispatch/kernels/radix_sort.cuh +799 -0
  132. cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +525 -0
  133. cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +194 -0
  134. cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +330 -0
  135. cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +936 -0
  136. cuda/cccl/headers/include/cub/device/dispatch/kernels/unique_by_key.cuh +176 -0
  137. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +70 -0
  138. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +121 -0
  139. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +63 -0
  140. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +278 -0
  141. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +91 -0
  142. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +118 -0
  143. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +1068 -0
  144. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce.cuh +397 -0
  145. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +945 -0
  146. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +675 -0
  147. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +555 -0
  148. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +1013 -0
  149. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +249 -0
  150. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_select_if.cuh +1587 -0
  151. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +407 -0
  152. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +353 -0
  153. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +874 -0
  154. cuda/cccl/headers/include/cub/grid/grid_even_share.cuh +215 -0
  155. cuda/cccl/headers/include/cub/grid/grid_mapping.cuh +106 -0
  156. cuda/cccl/headers/include/cub/grid/grid_queue.cuh +202 -0
  157. cuda/cccl/headers/include/cub/iterator/arg_index_input_iterator.cuh +256 -0
  158. cuda/cccl/headers/include/cub/iterator/cache_modified_input_iterator.cuh +238 -0
  159. cuda/cccl/headers/include/cub/iterator/cache_modified_output_iterator.cuh +252 -0
  160. cuda/cccl/headers/include/cub/iterator/tex_obj_input_iterator.cuh +322 -0
  161. cuda/cccl/headers/include/cub/thread/thread_load.cuh +347 -0
  162. cuda/cccl/headers/include/cub/thread/thread_operators.cuh +629 -0
  163. cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +504 -0
  164. cuda/cccl/headers/include/cub/thread/thread_scan.cuh +340 -0
  165. cuda/cccl/headers/include/cub/thread/thread_search.cuh +198 -0
  166. cuda/cccl/headers/include/cub/thread/thread_simd.cuh +406 -0
  167. cuda/cccl/headers/include/cub/thread/thread_sort.cuh +101 -0
  168. cuda/cccl/headers/include/cub/thread/thread_store.cuh +364 -0
  169. cuda/cccl/headers/include/cub/util_allocator.cuh +921 -0
  170. cuda/cccl/headers/include/cub/util_arch.cuh +167 -0
  171. cuda/cccl/headers/include/cub/util_cpp_dialect.cuh +95 -0
  172. cuda/cccl/headers/include/cub/util_debug.cuh +207 -0
  173. cuda/cccl/headers/include/cub/util_device.cuh +779 -0
  174. cuda/cccl/headers/include/cub/util_macro.cuh +91 -0
  175. cuda/cccl/headers/include/cub/util_math.cuh +115 -0
  176. cuda/cccl/headers/include/cub/util_namespace.cuh +176 -0
  177. cuda/cccl/headers/include/cub/util_policy_wrapper_t.cuh +55 -0
  178. cuda/cccl/headers/include/cub/util_ptx.cuh +513 -0
  179. cuda/cccl/headers/include/cub/util_temporary_storage.cuh +122 -0
  180. cuda/cccl/headers/include/cub/util_type.cuh +1136 -0
  181. cuda/cccl/headers/include/cub/util_vsmem.cuh +251 -0
  182. cuda/cccl/headers/include/cub/version.cuh +89 -0
  183. cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_shfl.cuh +329 -0
  184. cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_smem.cuh +177 -0
  185. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +729 -0
  186. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +405 -0
  187. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +688 -0
  188. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_smem.cuh +437 -0
  189. cuda/cccl/headers/include/cub/warp/warp_exchange.cuh +405 -0
  190. cuda/cccl/headers/include/cub/warp/warp_load.cuh +614 -0
  191. cuda/cccl/headers/include/cub/warp/warp_merge_sort.cuh +169 -0
  192. cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +822 -0
  193. cuda/cccl/headers/include/cub/warp/warp_scan.cuh +1156 -0
  194. cuda/cccl/headers/include/cub/warp/warp_store.cuh +520 -0
  195. cuda/cccl/headers/include/cuda/__annotated_ptr/access_property.h +165 -0
  196. cuda/cccl/headers/include/cuda/__annotated_ptr/access_property_encoding.h +172 -0
  197. cuda/cccl/headers/include/cuda/__annotated_ptr/annotated_ptr.h +210 -0
  198. cuda/cccl/headers/include/cuda/__annotated_ptr/annotated_ptr_base.h +100 -0
  199. cuda/cccl/headers/include/cuda/__annotated_ptr/apply_access_property.h +84 -0
  200. cuda/cccl/headers/include/cuda/__annotated_ptr/associate_access_property.h +127 -0
  201. cuda/cccl/headers/include/cuda/__annotated_ptr/createpolicy.h +209 -0
  202. cuda/cccl/headers/include/cuda/__atomic/atomic.h +145 -0
  203. cuda/cccl/headers/include/cuda/__barrier/aligned_size.h +61 -0
  204. cuda/cccl/headers/include/cuda/__barrier/async_contract_fulfillment.h +39 -0
  205. cuda/cccl/headers/include/cuda/__barrier/barrier.h +65 -0
  206. cuda/cccl/headers/include/cuda/__barrier/barrier_arrive_tx.h +100 -0
  207. cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +454 -0
  208. cuda/cccl/headers/include/cuda/__barrier/barrier_expect_tx.h +72 -0
  209. cuda/cccl/headers/include/cuda/__barrier/barrier_native_handle.h +45 -0
  210. cuda/cccl/headers/include/cuda/__barrier/barrier_thread_scope.h +60 -0
  211. cuda/cccl/headers/include/cuda/__bit/bit_reverse.h +171 -0
  212. cuda/cccl/headers/include/cuda/__bit/bitfield.h +121 -0
  213. cuda/cccl/headers/include/cuda/__bit/bitmask.h +88 -0
  214. cuda/cccl/headers/include/cuda/__cccl_config +36 -0
  215. cuda/cccl/headers/include/cuda/__cmath/ceil_div.h +124 -0
  216. cuda/cccl/headers/include/cuda/__cmath/ilog.h +195 -0
  217. cuda/cccl/headers/include/cuda/__cmath/ipow.h +107 -0
  218. cuda/cccl/headers/include/cuda/__cmath/isqrt.h +80 -0
  219. cuda/cccl/headers/include/cuda/__cmath/neg.h +47 -0
  220. cuda/cccl/headers/include/cuda/__cmath/pow2.h +74 -0
  221. cuda/cccl/headers/include/cuda/__cmath/round_down.h +102 -0
  222. cuda/cccl/headers/include/cuda/__cmath/round_up.h +104 -0
  223. cuda/cccl/headers/include/cuda/__cmath/uabs.h +57 -0
  224. cuda/cccl/headers/include/cuda/__execution/determinism.h +90 -0
  225. cuda/cccl/headers/include/cuda/__execution/require.h +74 -0
  226. cuda/cccl/headers/include/cuda/__execution/tune.h +69 -0
  227. cuda/cccl/headers/include/cuda/__functional/address_stability.h +131 -0
  228. cuda/cccl/headers/include/cuda/__functional/for_each_canceled.h +276 -0
  229. cuda/cccl/headers/include/cuda/__functional/get_device_address.h +58 -0
  230. cuda/cccl/headers/include/cuda/__functional/maximum.h +58 -0
  231. cuda/cccl/headers/include/cuda/__functional/minimum.h +58 -0
  232. cuda/cccl/headers/include/cuda/__functional/proclaim_return_type.h +108 -0
  233. cuda/cccl/headers/include/cuda/__fwd/barrier.h +38 -0
  234. cuda/cccl/headers/include/cuda/__fwd/barrier_native_handle.h +42 -0
  235. cuda/cccl/headers/include/cuda/__fwd/get_stream.h +38 -0
  236. cuda/cccl/headers/include/cuda/__fwd/pipeline.h +37 -0
  237. cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +257 -0
  238. cuda/cccl/headers/include/cuda/__iterator/counting_iterator.h +460 -0
  239. cuda/cccl/headers/include/cuda/__iterator/discard_iterator.h +314 -0
  240. cuda/cccl/headers/include/cuda/__iterator/permutation_iterator.h +421 -0
  241. cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +321 -0
  242. cuda/cccl/headers/include/cuda/__iterator/tabulate_output_iterator.h +333 -0
  243. cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +465 -0
  244. cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +456 -0
  245. cuda/cccl/headers/include/cuda/__latch/latch.h +44 -0
  246. cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +462 -0
  247. cuda/cccl/headers/include/cuda/__mdspan/host_device_mdspan.h +63 -0
  248. cuda/cccl/headers/include/cuda/__mdspan/restrict_accessor.h +122 -0
  249. cuda/cccl/headers/include/cuda/__mdspan/restrict_mdspan.h +51 -0
  250. cuda/cccl/headers/include/cuda/__memcpy_async/check_preconditions.h +79 -0
  251. cuda/cccl/headers/include/cuda/__memcpy_async/completion_mechanism.h +47 -0
  252. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_bulk_shared_global.h +60 -0
  253. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_fallback.h +72 -0
  254. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_shared_global.h +98 -0
  255. cuda/cccl/headers/include/cuda/__memcpy_async/dispatch_memcpy_async.h +162 -0
  256. cuda/cccl/headers/include/cuda/__memcpy_async/is_local_smem_barrier.h +49 -0
  257. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async.h +179 -0
  258. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_barrier.h +99 -0
  259. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_tx.h +99 -0
  260. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_completion.h +170 -0
  261. cuda/cccl/headers/include/cuda/__memcpy_async/try_get_barrier_handle.h +59 -0
  262. cuda/cccl/headers/include/cuda/__memory/address_space.h +86 -0
  263. cuda/cccl/headers/include/cuda/__memory/align_down.h +56 -0
  264. cuda/cccl/headers/include/cuda/__memory/align_up.h +56 -0
  265. cuda/cccl/headers/include/cuda/__memory/is_aligned.h +47 -0
  266. cuda/cccl/headers/include/cuda/__memory/ptr_rebind.h +75 -0
  267. cuda/cccl/headers/include/cuda/__memory_resource/get_memory_resource.h +94 -0
  268. cuda/cccl/headers/include/cuda/__memory_resource/get_property.h +157 -0
  269. cuda/cccl/headers/include/cuda/__memory_resource/properties.h +73 -0
  270. cuda/cccl/headers/include/cuda/__memory_resource/resource.h +129 -0
  271. cuda/cccl/headers/include/cuda/__memory_resource/resource_ref.h +653 -0
  272. cuda/cccl/headers/include/cuda/__numeric/narrow.h +108 -0
  273. cuda/cccl/headers/include/cuda/__numeric/overflow_cast.h +57 -0
  274. cuda/cccl/headers/include/cuda/__numeric/overflow_result.h +43 -0
  275. cuda/cccl/headers/include/cuda/__nvtx/nvtx.h +101 -0
  276. cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +2982 -0
  277. cuda/cccl/headers/include/cuda/__ptx/instructions/barrier_cluster.h +43 -0
  278. cuda/cccl/headers/include/cuda/__ptx/instructions/bfind.h +41 -0
  279. cuda/cccl/headers/include/cuda/__ptx/instructions/bmsk.h +41 -0
  280. cuda/cccl/headers/include/cuda/__ptx/instructions/clusterlaunchcontrol.h +41 -0
  281. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk.h +44 -0
  282. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_commit_group.h +43 -0
  283. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h +45 -0
  284. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_wait_group.h +43 -0
  285. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_mbarrier_arrive.h +42 -0
  286. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_reduce_async_bulk.h +60 -0
  287. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_reduce_async_bulk_tensor.h +43 -0
  288. cuda/cccl/headers/include/cuda/__ptx/instructions/elect_sync.h +41 -0
  289. cuda/cccl/headers/include/cuda/__ptx/instructions/exit.h +41 -0
  290. cuda/cccl/headers/include/cuda/__ptx/instructions/fence.h +49 -0
  291. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/barrier_cluster.h +115 -0
  292. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/bfind.h +190 -0
  293. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/bmsk.h +54 -0
  294. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/clusterlaunchcontrol.h +240 -0
  295. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk.h +193 -0
  296. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.h +25 -0
  297. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.h +52 -0
  298. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.h +957 -0
  299. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_gather_scatter.h +288 -0
  300. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.h +596 -0
  301. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.h +46 -0
  302. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive.h +26 -0
  303. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive_noinc.h +26 -0
  304. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.h +1445 -0
  305. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.h +132 -0
  306. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.h +117 -0
  307. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.h +601 -0
  308. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/elect_sync.h +36 -0
  309. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/exit.h +25 -0
  310. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence.h +208 -0
  311. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.h +31 -0
  312. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_alias.h +25 -0
  313. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_async.h +58 -0
  314. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_async_generic_sync_restrict.h +62 -0
  315. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.h +101 -0
  316. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_sync_restrict.h +62 -0
  317. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/get_sreg.h +949 -0
  318. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/getctarank.h +32 -0
  319. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/ld.h +15074 -0
  320. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive.h +385 -0
  321. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.h +176 -0
  322. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.h +34 -0
  323. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_expect_tx.h +94 -0
  324. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_init.h +27 -0
  325. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.h +137 -0
  326. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.h +138 -0
  327. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.h +280 -0
  328. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.h +282 -0
  329. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_ld_reduce.h +2148 -0
  330. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_red.h +1272 -0
  331. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_st.h +228 -0
  332. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/prmt.h +230 -0
  333. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/red_async.h +430 -0
  334. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/shl.h +96 -0
  335. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/shr.h +168 -0
  336. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st.h +1830 -0
  337. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st_async.h +123 -0
  338. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st_bulk.h +31 -0
  339. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_alloc.h +105 -0
  340. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_commit.h +81 -0
  341. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_cp.h +612 -0
  342. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_fence.h +44 -0
  343. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_ld.h +4446 -0
  344. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_mma.h +4061 -0
  345. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_mma_ws.h +6438 -0
  346. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_shift.h +36 -0
  347. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_st.h +4582 -0
  348. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_wait.h +44 -0
  349. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.h +67 -0
  350. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tensormap_replace.h +750 -0
  351. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/trap.h +25 -0
  352. cuda/cccl/headers/include/cuda/__ptx/instructions/get_sreg.h +43 -0
  353. cuda/cccl/headers/include/cuda/__ptx/instructions/getctarank.h +43 -0
  354. cuda/cccl/headers/include/cuda/__ptx/instructions/ld.h +41 -0
  355. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_arrive.h +45 -0
  356. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_expect_tx.h +41 -0
  357. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_init.h +43 -0
  358. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_wait.h +46 -0
  359. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_ld_reduce.h +41 -0
  360. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_red.h +41 -0
  361. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_st.h +41 -0
  362. cuda/cccl/headers/include/cuda/__ptx/instructions/prmt.h +41 -0
  363. cuda/cccl/headers/include/cuda/__ptx/instructions/red_async.h +43 -0
  364. cuda/cccl/headers/include/cuda/__ptx/instructions/shfl_sync.h +244 -0
  365. cuda/cccl/headers/include/cuda/__ptx/instructions/shl.h +41 -0
  366. cuda/cccl/headers/include/cuda/__ptx/instructions/shr.h +41 -0
  367. cuda/cccl/headers/include/cuda/__ptx/instructions/st.h +41 -0
  368. cuda/cccl/headers/include/cuda/__ptx/instructions/st_async.h +43 -0
  369. cuda/cccl/headers/include/cuda/__ptx/instructions/st_bulk.h +41 -0
  370. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_alloc.h +41 -0
  371. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_commit.h +41 -0
  372. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_cp.h +41 -0
  373. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_fence.h +41 -0
  374. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_ld.h +41 -0
  375. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_mma.h +41 -0
  376. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_mma_ws.h +41 -0
  377. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_shift.h +41 -0
  378. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_st.h +41 -0
  379. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_wait.h +41 -0
  380. cuda/cccl/headers/include/cuda/__ptx/instructions/tensormap_cp_fenceproxy.h +43 -0
  381. cuda/cccl/headers/include/cuda/__ptx/instructions/tensormap_replace.h +43 -0
  382. cuda/cccl/headers/include/cuda/__ptx/instructions/trap.h +41 -0
  383. cuda/cccl/headers/include/cuda/__ptx/ptx_dot_variants.h +230 -0
  384. cuda/cccl/headers/include/cuda/__ptx/ptx_helper_functions.h +151 -0
  385. cuda/cccl/headers/include/cuda/__semaphore/counting_semaphore.h +53 -0
  386. cuda/cccl/headers/include/cuda/__stream/get_stream.h +110 -0
  387. cuda/cccl/headers/include/cuda/__stream/stream_ref.h +163 -0
  388. cuda/cccl/headers/include/cuda/__type_traits/is_floating_point.h +47 -0
  389. cuda/cccl/headers/include/cuda/__utility/static_for.h +74 -0
  390. cuda/cccl/headers/include/cuda/__warp/lane_mask.h +326 -0
  391. cuda/cccl/headers/include/cuda/__warp/warp_match_all.h +65 -0
  392. cuda/cccl/headers/include/cuda/__warp/warp_shuffle.h +249 -0
  393. cuda/cccl/headers/include/cuda/access_property +26 -0
  394. cuda/cccl/headers/include/cuda/annotated_ptr +29 -0
  395. cuda/cccl/headers/include/cuda/atomic +27 -0
  396. cuda/cccl/headers/include/cuda/barrier +262 -0
  397. cuda/cccl/headers/include/cuda/bit +29 -0
  398. cuda/cccl/headers/include/cuda/cmath +35 -0
  399. cuda/cccl/headers/include/cuda/discard_memory +60 -0
  400. cuda/cccl/headers/include/cuda/functional +31 -0
  401. cuda/cccl/headers/include/cuda/iterator +34 -0
  402. cuda/cccl/headers/include/cuda/latch +27 -0
  403. cuda/cccl/headers/include/cuda/mdspan +28 -0
  404. cuda/cccl/headers/include/cuda/memory +32 -0
  405. cuda/cccl/headers/include/cuda/memory_resource +41 -0
  406. cuda/cccl/headers/include/cuda/numeric +28 -0
  407. cuda/cccl/headers/include/cuda/pipeline +577 -0
  408. cuda/cccl/headers/include/cuda/ptx +124 -0
  409. cuda/cccl/headers/include/cuda/semaphore +31 -0
  410. cuda/cccl/headers/include/cuda/std/__algorithm/adjacent_find.h +59 -0
  411. cuda/cccl/headers/include/cuda/std/__algorithm/all_of.h +45 -0
  412. cuda/cccl/headers/include/cuda/std/__algorithm/any_of.h +45 -0
  413. cuda/cccl/headers/include/cuda/std/__algorithm/binary_search.h +52 -0
  414. cuda/cccl/headers/include/cuda/std/__algorithm/clamp.h +48 -0
  415. cuda/cccl/headers/include/cuda/std/__algorithm/comp.h +64 -0
  416. cuda/cccl/headers/include/cuda/std/__algorithm/comp_ref_type.h +85 -0
  417. cuda/cccl/headers/include/cuda/std/__algorithm/copy.h +142 -0
  418. cuda/cccl/headers/include/cuda/std/__algorithm/copy_backward.h +80 -0
  419. cuda/cccl/headers/include/cuda/std/__algorithm/copy_if.h +47 -0
  420. cuda/cccl/headers/include/cuda/std/__algorithm/copy_n.h +73 -0
  421. cuda/cccl/headers/include/cuda/std/__algorithm/count.h +49 -0
  422. cuda/cccl/headers/include/cuda/std/__algorithm/count_if.h +49 -0
  423. cuda/cccl/headers/include/cuda/std/__algorithm/equal.h +128 -0
  424. cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +101 -0
  425. cuda/cccl/headers/include/cuda/std/__algorithm/fill.h +58 -0
  426. cuda/cccl/headers/include/cuda/std/__algorithm/fill_n.h +51 -0
  427. cuda/cccl/headers/include/cuda/std/__algorithm/find.h +62 -0
  428. cuda/cccl/headers/include/cuda/std/__algorithm/find_end.h +225 -0
  429. cuda/cccl/headers/include/cuda/std/__algorithm/find_first_of.h +73 -0
  430. cuda/cccl/headers/include/cuda/std/__algorithm/find_if.h +46 -0
  431. cuda/cccl/headers/include/cuda/std/__algorithm/find_if_not.h +46 -0
  432. cuda/cccl/headers/include/cuda/std/__algorithm/for_each.h +42 -0
  433. cuda/cccl/headers/include/cuda/std/__algorithm/for_each_n.h +48 -0
  434. cuda/cccl/headers/include/cuda/std/__algorithm/generate.h +41 -0
  435. cuda/cccl/headers/include/cuda/std/__algorithm/generate_n.h +46 -0
  436. cuda/cccl/headers/include/cuda/std/__algorithm/half_positive.h +49 -0
  437. cuda/cccl/headers/include/cuda/std/__algorithm/includes.h +92 -0
  438. cuda/cccl/headers/include/cuda/std/__algorithm/is_heap.h +50 -0
  439. cuda/cccl/headers/include/cuda/std/__algorithm/is_heap_until.h +83 -0
  440. cuda/cccl/headers/include/cuda/std/__algorithm/is_partitioned.h +57 -0
  441. cuda/cccl/headers/include/cuda/std/__algorithm/is_permutation.h +252 -0
  442. cuda/cccl/headers/include/cuda/std/__algorithm/is_sorted.h +49 -0
  443. cuda/cccl/headers/include/cuda/std/__algorithm/is_sorted_until.h +68 -0
  444. cuda/cccl/headers/include/cuda/std/__algorithm/iter_swap.h +82 -0
  445. cuda/cccl/headers/include/cuda/std/__algorithm/iterator_operations.h +185 -0
  446. cuda/cccl/headers/include/cuda/std/__algorithm/lexicographical_compare.h +68 -0
  447. cuda/cccl/headers/include/cuda/std/__algorithm/lower_bound.h +82 -0
  448. cuda/cccl/headers/include/cuda/std/__algorithm/make_heap.h +70 -0
  449. cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +96 -0
  450. cuda/cccl/headers/include/cuda/std/__algorithm/max.h +62 -0
  451. cuda/cccl/headers/include/cuda/std/__algorithm/max_element.h +68 -0
  452. cuda/cccl/headers/include/cuda/std/__algorithm/merge.h +89 -0
  453. cuda/cccl/headers/include/cuda/std/__algorithm/min.h +62 -0
  454. cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +87 -0
  455. cuda/cccl/headers/include/cuda/std/__algorithm/minmax.h +66 -0
  456. cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +140 -0
  457. cuda/cccl/headers/include/cuda/std/__algorithm/mismatch.h +83 -0
  458. cuda/cccl/headers/include/cuda/std/__algorithm/move.h +87 -0
  459. cuda/cccl/headers/include/cuda/std/__algorithm/move_backward.h +84 -0
  460. cuda/cccl/headers/include/cuda/std/__algorithm/next_permutation.h +88 -0
  461. cuda/cccl/headers/include/cuda/std/__algorithm/none_of.h +45 -0
  462. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort.h +102 -0
  463. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort_copy.h +122 -0
  464. cuda/cccl/headers/include/cuda/std/__algorithm/partition.h +120 -0
  465. cuda/cccl/headers/include/cuda/std/__algorithm/partition_copy.h +59 -0
  466. cuda/cccl/headers/include/cuda/std/__algorithm/partition_point.h +61 -0
  467. cuda/cccl/headers/include/cuda/std/__algorithm/pop_heap.h +94 -0
  468. cuda/cccl/headers/include/cuda/std/__algorithm/prev_permutation.h +88 -0
  469. cuda/cccl/headers/include/cuda/std/__algorithm/push_heap.h +101 -0
  470. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_iterator_concept.h +65 -0
  471. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_min.h +98 -0
  472. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_min_element.h +68 -0
  473. cuda/cccl/headers/include/cuda/std/__algorithm/remove.h +55 -0
  474. cuda/cccl/headers/include/cuda/std/__algorithm/remove_copy.h +47 -0
  475. cuda/cccl/headers/include/cuda/std/__algorithm/remove_copy_if.h +47 -0
  476. cuda/cccl/headers/include/cuda/std/__algorithm/remove_if.h +56 -0
  477. cuda/cccl/headers/include/cuda/std/__algorithm/replace.h +45 -0
  478. cuda/cccl/headers/include/cuda/std/__algorithm/replace_copy.h +54 -0
  479. cuda/cccl/headers/include/cuda/std/__algorithm/replace_copy_if.h +50 -0
  480. cuda/cccl/headers/include/cuda/std/__algorithm/replace_if.h +45 -0
  481. cuda/cccl/headers/include/cuda/std/__algorithm/reverse.h +81 -0
  482. cuda/cccl/headers/include/cuda/std/__algorithm/reverse_copy.h +43 -0
  483. cuda/cccl/headers/include/cuda/std/__algorithm/rotate.h +261 -0
  484. cuda/cccl/headers/include/cuda/std/__algorithm/rotate_copy.h +40 -0
  485. cuda/cccl/headers/include/cuda/std/__algorithm/search.h +185 -0
  486. cuda/cccl/headers/include/cuda/std/__algorithm/search_n.h +163 -0
  487. cuda/cccl/headers/include/cuda/std/__algorithm/set_difference.h +95 -0
  488. cuda/cccl/headers/include/cuda/std/__algorithm/set_intersection.h +122 -0
  489. cuda/cccl/headers/include/cuda/std/__algorithm/set_symmetric_difference.h +134 -0
  490. cuda/cccl/headers/include/cuda/std/__algorithm/set_union.h +128 -0
  491. cuda/cccl/headers/include/cuda/std/__algorithm/shift_left.h +84 -0
  492. cuda/cccl/headers/include/cuda/std/__algorithm/shift_right.h +144 -0
  493. cuda/cccl/headers/include/cuda/std/__algorithm/sift_down.h +139 -0
  494. cuda/cccl/headers/include/cuda/std/__algorithm/sort_heap.h +70 -0
  495. cuda/cccl/headers/include/cuda/std/__algorithm/swap_ranges.h +78 -0
  496. cuda/cccl/headers/include/cuda/std/__algorithm/transform.h +59 -0
  497. cuda/cccl/headers/include/cuda/std/__algorithm/unique.h +76 -0
  498. cuda/cccl/headers/include/cuda/std/__algorithm/unique_copy.h +155 -0
  499. cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_iter.h +95 -0
  500. cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_range.h +126 -0
  501. cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +83 -0
  502. cuda/cccl/headers/include/cuda/std/__algorithm_ +26 -0
  503. cuda/cccl/headers/include/cuda/std/__atomic/api/common.h +192 -0
  504. cuda/cccl/headers/include/cuda/std/__atomic/api/owned.h +138 -0
  505. cuda/cccl/headers/include/cuda/std/__atomic/api/reference.h +118 -0
  506. cuda/cccl/headers/include/cuda/std/__atomic/functions/common.h +58 -0
  507. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_local.h +218 -0
  508. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_derived.h +401 -0
  509. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated.h +3971 -0
  510. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h +177 -0
  511. cuda/cccl/headers/include/cuda/std/__atomic/functions/host.h +211 -0
  512. cuda/cccl/headers/include/cuda/std/__atomic/functions.h +33 -0
  513. cuda/cccl/headers/include/cuda/std/__atomic/order.h +159 -0
  514. cuda/cccl/headers/include/cuda/std/__atomic/platform/msvc_to_builtins.h +654 -0
  515. cuda/cccl/headers/include/cuda/std/__atomic/platform.h +93 -0
  516. cuda/cccl/headers/include/cuda/std/__atomic/scopes.h +105 -0
  517. cuda/cccl/headers/include/cuda/std/__atomic/types/base.h +250 -0
  518. cuda/cccl/headers/include/cuda/std/__atomic/types/common.h +105 -0
  519. cuda/cccl/headers/include/cuda/std/__atomic/types/locked.h +225 -0
  520. cuda/cccl/headers/include/cuda/std/__atomic/types/reference.h +73 -0
  521. cuda/cccl/headers/include/cuda/std/__atomic/types/small.h +228 -0
  522. cuda/cccl/headers/include/cuda/std/__atomic/types.h +52 -0
  523. cuda/cccl/headers/include/cuda/std/__atomic/wait/notify_wait.h +95 -0
  524. cuda/cccl/headers/include/cuda/std/__atomic/wait/polling.h +65 -0
  525. cuda/cccl/headers/include/cuda/std/__barrier/barrier.h +227 -0
  526. cuda/cccl/headers/include/cuda/std/__barrier/empty_completion.h +37 -0
  527. cuda/cccl/headers/include/cuda/std/__barrier/poll_tester.h +82 -0
  528. cuda/cccl/headers/include/cuda/std/__bit/bit_cast.h +77 -0
  529. cuda/cccl/headers/include/cuda/std/__bit/byteswap.h +183 -0
  530. cuda/cccl/headers/include/cuda/std/__bit/countl.h +167 -0
  531. cuda/cccl/headers/include/cuda/std/__bit/countr.h +185 -0
  532. cuda/cccl/headers/include/cuda/std/__bit/endian.h +39 -0
  533. cuda/cccl/headers/include/cuda/std/__bit/has_single_bit.h +43 -0
  534. cuda/cccl/headers/include/cuda/std/__bit/integral.h +124 -0
  535. cuda/cccl/headers/include/cuda/std/__bit/popcount.h +154 -0
  536. cuda/cccl/headers/include/cuda/std/__bit/reference.h +1270 -0
  537. cuda/cccl/headers/include/cuda/std/__bit/rotate.h +94 -0
  538. cuda/cccl/headers/include/cuda/std/__cccl/architecture.h +78 -0
  539. cuda/cccl/headers/include/cuda/std/__cccl/assert.h +150 -0
  540. cuda/cccl/headers/include/cuda/std/__cccl/attributes.h +207 -0
  541. cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +787 -0
  542. cuda/cccl/headers/include/cuda/std/__cccl/compiler.h +217 -0
  543. cuda/cccl/headers/include/cuda/std/__cccl/cuda_capabilities.h +43 -0
  544. cuda/cccl/headers/include/cuda/std/__cccl/cuda_toolkit.h +53 -0
  545. cuda/cccl/headers/include/cuda/std/__cccl/deprecated.h +69 -0
  546. cuda/cccl/headers/include/cuda/std/__cccl/diagnostic.h +128 -0
  547. cuda/cccl/headers/include/cuda/std/__cccl/dialect.h +126 -0
  548. cuda/cccl/headers/include/cuda/std/__cccl/epilogue.h +326 -0
  549. cuda/cccl/headers/include/cuda/std/__cccl/exceptions.h +79 -0
  550. cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +68 -0
  551. cuda/cccl/headers/include/cuda/std/__cccl/extended_data_types.h +133 -0
  552. cuda/cccl/headers/include/cuda/std/__cccl/is_non_narrowing_convertible.h +73 -0
  553. cuda/cccl/headers/include/cuda/std/__cccl/os.h +48 -0
  554. cuda/cccl/headers/include/cuda/std/__cccl/preprocessor.h +1276 -0
  555. cuda/cccl/headers/include/cuda/std/__cccl/prologue.h +267 -0
  556. cuda/cccl/headers/include/cuda/std/__cccl/ptx_isa.h +176 -0
  557. cuda/cccl/headers/include/cuda/std/__cccl/rtti.h +72 -0
  558. cuda/cccl/headers/include/cuda/std/__cccl/sequence_access.h +87 -0
  559. cuda/cccl/headers/include/cuda/std/__cccl/system_header.h +38 -0
  560. cuda/cccl/headers/include/cuda/std/__cccl/unreachable.h +31 -0
  561. cuda/cccl/headers/include/cuda/std/__cccl/version.h +26 -0
  562. cuda/cccl/headers/include/cuda/std/__cccl/visibility.h +115 -0
  563. cuda/cccl/headers/include/cuda/std/__charconv/chars_format.h +81 -0
  564. cuda/cccl/headers/include/cuda/std/__charconv/from_chars_result.h +56 -0
  565. cuda/cccl/headers/include/cuda/std/__charconv/to_chars.h +148 -0
  566. cuda/cccl/headers/include/cuda/std/__charconv/to_chars_result.h +56 -0
  567. cuda/cccl/headers/include/cuda/std/__charconv_ +30 -0
  568. cuda/cccl/headers/include/cuda/std/__cmath/abs.h +246 -0
  569. cuda/cccl/headers/include/cuda/std/__cmath/copysign.h +193 -0
  570. cuda/cccl/headers/include/cuda/std/__cmath/exponential_functions.h +724 -0
  571. cuda/cccl/headers/include/cuda/std/__cmath/fma.h +125 -0
  572. cuda/cccl/headers/include/cuda/std/__cmath/fpclassify.h +216 -0
  573. cuda/cccl/headers/include/cuda/std/__cmath/gamma.h +205 -0
  574. cuda/cccl/headers/include/cuda/std/__cmath/hyperbolic_functions.h +286 -0
  575. cuda/cccl/headers/include/cuda/std/__cmath/hypot.h +224 -0
  576. cuda/cccl/headers/include/cuda/std/__cmath/inverse_hyperbolic_functions.h +286 -0
  577. cuda/cccl/headers/include/cuda/std/__cmath/inverse_trigonometric_functions.h +371 -0
  578. cuda/cccl/headers/include/cuda/std/__cmath/isfinite.h +167 -0
  579. cuda/cccl/headers/include/cuda/std/__cmath/isinf.h +205 -0
  580. cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +180 -0
  581. cuda/cccl/headers/include/cuda/std/__cmath/isnormal.h +138 -0
  582. cuda/cccl/headers/include/cuda/std/__cmath/lerp.h +104 -0
  583. cuda/cccl/headers/include/cuda/std/__cmath/logarithms.h +582 -0
  584. cuda/cccl/headers/include/cuda/std/__cmath/min_max.h +248 -0
  585. cuda/cccl/headers/include/cuda/std/__cmath/modulo.h +208 -0
  586. cuda/cccl/headers/include/cuda/std/__cmath/nvbf16.h +58 -0
  587. cuda/cccl/headers/include/cuda/std/__cmath/nvfp16.h +58 -0
  588. cuda/cccl/headers/include/cuda/std/__cmath/remainder.h +206 -0
  589. cuda/cccl/headers/include/cuda/std/__cmath/roots.h +199 -0
  590. cuda/cccl/headers/include/cuda/std/__cmath/rounding_functions.h +984 -0
  591. cuda/cccl/headers/include/cuda/std/__cmath/signbit.h +155 -0
  592. cuda/cccl/headers/include/cuda/std/__cmath/traits.h +170 -0
  593. cuda/cccl/headers/include/cuda/std/__cmath/trigonometric_functions.h +328 -0
  594. cuda/cccl/headers/include/cuda/std/__complex/arg.h +84 -0
  595. cuda/cccl/headers/include/cuda/std/__complex/complex.h +676 -0
  596. cuda/cccl/headers/include/cuda/std/__complex/exponential_functions.h +388 -0
  597. cuda/cccl/headers/include/cuda/std/__complex/hyperbolic_functions.h +117 -0
  598. cuda/cccl/headers/include/cuda/std/__complex/inverse_hyperbolic_functions.h +215 -0
  599. cuda/cccl/headers/include/cuda/std/__complex/inverse_trigonometric_functions.h +131 -0
  600. cuda/cccl/headers/include/cuda/std/__complex/literals.h +106 -0
  601. cuda/cccl/headers/include/cuda/std/__complex/logarithms.h +53 -0
  602. cuda/cccl/headers/include/cuda/std/__complex/math.h +159 -0
  603. cuda/cccl/headers/include/cuda/std/__complex/nvbf16.h +323 -0
  604. cuda/cccl/headers/include/cuda/std/__complex/nvfp16.h +322 -0
  605. cuda/cccl/headers/include/cuda/std/__complex/roots.h +64 -0
  606. cuda/cccl/headers/include/cuda/std/__complex/trigonometric_functions.h +61 -0
  607. cuda/cccl/headers/include/cuda/std/__complex/tuple.h +107 -0
  608. cuda/cccl/headers/include/cuda/std/__complex/vector_support.h +131 -0
  609. cuda/cccl/headers/include/cuda/std/__concepts/arithmetic.h +56 -0
  610. cuda/cccl/headers/include/cuda/std/__concepts/assignable.h +64 -0
  611. cuda/cccl/headers/include/cuda/std/__concepts/boolean_testable.h +63 -0
  612. cuda/cccl/headers/include/cuda/std/__concepts/class_or_enum.h +46 -0
  613. cuda/cccl/headers/include/cuda/std/__concepts/common_reference_with.h +69 -0
  614. cuda/cccl/headers/include/cuda/std/__concepts/common_with.h +82 -0
  615. cuda/cccl/headers/include/cuda/std/__concepts/concept_macros.h +273 -0
  616. cuda/cccl/headers/include/cuda/std/__concepts/constructible.h +107 -0
  617. cuda/cccl/headers/include/cuda/std/__concepts/convertible_to.h +71 -0
  618. cuda/cccl/headers/include/cuda/std/__concepts/copyable.h +60 -0
  619. cuda/cccl/headers/include/cuda/std/__concepts/derived_from.h +57 -0
  620. cuda/cccl/headers/include/cuda/std/__concepts/destructible.h +76 -0
  621. cuda/cccl/headers/include/cuda/std/__concepts/different_from.h +38 -0
  622. cuda/cccl/headers/include/cuda/std/__concepts/equality_comparable.h +100 -0
  623. cuda/cccl/headers/include/cuda/std/__concepts/invocable.h +80 -0
  624. cuda/cccl/headers/include/cuda/std/__concepts/movable.h +58 -0
  625. cuda/cccl/headers/include/cuda/std/__concepts/predicate.h +54 -0
  626. cuda/cccl/headers/include/cuda/std/__concepts/regular.h +54 -0
  627. cuda/cccl/headers/include/cuda/std/__concepts/relation.h +77 -0
  628. cuda/cccl/headers/include/cuda/std/__concepts/same_as.h +42 -0
  629. cuda/cccl/headers/include/cuda/std/__concepts/semiregular.h +54 -0
  630. cuda/cccl/headers/include/cuda/std/__concepts/swappable.h +206 -0
  631. cuda/cccl/headers/include/cuda/std/__concepts/totally_ordered.h +101 -0
  632. cuda/cccl/headers/include/cuda/std/__cstddef/byte.h +113 -0
  633. cuda/cccl/headers/include/cuda/std/__cstddef/types.h +52 -0
  634. cuda/cccl/headers/include/cuda/std/__cstdlib/abs.h +57 -0
  635. cuda/cccl/headers/include/cuda/std/__cstdlib/aligned_alloc.h +66 -0
  636. cuda/cccl/headers/include/cuda/std/__cstdlib/div.h +96 -0
  637. cuda/cccl/headers/include/cuda/std/__cstdlib/malloc.h +69 -0
  638. cuda/cccl/headers/include/cuda/std/__cuda/api_wrapper.h +62 -0
  639. cuda/cccl/headers/include/cuda/std/__cuda/ensure_current_device.h +72 -0
  640. cuda/cccl/headers/include/cuda/std/__exception/cuda_error.h +142 -0
  641. cuda/cccl/headers/include/cuda/std/__exception/terminate.h +73 -0
  642. cuda/cccl/headers/include/cuda/std/__execution/env.h +436 -0
  643. cuda/cccl/headers/include/cuda/std/__expected/bad_expected_access.h +127 -0
  644. cuda/cccl/headers/include/cuda/std/__expected/expected.h +2001 -0
  645. cuda/cccl/headers/include/cuda/std/__expected/expected_base.h +1080 -0
  646. cuda/cccl/headers/include/cuda/std/__expected/unexpect.h +37 -0
  647. cuda/cccl/headers/include/cuda/std/__expected/unexpected.h +175 -0
  648. cuda/cccl/headers/include/cuda/std/__floating_point/arithmetic.h +56 -0
  649. cuda/cccl/headers/include/cuda/std/__floating_point/cast.h +809 -0
  650. cuda/cccl/headers/include/cuda/std/__floating_point/cccl_fp.h +125 -0
  651. cuda/cccl/headers/include/cuda/std/__floating_point/common_type.h +48 -0
  652. cuda/cccl/headers/include/cuda/std/__floating_point/constants.h +172 -0
  653. cuda/cccl/headers/include/cuda/std/__floating_point/conversion_rank_order.h +103 -0
  654. cuda/cccl/headers/include/cuda/std/__floating_point/format.h +162 -0
  655. cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +39 -0
  656. cuda/cccl/headers/include/cuda/std/__floating_point/mask.h +64 -0
  657. cuda/cccl/headers/include/cuda/std/__floating_point/native_type.h +81 -0
  658. cuda/cccl/headers/include/cuda/std/__floating_point/nvfp_types.h +58 -0
  659. cuda/cccl/headers/include/cuda/std/__floating_point/overflow_handler.h +139 -0
  660. cuda/cccl/headers/include/cuda/std/__floating_point/properties.h +229 -0
  661. cuda/cccl/headers/include/cuda/std/__floating_point/storage.h +248 -0
  662. cuda/cccl/headers/include/cuda/std/__floating_point/traits.h +172 -0
  663. cuda/cccl/headers/include/cuda/std/__format/format_error.h +76 -0
  664. cuda/cccl/headers/include/cuda/std/__format/format_parse_context.h +128 -0
  665. cuda/cccl/headers/include/cuda/std/__format_ +28 -0
  666. cuda/cccl/headers/include/cuda/std/__functional/binary_function.h +63 -0
  667. cuda/cccl/headers/include/cuda/std/__functional/binary_negate.h +65 -0
  668. cuda/cccl/headers/include/cuda/std/__functional/bind.h +337 -0
  669. cuda/cccl/headers/include/cuda/std/__functional/bind_back.h +80 -0
  670. cuda/cccl/headers/include/cuda/std/__functional/bind_front.h +73 -0
  671. cuda/cccl/headers/include/cuda/std/__functional/binder1st.h +74 -0
  672. cuda/cccl/headers/include/cuda/std/__functional/binder2nd.h +74 -0
  673. cuda/cccl/headers/include/cuda/std/__functional/compose.h +68 -0
  674. cuda/cccl/headers/include/cuda/std/__functional/default_searcher.h +75 -0
  675. cuda/cccl/headers/include/cuda/std/__functional/function.h +1277 -0
  676. cuda/cccl/headers/include/cuda/std/__functional/hash.h +650 -0
  677. cuda/cccl/headers/include/cuda/std/__functional/identity.h +61 -0
  678. cuda/cccl/headers/include/cuda/std/__functional/invoke.h +558 -0
  679. cuda/cccl/headers/include/cuda/std/__functional/is_transparent.h +43 -0
  680. cuda/cccl/headers/include/cuda/std/__functional/mem_fn.h +67 -0
  681. cuda/cccl/headers/include/cuda/std/__functional/mem_fun_ref.h +213 -0
  682. cuda/cccl/headers/include/cuda/std/__functional/not_fn.h +120 -0
  683. cuda/cccl/headers/include/cuda/std/__functional/operations.h +534 -0
  684. cuda/cccl/headers/include/cuda/std/__functional/perfect_forward.h +127 -0
  685. cuda/cccl/headers/include/cuda/std/__functional/pointer_to_binary_function.h +65 -0
  686. cuda/cccl/headers/include/cuda/std/__functional/pointer_to_unary_function.h +64 -0
  687. cuda/cccl/headers/include/cuda/std/__functional/ranges_operations.h +113 -0
  688. cuda/cccl/headers/include/cuda/std/__functional/reference_wrapper.h +113 -0
  689. cuda/cccl/headers/include/cuda/std/__functional/unary_function.h +62 -0
  690. cuda/cccl/headers/include/cuda/std/__functional/unary_negate.h +65 -0
  691. cuda/cccl/headers/include/cuda/std/__functional/unwrap_ref.h +56 -0
  692. cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +277 -0
  693. cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +35 -0
  694. cuda/cccl/headers/include/cuda/std/__fwd/array.h +36 -0
  695. cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +49 -0
  696. cuda/cccl/headers/include/cuda/std/__fwd/complex.h +34 -0
  697. cuda/cccl/headers/include/cuda/std/__fwd/fp.h +37 -0
  698. cuda/cccl/headers/include/cuda/std/__fwd/get.h +123 -0
  699. cuda/cccl/headers/include/cuda/std/__fwd/hash.h +34 -0
  700. cuda/cccl/headers/include/cuda/std/__fwd/iterator_traits.h +40 -0
  701. cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +73 -0
  702. cuda/cccl/headers/include/cuda/std/__fwd/memory_resource.h +37 -0
  703. cuda/cccl/headers/include/cuda/std/__fwd/optional.h +39 -0
  704. cuda/cccl/headers/include/cuda/std/__fwd/pair.h +34 -0
  705. cuda/cccl/headers/include/cuda/std/__fwd/reference_wrapper.h +34 -0
  706. cuda/cccl/headers/include/cuda/std/__fwd/span.h +38 -0
  707. cuda/cccl/headers/include/cuda/std/__fwd/string.h +83 -0
  708. cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +59 -0
  709. cuda/cccl/headers/include/cuda/std/__fwd/subrange.h +55 -0
  710. cuda/cccl/headers/include/cuda/std/__fwd/tuple.h +34 -0
  711. cuda/cccl/headers/include/cuda/std/__internal/cpp_dialect.h +44 -0
  712. cuda/cccl/headers/include/cuda/std/__internal/features.h +71 -0
  713. cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +102 -0
  714. cuda/cccl/headers/include/cuda/std/__iterator/access.h +128 -0
  715. cuda/cccl/headers/include/cuda/std/__iterator/advance.h +228 -0
  716. cuda/cccl/headers/include/cuda/std/__iterator/back_insert_iterator.h +100 -0
  717. cuda/cccl/headers/include/cuda/std/__iterator/bounded_iter.h +254 -0
  718. cuda/cccl/headers/include/cuda/std/__iterator/concepts.h +645 -0
  719. cuda/cccl/headers/include/cuda/std/__iterator/counted_iterator.h +464 -0
  720. cuda/cccl/headers/include/cuda/std/__iterator/data.h +61 -0
  721. cuda/cccl/headers/include/cuda/std/__iterator/default_sentinel.h +36 -0
  722. cuda/cccl/headers/include/cuda/std/__iterator/distance.h +126 -0
  723. cuda/cccl/headers/include/cuda/std/__iterator/empty.h +53 -0
  724. cuda/cccl/headers/include/cuda/std/__iterator/erase_if_container.h +53 -0
  725. cuda/cccl/headers/include/cuda/std/__iterator/front_insert_iterator.h +95 -0
  726. cuda/cccl/headers/include/cuda/std/__iterator/incrementable_traits.h +152 -0
  727. cuda/cccl/headers/include/cuda/std/__iterator/indirectly_comparable.h +55 -0
  728. cuda/cccl/headers/include/cuda/std/__iterator/insert_iterator.h +102 -0
  729. cuda/cccl/headers/include/cuda/std/__iterator/istream_iterator.h +140 -0
  730. cuda/cccl/headers/include/cuda/std/__iterator/istreambuf_iterator.h +160 -0
  731. cuda/cccl/headers/include/cuda/std/__iterator/iter_move.h +161 -0
  732. cuda/cccl/headers/include/cuda/std/__iterator/iter_swap.h +163 -0
  733. cuda/cccl/headers/include/cuda/std/__iterator/iterator.h +44 -0
  734. cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +932 -0
  735. cuda/cccl/headers/include/cuda/std/__iterator/mergeable.h +72 -0
  736. cuda/cccl/headers/include/cuda/std/__iterator/move_iterator.h +400 -0
  737. cuda/cccl/headers/include/cuda/std/__iterator/move_sentinel.h +73 -0
  738. cuda/cccl/headers/include/cuda/std/__iterator/next.h +101 -0
  739. cuda/cccl/headers/include/cuda/std/__iterator/ostream_iterator.h +98 -0
  740. cuda/cccl/headers/include/cuda/std/__iterator/ostreambuf_iterator.h +100 -0
  741. cuda/cccl/headers/include/cuda/std/__iterator/permutable.h +54 -0
  742. cuda/cccl/headers/include/cuda/std/__iterator/prev.h +91 -0
  743. cuda/cccl/headers/include/cuda/std/__iterator/projected.h +61 -0
  744. cuda/cccl/headers/include/cuda/std/__iterator/readable_traits.h +185 -0
  745. cuda/cccl/headers/include/cuda/std/__iterator/reverse_access.h +142 -0
  746. cuda/cccl/headers/include/cuda/std/__iterator/reverse_iterator.h +605 -0
  747. cuda/cccl/headers/include/cuda/std/__iterator/size.h +69 -0
  748. cuda/cccl/headers/include/cuda/std/__iterator/sortable.h +55 -0
  749. cuda/cccl/headers/include/cuda/std/__iterator/unreachable_sentinel.h +84 -0
  750. cuda/cccl/headers/include/cuda/std/__iterator/wrap_iter.h +247 -0
  751. cuda/cccl/headers/include/cuda/std/__latch/latch.h +88 -0
  752. cuda/cccl/headers/include/cuda/std/__limits/numeric_limits.h +617 -0
  753. cuda/cccl/headers/include/cuda/std/__limits/numeric_limits_ext.h +781 -0
  754. cuda/cccl/headers/include/cuda/std/__linalg/conj_if_needed.h +78 -0
  755. cuda/cccl/headers/include/cuda/std/__linalg/conjugate_transposed.h +54 -0
  756. cuda/cccl/headers/include/cuda/std/__linalg/conjugated.h +139 -0
  757. cuda/cccl/headers/include/cuda/std/__linalg/scaled.h +132 -0
  758. cuda/cccl/headers/include/cuda/std/__linalg/transposed.h +322 -0
  759. cuda/cccl/headers/include/cuda/std/__mdspan/aligned_accessor.h +98 -0
  760. cuda/cccl/headers/include/cuda/std/__mdspan/concepts.h +139 -0
  761. cuda/cccl/headers/include/cuda/std/__mdspan/default_accessor.h +73 -0
  762. cuda/cccl/headers/include/cuda/std/__mdspan/empty_base.h +358 -0
  763. cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +757 -0
  764. cuda/cccl/headers/include/cuda/std/__mdspan/layout_left.h +315 -0
  765. cuda/cccl/headers/include/cuda/std/__mdspan/layout_right.h +308 -0
  766. cuda/cccl/headers/include/cuda/std/__mdspan/layout_stride.h +605 -0
  767. cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +507 -0
  768. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_extents.h +193 -0
  769. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_helper.h +189 -0
  770. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_mapping.h +344 -0
  771. cuda/cccl/headers/include/cuda/std/__memory/addressof.h +64 -0
  772. cuda/cccl/headers/include/cuda/std/__memory/align.h +67 -0
  773. cuda/cccl/headers/include/cuda/std/__memory/allocate_at_least.h +81 -0
  774. cuda/cccl/headers/include/cuda/std/__memory/allocation_guard.h +100 -0
  775. cuda/cccl/headers/include/cuda/std/__memory/allocator.h +320 -0
  776. cuda/cccl/headers/include/cuda/std/__memory/allocator_arg_t.h +84 -0
  777. cuda/cccl/headers/include/cuda/std/__memory/allocator_destructor.h +59 -0
  778. cuda/cccl/headers/include/cuda/std/__memory/allocator_traits.h +555 -0
  779. cuda/cccl/headers/include/cuda/std/__memory/assume_aligned.h +60 -0
  780. cuda/cccl/headers/include/cuda/std/__memory/builtin_new_allocator.h +87 -0
  781. cuda/cccl/headers/include/cuda/std/__memory/compressed_pair.h +230 -0
  782. cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +248 -0
  783. cuda/cccl/headers/include/cuda/std/__memory/destruct_n.h +91 -0
  784. cuda/cccl/headers/include/cuda/std/__memory/is_sufficiently_aligned.h +43 -0
  785. cuda/cccl/headers/include/cuda/std/__memory/pointer_traits.h +261 -0
  786. cuda/cccl/headers/include/cuda/std/__memory/runtime_assume_aligned.h +62 -0
  787. cuda/cccl/headers/include/cuda/std/__memory/temporary_buffer.h +92 -0
  788. cuda/cccl/headers/include/cuda/std/__memory/uninitialized_algorithms.h +683 -0
  789. cuda/cccl/headers/include/cuda/std/__memory/unique_ptr.h +768 -0
  790. cuda/cccl/headers/include/cuda/std/__memory/uses_allocator.h +55 -0
  791. cuda/cccl/headers/include/cuda/std/__memory/voidify.h +41 -0
  792. cuda/cccl/headers/include/cuda/std/__memory_ +34 -0
  793. cuda/cccl/headers/include/cuda/std/__new/allocate.h +126 -0
  794. cuda/cccl/headers/include/cuda/std/__new/bad_alloc.h +57 -0
  795. cuda/cccl/headers/include/cuda/std/__new/launder.h +49 -0
  796. cuda/cccl/headers/include/cuda/std/__new_ +29 -0
  797. cuda/cccl/headers/include/cuda/std/__numeric/accumulate.h +56 -0
  798. cuda/cccl/headers/include/cuda/std/__numeric/adjacent_difference.h +72 -0
  799. cuda/cccl/headers/include/cuda/std/__numeric/exclusive_scan.h +66 -0
  800. cuda/cccl/headers/include/cuda/std/__numeric/gcd_lcm.h +80 -0
  801. cuda/cccl/headers/include/cuda/std/__numeric/inclusive_scan.h +73 -0
  802. cuda/cccl/headers/include/cuda/std/__numeric/inner_product.h +62 -0
  803. cuda/cccl/headers/include/cuda/std/__numeric/iota.h +42 -0
  804. cuda/cccl/headers/include/cuda/std/__numeric/midpoint.h +100 -0
  805. cuda/cccl/headers/include/cuda/std/__numeric/partial_sum.h +69 -0
  806. cuda/cccl/headers/include/cuda/std/__numeric/reduce.h +60 -0
  807. cuda/cccl/headers/include/cuda/std/__numeric/transform_exclusive_scan.h +51 -0
  808. cuda/cccl/headers/include/cuda/std/__numeric/transform_inclusive_scan.h +65 -0
  809. cuda/cccl/headers/include/cuda/std/__numeric/transform_reduce.h +72 -0
  810. cuda/cccl/headers/include/cuda/std/__optional/bad_optional_access.h +75 -0
  811. cuda/cccl/headers/include/cuda/std/__optional/hash.h +53 -0
  812. cuda/cccl/headers/include/cuda/std/__optional/make_optional.h +61 -0
  813. cuda/cccl/headers/include/cuda/std/__optional/nullopt.h +43 -0
  814. cuda/cccl/headers/include/cuda/std/__optional/optional.h +900 -0
  815. cuda/cccl/headers/include/cuda/std/__optional/optional_base.h +430 -0
  816. cuda/cccl/headers/include/cuda/std/__optional/optional_ref.h +324 -0
  817. cuda/cccl/headers/include/cuda/std/__random/generate_canonical.h +56 -0
  818. cuda/cccl/headers/include/cuda/std/__random/is_seed_sequence.h +39 -0
  819. cuda/cccl/headers/include/cuda/std/__random/is_valid.h +106 -0
  820. cuda/cccl/headers/include/cuda/std/__random/linear_congruential_engine.h +397 -0
  821. cuda/cccl/headers/include/cuda/std/__random/uniform_int_distribution.h +335 -0
  822. cuda/cccl/headers/include/cuda/std/__random/uniform_real_distribution.h +183 -0
  823. cuda/cccl/headers/include/cuda/std/__random_ +29 -0
  824. cuda/cccl/headers/include/cuda/std/__ranges/access.h +303 -0
  825. cuda/cccl/headers/include/cuda/std/__ranges/all.h +97 -0
  826. cuda/cccl/headers/include/cuda/std/__ranges/concepts.h +313 -0
  827. cuda/cccl/headers/include/cuda/std/__ranges/counted.h +90 -0
  828. cuda/cccl/headers/include/cuda/std/__ranges/dangling.h +54 -0
  829. cuda/cccl/headers/include/cuda/std/__ranges/data.h +136 -0
  830. cuda/cccl/headers/include/cuda/std/__ranges/empty.h +109 -0
  831. cuda/cccl/headers/include/cuda/std/__ranges/empty_view.h +77 -0
  832. cuda/cccl/headers/include/cuda/std/__ranges/enable_borrowed_range.h +41 -0
  833. cuda/cccl/headers/include/cuda/std/__ranges/enable_view.h +77 -0
  834. cuda/cccl/headers/include/cuda/std/__ranges/from_range.h +36 -0
  835. cuda/cccl/headers/include/cuda/std/__ranges/iota_view.h +266 -0
  836. cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +410 -0
  837. cuda/cccl/headers/include/cuda/std/__ranges/owning_view.h +161 -0
  838. cuda/cccl/headers/include/cuda/std/__ranges/range_adaptor.h +113 -0
  839. cuda/cccl/headers/include/cuda/std/__ranges/rbegin.h +174 -0
  840. cuda/cccl/headers/include/cuda/std/__ranges/ref_view.h +121 -0
  841. cuda/cccl/headers/include/cuda/std/__ranges/rend.h +181 -0
  842. cuda/cccl/headers/include/cuda/std/__ranges/repeat_view.h +345 -0
  843. cuda/cccl/headers/include/cuda/std/__ranges/single_view.h +155 -0
  844. cuda/cccl/headers/include/cuda/std/__ranges/size.h +199 -0
  845. cuda/cccl/headers/include/cuda/std/__ranges/subrange.h +513 -0
  846. cuda/cccl/headers/include/cuda/std/__ranges/take_view.h +475 -0
  847. cuda/cccl/headers/include/cuda/std/__ranges/take_while_view.h +259 -0
  848. cuda/cccl/headers/include/cuda/std/__ranges/transform_view.h +522 -0
  849. cuda/cccl/headers/include/cuda/std/__ranges/unwrap_end.h +53 -0
  850. cuda/cccl/headers/include/cuda/std/__ranges/view_interface.h +181 -0
  851. cuda/cccl/headers/include/cuda/std/__ranges/views.h +38 -0
  852. cuda/cccl/headers/include/cuda/std/__semaphore/atomic_semaphore.h +233 -0
  853. cuda/cccl/headers/include/cuda/std/__semaphore/counting_semaphore.h +51 -0
  854. cuda/cccl/headers/include/cuda/std/__string/char_traits.h +191 -0
  855. cuda/cccl/headers/include/cuda/std/__string/constexpr_c_functions.h +581 -0
  856. cuda/cccl/headers/include/cuda/std/__string/helper_functions.h +296 -0
  857. cuda/cccl/headers/include/cuda/std/__string/string_view.h +244 -0
  858. cuda/cccl/headers/include/cuda/std/__string_ +29 -0
  859. cuda/cccl/headers/include/cuda/std/__system_error/errc.h +51 -0
  860. cuda/cccl/headers/include/cuda/std/__system_error_ +26 -0
  861. cuda/cccl/headers/include/cuda/std/__thread/threading_support.h +105 -0
  862. cuda/cccl/headers/include/cuda/std/__thread/threading_support_cuda.h +47 -0
  863. cuda/cccl/headers/include/cuda/std/__thread/threading_support_external.h +41 -0
  864. cuda/cccl/headers/include/cuda/std/__thread/threading_support_pthread.h +142 -0
  865. cuda/cccl/headers/include/cuda/std/__thread/threading_support_win32.h +87 -0
  866. cuda/cccl/headers/include/cuda/std/__tuple_dir/ignore.h +51 -0
  867. cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +98 -0
  868. cuda/cccl/headers/include/cuda/std/__tuple_dir/sfinae_helpers.h +269 -0
  869. cuda/cccl/headers/include/cuda/std/__tuple_dir/structured_bindings.h +216 -0
  870. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_element.h +70 -0
  871. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_indices.h +44 -0
  872. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +90 -0
  873. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +73 -0
  874. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_size.h +79 -0
  875. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_types.h +35 -0
  876. cuda/cccl/headers/include/cuda/std/__tuple_dir/vector_types.h +277 -0
  877. cuda/cccl/headers/include/cuda/std/__type_traits/add_const.h +40 -0
  878. cuda/cccl/headers/include/cuda/std/__type_traits/add_cv.h +40 -0
  879. cuda/cccl/headers/include/cuda/std/__type_traits/add_lvalue_reference.h +62 -0
  880. cuda/cccl/headers/include/cuda/std/__type_traits/add_pointer.h +65 -0
  881. cuda/cccl/headers/include/cuda/std/__type_traits/add_rvalue_reference.h +62 -0
  882. cuda/cccl/headers/include/cuda/std/__type_traits/add_volatile.h +40 -0
  883. cuda/cccl/headers/include/cuda/std/__type_traits/aligned_storage.h +149 -0
  884. cuda/cccl/headers/include/cuda/std/__type_traits/aligned_union.h +62 -0
  885. cuda/cccl/headers/include/cuda/std/__type_traits/alignment_of.h +41 -0
  886. cuda/cccl/headers/include/cuda/std/__type_traits/always_false.h +35 -0
  887. cuda/cccl/headers/include/cuda/std/__type_traits/can_extract_key.h +69 -0
  888. cuda/cccl/headers/include/cuda/std/__type_traits/common_reference.h +262 -0
  889. cuda/cccl/headers/include/cuda/std/__type_traits/common_type.h +174 -0
  890. cuda/cccl/headers/include/cuda/std/__type_traits/conditional.h +65 -0
  891. cuda/cccl/headers/include/cuda/std/__type_traits/conjunction.h +67 -0
  892. cuda/cccl/headers/include/cuda/std/__type_traits/copy_cv.h +50 -0
  893. cuda/cccl/headers/include/cuda/std/__type_traits/copy_cvref.h +148 -0
  894. cuda/cccl/headers/include/cuda/std/__type_traits/decay.h +83 -0
  895. cuda/cccl/headers/include/cuda/std/__type_traits/dependent_type.h +35 -0
  896. cuda/cccl/headers/include/cuda/std/__type_traits/disjunction.h +77 -0
  897. cuda/cccl/headers/include/cuda/std/__type_traits/enable_if.h +43 -0
  898. cuda/cccl/headers/include/cuda/std/__type_traits/extent.h +68 -0
  899. cuda/cccl/headers/include/cuda/std/__type_traits/fold.h +47 -0
  900. cuda/cccl/headers/include/cuda/std/__type_traits/has_unique_object_representation.h +47 -0
  901. cuda/cccl/headers/include/cuda/std/__type_traits/has_virtual_destructor.h +51 -0
  902. cuda/cccl/headers/include/cuda/std/__type_traits/integral_constant.h +62 -0
  903. cuda/cccl/headers/include/cuda/std/__type_traits/is_abstract.h +40 -0
  904. cuda/cccl/headers/include/cuda/std/__type_traits/is_aggregate.h +44 -0
  905. cuda/cccl/headers/include/cuda/std/__type_traits/is_allocator.h +46 -0
  906. cuda/cccl/headers/include/cuda/std/__type_traits/is_arithmetic.h +42 -0
  907. cuda/cccl/headers/include/cuda/std/__type_traits/is_array.h +62 -0
  908. cuda/cccl/headers/include/cuda/std/__type_traits/is_assignable.h +78 -0
  909. cuda/cccl/headers/include/cuda/std/__type_traits/is_base_of.h +83 -0
  910. cuda/cccl/headers/include/cuda/std/__type_traits/is_bounded_array.h +44 -0
  911. cuda/cccl/headers/include/cuda/std/__type_traits/is_callable.h +60 -0
  912. cuda/cccl/headers/include/cuda/std/__type_traits/is_char_like_type.h +38 -0
  913. cuda/cccl/headers/include/cuda/std/__type_traits/is_class.h +68 -0
  914. cuda/cccl/headers/include/cuda/std/__type_traits/is_compound.h +54 -0
  915. cuda/cccl/headers/include/cuda/std/__type_traits/is_const.h +56 -0
  916. cuda/cccl/headers/include/cuda/std/__type_traits/is_constant_evaluated.h +51 -0
  917. cuda/cccl/headers/include/cuda/std/__type_traits/is_constructible.h +174 -0
  918. cuda/cccl/headers/include/cuda/std/__type_traits/is_convertible.h +214 -0
  919. cuda/cccl/headers/include/cuda/std/__type_traits/is_copy_assignable.h +43 -0
  920. cuda/cccl/headers/include/cuda/std/__type_traits/is_copy_constructible.h +43 -0
  921. cuda/cccl/headers/include/cuda/std/__type_traits/is_core_convertible.h +47 -0
  922. cuda/cccl/headers/include/cuda/std/__type_traits/is_corresponding_member.h +42 -0
  923. cuda/cccl/headers/include/cuda/std/__type_traits/is_default_constructible.h +40 -0
  924. cuda/cccl/headers/include/cuda/std/__type_traits/is_destructible.h +115 -0
  925. cuda/cccl/headers/include/cuda/std/__type_traits/is_empty.h +73 -0
  926. cuda/cccl/headers/include/cuda/std/__type_traits/is_enum.h +68 -0
  927. cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_arithmetic.h +38 -0
  928. cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_floating_point.h +81 -0
  929. cuda/cccl/headers/include/cuda/std/__type_traits/is_final.h +56 -0
  930. cuda/cccl/headers/include/cuda/std/__type_traits/is_floating_point.h +53 -0
  931. cuda/cccl/headers/include/cuda/std/__type_traits/is_function.h +61 -0
  932. cuda/cccl/headers/include/cuda/std/__type_traits/is_fundamental.h +56 -0
  933. cuda/cccl/headers/include/cuda/std/__type_traits/is_implicitly_default_constructible.h +57 -0
  934. cuda/cccl/headers/include/cuda/std/__type_traits/is_integer.h +45 -0
  935. cuda/cccl/headers/include/cuda/std/__type_traits/is_integral.h +123 -0
  936. cuda/cccl/headers/include/cuda/std/__type_traits/is_layout_compatible.h +45 -0
  937. cuda/cccl/headers/include/cuda/std/__type_traits/is_literal_type.h +59 -0
  938. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_function_pointer.h +79 -0
  939. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_object_pointer.h +57 -0
  940. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_pointer.h +57 -0
  941. cuda/cccl/headers/include/cuda/std/__type_traits/is_move_assignable.h +43 -0
  942. cuda/cccl/headers/include/cuda/std/__type_traits/is_move_constructible.h +42 -0
  943. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_assignable.h +70 -0
  944. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_constructible.h +84 -0
  945. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_convertible.h +59 -0
  946. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_copy_assignable.h +60 -0
  947. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_copy_constructible.h +43 -0
  948. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_default_constructible.h +54 -0
  949. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_destructible.h +82 -0
  950. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_move_assignable.h +60 -0
  951. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_move_constructible.h +42 -0
  952. cuda/cccl/headers/include/cuda/std/__type_traits/is_null_pointer.h +43 -0
  953. cuda/cccl/headers/include/cuda/std/__type_traits/is_object.h +57 -0
  954. cuda/cccl/headers/include/cuda/std/__type_traits/is_one_of.h +37 -0
  955. cuda/cccl/headers/include/cuda/std/__type_traits/is_pod.h +62 -0
  956. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer.h +60 -0
  957. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer_interconvertible_base_of.h +87 -0
  958. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer_interconvertible_with_class.h +42 -0
  959. cuda/cccl/headers/include/cuda/std/__type_traits/is_polymorphic.h +63 -0
  960. cuda/cccl/headers/include/cuda/std/__type_traits/is_primary_template.h +119 -0
  961. cuda/cccl/headers/include/cuda/std/__type_traits/is_reference.h +95 -0
  962. cuda/cccl/headers/include/cuda/std/__type_traits/is_reference_wrapper.h +50 -0
  963. cuda/cccl/headers/include/cuda/std/__type_traits/is_referenceable.h +55 -0
  964. cuda/cccl/headers/include/cuda/std/__type_traits/is_same.h +84 -0
  965. cuda/cccl/headers/include/cuda/std/__type_traits/is_scalar.h +60 -0
  966. cuda/cccl/headers/include/cuda/std/__type_traits/is_scoped_enum.h +49 -0
  967. cuda/cccl/headers/include/cuda/std/__type_traits/is_signed.h +65 -0
  968. cuda/cccl/headers/include/cuda/std/__type_traits/is_signed_integer.h +59 -0
  969. cuda/cccl/headers/include/cuda/std/__type_traits/is_standard_layout.h +57 -0
  970. cuda/cccl/headers/include/cuda/std/__type_traits/is_swappable.h +201 -0
  971. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivial.h +56 -0
  972. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_assignable.h +70 -0
  973. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_constructible.h +82 -0
  974. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_assignable.h +60 -0
  975. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_constructible.h +61 -0
  976. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copyable.h +56 -0
  977. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_default_constructible.h +55 -0
  978. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_destructible.h +73 -0
  979. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_assignable.h +60 -0
  980. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_constructible.h +58 -0
  981. cuda/cccl/headers/include/cuda/std/__type_traits/is_unbounded_array.h +43 -0
  982. cuda/cccl/headers/include/cuda/std/__type_traits/is_union.h +57 -0
  983. cuda/cccl/headers/include/cuda/std/__type_traits/is_unsigned.h +66 -0
  984. cuda/cccl/headers/include/cuda/std/__type_traits/is_unsigned_integer.h +59 -0
  985. cuda/cccl/headers/include/cuda/std/__type_traits/is_valid_expansion.h +41 -0
  986. cuda/cccl/headers/include/cuda/std/__type_traits/is_void.h +55 -0
  987. cuda/cccl/headers/include/cuda/std/__type_traits/is_volatile.h +56 -0
  988. cuda/cccl/headers/include/cuda/std/__type_traits/lazy.h +35 -0
  989. cuda/cccl/headers/include/cuda/std/__type_traits/make_const_lvalue_ref.h +36 -0
  990. cuda/cccl/headers/include/cuda/std/__type_traits/make_nbit_int.h +107 -0
  991. cuda/cccl/headers/include/cuda/std/__type_traits/make_signed.h +140 -0
  992. cuda/cccl/headers/include/cuda/std/__type_traits/make_unsigned.h +151 -0
  993. cuda/cccl/headers/include/cuda/std/__type_traits/maybe_const.h +36 -0
  994. cuda/cccl/headers/include/cuda/std/__type_traits/nat.h +39 -0
  995. cuda/cccl/headers/include/cuda/std/__type_traits/negation.h +44 -0
  996. cuda/cccl/headers/include/cuda/std/__type_traits/num_bits.h +123 -0
  997. cuda/cccl/headers/include/cuda/std/__type_traits/promote.h +163 -0
  998. cuda/cccl/headers/include/cuda/std/__type_traits/rank.h +60 -0
  999. cuda/cccl/headers/include/cuda/std/__type_traits/reference_constructs_from_temporary.h +57 -0
  1000. cuda/cccl/headers/include/cuda/std/__type_traits/reference_converts_from_temporary.h +56 -0
  1001. cuda/cccl/headers/include/cuda/std/__type_traits/remove_all_extents.h +66 -0
  1002. cuda/cccl/headers/include/cuda/std/__type_traits/remove_const.h +59 -0
  1003. cuda/cccl/headers/include/cuda/std/__type_traits/remove_const_ref.h +37 -0
  1004. cuda/cccl/headers/include/cuda/std/__type_traits/remove_cv.h +57 -0
  1005. cuda/cccl/headers/include/cuda/std/__type_traits/remove_cvref.h +57 -0
  1006. cuda/cccl/headers/include/cuda/std/__type_traits/remove_extent.h +65 -0
  1007. cuda/cccl/headers/include/cuda/std/__type_traits/remove_pointer.h +73 -0
  1008. cuda/cccl/headers/include/cuda/std/__type_traits/remove_reference.h +72 -0
  1009. cuda/cccl/headers/include/cuda/std/__type_traits/remove_volatile.h +58 -0
  1010. cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +47 -0
  1011. cuda/cccl/headers/include/cuda/std/__type_traits/type_identity.h +40 -0
  1012. cuda/cccl/headers/include/cuda/std/__type_traits/type_list.h +1067 -0
  1013. cuda/cccl/headers/include/cuda/std/__type_traits/type_set.h +132 -0
  1014. cuda/cccl/headers/include/cuda/std/__type_traits/underlying_type.h +66 -0
  1015. cuda/cccl/headers/include/cuda/std/__type_traits/void_t.h +34 -0
  1016. cuda/cccl/headers/include/cuda/std/__utility/as_const.h +52 -0
  1017. cuda/cccl/headers/include/cuda/std/__utility/auto_cast.h +32 -0
  1018. cuda/cccl/headers/include/cuda/std/__utility/cmp.h +116 -0
  1019. cuda/cccl/headers/include/cuda/std/__utility/convert_to_integral.h +101 -0
  1020. cuda/cccl/headers/include/cuda/std/__utility/declval.h +63 -0
  1021. cuda/cccl/headers/include/cuda/std/__utility/exception_guard.h +161 -0
  1022. cuda/cccl/headers/include/cuda/std/__utility/exchange.h +46 -0
  1023. cuda/cccl/headers/include/cuda/std/__utility/forward.h +59 -0
  1024. cuda/cccl/headers/include/cuda/std/__utility/forward_like.h +55 -0
  1025. cuda/cccl/headers/include/cuda/std/__utility/in_place.h +77 -0
  1026. cuda/cccl/headers/include/cuda/std/__utility/integer_sequence.h +251 -0
  1027. cuda/cccl/headers/include/cuda/std/__utility/monostate.h +99 -0
  1028. cuda/cccl/headers/include/cuda/std/__utility/move.h +74 -0
  1029. cuda/cccl/headers/include/cuda/std/__utility/pair.h +802 -0
  1030. cuda/cccl/headers/include/cuda/std/__utility/piecewise_construct.h +37 -0
  1031. cuda/cccl/headers/include/cuda/std/__utility/pod_tuple.h +510 -0
  1032. cuda/cccl/headers/include/cuda/std/__utility/priority_tag.h +40 -0
  1033. cuda/cccl/headers/include/cuda/std/__utility/rel_ops.h +63 -0
  1034. cuda/cccl/headers/include/cuda/std/__utility/swap.h +64 -0
  1035. cuda/cccl/headers/include/cuda/std/__utility/to_underlying.h +40 -0
  1036. cuda/cccl/headers/include/cuda/std/__utility/typeid.h +421 -0
  1037. cuda/cccl/headers/include/cuda/std/__utility/undefined.h +34 -0
  1038. cuda/cccl/headers/include/cuda/std/__utility/unreachable.h +37 -0
  1039. cuda/cccl/headers/include/cuda/std/array +520 -0
  1040. cuda/cccl/headers/include/cuda/std/atomic +818 -0
  1041. cuda/cccl/headers/include/cuda/std/barrier +43 -0
  1042. cuda/cccl/headers/include/cuda/std/bit +35 -0
  1043. cuda/cccl/headers/include/cuda/std/bitset +994 -0
  1044. cuda/cccl/headers/include/cuda/std/cassert +28 -0
  1045. cuda/cccl/headers/include/cuda/std/ccomplex +15 -0
  1046. cuda/cccl/headers/include/cuda/std/cfloat +59 -0
  1047. cuda/cccl/headers/include/cuda/std/chrono +26 -0
  1048. cuda/cccl/headers/include/cuda/std/climits +61 -0
  1049. cuda/cccl/headers/include/cuda/std/cmath +25 -0
  1050. cuda/cccl/headers/include/cuda/std/complex +50 -0
  1051. cuda/cccl/headers/include/cuda/std/concepts +48 -0
  1052. cuda/cccl/headers/include/cuda/std/cstddef +28 -0
  1053. cuda/cccl/headers/include/cuda/std/cstdint +178 -0
  1054. cuda/cccl/headers/include/cuda/std/cstdlib +30 -0
  1055. cuda/cccl/headers/include/cuda/std/cstring +110 -0
  1056. cuda/cccl/headers/include/cuda/std/ctime +152 -0
  1057. cuda/cccl/headers/include/cuda/std/detail/__config +45 -0
  1058. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +235 -0
  1059. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/algorithm +1720 -0
  1060. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/chrono +3628 -0
  1061. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/cmath +667 -0
  1062. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/iosfwd +128 -0
  1063. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/stdexcept +120 -0
  1064. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/tuple +1367 -0
  1065. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/variant +2154 -0
  1066. cuda/cccl/headers/include/cuda/std/execution +27 -0
  1067. cuda/cccl/headers/include/cuda/std/expected +30 -0
  1068. cuda/cccl/headers/include/cuda/std/functional +56 -0
  1069. cuda/cccl/headers/include/cuda/std/initializer_list +36 -0
  1070. cuda/cccl/headers/include/cuda/std/inplace_vector +2163 -0
  1071. cuda/cccl/headers/include/cuda/std/iterator +70 -0
  1072. cuda/cccl/headers/include/cuda/std/latch +34 -0
  1073. cuda/cccl/headers/include/cuda/std/limits +28 -0
  1074. cuda/cccl/headers/include/cuda/std/linalg +30 -0
  1075. cuda/cccl/headers/include/cuda/std/mdspan +38 -0
  1076. cuda/cccl/headers/include/cuda/std/memory +39 -0
  1077. cuda/cccl/headers/include/cuda/std/numbers +335 -0
  1078. cuda/cccl/headers/include/cuda/std/numeric +41 -0
  1079. cuda/cccl/headers/include/cuda/std/optional +31 -0
  1080. cuda/cccl/headers/include/cuda/std/ranges +69 -0
  1081. cuda/cccl/headers/include/cuda/std/ratio +417 -0
  1082. cuda/cccl/headers/include/cuda/std/semaphore +31 -0
  1083. cuda/cccl/headers/include/cuda/std/source_location +83 -0
  1084. cuda/cccl/headers/include/cuda/std/span +640 -0
  1085. cuda/cccl/headers/include/cuda/std/string_view +788 -0
  1086. cuda/cccl/headers/include/cuda/std/tuple +26 -0
  1087. cuda/cccl/headers/include/cuda/std/type_traits +176 -0
  1088. cuda/cccl/headers/include/cuda/std/utility +70 -0
  1089. cuda/cccl/headers/include/cuda/std/variant +25 -0
  1090. cuda/cccl/headers/include/cuda/std/version +245 -0
  1091. cuda/cccl/headers/include/cuda/stream_ref +54 -0
  1092. cuda/cccl/headers/include/cuda/type_traits +27 -0
  1093. cuda/cccl/headers/include/cuda/utility +27 -0
  1094. cuda/cccl/headers/include/cuda/version +16 -0
  1095. cuda/cccl/headers/include/cuda/warp +28 -0
  1096. cuda/cccl/headers/include/cuda/work_stealing +26 -0
  1097. cuda/cccl/headers/include/nv/detail/__preprocessor +169 -0
  1098. cuda/cccl/headers/include/nv/detail/__target_macros +641 -0
  1099. cuda/cccl/headers/include/nv/target +240 -0
  1100. cuda/cccl/headers/include/thrust/addressof.h +22 -0
  1101. cuda/cccl/headers/include/thrust/adjacent_difference.h +254 -0
  1102. cuda/cccl/headers/include/thrust/advance.h +59 -0
  1103. cuda/cccl/headers/include/thrust/allocate_unique.h +299 -0
  1104. cuda/cccl/headers/include/thrust/binary_search.h +1910 -0
  1105. cuda/cccl/headers/include/thrust/complex.h +859 -0
  1106. cuda/cccl/headers/include/thrust/copy.h +506 -0
  1107. cuda/cccl/headers/include/thrust/count.h +245 -0
  1108. cuda/cccl/headers/include/thrust/detail/adjacent_difference.inl +95 -0
  1109. cuda/cccl/headers/include/thrust/detail/algorithm_wrapper.h +37 -0
  1110. cuda/cccl/headers/include/thrust/detail/alignment.h +81 -0
  1111. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +350 -0
  1112. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.inl +371 -0
  1113. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +45 -0
  1114. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.inl +242 -0
  1115. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +39 -0
  1116. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.inl +137 -0
  1117. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +39 -0
  1118. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.inl +99 -0
  1119. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +53 -0
  1120. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.inl +68 -0
  1121. cuda/cccl/headers/include/thrust/detail/allocator/no_throw_allocator.h +76 -0
  1122. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +102 -0
  1123. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.inl +86 -0
  1124. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +79 -0
  1125. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.inl +81 -0
  1126. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.h +39 -0
  1127. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.inl +98 -0
  1128. cuda/cccl/headers/include/thrust/detail/allocator_aware_execution_policy.h +99 -0
  1129. cuda/cccl/headers/include/thrust/detail/binary_search.inl +525 -0
  1130. cuda/cccl/headers/include/thrust/detail/caching_allocator.h +47 -0
  1131. cuda/cccl/headers/include/thrust/detail/complex/arithmetic.h +255 -0
  1132. cuda/cccl/headers/include/thrust/detail/complex/c99math.h +64 -0
  1133. cuda/cccl/headers/include/thrust/detail/complex/catrig.h +875 -0
  1134. cuda/cccl/headers/include/thrust/detail/complex/catrigf.h +589 -0
  1135. cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +233 -0
  1136. cuda/cccl/headers/include/thrust/detail/complex/ccoshf.h +161 -0
  1137. cuda/cccl/headers/include/thrust/detail/complex/cexp.h +195 -0
  1138. cuda/cccl/headers/include/thrust/detail/complex/cexpf.h +173 -0
  1139. cuda/cccl/headers/include/thrust/detail/complex/clog.h +223 -0
  1140. cuda/cccl/headers/include/thrust/detail/complex/clogf.h +210 -0
  1141. cuda/cccl/headers/include/thrust/detail/complex/complex.inl +263 -0
  1142. cuda/cccl/headers/include/thrust/detail/complex/cpow.h +50 -0
  1143. cuda/cccl/headers/include/thrust/detail/complex/cproj.h +81 -0
  1144. cuda/cccl/headers/include/thrust/detail/complex/csinh.h +228 -0
  1145. cuda/cccl/headers/include/thrust/detail/complex/csinhf.h +168 -0
  1146. cuda/cccl/headers/include/thrust/detail/complex/csqrt.h +178 -0
  1147. cuda/cccl/headers/include/thrust/detail/complex/csqrtf.h +174 -0
  1148. cuda/cccl/headers/include/thrust/detail/complex/ctanh.h +208 -0
  1149. cuda/cccl/headers/include/thrust/detail/complex/ctanhf.h +133 -0
  1150. cuda/cccl/headers/include/thrust/detail/complex/math_private.h +138 -0
  1151. cuda/cccl/headers/include/thrust/detail/complex/stream.h +73 -0
  1152. cuda/cccl/headers/include/thrust/detail/config/compiler.h +38 -0
  1153. cuda/cccl/headers/include/thrust/detail/config/config.h +43 -0
  1154. cuda/cccl/headers/include/thrust/detail/config/cpp_dialect.h +78 -0
  1155. cuda/cccl/headers/include/thrust/detail/config/device_system.h +55 -0
  1156. cuda/cccl/headers/include/thrust/detail/config/host_system.h +48 -0
  1157. cuda/cccl/headers/include/thrust/detail/config/memory_resource.h +41 -0
  1158. cuda/cccl/headers/include/thrust/detail/config/namespace.h +162 -0
  1159. cuda/cccl/headers/include/thrust/detail/config/simple_defines.h +48 -0
  1160. cuda/cccl/headers/include/thrust/detail/config.h +36 -0
  1161. cuda/cccl/headers/include/thrust/detail/contiguous_storage.h +228 -0
  1162. cuda/cccl/headers/include/thrust/detail/contiguous_storage.inl +273 -0
  1163. cuda/cccl/headers/include/thrust/detail/copy.h +72 -0
  1164. cuda/cccl/headers/include/thrust/detail/copy.inl +129 -0
  1165. cuda/cccl/headers/include/thrust/detail/copy_if.h +62 -0
  1166. cuda/cccl/headers/include/thrust/detail/copy_if.inl +102 -0
  1167. cuda/cccl/headers/include/thrust/detail/count.h +55 -0
  1168. cuda/cccl/headers/include/thrust/detail/count.inl +89 -0
  1169. cuda/cccl/headers/include/thrust/detail/device_delete.inl +52 -0
  1170. cuda/cccl/headers/include/thrust/detail/device_free.inl +47 -0
  1171. cuda/cccl/headers/include/thrust/detail/device_malloc.inl +60 -0
  1172. cuda/cccl/headers/include/thrust/detail/device_new.inl +61 -0
  1173. cuda/cccl/headers/include/thrust/detail/device_ptr.inl +48 -0
  1174. cuda/cccl/headers/include/thrust/detail/equal.inl +93 -0
  1175. cuda/cccl/headers/include/thrust/detail/event_error.h +160 -0
  1176. cuda/cccl/headers/include/thrust/detail/execute_with_allocator.h +80 -0
  1177. cuda/cccl/headers/include/thrust/detail/execute_with_allocator_fwd.h +61 -0
  1178. cuda/cccl/headers/include/thrust/detail/execution_policy.h +80 -0
  1179. cuda/cccl/headers/include/thrust/detail/extrema.inl +184 -0
  1180. cuda/cccl/headers/include/thrust/detail/fill.inl +86 -0
  1181. cuda/cccl/headers/include/thrust/detail/find.inl +113 -0
  1182. cuda/cccl/headers/include/thrust/detail/for_each.inl +84 -0
  1183. cuda/cccl/headers/include/thrust/detail/function.h +49 -0
  1184. cuda/cccl/headers/include/thrust/detail/functional/actor.h +214 -0
  1185. cuda/cccl/headers/include/thrust/detail/functional/operators.h +386 -0
  1186. cuda/cccl/headers/include/thrust/detail/gather.inl +173 -0
  1187. cuda/cccl/headers/include/thrust/detail/generate.inl +86 -0
  1188. cuda/cccl/headers/include/thrust/detail/get_iterator_value.h +62 -0
  1189. cuda/cccl/headers/include/thrust/detail/inner_product.inl +118 -0
  1190. cuda/cccl/headers/include/thrust/detail/integer_math.h +130 -0
  1191. cuda/cccl/headers/include/thrust/detail/internal_functional.h +289 -0
  1192. cuda/cccl/headers/include/thrust/detail/logical.inl +113 -0
  1193. cuda/cccl/headers/include/thrust/detail/malloc_and_free.h +77 -0
  1194. cuda/cccl/headers/include/thrust/detail/malloc_and_free_fwd.h +45 -0
  1195. cuda/cccl/headers/include/thrust/detail/memory_algorithms.h +209 -0
  1196. cuda/cccl/headers/include/thrust/detail/memory_wrapper.h +40 -0
  1197. cuda/cccl/headers/include/thrust/detail/merge.inl +276 -0
  1198. cuda/cccl/headers/include/thrust/detail/mismatch.inl +94 -0
  1199. cuda/cccl/headers/include/thrust/detail/mpl/math.h +164 -0
  1200. cuda/cccl/headers/include/thrust/detail/numeric_wrapper.h +37 -0
  1201. cuda/cccl/headers/include/thrust/detail/overlapped_copy.h +124 -0
  1202. cuda/cccl/headers/include/thrust/detail/partition.inl +378 -0
  1203. cuda/cccl/headers/include/thrust/detail/pointer.h +217 -0
  1204. cuda/cccl/headers/include/thrust/detail/pointer.inl +172 -0
  1205. cuda/cccl/headers/include/thrust/detail/preprocessor.h +652 -0
  1206. cuda/cccl/headers/include/thrust/detail/random_bijection.h +177 -0
  1207. cuda/cccl/headers/include/thrust/detail/range/head_flags.h +116 -0
  1208. cuda/cccl/headers/include/thrust/detail/range/tail_flags.h +130 -0
  1209. cuda/cccl/headers/include/thrust/detail/raw_pointer_cast.h +52 -0
  1210. cuda/cccl/headers/include/thrust/detail/raw_reference_cast.h +189 -0
  1211. cuda/cccl/headers/include/thrust/detail/reduce.inl +377 -0
  1212. cuda/cccl/headers/include/thrust/detail/reference.h +500 -0
  1213. cuda/cccl/headers/include/thrust/detail/reference_forward_declaration.h +35 -0
  1214. cuda/cccl/headers/include/thrust/detail/remove.inl +213 -0
  1215. cuda/cccl/headers/include/thrust/detail/replace.inl +231 -0
  1216. cuda/cccl/headers/include/thrust/detail/reverse.inl +88 -0
  1217. cuda/cccl/headers/include/thrust/detail/scan.inl +518 -0
  1218. cuda/cccl/headers/include/thrust/detail/scatter.inl +157 -0
  1219. cuda/cccl/headers/include/thrust/detail/seq.h +54 -0
  1220. cuda/cccl/headers/include/thrust/detail/sequence.inl +109 -0
  1221. cuda/cccl/headers/include/thrust/detail/set_operations.inl +981 -0
  1222. cuda/cccl/headers/include/thrust/detail/shuffle.inl +86 -0
  1223. cuda/cccl/headers/include/thrust/detail/sort.inl +373 -0
  1224. cuda/cccl/headers/include/thrust/detail/static_assert.h +58 -0
  1225. cuda/cccl/headers/include/thrust/detail/static_map.h +167 -0
  1226. cuda/cccl/headers/include/thrust/detail/swap_ranges.inl +65 -0
  1227. cuda/cccl/headers/include/thrust/detail/tabulate.inl +62 -0
  1228. cuda/cccl/headers/include/thrust/detail/temporary_array.h +153 -0
  1229. cuda/cccl/headers/include/thrust/detail/temporary_array.inl +120 -0
  1230. cuda/cccl/headers/include/thrust/detail/temporary_buffer.h +81 -0
  1231. cuda/cccl/headers/include/thrust/detail/transform_reduce.inl +69 -0
  1232. cuda/cccl/headers/include/thrust/detail/transform_scan.inl +161 -0
  1233. cuda/cccl/headers/include/thrust/detail/trivial_sequence.h +130 -0
  1234. cuda/cccl/headers/include/thrust/detail/tuple_meta_transform.h +61 -0
  1235. cuda/cccl/headers/include/thrust/detail/type_deduction.h +62 -0
  1236. cuda/cccl/headers/include/thrust/detail/type_traits/has_member_function.h +47 -0
  1237. cuda/cccl/headers/include/thrust/detail/type_traits/has_nested_type.h +43 -0
  1238. cuda/cccl/headers/include/thrust/detail/type_traits/is_call_possible.h +167 -0
  1239. cuda/cccl/headers/include/thrust/detail/type_traits/is_commutative.h +69 -0
  1240. cuda/cccl/headers/include/thrust/detail/type_traits/is_metafunction_defined.h +39 -0
  1241. cuda/cccl/headers/include/thrust/detail/type_traits/is_thrust_pointer.h +59 -0
  1242. cuda/cccl/headers/include/thrust/detail/type_traits/iterator/is_discard_iterator.h +44 -0
  1243. cuda/cccl/headers/include/thrust/detail/type_traits/iterator/is_output_iterator.h +46 -0
  1244. cuda/cccl/headers/include/thrust/detail/type_traits/minimum_type.h +89 -0
  1245. cuda/cccl/headers/include/thrust/detail/type_traits/pointer_traits.h +332 -0
  1246. cuda/cccl/headers/include/thrust/detail/type_traits.h +136 -0
  1247. cuda/cccl/headers/include/thrust/detail/uninitialized_copy.inl +90 -0
  1248. cuda/cccl/headers/include/thrust/detail/uninitialized_fill.inl +86 -0
  1249. cuda/cccl/headers/include/thrust/detail/unique.inl +373 -0
  1250. cuda/cccl/headers/include/thrust/detail/use_default.h +34 -0
  1251. cuda/cccl/headers/include/thrust/detail/util/align.h +59 -0
  1252. cuda/cccl/headers/include/thrust/detail/vector_base.h +615 -0
  1253. cuda/cccl/headers/include/thrust/detail/vector_base.inl +1212 -0
  1254. cuda/cccl/headers/include/thrust/device_allocator.h +134 -0
  1255. cuda/cccl/headers/include/thrust/device_delete.h +59 -0
  1256. cuda/cccl/headers/include/thrust/device_free.h +72 -0
  1257. cuda/cccl/headers/include/thrust/device_make_unique.h +56 -0
  1258. cuda/cccl/headers/include/thrust/device_malloc.h +108 -0
  1259. cuda/cccl/headers/include/thrust/device_malloc_allocator.h +190 -0
  1260. cuda/cccl/headers/include/thrust/device_new.h +91 -0
  1261. cuda/cccl/headers/include/thrust/device_new_allocator.h +179 -0
  1262. cuda/cccl/headers/include/thrust/device_ptr.h +202 -0
  1263. cuda/cccl/headers/include/thrust/device_reference.h +986 -0
  1264. cuda/cccl/headers/include/thrust/device_vector.h +574 -0
  1265. cuda/cccl/headers/include/thrust/distance.h +43 -0
  1266. cuda/cccl/headers/include/thrust/equal.h +247 -0
  1267. cuda/cccl/headers/include/thrust/execution_policy.h +384 -0
  1268. cuda/cccl/headers/include/thrust/extrema.h +657 -0
  1269. cuda/cccl/headers/include/thrust/fill.h +201 -0
  1270. cuda/cccl/headers/include/thrust/find.h +382 -0
  1271. cuda/cccl/headers/include/thrust/for_each.h +261 -0
  1272. cuda/cccl/headers/include/thrust/functional.h +396 -0
  1273. cuda/cccl/headers/include/thrust/gather.h +464 -0
  1274. cuda/cccl/headers/include/thrust/generate.h +193 -0
  1275. cuda/cccl/headers/include/thrust/host_vector.h +576 -0
  1276. cuda/cccl/headers/include/thrust/inner_product.h +264 -0
  1277. cuda/cccl/headers/include/thrust/iterator/constant_iterator.h +219 -0
  1278. cuda/cccl/headers/include/thrust/iterator/counting_iterator.h +335 -0
  1279. cuda/cccl/headers/include/thrust/iterator/detail/any_assign.h +48 -0
  1280. cuda/cccl/headers/include/thrust/iterator/detail/any_system_tag.h +43 -0
  1281. cuda/cccl/headers/include/thrust/iterator/detail/device_system_tag.h +38 -0
  1282. cuda/cccl/headers/include/thrust/iterator/detail/host_system_tag.h +38 -0
  1283. cuda/cccl/headers/include/thrust/iterator/detail/iterator_adaptor_base.h +81 -0
  1284. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_system.h +51 -0
  1285. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_traversal.h +62 -0
  1286. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h +57 -0
  1287. cuda/cccl/headers/include/thrust/iterator/detail/iterator_facade_category.h +199 -0
  1288. cuda/cccl/headers/include/thrust/iterator/detail/iterator_traversal_tags.h +50 -0
  1289. cuda/cccl/headers/include/thrust/iterator/detail/minimum_system.h +53 -0
  1290. cuda/cccl/headers/include/thrust/iterator/detail/normal_iterator.h +69 -0
  1291. cuda/cccl/headers/include/thrust/iterator/detail/retag.h +104 -0
  1292. cuda/cccl/headers/include/thrust/iterator/detail/tagged_iterator.h +81 -0
  1293. cuda/cccl/headers/include/thrust/iterator/detail/tuple_of_iterator_references.h +174 -0
  1294. cuda/cccl/headers/include/thrust/iterator/discard_iterator.h +163 -0
  1295. cuda/cccl/headers/include/thrust/iterator/iterator_adaptor.h +251 -0
  1296. cuda/cccl/headers/include/thrust/iterator/iterator_categories.h +215 -0
  1297. cuda/cccl/headers/include/thrust/iterator/iterator_facade.h +660 -0
  1298. cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +275 -0
  1299. cuda/cccl/headers/include/thrust/iterator/offset_iterator.h +192 -0
  1300. cuda/cccl/headers/include/thrust/iterator/permutation_iterator.h +204 -0
  1301. cuda/cccl/headers/include/thrust/iterator/retag.h +74 -0
  1302. cuda/cccl/headers/include/thrust/iterator/reverse_iterator.h +221 -0
  1303. cuda/cccl/headers/include/thrust/iterator/shuffle_iterator.h +184 -0
  1304. cuda/cccl/headers/include/thrust/iterator/strided_iterator.h +152 -0
  1305. cuda/cccl/headers/include/thrust/iterator/tabulate_output_iterator.h +149 -0
  1306. cuda/cccl/headers/include/thrust/iterator/transform_input_output_iterator.h +226 -0
  1307. cuda/cccl/headers/include/thrust/iterator/transform_iterator.h +351 -0
  1308. cuda/cccl/headers/include/thrust/iterator/transform_output_iterator.h +190 -0
  1309. cuda/cccl/headers/include/thrust/iterator/zip_iterator.h +357 -0
  1310. cuda/cccl/headers/include/thrust/logical.h +290 -0
  1311. cuda/cccl/headers/include/thrust/memory.h +395 -0
  1312. cuda/cccl/headers/include/thrust/merge.h +725 -0
  1313. cuda/cccl/headers/include/thrust/mismatch.h +261 -0
  1314. cuda/cccl/headers/include/thrust/mr/allocator.h +229 -0
  1315. cuda/cccl/headers/include/thrust/mr/device_memory_resource.h +41 -0
  1316. cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +526 -0
  1317. cuda/cccl/headers/include/thrust/mr/disjoint_sync_pool.h +118 -0
  1318. cuda/cccl/headers/include/thrust/mr/disjoint_tls_pool.h +68 -0
  1319. cuda/cccl/headers/include/thrust/mr/fancy_pointer_resource.h +67 -0
  1320. cuda/cccl/headers/include/thrust/mr/host_memory_resource.h +38 -0
  1321. cuda/cccl/headers/include/thrust/mr/memory_resource.h +217 -0
  1322. cuda/cccl/headers/include/thrust/mr/new.h +100 -0
  1323. cuda/cccl/headers/include/thrust/mr/polymorphic_adaptor.h +63 -0
  1324. cuda/cccl/headers/include/thrust/mr/pool.h +526 -0
  1325. cuda/cccl/headers/include/thrust/mr/pool_options.h +174 -0
  1326. cuda/cccl/headers/include/thrust/mr/sync_pool.h +114 -0
  1327. cuda/cccl/headers/include/thrust/mr/tls_pool.h +65 -0
  1328. cuda/cccl/headers/include/thrust/mr/universal_memory_resource.h +29 -0
  1329. cuda/cccl/headers/include/thrust/mr/validator.h +56 -0
  1330. cuda/cccl/headers/include/thrust/pair.h +102 -0
  1331. cuda/cccl/headers/include/thrust/partition.h +1383 -0
  1332. cuda/cccl/headers/include/thrust/per_device_resource.h +98 -0
  1333. cuda/cccl/headers/include/thrust/random/detail/discard_block_engine.inl +184 -0
  1334. cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine.inl +155 -0
  1335. cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine_discard.h +104 -0
  1336. cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine.inl +151 -0
  1337. cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h +53 -0
  1338. cuda/cccl/headers/include/thrust/random/detail/mod.h +101 -0
  1339. cuda/cccl/headers/include/thrust/random/detail/normal_distribution.inl +187 -0
  1340. cuda/cccl/headers/include/thrust/random/detail/normal_distribution_base.h +160 -0
  1341. cuda/cccl/headers/include/thrust/random/detail/random_core_access.h +63 -0
  1342. cuda/cccl/headers/include/thrust/random/detail/subtract_with_carry_engine.inl +201 -0
  1343. cuda/cccl/headers/include/thrust/random/detail/uniform_int_distribution.inl +198 -0
  1344. cuda/cccl/headers/include/thrust/random/detail/uniform_real_distribution.inl +198 -0
  1345. cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine.inl +183 -0
  1346. cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine_max.h +217 -0
  1347. cuda/cccl/headers/include/thrust/random/discard_block_engine.h +240 -0
  1348. cuda/cccl/headers/include/thrust/random/linear_congruential_engine.h +289 -0
  1349. cuda/cccl/headers/include/thrust/random/linear_feedback_shift_engine.h +217 -0
  1350. cuda/cccl/headers/include/thrust/random/normal_distribution.h +257 -0
  1351. cuda/cccl/headers/include/thrust/random/subtract_with_carry_engine.h +247 -0
  1352. cuda/cccl/headers/include/thrust/random/uniform_int_distribution.h +261 -0
  1353. cuda/cccl/headers/include/thrust/random/uniform_real_distribution.h +258 -0
  1354. cuda/cccl/headers/include/thrust/random/xor_combine_engine.h +255 -0
  1355. cuda/cccl/headers/include/thrust/random.h +120 -0
  1356. cuda/cccl/headers/include/thrust/reduce.h +1112 -0
  1357. cuda/cccl/headers/include/thrust/remove.h +768 -0
  1358. cuda/cccl/headers/include/thrust/replace.h +827 -0
  1359. cuda/cccl/headers/include/thrust/reverse.h +213 -0
  1360. cuda/cccl/headers/include/thrust/scan.h +1671 -0
  1361. cuda/cccl/headers/include/thrust/scatter.h +446 -0
  1362. cuda/cccl/headers/include/thrust/sequence.h +277 -0
  1363. cuda/cccl/headers/include/thrust/set_operations.h +3026 -0
  1364. cuda/cccl/headers/include/thrust/shuffle.h +182 -0
  1365. cuda/cccl/headers/include/thrust/sort.h +1320 -0
  1366. cuda/cccl/headers/include/thrust/swap.h +147 -0
  1367. cuda/cccl/headers/include/thrust/system/cpp/detail/adjacent_difference.h +30 -0
  1368. cuda/cccl/headers/include/thrust/system/cpp/detail/assign_value.h +30 -0
  1369. cuda/cccl/headers/include/thrust/system/cpp/detail/binary_search.h +32 -0
  1370. cuda/cccl/headers/include/thrust/system/cpp/detail/copy.h +30 -0
  1371. cuda/cccl/headers/include/thrust/system/cpp/detail/copy_if.h +30 -0
  1372. cuda/cccl/headers/include/thrust/system/cpp/detail/count.h +29 -0
  1373. cuda/cccl/headers/include/thrust/system/cpp/detail/equal.h +29 -0
  1374. cuda/cccl/headers/include/thrust/system/cpp/detail/execution_policy.h +90 -0
  1375. cuda/cccl/headers/include/thrust/system/cpp/detail/extrema.h +30 -0
  1376. cuda/cccl/headers/include/thrust/system/cpp/detail/fill.h +29 -0
  1377. cuda/cccl/headers/include/thrust/system/cpp/detail/find.h +30 -0
  1378. cuda/cccl/headers/include/thrust/system/cpp/detail/for_each.h +30 -0
  1379. cuda/cccl/headers/include/thrust/system/cpp/detail/gather.h +29 -0
  1380. cuda/cccl/headers/include/thrust/system/cpp/detail/generate.h +29 -0
  1381. cuda/cccl/headers/include/thrust/system/cpp/detail/get_value.h +30 -0
  1382. cuda/cccl/headers/include/thrust/system/cpp/detail/inner_product.h +29 -0
  1383. cuda/cccl/headers/include/thrust/system/cpp/detail/iter_swap.h +30 -0
  1384. cuda/cccl/headers/include/thrust/system/cpp/detail/logical.h +29 -0
  1385. cuda/cccl/headers/include/thrust/system/cpp/detail/malloc_and_free.h +30 -0
  1386. cuda/cccl/headers/include/thrust/system/cpp/detail/memory.inl +60 -0
  1387. cuda/cccl/headers/include/thrust/system/cpp/detail/merge.h +30 -0
  1388. cuda/cccl/headers/include/thrust/system/cpp/detail/mismatch.h +29 -0
  1389. cuda/cccl/headers/include/thrust/system/cpp/detail/par.h +62 -0
  1390. cuda/cccl/headers/include/thrust/system/cpp/detail/partition.h +30 -0
  1391. cuda/cccl/headers/include/thrust/system/cpp/detail/per_device_resource.h +29 -0
  1392. cuda/cccl/headers/include/thrust/system/cpp/detail/reduce.h +30 -0
  1393. cuda/cccl/headers/include/thrust/system/cpp/detail/reduce_by_key.h +30 -0
  1394. cuda/cccl/headers/include/thrust/system/cpp/detail/remove.h +30 -0
  1395. cuda/cccl/headers/include/thrust/system/cpp/detail/replace.h +29 -0
  1396. cuda/cccl/headers/include/thrust/system/cpp/detail/reverse.h +29 -0
  1397. cuda/cccl/headers/include/thrust/system/cpp/detail/scan.h +30 -0
  1398. cuda/cccl/headers/include/thrust/system/cpp/detail/scan_by_key.h +30 -0
  1399. cuda/cccl/headers/include/thrust/system/cpp/detail/scatter.h +29 -0
  1400. cuda/cccl/headers/include/thrust/system/cpp/detail/sequence.h +29 -0
  1401. cuda/cccl/headers/include/thrust/system/cpp/detail/set_operations.h +30 -0
  1402. cuda/cccl/headers/include/thrust/system/cpp/detail/sort.h +30 -0
  1403. cuda/cccl/headers/include/thrust/system/cpp/detail/swap_ranges.h +29 -0
  1404. cuda/cccl/headers/include/thrust/system/cpp/detail/tabulate.h +29 -0
  1405. cuda/cccl/headers/include/thrust/system/cpp/detail/temporary_buffer.h +29 -0
  1406. cuda/cccl/headers/include/thrust/system/cpp/detail/transform.h +29 -0
  1407. cuda/cccl/headers/include/thrust/system/cpp/detail/transform_reduce.h +29 -0
  1408. cuda/cccl/headers/include/thrust/system/cpp/detail/transform_scan.h +29 -0
  1409. cuda/cccl/headers/include/thrust/system/cpp/detail/uninitialized_copy.h +29 -0
  1410. cuda/cccl/headers/include/thrust/system/cpp/detail/uninitialized_fill.h +29 -0
  1411. cuda/cccl/headers/include/thrust/system/cpp/detail/unique.h +30 -0
  1412. cuda/cccl/headers/include/thrust/system/cpp/detail/unique_by_key.h +30 -0
  1413. cuda/cccl/headers/include/thrust/system/cpp/detail/vector.inl +130 -0
  1414. cuda/cccl/headers/include/thrust/system/cpp/execution_policy.h +161 -0
  1415. cuda/cccl/headers/include/thrust/system/cpp/memory.h +109 -0
  1416. cuda/cccl/headers/include/thrust/system/cpp/memory_resource.h +75 -0
  1417. cuda/cccl/headers/include/thrust/system/cpp/pointer.h +123 -0
  1418. cuda/cccl/headers/include/thrust/system/cpp/vector.h +99 -0
  1419. cuda/cccl/headers/include/thrust/system/cuda/config.h +123 -0
  1420. cuda/cccl/headers/include/thrust/system/cuda/detail/adjacent_difference.h +219 -0
  1421. cuda/cccl/headers/include/thrust/system/cuda/detail/assign_value.h +124 -0
  1422. cuda/cccl/headers/include/thrust/system/cuda/detail/binary_search.h +29 -0
  1423. cuda/cccl/headers/include/thrust/system/cuda/detail/cdp_dispatch.h +72 -0
  1424. cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +129 -0
  1425. cuda/cccl/headers/include/thrust/system/cuda/detail/copy_if.h +255 -0
  1426. cuda/cccl/headers/include/thrust/system/cuda/detail/core/agent_launcher.h +289 -0
  1427. cuda/cccl/headers/include/thrust/system/cuda/detail/core/load_iterator.h +58 -0
  1428. cuda/cccl/headers/include/thrust/system/cuda/detail/core/make_load_iterator.h +53 -0
  1429. cuda/cccl/headers/include/thrust/system/cuda/detail/core/triple_chevron_launch.h +191 -0
  1430. cuda/cccl/headers/include/thrust/system/cuda/detail/core/util.h +611 -0
  1431. cuda/cccl/headers/include/thrust/system/cuda/detail/count.h +75 -0
  1432. cuda/cccl/headers/include/thrust/system/cuda/detail/cross_system.h +243 -0
  1433. cuda/cccl/headers/include/thrust/system/cuda/detail/dispatch.h +210 -0
  1434. cuda/cccl/headers/include/thrust/system/cuda/detail/equal.h +64 -0
  1435. cuda/cccl/headers/include/thrust/system/cuda/detail/error.inl +96 -0
  1436. cuda/cccl/headers/include/thrust/system/cuda/detail/execution_policy.h +113 -0
  1437. cuda/cccl/headers/include/thrust/system/cuda/detail/extrema.h +476 -0
  1438. cuda/cccl/headers/include/thrust/system/cuda/detail/fill.h +82 -0
  1439. cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +272 -0
  1440. cuda/cccl/headers/include/thrust/system/cuda/detail/for_each.h +83 -0
  1441. cuda/cccl/headers/include/thrust/system/cuda/detail/gather.h +91 -0
  1442. cuda/cccl/headers/include/thrust/system/cuda/detail/generate.h +85 -0
  1443. cuda/cccl/headers/include/thrust/system/cuda/detail/get_value.h +65 -0
  1444. cuda/cccl/headers/include/thrust/system/cuda/detail/inner_product.h +75 -0
  1445. cuda/cccl/headers/include/thrust/system/cuda/detail/internal/copy_cross_system.h +204 -0
  1446. cuda/cccl/headers/include/thrust/system/cuda/detail/internal/copy_device_to_device.h +89 -0
  1447. cuda/cccl/headers/include/thrust/system/cuda/detail/iter_swap.h +69 -0
  1448. cuda/cccl/headers/include/thrust/system/cuda/detail/logical.h +29 -0
  1449. cuda/cccl/headers/include/thrust/system/cuda/detail/make_unsigned_special.h +61 -0
  1450. cuda/cccl/headers/include/thrust/system/cuda/detail/malloc_and_free.h +121 -0
  1451. cuda/cccl/headers/include/thrust/system/cuda/detail/memory.inl +57 -0
  1452. cuda/cccl/headers/include/thrust/system/cuda/detail/merge.h +228 -0
  1453. cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +217 -0
  1454. cuda/cccl/headers/include/thrust/system/cuda/detail/par.h +237 -0
  1455. cuda/cccl/headers/include/thrust/system/cuda/detail/par_to_seq.h +95 -0
  1456. cuda/cccl/headers/include/thrust/system/cuda/detail/parallel_for.h +81 -0
  1457. cuda/cccl/headers/include/thrust/system/cuda/detail/partition.h +405 -0
  1458. cuda/cccl/headers/include/thrust/system/cuda/detail/per_device_resource.h +72 -0
  1459. cuda/cccl/headers/include/thrust/system/cuda/detail/reduce.h +781 -0
  1460. cuda/cccl/headers/include/thrust/system/cuda/detail/reduce_by_key.h +1000 -0
  1461. cuda/cccl/headers/include/thrust/system/cuda/detail/remove.h +107 -0
  1462. cuda/cccl/headers/include/thrust/system/cuda/detail/replace.h +152 -0
  1463. cuda/cccl/headers/include/thrust/system/cuda/detail/reverse.h +88 -0
  1464. cuda/cccl/headers/include/thrust/system/cuda/detail/scan.h +342 -0
  1465. cuda/cccl/headers/include/thrust/system/cuda/detail/scan_by_key.h +415 -0
  1466. cuda/cccl/headers/include/thrust/system/cuda/detail/scatter.h +79 -0
  1467. cuda/cccl/headers/include/thrust/system/cuda/detail/sequence.h +29 -0
  1468. cuda/cccl/headers/include/thrust/system/cuda/detail/set_operations.h +1736 -0
  1469. cuda/cccl/headers/include/thrust/system/cuda/detail/sort.h +482 -0
  1470. cuda/cccl/headers/include/thrust/system/cuda/detail/swap_ranges.h +75 -0
  1471. cuda/cccl/headers/include/thrust/system/cuda/detail/tabulate.h +75 -0
  1472. cuda/cccl/headers/include/thrust/system/cuda/detail/temporary_buffer.h +132 -0
  1473. cuda/cccl/headers/include/thrust/system/cuda/detail/terminate.h +53 -0
  1474. cuda/cccl/headers/include/thrust/system/cuda/detail/transform.h +403 -0
  1475. cuda/cccl/headers/include/thrust/system/cuda/detail/transform_reduce.h +143 -0
  1476. cuda/cccl/headers/include/thrust/system/cuda/detail/transform_scan.h +119 -0
  1477. cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_copy.h +94 -0
  1478. cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_fill.h +91 -0
  1479. cuda/cccl/headers/include/thrust/system/cuda/detail/unique.h +646 -0
  1480. cuda/cccl/headers/include/thrust/system/cuda/detail/unique_by_key.h +311 -0
  1481. cuda/cccl/headers/include/thrust/system/cuda/detail/util.h +251 -0
  1482. cuda/cccl/headers/include/thrust/system/cuda/error.h +168 -0
  1483. cuda/cccl/headers/include/thrust/system/cuda/execution_policy.h +39 -0
  1484. cuda/cccl/headers/include/thrust/system/cuda/memory.h +122 -0
  1485. cuda/cccl/headers/include/thrust/system/cuda/memory_resource.h +122 -0
  1486. cuda/cccl/headers/include/thrust/system/cuda/pointer.h +160 -0
  1487. cuda/cccl/headers/include/thrust/system/cuda/vector.h +108 -0
  1488. cuda/cccl/headers/include/thrust/system/detail/adl/adjacent_difference.h +51 -0
  1489. cuda/cccl/headers/include/thrust/system/detail/adl/assign_value.h +51 -0
  1490. cuda/cccl/headers/include/thrust/system/detail/adl/binary_search.h +51 -0
  1491. cuda/cccl/headers/include/thrust/system/detail/adl/copy.h +51 -0
  1492. cuda/cccl/headers/include/thrust/system/detail/adl/copy_if.h +52 -0
  1493. cuda/cccl/headers/include/thrust/system/detail/adl/count.h +51 -0
  1494. cuda/cccl/headers/include/thrust/system/detail/adl/equal.h +51 -0
  1495. cuda/cccl/headers/include/thrust/system/detail/adl/extrema.h +51 -0
  1496. cuda/cccl/headers/include/thrust/system/detail/adl/fill.h +51 -0
  1497. cuda/cccl/headers/include/thrust/system/detail/adl/find.h +51 -0
  1498. cuda/cccl/headers/include/thrust/system/detail/adl/for_each.h +51 -0
  1499. cuda/cccl/headers/include/thrust/system/detail/adl/gather.h +51 -0
  1500. cuda/cccl/headers/include/thrust/system/detail/adl/generate.h +51 -0
  1501. cuda/cccl/headers/include/thrust/system/detail/adl/get_value.h +51 -0
  1502. cuda/cccl/headers/include/thrust/system/detail/adl/inner_product.h +51 -0
  1503. cuda/cccl/headers/include/thrust/system/detail/adl/iter_swap.h +51 -0
  1504. cuda/cccl/headers/include/thrust/system/detail/adl/logical.h +51 -0
  1505. cuda/cccl/headers/include/thrust/system/detail/adl/malloc_and_free.h +51 -0
  1506. cuda/cccl/headers/include/thrust/system/detail/adl/merge.h +51 -0
  1507. cuda/cccl/headers/include/thrust/system/detail/adl/mismatch.h +51 -0
  1508. cuda/cccl/headers/include/thrust/system/detail/adl/partition.h +51 -0
  1509. cuda/cccl/headers/include/thrust/system/detail/adl/per_device_resource.h +48 -0
  1510. cuda/cccl/headers/include/thrust/system/detail/adl/reduce.h +51 -0
  1511. cuda/cccl/headers/include/thrust/system/detail/adl/reduce_by_key.h +51 -0
  1512. cuda/cccl/headers/include/thrust/system/detail/adl/remove.h +51 -0
  1513. cuda/cccl/headers/include/thrust/system/detail/adl/replace.h +51 -0
  1514. cuda/cccl/headers/include/thrust/system/detail/adl/reverse.h +51 -0
  1515. cuda/cccl/headers/include/thrust/system/detail/adl/scan.h +51 -0
  1516. cuda/cccl/headers/include/thrust/system/detail/adl/scan_by_key.h +51 -0
  1517. cuda/cccl/headers/include/thrust/system/detail/adl/scatter.h +51 -0
  1518. cuda/cccl/headers/include/thrust/system/detail/adl/sequence.h +51 -0
  1519. cuda/cccl/headers/include/thrust/system/detail/adl/set_operations.h +51 -0
  1520. cuda/cccl/headers/include/thrust/system/detail/adl/sort.h +51 -0
  1521. cuda/cccl/headers/include/thrust/system/detail/adl/swap_ranges.h +51 -0
  1522. cuda/cccl/headers/include/thrust/system/detail/adl/tabulate.h +51 -0
  1523. cuda/cccl/headers/include/thrust/system/detail/adl/temporary_buffer.h +51 -0
  1524. cuda/cccl/headers/include/thrust/system/detail/adl/transform.h +51 -0
  1525. cuda/cccl/headers/include/thrust/system/detail/adl/transform_reduce.h +51 -0
  1526. cuda/cccl/headers/include/thrust/system/detail/adl/transform_scan.h +51 -0
  1527. cuda/cccl/headers/include/thrust/system/detail/adl/uninitialized_copy.h +51 -0
  1528. cuda/cccl/headers/include/thrust/system/detail/adl/uninitialized_fill.h +51 -0
  1529. cuda/cccl/headers/include/thrust/system/detail/adl/unique.h +51 -0
  1530. cuda/cccl/headers/include/thrust/system/detail/adl/unique_by_key.h +51 -0
  1531. cuda/cccl/headers/include/thrust/system/detail/bad_alloc.h +64 -0
  1532. cuda/cccl/headers/include/thrust/system/detail/errno.h +125 -0
  1533. cuda/cccl/headers/include/thrust/system/detail/error_category.inl +302 -0
  1534. cuda/cccl/headers/include/thrust/system/detail/error_code.inl +173 -0
  1535. cuda/cccl/headers/include/thrust/system/detail/error_condition.inl +121 -0
  1536. cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.h +59 -0
  1537. cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.inl +85 -0
  1538. cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.h +167 -0
  1539. cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.inl +391 -0
  1540. cuda/cccl/headers/include/thrust/system/detail/generic/copy.h +51 -0
  1541. cuda/cccl/headers/include/thrust/system/detail/generic/copy.inl +70 -0
  1542. cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.h +64 -0
  1543. cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.inl +152 -0
  1544. cuda/cccl/headers/include/thrust/system/detail/generic/count.h +54 -0
  1545. cuda/cccl/headers/include/thrust/system/detail/generic/count.inl +90 -0
  1546. cuda/cccl/headers/include/thrust/system/detail/generic/equal.h +55 -0
  1547. cuda/cccl/headers/include/thrust/system/detail/generic/equal.inl +66 -0
  1548. cuda/cccl/headers/include/thrust/system/detail/generic/extrema.h +72 -0
  1549. cuda/cccl/headers/include/thrust/system/detail/generic/extrema.inl +258 -0
  1550. cuda/cccl/headers/include/thrust/system/detail/generic/fill.h +60 -0
  1551. cuda/cccl/headers/include/thrust/system/detail/generic/find.h +55 -0
  1552. cuda/cccl/headers/include/thrust/system/detail/generic/find.inl +143 -0
  1553. cuda/cccl/headers/include/thrust/system/detail/generic/for_each.h +64 -0
  1554. cuda/cccl/headers/include/thrust/system/detail/generic/gather.h +79 -0
  1555. cuda/cccl/headers/include/thrust/system/detail/generic/gather.inl +102 -0
  1556. cuda/cccl/headers/include/thrust/system/detail/generic/generate.h +51 -0
  1557. cuda/cccl/headers/include/thrust/system/detail/generic/generate.inl +63 -0
  1558. cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.h +66 -0
  1559. cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.inl +78 -0
  1560. cuda/cccl/headers/include/thrust/system/detail/generic/logical.h +65 -0
  1561. cuda/cccl/headers/include/thrust/system/detail/generic/memory.h +70 -0
  1562. cuda/cccl/headers/include/thrust/system/detail/generic/memory.inl +83 -0
  1563. cuda/cccl/headers/include/thrust/system/detail/generic/merge.h +105 -0
  1564. cuda/cccl/headers/include/thrust/system/detail/generic/merge.inl +154 -0
  1565. cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.h +55 -0
  1566. cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.inl +74 -0
  1567. cuda/cccl/headers/include/thrust/system/detail/generic/partition.h +135 -0
  1568. cuda/cccl/headers/include/thrust/system/detail/generic/partition.inl +213 -0
  1569. cuda/cccl/headers/include/thrust/system/detail/generic/per_device_resource.h +49 -0
  1570. cuda/cccl/headers/include/thrust/system/detail/generic/reduce.h +77 -0
  1571. cuda/cccl/headers/include/thrust/system/detail/generic/reduce.inl +106 -0
  1572. cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.h +89 -0
  1573. cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.inl +192 -0
  1574. cuda/cccl/headers/include/thrust/system/detail/generic/remove.h +92 -0
  1575. cuda/cccl/headers/include/thrust/system/detail/generic/remove.inl +127 -0
  1576. cuda/cccl/headers/include/thrust/system/detail/generic/replace.h +101 -0
  1577. cuda/cccl/headers/include/thrust/system/detail/generic/replace.inl +181 -0
  1578. cuda/cccl/headers/include/thrust/system/detail/generic/reverse.h +54 -0
  1579. cuda/cccl/headers/include/thrust/system/detail/generic/reverse.inl +72 -0
  1580. cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.h +78 -0
  1581. cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.inl +141 -0
  1582. cuda/cccl/headers/include/thrust/system/detail/generic/scan.h +78 -0
  1583. cuda/cccl/headers/include/thrust/system/detail/generic/scan.inl +91 -0
  1584. cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.h +132 -0
  1585. cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.inl +238 -0
  1586. cuda/cccl/headers/include/thrust/system/detail/generic/scatter.h +79 -0
  1587. cuda/cccl/headers/include/thrust/system/detail/generic/scatter.inl +91 -0
  1588. cuda/cccl/headers/include/thrust/system/detail/generic/select_system.h +96 -0
  1589. cuda/cccl/headers/include/thrust/system/detail/generic/sequence.h +70 -0
  1590. cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.h +288 -0
  1591. cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.inl +482 -0
  1592. cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.h +60 -0
  1593. cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.inl +131 -0
  1594. cuda/cccl/headers/include/thrust/system/detail/generic/sort.h +119 -0
  1595. cuda/cccl/headers/include/thrust/system/detail/generic/sort.inl +181 -0
  1596. cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.h +50 -0
  1597. cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.inl +82 -0
  1598. cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.h +47 -0
  1599. cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.inl +60 -0
  1600. cuda/cccl/headers/include/thrust/system/detail/generic/tag.h +53 -0
  1601. cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.h +60 -0
  1602. cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.inl +88 -0
  1603. cuda/cccl/headers/include/thrust/system/detail/generic/transform.h +395 -0
  1604. cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.h +56 -0
  1605. cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.inl +62 -0
  1606. cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.h +86 -0
  1607. cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.inl +119 -0
  1608. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.h +51 -0
  1609. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.inl +172 -0
  1610. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.h +51 -0
  1611. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.inl +121 -0
  1612. cuda/cccl/headers/include/thrust/system/detail/generic/unique.h +77 -0
  1613. cuda/cccl/headers/include/thrust/system/detail/generic/unique.inl +119 -0
  1614. cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.h +87 -0
  1615. cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.inl +132 -0
  1616. cuda/cccl/headers/include/thrust/system/detail/internal/decompose.h +123 -0
  1617. cuda/cccl/headers/include/thrust/system/detail/sequential/adjacent_difference.h +76 -0
  1618. cuda/cccl/headers/include/thrust/system/detail/sequential/assign_value.h +48 -0
  1619. cuda/cccl/headers/include/thrust/system/detail/sequential/binary_search.h +142 -0
  1620. cuda/cccl/headers/include/thrust/system/detail/sequential/copy.h +55 -0
  1621. cuda/cccl/headers/include/thrust/system/detail/sequential/copy.inl +125 -0
  1622. cuda/cccl/headers/include/thrust/system/detail/sequential/copy_backward.h +55 -0
  1623. cuda/cccl/headers/include/thrust/system/detail/sequential/copy_if.h +77 -0
  1624. cuda/cccl/headers/include/thrust/system/detail/sequential/count.h +29 -0
  1625. cuda/cccl/headers/include/thrust/system/detail/sequential/equal.h +29 -0
  1626. cuda/cccl/headers/include/thrust/system/detail/sequential/execution_policy.h +78 -0
  1627. cuda/cccl/headers/include/thrust/system/detail/sequential/extrema.h +116 -0
  1628. cuda/cccl/headers/include/thrust/system/detail/sequential/fill.h +29 -0
  1629. cuda/cccl/headers/include/thrust/system/detail/sequential/find.h +68 -0
  1630. cuda/cccl/headers/include/thrust/system/detail/sequential/for_each.h +80 -0
  1631. cuda/cccl/headers/include/thrust/system/detail/sequential/gather.h +29 -0
  1632. cuda/cccl/headers/include/thrust/system/detail/sequential/general_copy.h +129 -0
  1633. cuda/cccl/headers/include/thrust/system/detail/sequential/generate.h +29 -0
  1634. cuda/cccl/headers/include/thrust/system/detail/sequential/get_value.h +49 -0
  1635. cuda/cccl/headers/include/thrust/system/detail/sequential/inner_product.h +29 -0
  1636. cuda/cccl/headers/include/thrust/system/detail/sequential/insertion_sort.h +147 -0
  1637. cuda/cccl/headers/include/thrust/system/detail/sequential/iter_swap.h +51 -0
  1638. cuda/cccl/headers/include/thrust/system/detail/sequential/logical.h +29 -0
  1639. cuda/cccl/headers/include/thrust/system/detail/sequential/malloc_and_free.h +56 -0
  1640. cuda/cccl/headers/include/thrust/system/detail/sequential/merge.h +81 -0
  1641. cuda/cccl/headers/include/thrust/system/detail/sequential/merge.inl +151 -0
  1642. cuda/cccl/headers/include/thrust/system/detail/sequential/mismatch.h +29 -0
  1643. cuda/cccl/headers/include/thrust/system/detail/sequential/partition.h +309 -0
  1644. cuda/cccl/headers/include/thrust/system/detail/sequential/per_device_resource.h +29 -0
  1645. cuda/cccl/headers/include/thrust/system/detail/sequential/reduce.h +70 -0
  1646. cuda/cccl/headers/include/thrust/system/detail/sequential/reduce_by_key.h +104 -0
  1647. cuda/cccl/headers/include/thrust/system/detail/sequential/remove.h +185 -0
  1648. cuda/cccl/headers/include/thrust/system/detail/sequential/replace.h +29 -0
  1649. cuda/cccl/headers/include/thrust/system/detail/sequential/reverse.h +29 -0
  1650. cuda/cccl/headers/include/thrust/system/detail/sequential/scan.h +160 -0
  1651. cuda/cccl/headers/include/thrust/system/detail/sequential/scan_by_key.h +151 -0
  1652. cuda/cccl/headers/include/thrust/system/detail/sequential/scatter.h +29 -0
  1653. cuda/cccl/headers/include/thrust/system/detail/sequential/sequence.h +29 -0
  1654. cuda/cccl/headers/include/thrust/system/detail/sequential/set_operations.h +212 -0
  1655. cuda/cccl/headers/include/thrust/system/detail/sequential/sort.h +65 -0
  1656. cuda/cccl/headers/include/thrust/system/detail/sequential/sort.inl +116 -0
  1657. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.h +61 -0
  1658. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.inl +362 -0
  1659. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.h +54 -0
  1660. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.inl +130 -0
  1661. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.h +54 -0
  1662. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.inl +592 -0
  1663. cuda/cccl/headers/include/thrust/system/detail/sequential/swap_ranges.h +29 -0
  1664. cuda/cccl/headers/include/thrust/system/detail/sequential/tabulate.h +29 -0
  1665. cuda/cccl/headers/include/thrust/system/detail/sequential/temporary_buffer.h +29 -0
  1666. cuda/cccl/headers/include/thrust/system/detail/sequential/transform.h +29 -0
  1667. cuda/cccl/headers/include/thrust/system/detail/sequential/transform_reduce.h +29 -0
  1668. cuda/cccl/headers/include/thrust/system/detail/sequential/transform_scan.h +29 -0
  1669. cuda/cccl/headers/include/thrust/system/detail/sequential/trivial_copy.h +64 -0
  1670. cuda/cccl/headers/include/thrust/system/detail/sequential/uninitialized_copy.h +29 -0
  1671. cuda/cccl/headers/include/thrust/system/detail/sequential/uninitialized_fill.h +29 -0
  1672. cuda/cccl/headers/include/thrust/system/detail/sequential/unique.h +121 -0
  1673. cuda/cccl/headers/include/thrust/system/detail/sequential/unique_by_key.h +112 -0
  1674. cuda/cccl/headers/include/thrust/system/detail/system_error.inl +108 -0
  1675. cuda/cccl/headers/include/thrust/system/error_code.h +512 -0
  1676. cuda/cccl/headers/include/thrust/system/omp/detail/adjacent_difference.h +54 -0
  1677. cuda/cccl/headers/include/thrust/system/omp/detail/assign_value.h +30 -0
  1678. cuda/cccl/headers/include/thrust/system/omp/detail/binary_search.h +77 -0
  1679. cuda/cccl/headers/include/thrust/system/omp/detail/copy.h +50 -0
  1680. cuda/cccl/headers/include/thrust/system/omp/detail/copy.inl +74 -0
  1681. cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.h +56 -0
  1682. cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.inl +59 -0
  1683. cuda/cccl/headers/include/thrust/system/omp/detail/count.h +30 -0
  1684. cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.h +50 -0
  1685. cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.inl +65 -0
  1686. cuda/cccl/headers/include/thrust/system/omp/detail/equal.h +30 -0
  1687. cuda/cccl/headers/include/thrust/system/omp/detail/execution_policy.h +113 -0
  1688. cuda/cccl/headers/include/thrust/system/omp/detail/extrema.h +66 -0
  1689. cuda/cccl/headers/include/thrust/system/omp/detail/fill.h +30 -0
  1690. cuda/cccl/headers/include/thrust/system/omp/detail/find.h +53 -0
  1691. cuda/cccl/headers/include/thrust/system/omp/detail/for_each.h +56 -0
  1692. cuda/cccl/headers/include/thrust/system/omp/detail/for_each.inl +87 -0
  1693. cuda/cccl/headers/include/thrust/system/omp/detail/gather.h +30 -0
  1694. cuda/cccl/headers/include/thrust/system/omp/detail/generate.h +30 -0
  1695. cuda/cccl/headers/include/thrust/system/omp/detail/get_value.h +30 -0
  1696. cuda/cccl/headers/include/thrust/system/omp/detail/inner_product.h +30 -0
  1697. cuda/cccl/headers/include/thrust/system/omp/detail/iter_swap.h +30 -0
  1698. cuda/cccl/headers/include/thrust/system/omp/detail/logical.h +30 -0
  1699. cuda/cccl/headers/include/thrust/system/omp/detail/malloc_and_free.h +30 -0
  1700. cuda/cccl/headers/include/thrust/system/omp/detail/memory.inl +93 -0
  1701. cuda/cccl/headers/include/thrust/system/omp/detail/merge.h +30 -0
  1702. cuda/cccl/headers/include/thrust/system/omp/detail/mismatch.h +30 -0
  1703. cuda/cccl/headers/include/thrust/system/omp/detail/par.h +62 -0
  1704. cuda/cccl/headers/include/thrust/system/omp/detail/partition.h +88 -0
  1705. cuda/cccl/headers/include/thrust/system/omp/detail/partition.inl +102 -0
  1706. cuda/cccl/headers/include/thrust/system/omp/detail/per_device_resource.h +29 -0
  1707. cuda/cccl/headers/include/thrust/system/omp/detail/pragma_omp.h +54 -0
  1708. cuda/cccl/headers/include/thrust/system/omp/detail/reduce.h +54 -0
  1709. cuda/cccl/headers/include/thrust/system/omp/detail/reduce.inl +78 -0
  1710. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.h +64 -0
  1711. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.inl +65 -0
  1712. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.h +59 -0
  1713. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.inl +103 -0
  1714. cuda/cccl/headers/include/thrust/system/omp/detail/remove.h +72 -0
  1715. cuda/cccl/headers/include/thrust/system/omp/detail/remove.inl +87 -0
  1716. cuda/cccl/headers/include/thrust/system/omp/detail/replace.h +30 -0
  1717. cuda/cccl/headers/include/thrust/system/omp/detail/reverse.h +30 -0
  1718. cuda/cccl/headers/include/thrust/system/omp/detail/scan.h +30 -0
  1719. cuda/cccl/headers/include/thrust/system/omp/detail/scan_by_key.h +30 -0
  1720. cuda/cccl/headers/include/thrust/system/omp/detail/scatter.h +30 -0
  1721. cuda/cccl/headers/include/thrust/system/omp/detail/sequence.h +30 -0
  1722. cuda/cccl/headers/include/thrust/system/omp/detail/set_operations.h +30 -0
  1723. cuda/cccl/headers/include/thrust/system/omp/detail/sort.h +60 -0
  1724. cuda/cccl/headers/include/thrust/system/omp/detail/sort.inl +265 -0
  1725. cuda/cccl/headers/include/thrust/system/omp/detail/swap_ranges.h +30 -0
  1726. cuda/cccl/headers/include/thrust/system/omp/detail/tabulate.h +30 -0
  1727. cuda/cccl/headers/include/thrust/system/omp/detail/temporary_buffer.h +29 -0
  1728. cuda/cccl/headers/include/thrust/system/omp/detail/transform.h +30 -0
  1729. cuda/cccl/headers/include/thrust/system/omp/detail/transform_reduce.h +30 -0
  1730. cuda/cccl/headers/include/thrust/system/omp/detail/transform_scan.h +30 -0
  1731. cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_copy.h +30 -0
  1732. cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_fill.h +30 -0
  1733. cuda/cccl/headers/include/thrust/system/omp/detail/unique.h +60 -0
  1734. cuda/cccl/headers/include/thrust/system/omp/detail/unique.inl +71 -0
  1735. cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.h +67 -0
  1736. cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.inl +75 -0
  1737. cuda/cccl/headers/include/thrust/system/omp/execution_policy.h +160 -0
  1738. cuda/cccl/headers/include/thrust/system/omp/memory.h +111 -0
  1739. cuda/cccl/headers/include/thrust/system/omp/memory_resource.h +75 -0
  1740. cuda/cccl/headers/include/thrust/system/omp/pointer.h +124 -0
  1741. cuda/cccl/headers/include/thrust/system/omp/vector.h +99 -0
  1742. cuda/cccl/headers/include/thrust/system/system_error.h +184 -0
  1743. cuda/cccl/headers/include/thrust/system/tbb/detail/adjacent_difference.h +54 -0
  1744. cuda/cccl/headers/include/thrust/system/tbb/detail/assign_value.h +30 -0
  1745. cuda/cccl/headers/include/thrust/system/tbb/detail/binary_search.h +30 -0
  1746. cuda/cccl/headers/include/thrust/system/tbb/detail/copy.h +50 -0
  1747. cuda/cccl/headers/include/thrust/system/tbb/detail/copy.inl +73 -0
  1748. cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.h +47 -0
  1749. cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.inl +136 -0
  1750. cuda/cccl/headers/include/thrust/system/tbb/detail/count.h +30 -0
  1751. cuda/cccl/headers/include/thrust/system/tbb/detail/equal.h +30 -0
  1752. cuda/cccl/headers/include/thrust/system/tbb/detail/execution_policy.h +92 -0
  1753. cuda/cccl/headers/include/thrust/system/tbb/detail/extrema.h +66 -0
  1754. cuda/cccl/headers/include/thrust/system/tbb/detail/fill.h +30 -0
  1755. cuda/cccl/headers/include/thrust/system/tbb/detail/find.h +49 -0
  1756. cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.h +51 -0
  1757. cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.inl +91 -0
  1758. cuda/cccl/headers/include/thrust/system/tbb/detail/gather.h +30 -0
  1759. cuda/cccl/headers/include/thrust/system/tbb/detail/generate.h +30 -0
  1760. cuda/cccl/headers/include/thrust/system/tbb/detail/get_value.h +30 -0
  1761. cuda/cccl/headers/include/thrust/system/tbb/detail/inner_product.h +30 -0
  1762. cuda/cccl/headers/include/thrust/system/tbb/detail/iter_swap.h +30 -0
  1763. cuda/cccl/headers/include/thrust/system/tbb/detail/logical.h +30 -0
  1764. cuda/cccl/headers/include/thrust/system/tbb/detail/malloc_and_free.h +30 -0
  1765. cuda/cccl/headers/include/thrust/system/tbb/detail/memory.inl +94 -0
  1766. cuda/cccl/headers/include/thrust/system/tbb/detail/merge.h +77 -0
  1767. cuda/cccl/headers/include/thrust/system/tbb/detail/merge.inl +327 -0
  1768. cuda/cccl/headers/include/thrust/system/tbb/detail/mismatch.h +30 -0
  1769. cuda/cccl/headers/include/thrust/system/tbb/detail/par.h +62 -0
  1770. cuda/cccl/headers/include/thrust/system/tbb/detail/partition.h +84 -0
  1771. cuda/cccl/headers/include/thrust/system/tbb/detail/partition.inl +98 -0
  1772. cuda/cccl/headers/include/thrust/system/tbb/detail/per_device_resource.h +29 -0
  1773. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.h +54 -0
  1774. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.inl +137 -0
  1775. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.h +61 -0
  1776. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.inl +400 -0
  1777. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_intervals.h +140 -0
  1778. cuda/cccl/headers/include/thrust/system/tbb/detail/remove.h +76 -0
  1779. cuda/cccl/headers/include/thrust/system/tbb/detail/remove.inl +87 -0
  1780. cuda/cccl/headers/include/thrust/system/tbb/detail/replace.h +30 -0
  1781. cuda/cccl/headers/include/thrust/system/tbb/detail/reverse.h +30 -0
  1782. cuda/cccl/headers/include/thrust/system/tbb/detail/scan.h +59 -0
  1783. cuda/cccl/headers/include/thrust/system/tbb/detail/scan.inl +312 -0
  1784. cuda/cccl/headers/include/thrust/system/tbb/detail/scan_by_key.h +30 -0
  1785. cuda/cccl/headers/include/thrust/system/tbb/detail/scatter.h +30 -0
  1786. cuda/cccl/headers/include/thrust/system/tbb/detail/sequence.h +30 -0
  1787. cuda/cccl/headers/include/thrust/system/tbb/detail/set_operations.h +30 -0
  1788. cuda/cccl/headers/include/thrust/system/tbb/detail/sort.h +60 -0
  1789. cuda/cccl/headers/include/thrust/system/tbb/detail/sort.inl +295 -0
  1790. cuda/cccl/headers/include/thrust/system/tbb/detail/swap_ranges.h +30 -0
  1791. cuda/cccl/headers/include/thrust/system/tbb/detail/tabulate.h +30 -0
  1792. cuda/cccl/headers/include/thrust/system/tbb/detail/temporary_buffer.h +29 -0
  1793. cuda/cccl/headers/include/thrust/system/tbb/detail/transform.h +30 -0
  1794. cuda/cccl/headers/include/thrust/system/tbb/detail/transform_reduce.h +30 -0
  1795. cuda/cccl/headers/include/thrust/system/tbb/detail/transform_scan.h +30 -0
  1796. cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_copy.h +30 -0
  1797. cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_fill.h +30 -0
  1798. cuda/cccl/headers/include/thrust/system/tbb/detail/unique.h +60 -0
  1799. cuda/cccl/headers/include/thrust/system/tbb/detail/unique.inl +71 -0
  1800. cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.h +67 -0
  1801. cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.inl +75 -0
  1802. cuda/cccl/headers/include/thrust/system/tbb/execution_policy.h +160 -0
  1803. cuda/cccl/headers/include/thrust/system/tbb/memory.h +111 -0
  1804. cuda/cccl/headers/include/thrust/system/tbb/memory_resource.h +75 -0
  1805. cuda/cccl/headers/include/thrust/system/tbb/pointer.h +124 -0
  1806. cuda/cccl/headers/include/thrust/system/tbb/vector.h +99 -0
  1807. cuda/cccl/headers/include/thrust/system_error.h +57 -0
  1808. cuda/cccl/headers/include/thrust/tabulate.h +125 -0
  1809. cuda/cccl/headers/include/thrust/transform.h +1045 -0
  1810. cuda/cccl/headers/include/thrust/transform_reduce.h +190 -0
  1811. cuda/cccl/headers/include/thrust/transform_scan.h +442 -0
  1812. cuda/cccl/headers/include/thrust/tuple.h +142 -0
  1813. cuda/cccl/headers/include/thrust/type_traits/integer_sequence.h +261 -0
  1814. cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +154 -0
  1815. cuda/cccl/headers/include/thrust/type_traits/is_execution_policy.h +65 -0
  1816. cuda/cccl/headers/include/thrust/type_traits/is_operator_less_or_greater_function_object.h +184 -0
  1817. cuda/cccl/headers/include/thrust/type_traits/is_operator_plus_function_object.h +116 -0
  1818. cuda/cccl/headers/include/thrust/type_traits/is_trivially_relocatable.h +336 -0
  1819. cuda/cccl/headers/include/thrust/type_traits/logical_metafunctions.h +42 -0
  1820. cuda/cccl/headers/include/thrust/type_traits/unwrap_contiguous_iterator.h +96 -0
  1821. cuda/cccl/headers/include/thrust/uninitialized_copy.h +300 -0
  1822. cuda/cccl/headers/include/thrust/uninitialized_fill.h +268 -0
  1823. cuda/cccl/headers/include/thrust/unique.h +1090 -0
  1824. cuda/cccl/headers/include/thrust/universal_allocator.h +93 -0
  1825. cuda/cccl/headers/include/thrust/universal_ptr.h +34 -0
  1826. cuda/cccl/headers/include/thrust/universal_vector.h +71 -0
  1827. cuda/cccl/headers/include/thrust/version.h +93 -0
  1828. cuda/cccl/headers/include/thrust/zip_function.h +176 -0
  1829. cuda/cccl/headers/include_paths.py +72 -0
  1830. cuda/cccl/parallel/__init__.py +9 -0
  1831. cuda/cccl/parallel/experimental/__init__.py +47 -0
  1832. cuda/cccl/parallel/experimental/_bindings.py +24 -0
  1833. cuda/cccl/parallel/experimental/_bindings.pyi +388 -0
  1834. cuda/cccl/parallel/experimental/_bindings_impl.cpython-311-x86_64-linux-gnu.so +0 -0
  1835. cuda/cccl/parallel/experimental/_bindings_impl.pyx +2158 -0
  1836. cuda/cccl/parallel/experimental/_caching.py +71 -0
  1837. cuda/cccl/parallel/experimental/_cccl_interop.py +382 -0
  1838. cuda/cccl/parallel/experimental/_utils/__init__.py +0 -0
  1839. cuda/cccl/parallel/experimental/_utils/protocols.py +132 -0
  1840. cuda/cccl/parallel/experimental/algorithms/__init__.py +28 -0
  1841. cuda/cccl/parallel/experimental/algorithms/_merge_sort.py +172 -0
  1842. cuda/cccl/parallel/experimental/algorithms/_radix_sort.py +244 -0
  1843. cuda/cccl/parallel/experimental/algorithms/_reduce.py +136 -0
  1844. cuda/cccl/parallel/experimental/algorithms/_scan.py +179 -0
  1845. cuda/cccl/parallel/experimental/algorithms/_segmented_reduce.py +183 -0
  1846. cuda/cccl/parallel/experimental/algorithms/_transform.py +213 -0
  1847. cuda/cccl/parallel/experimental/algorithms/_unique_by_key.py +179 -0
  1848. cuda/cccl/parallel/experimental/cccl/.gitkeep +0 -0
  1849. cuda/cccl/parallel/experimental/cccl/libcccl.c.parallel.so +0 -0
  1850. cuda/cccl/parallel/experimental/iterators/__init__.py +17 -0
  1851. cuda/cccl/parallel/experimental/iterators/_factories.py +157 -0
  1852. cuda/cccl/parallel/experimental/iterators/_iterators.py +650 -0
  1853. cuda/cccl/parallel/experimental/numba_utils.py +6 -0
  1854. cuda/cccl/parallel/experimental/struct.py +150 -0
  1855. cuda/cccl/parallel/experimental/typing.py +27 -0
  1856. cuda/cccl/py.typed +0 -0
  1857. cuda_cccl-0.1.3.1.0.dev1678.dist-info/METADATA +28 -0
  1858. cuda_cccl-0.1.3.1.0.dev1678.dist-info/RECORD +1860 -0
  1859. cuda_cccl-0.1.3.1.0.dev1678.dist-info/WHEEL +6 -0
  1860. cuda_cccl-0.1.3.1.0.dev1678.dist-info/licenses/LICENSE +1 -0
@@ -0,0 +1,3435 @@
1
+ /******************************************************************************
2
+ * Copyright (c) 2011, Duane Merrill. All rights reserved.
3
+ * Copyright (c) 2011-2025, NVIDIA CORPORATION. All rights reserved.
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ * * Redistributions of source code must retain the above copyright
8
+ * notice, this list of conditions and the following disclaimer.
9
+ * * Redistributions in binary form must reproduce the above copyright
10
+ * notice, this list of conditions and the following disclaimer in the
11
+ * documentation and/or other materials provided with the distribution.
12
+ * * Neither the name of the NVIDIA CORPORATION nor the
13
+ * names of its contributors may be used to endorse or promote products
14
+ * derived from this software without specific prior written permission.
15
+ *
16
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
+ *
27
+ ******************************************************************************/
28
+
29
+ //! @file
30
+ //! cub::DeviceRadixSort provides device-wide, parallel operations for computing a radix sort across a sequence of data
31
+ //! items residing within device-accessible memory.
32
+
33
+ #pragma once
34
+
35
+ #include <cub/config.cuh>
36
+
37
+ #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
38
+ # pragma GCC system_header
39
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
40
+ # pragma clang system_header
41
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
42
+ # pragma system_header
43
+ #endif // no system header
44
+
45
+ #include <cub/detail/choose_offset.cuh>
46
+ #include <cub/device/dispatch/dispatch_radix_sort.cuh>
47
+
48
+ #include <cuda/std/type_traits>
49
+
50
+ CUB_NAMESPACE_BEGIN
51
+
52
+ //! @rst
53
+ //! DeviceRadixSort provides device-wide, parallel operations for
54
+ //! computing a radix sort across a sequence of data items residing
55
+ //! within device-accessible memory.
56
+ //!
57
+ //! .. image:: ../../img/sorting_logo.png
58
+ //! :align: center
59
+ //!
60
+ //! Overview
61
+ //! --------------------------------------------------
62
+ //!
63
+ //! The `radix sorting method <http://en.wikipedia.org/wiki/Radix_sort>`_
64
+ //! arranges items into ascending (or descending) order. The algorithm relies
65
+ //! upon a positional representation for keys, i.e., each key is comprised of an
66
+ //! ordered sequence of symbols (e.g., digits, characters, etc.) specified from
67
+ //! least-significant to most-significant. For a given input sequence of keys
68
+ //! and a set of rules specifying a total ordering of the symbolic alphabet, the
69
+ //! radix sorting method produces a lexicographic ordering of those keys.
70
+ //!
71
+ //! @rowmajor
72
+ //!
73
+ //! Supported Types
74
+ //! --------------------------------------------------
75
+ //!
76
+ //! DeviceRadixSort can sort all of the built-in C++ numeric primitive types
77
+ //! (``unsigned char``, ``int``, ``double``, etc.) as well as CUDA's ``__half``
78
+ //! and ``__nv_bfloat16`` 16-bit floating-point types. User-defined types are
79
+ //! supported as long as a decomposer object is provided.
80
+ //!
81
+ //! Floating-Point Special Cases
82
+ //! --------------------------------------------------
83
+ //!
84
+ //! - Positive and negative zeros are considered equivalent, and will be treated
85
+ //! as such in the output.
86
+ //! - No special handling is implemented for NaN values; these are sorted
87
+ //! according to their bit representations after any transformations.
88
+ //!
89
+ //! Transformations
90
+ //! --------------------------------------------------
91
+ //!
92
+ //! Although the direct radix sorting method can only be applied to unsigned
93
+ //! integral types, DeviceRadixSort is able to sort signed and floating-point
94
+ //! types via simple bit-wise transformations that ensure lexicographic key
95
+ //! ordering. Additional transformations occur for descending sorts. These
96
+ //! transformations must be considered when restricting the
97
+ //! ``[begin_bit, end_bit)`` range, as the bitwise transformations will occur
98
+ //! before the bit-range truncation.
99
+ //!
100
+ //! Any transformations applied to the keys prior to sorting are reversed
101
+ //! while writing to the final output buffer.
102
+ //!
103
+ //! Type Specific Bitwise Transformations
104
+ //! --------------------------------------------------
105
+ //!
106
+ //! To convert the input values into a radix-sortable bitwise representation,
107
+ //! the following transformations take place prior to sorting:
108
+ //!
109
+ //! - For unsigned integral values, the keys are used directly.
110
+ //! - For signed integral values, the sign bit is inverted.
111
+ //! - For positive floating point values, the sign bit is inverted.
112
+ //! - For negative floating point values, the full key is inverted.
113
+ //!
114
+ //! For floating point types, positive and negative zero are a special case and
115
+ //! will be considered equivalent during sorting.
116
+ //!
117
+ //! Descending Sort Bitwise Transformations
118
+ //! --------------------------------------------------
119
+ //!
120
+ //! If descending sort is used, the keys are inverted after performing any
121
+ //! type-specific transformations, and the resulting keys are sorted in ascending
122
+ //! order.
123
+ //!
124
+ //! Stability
125
+ //! --------------------------------------------------
126
+ //!
127
+ //! DeviceRadixSort is stable. For floating-point types, ``-0.0`` and ``+0.0`` are
128
+ //! considered equal and appear in the result in the same order as they appear in
129
+ //! the input.
130
+ //!
131
+ //! Usage Considerations
132
+ //! --------------------------------------------------
133
+ //!
134
+ //! @cdp_class{DeviceRadixSort}
135
+ //!
136
+ //! Performance
137
+ //! --------------------------------------------------
138
+ //!
139
+ //! @linear_performance{radix sort}
140
+ //!
141
+ //! @endrst
142
+ struct DeviceRadixSort
143
+ {
144
+ private:
145
+ template <SortOrder Order, typename KeyT, typename ValueT, typename NumItemsT, typename DecomposerT>
146
+ CUB_RUNTIME_FUNCTION static cudaError_t custom_radix_sort(
147
+ ::cuda::std::false_type,
148
+ void* d_temp_storage,
149
+ size_t& temp_storage_bytes,
150
+ bool is_overwrite_okay,
151
+ DoubleBuffer<KeyT>& d_keys,
152
+ DoubleBuffer<ValueT>& d_values,
153
+ NumItemsT num_items,
154
+ DecomposerT decomposer,
155
+ int begin_bit,
156
+ int end_bit,
157
+ cudaStream_t stream);
158
+
159
+ template <SortOrder Order, typename KeyT, typename ValueT, typename OffsetT, typename DecomposerT>
160
+ CUB_RUNTIME_FUNCTION static cudaError_t custom_radix_sort(
161
+ ::cuda::std::true_type,
162
+ void* d_temp_storage,
163
+ size_t& temp_storage_bytes,
164
+ bool is_overwrite_okay,
165
+ DoubleBuffer<KeyT>& d_keys,
166
+ DoubleBuffer<ValueT>& d_values,
167
+ OffsetT num_items,
168
+ DecomposerT decomposer,
169
+ int begin_bit,
170
+ int end_bit,
171
+ cudaStream_t stream)
172
+ {
173
+ return DispatchRadixSort<Order, KeyT, ValueT, OffsetT, DecomposerT>::Dispatch(
174
+ d_temp_storage,
175
+ temp_storage_bytes,
176
+ d_keys,
177
+ d_values,
178
+ static_cast<OffsetT>(num_items),
179
+ begin_bit,
180
+ end_bit,
181
+ is_overwrite_okay,
182
+ stream,
183
+ decomposer);
184
+ }
185
+
186
+ template <SortOrder Order, typename KeyT, typename ValueT, typename NumItemsT, typename DecomposerT>
187
+ CUB_RUNTIME_FUNCTION static cudaError_t custom_radix_sort(
188
+ ::cuda::std::false_type,
189
+ void* d_temp_storage,
190
+ size_t& temp_storage_bytes,
191
+ bool is_overwrite_okay,
192
+ DoubleBuffer<KeyT>& d_keys,
193
+ DoubleBuffer<ValueT>& d_values,
194
+ NumItemsT num_items,
195
+ DecomposerT decomposer,
196
+ cudaStream_t stream);
197
+
198
+ template <SortOrder Order, typename KeyT, typename ValueT, typename OffsetT, typename DecomposerT>
199
+ CUB_RUNTIME_FUNCTION static cudaError_t custom_radix_sort(
200
+ ::cuda::std::true_type,
201
+ void* d_temp_storage,
202
+ size_t& temp_storage_bytes,
203
+ bool is_overwrite_okay,
204
+ DoubleBuffer<KeyT>& d_keys,
205
+ DoubleBuffer<ValueT>& d_values,
206
+ OffsetT num_items,
207
+ DecomposerT decomposer,
208
+ cudaStream_t stream)
209
+ {
210
+ constexpr int begin_bit = 0;
211
+ const int end_bit = detail::radix::traits_t<KeyT>::default_end_bit(decomposer);
212
+
213
+ return DeviceRadixSort::custom_radix_sort<Order>(
214
+ ::cuda::std::true_type{},
215
+ d_temp_storage,
216
+ temp_storage_bytes,
217
+ is_overwrite_okay,
218
+ d_keys,
219
+ d_values,
220
+ num_items,
221
+ decomposer,
222
+ begin_bit,
223
+ end_bit,
224
+ stream);
225
+ }
226
+
227
+ // Name reported for NVTX ranges
228
+ _CCCL_HOST_DEVICE static constexpr auto GetName() -> const char*
229
+ {
230
+ return "cub::DeviceRadixSort";
231
+ }
232
+
233
+ public:
234
+ //! @name Key-value pairs
235
+ //! @{
236
+
237
+ //! @rst
238
+ //! Sorts key-value pairs into ascending order using :math:`\approx 2N` auxiliary storage.
239
+ //!
240
+ //! - The contents of the input data are not altered by the sorting operation.
241
+ //! - Pointers to contiguous memory must be used; iterators are not currently
242
+ //! supported.
243
+ //! - In-place operations are not supported. There must be no overlap between
244
+ //! any of the provided ranges:
245
+ //!
246
+ //! - ``[d_keys_in, d_keys_in + num_items)``
247
+ //! - ``[d_keys_out, d_keys_out + num_items)``
248
+ //! - ``[d_values_in, d_values_in + num_items)``
249
+ //! - ``[d_values_out, d_values_out + num_items)``
250
+ //! - An optional bit subrange ``[begin_bit, end_bit)`` of differentiating key
251
+ //! bits can be specified. This can reduce overall sorting overhead and
252
+ //! yield a corresponding performance improvement.
253
+ //! - @devicestorageNP For sorting using only ``O(P)`` temporary storage, see
254
+ //! the sorting interface using DoubleBuffer wrappers below.
255
+ //! - @devicestorage
256
+ //!
257
+ //! Snippet
258
+ //! --------------------------------------------------
259
+ //!
260
+ //! The code snippet below illustrates the sorting of a device vector of ``int``
261
+ //! keys with associated vector of ``int`` values.
262
+ //! @endrst
263
+ //!
264
+ //! @code{.cpp}
265
+ //! #include <cub/cub.cuh>
266
+ //! // or equivalently <cub/device/device_radix_sort.cuh>
267
+ //!
268
+ //! // Declare, allocate, and initialize device-accessible pointers
269
+ //! // for sorting data
270
+ //! int num_items; // e.g., 7
271
+ //! int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
272
+ //! int *d_keys_out; // e.g., [ ... ]
273
+ //! int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6]
274
+ //! int *d_values_out; // e.g., [ ... ]
275
+ //! ...
276
+ //!
277
+ //! // Determine temporary device storage requirements
278
+ //! void *d_temp_storage = nullptr;
279
+ //! size_t temp_storage_bytes = 0;
280
+ //! cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes,
281
+ //! d_keys_in, d_keys_out, d_values_in, d_values_out, num_items);
282
+ //!
283
+ //! // Allocate temporary storage
284
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
285
+ //!
286
+ //! // Run sorting operation
287
+ //! cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes,
288
+ //! d_keys_in, d_keys_out, d_values_in, d_values_out, num_items);
289
+ //!
290
+ //! // d_keys_out <-- [0, 3, 5, 6, 7, 8, 9]
291
+ //! // d_values_out <-- [5, 4, 3, 1, 2, 0, 6]
292
+ //! @endcode
293
+ //!
294
+ //! @tparam KeyT
295
+ //! **[inferred]** Key type
296
+ //!
297
+ //! @tparam ValueT
298
+ //! **[inferred]** Value type
299
+ //!
300
+ //! @tparam NumItemsT
301
+ //! **[inferred]** Type of num_items
302
+ //!
303
+ //! @param[in] d_temp_storage
304
+ //! Device-accessible allocation of temporary storage. When ``nullptr``, the
305
+ //! required allocation size is written to ``temp_storage_bytes`` and no work
306
+ //! is done.
307
+ //!
308
+ //! @param[in,out] temp_storage_bytes
309
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
310
+ //!
311
+ //! @param[in] d_keys_in
312
+ //! Pointer to the input sequence of key data to sort
313
+ //!
314
+ //! @param[out] d_keys_out
315
+ //! Pointer to the sorted output sequence of key data
316
+ //!
317
+ //! @param[in] d_values_in
318
+ //! Pointer to the corresponding input sequence of associated value items
319
+ //!
320
+ //! @param[out] d_values_out
321
+ //! Pointer to the correspondingly-reordered output sequence of associated
322
+ //! value items
323
+ //!
324
+ //! @param[in] num_items
325
+ //! Number of items to sort
326
+ //!
327
+ //! @param[in] begin_bit
328
+ //! **[optional]** The least-significant bit index (inclusive) needed for
329
+ //! key comparison
330
+ //!
331
+ //! @param[in] end_bit
332
+ //! **[optional]** The most-significant bit index (exclusive) needed for key
333
+ //! comparison (e.g., ``sizeof(unsigned int) * 8``)
334
+ //!
335
+ //! @param[in] stream
336
+ //! **[optional]** CUDA stream to launch kernels within.
337
+ //! Default is stream<sub>0</sub>.
338
+ template <typename KeyT, typename ValueT, typename NumItemsT>
339
+ CUB_RUNTIME_FUNCTION static cudaError_t SortPairs(
340
+ void* d_temp_storage,
341
+ size_t& temp_storage_bytes,
342
+ const KeyT* d_keys_in,
343
+ KeyT* d_keys_out,
344
+ const ValueT* d_values_in,
345
+ ValueT* d_values_out,
346
+ NumItemsT num_items,
347
+ int begin_bit = 0,
348
+ int end_bit = sizeof(KeyT) * 8,
349
+ cudaStream_t stream = 0)
350
+ {
351
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());
352
+ // Unsigned integer type for global offsets.
353
+ using OffsetT = detail::choose_offset_t<NumItemsT>;
354
+
355
+ // TODO API that doesn't accept decomposer should also contain a static
356
+ // assert that the key type is fundamental.
357
+
358
+ // We cast away const-ness, but will *not* write to these arrays.
359
+ // ``DispatchRadixSort::Dispatch`` will allocate temporary storage and
360
+ // create a new double-buffer internally when the ``is_overwrite_ok`` flag
361
+ // is not set.
362
+ constexpr bool is_overwrite_okay = false;
363
+ DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
364
+ DoubleBuffer<ValueT> d_values(const_cast<ValueT*>(d_values_in), d_values_out);
365
+
366
+ return DispatchRadixSort<SortOrder::Ascending, KeyT, ValueT, OffsetT>::Dispatch(
367
+ d_temp_storage,
368
+ temp_storage_bytes,
369
+ d_keys,
370
+ d_values,
371
+ static_cast<OffsetT>(num_items),
372
+ begin_bit,
373
+ end_bit,
374
+ is_overwrite_okay,
375
+ stream);
376
+ }
377
+
378
+ //! @rst
379
+ //! Sorts key-value pairs into ascending order using :math:`\approx 2N` auxiliary storage.
380
+ //!
381
+ //! * The contents of the input data are not altered by the sorting operation.
382
+ //! * Pointers to contiguous memory must be used; iterators are not currently
383
+ //! supported.
384
+ //! * In-place operations are not supported. There must be no overlap between
385
+ //! any of the provided ranges:
386
+ //!
387
+ //! * ``[d_keys_in, d_keys_in + num_items)``
388
+ //! * ``[d_keys_out, d_keys_out + num_items)``
389
+ //! * ``[d_values_in, d_values_in + num_items)``
390
+ //! * ``[d_values_out, d_values_out + num_items)``
391
+ //!
392
+ //! * A bit subrange ``[begin_bit, end_bit)`` is provided to specify
393
+ //! differentiating key bits. This can reduce overall sorting overhead and
394
+ //! yield a corresponding performance improvement.
395
+ //! * @devicestorageNP For sorting using only :math:`O(P)` temporary storage, see
396
+ //! the sorting interface using DoubleBuffer wrappers below.
397
+ //! * @devicestorage
398
+ //!
399
+ //! Snippet
400
+ //! --------------------------------------------------
401
+ //!
402
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
403
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
404
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
405
+ //! tuple of references to relevant members of the key.
406
+ //!
407
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
408
+ //! :language: c++
409
+ //! :dedent:
410
+ //! :start-after: example-begin custom-type
411
+ //! :end-before: example-end custom-type
412
+ //!
413
+ //! The following snippet shows how to sort an array of ``custom_t`` objects
414
+ //! using ``cub::DeviceRadixSort::SortPairs``:
415
+ //!
416
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
417
+ //! :language: c++
418
+ //! :dedent:
419
+ //! :start-after: example-begin pairs-bits
420
+ //! :end-before: example-end pairs-bits
421
+ //!
422
+ //! @endrst
423
+ //!
424
+ //! @tparam KeyT
425
+ //! **[inferred]** Key type
426
+ //!
427
+ //! @tparam ValueT
428
+ //! **[inferred]** Value type
429
+ //!
430
+ //! @tparam NumItemsT
431
+ //! **[inferred]** Type of num_items
432
+ //!
433
+ //! @tparam DecomposerT
434
+ //! **[inferred]** Type of a callable object responsible for decomposing a
435
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
436
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
437
+ //! The leftmost element of the tuple is considered the most significant.
438
+ //! The call operator must not modify members of the key.
439
+ //!
440
+ //! @param[in] d_temp_storage
441
+ //! Device-accessible allocation of temporary storage. When ``nullptr``, the
442
+ //! required allocation size is written to ``temp_storage_bytes`` and no work
443
+ //! is done.
444
+ //!
445
+ //! @param[in,out] temp_storage_bytes
446
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
447
+ //!
448
+ //! @param[in] d_keys_in
449
+ //! Pointer to the input sequence of key data to sort
450
+ //!
451
+ //! @param[out] d_keys_out
452
+ //! Pointer to the sorted output sequence of key data
453
+ //!
454
+ //! @param[in] d_values_in
455
+ //! Pointer to the corresponding input sequence of associated value items
456
+ //!
457
+ //! @param[out] d_values_out
458
+ //! Pointer to the correspondingly-reordered output sequence of associated
459
+ //! value items
460
+ //!
461
+ //! @param[in] num_items
462
+ //! Number of items to sort
463
+ //!
464
+ //! @param[in] decomposer
465
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
466
+ //! references to its constituent arithmetic types. The leftmost element of
467
+ //! the tuple is considered the most significant. The call operator must not
468
+ //! modify members of the key.
469
+ //!
470
+ //! @param[in] begin_bit
471
+ //! The least-significant bit index (inclusive) needed for
472
+ //! key comparison
473
+ //!
474
+ //! @param[in] end_bit
475
+ //! The most-significant bit index (exclusive) needed for key
476
+ //! comparison (e.g., ``(sizeof(float) + sizeof(long long int)) * 8``)
477
+ //!
478
+ //! @param[in] stream
479
+ //! **[optional]** CUDA stream to launch kernels within.
480
+ //! Default is stream<sub>0</sub>.
481
+ template <typename KeyT, typename ValueT, typename NumItemsT, typename DecomposerT>
482
+ CUB_RUNTIME_FUNCTION static //
483
+ ::cuda::std::enable_if_t< //
484
+ !::cuda::std::is_convertible_v<DecomposerT, int>, // rejects int-convertible DecomposerT; presumably keeps integral bit arguments off this overload (confirm)
485
+ cudaError_t>
486
+ SortPairs(void* d_temp_storage,
487
+ size_t& temp_storage_bytes,
488
+ const KeyT* d_keys_in,
489
+ KeyT* d_keys_out,
490
+ const ValueT* d_values_in,
491
+ ValueT* d_values_out,
492
+ NumItemsT num_items,
493
+ DecomposerT decomposer,
494
+ int begin_bit,
495
+ int end_bit,
496
+ cudaStream_t stream = 0)
497
+ {
498
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); // NVTX range conditioned on d_temp_storage (presumably skipped for the size query; confirm)
499
+ // Integer type for global offsets, selected from NumItemsT by detail::choose_offset_t (unsigned, per the original note)
500
+ using offset_t = detail::choose_offset_t<NumItemsT>;
501
+ using decomposer_check_t = detail::radix::decomposer_check_t<KeyT, DecomposerT>;
502
+ // Reject, at compile time, any decomposer that the check trait does not accept.
503
+ static_assert(decomposer_check_t::value,
504
+ "DecomposerT must be a callable object returning a tuple of references to "
505
+ "arithmetic types");
506
+
507
+ // The const_cast below only adapts the inputs to DoubleBuffer; they are *not* written.
508
+ // Per the original note, ``DispatchRadixSort::Dispatch`` (presumably reached via
509
+ // ``custom_radix_sort`` below) allocates temporary storage and a fresh internal
510
+ // double-buffer when ``is_overwrite_okay`` is false, as it is here.
511
+ constexpr bool is_overwrite_okay = false;
512
+ DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
513
+ DoubleBuffer<ValueT> d_values(const_cast<ValueT*>(d_values_in), d_values_out);
514
+ // Delegate to custom_radix_sort; a default-constructed decomposer_check_t is its first argument.
515
+ return DeviceRadixSort::custom_radix_sort<SortOrder::Ascending>(
516
+ decomposer_check_t{},
517
+ d_temp_storage,
518
+ temp_storage_bytes,
519
+ is_overwrite_okay,
520
+ d_keys,
521
+ d_values,
522
+ static_cast<offset_t>(num_items),
523
+ decomposer,
524
+ begin_bit,
525
+ end_bit,
526
+ stream);
527
+ }
528
+
529
+ //! @rst
530
+ //! Sorts key-value pairs into ascending order using :math:`\approx 2N` auxiliary storage.
531
+ //!
532
+ //! * The contents of the input data are not altered by the sorting operation.
533
+ //! * Pointers to contiguous memory must be used; iterators are not currently
534
+ //! supported.
535
+ //! * In-place operations are not supported. There must be no overlap between
536
+ //! any of the provided ranges:
537
+ //!
538
+ //! * ``[d_keys_in, d_keys_in + num_items)``
539
+ //! * ``[d_keys_out, d_keys_out + num_items)``
540
+ //! * ``[d_values_in, d_values_in + num_items)``
541
+ //! * ``[d_values_out, d_values_out + num_items)``
542
+ //!
543
+ //! * @devicestorageNP For sorting using only :math:`O(P)` temporary storage, see
544
+ //! the sorting interface using DoubleBuffer wrappers below.
545
+ //! * @devicestorage
546
+ //!
547
+ //! Snippet
548
+ //! --------------------------------------------------
549
+ //!
550
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
551
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
552
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
553
+ //! tuple of references to relevant members of the key.
554
+ //!
555
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
556
+ //! :language: c++
557
+ //! :dedent:
558
+ //! :start-after: example-begin custom-type
559
+ //! :end-before: example-end custom-type
560
+ //!
561
+ //! The following snippet shows how to sort an array of ``custom_t`` objects
562
+ //! using ``cub::DeviceRadixSort::SortPairs``:
563
+ //!
564
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
565
+ //! :language: c++
566
+ //! :dedent:
567
+ //! :start-after: example-begin pairs
568
+ //! :end-before: example-end pairs
569
+ //!
570
+ //! @endrst
571
+ //!
572
+ //! @tparam KeyT
573
+ //! **[inferred]** KeyT type
574
+ //!
575
+ //! @tparam ValueT
576
+ //! **[inferred]** ValueT type
577
+ //!
578
+ //! @tparam NumItemsT
579
+ //! **[inferred]** Type of num_items
580
+ //!
581
+ //! @tparam DecomposerT
582
+ //! **[inferred]** Type of a callable object responsible for decomposing a
583
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
584
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
585
+ //! The leftmost element of the tuple is considered the most significant.
586
+ //! The call operator must not modify members of the key.
587
+ //!
588
+ //! @param[in] d_temp_storage
589
+ //! Device-accessible allocation of temporary storage. When ``nullptr``, the
590
+ //! required allocation size is written to ``temp_storage_bytes`` and no work
591
+ //! is done.
592
+ //!
593
+ //! @param[in,out] temp_storage_bytes
594
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
595
+ //!
596
+ //! @param[in] d_keys_in
597
+ //! Pointer to the input sequence of key data to sort
598
+ //!
599
+ //! @param[out] d_keys_out
600
+ //! Pointer to the sorted output sequence of key data
601
+ //!
602
+ //! @param[in] d_values_in
603
+ //! Pointer to the corresponding input sequence of associated value items
604
+ //!
605
+ //! @param[out] d_values_out
606
+ //! Pointer to the correspondingly-reordered output sequence of associated
607
+ //! value items
608
+ //!
609
+ //! @param[in] num_items
610
+ //! Number of items to sort
611
+ //!
612
+ //! @param[in] decomposer
613
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
614
+ //! references to its constituent arithmetic types. The leftmost element of
615
+ //! the tuple is considered the most significant. The call operator must not
616
+ //! modify members of the key.
617
+ //!
618
+ //! @param[in] stream
619
+ //! **[optional]** CUDA stream to launch kernels within.
620
+ //! Default is stream<sub>0</sub>.
621
+ template <typename KeyT, typename ValueT, typename NumItemsT, typename DecomposerT>
622
+ CUB_RUNTIME_FUNCTION static //
623
+ ::cuda::std::enable_if_t< //
624
+ !::cuda::std::is_convertible_v<DecomposerT, int>, // rejects int-convertible DecomposerT; presumably keeps integral bit arguments off this overload (confirm)
625
+ cudaError_t>
626
+ SortPairs(void* d_temp_storage,
627
+ size_t& temp_storage_bytes,
628
+ const KeyT* d_keys_in,
629
+ KeyT* d_keys_out,
630
+ const ValueT* d_values_in,
631
+ ValueT* d_values_out,
632
+ NumItemsT num_items,
633
+ DecomposerT decomposer,
634
+ cudaStream_t stream = 0)
635
+ {
636
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); // NVTX range conditioned on d_temp_storage (presumably skipped for the size query; confirm)
637
+ // Integer type for global offsets, selected from NumItemsT by detail::choose_offset_t (unsigned, per the original note)
638
+ using offset_t = detail::choose_offset_t<NumItemsT>;
639
+ using decomposer_check_t = detail::radix::decomposer_check_t<KeyT, DecomposerT>;
640
+ // Reject, at compile time, any decomposer that the check trait does not accept.
641
+ static_assert(decomposer_check_t::value,
642
+ "DecomposerT must be a callable object returning a tuple of references to "
643
+ "arithmetic types");
644
+
645
+ // The const_cast below only adapts the inputs to DoubleBuffer; they are *not* written.
646
+ // Per the original note, ``DispatchRadixSort::Dispatch`` (presumably reached via
647
+ // ``custom_radix_sort`` below) allocates temporary storage and a fresh internal
648
+ // double-buffer when ``is_overwrite_okay`` is false, as it is here.
649
+ constexpr bool is_overwrite_okay = false;
650
+ DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
651
+ DoubleBuffer<ValueT> d_values(const_cast<ValueT*>(d_values_in), d_values_out);
652
+ // No bit range is forwarded by this overload; only the decomposer and stream follow num_items.
653
+ return DeviceRadixSort::custom_radix_sort<SortOrder::Ascending>(
654
+ decomposer_check_t{},
655
+ d_temp_storage,
656
+ temp_storage_bytes,
657
+ is_overwrite_okay,
658
+ d_keys,
659
+ d_values,
660
+ static_cast<offset_t>(num_items),
661
+ decomposer,
662
+ stream);
663
+ }
664
+
665
+ //! @rst
666
+ //! Sorts key-value pairs into ascending order using :math:`\approx N` auxiliary storage.
667
+ //!
668
+ //! - The sorting operation is given a pair of key buffers and a corresponding
669
+ //! pair of associated value buffers. Each pair is managed by a DoubleBuffer
670
+ //! structure that indicates which of the two buffers is "current" (and thus
671
+ //! contains the input data to be sorted).
672
+ //! - The contents of both buffers within each pair may be altered by the
673
+ //! sorting operation.
674
+ //! - In-place operations are not supported. There must be no overlap between
675
+ //! any of the provided ranges:
676
+ //!
677
+ //! - ``[d_keys.Current(), d_keys.Current() + num_items)``
678
+ //! - ``[d_keys.Alternate(), d_keys.Alternate() + num_items)``
679
+ //! - ``[d_values.Current(), d_values.Current() + num_items)``
680
+ //! - ``[d_values.Alternate(), d_values.Alternate() + num_items)``
681
+ //!
682
+ //! - Upon completion, the sorting operation will update the "current"
683
+ //! indicator within each DoubleBuffer wrapper to reference which of the two
684
+ //! buffers now contains the sorted output sequence (a function of the
685
+ //! number of key bits specified and the targeted device architecture).
686
+ //! - An optional bit subrange ``[begin_bit, end_bit)`` of differentiating key
687
+ //! bits can be specified. This can reduce overall sorting overhead and
688
+ //! yield a corresponding performance improvement.
689
+ //! - @devicestorageP
690
+ //! - @devicestorage
691
+ //!
692
+ //! Snippet
693
+ //! --------------------------------------------------
694
+ //!
695
+ //! The code snippet below illustrates the sorting of a device vector of ``int``
696
+ //! keys with associated vector of ``int`` values.
697
+ //! @endrst
698
+ //!
699
+ //! @code
700
+ //! #include <cub/cub.cuh>
701
+ //! // or equivalently <cub/device/device_radix_sort.cuh>
702
+ //!
703
+ //! // Declare, allocate, and initialize device-accessible pointers for
704
+ //! // sorting data
705
+ //! int num_items; // e.g., 7
706
+ //! int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9]
707
+ //! int *d_key_alt_buf; // e.g., [ ... ]
708
+ //! int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6]
709
+ //! int *d_value_alt_buf; // e.g., [ ... ]
710
+ //! ...
711
+ //!
712
+ //! // Create a set of DoubleBuffers to wrap pairs of device pointers
713
+ //! cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
714
+ //! cub::DoubleBuffer<int> d_values(d_value_buf, d_value_alt_buf);
715
+ //!
716
+ //! // Determine temporary device storage requirements
717
+ //! void *d_temp_storage = nullptr;
718
+ //! size_t temp_storage_bytes = 0;
719
+ //! cub::DeviceRadixSort::SortPairs(
720
+ //! d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items);
721
+ //!
722
+ //! // Allocate temporary storage
723
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
724
+ //!
725
+ //! // Run sorting operation
726
+ //! cub::DeviceRadixSort::SortPairs(
727
+ //! d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items);
728
+ //!
729
+ //! // d_keys.Current() <-- [0, 3, 5, 6, 7, 8, 9]
730
+ //! // d_values.Current() <-- [5, 4, 3, 1, 2, 0, 6]
731
+ //!
732
+ //! @endcode
733
+ //!
734
+ //! @tparam KeyT
735
+ //! **[inferred]** KeyT type
736
+ //!
737
+ //! @tparam ValueT
738
+ //! **[inferred]** ValueT type
739
+ //!
740
+ //! @tparam NumItemsT
741
+ //! **[inferred]** Type of num_items
742
+ //!
743
+ //! @param[in] d_temp_storage
744
+ //! Device-accessible allocation of temporary storage. When ``nullptr``, the
745
+ //! required allocation size is written to ``temp_storage_bytes`` and no work is done.
746
+ //!
747
+ //! @param[in,out] temp_storage_bytes
748
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
749
+ //!
750
+ //! @param[in,out] d_keys
751
+ //! Reference to the double-buffer of keys whose "current" device-accessible
752
+ //! buffer contains the unsorted input keys and, upon return, is updated to
753
+ //! point to the sorted output keys
754
+ //!
755
+ //! @param[in,out] d_values
756
+ //! Double-buffer of values whose "current" device-accessible buffer
757
+ //! contains the unsorted input values and, upon return, is updated to point
758
+ //! to the sorted output values
759
+ //!
760
+ //! @param[in] num_items
761
+ //! Number of items to sort
762
+ //!
763
+ //! @param[in] begin_bit
764
+ //! **[optional]** The least-significant bit index (inclusive) needed for
765
+ //! key comparison
766
+ //!
767
+ //! @param[in] end_bit
768
+ //! **[optional]** The most-significant bit index (exclusive) needed for key
769
+ //! comparison (e.g., ``sizeof(unsigned int) * 8``)
770
+ //!
771
+ //! @param[in] stream
772
+ //! **[optional]** CUDA stream to launch kernels within.
773
+ //! Default is stream<sub>0</sub>.
774
+ template <typename KeyT, typename ValueT, typename NumItemsT>
775
+ CUB_RUNTIME_FUNCTION static cudaError_t SortPairs(
776
+ void* d_temp_storage,
777
+ size_t& temp_storage_bytes,
778
+ DoubleBuffer<KeyT>& d_keys,
779
+ DoubleBuffer<ValueT>& d_values,
780
+ NumItemsT num_items,
781
+ int begin_bit = 0,
782
+ int end_bit = sizeof(KeyT) * 8, // default covers every bit of KeyT (sizeof in bytes times 8)
783
+ cudaStream_t stream = 0)
784
+ {
785
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); // NVTX range conditioned on d_temp_storage (presumably skipped for the size query; confirm)
786
+
787
+ // Integer type for global offsets, selected from NumItemsT by detail::choose_offset_t (unsigned, per the original note).
788
+ using OffsetT = detail::choose_offset_t<NumItemsT>;
789
+ // The caller's DoubleBuffers may be overwritten, matching this overload's documented contract.
790
+ constexpr bool is_overwrite_okay = true;
791
+ // Ascending dispatch over the caller's DoubleBuffers with the requested bit range.
792
+ return DispatchRadixSort<SortOrder::Ascending, KeyT, ValueT, OffsetT>::Dispatch(
793
+ d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, begin_bit, end_bit, is_overwrite_okay, stream);
794
+ }
795
+
796
+ //! @rst
797
+ //! Sorts key-value pairs into ascending order using :math:`\approx N` auxiliary storage.
798
+ //!
799
+ //! * The sorting operation is given a pair of key buffers and a corresponding
800
+ //! pair of associated value buffers. Each pair is managed by a DoubleBuffer
801
+ //! structure that indicates which of the two buffers is "current" (and thus
802
+ //! contains the input data to be sorted).
803
+ //! * The contents of both buffers within each pair may be altered by the
804
+ //! sorting operation.
805
+ //! * In-place operations are not supported. There must be no overlap between
806
+ //! any of the provided ranges:
807
+ //!
808
+ //! - ``[d_keys.Current(), d_keys.Current() + num_items)``
809
+ //! - ``[d_keys.Alternate(), d_keys.Alternate() + num_items)``
810
+ //! - ``[d_values.Current(), d_values.Current() + num_items)``
811
+ //! - ``[d_values.Alternate(), d_values.Alternate() + num_items)``
812
+ //!
813
+ //! * Upon completion, the sorting operation will update the "current"
814
+ //! indicator within each DoubleBuffer wrapper to reference which of the two
815
+ //! buffers now contains the sorted output sequence (a function of the
816
+ //! number of key bits specified and the targeted device architecture).
817
+ //! * @devicestorageP
818
+ //! * @devicestorage
819
+ //!
820
+ //! Snippet
821
+ //! --------------------------------------------------
822
+ //!
823
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
824
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
825
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
826
+ //! tuple of references to relevant members of the key.
827
+ //!
828
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
829
+ //! :language: c++
830
+ //! :dedent:
831
+ //! :start-after: example-begin custom-type
832
+ //! :end-before: example-end custom-type
833
+ //!
834
+ //! The following snippet shows how to sort an array of ``custom_t`` objects
835
+ //! using ``cub::DeviceRadixSort::SortPairs``:
836
+ //!
837
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
838
+ //! :language: c++
839
+ //! :dedent:
840
+ //! :start-after: example-begin pairs-db
841
+ //! :end-before: example-end pairs-db
842
+ //!
843
+ //! @endrst
844
+ //!
845
+ //! @tparam KeyT
846
+ //! **[inferred]** KeyT type
847
+ //!
848
+ //! @tparam ValueT
849
+ //! **[inferred]** ValueT type
850
+ //!
851
+ //! @tparam NumItemsT
852
+ //! **[inferred]** Type of num_items
853
+ //!
854
+ //! @tparam DecomposerT
855
+ //! **[inferred]** Type of a callable object responsible for decomposing a
856
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
857
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
858
+ //! The leftmost element of the tuple is considered the most significant.
859
+ //! The call operator must not modify members of the key.
860
+ //!
861
+ //! @param[in] d_temp_storage
862
+ //! Device-accessible allocation of temporary storage. When ``nullptr``, the
863
+ //! required allocation size is written to ``temp_storage_bytes`` and no work
864
+ //! is done.
865
+ //!
866
+ //! @param[in,out] temp_storage_bytes
867
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
868
+ //!
869
+ //! @param[in,out] d_keys
870
+ //! Reference to the double-buffer of keys whose "current" device-accessible
871
+ //! buffer contains the unsorted input keys and, upon return, is updated to
872
+ //! point to the sorted output keys
873
+ //!
874
+ //! @param[in,out] d_values
875
+ //! Double-buffer of values whose "current" device-accessible buffer
876
+ //! contains the unsorted input values and, upon return, is updated to point
877
+ //! to the sorted output values
878
+ //!
879
+ //! @param[in] num_items
880
+ //! Number of items to sort
881
+ //!
882
+ //! @param[in] decomposer
883
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
884
+ //! references to its constituent arithmetic types. The leftmost element of
885
+ //! the tuple is considered the most significant. The call operator must not
886
+ //! modify members of the key.
887
+ //!
888
+ //! @param[in] stream
889
+ //! **[optional]** CUDA stream to launch kernels within.
890
+ //! Default is stream<sub>0</sub>.
891
+ template <typename KeyT, typename ValueT, typename NumItemsT, typename DecomposerT>
892
+ CUB_RUNTIME_FUNCTION static //
893
+ ::cuda::std::enable_if_t< //
894
+ !::cuda::std::is_convertible_v<DecomposerT, int>, // rejects int-convertible DecomposerT; presumably keeps integral bit arguments off this overload (confirm)
895
+ cudaError_t>
896
+ SortPairs(void* d_temp_storage,
897
+ size_t& temp_storage_bytes,
898
+ DoubleBuffer<KeyT>& d_keys,
899
+ DoubleBuffer<ValueT>& d_values,
900
+ NumItemsT num_items,
901
+ DecomposerT decomposer,
902
+ cudaStream_t stream = 0)
903
+ {
904
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); // NVTX range conditioned on d_temp_storage (presumably skipped for the size query; confirm)
905
+
906
+ // Integer type for global offsets, selected from NumItemsT by detail::choose_offset_t (unsigned, per the original note)
907
+ using offset_t = detail::choose_offset_t<NumItemsT>;
908
+ using decomposer_check_t = detail::radix::decomposer_check_t<KeyT, DecomposerT>;
909
+ // Reject, at compile time, any decomposer that the check trait does not accept.
910
+ static_assert(decomposer_check_t::value,
911
+ "DecomposerT must be a callable object returning a tuple of references to "
912
+ "arithmetic types");
913
+ // The caller's DoubleBuffers may be overwritten, matching this overload's documented contract.
914
+ constexpr bool is_overwrite_okay = true;
915
+ // No bit range is forwarded by this overload; only the decomposer and stream follow num_items.
916
+ return DeviceRadixSort::custom_radix_sort<SortOrder::Ascending>(
917
+ decomposer_check_t{},
918
+ d_temp_storage,
919
+ temp_storage_bytes,
920
+ is_overwrite_okay,
921
+ d_keys,
922
+ d_values,
923
+ static_cast<offset_t>(num_items),
924
+ decomposer,
925
+ stream);
926
+ }
927
+
928
+ //! @rst
929
+ //! Sorts key-value pairs into ascending order using :math:`\approx N` auxiliary storage.
930
+ //!
931
+ //! * The sorting operation is given a pair of key buffers and a corresponding
932
+ //! pair of associated value buffers. Each pair is managed by a DoubleBuffer
933
+ //! structure that indicates which of the two buffers is "current" (and thus
934
+ //! contains the input data to be sorted).
935
+ //! * The contents of both buffers within each pair may be altered by the
936
+ //! sorting operation.
937
+ //! * In-place operations are not supported. There must be no overlap between
938
+ //! any of the provided ranges:
939
+ //!
940
+ //! - ``[d_keys.Current(), d_keys.Current() + num_items)``
941
+ //! - ``[d_keys.Alternate(), d_keys.Alternate() + num_items)``
942
+ //! - ``[d_values.Current(), d_values.Current() + num_items)``
943
+ //! - ``[d_values.Alternate(), d_values.Alternate() + num_items)``
944
+ //!
945
+ //! * Upon completion, the sorting operation will update the "current"
946
+ //! indicator within each DoubleBuffer wrapper to reference which of the two
947
+ //! buffers now contains the sorted output sequence (a function of the
948
+ //! number of key bits specified and the targeted device architecture).
949
+ //! * A bit subrange ``[begin_bit, end_bit)`` is provided to specify
950
+ //! differentiating key bits. This can reduce overall sorting overhead and
951
+ //! yield a corresponding performance improvement.
952
+ //! * @devicestorageP
953
+ //! * @devicestorage
954
+ //!
955
+ //! Snippet
956
+ //! --------------------------------------------------
957
+ //!
958
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
959
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
960
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
961
+ //! tuple of references to relevant members of the key.
962
+ //!
963
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
964
+ //! :language: c++
965
+ //! :dedent:
966
+ //! :start-after: example-begin custom-type
967
+ //! :end-before: example-end custom-type
968
+ //!
969
+ //! The following snippet shows how to sort an array of ``custom_t`` objects
970
+ //! using ``cub::DeviceRadixSort::SortPairs``:
971
+ //!
972
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
973
+ //! :language: c++
974
+ //! :dedent:
975
+ //! :start-after: example-begin pairs-bits-db
976
+ //! :end-before: example-end pairs-bits-db
977
+ //!
978
+ //! @endrst
979
+ //!
980
+ //! @tparam KeyT
981
+ //! **[inferred]** KeyT type
982
+ //!
983
+ //! @tparam ValueT
984
+ //! **[inferred]** ValueT type
985
+ //!
986
+ //! @tparam NumItemsT
987
+ //! **[inferred]** Type of num_items
988
+ //!
989
+ //! @tparam DecomposerT
990
+ //! **[inferred]** Type of a callable object responsible for decomposing a
991
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
992
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
993
+ //! The leftmost element of the tuple is considered the most significant.
994
+ //! The call operator must not modify members of the key.
995
+ //!
996
+ //! @param[in] d_temp_storage
997
+ //! Device-accessible allocation of temporary storage. When ``nullptr``, the
998
+ //! required allocation size is written to ``temp_storage_bytes`` and no work
999
+ //! is done.
1000
+ //!
1001
+ //! @param[in,out] temp_storage_bytes
1002
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
1003
+ //!
1004
+ //! @param[in,out] d_keys
1005
+ //! Reference to the double-buffer of keys whose "current" device-accessible
1006
+ //! buffer contains the unsorted input keys and, upon return, is updated to
1007
+ //! point to the sorted output keys
1008
+ //!
1009
+ //! @param[in,out] d_values
1010
+ //! Double-buffer of values whose "current" device-accessible buffer
1011
+ //! contains the unsorted input values and, upon return, is updated to point
1012
+ //! to the sorted output values
1013
+ //!
1014
+ //! @param[in] num_items
1015
+ //! Number of items to sort
1016
+ //!
1017
+ //! @param[in] decomposer
1018
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
1019
+ //! references to its constituent arithmetic types. The leftmost element of
1020
+ //! the tuple is considered the most significant. The call operator must not
1021
+ //! modify members of the key.
1022
+ //!
1023
+ //! @param[in] begin_bit
1024
+ //! The least-significant bit index (inclusive) needed for
1025
+ //! key comparison
1026
+ //!
1027
+ //! @param[in] end_bit
1028
+ //! The most-significant bit index (exclusive) needed for key
1029
+ //! comparison (e.g., ``(sizeof(float) + sizeof(long long int)) * 8``)
1030
+ //!
1031
+ //! @param[in] stream
1032
+ //! **[optional]** CUDA stream to launch kernels within.
1033
+ //! Default is stream<sub>0</sub>.
1034
+ template <typename KeyT, typename ValueT, typename NumItemsT, typename DecomposerT>
1035
+ CUB_RUNTIME_FUNCTION static //
1036
+ ::cuda::std::enable_if_t< //
1037
+ !::cuda::std::is_convertible_v<DecomposerT, int>, // rejects int-convertible DecomposerT; presumably keeps integral bit arguments off this overload (confirm)
1038
+ cudaError_t>
1039
+ SortPairs(void* d_temp_storage,
1040
+ size_t& temp_storage_bytes,
1041
+ DoubleBuffer<KeyT>& d_keys,
1042
+ DoubleBuffer<ValueT>& d_values,
1043
+ NumItemsT num_items,
1044
+ DecomposerT decomposer,
1045
+ int begin_bit,
1046
+ int end_bit,
1047
+ cudaStream_t stream = 0)
1048
+ {
1049
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); // NVTX range conditioned on d_temp_storage (presumably skipped for the size query; confirm)
1050
+
1051
+ // Integer type for global offsets, selected from NumItemsT by detail::choose_offset_t (unsigned, per the original note)
1052
+ using offset_t = detail::choose_offset_t<NumItemsT>;
1053
+ using decomposer_check_t = detail::radix::decomposer_check_t<KeyT, DecomposerT>;
1054
+ // Reject, at compile time, any decomposer that the check trait does not accept.
1055
+ static_assert(decomposer_check_t::value,
1056
+ "DecomposerT must be a callable object returning a tuple of references to "
1057
+ "arithmetic types");
1058
+ // The caller's DoubleBuffers may be overwritten, matching this overload's documented contract.
1059
+ constexpr bool is_overwrite_okay = true;
1060
+ // Delegate to custom_radix_sort; a default-constructed decomposer_check_t is its first argument.
1061
+ return DeviceRadixSort::custom_radix_sort<SortOrder::Ascending>(
1062
+ decomposer_check_t{},
1063
+ d_temp_storage,
1064
+ temp_storage_bytes,
1065
+ is_overwrite_okay,
1066
+ d_keys,
1067
+ d_values,
1068
+ static_cast<offset_t>(num_items),
1069
+ decomposer,
1070
+ begin_bit,
1071
+ end_bit,
1072
+ stream);
1073
+ }
1074
+
1075
+ //! @rst
1076
+ //! Sorts key-value pairs into descending order using :math:`\approx 2N` auxiliary storage.
1077
+ //!
1078
+ //! - The contents of the input data are not altered by the sorting operation.
1079
+ //! - Pointers to contiguous memory must be used; iterators are not currently
1080
+ //! supported.
1081
+ //! - In-place operations are not supported. There must be no overlap between
1082
+ //! any of the provided ranges:
1083
+ //!
1084
+ //! - ``[d_keys_in, d_keys_in + num_items)``
1085
+ //! - ``[d_keys_out, d_keys_out + num_items)``
1086
+ //! - ``[d_values_in, d_values_in + num_items)``
1087
+ //! - ``[d_values_out, d_values_out + num_items)``
1088
+ //!
1089
+ //! - An optional bit subrange ``[begin_bit, end_bit)`` of differentiating key
1090
+ //! bits can be specified. This can reduce overall sorting overhead and
1091
+ //! yield a corresponding performance improvement.
1092
+ //! - @devicestorageNP For sorting using only :math:`O(P)` temporary storage, see
1093
+ //! the sorting interface using DoubleBuffer wrappers below.
1094
+ //! - @devicestorage
1095
+ //!
1096
+ //! Snippet
1097
+ //! --------------------------------------------------
1098
+ //!
1099
+ //! The code snippet below illustrates the sorting of a device vector of ``int``
1100
+ //! keys with associated vector of ``int`` values.
1101
+ //! @endrst
1102
+ //!
1103
+ //! @code{.cpp}
1104
+ //! #include <cub/cub.cuh>
1105
+ //! // or equivalently <cub/device/device_radix_sort.cuh>
1106
+ //!
1107
+ //! // Declare, allocate, and initialize device-accessible pointers
1108
+ //! // for sorting data
1109
+ //! int num_items; // e.g., 7
1110
+ //! int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
1111
+ //! int *d_keys_out; // e.g., [ ... ]
1112
+ //! int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6]
1113
+ //! int *d_values_out; // e.g., [ ... ]
1114
+ //! ...
1115
+ //!
1116
+ //! // Determine temporary device storage requirements
1117
+ //! void *d_temp_storage = nullptr;
1118
+ //! size_t temp_storage_bytes = 0;
1119
+ //! cub::DeviceRadixSort::SortPairsDescending(
1120
+ //! d_temp_storage, temp_storage_bytes,
1121
+ //! d_keys_in, d_keys_out, d_values_in, d_values_out, num_items);
1122
+ //!
1123
+ //! // Allocate temporary storage
1124
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
1125
+ //!
1126
+ //! // Run sorting operation
1127
+ //! cub::DeviceRadixSort::SortPairsDescending(
1128
+ //! d_temp_storage, temp_storage_bytes,
1129
+ //! d_keys_in, d_keys_out, d_values_in, d_values_out, num_items);
1130
+ //!
1131
+ //! // d_keys_out <-- [9, 8, 7, 6, 5, 3, 0]
1132
+ //! // d_values_out <-- [6, 0, 2, 1, 3, 4, 5]
1133
+ //! @endcode
1134
+ //!
1135
+ //! @tparam KeyT
1136
+ //! **[inferred]** KeyT type
1137
+ //!
1138
+ //! @tparam ValueT
1139
+ //! **[inferred]** ValueT type
1140
+ //!
1141
+ //! @tparam NumItemsT
1142
+ //! **[inferred]** Type of num_items
1143
+ //!
1144
+ //! @param[in] d_temp_storage
1145
+ //! Device-accessible allocation of temporary storage. When ``nullptr``, the
1146
+ //! required allocation size is written to ``temp_storage_bytes`` and no work
1147
+ //! is done.
1148
+ //!
1149
+ //! @param[in,out] temp_storage_bytes
1150
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
1151
+ //!
1152
+ //! @param[in] d_keys_in
1153
+ //! Pointer to the input sequence of key data to sort
1154
+ //!
1155
+ //! @param[out] d_keys_out
1156
+ //! Pointer to the sorted output sequence of key data
1157
+ //!
1158
+ //! @param[in] d_values_in
1159
+ //! Pointer to the corresponding input sequence of associated value items
1160
+ //!
1161
+ //! @param[out] d_values_out
1162
+ //! Pointer to the correspondingly-reordered output sequence of associated
1163
+ //! value items
1164
+ //!
1165
+ //! @param[in] num_items
1166
+ //! Number of items to sort
1167
+ //!
1168
+ //! @param[in] begin_bit
1169
+ //! **[optional]** The least-significant bit index (inclusive) needed for
1170
+ //! key comparison
1171
+ //!
1172
+ //! @param[in] end_bit
1173
+ //! **[optional]** The most-significant bit index (exclusive) needed for key
1174
+ //! comparison (e.g., ``sizeof(unsigned int) * 8``)
1175
+ //!
1176
+ //! @param[in] stream
1177
+ //! **[optional]** CUDA stream to launch kernels within.
1178
+ //! Default is stream<sub>0</sub>.
1179
+ template <typename KeyT, typename ValueT, typename NumItemsT>
1180
+ CUB_RUNTIME_FUNCTION static cudaError_t SortPairsDescending(
1181
+ void* d_temp_storage,
1182
+ size_t& temp_storage_bytes,
1183
+ const KeyT* d_keys_in,
1184
+ KeyT* d_keys_out,
1185
+ const ValueT* d_values_in,
1186
+ ValueT* d_values_out,
1187
+ NumItemsT num_items,
1188
+ int begin_bit = 0,
1189
+ int end_bit = sizeof(KeyT) * 8, // default covers every bit of KeyT (sizeof in bytes times 8)
1190
+ cudaStream_t stream = 0)
1191
+ {
1192
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); // NVTX range conditioned on d_temp_storage (presumably skipped for the size query; confirm)
1193
+
1194
+ // Integer type for global offsets, selected from NumItemsT by detail::choose_offset_t (unsigned, per the original note).
1195
+ using OffsetT = detail::choose_offset_t<NumItemsT>;
1196
+
1197
+ // The const_cast below only adapts the inputs to DoubleBuffer; they are *not* written.
1198
+ // ``DispatchRadixSort::Dispatch`` will allocate temporary storage and
1199
+ // create a new double-buffer internally when the ``is_overwrite_okay`` flag
1200
+ // is false, as it is here.
1201
+ constexpr bool is_overwrite_okay = false;
1202
+ DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
1203
+ DoubleBuffer<ValueT> d_values(const_cast<ValueT*>(d_values_in), d_values_out);
1204
+ // Descending dispatch over the temporary DoubleBuffers with the requested bit range.
1205
+ return DispatchRadixSort<SortOrder::Descending, KeyT, ValueT, OffsetT>::Dispatch(
1206
+ d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, begin_bit, end_bit, is_overwrite_okay, stream);
1207
+ }
1208
+
1209
+ //! @rst
1210
+ //! Sorts key-value pairs into descending order using :math:`\approx 2N` auxiliary storage.
1211
+ //!
1212
+ //! * The contents of the input data are not altered by the sorting operation.
1213
+ //! * Pointers to contiguous memory must be used; iterators are not currently
1214
+ //! supported.
1215
+ //! * In-place operations are not supported. There must be no overlap between
1216
+ //! any of the provided ranges:
1217
+ //!
1218
+ //! * ``[d_keys_in, d_keys_in + num_items)``
1219
+ //! * ``[d_keys_out, d_keys_out + num_items)``
1220
+ //! * ``[d_values_in, d_values_in + num_items)``
1221
+ //! * ``[d_values_out, d_values_out + num_items)``
1222
+ //!
1223
+ //! * A bit subrange ``[begin_bit, end_bit)`` is provided to specify
1224
+ //! differentiating key bits. This can reduce overall sorting overhead and
1225
+ //! yield a corresponding performance improvement.
1226
+ //! * @devicestorageNP For sorting using only :math:`O(P)` temporary storage, see
1227
+ //! the sorting interface using DoubleBuffer wrappers below.
1228
+ //! * @devicestorage
1229
+ //!
1230
+ //! Snippet
1231
+ //! --------------------------------------------------
1232
+ //!
1233
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
1234
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
1235
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
1236
+ //! tuple of references to relevant members of the key.
1237
+ //!
1238
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
1239
+ //! :language: c++
1240
+ //! :dedent:
1241
+ //! :start-after: example-begin custom-type
1242
+ //! :end-before: example-end custom-type
1243
+ //!
1244
+ //! The following snippet shows how to sort an array of ``custom_t`` objects
1245
+ //! using ``cub::DeviceRadixSort::SortPairsDescending``:
1246
+ //!
1247
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
1248
+ //! :language: c++
1249
+ //! :dedent:
1250
+ //! :start-after: example-begin pairs-descending-bits
1251
+ //! :end-before: example-end pairs-descending-bits
1252
+ //!
1253
+ //! @endrst
1254
+ //!
1255
+ //! @tparam KeyT
1256
+ //! **[inferred]** KeyT type
1257
+ //!
1258
+ //! @tparam ValueT
1259
+ //! **[inferred]** ValueT type
1260
+ //!
1261
+ //! @tparam NumItemsT
1262
+ //! **[inferred]** Type of num_items
1263
+ //!
1264
+ //! @tparam DecomposerT
1265
+ //! **[inferred]** Type of a callable object responsible for decomposing a
1266
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
1267
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
1268
+ //! The leftmost element of the tuple is considered the most significant.
1269
+ //! The call operator must not modify members of the key.
1270
+ //!
1271
+ //! @param[in] d_temp_storage
1272
+ //! Device-accessible allocation of temporary storage. When ``nullptr``, the
1273
+ //! required allocation size is written to ``temp_storage_bytes`` and no work
1274
+ //! is done.
1275
+ //!
1276
+ //! @param[in,out] temp_storage_bytes
1277
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
1278
+ //!
1279
+ //! @param[in] d_keys_in
1280
+ //! Pointer to the input sequence of key data to sort
1281
+ //!
1282
+ //! @param[out] d_keys_out
1283
+ //! Pointer to the sorted output sequence of key data
1284
+ //!
1285
+ //! @param[in] d_values_in
1286
+ //! Pointer to the corresponding input sequence of associated value items
1287
+ //!
1288
+ //! @param[out] d_values_out
1289
+ //! Pointer to the correspondingly-reordered output sequence of associated
1290
+ //! value items
1291
+ //!
1292
+ //! @param[in] num_items
1293
+ //! Number of items to sort
1294
+ //!
1295
+ //! @param[in] decomposer
1296
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
1297
+ //! references to its constituent arithmetic types. The leftmost element of
1298
+ //! the tuple is considered the most significant. The call operator must not
1299
+ //! modify members of the key.
1300
+ //!
1301
+ //! @param[in] begin_bit
1302
+ //! **[optional]** The least-significant bit index (inclusive) needed for
1303
+ //! key comparison
1304
+ //!
1305
+ //! @param[in] end_bit
1306
+ //! **[optional]** The most-significant bit index (exclusive) needed for key
1307
+ //! comparison (e.g., ``(sizeof(float) + sizeof(long long int)) * 8``)
1308
+ //!
1309
+ //! @param[in] stream
1310
+ //! **[optional]** CUDA stream to launch kernels within.
1311
+ //! Default is stream<sub>0</sub>.
1312
+ template <typename KeyT, typename ValueT, typename NumItemsT, typename DecomposerT>
1313
+ CUB_RUNTIME_FUNCTION static //
1314
+ ::cuda::std::enable_if_t< //
1315
+ !::cuda::std::is_convertible_v<DecomposerT, int>, //
1316
+ cudaError_t>
1317
+ SortPairsDescending(
1318
+ void* d_temp_storage,
1319
+ size_t& temp_storage_bytes,
1320
+ const KeyT* d_keys_in,
1321
+ KeyT* d_keys_out,
1322
+ const ValueT* d_values_in,
1323
+ ValueT* d_values_out,
1324
+ NumItemsT num_items,
1325
+ DecomposerT decomposer,
1326
+ int begin_bit,
1327
+ int end_bit,
1328
+ cudaStream_t stream = 0)
1329
+ {
1330
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());
1331
+
1332
+ // unsigned integer type for global offsets
1333
+ using offset_t = detail::choose_offset_t<NumItemsT>;
1334
+ using decomposer_check_t = detail::radix::decomposer_check_t<KeyT, DecomposerT>;
1335
+
1336
+ static_assert(decomposer_check_t::value,
1337
+ "DecomposerT must be a callable object returning a tuple of references to "
1338
+ "arithmetic types");
1339
+
1340
+ // We cast away const-ness, but will *not* write to these arrays.
1341
+ // ``DispatchRadixSort::Dispatch`` will allocate temporary storage and
1342
+ // create a new double-buffer internally when the ``is_overwrite_ok`` flag
1343
+ // is not set.
1344
+ constexpr bool is_overwrite_okay = false;
1345
+ DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
1346
+ DoubleBuffer<ValueT> d_values(const_cast<ValueT*>(d_values_in), d_values_out);
1347
+
1348
+ return DeviceRadixSort::custom_radix_sort<SortOrder::Descending>(
1349
+ decomposer_check_t{},
1350
+ d_temp_storage,
1351
+ temp_storage_bytes,
1352
+ is_overwrite_okay,
1353
+ d_keys,
1354
+ d_values,
1355
+ static_cast<offset_t>(num_items),
1356
+ decomposer,
1357
+ begin_bit,
1358
+ end_bit,
1359
+ stream);
1360
+ }
1361
+
1362
+ //! @rst
1363
+ //! Sorts key-value pairs into descending order using :math:`\approx 2N` auxiliary storage.
1364
+ //!
1365
+ //! * The contents of the input data are not altered by the sorting operation.
1366
+ //! * Pointers to contiguous memory must be used; iterators are not currently
1367
+ //! supported.
1368
+ //! * In-place operations are not supported. There must be no overlap between
1369
+ //! any of the provided ranges:
1370
+ //!
1371
+ //! * ``[d_keys_in, d_keys_in + num_items)``
1372
+ //! * ``[d_keys_out, d_keys_out + num_items)``
1373
+ //! * ``[d_values_in, d_values_in + num_items)``
1374
+ //! * ``[d_values_out, d_values_out + num_items)``
1375
+ //!
1376
+ //! * @devicestorageNP For sorting using only :math:`O(P)` temporary storage, see
1377
+ //! the sorting interface using DoubleBuffer wrappers below.
1378
+ //! * @devicestorage
1379
+ //!
1380
+ //! Snippet
1381
+ //! --------------------------------------------------
1382
+ //!
1383
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
1384
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
1385
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
1386
+ //! tuple of references to relevant members of the key.
1387
+ //!
1388
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
1389
+ //! :language: c++
1390
+ //! :dedent:
1391
+ //! :start-after: example-begin custom-type
1392
+ //! :end-before: example-end custom-type
1393
+ //!
1394
+ //! The following snippet shows how to sort an array of ``custom_t`` objects
1395
+ //! using ``cub::DeviceRadixSort::SortPairsDescending``:
1396
+ //!
1397
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
1398
+ //! :language: c++
1399
+ //! :dedent:
1400
+ //! :start-after: example-begin pairs-descending
1401
+ //! :end-before: example-end pairs-descending
1402
+ //!
1403
+ //! @endrst
1404
+ //!
1405
+ //! @tparam KeyT
1406
+ //! **[inferred]** KeyT type
1407
+ //!
1408
+ //! @tparam ValueT
1409
+ //! **[inferred]** ValueT type
1410
+ //!
1411
+ //! @tparam NumItemsT
1412
+ //! **[inferred]** Type of num_items
1413
+ //!
1414
+ //! @tparam DecomposerT
1415
+ //! **[inferred]** Type of a callable object responsible for decomposing a
1416
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
1417
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
1418
+ //! The leftmost element of the tuple is considered the most significant.
1419
+ //! The call operator must not modify members of the key.
1420
+ //!
1421
+ //! @param[in] d_temp_storage
1422
+ //! Device-accessible allocation of temporary storage. When ``nullptr``, the
1423
+ //! required allocation size is written to ``temp_storage_bytes`` and no work
1424
+ //! is done.
1425
+ //!
1426
+ //! @param[in,out] temp_storage_bytes
1427
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
1428
+ //!
1429
+ //! @param[in] d_keys_in
1430
+ //! Pointer to the input sequence of key data to sort
1431
+ //!
1432
+ //! @param[out] d_keys_out
1433
+ //! Pointer to the sorted output sequence of key data
1434
+ //!
1435
+ //! @param[in] d_values_in
1436
+ //! Pointer to the corresponding input sequence of associated value items
1437
+ //!
1438
+ //! @param[out] d_values_out
1439
+ //! Pointer to the correspondingly-reordered output sequence of associated
1440
+ //! value items
1441
+ //!
1442
+ //! @param[in] num_items
1443
+ //! Number of items to sort
1444
+ //!
1445
+ //! @param[in] decomposer
1446
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
1447
+ //! references to its constituent arithmetic types. The leftmost element of
1448
+ //! the tuple is considered the most significant. The call operator must not
1449
+ //! modify members of the key.
1450
+ //!
1451
+ //! @param[in] stream
1452
+ //! **[optional]** CUDA stream to launch kernels within.
1453
+ //! Default is stream<sub>0</sub>.
1454
+ template <typename KeyT, typename ValueT, typename NumItemsT, typename DecomposerT>
1455
+ CUB_RUNTIME_FUNCTION static //
1456
+ ::cuda::std::enable_if_t< //
1457
+ !::cuda::std::is_convertible_v<DecomposerT, int>, //
1458
+ cudaError_t>
1459
+ SortPairsDescending(
1460
+ void* d_temp_storage,
1461
+ size_t& temp_storage_bytes,
1462
+ const KeyT* d_keys_in,
1463
+ KeyT* d_keys_out,
1464
+ const ValueT* d_values_in,
1465
+ ValueT* d_values_out,
1466
+ NumItemsT num_items,
1467
+ DecomposerT decomposer,
1468
+ cudaStream_t stream = 0)
1469
+ {
1470
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());
1471
+
1472
+ // unsigned integer type for global offsets
1473
+ using offset_t = detail::choose_offset_t<NumItemsT>;
1474
+ using decomposer_check_t = detail::radix::decomposer_check_t<KeyT, DecomposerT>;
1475
+
1476
+ static_assert(decomposer_check_t::value,
1477
+ "DecomposerT must be a callable object returning a tuple of references to "
1478
+ "arithmetic types");
1479
+
1480
+ // We cast away const-ness, but will *not* write to these arrays.
1481
+ // ``DispatchRadixSort::Dispatch`` will allocate temporary storage and
1482
+ // create a new double-buffer internally when the ``is_overwrite_ok`` flag
1483
+ // is not set.
1484
+ constexpr bool is_overwrite_okay = false;
1485
+ DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
1486
+ DoubleBuffer<ValueT> d_values(const_cast<ValueT*>(d_values_in), d_values_out);
1487
+
1488
+ return DeviceRadixSort::custom_radix_sort<SortOrder::Descending>(
1489
+ decomposer_check_t{},
1490
+ d_temp_storage,
1491
+ temp_storage_bytes,
1492
+ is_overwrite_okay,
1493
+ d_keys,
1494
+ d_values,
1495
+ static_cast<offset_t>(num_items),
1496
+ decomposer,
1497
+ stream);
1498
+ }
1499
+
1500
+ //! @rst
1501
+ //! Sorts key-value pairs into descending order using :math:`\approx N` auxiliary storage.
1502
+ //!
1503
+ //! - The sorting operation is given a pair of key buffers and a corresponding
1504
+ //! pair of associated value buffers. Each pair is managed by a DoubleBuffer
1505
+ //! structure that indicates which of the two buffers is "current" (and thus
1506
+ //! contains the input data to be sorted).
1507
+ //! - The contents of both buffers within each pair may be altered by the
1508
+ //! sorting operation.
1509
+ //! - In-place operations are not supported. There must be no overlap between
1510
+ //! any of the provided ranges:
1511
+ //!
1512
+ //! - ``[d_keys.Current(), d_keys.Current() + num_items)``
1513
+ //! - ``[d_keys.Alternate(), d_keys.Alternate() + num_items)``
1514
+ //! - ``[d_values.Current(), d_values.Current() + num_items)``
1515
+ //! - ``[d_values.Alternate(), d_values.Alternate() + num_items)``
1516
+ //!
1517
+ //! - Upon completion, the sorting operation will update the "current"
1518
+ //! indicator within each DoubleBuffer wrapper to reference which of the two
1519
+ //! buffers now contains the sorted output sequence (a function of the number
1520
+ //! of key bits specified and the targeted device architecture).
1521
+ //! - An optional bit subrange ``[begin_bit, end_bit)`` of differentiating key
1522
+ //! bits can be specified. This can reduce overall sorting overhead and
1523
+ //! yield a corresponding performance improvement.
1524
+ //! - @devicestorageP
1525
+ //! - @devicestorage
1526
+ //!
1527
+ //! Snippet
1528
+ //! --------------------------------------------------
1529
+ //!
1530
+ //! The code snippet below illustrates the sorting of a device vector of ``int``
1531
+ //! keys with associated vector of ``int`` values.
1532
+ //! @endrst
1533
+ //!
1534
+ //! @code{.cpp}
1535
+ //! #include <cub/cub.cuh>
1536
+ //! // or equivalently <cub/device/device_radix_sort.cuh>
1537
+ //!
1538
+ //! // Declare, allocate, and initialize device-accessible pointers
1539
+ //! // for sorting data
1540
+ //! int num_items; // e.g., 7
1541
+ //! int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9]
1542
+ //! int *d_key_alt_buf; // e.g., [ ... ]
1543
+ //! int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6]
1544
+ //! int *d_value_alt_buf; // e.g., [ ... ]
1545
+ //! ...
1546
+ //!
1547
+ //! // Create a set of DoubleBuffers to wrap pairs of device pointers
1548
+ //! cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
1549
+ //! cub::DoubleBuffer<int> d_values(d_value_buf, d_value_alt_buf);
1550
+ //!
1551
+ //! // Determine temporary device storage requirements
1552
+ //! void *d_temp_storage = nullptr;
1553
+ //! size_t temp_storage_bytes = 0;
1554
+ //! cub::DeviceRadixSort::SortPairsDescending(
1555
+ //! d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items);
1556
+ //!
1557
+ //! // Allocate temporary storage
1558
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
1559
+ //!
1560
+ //! // Run sorting operation
1561
+ //! cub::DeviceRadixSort::SortPairsDescending(
1562
+ //! d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items);
1563
+ //!
1564
+ //! // d_keys.Current() <-- [9, 8, 7, 6, 5, 3, 0]
1565
+ //! // d_values.Current() <-- [6, 0, 2, 1, 3, 4, 5]
1566
+ //! @endcode
1567
+ //!
1568
+ //! @tparam KeyT
1569
+ //! **[inferred]** KeyT type
1570
+ //!
1571
+ //! @tparam ValueT
1572
+ //! **[inferred]** ValueT type
1573
+ //!
1574
+ //! @tparam NumItemsT
1575
+ //! **[inferred]** Type of num_items
1576
+ //!
1577
+ //! @param[in] d_temp_storage
1578
+ //! Device-accessible allocation of temporary storage. When ``nullptr``, the
1579
+ //! required allocation size is written to ``temp_storage_bytes`` and no work
1580
+ //! is done.
1581
+ //!
1582
+ //! @param[in,out] temp_storage_bytes
1583
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
1584
+ //!
1585
+ //! @param[in,out] d_keys
1586
+ //! Reference to the double-buffer of keys whose "current" device-accessible
1587
+ //! buffer contains the unsorted input keys and, upon return, is updated to
1588
+ //! point to the sorted output keys
1589
+ //!
1590
+ //! @param[in,out] d_values
1591
+ //! Double-buffer of values whose "current" device-accessible buffer
1592
+ //! contains the unsorted input values and, upon return, is updated to point
1593
+ //! to the sorted output values
1594
+ //!
1595
+ //! @param[in] num_items
1596
+ //! Number of items to sort
1597
+ //!
1598
+ //! @param[in] begin_bit
1599
+ //! **[optional]** The least-significant bit index (inclusive) needed for
1600
+ //! key comparison
1601
+ //!
1602
+ //! @param[in] end_bit
1603
+ //! **[optional]** The most-significant bit index (exclusive) needed for key
1604
+ //! comparison (e.g., ``sizeof(unsigned int) * 8``)
1605
+ //!
1606
+ //! @param[in] stream
1607
+ //! **[optional]** CUDA stream to launch kernels within.
1608
+ //! Default is stream<sub>0</sub>.
1609
+ template <typename KeyT, typename ValueT, typename NumItemsT>
1610
+ CUB_RUNTIME_FUNCTION static cudaError_t SortPairsDescending(
1611
+ void* d_temp_storage,
1612
+ size_t& temp_storage_bytes,
1613
+ DoubleBuffer<KeyT>& d_keys,
1614
+ DoubleBuffer<ValueT>& d_values,
1615
+ NumItemsT num_items,
1616
+ int begin_bit = 0,
1617
+ int end_bit = sizeof(KeyT) * 8,
1618
+ cudaStream_t stream = 0)
1619
+ {
1620
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());
1621
+
1622
+ // Unsigned integer type for global offsets.
1623
+ using OffsetT = detail::choose_offset_t<NumItemsT>;
1624
+
1625
+ constexpr bool is_overwrite_okay = true;
1626
+
1627
+ return DispatchRadixSort<SortOrder::Descending, KeyT, ValueT, OffsetT>::Dispatch(
1628
+ d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, begin_bit, end_bit, is_overwrite_okay, stream);
1629
+ }
1630
+
1631
+ //! @rst
1632
+ //! Sorts key-value pairs into descending order using :math:`\approx N` auxiliary storage.
1633
+ //!
1634
+ //! * The sorting operation is given a pair of key buffers and a corresponding
1635
+ //! pair of associated value buffers. Each pair is managed by a DoubleBuffer
1636
+ //! structure that indicates which of the two buffers is "current" (and thus
1637
+ //! contains the input data to be sorted).
1638
+ //! * The contents of both buffers within each pair may be altered by the
1639
+ //! sorting operation.
1640
+ //! * In-place operations are not supported. There must be no overlap between
1641
+ //! any of the provided ranges:
1642
+ //!
1643
+ //! - ``[d_keys.Current(), d_keys.Current() + num_items)``
1644
+ //! - ``[d_keys.Alternate(), d_keys.Alternate() + num_items)``
1645
+ //! - ``[d_values.Current(), d_values.Current() + num_items)``
1646
+ //! - ``[d_values.Alternate(), d_values.Alternate() + num_items)``
1647
+ //!
1648
+ //! * Upon completion, the sorting operation will update the "current"
1649
+ //! indicator within each DoubleBuffer wrapper to reference which of the two
1650
+ //! buffers now contains the sorted output sequence (a function of the
1651
+ //! number of key bits specified and the targeted device architecture).
1652
+ //! * @devicestorageP
1653
+ //! * @devicestorage
1654
+ //!
1655
+ //! Snippet
1656
+ //! --------------------------------------------------
1657
+ //!
1658
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
1659
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
1660
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
1661
+ //! tuple of references to relevant members of the key.
1662
+ //!
1663
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
1664
+ //! :language: c++
1665
+ //! :dedent:
1666
+ //! :start-after: example-begin custom-type
1667
+ //! :end-before: example-end custom-type
1668
+ //!
1669
+ //! The following snippet shows how to sort an array of ``custom_t`` objects
1670
+ //! using ``cub::DeviceRadixSort::SortPairsDescending``:
1671
+ //!
1672
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
1673
+ //! :language: c++
1674
+ //! :dedent:
1675
+ //! :start-after: example-begin pairs-descending-db
1676
+ //! :end-before: example-end pairs-descending-db
1677
+ //!
1678
+ //! @endrst
1679
+ //!
1680
+ //! @tparam KeyT
1681
+ //! **[inferred]** KeyT type
1682
+ //!
1683
+ //! @tparam ValueT
1684
+ //! **[inferred]** ValueT type
1685
+ //!
1686
+ //! @tparam NumItemsT
1687
+ //! **[inferred]** Type of num_items
1688
+ //!
1689
+ //! @tparam DecomposerT
1690
+ //! **[inferred]** Type of a callable object responsible for decomposing a
1691
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
1692
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
1693
+ //! The leftmost element of the tuple is considered the most significant.
1694
+ //! The call operator must not modify members of the key.
1695
+ //!
1696
+ //! @param[in] d_temp_storage
1697
+ //! Device-accessible allocation of temporary storage. When ``nullptr``, the
1698
+ //! required allocation size is written to ``temp_storage_bytes`` and no work
1699
+ //! is done.
1700
+ //!
1701
+ //! @param[in,out] temp_storage_bytes
1702
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
1703
+ //!
1704
+ //! @param[in,out] d_keys
1705
+ //! Reference to the double-buffer of keys whose "current" device-accessible
1706
+ //! buffer contains the unsorted input keys and, upon return, is updated to
1707
+ //! point to the sorted output keys
1708
+ //!
1709
+ //! @param[in,out] d_values
1710
+ //! Double-buffer of values whose "current" device-accessible buffer
1711
+ //! contains the unsorted input values and, upon return, is updated to point
1712
+ //! to the sorted output values
1713
+ //!
1714
+ //! @param[in] num_items
1715
+ //! Number of items to sort
1716
+ //!
1717
+ //! @param[in] decomposer
1718
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
1719
+ //! references to its constituent arithmetic types. The leftmost element of
1720
+ //! the tuple is considered the most significant. The call operator must not
1721
+ //! modify members of the key.
1722
+ //!
1723
+ //! @param[in] stream
1724
+ //! **[optional]** CUDA stream to launch kernels within.
1725
+ //! Default is stream<sub>0</sub>.
1726
+ template <typename KeyT, typename ValueT, typename NumItemsT, typename DecomposerT>
1727
+ CUB_RUNTIME_FUNCTION static //
1728
+ ::cuda::std::enable_if_t< //
1729
+ !::cuda::std::is_convertible_v<DecomposerT, int>, //
1730
+ cudaError_t>
1731
+ SortPairsDescending(
1732
+ void* d_temp_storage,
1733
+ size_t& temp_storage_bytes,
1734
+ DoubleBuffer<KeyT>& d_keys,
1735
+ DoubleBuffer<ValueT>& d_values,
1736
+ NumItemsT num_items,
1737
+ DecomposerT decomposer,
1738
+ cudaStream_t stream = 0)
1739
+ {
1740
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());
1741
+
1742
+ // unsigned integer type for global offsets
1743
+ using offset_t = detail::choose_offset_t<NumItemsT>;
1744
+ using decomposer_check_t = detail::radix::decomposer_check_t<KeyT, DecomposerT>;
1745
+
1746
+ static_assert(decomposer_check_t::value,
1747
+ "DecomposerT must be a callable object returning a tuple of references to "
1748
+ "arithmetic types");
1749
+
1750
+ constexpr bool is_overwrite_okay = true;
1751
+
1752
+ return DeviceRadixSort::custom_radix_sort<SortOrder::Descending>(
1753
+ decomposer_check_t{},
1754
+ d_temp_storage,
1755
+ temp_storage_bytes,
1756
+ is_overwrite_okay,
1757
+ d_keys,
1758
+ d_values,
1759
+ static_cast<offset_t>(num_items),
1760
+ decomposer,
1761
+ stream);
1762
+ }
1763
+
1764
+ //! @rst
1765
+ //! Sorts key-value pairs into descending order using :math:`\approx N` auxiliary storage.
1766
+ //!
1767
+ //! * The sorting operation is given a pair of key buffers and a corresponding
1768
+ //! pair of associated value buffers. Each pair is managed by a DoubleBuffer
1769
+ //! structure that indicates which of the two buffers is "current" (and thus
1770
+ //! contains the input data to be sorted).
1771
+ //! * The contents of both buffers within each pair may be altered by the
1772
+ //! sorting operation.
1773
+ //! * In-place operations are not supported. There must be no overlap between
1774
+ //! any of the provided ranges:
1775
+ //!
1776
+ //! - ``[d_keys.Current(), d_keys.Current() + num_items)``
1777
+ //! - ``[d_keys.Alternate(), d_keys.Alternate() + num_items)``
1778
+ //! - ``[d_values.Current(), d_values.Current() + num_items)``
1779
+ //! - ``[d_values.Alternate(), d_values.Alternate() + num_items)``
1780
+ //!
1781
+ //! * Upon completion, the sorting operation will update the "current"
1782
+ //! indicator within each DoubleBuffer wrapper to reference which of the two
1783
+ //! buffers now contains the sorted output sequence (a function of the
1784
+ //! number of key bits specified and the targeted device architecture).
1785
+ //! * An optional bit subrange ``[begin_bit, end_bit)`` of differentiating key
1786
+ //! bits can be specified. This can reduce overall sorting overhead and
1787
+ //! yield a corresponding performance improvement.
1788
+ //! * @devicestorageP
1789
+ //! * @devicestorage
1790
+ //!
1791
+ //! Snippet
1792
+ //! --------------------------------------------------
1793
+ //!
1794
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
1795
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
1796
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
1797
+ //! tuple of references to relevant members of the key.
1798
+ //!
1799
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
1800
+ //! :language: c++
1801
+ //! :dedent:
1802
+ //! :start-after: example-begin custom-type
1803
+ //! :end-before: example-end custom-type
1804
+ //!
1805
+ //! The following snippet shows how to sort an array of ``custom_t`` objects
1806
+ //! using ``cub::DeviceRadixSort::SortPairsDescending``:
1807
+ //!
1808
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
1809
+ //! :language: c++
1810
+ //! :dedent:
1811
+ //! :start-after: example-begin pairs-descending-bits-db
1812
+ //! :end-before: example-end pairs-descending-bits-db
1813
+ //!
1814
+ //! @endrst
1815
+ //!
1816
+ //! @tparam KeyT
1817
+ //! **[inferred]** KeyT type
1818
+ //!
1819
+ //! @tparam ValueT
1820
+ //! **[inferred]** ValueT type
1821
+ //!
1822
+ //! @tparam NumItemsT
1823
+ //! **[inferred]** Type of num_items
1824
+ //!
1825
+ //! @tparam DecomposerT
1826
+ //! **[inferred]** Type of a callable object responsible for decomposing a
1827
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
1828
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
1829
+ //! The leftmost element of the tuple is considered the most significant.
1830
+ //! The call operator must not modify members of the key.
1831
+ //!
1832
+ //! @param[in] d_temp_storage
1833
+ //! Device-accessible allocation of temporary storage. When ``nullptr``, the
1834
+ //! required allocation size is written to ``temp_storage_bytes`` and no work
1835
+ //! is done.
1836
+ //!
1837
+ //! @param[in,out] temp_storage_bytes
1838
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
1839
+ //!
1840
+ //! @param[in,out] d_keys
1841
+ //! Reference to the double-buffer of keys whose "current" device-accessible
1842
+ //! buffer contains the unsorted input keys and, upon return, is updated to
1843
+ //! point to the sorted output keys
1844
+ //!
1845
+ //! @param[in,out] d_values
1846
+ //! Double-buffer of values whose "current" device-accessible buffer
1847
+ //! contains the unsorted input values and, upon return, is updated to point
1848
+ //! to the sorted output values
1849
+ //!
1850
+ //! @param[in] num_items
1851
+ //! Number of items to sort
1852
+ //!
1853
+ //! @param[in] decomposer
1854
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
1855
+ //! references to its constituent arithmetic types. The leftmost element of
1856
+ //! the tuple is considered the most significant. The call operator must not
1857
+ //! modify members of the key.
1858
+ //!
1859
+ //! @param[in] begin_bit
1860
+ //! **[optional]** The least-significant bit index (inclusive) needed for
1861
+ //! key comparison
1862
+ //!
1863
+ //! @param[in] end_bit
1864
+ //! **[optional]** The most-significant bit index (exclusive) needed for key
1865
+ //! comparison (e.g., ``(sizeof(float) + sizeof(long long int)) * 8``)
1866
+ //!
1867
+ //! @param[in] stream
1868
+ //! **[optional]** CUDA stream to launch kernels within.
1869
+ //! Default is stream<sub>0</sub>.
1870
+ template <typename KeyT, typename ValueT, typename NumItemsT, typename DecomposerT>
1871
+ CUB_RUNTIME_FUNCTION static //
1872
+ ::cuda::std::enable_if_t< //
1873
+ !::cuda::std::is_convertible_v<DecomposerT, int>, //
1874
+ cudaError_t>
1875
+ SortPairsDescending(
1876
+ void* d_temp_storage,
1877
+ size_t& temp_storage_bytes,
1878
+ DoubleBuffer<KeyT>& d_keys,
1879
+ DoubleBuffer<ValueT>& d_values,
1880
+ NumItemsT num_items,
1881
+ DecomposerT decomposer,
1882
+ int begin_bit,
1883
+ int end_bit,
1884
+ cudaStream_t stream = 0)
1885
+ {
1886
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());
1887
+
1888
+ // unsigned integer type for global offsets
1889
+ using offset_t = detail::choose_offset_t<NumItemsT>;
1890
+ using decomposer_check_t = detail::radix::decomposer_check_t<KeyT, DecomposerT>;
1891
+
1892
+ static_assert(decomposer_check_t::value,
1893
+ "DecomposerT must be a callable object returning a tuple of references to "
1894
+ "arithmetic types");
1895
+
1896
+ constexpr bool is_overwrite_okay = true;
1897
+
1898
+ return DeviceRadixSort::custom_radix_sort<SortOrder::Descending>(
1899
+ decomposer_check_t{},
1900
+ d_temp_storage,
1901
+ temp_storage_bytes,
1902
+ is_overwrite_okay,
1903
+ d_keys,
1904
+ d_values,
1905
+ static_cast<offset_t>(num_items),
1906
+ decomposer,
1907
+ begin_bit,
1908
+ end_bit,
1909
+ stream);
1910
+ }
1911
+
1912
+ //! @} end member group
1913
+ //! @name Keys-only
1914
+ //! @{
1915
+
1916
+ //! @rst
1917
+ //! Sorts keys into ascending order using :math:`\approx 2N` auxiliary storage.
1918
+ //!
1919
+ //! - The contents of the input data are not altered by the sorting operation.
1920
+ //! - Pointers to contiguous memory must be used; iterators are not currently
1921
+ //! supported.
1922
+ //! - In-place operations are not supported. There must be no overlap between
1923
+ //! any of the provided ranges:
1924
+ //!
1925
+ //! - ``[d_keys_in, d_keys_in + num_items)``
1926
+ //! - ``[d_keys_out, d_keys_out + num_items)``
1927
+ //!
1928
+ //! - An optional bit subrange ``[begin_bit, end_bit)`` of differentiating key
1929
+ //! bits can be specified. This can reduce overall sorting overhead and
1930
+ //! yield a corresponding performance improvement.
1931
+ //! - @devicestorageNP For sorting using only ``O(P)`` temporary storage, see
1932
+ //! the sorting interface using DoubleBuffer wrappers below.
1933
+ //! - @devicestorage
1934
+ //!
1935
+ //! Snippet
1936
+ //! --------------------------------------------------
1937
+ //!
1938
+ //! The code snippet below illustrates the sorting of a device vector of
1939
+ //! ``int`` keys.
1940
+ //! @endrst
1941
+ //!
1942
+ //! @code{.cpp}
1943
+ //! #include <cub/cub.cuh>
1944
+ //! // or equivalently <cub/device/device_radix_sort.cuh>
1945
+ //!
1946
+ //! // Declare, allocate, and initialize device-accessible pointers
1947
+ //! // for sorting data
1948
+ //! int num_items; // e.g., 7
1949
+ //! int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
1950
+ //! int *d_keys_out; // e.g., [ ... ]
1951
+ //! ...
1952
+ //!
1953
+ //! // Determine temporary device storage requirements
1954
+ //! void *d_temp_storage = nullptr;
1955
+ //! size_t temp_storage_bytes = 0;
1956
+ //! cub::DeviceRadixSort::SortKeys(
1957
+ //! d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items);
1958
+ //!
1959
+ //! // Allocate temporary storage
1960
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
1961
+ //!
1962
+ //! // Run sorting operation
1963
+ //! cub::DeviceRadixSort::SortKeys(
1964
+ //! d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items);
1965
+ //!
1966
+ //! // d_keys_out <-- [0, 3, 5, 6, 7, 8, 9]
1967
+ //! @endcode
1968
+ //!
1969
+ //! @tparam KeyT
1970
+ //! **[inferred]** KeyT type
1971
+ //!
1972
+ //! @tparam NumItemsT
1973
+ //! **[inferred]** Type of num_items
1974
+ //!
1975
+ //! @tparam NumItemsT
1976
+ //! **[inferred]** Type of num_items
1977
+ //!
1978
+ //! @param[in] d_temp_storage
1979
+ //! Device-accessible allocation of temporary storage. When ``nullptr``, the
1980
+ //! required allocation size is written to ``temp_storage_bytes`` and no work
1981
+ //! is done.
1982
+ //!
1983
+ //! @param[in,out] temp_storage_bytes
1984
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
1985
+ //!
1986
+ //! @param[in] d_keys_in
1987
+ //! Pointer to the input data of key data to sort
1988
+ //!
1989
+ //! @param[out] d_keys_out
1990
+ //! Pointer to the sorted output sequence of key data
1991
+ //!
1992
+ //! @param[in] num_items
1993
+ //! Number of items to sort
1994
+ //!
1995
+ //! @param[in] begin_bit
1996
+ //! **[optional]** The least-significant bit index (inclusive) needed for
1997
+ //! key comparison
1998
+ //!
1999
+ //! @param[in] end_bit
2000
+ //! **[optional]** The most-significant bit index (exclusive) needed for key
2001
+ //! comparison (e.g., ``sizeof(unsigned int) * 8``)
2002
+ //!
2003
+ //! @param[in] stream
2004
+ //! **[optional]** CUDA stream to launch kernels within.
2005
+ //! Default is stream<sub>0</sub>.
2006
+ template <typename KeyT, typename NumItemsT>
2007
+ CUB_RUNTIME_FUNCTION static cudaError_t SortKeys(
2008
+ void* d_temp_storage,
2009
+ size_t& temp_storage_bytes,
2010
+ const KeyT* d_keys_in,
2011
+ KeyT* d_keys_out,
2012
+ NumItemsT num_items,
2013
+ int begin_bit = 0, // default: start at bit 0
2014
+ int end_bit = sizeof(KeyT) * 8, // default: one past the last bit of KeyT
2015
+ cudaStream_t stream = 0)
2016
+ {
2017
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); // NOTE(review): presumably opens an NVTX range only when d_temp_storage is non-null - confirm
2018
+
2019
+ // Unsigned integer type for global offsets, selected from NumItemsT.
2020
+ using OffsetT = detail::choose_offset_t<NumItemsT>;
2021
+
2022
+ // The input pointer is const, but DoubleBuffer<KeyT> stores non-const
2023
+ // pointers, so const-ness is cast away here. The input is *not* written:
2024
+ // with ``is_overwrite_okay`` false, ``DispatchRadixSort::Dispatch`` allocates
2025
+ // temporary storage and creates a new double-buffer internally.
2026
+ constexpr bool is_overwrite_okay = false;
2027
+ DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
2028
+ // Keys-only sort: the value buffer uses NullType and is default-constructed
2029
+ DoubleBuffer<NullType> d_values;
2030
+
2031
+ return DispatchRadixSort<SortOrder::Ascending, KeyT, NullType, OffsetT>::Dispatch(
2032
+ d_temp_storage,
2033
+ temp_storage_bytes,
2034
+ d_keys,
2035
+ d_values,
2036
+ static_cast<OffsetT>(num_items),
2037
+ begin_bit,
2038
+ end_bit,
2039
+ is_overwrite_okay,
2040
+ stream);
2041
+ }
2042
+
2043
+ //! @rst
2044
+ //! Sorts keys into ascending order using :math:`\approx 2N` auxiliary storage.
2045
+ //!
2046
+ //! * The contents of the input data are not altered by the sorting operation.
2047
+ //! * Pointers to contiguous memory must be used; iterators are not currently
2048
+ //! supported.
2049
+ //! * In-place operations are not supported. There must be no overlap between
2050
+ //! any of the provided ranges:
2051
+ //!
2052
+ //! * ``[d_keys_in, d_keys_in + num_items)``
2053
+ //! * ``[d_keys_out, d_keys_out + num_items)``
2054
+ //!
2055
+ //! * A bit subrange ``[begin_bit, end_bit)`` is provided to specify
2056
+ //! differentiating key bits. This can reduce overall sorting overhead and
2057
+ //! yield a corresponding performance improvement.
2058
+ //! * @devicestorageNP For sorting using only :math:`O(P)` temporary storage, see
2059
+ //! the sorting interface using DoubleBuffer wrappers below.
2060
+ //! * @devicestorage
2061
+ //!
2062
+ //! Snippet
2063
+ //! --------------------------------------------------
2064
+ //!
2065
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
2066
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
2067
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
2068
+ //! tuple of references to relevant members of the key.
2069
+ //!
2070
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
2071
+ //! :language: c++
2072
+ //! :dedent:
2073
+ //! :start-after: example-begin custom-type
2074
+ //! :end-before: example-end custom-type
2075
+ //!
2076
+ //! The following snippet shows how to sort an array of ``custom_t`` objects
2077
+ //! using ``cub::DeviceRadixSort::SortKeys``:
2078
+ //!
2079
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
2080
+ //! :language: c++
2081
+ //! :dedent:
2082
+ //! :start-after: example-begin keys-bits
2083
+ //! :end-before: example-end keys-bits
2084
+ //!
2085
+ //! @endrst
2086
+ //!
2087
+ //! @tparam KeyT
2088
+ //! **[inferred]** KeyT type
2089
+ //!
2090
+ //! @tparam NumItemsT
2091
+ //! **[inferred]** Type of num_items
2092
+ //!
2093
+ //! @tparam DecomposerT
2094
+ //! **[inferred]** Type of a callable object responsible for decomposing a
2095
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
2096
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
2097
+ //! The leftmost element of the tuple is considered the most significant.
2098
+ //! The call operator must not modify members of the key.
2099
+ //!
2100
+ //! @param[in] d_temp_storage
2101
+ //! Device-accessible allocation of temporary storage. When ``nullptr``, the
2102
+ //! required allocation size is written to ``temp_storage_bytes`` and no work
2103
+ //! is done.
2104
+ //!
2105
+ //! @param[in,out] temp_storage_bytes
2106
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
2107
+ //!
2108
+ //! @param[in] d_keys_in
2109
+ //! Pointer to the input data of key data to sort
2110
+ //!
2111
+ //! @param[out] d_keys_out
2112
+ //! Pointer to the sorted output sequence of key data
2113
+ //!
2114
+ //! @param[in] num_items
2115
+ //! Number of items to sort
2116
+ //!
2117
+ //! @param decomposer
2118
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
2119
+ //! references to its constituent arithmetic types. The leftmost element of
2120
+ //! the tuple is considered the most significant. The call operator must not
2121
+ //! modify members of the key.
2122
+ //!
2123
+ //! @param[in] begin_bit
2124
+ //! The least-significant bit index (inclusive) needed for
2125
+ //! key comparison
2126
+ //!
2127
+ //! @param[in] end_bit
2128
+ //! The most-significant bit index (exclusive) needed for key
2129
+ //! comparison (e.g., ``(sizeof(float) + sizeof(long long int)) * 8``)
2130
+ //!
2131
+ //! @param[in] stream
2132
+ //! **[optional]** CUDA stream to launch kernels within.
2133
+ //! Default is stream<sub>0</sub>.
2134
+ template <typename KeyT, typename NumItemsT, typename DecomposerT>
2135
+ CUB_RUNTIME_FUNCTION static //
2136
+ ::cuda::std::enable_if_t< //
2137
+ !::cuda::std::is_convertible_v<DecomposerT, int>, // overload is disabled for int-convertible types; presumably keeps calls that pass begin_bit in this position on the non-decomposer overload
2138
+ cudaError_t>
2139
+ SortKeys(void* d_temp_storage,
2140
+ size_t& temp_storage_bytes,
2141
+ const KeyT* d_keys_in,
2142
+ KeyT* d_keys_out,
2143
+ NumItemsT num_items,
2144
+ DecomposerT decomposer,
2145
+ int begin_bit,
2146
+ int end_bit,
2147
+ cudaStream_t stream = 0)
2148
+ {
2149
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); // NOTE(review): presumably opens an NVTX range only when d_temp_storage is non-null - confirm
2150
+
2151
+ // Unsigned integer type for global offsets, selected from NumItemsT
2152
+ using offset_t = detail::choose_offset_t<NumItemsT>;
2153
+ using decomposer_check_t = detail::radix::decomposer_check_t<KeyT, DecomposerT>; // compile-time check of DecomposerT against KeyT
2154
+
2155
+ static_assert(decomposer_check_t::value,
2156
+ "DecomposerT must be a callable object returning a tuple of references to "
2157
+ "arithmetic types");
2158
+
2159
+ // The input pointer is const, but DoubleBuffer<KeyT> stores non-const
2160
+ // pointers, so const-ness is cast away here. The input is *not* written:
2161
+ // with ``is_overwrite_okay`` false, ``DispatchRadixSort::Dispatch`` allocates
2162
+ // temporary storage and creates a new double-buffer internally.
2163
+ constexpr bool is_overwrite_okay = false;
2164
+ DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
2165
+ DoubleBuffer<NullType> d_values; // keys-only sort: NullType value buffer, default-constructed
2166
+
2167
+ return DeviceRadixSort::custom_radix_sort<SortOrder::Ascending>(
2168
+ decomposer_check_t{}, // instance of the check type; NOTE(review): looks like tag dispatch - confirm
2169
+ d_temp_storage,
2170
+ temp_storage_bytes,
2171
+ is_overwrite_okay,
2172
+ d_keys,
2173
+ d_values,
2174
+ static_cast<offset_t>(num_items),
2175
+ decomposer,
2176
+ begin_bit,
2177
+ end_bit,
2178
+ stream);
2179
+ }
2180
+
2181
+ //! @rst
2182
+ //! Sorts keys into ascending order using :math:`\approx 2N` auxiliary storage.
2183
+ //!
2184
+ //! * The contents of the input data are not altered by the sorting operation.
2185
+ //! * Pointers to contiguous memory must be used; iterators are not currently
2186
+ //! supported.
2187
+ //! * In-place operations are not supported. There must be no overlap between
2188
+ //! any of the provided ranges:
2189
+ //!
2190
+ //! * ``[d_keys_in, d_keys_in + num_items)``
2191
+ //! * ``[d_keys_out, d_keys_out + num_items)``
2192
+ //!
2193
+ //! * An optional bit subrange ``[begin_bit, end_bit)`` of differentiating key
2194
+ //! bits can be specified. This can reduce overall sorting overhead and
2195
+ //! yield a corresponding performance improvement.
2196
+ //! * @devicestorageNP For sorting using only :math:`O(P)` temporary storage, see
2197
+ //! the sorting interface using DoubleBuffer wrappers below.
2198
+ //! * @devicestorage
2199
+ //!
2200
+ //! Snippet
2201
+ //! --------------------------------------------------
2202
+ //!
2203
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
2204
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
2205
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
2206
+ //! tuple of references to relevant members of the key.
2207
+ //!
2208
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
2209
+ //! :language: c++
2210
+ //! :dedent:
2211
+ //! :start-after: example-begin custom-type
2212
+ //! :end-before: example-end custom-type
2213
+ //!
2214
+ //! The following snippet shows how to sort an array of ``custom_t`` objects
2215
+ //! using ``cub::DeviceRadixSort::SortKeys``:
2216
+ //!
2217
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
2218
+ //! :language: c++
2219
+ //! :dedent:
2220
+ //! :start-after: example-begin keys
2221
+ //! :end-before: example-end keys
2222
+ //!
2223
+ //! @endrst
2224
+ //!
2225
+ //! @tparam KeyT
2226
+ //! **[inferred]** KeyT type
2227
+ //!
2228
+ //! @tparam NumItemsT
2229
+ //! **[inferred]** Type of num_items
2230
+ //!
2231
+ //! @tparam DecomposerT
2232
+ //! **[inferred]** Type of a callable object responsible for decomposing a
2233
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
2234
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
2235
+ //! The leftmost element of the tuple is considered the most significant.
2236
+ //! The call operator must not modify members of the key.
2237
+ //!
2238
+ //! @param[in] d_temp_storage
2239
+ //! Device-accessible allocation of temporary storage. When ``nullptr``, the
2240
+ //! required allocation size is written to ``temp_storage_bytes`` and no work
2241
+ //! is done.
2242
+ //!
2243
+ //! @param[in,out] temp_storage_bytes
2244
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
2245
+ //!
2246
+ //! @param[in] d_keys_in
2247
+ //! Pointer to the input data of key data to sort
2248
+ //!
2249
+ //! @param[out] d_keys_out
2250
+ //! Pointer to the sorted output sequence of key data
2251
+ //!
2252
+ //! @param[in] num_items
2253
+ //! Number of items to sort
2254
+ //!
2255
+ //! @param decomposer
2256
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
2257
+ //! references to its constituent arithmetic types. The leftmost element of
2258
+ //! the tuple is considered the most significant. The call operator must not
2259
+ //! modify members of the key.
2260
+ //!
2261
+ //! @param[in] stream
2262
+ //! **[optional]** CUDA stream to launch kernels within.
2263
+ //! Default is stream<sub>0</sub>.
2264
+ template <typename KeyT, typename NumItemsT, typename DecomposerT>
2265
+ CUB_RUNTIME_FUNCTION static //
2266
+ ::cuda::std::enable_if_t< //
2267
+ !::cuda::std::is_convertible_v<DecomposerT, int>, // overload is disabled for int-convertible types; presumably keeps calls that pass begin_bit in this position on the non-decomposer overload
2268
+ cudaError_t>
2269
+ SortKeys(void* d_temp_storage,
2270
+ size_t& temp_storage_bytes,
2271
+ const KeyT* d_keys_in,
2272
+ KeyT* d_keys_out,
2273
+ NumItemsT num_items,
2274
+ DecomposerT decomposer,
2275
+ cudaStream_t stream = 0)
2276
+ {
2277
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); // NOTE(review): presumably opens an NVTX range only when d_temp_storage is non-null - confirm
2278
+
2279
+ // Unsigned integer type for global offsets, selected from NumItemsT
2280
+ using offset_t = detail::choose_offset_t<NumItemsT>;
2281
+ using decomposer_check_t = detail::radix::decomposer_check_t<KeyT, DecomposerT>; // compile-time check of DecomposerT against KeyT
2282
+
2283
+ static_assert(decomposer_check_t::value,
2284
+ "DecomposerT must be a callable object returning a tuple of references to "
2285
+ "arithmetic types");
2286
+
2287
+ // The input pointer is const, but DoubleBuffer<KeyT> stores non-const
2288
+ // pointers, so const-ness is cast away here. The input is *not* written:
2289
+ // with ``is_overwrite_okay`` false, ``DispatchRadixSort::Dispatch`` allocates
2290
+ // temporary storage and creates a new double-buffer internally.
2291
+ constexpr bool is_overwrite_okay = false;
2292
+ DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
2293
+ DoubleBuffer<NullType> d_values; // keys-only sort: NullType value buffer, default-constructed
2294
+
2295
+ return DeviceRadixSort::custom_radix_sort<SortOrder::Ascending>(
2296
+ decomposer_check_t{}, // instance of the check type; NOTE(review): looks like tag dispatch - confirm
2297
+ d_temp_storage,
2298
+ temp_storage_bytes,
2299
+ is_overwrite_okay,
2300
+ d_keys,
2301
+ d_values,
2302
+ static_cast<offset_t>(num_items),
2303
+ decomposer, // no begin_bit/end_bit are forwarded by this overload
2304
+ stream);
2305
+ }
2306
+
2307
+ //! @rst
2308
+ //! Sorts keys into ascending order using :math:`\approx N` auxiliary storage.
2309
+ //!
2310
+ //! - The sorting operation is given a pair of key buffers managed by a
2311
+ //! DoubleBuffer structure that indicates which of the two buffers is
2312
+ //! "current" (and thus contains the input data to be sorted).
2313
+ //! - The contents of both buffers may be altered by the sorting operation.
2314
+ //! - In-place operations are not supported. There must be no overlap between
2315
+ //! any of the provided ranges:
2316
+ //!
2317
+ //! - ``[d_keys.Current(), d_keys.Current() + num_items)``
2318
+ //! - ``[d_keys.Alternate(), d_keys.Alternate() + num_items)``
2319
+ //!
2320
+ //! - Upon completion, the sorting operation will update the "current"
2321
+ //! indicator within the DoubleBuffer wrapper to reference which of the two
2322
+ //! buffers now contains the sorted output sequence (a function of the
2323
+ //! number of key bits specified and the targeted device architecture).
2324
+ //! - An optional bit subrange ``[begin_bit, end_bit)`` of differentiating key
2325
+ //! bits can be specified. This can reduce overall sorting overhead and
2326
+ //! yield a corresponding performance improvement.
2327
+ //! - @devicestorageP
2328
+ //! - @devicestorage
2329
+ //!
2330
+ //! Snippet
2331
+ //! --------------------------------------------------
2332
+ //!
2333
+ //! The code snippet below illustrates the sorting of a device vector of
2334
+ //! ``int`` keys.
2335
+ //! @endrst
2336
+ //!
2337
+ //! @code{.cpp}
2338
+ //! #include <cub/cub.cuh>
2339
+ //! // or equivalently <cub/device/device_radix_sort.cuh>
2340
+ //!
2341
+ //! // Declare, allocate, and initialize device-accessible pointers
2342
+ //! // for sorting data
2343
+ //! int num_items; // e.g., 7
2344
+ //! int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9]
2345
+ //! int *d_key_alt_buf; // e.g., [ ... ]
2346
+ //! ...
2347
+ //!
2348
+ //! // Create a DoubleBuffer to wrap the pair of device pointers
2349
+ //! cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
2350
+ //!
2351
+ //! // Determine temporary device storage requirements
2352
+ //! void *d_temp_storage = nullptr;
2353
+ //! size_t temp_storage_bytes = 0;
2354
+ //! cub::DeviceRadixSort::SortKeys(
2355
+ //! d_temp_storage, temp_storage_bytes, d_keys, num_items);
2356
+ //!
2357
+ //! // Allocate temporary storage
2358
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
2359
+ //!
2360
+ //! // Run sorting operation
2361
+ //! cub::DeviceRadixSort::SortKeys(
2362
+ //! d_temp_storage, temp_storage_bytes, d_keys, num_items);
2363
+ //!
2364
+ //! // d_keys.Current() <-- [0, 3, 5, 6, 7, 8, 9]
2365
+ //! @endcode
2366
+ //!
2367
+ //! @tparam KeyT
2368
+ //! **[inferred]** KeyT type
2369
+ //!
2370
+ //! @tparam NumItemsT
2371
+ //! **[inferred]** Type of num_items
2372
+ //!
2373
+ //! @param[in] d_temp_storage
2374
+ //! Device-accessible allocation of temporary storage. When ``nullptr``, the
2375
+ //! required allocation size is written to ``temp_storage_bytes`` and no work
2376
+ //! is done.
2377
+ //!
2378
+ //! @param[in,out] temp_storage_bytes
2379
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
2380
+ //!
2381
+ //! @param[in,out] d_keys
2382
+ //! Reference to the double-buffer of keys whose "current" device-accessible
2383
+ //! buffer contains the unsorted input keys and, upon return, is updated to
2384
+ //! point to the sorted output keys
2385
+ //!
2386
+ //! @param[in] num_items
2387
+ //! Number of items to sort
2388
+ //!
2389
+ //! @param[in] begin_bit
2390
+ //! **[optional]** The least-significant bit index (inclusive) needed for
2391
+ //! key comparison
2392
+ //!
2393
+ //! @param[in] end_bit
2394
+ //! **[optional]** The most-significant bit index (exclusive) needed for key
2395
+ //! comparison (e.g., ``sizeof(unsigned int) * 8``)
2396
+ //!
2397
+ //! @param[in] stream
2398
+ //! **[optional]** CUDA stream to launch kernels within.
2399
+ //! Default is stream<sub>0</sub>.
2400
+ template <typename KeyT, typename NumItemsT>
2401
+ CUB_RUNTIME_FUNCTION static cudaError_t SortKeys(
2402
+ void* d_temp_storage,
2403
+ size_t& temp_storage_bytes,
2404
+ DoubleBuffer<KeyT>& d_keys,
2405
+ NumItemsT num_items,
2406
+ int begin_bit = 0, // default: start at bit 0
2407
+ int end_bit = sizeof(KeyT) * 8, // default: one past the last bit of KeyT
2408
+ cudaStream_t stream = 0)
2409
+ {
2410
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); // NOTE(review): presumably opens an NVTX range only when d_temp_storage is non-null - confirm
2411
+
2412
+ // Unsigned integer type for global offsets, selected from NumItemsT.
2413
+ using OffsetT = detail::choose_offset_t<NumItemsT>;
2414
+
2415
+ constexpr bool is_overwrite_okay = true; // DoubleBuffer interface: both key buffers may be overwritten
2416
+
2417
+ // Keys-only sort: the value buffer uses NullType and is default-constructed
2418
+ DoubleBuffer<NullType> d_values;
2419
+
2420
+ return DispatchRadixSort<SortOrder::Ascending, KeyT, NullType, OffsetT>::Dispatch(
2421
+ d_temp_storage, temp_storage_bytes, d_keys, d_values, static_cast<OffsetT>(num_items), begin_bit, end_bit, is_overwrite_okay, stream); // explicit cast to OffsetT, matching the pointer-based SortKeys overload
2422
+ }
2423
+
2424
+ //! @rst
2425
+ //! Sorts keys into ascending order using :math:`\approx N` auxiliary storage.
2426
+ //!
2427
+ //! * The sorting operation is given a pair of key buffers managed by a
2428
+ //! DoubleBuffer structure that indicates which of the two buffers is
2429
+ //! "current" (and thus contains the input data to be sorted).
2430
+ //! * The contents of both buffers may be altered by the sorting operation.
2431
+ //! * In-place operations are not supported. There must be no overlap between
2432
+ //! any of the provided ranges:
2433
+ //!
2434
+ //! * ``[d_keys.Current(), d_keys.Current() + num_items)``
2435
+ //! * ``[d_keys.Alternate(), d_keys.Alternate() + num_items)``
2436
+ //!
2437
+ //! * Upon completion, the sorting operation will update the "current"
2438
+ //! indicator within the DoubleBuffer wrapper to reference which of the two
2439
+ //! buffers now contains the sorted output sequence (a function of the
2440
+ //! number of key bits specified and the targeted device architecture).
2441
+ //! * @devicestorageP
2442
+ //! * @devicestorage
2443
+ //!
2444
+ //! Snippet
2445
+ //! --------------------------------------------------
2446
+ //!
2447
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
2448
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
2449
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
2450
+ //! tuple of references to relevant members of the key.
2451
+ //!
2452
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
2453
+ //! :language: c++
2454
+ //! :dedent:
2455
+ //! :start-after: example-begin custom-type
2456
+ //! :end-before: example-end custom-type
2457
+ //!
2458
+ //! The following snippet shows how to sort an array of ``custom_t`` objects
2459
+ //! using ``cub::DeviceRadixSort::SortKeys``:
2460
+ //!
2461
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
2462
+ //! :language: c++
2463
+ //! :dedent:
2464
+ //! :start-after: example-begin keys-db
2465
+ //! :end-before: example-end keys-db
2466
+ //!
2467
+ //! @endrst
2468
+ //!
2469
+ //! @tparam KeyT
2470
+ //! **[inferred]** KeyT type
2471
+ //!
2472
+ //! @tparam NumItemsT
2473
+ //! **[inferred]** Type of num_items
2474
+ //!
2475
+ //! @tparam DecomposerT
2476
+ //! **[inferred]** Type of a callable object responsible for decomposing a
2477
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
2478
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
2479
+ //! The leftmost element of the tuple is considered the most significant.
2480
+ //! The call operator must not modify members of the key.
2481
+ //!
2482
+ //! @param[in] d_temp_storage
2483
+ //! Device-accessible allocation of temporary storage. When ``nullptr``, the
2484
+ //! required allocation size is written to ``temp_storage_bytes`` and no work
2485
+ //! is done.
2486
+ //!
2487
+ //! @param[in,out] temp_storage_bytes
2488
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
2489
+ //!
2490
+ //! @param[in,out] d_keys
2491
+ //! Reference to the double-buffer of keys whose "current" device-accessible
2492
+ //! buffer contains the unsorted input keys and, upon return, is updated to
2493
+ //! point to the sorted output keys
2494
+ //!
2495
+ //! @param[in] num_items
2496
+ //! Number of items to sort
2497
+ //!
2498
+ //! @param decomposer
2499
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
2500
+ //! references to its constituent arithmetic types. The leftmost element of
2501
+ //! the tuple is considered the most significant. The call operator must not
2502
+ //! modify members of the key.
2503
+ //!
2504
+ //! @param[in] stream
2505
+ //! **[optional]** CUDA stream to launch kernels within.
2506
+ //! Default is stream<sub>0</sub>.
2507
+ template <typename KeyT, typename NumItemsT, typename DecomposerT>
2508
+ CUB_RUNTIME_FUNCTION static //
2509
+ ::cuda::std::enable_if_t< //
2510
+ !::cuda::std::is_convertible_v<DecomposerT, int>, // overload is disabled for int-convertible types; presumably keeps calls that pass begin_bit in this position on the non-decomposer overload
2511
+ cudaError_t>
2512
+ SortKeys(void* d_temp_storage,
2513
+ size_t& temp_storage_bytes,
2514
+ DoubleBuffer<KeyT>& d_keys,
2515
+ NumItemsT num_items,
2516
+ DecomposerT decomposer,
2517
+ cudaStream_t stream = 0)
2518
+ {
2519
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); // NOTE(review): presumably opens an NVTX range only when d_temp_storage is non-null - confirm
2520
+
2521
+ // Unsigned integer type for global offsets, selected from NumItemsT
2522
+ using offset_t = detail::choose_offset_t<NumItemsT>;
2523
+ using decomposer_check_t = detail::radix::decomposer_check_t<KeyT, DecomposerT>; // compile-time check of DecomposerT against KeyT
2524
+
2525
+ static_assert(decomposer_check_t::value,
2526
+ "DecomposerT must be a callable object returning a tuple of references to "
2527
+ "arithmetic types");
2528
+
2529
+ constexpr bool is_overwrite_okay = true; // DoubleBuffer interface: both key buffers may be overwritten
2530
+ DoubleBuffer<NullType> d_values; // keys-only sort: NullType value buffer, default-constructed
2531
+
2532
+ return DeviceRadixSort::custom_radix_sort<SortOrder::Ascending>(
2533
+ decomposer_check_t{}, // instance of the check type; NOTE(review): looks like tag dispatch - confirm
2534
+ d_temp_storage,
2535
+ temp_storage_bytes,
2536
+ is_overwrite_okay,
2537
+ d_keys,
2538
+ d_values,
2539
+ static_cast<offset_t>(num_items),
2540
+ decomposer, // no begin_bit/end_bit are forwarded by this overload
2541
+ stream);
2542
+ }
2543
+
2544
+ //! @rst
2545
+ //! Sorts keys into ascending order using :math:`\approx N` auxiliary storage.
2546
+ //!
2547
+ //! * The sorting operation is given a pair of key buffers managed by a
2548
+ //! DoubleBuffer structure that indicates which of the two buffers is
2549
+ //! "current" (and thus contains the input data to be sorted).
2550
+ //! * The contents of both buffers may be altered by the sorting operation.
2551
+ //! * In-place operations are not supported. There must be no overlap between
2552
+ //! any of the provided ranges:
2553
+ //!
2554
+ //! * ``[d_keys.Current(), d_keys.Current() + num_items)``
2555
+ //! * ``[d_keys.Alternate(), d_keys.Alternate() + num_items)``
2556
+ //!
2557
+ //! * A bit subrange ``[begin_bit, end_bit)`` is provided to specify
2558
+ //! differentiating key bits. This can reduce overall sorting overhead and
2559
+ //! yield a corresponding performance improvement.
2560
+ //! * Upon completion, the sorting operation will update the "current"
2561
+ //! indicator within the DoubleBuffer wrapper to reference which of the two
2562
+ //! buffers now contains the sorted output sequence (a function of the
2563
+ //! number of key bits specified and the targeted device architecture).
2564
+ //! * @devicestorageP
2565
+ //! * @devicestorage
2566
+ //!
2567
+ //! Snippet
2568
+ //! --------------------------------------------------
2569
+ //!
2570
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
2571
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
2572
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
2573
+ //! tuple of references to relevant members of the key.
2574
+ //!
2575
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
2576
+ //! :language: c++
2577
+ //! :dedent:
2578
+ //! :start-after: example-begin custom-type
2579
+ //! :end-before: example-end custom-type
2580
+ //!
2581
+ //! The following snippet shows how to sort an array of ``custom_t`` objects
2582
+ //! using ``cub::DeviceRadixSort::SortKeys``:
2583
+ //!
2584
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
2585
+ //! :language: c++
2586
+ //! :dedent:
2587
+ //! :start-after: example-begin keys-bits-db
2588
+ //! :end-before: example-end keys-bits-db
2589
+ //!
2590
+ //! @endrst
2591
+ //!
2592
+ //! @tparam KeyT
2593
+ //! **[inferred]** KeyT type
2594
+ //!
2595
+ //! @tparam NumItemsT
2596
+ //! **[inferred]** Type of num_items
2597
+ //!
2598
+ //! @tparam DecomposerT
2599
+ //! **[inferred]** Type of a callable object responsible for decomposing a
2600
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
2601
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
2602
+ //! The leftmost element of the tuple is considered the most significant.
2603
+ //! The call operator must not modify members of the key.
2604
+ //!
2605
+ //! @param[in] d_temp_storage
2606
+ //! Device-accessible allocation of temporary storage. When ``nullptr``, the
2607
+ //! required allocation size is written to ``temp_storage_bytes`` and no work
2608
+ //! is done.
2609
+ //!
2610
+ //! @param[in,out] temp_storage_bytes
2611
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
2612
+ //!
2613
+ //! @param[in,out] d_keys
2614
+ //! Reference to the double-buffer of keys whose "current" device-accessible
2615
+ //! buffer contains the unsorted input keys and, upon return, is updated to
2616
+ //! point to the sorted output keys
2617
+ //!
2618
+ //! @param[in] num_items
2619
+ //! Number of items to sort
2620
+ //!
2621
+ //! @param decomposer
2622
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
2623
+ //! references to its constituent arithmetic types. The leftmost element of
2624
+ //! the tuple is considered the most significant. The call operator must not
2625
+ //! modify members of the key.
2626
+ //!
2627
+ //! @param[in] begin_bit
2628
+ //! The least-significant bit index (inclusive) needed for
2629
+ //! key comparison
2630
+ //!
2631
+ //! @param[in] end_bit
2632
+ //! The most-significant bit index (exclusive) needed for key
2633
+ //! comparison (e.g., ``(sizeof(float) + sizeof(long long int)) * 8``)
2634
+ //!
2635
+ //! @param[in] stream
2636
+ //! **[optional]** CUDA stream to launch kernels within.
2637
+ //! Default is stream<sub>0</sub>.
2638
+ template <typename KeyT, typename NumItemsT, typename DecomposerT>
2639
+ CUB_RUNTIME_FUNCTION static //
2640
+ ::cuda::std::enable_if_t< //
2641
+ !::cuda::std::is_convertible_v<DecomposerT, int>, // overload is disabled for int-convertible types; presumably keeps calls that pass begin_bit in this position on the non-decomposer overload
2642
+ cudaError_t>
2643
+ SortKeys(void* d_temp_storage,
2644
+ size_t& temp_storage_bytes,
2645
+ DoubleBuffer<KeyT>& d_keys,
2646
+ NumItemsT num_items,
2647
+ DecomposerT decomposer,
2648
+ int begin_bit,
2649
+ int end_bit,
2650
+ cudaStream_t stream = 0)
2651
+ {
2652
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); // NOTE(review): presumably opens an NVTX range only when d_temp_storage is non-null - confirm
2653
+
2654
+ // Unsigned integer type for global offsets, selected from NumItemsT
2655
+ using offset_t = detail::choose_offset_t<NumItemsT>;
2656
+ using decomposer_check_t = detail::radix::decomposer_check_t<KeyT, DecomposerT>; // compile-time check of DecomposerT against KeyT
2657
+
2658
+ static_assert(decomposer_check_t::value,
2659
+ "DecomposerT must be a callable object returning a tuple of references to "
2660
+ "arithmetic types");
2661
+
2662
+ constexpr bool is_overwrite_okay = true; // DoubleBuffer interface: both key buffers may be overwritten
2663
+ DoubleBuffer<NullType> d_values; // keys-only sort: NullType value buffer, default-constructed
2664
+
2665
+ return DeviceRadixSort::custom_radix_sort<SortOrder::Ascending>(
2666
+ decomposer_check_t{}, // instance of the check type; NOTE(review): looks like tag dispatch - confirm
2667
+ d_temp_storage,
2668
+ temp_storage_bytes,
2669
+ is_overwrite_okay,
2670
+ d_keys,
2671
+ d_values,
2672
+ static_cast<offset_t>(num_items),
2673
+ decomposer,
2674
+ begin_bit,
2675
+ end_bit,
2676
+ stream);
2677
+ }
2678
+
2679
+ //! @rst Sorts keys into descending order using :math:`\approx 2N` auxiliary storage.
2680
+ //!
2681
+ //! - The contents of the input data are not altered by the sorting operation.
2682
+ //! - Pointers to contiguous memory must be used; iterators are not currently
2683
+ //! supported.
2684
+ //! - In-place operations are not supported. There must be no overlap between
2685
+ //! any of the provided ranges:
2686
+ //!
2687
+ //! - ``[d_keys_in, d_keys_in + num_items)``
2688
+ //! - ``[d_keys_out, d_keys_out + num_items)``
2689
+ //!
2690
+ //! - An optional bit subrange ``[begin_bit, end_bit)`` of differentiating key
2691
+ //! bits can be specified. This can reduce overall sorting overhead and
2692
+ //! yield a corresponding performance improvement.
2693
+ //! - @devicestorageNP For sorting using only ``O(P)`` temporary storage, see
2694
+ //! the sorting interface using DoubleBuffer wrappers below.
2695
+ //! - @devicestorage
2696
+ //!
2697
+ //! Snippet
2698
+ //! --------------------------------------------------
2699
+ //!
2700
+ //! The code snippet below illustrates the sorting of a device vector of
2701
+ //! ``int`` keys.
2702
+ //! @endrst
2703
+ //!
2704
+ //! @code{.cpp}
2705
+ //! #include <cub/cub.cuh>
2706
+ //! // or equivalently <cub/device/device_radix_sort.cuh>
2707
+ //!
2708
+ //! // Declare, allocate, and initialize device-accessible pointers
2709
+ //! // for sorting data
2710
+ //! int num_items; // e.g., 7
2711
+ //! int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
2712
+ //! int *d_keys_out; // e.g., [ ... ]
2713
+ //! ...
2714
+ //!
2715
+ //! // d_keys_in is only read; the sorted keys are written to d_keys_out,
2715
+ //! // so no cub::DoubleBuffer wrapper is needed for this overload
2717
+ //!
2718
+ //! // Determine temporary device storage requirements
2719
+ //! void *d_temp_storage = nullptr;
2720
+ //! size_t temp_storage_bytes = 0;
2721
+ //! cub::DeviceRadixSort::SortKeysDescending(
2722
+ //! d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items);
2723
+ //!
2724
+ //! // Allocate temporary storage
2725
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
2726
+ //!
2727
+ //! // Run sorting operation
2728
+ //! cub::DeviceRadixSort::SortKeysDescending(
2729
+ //! d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items);
2730
+ //!
2731
+ //! // d_keys_out <-- [9, 8, 7, 6, 5, 3, 0]
2732
+ //! @endcode
2733
+ //!
2734
+ //! @tparam KeyT
2735
+ //! **[inferred]** KeyT type
2736
+ //!
2737
+ //! @tparam NumItemsT
2738
+ //! **[inferred]** Type of num_items
2739
+ //!
2740
+ //! @param[in] d_temp_storage
2741
+ //! Device-accessible allocation of temporary storage. When ``nullptr``, the
2742
+ //! required allocation size is written to ``temp_storage_bytes`` and no work
2743
+ //! is done.
2744
+ //!
2745
+ //! @param[in,out] temp_storage_bytes
2746
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
2747
+ //!
2748
+ //! @param[in] d_keys_in
2749
+ //! Pointer to the input sequence of key data to sort
2750
+ //!
2751
+ //! @param[out] d_keys_out
2752
+ //! Pointer to the sorted output sequence of key data
2753
+ //!
2754
+ //! @param[in] num_items
2755
+ //! Number of items to sort
2756
+ //!
2757
+ //! @param[in] begin_bit
2758
+ //! **[optional]** The least-significant bit index (inclusive) needed for
2759
+ //! key comparison
2760
+ //!
2761
+ //! @param[in] end_bit
2762
+ //! **[optional]** The most-significant bit index (exclusive) needed for key
2763
+ //! comparison (e.g., ``sizeof(unsigned int) * 8``)
2764
+ //!
2765
+ //! @param[in] stream
2766
+ //! **[optional]** CUDA stream to launch kernels within.
2767
+ //! Default is stream<sub>0</sub>.
2768
+ template <typename KeyT, typename NumItemsT>
2769
+ CUB_RUNTIME_FUNCTION static cudaError_t SortKeysDescending(
2770
+ void* d_temp_storage,
2771
+ size_t& temp_storage_bytes,
2772
+ const KeyT* d_keys_in,
2773
+ KeyT* d_keys_out,
2774
+ NumItemsT num_items,
2775
+ int begin_bit = 0,
2776
+ int end_bit = sizeof(KeyT) * 8,
2777
+ cudaStream_t stream = 0)
2778
+ {
2779
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());
2780
+
2781
+ // Unsigned integer type for global offsets.
2782
+ using OffsetT = detail::choose_offset_t<NumItemsT>;
2783
+
2784
+ // We cast away const-ness, but will *not* write to these arrays.
2785
+ // ``DispatchRadixSort::Dispatch`` will allocate temporary storage and
2786
+ // create a new double-buffer internally when the ``is_overwrite_ok`` flag
2787
+ // is not set.
2788
+ constexpr bool is_overwrite_okay = false;
2789
+ DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
2790
+ DoubleBuffer<NullType> d_values;
2791
+
2792
+ return DispatchRadixSort<SortOrder::Descending, KeyT, NullType, OffsetT>::Dispatch(
2793
+ d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, begin_bit, end_bit, is_overwrite_okay, stream);
2794
+ }
2795
+
2796
+ //! @rst
2797
+ //! Sorts keys into descending order using :math:`\approx 2N` auxiliary storage.
2798
+ //!
2799
+ //! * The contents of the input data are not altered by the sorting operation.
2800
+ //! * Pointers to contiguous memory must be used; iterators are not currently
2801
+ //! supported.
2802
+ //! * In-place operations are not supported. There must be no overlap between
2803
+ //! any of the provided ranges:
2804
+ //!
2805
+ //! * ``[d_keys_in, d_keys_in + num_items)``
2806
+ //! * ``[d_keys_out, d_keys_out + num_items)``
2807
+ //!
2808
+ //! * A bit subrange ``[begin_bit, end_bit)`` is provided to specify
2809
+ //! differentiating key bits. This can reduce overall sorting overhead and
2810
+ //! yield a corresponding performance improvement.
2811
+ //! * @devicestorageNP For sorting using only :math:`O(P)` temporary storage, see
2812
+ //! the sorting interface using DoubleBuffer wrappers below.
2813
+ //! * @devicestorage
2814
+ //!
2815
+ //! Snippet
2816
+ //! --------------------------------------------------
2817
+ //!
2818
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
2819
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
2820
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
2821
+ //! tuple of references to relevant members of the key.
2822
+ //!
2823
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
2824
+ //! :language: c++
2825
+ //! :dedent:
2826
+ //! :start-after: example-begin custom-type
2827
+ //! :end-before: example-end custom-type
2828
+ //!
2829
+ //! The following snippet shows how to sort an array of ``custom_t`` objects
2830
+ //! using ``cub::DeviceRadixSort::SortKeysDescending``:
2831
+ //!
2832
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
2833
+ //! :language: c++
2834
+ //! :dedent:
2835
+ //! :start-after: example-begin keys-descending-bits
2836
+ //! :end-before: example-end keys-descending-bits
2837
+ //!
2838
+ //! @endrst
2839
+ //!
2840
+ //! @tparam KeyT
2841
+ //! **[inferred]** KeyT type
2842
+ //!
2843
+ //! @tparam NumItemsT
2844
+ //! **[inferred]** Type of num_items
2845
+ //!
2846
+ //! @tparam DecomposerT
2847
+ //! **[inferred]** Type of a callable object responsible for decomposing a
2848
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
2849
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
2850
+ //! The leftmost element of the tuple is considered the most significant.
2851
+ //! The call operator must not modify members of the key.
2852
+ //!
2853
+ //! @param[in] d_temp_storage
2854
+ //! Device-accessible allocation of temporary storage. When ``nullptr``, the
2855
+ //! required allocation size is written to ``temp_storage_bytes`` and no work
2856
+ //! is done.
2857
+ //!
2858
+ //! @param[in,out] temp_storage_bytes
2859
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
2860
+ //!
2861
+ //! @param[in] d_keys_in
2862
+ //! Pointer to the input sequence of key data to sort
2863
+ //!
2864
+ //! @param[out] d_keys_out
2865
+ //! Pointer to the sorted output sequence of key data
2866
+ //!
2867
+ //! @param[in] num_items
2868
+ //! Number of items to sort
2869
+ //!
2870
+ //! @param decomposer
2871
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
2872
+ //! references to its constituent arithmetic types. The leftmost element of
2873
+ //! the tuple is considered the most significant. The call operator must not
2874
+ //! modify members of the key.
2875
+ //!
2876
+ //! @param[in] begin_bit
2877
+ //! The least-significant bit index (inclusive) needed for
2878
+ //! key comparison
2879
+ //!
2880
+ //! @param[in] end_bit
2881
+ //! The most-significant bit index (exclusive) needed for key
2882
+ //! comparison (e.g., ``(sizeof(float) + sizeof(long long int)) * 8``)
2883
+ //!
2884
+ //! @param[in] stream
2885
+ //! **[optional]** CUDA stream to launch kernels within.
2886
+ //! Default is stream<sub>0</sub>.
2887
+ template <typename KeyT, typename NumItemsT, typename DecomposerT>
2888
+ CUB_RUNTIME_FUNCTION static //
2889
+ ::cuda::std::enable_if_t< //
2890
+ !::cuda::std::is_convertible_v<DecomposerT, int>, //
2891
+ cudaError_t>
2892
+ SortKeysDescending(
2893
+ void* d_temp_storage,
2894
+ size_t& temp_storage_bytes,
2895
+ const KeyT* d_keys_in,
2896
+ KeyT* d_keys_out,
2897
+ NumItemsT num_items,
2898
+ DecomposerT decomposer,
2899
+ int begin_bit,
2900
+ int end_bit,
2901
+ cudaStream_t stream = 0)
2902
+ {
2903
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());
2904
+
2905
+ // unsigned integer type for global offsets
2906
+ using offset_t = detail::choose_offset_t<NumItemsT>;
2907
+ using decomposer_check_t = detail::radix::decomposer_check_t<KeyT, DecomposerT>;
2908
+
2909
+ static_assert(decomposer_check_t::value,
2910
+ "DecomposerT must be a callable object returning a tuple of references to "
2911
+ "arithmetic types");
2912
+
2913
+ // We cast away const-ness, but will *not* write to these arrays.
2914
+ // ``DispatchRadixSort::Dispatch`` will allocate temporary storage and
2915
+ // create a new double-buffer internally when the ``is_overwrite_ok`` flag
2916
+ // is not set.
2917
+ constexpr bool is_overwrite_okay = false;
2918
+ DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
2919
+ DoubleBuffer<NullType> d_values;
2920
+
2921
+ return DeviceRadixSort::custom_radix_sort<SortOrder::Descending>(
2922
+ decomposer_check_t{},
2923
+ d_temp_storage,
2924
+ temp_storage_bytes,
2925
+ is_overwrite_okay,
2926
+ d_keys,
2927
+ d_values,
2928
+ static_cast<offset_t>(num_items),
2929
+ decomposer,
2930
+ begin_bit,
2931
+ end_bit,
2932
+ stream);
2933
+ }
2934
+
2935
+ //! @rst
2936
+ //! Sorts keys into descending order using :math:`\approx 2N` auxiliary storage.
2937
+ //!
2938
+ //! * The contents of the input data are not altered by the sorting operation.
2939
+ //! * Pointers to contiguous memory must be used; iterators are not currently
2940
+ //! supported.
2941
+ //! * In-place operations are not supported. There must be no overlap between
2942
+ //! any of the provided ranges:
2943
+ //!
2944
+ //! * ``[d_keys_in, d_keys_in + num_items)``
2945
+ //! * ``[d_keys_out, d_keys_out + num_items)``
2946
+ //!
2947
+ //! * @devicestorageNP For sorting using only :math:`O(P)` temporary storage, see
2948
+ //! the sorting interface using DoubleBuffer wrappers below.
2949
+ //! * @devicestorage
2950
+ //!
2951
+ //! Snippet
2952
+ //! --------------------------------------------------
2953
+ //!
2954
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
2955
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
2956
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
2957
+ //! tuple of references to relevant members of the key.
2958
+ //!
2959
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
2960
+ //! :language: c++
2961
+ //! :dedent:
2962
+ //! :start-after: example-begin custom-type
2963
+ //! :end-before: example-end custom-type
2964
+ //!
2965
+ //! The following snippet shows how to sort an array of ``custom_t`` objects
2966
+ //! using ``cub::DeviceRadixSort::SortKeysDescending``:
2967
+ //!
2968
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
2969
+ //! :language: c++
2970
+ //! :dedent:
2971
+ //! :start-after: example-begin keys-descending
2972
+ //! :end-before: example-end keys-descending
2973
+ //!
2974
+ //! @endrst
2975
+ //!
2976
+ //! @tparam KeyT
2977
+ //! **[inferred]** KeyT type
2978
+ //!
2979
+ //! @tparam NumItemsT
2980
+ //! **[inferred]** Type of num_items
2981
+ //!
2982
+ //! @tparam DecomposerT
2983
+ //! **[inferred]** Type of a callable object responsible for decomposing a
2984
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
2985
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
2986
+ //! The leftmost element of the tuple is considered the most significant.
2987
+ //! The call operator must not modify members of the key.
2988
+ //!
2989
+ //! @param[in] d_temp_storage
2990
+ //! Device-accessible allocation of temporary storage. When ``nullptr``, the
2991
+ //! required allocation size is written to ``temp_storage_bytes`` and no work
2992
+ //! is done.
2993
+ //!
2994
+ //! @param[in,out] temp_storage_bytes
2995
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
2996
+ //!
2997
+ //! @param[in] d_keys_in
2998
+ //! Pointer to the input sequence of key data to sort
2999
+ //!
3000
+ //! @param[out] d_keys_out
3001
+ //! Pointer to the sorted output sequence of key data
3002
+ //!
3003
+ //! @param[in] num_items
3004
+ //! Number of items to sort
3005
+ //!
3006
+ //! @param decomposer
3007
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
3008
+ //! references to its constituent arithmetic types. The leftmost element of
3009
+ //! the tuple is considered the most significant. The call operator must not
3010
+ //! modify members of the key.
3011
+ //!
3012
+ //! @param[in] stream
3013
+ //! **[optional]** CUDA stream to launch kernels within.
3014
+ //! Default is stream<sub>0</sub>.
3015
+ template <typename KeyT, typename NumItemsT, typename DecomposerT>
3016
+ CUB_RUNTIME_FUNCTION static //
3017
+ ::cuda::std::enable_if_t< //
3018
+ !::cuda::std::is_convertible_v<DecomposerT, int>, //
3019
+ cudaError_t>
3020
+ SortKeysDescending(
3021
+ void* d_temp_storage,
3022
+ size_t& temp_storage_bytes,
3023
+ const KeyT* d_keys_in,
3024
+ KeyT* d_keys_out,
3025
+ NumItemsT num_items,
3026
+ DecomposerT decomposer,
3027
+ cudaStream_t stream = 0)
3028
+ {
3029
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());
3030
+
3031
+ // unsigned integer type for global offsets
3032
+ using offset_t = detail::choose_offset_t<NumItemsT>;
3033
+ using decomposer_check_t = detail::radix::decomposer_check_t<KeyT, DecomposerT>;
3034
+
3035
+ static_assert(decomposer_check_t::value,
3036
+ "DecomposerT must be a callable object returning a tuple of references to "
3037
+ "arithmetic types");
3038
+
3039
+ // We cast away const-ness, but will *not* write to these arrays.
3040
+ // ``DispatchRadixSort::Dispatch`` will allocate temporary storage and
3041
+ // create a new double-buffer internally when the ``is_overwrite_ok`` flag
3042
+ // is not set.
3043
+ constexpr bool is_overwrite_okay = false;
3044
+ DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
3045
+ DoubleBuffer<NullType> d_values;
3046
+
3047
+ return DeviceRadixSort::custom_radix_sort<SortOrder::Descending>(
3048
+ decomposer_check_t{},
3049
+ d_temp_storage,
3050
+ temp_storage_bytes,
3051
+ is_overwrite_okay,
3052
+ d_keys,
3053
+ d_values,
3054
+ static_cast<offset_t>(num_items),
3055
+ decomposer,
3056
+ stream);
3057
+ }
3058
+
3059
+ //! @rst
3060
+ //! Sorts keys into descending order using :math:`\approx N` auxiliary storage.
3061
+ //!
3062
+ //! - The sorting operation is given a pair of key buffers managed by a
3063
+ //! DoubleBuffer structure that indicates which of the two buffers is
3064
+ //! "current" (and thus contains the input data to be sorted).
3065
+ //! - The contents of both buffers may be altered by the sorting operation.
3066
+ //! - In-place operations are not supported. There must be no overlap between
3067
+ //! any of the provided ranges:
3068
+ //!
3069
+ //! - ``[d_keys.Current(), d_keys.Current() + num_items)``
3070
+ //! - ``[d_keys.Alternate(), d_keys.Alternate() + num_items)``
3071
+ //!
3072
+ //! - Upon completion, the sorting operation will update the "current"
3073
+ //! indicator within the DoubleBuffer wrapper to reference which of the two
3074
+ //! buffers now contains the sorted output sequence (a function of the
3075
+ //! number of key bits specified and the targeted device architecture).
3076
+ //! - An optional bit subrange ``[begin_bit, end_bit)`` of differentiating key
3077
+ //! bits can be specified. This can reduce overall sorting overhead and
3078
+ //! yield a corresponding performance improvement.
3079
+ //! - @devicestorageP
3080
+ //! - @devicestorage
3081
+ //!
3082
+ //! Snippet
3083
+ //! --------------------------------------------------
3084
+ //!
3085
+ //! The code snippet below illustrates the sorting of a device vector of ``int`` keys.
3086
+ //! @endrst
3087
+ //!
3088
+ //! @code{.cpp}
3089
+ //! #include <cub/cub.cuh>
3090
+ //! // or equivalently <cub/device/device_radix_sort.cuh>
3091
+ //!
3092
+ //! // Declare, allocate, and initialize device-accessible pointers
3093
+ //! // for sorting data
3094
+ //! int num_items; // e.g., 7
3095
+ //! int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9]
3096
+ //! int *d_key_alt_buf; // e.g., [ ... ]
3097
+ //! ...
3098
+ //!
3099
+ //! // Create a DoubleBuffer to wrap the pair of device pointers
3100
+ //! cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
3101
+ //!
3102
+ //! // Determine temporary device storage requirements
3103
+ //! void *d_temp_storage = nullptr;
3104
+ //! size_t temp_storage_bytes = 0;
3105
+ //! cub::DeviceRadixSort::SortKeysDescending(
3106
+ //! d_temp_storage, temp_storage_bytes, d_keys, num_items);
3107
+ //!
3108
+ //! // Allocate temporary storage
3109
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
3110
+ //!
3111
+ //! // Run sorting operation
3112
+ //! cub::DeviceRadixSort::SortKeysDescending(
3113
+ //! d_temp_storage, temp_storage_bytes, d_keys, num_items);
3114
+ //!
3115
+ //! // d_keys.Current() <-- [9, 8, 7, 6, 5, 3, 0]
3116
+ //! @endcode
3117
+ //!
3118
+ //! @tparam KeyT
3119
+ //! **[inferred]** KeyT type
3120
+ //!
3121
+ //! @tparam NumItemsT
3122
+ //! **[inferred]** Type of num_items
3123
+ //!
3124
+ //! @param[in] d_temp_storage
3125
+ //! Device-accessible allocation of temporary storage. When ``nullptr``, the
3126
+ //! required allocation size is written to ``temp_storage_bytes`` and no work
3127
+ //! is done.
3128
+ //!
3129
+ //! @param[in,out] temp_storage_bytes
3130
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
3131
+ //!
3132
+ //! @param[in,out] d_keys
3133
+ //! Reference to the double-buffer of keys whose "current" device-accessible
3134
+ //! buffer contains the unsorted input keys and, upon return, is updated to
3135
+ //! point to the sorted output keys
3136
+ //!
3137
+ //! @param[in] num_items
3138
+ //! Number of items to sort
3139
+ //!
3140
+ //! @param[in] begin_bit
3141
+ //! **[optional]** The least-significant bit index (inclusive) needed for
3142
+ //! key comparison
3143
+ //!
3144
+ //! @param[in] end_bit
3145
+ //! **[optional]** The most-significant bit index (exclusive) needed for key
3146
+ //! comparison (e.g., ``sizeof(unsigned int) * 8``)
3147
+ //!
3148
+ //! @param[in] stream
3149
+ //! **[optional]** CUDA stream to launch kernels within.
3150
+ //! Default is stream<sub>0</sub>.
3151
+ template <typename KeyT, typename NumItemsT>
3152
+ CUB_RUNTIME_FUNCTION static cudaError_t SortKeysDescending(
3153
+ void* d_temp_storage,
3154
+ size_t& temp_storage_bytes,
3155
+ DoubleBuffer<KeyT>& d_keys,
3156
+ NumItemsT num_items,
3157
+ int begin_bit = 0,
3158
+ int end_bit = sizeof(KeyT) * 8,
3159
+ cudaStream_t stream = 0)
3160
+ {
3161
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());
3162
+
3163
+ // Unsigned integer type for global offsets.
3164
+ using OffsetT = detail::choose_offset_t<NumItemsT>;
3165
+
3166
+ constexpr bool is_overwrite_okay = true;
3167
+
3168
+ // Null value type
3169
+ DoubleBuffer<NullType> d_values;
3170
+
3171
+ return DispatchRadixSort<SortOrder::Descending, KeyT, NullType, OffsetT>::Dispatch(
3172
+ d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, begin_bit, end_bit, is_overwrite_okay, stream);
3173
+ }
3174
+
3175
+ //! @rst
3176
+ //! Sorts keys into descending order using :math:`\approx N` auxiliary storage.
3177
+ //!
3178
+ //! * The sorting operation is given a pair of key buffers managed by a
3179
+ //! DoubleBuffer structure that indicates which of the two buffers is
3180
+ //! "current" (and thus contains the input data to be sorted).
3181
+ //! * The contents of both buffers may be altered by the sorting operation.
3182
+ //! * In-place operations are not supported. There must be no overlap between
3183
+ //! any of the provided ranges:
3184
+ //!
3185
+ //! * ``[d_keys.Current(), d_keys.Current() + num_items)``
3186
+ //! * ``[d_keys.Alternate(), d_keys.Alternate() + num_items)``
3187
+ //!
3188
+ //! * Upon completion, the sorting operation will update the "current"
3189
+ //! indicator within the DoubleBuffer wrapper to reference which of the two
3190
+ //! buffers now contains the sorted output sequence (a function of the
3191
+ //! number of key bits specified and the targeted device architecture).
3192
+ //! * @devicestorageP
3193
+ //! * @devicestorage
3194
+ //!
3195
+ //! Snippet
3196
+ //! --------------------------------------------------
3197
+ //!
3198
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
3199
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
3200
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
3201
+ //! tuple of references to relevant members of the key.
3202
+ //!
3203
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
3204
+ //! :language: c++
3205
+ //! :dedent:
3206
+ //! :start-after: example-begin custom-type
3207
+ //! :end-before: example-end custom-type
3208
+ //!
3209
+ //! The following snippet shows how to sort an array of ``custom_t`` objects
3210
+ //! using ``cub::DeviceRadixSort::SortKeysDescending``:
3211
+ //!
3212
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
3213
+ //! :language: c++
3214
+ //! :dedent:
3215
+ //! :start-after: example-begin keys-descending-db
3216
+ //! :end-before: example-end keys-descending-db
3217
+ //!
3218
+ //! @endrst
3219
+ //!
3220
+ //! @tparam KeyT
3221
+ //! **[inferred]** KeyT type
3222
+ //!
3223
+ //! @tparam NumItemsT
3224
+ //! **[inferred]** Type of num_items
3225
+ //!
3226
+ //! @tparam DecomposerT
3227
+ //! **[inferred]** Type of a callable object responsible for decomposing a
3228
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
3229
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
3230
+ //! The leftmost element of the tuple is considered the most significant.
3231
+ //! The call operator must not modify members of the key.
3232
+ //!
3233
+ //! @param[in] d_temp_storage
3234
+ //! Device-accessible allocation of temporary storage. When ``nullptr``, the
3235
+ //! required allocation size is written to ``temp_storage_bytes`` and no work
3236
+ //! is done.
3237
+ //!
3238
+ //! @param[in,out] temp_storage_bytes
3239
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
3240
+ //!
3241
+ //! @param[in,out] d_keys
3242
+ //! Reference to the double-buffer of keys whose "current" device-accessible
3243
+ //! buffer contains the unsorted input keys and, upon return, is updated to
3244
+ //! point to the sorted output keys
3245
+ //!
3246
+ //! @param[in] num_items
3247
+ //! Number of items to sort
3248
+ //!
3249
+ //! @param decomposer
3250
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
3251
+ //! references to its constituent arithmetic types. The leftmost element of
3252
+ //! the tuple is considered the most significant. The call operator must not
3253
+ //! modify members of the key.
3254
+ //!
3255
+ //! @param[in] stream
3256
+ //! **[optional]** CUDA stream to launch kernels within.
3257
+ //! Default is stream<sub>0</sub>.
3258
+ template <typename KeyT, typename NumItemsT, typename DecomposerT>
3259
+ CUB_RUNTIME_FUNCTION static //
3260
+ ::cuda::std::enable_if_t< //
3261
+ !::cuda::std::is_convertible_v<DecomposerT, int>, //
3262
+ cudaError_t>
3263
+ SortKeysDescending(
3264
+ void* d_temp_storage,
3265
+ size_t& temp_storage_bytes,
3266
+ DoubleBuffer<KeyT>& d_keys,
3267
+ NumItemsT num_items,
3268
+ DecomposerT decomposer,
3269
+ cudaStream_t stream = 0)
3270
+ {
3271
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());
3272
+
3273
+ // unsigned integer type for global offsets
3274
+ using offset_t = detail::choose_offset_t<NumItemsT>;
3275
+ using decomposer_check_t = detail::radix::decomposer_check_t<KeyT, DecomposerT>;
3276
+
3277
+ static_assert(decomposer_check_t::value,
3278
+ "DecomposerT must be a callable object returning a tuple of references to "
3279
+ "arithmetic types");
3280
+
3281
+ constexpr bool is_overwrite_okay = true;
3282
+ DoubleBuffer<NullType> d_values;
3283
+
3284
+ return DeviceRadixSort::custom_radix_sort<SortOrder::Descending>(
3285
+ decomposer_check_t{},
3286
+ d_temp_storage,
3287
+ temp_storage_bytes,
3288
+ is_overwrite_okay,
3289
+ d_keys,
3290
+ d_values,
3291
+ static_cast<offset_t>(num_items),
3292
+ decomposer,
3293
+ stream);
3294
+ }
3295
+
3296
+ //! @rst
3297
+ //! Sorts keys into descending order using :math:`\approx N` auxiliary storage.
3298
+ //!
3299
+ //! * The sorting operation is given a pair of key buffers managed by a
3300
+ //! DoubleBuffer structure that indicates which of the two buffers is
3301
+ //! "current" (and thus contains the input data to be sorted).
3302
+ //! * The contents of both buffers may be altered by the sorting operation.
3303
+ //! * In-place operations are not supported. There must be no overlap between
3304
+ //! any of the provided ranges:
3305
+ //!
3306
+ //! * ``[d_keys.Current(), d_keys.Current() + num_items)``
3307
+ //! * ``[d_keys.Alternate(), d_keys.Alternate() + num_items)``
3308
+ //!
3309
+ //! * A bit subrange ``[begin_bit, end_bit)`` is provided to specify
3310
+ //! differentiating key bits. This can reduce overall sorting overhead and
3311
+ //! yield a corresponding performance improvement.
3312
+ //! * Upon completion, the sorting operation will update the "current"
3313
+ //! indicator within the DoubleBuffer wrapper to reference which of the two
3314
+ //! buffers now contains the sorted output sequence (a function of the
3315
+ //! number of key bits specified and the targeted device architecture).
3316
+ //! * @devicestorageP
3317
+ //! * @devicestorage
3318
+ //!
3319
+ //! Snippet
3320
+ //! --------------------------------------------------
3321
+ //!
3322
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
3323
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
3324
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
3325
+ //! tuple of references to relevant members of the key.
3326
+ //!
3327
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
3328
+ //! :language: c++
3329
+ //! :dedent:
3330
+ //! :start-after: example-begin custom-type
3331
+ //! :end-before: example-end custom-type
3332
+ //!
3333
+ //! The following snippet shows how to sort an array of ``custom_t`` objects
3334
+ //! using ``cub::DeviceRadixSort::SortKeysDescending``:
3335
+ //!
3336
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
3337
+ //! :language: c++
3338
+ //! :dedent:
3339
+ //! :start-after: example-begin keys-descending-bits-db
3340
+ //! :end-before: example-end keys-descending-bits-db
3341
+ //!
3342
+ //! @endrst
3343
+ //!
3344
+ //! @tparam KeyT
3345
+ //! **[inferred]** KeyT type
3346
+ //!
3347
+ //! @tparam NumItemsT
3348
+ //! **[inferred]** Type of num_items
3349
+ //!
3350
+ //! @tparam DecomposerT
3351
+ //! **[inferred]** Type of a callable object responsible for decomposing a
3352
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
3353
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
3354
+ //! The leftmost element of the tuple is considered the most significant.
3355
+ //! The call operator must not modify members of the key.
3356
+ //!
3357
+ //! @param[in] d_temp_storage
3358
+ //! Device-accessible allocation of temporary storage. When ``nullptr``, the
3359
+ //! required allocation size is written to ``temp_storage_bytes`` and no work
3360
+ //! is done.
3361
+ //!
3362
+ //! @param[in,out] temp_storage_bytes
3363
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
3364
+ //!
3365
+ //! @param[in,out] d_keys
3366
+ //! Reference to the double-buffer of keys whose "current" device-accessible
3367
+ //! buffer contains the unsorted input keys and, upon return, is updated to
3368
+ //! point to the sorted output keys
3369
+ //!
3370
+ //! @param[in] num_items
3371
+ //! Number of items to sort
3372
+ //!
3373
+ //! @param decomposer
3374
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
3375
+ //! references to its constituent arithmetic types. The leftmost element of
3376
+ //! the tuple is considered the most significant. The call operator must not
3377
+ //! modify members of the key.
3378
+ //!
3379
+ //! @param[in] begin_bit
3380
+ //! The least-significant bit index (inclusive) needed for
3381
+ //! key comparison
3382
+ //!
3383
+ //! @param[in] end_bit
3384
+ //! The most-significant bit index (exclusive) needed for key
3385
+ //! comparison (e.g., ``(sizeof(float) + sizeof(long long int)) * 8``)
3386
+ //!
3387
+ //! @param[in] stream
3388
+ //! **[optional]** CUDA stream to launch kernels within.
3389
+ //! Default is stream<sub>0</sub>.
3390
+ template <typename KeyT, typename NumItemsT, typename DecomposerT>
3391
+ CUB_RUNTIME_FUNCTION static //
3392
+ ::cuda::std::enable_if_t< //
3393
+ !::cuda::std::is_convertible_v<DecomposerT, int>, //
3394
+ cudaError_t>
3395
+ SortKeysDescending(
3396
+ void* d_temp_storage,
3397
+ size_t& temp_storage_bytes,
3398
+ DoubleBuffer<KeyT>& d_keys,
3399
+ NumItemsT num_items,
3400
+ DecomposerT decomposer,
3401
+ int begin_bit,
3402
+ int end_bit,
3403
+ cudaStream_t stream = 0)
3404
+ {
3405
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());
3406
+
3407
+ // unsigned integer type for global offsets
3408
+ using offset_t = detail::choose_offset_t<NumItemsT>;
3409
+ using decomposer_check_t = detail::radix::decomposer_check_t<KeyT, DecomposerT>;
3410
+
3411
+ static_assert(decomposer_check_t::value,
3412
+ "DecomposerT must be a callable object returning a tuple of references to "
3413
+ "arithmetic types");
3414
+
3415
+ constexpr bool is_overwrite_okay = true;
3416
+ DoubleBuffer<NullType> d_values;
3417
+
3418
+ return DeviceRadixSort::custom_radix_sort<SortOrder::Descending>(
3419
+ decomposer_check_t{},
3420
+ d_temp_storage,
3421
+ temp_storage_bytes,
3422
+ is_overwrite_okay,
3423
+ d_keys,
3424
+ d_values,
3425
+ static_cast<offset_t>(num_items),
3426
+ decomposer,
3427
+ begin_bit,
3428
+ end_bit,
3429
+ stream);
3430
+ }
3431
+
3432
+ //! @} end member group
3433
+ };
3434
+
3435
+ CUB_NAMESPACE_END