cuda-cccl 0.3.3__cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cuda-cccl might be problematic. Click here for more details.

Files changed (1968)
  1. cuda/cccl/__init__.py +27 -0
  2. cuda/cccl/_cuda_version_utils.py +24 -0
  3. cuda/cccl/cooperative/__init__.py +9 -0
  4. cuda/cccl/cooperative/experimental/__init__.py +24 -0
  5. cuda/cccl/headers/__init__.py +7 -0
  6. cuda/cccl/headers/include/__init__.py +1 -0
  7. cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +259 -0
  8. cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +1182 -0
  9. cuda/cccl/headers/include/cub/agent/agent_for.cuh +81 -0
  10. cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +709 -0
  11. cuda/cccl/headers/include/cub/agent/agent_merge.cuh +234 -0
  12. cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +748 -0
  13. cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +786 -0
  14. cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +286 -0
  15. cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +703 -0
  16. cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +555 -0
  17. cuda/cccl/headers/include/cub/agent/agent_reduce.cuh +619 -0
  18. cuda/cccl/headers/include/cub/agent/agent_reduce_by_key.cuh +806 -0
  19. cuda/cccl/headers/include/cub/agent/agent_rle.cuh +1124 -0
  20. cuda/cccl/headers/include/cub/agent/agent_scan.cuh +589 -0
  21. cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +474 -0
  22. cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +289 -0
  23. cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +1117 -0
  24. cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +346 -0
  25. cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +606 -0
  26. cuda/cccl/headers/include/cub/agent/agent_topk.cuh +764 -0
  27. cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +631 -0
  28. cuda/cccl/headers/include/cub/agent/single_pass_scan_operators.cuh +1424 -0
  29. cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +963 -0
  30. cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +1227 -0
  31. cuda/cccl/headers/include/cub/block/block_exchange.cuh +1313 -0
  32. cuda/cccl/headers/include/cub/block/block_histogram.cuh +424 -0
  33. cuda/cccl/headers/include/cub/block/block_load.cuh +1264 -0
  34. cuda/cccl/headers/include/cub/block/block_load_to_shared.cuh +432 -0
  35. cuda/cccl/headers/include/cub/block/block_merge_sort.cuh +800 -0
  36. cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +1225 -0
  37. cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +2196 -0
  38. cuda/cccl/headers/include/cub/block/block_raking_layout.cuh +150 -0
  39. cuda/cccl/headers/include/cub/block/block_reduce.cuh +667 -0
  40. cuda/cccl/headers/include/cub/block/block_run_length_decode.cuh +434 -0
  41. cuda/cccl/headers/include/cub/block/block_scan.cuh +2315 -0
  42. cuda/cccl/headers/include/cub/block/block_shuffle.cuh +346 -0
  43. cuda/cccl/headers/include/cub/block/block_store.cuh +1247 -0
  44. cuda/cccl/headers/include/cub/block/radix_rank_sort_operations.cuh +624 -0
  45. cuda/cccl/headers/include/cub/block/specializations/block_histogram_atomic.cuh +86 -0
  46. cuda/cccl/headers/include/cub/block/specializations/block_histogram_sort.cuh +240 -0
  47. cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking.cuh +252 -0
  48. cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking_commutative_only.cuh +238 -0
  49. cuda/cccl/headers/include/cub/block/specializations/block_reduce_warp_reductions.cuh +281 -0
  50. cuda/cccl/headers/include/cub/block/specializations/block_scan_raking.cuh +790 -0
  51. cuda/cccl/headers/include/cub/block/specializations/block_scan_warp_scans.cuh +538 -0
  52. cuda/cccl/headers/include/cub/config.cuh +53 -0
  53. cuda/cccl/headers/include/cub/cub.cuh +120 -0
  54. cuda/cccl/headers/include/cub/detail/array_utils.cuh +78 -0
  55. cuda/cccl/headers/include/cub/detail/choose_offset.cuh +161 -0
  56. cuda/cccl/headers/include/cub/detail/detect_cuda_runtime.cuh +74 -0
  57. cuda/cccl/headers/include/cub/detail/device_double_buffer.cuh +96 -0
  58. cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +62 -0
  59. cuda/cccl/headers/include/cub/detail/fast_modulo_division.cuh +253 -0
  60. cuda/cccl/headers/include/cub/detail/integer_utils.cuh +88 -0
  61. cuda/cccl/headers/include/cub/detail/launcher/cuda_driver.cuh +142 -0
  62. cuda/cccl/headers/include/cub/detail/launcher/cuda_runtime.cuh +100 -0
  63. cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +114 -0
  64. cuda/cccl/headers/include/cub/detail/ptx-json/README.md +71 -0
  65. cuda/cccl/headers/include/cub/detail/ptx-json/array.h +68 -0
  66. cuda/cccl/headers/include/cub/detail/ptx-json/json.h +62 -0
  67. cuda/cccl/headers/include/cub/detail/ptx-json/object.h +100 -0
  68. cuda/cccl/headers/include/cub/detail/ptx-json/string.h +53 -0
  69. cuda/cccl/headers/include/cub/detail/ptx-json/value.h +95 -0
  70. cuda/cccl/headers/include/cub/detail/ptx-json-parser.h +63 -0
  71. cuda/cccl/headers/include/cub/detail/rfa.cuh +731 -0
  72. cuda/cccl/headers/include/cub/detail/strong_load.cuh +189 -0
  73. cuda/cccl/headers/include/cub/detail/strong_store.cuh +220 -0
  74. cuda/cccl/headers/include/cub/detail/temporary_storage.cuh +384 -0
  75. cuda/cccl/headers/include/cub/detail/type_traits.cuh +187 -0
  76. cuda/cccl/headers/include/cub/detail/uninitialized_copy.cuh +73 -0
  77. cuda/cccl/headers/include/cub/detail/unsafe_bitcast.cuh +56 -0
  78. cuda/cccl/headers/include/cub/device/device_adjacent_difference.cuh +596 -0
  79. cuda/cccl/headers/include/cub/device/device_copy.cuh +276 -0
  80. cuda/cccl/headers/include/cub/device/device_for.cuh +1063 -0
  81. cuda/cccl/headers/include/cub/device/device_histogram.cuh +1509 -0
  82. cuda/cccl/headers/include/cub/device/device_memcpy.cuh +195 -0
  83. cuda/cccl/headers/include/cub/device/device_merge.cuh +203 -0
  84. cuda/cccl/headers/include/cub/device/device_merge_sort.cuh +979 -0
  85. cuda/cccl/headers/include/cub/device/device_partition.cuh +668 -0
  86. cuda/cccl/headers/include/cub/device/device_radix_sort.cuh +3437 -0
  87. cuda/cccl/headers/include/cub/device/device_reduce.cuh +2518 -0
  88. cuda/cccl/headers/include/cub/device/device_run_length_encode.cuh +370 -0
  89. cuda/cccl/headers/include/cub/device/device_scan.cuh +2212 -0
  90. cuda/cccl/headers/include/cub/device/device_segmented_radix_sort.cuh +1496 -0
  91. cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +1430 -0
  92. cuda/cccl/headers/include/cub/device/device_segmented_sort.cuh +2811 -0
  93. cuda/cccl/headers/include/cub/device/device_select.cuh +1228 -0
  94. cuda/cccl/headers/include/cub/device/device_topk.cuh +511 -0
  95. cuda/cccl/headers/include/cub/device/device_transform.cuh +668 -0
  96. cuda/cccl/headers/include/cub/device/dispatch/dispatch_adjacent_difference.cuh +315 -0
  97. cuda/cccl/headers/include/cub/device/dispatch/dispatch_batch_memcpy.cuh +719 -0
  98. cuda/cccl/headers/include/cub/device/dispatch/dispatch_common.cuh +43 -0
  99. cuda/cccl/headers/include/cub/device/dispatch/dispatch_copy_mdspan.cuh +79 -0
  100. cuda/cccl/headers/include/cub/device/dispatch/dispatch_for.cuh +198 -0
  101. cuda/cccl/headers/include/cub/device/dispatch/dispatch_histogram.cuh +1046 -0
  102. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +303 -0
  103. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge_sort.cuh +473 -0
  104. cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +1744 -0
  105. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +1310 -0
  106. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_by_key.cuh +655 -0
  107. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +531 -0
  108. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +313 -0
  109. cuda/cccl/headers/include/cub/device/dispatch/dispatch_rle.cuh +615 -0
  110. cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan.cuh +517 -0
  111. cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan_by_key.cuh +602 -0
  112. cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +975 -0
  113. cuda/cccl/headers/include/cub/device/dispatch/dispatch_select_if.cuh +842 -0
  114. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +341 -0
  115. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +440 -0
  116. cuda/cccl/headers/include/cub/device/dispatch/dispatch_three_way_partition.cuh +389 -0
  117. cuda/cccl/headers/include/cub/device/dispatch/dispatch_topk.cuh +627 -0
  118. cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +569 -0
  119. cuda/cccl/headers/include/cub/device/dispatch/dispatch_unique_by_key.cuh +545 -0
  120. cuda/cccl/headers/include/cub/device/dispatch/kernels/for_each.cuh +261 -0
  121. cuda/cccl/headers/include/cub/device/dispatch/kernels/histogram.cuh +505 -0
  122. cuda/cccl/headers/include/cub/device/dispatch/kernels/merge_sort.cuh +334 -0
  123. cuda/cccl/headers/include/cub/device/dispatch/kernels/radix_sort.cuh +803 -0
  124. cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +583 -0
  125. cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +189 -0
  126. cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +321 -0
  127. cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_sort.cuh +522 -0
  128. cuda/cccl/headers/include/cub/device/dispatch/kernels/three_way_partition.cuh +201 -0
  129. cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +1028 -0
  130. cuda/cccl/headers/include/cub/device/dispatch/kernels/unique_by_key.cuh +176 -0
  131. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +67 -0
  132. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +118 -0
  133. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +60 -0
  134. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +275 -0
  135. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +76 -0
  136. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +126 -0
  137. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +1065 -0
  138. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce.cuh +493 -0
  139. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +942 -0
  140. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +673 -0
  141. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +618 -0
  142. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +1010 -0
  143. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +398 -0
  144. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_select_if.cuh +1588 -0
  145. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +440 -0
  146. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_topk.cuh +85 -0
  147. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +481 -0
  148. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +884 -0
  149. cuda/cccl/headers/include/cub/grid/grid_even_share.cuh +227 -0
  150. cuda/cccl/headers/include/cub/grid/grid_mapping.cuh +106 -0
  151. cuda/cccl/headers/include/cub/grid/grid_queue.cuh +202 -0
  152. cuda/cccl/headers/include/cub/iterator/arg_index_input_iterator.cuh +254 -0
  153. cuda/cccl/headers/include/cub/iterator/cache_modified_input_iterator.cuh +259 -0
  154. cuda/cccl/headers/include/cub/iterator/cache_modified_output_iterator.cuh +250 -0
  155. cuda/cccl/headers/include/cub/iterator/tex_obj_input_iterator.cuh +320 -0
  156. cuda/cccl/headers/include/cub/thread/thread_load.cuh +349 -0
  157. cuda/cccl/headers/include/cub/thread/thread_operators.cuh +688 -0
  158. cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +548 -0
  159. cuda/cccl/headers/include/cub/thread/thread_scan.cuh +498 -0
  160. cuda/cccl/headers/include/cub/thread/thread_search.cuh +199 -0
  161. cuda/cccl/headers/include/cub/thread/thread_simd.cuh +458 -0
  162. cuda/cccl/headers/include/cub/thread/thread_sort.cuh +102 -0
  163. cuda/cccl/headers/include/cub/thread/thread_store.cuh +365 -0
  164. cuda/cccl/headers/include/cub/util_allocator.cuh +921 -0
  165. cuda/cccl/headers/include/cub/util_arch.cuh +167 -0
  166. cuda/cccl/headers/include/cub/util_cpp_dialect.cuh +95 -0
  167. cuda/cccl/headers/include/cub/util_debug.cuh +207 -0
  168. cuda/cccl/headers/include/cub/util_device.cuh +800 -0
  169. cuda/cccl/headers/include/cub/util_macro.cuh +97 -0
  170. cuda/cccl/headers/include/cub/util_math.cuh +118 -0
  171. cuda/cccl/headers/include/cub/util_namespace.cuh +176 -0
  172. cuda/cccl/headers/include/cub/util_policy_wrapper_t.cuh +55 -0
  173. cuda/cccl/headers/include/cub/util_ptx.cuh +513 -0
  174. cuda/cccl/headers/include/cub/util_temporary_storage.cuh +122 -0
  175. cuda/cccl/headers/include/cub/util_type.cuh +1120 -0
  176. cuda/cccl/headers/include/cub/util_vsmem.cuh +253 -0
  177. cuda/cccl/headers/include/cub/version.cuh +89 -0
  178. cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_shfl.cuh +329 -0
  179. cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_smem.cuh +177 -0
  180. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +737 -0
  181. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +408 -0
  182. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +952 -0
  183. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_smem.cuh +715 -0
  184. cuda/cccl/headers/include/cub/warp/warp_exchange.cuh +405 -0
  185. cuda/cccl/headers/include/cub/warp/warp_load.cuh +614 -0
  186. cuda/cccl/headers/include/cub/warp/warp_merge_sort.cuh +169 -0
  187. cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +829 -0
  188. cuda/cccl/headers/include/cub/warp/warp_scan.cuh +1890 -0
  189. cuda/cccl/headers/include/cub/warp/warp_store.cuh +521 -0
  190. cuda/cccl/headers/include/cub/warp/warp_utils.cuh +61 -0
  191. cuda/cccl/headers/include/cuda/__algorithm/common.h +68 -0
  192. cuda/cccl/headers/include/cuda/__algorithm/copy.h +196 -0
  193. cuda/cccl/headers/include/cuda/__algorithm/fill.h +107 -0
  194. cuda/cccl/headers/include/cuda/__annotated_ptr/access_property.h +165 -0
  195. cuda/cccl/headers/include/cuda/__annotated_ptr/access_property_encoding.h +172 -0
  196. cuda/cccl/headers/include/cuda/__annotated_ptr/annotated_ptr.h +217 -0
  197. cuda/cccl/headers/include/cuda/__annotated_ptr/annotated_ptr_base.h +100 -0
  198. cuda/cccl/headers/include/cuda/__annotated_ptr/apply_access_property.h +83 -0
  199. cuda/cccl/headers/include/cuda/__annotated_ptr/associate_access_property.h +128 -0
  200. cuda/cccl/headers/include/cuda/__annotated_ptr/createpolicy.h +210 -0
  201. cuda/cccl/headers/include/cuda/__atomic/atomic.h +145 -0
  202. cuda/cccl/headers/include/cuda/__barrier/async_contract_fulfillment.h +39 -0
  203. cuda/cccl/headers/include/cuda/__barrier/barrier.h +65 -0
  204. cuda/cccl/headers/include/cuda/__barrier/barrier_arrive_tx.h +102 -0
  205. cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +487 -0
  206. cuda/cccl/headers/include/cuda/__barrier/barrier_expect_tx.h +74 -0
  207. cuda/cccl/headers/include/cuda/__barrier/barrier_native_handle.h +45 -0
  208. cuda/cccl/headers/include/cuda/__barrier/barrier_thread_scope.h +60 -0
  209. cuda/cccl/headers/include/cuda/__bit/bit_reverse.h +171 -0
  210. cuda/cccl/headers/include/cuda/__bit/bitfield.h +122 -0
  211. cuda/cccl/headers/include/cuda/__bit/bitmask.h +90 -0
  212. cuda/cccl/headers/include/cuda/__cccl_config +37 -0
  213. cuda/cccl/headers/include/cuda/__cmath/ceil_div.h +124 -0
  214. cuda/cccl/headers/include/cuda/__cmath/fast_modulo_division.h +178 -0
  215. cuda/cccl/headers/include/cuda/__cmath/ilog.h +195 -0
  216. cuda/cccl/headers/include/cuda/__cmath/ipow.h +107 -0
  217. cuda/cccl/headers/include/cuda/__cmath/isqrt.h +80 -0
  218. cuda/cccl/headers/include/cuda/__cmath/mul_hi.h +146 -0
  219. cuda/cccl/headers/include/cuda/__cmath/neg.h +47 -0
  220. cuda/cccl/headers/include/cuda/__cmath/pow2.h +74 -0
  221. cuda/cccl/headers/include/cuda/__cmath/round_down.h +102 -0
  222. cuda/cccl/headers/include/cuda/__cmath/round_up.h +104 -0
  223. cuda/cccl/headers/include/cuda/__cmath/uabs.h +57 -0
  224. cuda/cccl/headers/include/cuda/__complex/complex.h +238 -0
  225. cuda/cccl/headers/include/cuda/__complex/get_real_imag.h +89 -0
  226. cuda/cccl/headers/include/cuda/__complex/traits.h +64 -0
  227. cuda/cccl/headers/include/cuda/__complex_ +28 -0
  228. cuda/cccl/headers/include/cuda/__device/all_devices.h +140 -0
  229. cuda/cccl/headers/include/cuda/__device/arch_id.h +176 -0
  230. cuda/cccl/headers/include/cuda/__device/arch_traits.h +537 -0
  231. cuda/cccl/headers/include/cuda/__device/attributes.h +772 -0
  232. cuda/cccl/headers/include/cuda/__device/compute_capability.h +171 -0
  233. cuda/cccl/headers/include/cuda/__device/device_ref.h +156 -0
  234. cuda/cccl/headers/include/cuda/__device/physical_device.h +172 -0
  235. cuda/cccl/headers/include/cuda/__driver/driver_api.h +835 -0
  236. cuda/cccl/headers/include/cuda/__event/event.h +171 -0
  237. cuda/cccl/headers/include/cuda/__event/event_ref.h +157 -0
  238. cuda/cccl/headers/include/cuda/__event/timed_event.h +120 -0
  239. cuda/cccl/headers/include/cuda/__execution/determinism.h +91 -0
  240. cuda/cccl/headers/include/cuda/__execution/output_ordering.h +89 -0
  241. cuda/cccl/headers/include/cuda/__execution/require.h +75 -0
  242. cuda/cccl/headers/include/cuda/__execution/tune.h +70 -0
  243. cuda/cccl/headers/include/cuda/__functional/address_stability.h +131 -0
  244. cuda/cccl/headers/include/cuda/__functional/for_each_canceled.h +321 -0
  245. cuda/cccl/headers/include/cuda/__functional/maximum.h +58 -0
  246. cuda/cccl/headers/include/cuda/__functional/minimum.h +58 -0
  247. cuda/cccl/headers/include/cuda/__functional/proclaim_return_type.h +108 -0
  248. cuda/cccl/headers/include/cuda/__fwd/barrier.h +38 -0
  249. cuda/cccl/headers/include/cuda/__fwd/barrier_native_handle.h +42 -0
  250. cuda/cccl/headers/include/cuda/__fwd/complex.h +48 -0
  251. cuda/cccl/headers/include/cuda/__fwd/devices.h +44 -0
  252. cuda/cccl/headers/include/cuda/__fwd/get_stream.h +38 -0
  253. cuda/cccl/headers/include/cuda/__fwd/pipeline.h +37 -0
  254. cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +58 -0
  255. cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +315 -0
  256. cuda/cccl/headers/include/cuda/__iterator/counting_iterator.h +483 -0
  257. cuda/cccl/headers/include/cuda/__iterator/discard_iterator.h +324 -0
  258. cuda/cccl/headers/include/cuda/__iterator/permutation_iterator.h +456 -0
  259. cuda/cccl/headers/include/cuda/__iterator/shuffle_iterator.h +334 -0
  260. cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +418 -0
  261. cuda/cccl/headers/include/cuda/__iterator/tabulate_output_iterator.h +367 -0
  262. cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +528 -0
  263. cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +527 -0
  264. cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +486 -0
  265. cuda/cccl/headers/include/cuda/__iterator/zip_common.h +148 -0
  266. cuda/cccl/headers/include/cuda/__iterator/zip_function.h +112 -0
  267. cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +557 -0
  268. cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +592 -0
  269. cuda/cccl/headers/include/cuda/__latch/latch.h +44 -0
  270. cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +533 -0
  271. cuda/cccl/headers/include/cuda/__mdspan/host_device_mdspan.h +238 -0
  272. cuda/cccl/headers/include/cuda/__mdspan/restrict_accessor.h +152 -0
  273. cuda/cccl/headers/include/cuda/__mdspan/restrict_mdspan.h +117 -0
  274. cuda/cccl/headers/include/cuda/__memcpy_async/check_preconditions.h +79 -0
  275. cuda/cccl/headers/include/cuda/__memcpy_async/completion_mechanism.h +47 -0
  276. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_bulk_shared_global.h +60 -0
  277. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_fallback.h +72 -0
  278. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_shared_global.h +148 -0
  279. cuda/cccl/headers/include/cuda/__memcpy_async/dispatch_memcpy_async.h +165 -0
  280. cuda/cccl/headers/include/cuda/__memcpy_async/is_local_smem_barrier.h +53 -0
  281. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async.h +179 -0
  282. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_barrier.h +99 -0
  283. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_tx.h +104 -0
  284. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_completion.h +170 -0
  285. cuda/cccl/headers/include/cuda/__memcpy_async/try_get_barrier_handle.h +59 -0
  286. cuda/cccl/headers/include/cuda/__memory/address_space.h +227 -0
  287. cuda/cccl/headers/include/cuda/__memory/align_down.h +56 -0
  288. cuda/cccl/headers/include/cuda/__memory/align_up.h +56 -0
  289. cuda/cccl/headers/include/cuda/__memory/aligned_size.h +61 -0
  290. cuda/cccl/headers/include/cuda/__memory/check_address.h +111 -0
  291. cuda/cccl/headers/include/cuda/__memory/discard_memory.h +64 -0
  292. cuda/cccl/headers/include/cuda/__memory/get_device_address.h +58 -0
  293. cuda/cccl/headers/include/cuda/__memory/is_aligned.h +47 -0
  294. cuda/cccl/headers/include/cuda/__memory/ptr_in_range.h +93 -0
  295. cuda/cccl/headers/include/cuda/__memory/ptr_rebind.h +75 -0
  296. cuda/cccl/headers/include/cuda/__memory_resource/get_memory_resource.h +82 -0
  297. cuda/cccl/headers/include/cuda/__memory_resource/get_property.h +153 -0
  298. cuda/cccl/headers/include/cuda/__memory_resource/properties.h +113 -0
  299. cuda/cccl/headers/include/cuda/__memory_resource/resource.h +125 -0
  300. cuda/cccl/headers/include/cuda/__memory_resource/resource_ref.h +652 -0
  301. cuda/cccl/headers/include/cuda/__numeric/add_overflow.h +306 -0
  302. cuda/cccl/headers/include/cuda/__numeric/narrow.h +108 -0
  303. cuda/cccl/headers/include/cuda/__numeric/overflow_cast.h +59 -0
  304. cuda/cccl/headers/include/cuda/__numeric/overflow_result.h +43 -0
  305. cuda/cccl/headers/include/cuda/__nvtx/nvtx.h +120 -0
  306. cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +2983 -0
  307. cuda/cccl/headers/include/cuda/__ptx/instructions/barrier_cluster.h +43 -0
  308. cuda/cccl/headers/include/cuda/__ptx/instructions/bfind.h +41 -0
  309. cuda/cccl/headers/include/cuda/__ptx/instructions/bmsk.h +41 -0
  310. cuda/cccl/headers/include/cuda/__ptx/instructions/clusterlaunchcontrol.h +41 -0
  311. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk.h +44 -0
  312. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_commit_group.h +43 -0
  313. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h +45 -0
  314. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_wait_group.h +43 -0
  315. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_mbarrier_arrive.h +42 -0
  316. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_reduce_async_bulk.h +60 -0
  317. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_reduce_async_bulk_tensor.h +43 -0
  318. cuda/cccl/headers/include/cuda/__ptx/instructions/elect_sync.h +41 -0
  319. cuda/cccl/headers/include/cuda/__ptx/instructions/exit.h +41 -0
  320. cuda/cccl/headers/include/cuda/__ptx/instructions/fence.h +49 -0
  321. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/barrier_cluster.h +115 -0
  322. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/bfind.h +190 -0
  323. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/bmsk.h +54 -0
  324. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/clusterlaunchcontrol.h +242 -0
  325. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk.h +197 -0
  326. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.h +25 -0
  327. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.h +54 -0
  328. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.h +997 -0
  329. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_gather_scatter.h +318 -0
  330. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.h +671 -0
  331. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.h +46 -0
  332. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive.h +26 -0
  333. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive_noinc.h +26 -0
  334. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.h +1470 -0
  335. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.h +132 -0
  336. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.h +132 -0
  337. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.h +601 -0
  338. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/elect_sync.h +36 -0
  339. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/exit.h +25 -0
  340. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence.h +208 -0
  341. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.h +31 -0
  342. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_alias.h +25 -0
  343. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_async.h +58 -0
  344. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_async_generic_sync_restrict.h +64 -0
  345. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.h +102 -0
  346. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_sync_restrict.h +64 -0
  347. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/get_sreg.h +949 -0
  348. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/getctarank.h +32 -0
  349. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/ld.h +5542 -0
  350. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive.h +399 -0
  351. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.h +184 -0
  352. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.h +34 -0
  353. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_expect_tx.h +102 -0
  354. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_init.h +27 -0
  355. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.h +143 -0
  356. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.h +144 -0
  357. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.h +286 -0
  358. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.h +290 -0
  359. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_ld_reduce.h +2202 -0
  360. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_red.h +1362 -0
  361. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_st.h +236 -0
  362. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/prmt.h +230 -0
  363. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/red_async.h +460 -0
  364. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/shl.h +96 -0
  365. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/shr.h +168 -0
  366. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st.h +1490 -0
  367. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st_async.h +123 -0
  368. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st_bulk.h +31 -0
  369. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_alloc.h +132 -0
  370. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_commit.h +99 -0
  371. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_cp.h +765 -0
  372. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_fence.h +58 -0
  373. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_ld.h +4927 -0
  374. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_mma.h +4291 -0
  375. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_mma_ws.h +7110 -0
  376. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_shift.h +42 -0
  377. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_st.h +5063 -0
  378. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_wait.h +56 -0
  379. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.h +71 -0
  380. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tensormap_replace.h +1030 -0
  381. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/trap.h +25 -0
  382. cuda/cccl/headers/include/cuda/__ptx/instructions/get_sreg.h +43 -0
  383. cuda/cccl/headers/include/cuda/__ptx/instructions/getctarank.h +43 -0
  384. cuda/cccl/headers/include/cuda/__ptx/instructions/ld.h +41 -0
  385. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_arrive.h +45 -0
  386. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_expect_tx.h +41 -0
  387. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_init.h +43 -0
  388. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_wait.h +46 -0
  389. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_ld_reduce.h +41 -0
  390. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_red.h +41 -0
  391. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_st.h +41 -0
  392. cuda/cccl/headers/include/cuda/__ptx/instructions/prmt.h +41 -0
  393. cuda/cccl/headers/include/cuda/__ptx/instructions/red_async.h +43 -0
  394. cuda/cccl/headers/include/cuda/__ptx/instructions/shfl_sync.h +244 -0
  395. cuda/cccl/headers/include/cuda/__ptx/instructions/shl.h +41 -0
  396. cuda/cccl/headers/include/cuda/__ptx/instructions/shr.h +41 -0
  397. cuda/cccl/headers/include/cuda/__ptx/instructions/st.h +41 -0
  398. cuda/cccl/headers/include/cuda/__ptx/instructions/st_async.h +43 -0
  399. cuda/cccl/headers/include/cuda/__ptx/instructions/st_bulk.h +41 -0
  400. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_alloc.h +41 -0
  401. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_commit.h +41 -0
  402. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_cp.h +41 -0
  403. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_fence.h +41 -0
  404. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_ld.h +41 -0
  405. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_mma.h +41 -0
  406. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_mma_ws.h +41 -0
  407. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_shift.h +41 -0
  408. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_st.h +41 -0
  409. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_wait.h +41 -0
  410. cuda/cccl/headers/include/cuda/__ptx/instructions/tensormap_cp_fenceproxy.h +43 -0
  411. cuda/cccl/headers/include/cuda/__ptx/instructions/tensormap_replace.h +43 -0
  412. cuda/cccl/headers/include/cuda/__ptx/instructions/trap.h +41 -0
  413. cuda/cccl/headers/include/cuda/__ptx/pragmas/enable_smem_spilling.h +47 -0
  414. cuda/cccl/headers/include/cuda/__ptx/ptx_dot_variants.h +230 -0
  415. cuda/cccl/headers/include/cuda/__ptx/ptx_helper_functions.h +176 -0
  416. cuda/cccl/headers/include/cuda/__random/feistel_bijection.h +105 -0
  417. cuda/cccl/headers/include/cuda/__random/random_bijection.h +88 -0
  418. cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +99 -0
  419. cuda/cccl/headers/include/cuda/__runtime/types.h +41 -0
  420. cuda/cccl/headers/include/cuda/__semaphore/counting_semaphore.h +53 -0
  421. cuda/cccl/headers/include/cuda/__stream/get_stream.h +110 -0
  422. cuda/cccl/headers/include/cuda/__stream/stream.h +141 -0
  423. cuda/cccl/headers/include/cuda/__stream/stream_ref.h +303 -0
  424. cuda/cccl/headers/include/cuda/__type_traits/is_floating_point.h +47 -0
  425. cuda/cccl/headers/include/cuda/__type_traits/is_specialization_of.h +37 -0
  426. cuda/cccl/headers/include/cuda/__utility/__basic_any/access.h +88 -0
  427. cuda/cccl/headers/include/cuda/__utility/__basic_any/any_cast.h +83 -0
  428. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_base.h +148 -0
  429. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_from.h +96 -0
  430. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_fwd.h +128 -0
  431. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_ptr.h +304 -0
  432. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_ref.h +337 -0
  433. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_value.h +590 -0
  434. cuda/cccl/headers/include/cuda/__utility/__basic_any/conversions.h +169 -0
  435. cuda/cccl/headers/include/cuda/__utility/__basic_any/dynamic_any_cast.h +107 -0
  436. cuda/cccl/headers/include/cuda/__utility/__basic_any/interfaces.h +359 -0
  437. cuda/cccl/headers/include/cuda/__utility/__basic_any/iset.h +142 -0
  438. cuda/cccl/headers/include/cuda/__utility/__basic_any/overrides.h +64 -0
  439. cuda/cccl/headers/include/cuda/__utility/__basic_any/rtti.h +257 -0
  440. cuda/cccl/headers/include/cuda/__utility/__basic_any/semiregular.h +322 -0
  441. cuda/cccl/headers/include/cuda/__utility/__basic_any/storage.h +79 -0
  442. cuda/cccl/headers/include/cuda/__utility/__basic_any/tagged_ptr.h +58 -0
  443. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtcall.h +162 -0
  444. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_functions.h +184 -0
  445. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_ptrs.h +80 -0
  446. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_tables.h +155 -0
  447. cuda/cccl/headers/include/cuda/__utility/basic_any.h +507 -0
  448. cuda/cccl/headers/include/cuda/__utility/immovable.h +50 -0
  449. cuda/cccl/headers/include/cuda/__utility/in_range.h +65 -0
  450. cuda/cccl/headers/include/cuda/__utility/inherit.h +36 -0
  451. cuda/cccl/headers/include/cuda/__utility/no_init.h +29 -0
  452. cuda/cccl/headers/include/cuda/__utility/static_for.h +79 -0
  453. cuda/cccl/headers/include/cuda/__warp/lane_mask.h +326 -0
  454. cuda/cccl/headers/include/cuda/__warp/warp_match_all.h +65 -0
  455. cuda/cccl/headers/include/cuda/__warp/warp_shuffle.h +251 -0
  456. cuda/cccl/headers/include/cuda/access_property +26 -0
  457. cuda/cccl/headers/include/cuda/algorithm +27 -0
  458. cuda/cccl/headers/include/cuda/annotated_ptr +29 -0
  459. cuda/cccl/headers/include/cuda/atomic +27 -0
  460. cuda/cccl/headers/include/cuda/barrier +267 -0
  461. cuda/cccl/headers/include/cuda/bit +29 -0
  462. cuda/cccl/headers/include/cuda/cmath +37 -0
  463. cuda/cccl/headers/include/cuda/devices +33 -0
  464. cuda/cccl/headers/include/cuda/discard_memory +32 -0
  465. cuda/cccl/headers/include/cuda/functional +32 -0
  466. cuda/cccl/headers/include/cuda/iterator +39 -0
  467. cuda/cccl/headers/include/cuda/latch +27 -0
  468. cuda/cccl/headers/include/cuda/mdspan +28 -0
  469. cuda/cccl/headers/include/cuda/memory +35 -0
  470. cuda/cccl/headers/include/cuda/memory_resource +35 -0
  471. cuda/cccl/headers/include/cuda/numeric +29 -0
  472. cuda/cccl/headers/include/cuda/pipeline +579 -0
  473. cuda/cccl/headers/include/cuda/ptx +129 -0
  474. cuda/cccl/headers/include/cuda/semaphore +31 -0
  475. cuda/cccl/headers/include/cuda/std/__algorithm/adjacent_find.h +59 -0
  476. cuda/cccl/headers/include/cuda/std/__algorithm/all_of.h +45 -0
  477. cuda/cccl/headers/include/cuda/std/__algorithm/any_of.h +45 -0
  478. cuda/cccl/headers/include/cuda/std/__algorithm/binary_search.h +53 -0
  479. cuda/cccl/headers/include/cuda/std/__algorithm/clamp.h +48 -0
  480. cuda/cccl/headers/include/cuda/std/__algorithm/comp.h +58 -0
  481. cuda/cccl/headers/include/cuda/std/__algorithm/comp_ref_type.h +85 -0
  482. cuda/cccl/headers/include/cuda/std/__algorithm/copy.h +142 -0
  483. cuda/cccl/headers/include/cuda/std/__algorithm/copy_backward.h +80 -0
  484. cuda/cccl/headers/include/cuda/std/__algorithm/copy_if.h +47 -0
  485. cuda/cccl/headers/include/cuda/std/__algorithm/copy_n.h +73 -0
  486. cuda/cccl/headers/include/cuda/std/__algorithm/count.h +49 -0
  487. cuda/cccl/headers/include/cuda/std/__algorithm/count_if.h +49 -0
  488. cuda/cccl/headers/include/cuda/std/__algorithm/equal.h +128 -0
  489. cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +101 -0
  490. cuda/cccl/headers/include/cuda/std/__algorithm/fill.h +58 -0
  491. cuda/cccl/headers/include/cuda/std/__algorithm/fill_n.h +51 -0
  492. cuda/cccl/headers/include/cuda/std/__algorithm/find.h +62 -0
  493. cuda/cccl/headers/include/cuda/std/__algorithm/find_end.h +225 -0
  494. cuda/cccl/headers/include/cuda/std/__algorithm/find_first_of.h +73 -0
  495. cuda/cccl/headers/include/cuda/std/__algorithm/find_if.h +46 -0
  496. cuda/cccl/headers/include/cuda/std/__algorithm/find_if_not.h +46 -0
  497. cuda/cccl/headers/include/cuda/std/__algorithm/for_each.h +42 -0
  498. cuda/cccl/headers/include/cuda/std/__algorithm/for_each_n.h +48 -0
  499. cuda/cccl/headers/include/cuda/std/__algorithm/generate.h +41 -0
  500. cuda/cccl/headers/include/cuda/std/__algorithm/generate_n.h +46 -0
  501. cuda/cccl/headers/include/cuda/std/__algorithm/half_positive.h +49 -0
  502. cuda/cccl/headers/include/cuda/std/__algorithm/in_fun_result.h +55 -0
  503. cuda/cccl/headers/include/cuda/std/__algorithm/includes.h +90 -0
  504. cuda/cccl/headers/include/cuda/std/__algorithm/is_heap.h +50 -0
  505. cuda/cccl/headers/include/cuda/std/__algorithm/is_heap_until.h +83 -0
  506. cuda/cccl/headers/include/cuda/std/__algorithm/is_partitioned.h +57 -0
  507. cuda/cccl/headers/include/cuda/std/__algorithm/is_permutation.h +252 -0
  508. cuda/cccl/headers/include/cuda/std/__algorithm/is_sorted.h +49 -0
  509. cuda/cccl/headers/include/cuda/std/__algorithm/is_sorted_until.h +68 -0
  510. cuda/cccl/headers/include/cuda/std/__algorithm/iter_swap.h +82 -0
  511. cuda/cccl/headers/include/cuda/std/__algorithm/iterator_operations.h +185 -0
  512. cuda/cccl/headers/include/cuda/std/__algorithm/lexicographical_compare.h +68 -0
  513. cuda/cccl/headers/include/cuda/std/__algorithm/lower_bound.h +82 -0
  514. cuda/cccl/headers/include/cuda/std/__algorithm/make_heap.h +70 -0
  515. cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +88 -0
  516. cuda/cccl/headers/include/cuda/std/__algorithm/max.h +62 -0
  517. cuda/cccl/headers/include/cuda/std/__algorithm/max_element.h +67 -0
  518. cuda/cccl/headers/include/cuda/std/__algorithm/merge.h +89 -0
  519. cuda/cccl/headers/include/cuda/std/__algorithm/min.h +62 -0
  520. cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +87 -0
  521. cuda/cccl/headers/include/cuda/std/__algorithm/minmax.h +66 -0
  522. cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +139 -0
  523. cuda/cccl/headers/include/cuda/std/__algorithm/mismatch.h +83 -0
  524. cuda/cccl/headers/include/cuda/std/__algorithm/move.h +86 -0
  525. cuda/cccl/headers/include/cuda/std/__algorithm/move_backward.h +84 -0
  526. cuda/cccl/headers/include/cuda/std/__algorithm/next_permutation.h +88 -0
  527. cuda/cccl/headers/include/cuda/std/__algorithm/none_of.h +45 -0
  528. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort.h +102 -0
  529. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort_copy.h +122 -0
  530. cuda/cccl/headers/include/cuda/std/__algorithm/partition.h +120 -0
  531. cuda/cccl/headers/include/cuda/std/__algorithm/partition_copy.h +59 -0
  532. cuda/cccl/headers/include/cuda/std/__algorithm/partition_point.h +61 -0
  533. cuda/cccl/headers/include/cuda/std/__algorithm/pop_heap.h +93 -0
  534. cuda/cccl/headers/include/cuda/std/__algorithm/prev_permutation.h +88 -0
  535. cuda/cccl/headers/include/cuda/std/__algorithm/push_heap.h +100 -0
  536. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_for_each.h +84 -0
  537. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_for_each_n.h +68 -0
  538. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_iterator_concept.h +65 -0
  539. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_min.h +98 -0
  540. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_min_element.h +68 -0
  541. cuda/cccl/headers/include/cuda/std/__algorithm/remove.h +55 -0
  542. cuda/cccl/headers/include/cuda/std/__algorithm/remove_copy.h +47 -0
  543. cuda/cccl/headers/include/cuda/std/__algorithm/remove_copy_if.h +47 -0
  544. cuda/cccl/headers/include/cuda/std/__algorithm/remove_if.h +56 -0
  545. cuda/cccl/headers/include/cuda/std/__algorithm/replace.h +45 -0
  546. cuda/cccl/headers/include/cuda/std/__algorithm/replace_copy.h +54 -0
  547. cuda/cccl/headers/include/cuda/std/__algorithm/replace_copy_if.h +50 -0
  548. cuda/cccl/headers/include/cuda/std/__algorithm/replace_if.h +45 -0
  549. cuda/cccl/headers/include/cuda/std/__algorithm/reverse.h +81 -0
  550. cuda/cccl/headers/include/cuda/std/__algorithm/reverse_copy.h +43 -0
  551. cuda/cccl/headers/include/cuda/std/__algorithm/rotate.h +261 -0
  552. cuda/cccl/headers/include/cuda/std/__algorithm/rotate_copy.h +40 -0
  553. cuda/cccl/headers/include/cuda/std/__algorithm/search.h +185 -0
  554. cuda/cccl/headers/include/cuda/std/__algorithm/search_n.h +163 -0
  555. cuda/cccl/headers/include/cuda/std/__algorithm/set_difference.h +95 -0
  556. cuda/cccl/headers/include/cuda/std/__algorithm/set_intersection.h +122 -0
  557. cuda/cccl/headers/include/cuda/std/__algorithm/set_symmetric_difference.h +134 -0
  558. cuda/cccl/headers/include/cuda/std/__algorithm/set_union.h +128 -0
  559. cuda/cccl/headers/include/cuda/std/__algorithm/shift_left.h +84 -0
  560. cuda/cccl/headers/include/cuda/std/__algorithm/shift_right.h +144 -0
  561. cuda/cccl/headers/include/cuda/std/__algorithm/sift_down.h +139 -0
  562. cuda/cccl/headers/include/cuda/std/__algorithm/sort_heap.h +70 -0
  563. cuda/cccl/headers/include/cuda/std/__algorithm/swap_ranges.h +78 -0
  564. cuda/cccl/headers/include/cuda/std/__algorithm/transform.h +59 -0
  565. cuda/cccl/headers/include/cuda/std/__algorithm/unique.h +76 -0
  566. cuda/cccl/headers/include/cuda/std/__algorithm/unique_copy.h +155 -0
  567. cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_iter.h +95 -0
  568. cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_range.h +126 -0
  569. cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +83 -0
  570. cuda/cccl/headers/include/cuda/std/__algorithm_ +26 -0
  571. cuda/cccl/headers/include/cuda/std/__atomic/api/common.h +192 -0
  572. cuda/cccl/headers/include/cuda/std/__atomic/api/owned.h +136 -0
  573. cuda/cccl/headers/include/cuda/std/__atomic/api/reference.h +118 -0
  574. cuda/cccl/headers/include/cuda/std/__atomic/functions/common.h +58 -0
  575. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_local.h +208 -0
  576. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_derived.h +401 -0
  577. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated.h +3971 -0
  578. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h +177 -0
  579. cuda/cccl/headers/include/cuda/std/__atomic/functions/host.h +211 -0
  580. cuda/cccl/headers/include/cuda/std/__atomic/functions.h +33 -0
  581. cuda/cccl/headers/include/cuda/std/__atomic/order.h +159 -0
  582. cuda/cccl/headers/include/cuda/std/__atomic/platform/msvc_to_builtins.h +654 -0
  583. cuda/cccl/headers/include/cuda/std/__atomic/platform.h +93 -0
  584. cuda/cccl/headers/include/cuda/std/__atomic/scopes.h +105 -0
  585. cuda/cccl/headers/include/cuda/std/__atomic/types/base.h +249 -0
  586. cuda/cccl/headers/include/cuda/std/__atomic/types/common.h +104 -0
  587. cuda/cccl/headers/include/cuda/std/__atomic/types/locked.h +225 -0
  588. cuda/cccl/headers/include/cuda/std/__atomic/types/reference.h +72 -0
  589. cuda/cccl/headers/include/cuda/std/__atomic/types/small.h +228 -0
  590. cuda/cccl/headers/include/cuda/std/__atomic/types.h +52 -0
  591. cuda/cccl/headers/include/cuda/std/__atomic/wait/notify_wait.h +95 -0
  592. cuda/cccl/headers/include/cuda/std/__atomic/wait/polling.h +65 -0
  593. cuda/cccl/headers/include/cuda/std/__barrier/barrier.h +227 -0
  594. cuda/cccl/headers/include/cuda/std/__barrier/empty_completion.h +37 -0
  595. cuda/cccl/headers/include/cuda/std/__barrier/poll_tester.h +82 -0
  596. cuda/cccl/headers/include/cuda/std/__bit/bit_cast.h +76 -0
  597. cuda/cccl/headers/include/cuda/std/__bit/byteswap.h +185 -0
  598. cuda/cccl/headers/include/cuda/std/__bit/countl.h +174 -0
  599. cuda/cccl/headers/include/cuda/std/__bit/countr.h +185 -0
  600. cuda/cccl/headers/include/cuda/std/__bit/endian.h +39 -0
  601. cuda/cccl/headers/include/cuda/std/__bit/has_single_bit.h +43 -0
  602. cuda/cccl/headers/include/cuda/std/__bit/integral.h +126 -0
  603. cuda/cccl/headers/include/cuda/std/__bit/popcount.h +154 -0
  604. cuda/cccl/headers/include/cuda/std/__bit/reference.h +1272 -0
  605. cuda/cccl/headers/include/cuda/std/__bit/rotate.h +94 -0
  606. cuda/cccl/headers/include/cuda/std/__cccl/algorithm_wrapper.h +36 -0
  607. cuda/cccl/headers/include/cuda/std/__cccl/architecture.h +78 -0
  608. cuda/cccl/headers/include/cuda/std/__cccl/assert.h +161 -0
  609. cuda/cccl/headers/include/cuda/std/__cccl/attributes.h +206 -0
  610. cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +673 -0
  611. cuda/cccl/headers/include/cuda/std/__cccl/compiler.h +217 -0
  612. cuda/cccl/headers/include/cuda/std/__cccl/cuda_capabilities.h +51 -0
  613. cuda/cccl/headers/include/cuda/std/__cccl/cuda_toolkit.h +56 -0
  614. cuda/cccl/headers/include/cuda/std/__cccl/deprecated.h +88 -0
  615. cuda/cccl/headers/include/cuda/std/__cccl/diagnostic.h +131 -0
  616. cuda/cccl/headers/include/cuda/std/__cccl/dialect.h +123 -0
  617. cuda/cccl/headers/include/cuda/std/__cccl/epilogue.h +344 -0
  618. cuda/cccl/headers/include/cuda/std/__cccl/exceptions.h +91 -0
  619. cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +74 -0
  620. cuda/cccl/headers/include/cuda/std/__cccl/extended_data_types.h +160 -0
  621. cuda/cccl/headers/include/cuda/std/__cccl/host_std_lib.h +52 -0
  622. cuda/cccl/headers/include/cuda/std/__cccl/is_non_narrowing_convertible.h +73 -0
  623. cuda/cccl/headers/include/cuda/std/__cccl/memory_wrapper.h +36 -0
  624. cuda/cccl/headers/include/cuda/std/__cccl/numeric_wrapper.h +36 -0
  625. cuda/cccl/headers/include/cuda/std/__cccl/os.h +54 -0
  626. cuda/cccl/headers/include/cuda/std/__cccl/preprocessor.h +1286 -0
  627. cuda/cccl/headers/include/cuda/std/__cccl/prologue.h +281 -0
  628. cuda/cccl/headers/include/cuda/std/__cccl/ptx_isa.h +253 -0
  629. cuda/cccl/headers/include/cuda/std/__cccl/rtti.h +72 -0
  630. cuda/cccl/headers/include/cuda/std/__cccl/sequence_access.h +87 -0
  631. cuda/cccl/headers/include/cuda/std/__cccl/system_header.h +38 -0
  632. cuda/cccl/headers/include/cuda/std/__cccl/unreachable.h +31 -0
  633. cuda/cccl/headers/include/cuda/std/__cccl/version.h +26 -0
  634. cuda/cccl/headers/include/cuda/std/__cccl/visibility.h +171 -0
  635. cuda/cccl/headers/include/cuda/std/__charconv/chars_format.h +81 -0
  636. cuda/cccl/headers/include/cuda/std/__charconv/from_chars.h +154 -0
  637. cuda/cccl/headers/include/cuda/std/__charconv/from_chars_result.h +56 -0
  638. cuda/cccl/headers/include/cuda/std/__charconv/to_chars.h +148 -0
  639. cuda/cccl/headers/include/cuda/std/__charconv/to_chars_result.h +56 -0
  640. cuda/cccl/headers/include/cuda/std/__charconv_ +31 -0
  641. cuda/cccl/headers/include/cuda/std/__chrono/calendar.h +54 -0
  642. cuda/cccl/headers/include/cuda/std/__chrono/day.h +162 -0
  643. cuda/cccl/headers/include/cuda/std/__chrono/duration.h +503 -0
  644. cuda/cccl/headers/include/cuda/std/__chrono/file_clock.h +55 -0
  645. cuda/cccl/headers/include/cuda/std/__chrono/high_resolution_clock.h +46 -0
  646. cuda/cccl/headers/include/cuda/std/__chrono/month.h +187 -0
  647. cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +60 -0
  648. cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +80 -0
  649. cuda/cccl/headers/include/cuda/std/__chrono/time_point.h +259 -0
  650. cuda/cccl/headers/include/cuda/std/__chrono/year.h +186 -0
  651. cuda/cccl/headers/include/cuda/std/__cmath/abs.h +127 -0
  652. cuda/cccl/headers/include/cuda/std/__cmath/copysign.h +88 -0
  653. cuda/cccl/headers/include/cuda/std/__cmath/error_functions.h +200 -0
  654. cuda/cccl/headers/include/cuda/std/__cmath/exponential_functions.h +784 -0
  655. cuda/cccl/headers/include/cuda/std/__cmath/fdim.h +118 -0
  656. cuda/cccl/headers/include/cuda/std/__cmath/fma.h +125 -0
  657. cuda/cccl/headers/include/cuda/std/__cmath/fpclassify.h +231 -0
  658. cuda/cccl/headers/include/cuda/std/__cmath/gamma.h +205 -0
  659. cuda/cccl/headers/include/cuda/std/__cmath/hyperbolic_functions.h +286 -0
  660. cuda/cccl/headers/include/cuda/std/__cmath/hypot.h +221 -0
  661. cuda/cccl/headers/include/cuda/std/__cmath/inverse_hyperbolic_functions.h +286 -0
  662. cuda/cccl/headers/include/cuda/std/__cmath/inverse_trigonometric_functions.h +371 -0
  663. cuda/cccl/headers/include/cuda/std/__cmath/isfinite.h +167 -0
  664. cuda/cccl/headers/include/cuda/std/__cmath/isinf.h +205 -0
  665. cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +186 -0
  666. cuda/cccl/headers/include/cuda/std/__cmath/isnormal.h +138 -0
  667. cuda/cccl/headers/include/cuda/std/__cmath/lerp.h +101 -0
  668. cuda/cccl/headers/include/cuda/std/__cmath/logarithms.h +534 -0
  669. cuda/cccl/headers/include/cuda/std/__cmath/min_max.h +287 -0
  670. cuda/cccl/headers/include/cuda/std/__cmath/modulo.h +208 -0
  671. cuda/cccl/headers/include/cuda/std/__cmath/nan.h +54 -0
  672. cuda/cccl/headers/include/cuda/std/__cmath/remainder.h +206 -0
  673. cuda/cccl/headers/include/cuda/std/__cmath/roots.h +199 -0
  674. cuda/cccl/headers/include/cuda/std/__cmath/rounding_functions.h +984 -0
  675. cuda/cccl/headers/include/cuda/std/__cmath/signbit.h +56 -0
  676. cuda/cccl/headers/include/cuda/std/__cmath/traits.h +238 -0
  677. cuda/cccl/headers/include/cuda/std/__cmath/trigonometric_functions.h +328 -0
  678. cuda/cccl/headers/include/cuda/std/__complex/arg.h +84 -0
  679. cuda/cccl/headers/include/cuda/std/__complex/complex.h +669 -0
  680. cuda/cccl/headers/include/cuda/std/__complex/exponential_functions.h +411 -0
  681. cuda/cccl/headers/include/cuda/std/__complex/hyperbolic_functions.h +117 -0
  682. cuda/cccl/headers/include/cuda/std/__complex/inverse_hyperbolic_functions.h +216 -0
  683. cuda/cccl/headers/include/cuda/std/__complex/inverse_trigonometric_functions.h +131 -0
  684. cuda/cccl/headers/include/cuda/std/__complex/literals.h +86 -0
  685. cuda/cccl/headers/include/cuda/std/__complex/logarithms.h +303 -0
  686. cuda/cccl/headers/include/cuda/std/__complex/math.h +159 -0
  687. cuda/cccl/headers/include/cuda/std/__complex/nvbf16.h +323 -0
  688. cuda/cccl/headers/include/cuda/std/__complex/nvfp16.h +322 -0
  689. cuda/cccl/headers/include/cuda/std/__complex/roots.h +214 -0
  690. cuda/cccl/headers/include/cuda/std/__complex/trigonometric_functions.h +61 -0
  691. cuda/cccl/headers/include/cuda/std/__complex/tuple.h +107 -0
  692. cuda/cccl/headers/include/cuda/std/__complex/vector_support.h +130 -0
  693. cuda/cccl/headers/include/cuda/std/__concepts/arithmetic.h +56 -0
  694. cuda/cccl/headers/include/cuda/std/__concepts/assignable.h +64 -0
  695. cuda/cccl/headers/include/cuda/std/__concepts/boolean_testable.h +63 -0
  696. cuda/cccl/headers/include/cuda/std/__concepts/class_or_enum.h +45 -0
  697. cuda/cccl/headers/include/cuda/std/__concepts/common_reference_with.h +69 -0
  698. cuda/cccl/headers/include/cuda/std/__concepts/common_with.h +82 -0
  699. cuda/cccl/headers/include/cuda/std/__concepts/concept_macros.h +341 -0
  700. cuda/cccl/headers/include/cuda/std/__concepts/constructible.h +174 -0
  701. cuda/cccl/headers/include/cuda/std/__concepts/convertible_to.h +70 -0
  702. cuda/cccl/headers/include/cuda/std/__concepts/copyable.h +60 -0
  703. cuda/cccl/headers/include/cuda/std/__concepts/derived_from.h +56 -0
  704. cuda/cccl/headers/include/cuda/std/__concepts/destructible.h +76 -0
  705. cuda/cccl/headers/include/cuda/std/__concepts/different_from.h +38 -0
  706. cuda/cccl/headers/include/cuda/std/__concepts/equality_comparable.h +100 -0
  707. cuda/cccl/headers/include/cuda/std/__concepts/invocable.h +80 -0
  708. cuda/cccl/headers/include/cuda/std/__concepts/movable.h +58 -0
  709. cuda/cccl/headers/include/cuda/std/__concepts/predicate.h +54 -0
  710. cuda/cccl/headers/include/cuda/std/__concepts/regular.h +54 -0
  711. cuda/cccl/headers/include/cuda/std/__concepts/relation.h +77 -0
  712. cuda/cccl/headers/include/cuda/std/__concepts/same_as.h +39 -0
  713. cuda/cccl/headers/include/cuda/std/__concepts/semiregular.h +54 -0
  714. cuda/cccl/headers/include/cuda/std/__concepts/swappable.h +206 -0
  715. cuda/cccl/headers/include/cuda/std/__concepts/totally_ordered.h +101 -0
  716. cuda/cccl/headers/include/cuda/std/__cstddef/byte.h +113 -0
  717. cuda/cccl/headers/include/cuda/std/__cstddef/types.h +52 -0
  718. cuda/cccl/headers/include/cuda/std/__cstdlib/abs.h +57 -0
  719. cuda/cccl/headers/include/cuda/std/__cstdlib/aligned_alloc.h +66 -0
  720. cuda/cccl/headers/include/cuda/std/__cstdlib/div.h +96 -0
  721. cuda/cccl/headers/include/cuda/std/__cstdlib/malloc.h +70 -0
  722. cuda/cccl/headers/include/cuda/std/__cstring/memcpy.h +61 -0
  723. cuda/cccl/headers/include/cuda/std/__cstring/memset.h +46 -0
  724. cuda/cccl/headers/include/cuda/std/__cuda/api_wrapper.h +62 -0
  725. cuda/cccl/headers/include/cuda/std/__exception/cuda_error.h +139 -0
  726. cuda/cccl/headers/include/cuda/std/__exception/terminate.h +73 -0
  727. cuda/cccl/headers/include/cuda/std/__execution/env.h +455 -0
  728. cuda/cccl/headers/include/cuda/std/__execution/policy.h +88 -0
  729. cuda/cccl/headers/include/cuda/std/__expected/bad_expected_access.h +127 -0
  730. cuda/cccl/headers/include/cuda/std/__expected/expected.h +1941 -0
  731. cuda/cccl/headers/include/cuda/std/__expected/expected_base.h +1050 -0
  732. cuda/cccl/headers/include/cuda/std/__expected/unexpect.h +37 -0
  733. cuda/cccl/headers/include/cuda/std/__expected/unexpected.h +165 -0
  734. cuda/cccl/headers/include/cuda/std/__floating_point/arithmetic.h +56 -0
  735. cuda/cccl/headers/include/cuda/std/__floating_point/cast.h +812 -0
  736. cuda/cccl/headers/include/cuda/std/__floating_point/cccl_fp.h +125 -0
  737. cuda/cccl/headers/include/cuda/std/__floating_point/common_type.h +48 -0
  738. cuda/cccl/headers/include/cuda/std/__floating_point/constants.h +376 -0
  739. cuda/cccl/headers/include/cuda/std/__floating_point/conversion_rank_order.h +124 -0
  740. cuda/cccl/headers/include/cuda/std/__floating_point/cuda_fp_types.h +116 -0
  741. cuda/cccl/headers/include/cuda/std/__floating_point/decompose.h +69 -0
  742. cuda/cccl/headers/include/cuda/std/__floating_point/format.h +162 -0
  743. cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +40 -0
  744. cuda/cccl/headers/include/cuda/std/__floating_point/mask.h +78 -0
  745. cuda/cccl/headers/include/cuda/std/__floating_point/native_type.h +81 -0
  746. cuda/cccl/headers/include/cuda/std/__floating_point/overflow_handler.h +139 -0
  747. cuda/cccl/headers/include/cuda/std/__floating_point/properties.h +229 -0
  748. cuda/cccl/headers/include/cuda/std/__floating_point/storage.h +248 -0
  749. cuda/cccl/headers/include/cuda/std/__floating_point/traits.h +172 -0
  750. cuda/cccl/headers/include/cuda/std/__format/buffer.h +48 -0
  751. cuda/cccl/headers/include/cuda/std/__format/concepts.h +69 -0
  752. cuda/cccl/headers/include/cuda/std/__format/format_arg.h +282 -0
  753. cuda/cccl/headers/include/cuda/std/__format/format_arg_store.h +279 -0
  754. cuda/cccl/headers/include/cuda/std/__format/format_args.h +122 -0
  755. cuda/cccl/headers/include/cuda/std/__format/format_context.h +92 -0
  756. cuda/cccl/headers/include/cuda/std/__format/format_error.h +76 -0
  757. cuda/cccl/headers/include/cuda/std/__format/format_integral.h +237 -0
  758. cuda/cccl/headers/include/cuda/std/__format/format_parse_context.h +124 -0
  759. cuda/cccl/headers/include/cuda/std/__format/format_spec_parser.h +1230 -0
  760. cuda/cccl/headers/include/cuda/std/__format/formatter.h +59 -0
  761. cuda/cccl/headers/include/cuda/std/__format/formatters/bool.h +101 -0
  762. cuda/cccl/headers/include/cuda/std/__format/formatters/char.h +124 -0
  763. cuda/cccl/headers/include/cuda/std/__format/formatters/fp.h +101 -0
  764. cuda/cccl/headers/include/cuda/std/__format/formatters/int.h +174 -0
  765. cuda/cccl/headers/include/cuda/std/__format/formatters/ptr.h +104 -0
  766. cuda/cccl/headers/include/cuda/std/__format/formatters/str.h +178 -0
  767. cuda/cccl/headers/include/cuda/std/__format/output_utils.h +272 -0
  768. cuda/cccl/headers/include/cuda/std/__format/parse_arg_id.h +138 -0
  769. cuda/cccl/headers/include/cuda/std/__format_ +45 -0
  770. cuda/cccl/headers/include/cuda/std/__functional/binary_function.h +63 -0
  771. cuda/cccl/headers/include/cuda/std/__functional/binary_negate.h +65 -0
  772. cuda/cccl/headers/include/cuda/std/__functional/bind.h +334 -0
  773. cuda/cccl/headers/include/cuda/std/__functional/bind_back.h +80 -0
  774. cuda/cccl/headers/include/cuda/std/__functional/bind_front.h +73 -0
  775. cuda/cccl/headers/include/cuda/std/__functional/binder1st.h +74 -0
  776. cuda/cccl/headers/include/cuda/std/__functional/binder2nd.h +74 -0
  777. cuda/cccl/headers/include/cuda/std/__functional/compose.h +68 -0
  778. cuda/cccl/headers/include/cuda/std/__functional/default_searcher.h +75 -0
  779. cuda/cccl/headers/include/cuda/std/__functional/function.h +1275 -0
  780. cuda/cccl/headers/include/cuda/std/__functional/hash.h +649 -0
  781. cuda/cccl/headers/include/cuda/std/__functional/identity.h +57 -0
  782. cuda/cccl/headers/include/cuda/std/__functional/invoke.h +296 -0
  783. cuda/cccl/headers/include/cuda/std/__functional/is_transparent.h +41 -0
  784. cuda/cccl/headers/include/cuda/std/__functional/mem_fn.h +66 -0
  785. cuda/cccl/headers/include/cuda/std/__functional/mem_fun_ref.h +211 -0
  786. cuda/cccl/headers/include/cuda/std/__functional/not_fn.h +120 -0
  787. cuda/cccl/headers/include/cuda/std/__functional/operations.h +534 -0
  788. cuda/cccl/headers/include/cuda/std/__functional/perfect_forward.h +128 -0
  789. cuda/cccl/headers/include/cuda/std/__functional/pointer_to_binary_function.h +64 -0
  790. cuda/cccl/headers/include/cuda/std/__functional/pointer_to_unary_function.h +63 -0
  791. cuda/cccl/headers/include/cuda/std/__functional/ranges_operations.h +113 -0
  792. cuda/cccl/headers/include/cuda/std/__functional/reference_wrapper.h +113 -0
  793. cuda/cccl/headers/include/cuda/std/__functional/unary_function.h +62 -0
  794. cuda/cccl/headers/include/cuda/std/__functional/unary_negate.h +65 -0
  795. cuda/cccl/headers/include/cuda/std/__functional/unwrap_ref.h +56 -0
  796. cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +262 -0
  797. cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +53 -0
  798. cuda/cccl/headers/include/cuda/std/__fwd/array.h +42 -0
  799. cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +74 -0
  800. cuda/cccl/headers/include/cuda/std/__fwd/complex.h +75 -0
  801. cuda/cccl/headers/include/cuda/std/__fwd/expected.h +46 -0
  802. cuda/cccl/headers/include/cuda/std/__fwd/format.h +84 -0
  803. cuda/cccl/headers/include/cuda/std/__fwd/fp.h +37 -0
  804. cuda/cccl/headers/include/cuda/std/__fwd/get.h +123 -0
  805. cuda/cccl/headers/include/cuda/std/__fwd/hash.h +34 -0
  806. cuda/cccl/headers/include/cuda/std/__fwd/iterator.h +43 -0
  807. cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +122 -0
  808. cuda/cccl/headers/include/cuda/std/__fwd/memory_resource.h +37 -0
  809. cuda/cccl/headers/include/cuda/std/__fwd/optional.h +39 -0
  810. cuda/cccl/headers/include/cuda/std/__fwd/pair.h +47 -0
  811. cuda/cccl/headers/include/cuda/std/__fwd/reference_wrapper.h +34 -0
  812. cuda/cccl/headers/include/cuda/std/__fwd/span.h +45 -0
  813. cuda/cccl/headers/include/cuda/std/__fwd/string.h +112 -0
  814. cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +91 -0
  815. cuda/cccl/headers/include/cuda/std/__fwd/subrange.h +55 -0
  816. cuda/cccl/headers/include/cuda/std/__fwd/tuple.h +34 -0
  817. cuda/cccl/headers/include/cuda/std/__fwd/unexpected.h +40 -0
  818. cuda/cccl/headers/include/cuda/std/__internal/cpp_dialect.h +44 -0
  819. cuda/cccl/headers/include/cuda/std/__internal/features.h +72 -0
  820. cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +143 -0
  821. cuda/cccl/headers/include/cuda/std/__iterator/access.h +128 -0
  822. cuda/cccl/headers/include/cuda/std/__iterator/advance.h +228 -0
  823. cuda/cccl/headers/include/cuda/std/__iterator/back_insert_iterator.h +163 -0
  824. cuda/cccl/headers/include/cuda/std/__iterator/bounded_iter.h +253 -0
  825. cuda/cccl/headers/include/cuda/std/__iterator/concepts.h +645 -0
  826. cuda/cccl/headers/include/cuda/std/__iterator/counted_iterator.h +464 -0
  827. cuda/cccl/headers/include/cuda/std/__iterator/data.h +61 -0
  828. cuda/cccl/headers/include/cuda/std/__iterator/default_sentinel.h +36 -0
  829. cuda/cccl/headers/include/cuda/std/__iterator/distance.h +126 -0
  830. cuda/cccl/headers/include/cuda/std/__iterator/empty.h +53 -0
  831. cuda/cccl/headers/include/cuda/std/__iterator/erase_if_container.h +53 -0
  832. cuda/cccl/headers/include/cuda/std/__iterator/front_insert_iterator.h +99 -0
  833. cuda/cccl/headers/include/cuda/std/__iterator/incrementable_traits.h +143 -0
  834. cuda/cccl/headers/include/cuda/std/__iterator/indirectly_comparable.h +55 -0
  835. cuda/cccl/headers/include/cuda/std/__iterator/insert_iterator.h +107 -0
  836. cuda/cccl/headers/include/cuda/std/__iterator/istream_iterator.h +146 -0
  837. cuda/cccl/headers/include/cuda/std/__iterator/istreambuf_iterator.h +161 -0
  838. cuda/cccl/headers/include/cuda/std/__iterator/iter_move.h +161 -0
  839. cuda/cccl/headers/include/cuda/std/__iterator/iter_swap.h +163 -0
  840. cuda/cccl/headers/include/cuda/std/__iterator/iterator.h +44 -0
  841. cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +847 -0
  842. cuda/cccl/headers/include/cuda/std/__iterator/mergeable.h +72 -0
  843. cuda/cccl/headers/include/cuda/std/__iterator/move_iterator.h +432 -0
  844. cuda/cccl/headers/include/cuda/std/__iterator/move_sentinel.h +73 -0
  845. cuda/cccl/headers/include/cuda/std/__iterator/next.h +101 -0
  846. cuda/cccl/headers/include/cuda/std/__iterator/ostream_iterator.h +95 -0
  847. cuda/cccl/headers/include/cuda/std/__iterator/ostreambuf_iterator.h +100 -0
  848. cuda/cccl/headers/include/cuda/std/__iterator/permutable.h +54 -0
  849. cuda/cccl/headers/include/cuda/std/__iterator/prev.h +90 -0
  850. cuda/cccl/headers/include/cuda/std/__iterator/projected.h +61 -0
  851. cuda/cccl/headers/include/cuda/std/__iterator/readable_traits.h +156 -0
  852. cuda/cccl/headers/include/cuda/std/__iterator/reverse_access.h +142 -0
  853. cuda/cccl/headers/include/cuda/std/__iterator/reverse_iterator.h +371 -0
  854. cuda/cccl/headers/include/cuda/std/__iterator/size.h +69 -0
  855. cuda/cccl/headers/include/cuda/std/__iterator/sortable.h +55 -0
  856. cuda/cccl/headers/include/cuda/std/__iterator/unreachable_sentinel.h +84 -0
  857. cuda/cccl/headers/include/cuda/std/__iterator/wrap_iter.h +245 -0
  858. cuda/cccl/headers/include/cuda/std/__latch/latch.h +88 -0
  859. cuda/cccl/headers/include/cuda/std/__limits/numeric_limits.h +617 -0
  860. cuda/cccl/headers/include/cuda/std/__limits/numeric_limits_ext.h +753 -0
  861. cuda/cccl/headers/include/cuda/std/__linalg/conj_if_needed.h +78 -0
  862. cuda/cccl/headers/include/cuda/std/__linalg/conjugate_transposed.h +54 -0
  863. cuda/cccl/headers/include/cuda/std/__linalg/conjugated.h +139 -0
  864. cuda/cccl/headers/include/cuda/std/__linalg/scaled.h +132 -0
  865. cuda/cccl/headers/include/cuda/std/__linalg/transposed.h +321 -0
  866. cuda/cccl/headers/include/cuda/std/__mdspan/aligned_accessor.h +97 -0
  867. cuda/cccl/headers/include/cuda/std/__mdspan/concepts.h +139 -0
  868. cuda/cccl/headers/include/cuda/std/__mdspan/default_accessor.h +73 -0
  869. cuda/cccl/headers/include/cuda/std/__mdspan/empty_base.h +352 -0
  870. cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +759 -0
  871. cuda/cccl/headers/include/cuda/std/__mdspan/layout_left.h +314 -0
  872. cuda/cccl/headers/include/cuda/std/__mdspan/layout_right.h +307 -0
  873. cuda/cccl/headers/include/cuda/std/__mdspan/layout_stride.h +605 -0
  874. cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +512 -0
  875. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_extents.h +193 -0
  876. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_helper.h +189 -0
  877. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_mapping.h +344 -0
  878. cuda/cccl/headers/include/cuda/std/__memory/addressof.h +67 -0
  879. cuda/cccl/headers/include/cuda/std/__memory/align.h +67 -0
  880. cuda/cccl/headers/include/cuda/std/__memory/allocate_at_least.h +81 -0
  881. cuda/cccl/headers/include/cuda/std/__memory/allocation_guard.h +100 -0
  882. cuda/cccl/headers/include/cuda/std/__memory/allocator.h +320 -0
  883. cuda/cccl/headers/include/cuda/std/__memory/allocator_arg_t.h +84 -0
  884. cuda/cccl/headers/include/cuda/std/__memory/allocator_destructor.h +59 -0
  885. cuda/cccl/headers/include/cuda/std/__memory/allocator_traits.h +525 -0
  886. cuda/cccl/headers/include/cuda/std/__memory/assume_aligned.h +60 -0
  887. cuda/cccl/headers/include/cuda/std/__memory/builtin_new_allocator.h +87 -0
  888. cuda/cccl/headers/include/cuda/std/__memory/compressed_pair.h +225 -0
  889. cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +246 -0
  890. cuda/cccl/headers/include/cuda/std/__memory/destruct_n.h +91 -0
  891. cuda/cccl/headers/include/cuda/std/__memory/is_sufficiently_aligned.h +46 -0
  892. cuda/cccl/headers/include/cuda/std/__memory/pointer_traits.h +246 -0
  893. cuda/cccl/headers/include/cuda/std/__memory/runtime_assume_aligned.h +62 -0
  894. cuda/cccl/headers/include/cuda/std/__memory/temporary_buffer.h +92 -0
  895. cuda/cccl/headers/include/cuda/std/__memory/uninitialized_algorithms.h +678 -0
  896. cuda/cccl/headers/include/cuda/std/__memory/unique_ptr.h +765 -0
  897. cuda/cccl/headers/include/cuda/std/__memory/uses_allocator.h +54 -0
  898. cuda/cccl/headers/include/cuda/std/__memory/voidify.h +41 -0
  899. cuda/cccl/headers/include/cuda/std/__memory_ +34 -0
  900. cuda/cccl/headers/include/cuda/std/__new/allocate.h +126 -0
  901. cuda/cccl/headers/include/cuda/std/__new/bad_alloc.h +57 -0
  902. cuda/cccl/headers/include/cuda/std/__new/launder.h +53 -0
  903. cuda/cccl/headers/include/cuda/std/__new_ +29 -0
  904. cuda/cccl/headers/include/cuda/std/__numeric/accumulate.h +56 -0
  905. cuda/cccl/headers/include/cuda/std/__numeric/adjacent_difference.h +72 -0
  906. cuda/cccl/headers/include/cuda/std/__numeric/exclusive_scan.h +66 -0
  907. cuda/cccl/headers/include/cuda/std/__numeric/gcd_lcm.h +78 -0
  908. cuda/cccl/headers/include/cuda/std/__numeric/inclusive_scan.h +73 -0
  909. cuda/cccl/headers/include/cuda/std/__numeric/inner_product.h +62 -0
  910. cuda/cccl/headers/include/cuda/std/__numeric/iota.h +42 -0
  911. cuda/cccl/headers/include/cuda/std/__numeric/midpoint.h +97 -0
  912. cuda/cccl/headers/include/cuda/std/__numeric/partial_sum.h +69 -0
  913. cuda/cccl/headers/include/cuda/std/__numeric/reduce.h +60 -0
  914. cuda/cccl/headers/include/cuda/std/__numeric/transform_exclusive_scan.h +51 -0
  915. cuda/cccl/headers/include/cuda/std/__numeric/transform_inclusive_scan.h +65 -0
  916. cuda/cccl/headers/include/cuda/std/__numeric/transform_reduce.h +72 -0
  917. cuda/cccl/headers/include/cuda/std/__optional/bad_optional_access.h +74 -0
  918. cuda/cccl/headers/include/cuda/std/__optional/hash.h +53 -0
  919. cuda/cccl/headers/include/cuda/std/__optional/make_optional.h +61 -0
  920. cuda/cccl/headers/include/cuda/std/__optional/nullopt.h +43 -0
  921. cuda/cccl/headers/include/cuda/std/__optional/optional.h +859 -0
  922. cuda/cccl/headers/include/cuda/std/__optional/optional_base.h +433 -0
  923. cuda/cccl/headers/include/cuda/std/__optional/optional_ref.h +324 -0
  924. cuda/cccl/headers/include/cuda/std/__random/generate_canonical.h +56 -0
  925. cuda/cccl/headers/include/cuda/std/__random/is_seed_sequence.h +39 -0
  926. cuda/cccl/headers/include/cuda/std/__random/is_valid.h +106 -0
  927. cuda/cccl/headers/include/cuda/std/__random/linear_congruential_engine.h +398 -0
  928. cuda/cccl/headers/include/cuda/std/__random/uniform_int_distribution.h +335 -0
  929. cuda/cccl/headers/include/cuda/std/__random/uniform_real_distribution.h +183 -0
  930. cuda/cccl/headers/include/cuda/std/__random_ +29 -0
  931. cuda/cccl/headers/include/cuda/std/__ranges/access.h +303 -0
  932. cuda/cccl/headers/include/cuda/std/__ranges/all.h +98 -0
  933. cuda/cccl/headers/include/cuda/std/__ranges/compressed_movable_box.h +892 -0
  934. cuda/cccl/headers/include/cuda/std/__ranges/concepts.h +302 -0
  935. cuda/cccl/headers/include/cuda/std/__ranges/counted.h +90 -0
  936. cuda/cccl/headers/include/cuda/std/__ranges/dangling.h +54 -0
  937. cuda/cccl/headers/include/cuda/std/__ranges/data.h +136 -0
  938. cuda/cccl/headers/include/cuda/std/__ranges/empty.h +109 -0
  939. cuda/cccl/headers/include/cuda/std/__ranges/empty_view.h +77 -0
  940. cuda/cccl/headers/include/cuda/std/__ranges/enable_borrowed_range.h +41 -0
  941. cuda/cccl/headers/include/cuda/std/__ranges/enable_view.h +78 -0
  942. cuda/cccl/headers/include/cuda/std/__ranges/from_range.h +36 -0
  943. cuda/cccl/headers/include/cuda/std/__ranges/iota_view.h +266 -0
  944. cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +410 -0
  945. cuda/cccl/headers/include/cuda/std/__ranges/owning_view.h +162 -0
  946. cuda/cccl/headers/include/cuda/std/__ranges/range_adaptor.h +110 -0
  947. cuda/cccl/headers/include/cuda/std/__ranges/rbegin.h +175 -0
  948. cuda/cccl/headers/include/cuda/std/__ranges/ref_view.h +121 -0
  949. cuda/cccl/headers/include/cuda/std/__ranges/rend.h +182 -0
  950. cuda/cccl/headers/include/cuda/std/__ranges/repeat_view.h +345 -0
  951. cuda/cccl/headers/include/cuda/std/__ranges/single_view.h +155 -0
  952. cuda/cccl/headers/include/cuda/std/__ranges/size.h +201 -0
  953. cuda/cccl/headers/include/cuda/std/__ranges/subrange.h +513 -0
  954. cuda/cccl/headers/include/cuda/std/__ranges/take_view.h +476 -0
  955. cuda/cccl/headers/include/cuda/std/__ranges/take_while_view.h +259 -0
  956. cuda/cccl/headers/include/cuda/std/__ranges/transform_view.h +522 -0
  957. cuda/cccl/headers/include/cuda/std/__ranges/unwrap_end.h +53 -0
  958. cuda/cccl/headers/include/cuda/std/__ranges/view_interface.h +183 -0
  959. cuda/cccl/headers/include/cuda/std/__ranges/views.h +38 -0
  960. cuda/cccl/headers/include/cuda/std/__semaphore/atomic_semaphore.h +234 -0
  961. cuda/cccl/headers/include/cuda/std/__semaphore/counting_semaphore.h +51 -0
  962. cuda/cccl/headers/include/cuda/std/__string/char_traits.h +191 -0
  963. cuda/cccl/headers/include/cuda/std/__string/constexpr_c_functions.h +581 -0
  964. cuda/cccl/headers/include/cuda/std/__string/helper_functions.h +296 -0
  965. cuda/cccl/headers/include/cuda/std/__string/string_view.h +244 -0
  966. cuda/cccl/headers/include/cuda/std/__string_ +29 -0
  967. cuda/cccl/headers/include/cuda/std/__system_error/errc.h +51 -0
  968. cuda/cccl/headers/include/cuda/std/__system_error_ +26 -0
  969. cuda/cccl/headers/include/cuda/std/__thread/threading_support.h +106 -0
  970. cuda/cccl/headers/include/cuda/std/__thread/threading_support_cuda.h +47 -0
  971. cuda/cccl/headers/include/cuda/std/__thread/threading_support_external.h +41 -0
  972. cuda/cccl/headers/include/cuda/std/__thread/threading_support_pthread.h +143 -0
  973. cuda/cccl/headers/include/cuda/std/__thread/threading_support_win32.h +87 -0
  974. cuda/cccl/headers/include/cuda/std/__tuple_dir/ignore.h +51 -0
  975. cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +120 -0
  976. cuda/cccl/headers/include/cuda/std/__tuple_dir/sfinae_helpers.h +260 -0
  977. cuda/cccl/headers/include/cuda/std/__tuple_dir/structured_bindings.h +212 -0
  978. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_element.h +70 -0
  979. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_indices.h +44 -0
  980. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +84 -0
  981. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +68 -0
  982. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_size.h +79 -0
  983. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_types.h +35 -0
  984. cuda/cccl/headers/include/cuda/std/__tuple_dir/vector_types.h +290 -0
  985. cuda/cccl/headers/include/cuda/std/__type_traits/add_const.h +40 -0
  986. cuda/cccl/headers/include/cuda/std/__type_traits/add_cv.h +40 -0
  987. cuda/cccl/headers/include/cuda/std/__type_traits/add_lvalue_reference.h +62 -0
  988. cuda/cccl/headers/include/cuda/std/__type_traits/add_pointer.h +65 -0
  989. cuda/cccl/headers/include/cuda/std/__type_traits/add_rvalue_reference.h +62 -0
  990. cuda/cccl/headers/include/cuda/std/__type_traits/add_volatile.h +40 -0
  991. cuda/cccl/headers/include/cuda/std/__type_traits/aligned_storage.h +149 -0
  992. cuda/cccl/headers/include/cuda/std/__type_traits/aligned_union.h +62 -0
  993. cuda/cccl/headers/include/cuda/std/__type_traits/alignment_of.h +41 -0
  994. cuda/cccl/headers/include/cuda/std/__type_traits/always_false.h +35 -0
  995. cuda/cccl/headers/include/cuda/std/__type_traits/can_extract_key.h +68 -0
  996. cuda/cccl/headers/include/cuda/std/__type_traits/common_reference.h +262 -0
  997. cuda/cccl/headers/include/cuda/std/__type_traits/common_type.h +173 -0
  998. cuda/cccl/headers/include/cuda/std/__type_traits/conditional.h +65 -0
  999. cuda/cccl/headers/include/cuda/std/__type_traits/conjunction.h +67 -0
  1000. cuda/cccl/headers/include/cuda/std/__type_traits/copy_cv.h +50 -0
  1001. cuda/cccl/headers/include/cuda/std/__type_traits/copy_cvref.h +148 -0
  1002. cuda/cccl/headers/include/cuda/std/__type_traits/decay.h +83 -0
  1003. cuda/cccl/headers/include/cuda/std/__type_traits/dependent_type.h +35 -0
  1004. cuda/cccl/headers/include/cuda/std/__type_traits/disjunction.h +77 -0
  1005. cuda/cccl/headers/include/cuda/std/__type_traits/enable_if.h +43 -0
  1006. cuda/cccl/headers/include/cuda/std/__type_traits/extent.h +68 -0
  1007. cuda/cccl/headers/include/cuda/std/__type_traits/fold.h +47 -0
  1008. cuda/cccl/headers/include/cuda/std/__type_traits/has_unique_object_representation.h +46 -0
  1009. cuda/cccl/headers/include/cuda/std/__type_traits/has_virtual_destructor.h +42 -0
  1010. cuda/cccl/headers/include/cuda/std/__type_traits/integral_constant.h +62 -0
  1011. cuda/cccl/headers/include/cuda/std/__type_traits/is_abstract.h +42 -0
  1012. cuda/cccl/headers/include/cuda/std/__type_traits/is_aggregate.h +42 -0
  1013. cuda/cccl/headers/include/cuda/std/__type_traits/is_allocator.h +46 -0
  1014. cuda/cccl/headers/include/cuda/std/__type_traits/is_arithmetic.h +42 -0
  1015. cuda/cccl/headers/include/cuda/std/__type_traits/is_array.h +62 -0
  1016. cuda/cccl/headers/include/cuda/std/__type_traits/is_assignable.h +78 -0
  1017. cuda/cccl/headers/include/cuda/std/__type_traits/is_base_of.h +42 -0
  1018. cuda/cccl/headers/include/cuda/std/__type_traits/is_bounded_array.h +44 -0
  1019. cuda/cccl/headers/include/cuda/std/__type_traits/is_callable.h +60 -0
  1020. cuda/cccl/headers/include/cuda/std/__type_traits/is_char_like_type.h +38 -0
  1021. cuda/cccl/headers/include/cuda/std/__type_traits/is_class.h +42 -0
  1022. cuda/cccl/headers/include/cuda/std/__type_traits/is_compound.h +58 -0
  1023. cuda/cccl/headers/include/cuda/std/__type_traits/is_const.h +56 -0
  1024. cuda/cccl/headers/include/cuda/std/__type_traits/is_constant_evaluated.h +51 -0
  1025. cuda/cccl/headers/include/cuda/std/__type_traits/is_constructible.h +174 -0
  1026. cuda/cccl/headers/include/cuda/std/__type_traits/is_convertible.h +211 -0
  1027. cuda/cccl/headers/include/cuda/std/__type_traits/is_copy_assignable.h +43 -0
  1028. cuda/cccl/headers/include/cuda/std/__type_traits/is_copy_constructible.h +43 -0
  1029. cuda/cccl/headers/include/cuda/std/__type_traits/is_core_convertible.h +47 -0
  1030. cuda/cccl/headers/include/cuda/std/__type_traits/is_corresponding_member.h +42 -0
  1031. cuda/cccl/headers/include/cuda/std/__type_traits/is_default_constructible.h +40 -0
  1032. cuda/cccl/headers/include/cuda/std/__type_traits/is_destructible.h +115 -0
  1033. cuda/cccl/headers/include/cuda/std/__type_traits/is_empty.h +42 -0
  1034. cuda/cccl/headers/include/cuda/std/__type_traits/is_enum.h +42 -0
  1035. cuda/cccl/headers/include/cuda/std/__type_traits/is_execution_policy.h +81 -0
  1036. cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_arithmetic.h +38 -0
  1037. cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_floating_point.h +79 -0
  1038. cuda/cccl/headers/include/cuda/std/__type_traits/is_final.h +42 -0
  1039. cuda/cccl/headers/include/cuda/std/__type_traits/is_floating_point.h +53 -0
  1040. cuda/cccl/headers/include/cuda/std/__type_traits/is_function.h +61 -0
  1041. cuda/cccl/headers/include/cuda/std/__type_traits/is_fundamental.h +56 -0
  1042. cuda/cccl/headers/include/cuda/std/__type_traits/is_implicitly_default_constructible.h +57 -0
  1043. cuda/cccl/headers/include/cuda/std/__type_traits/is_integer.h +45 -0
  1044. cuda/cccl/headers/include/cuda/std/__type_traits/is_integral.h +123 -0
  1045. cuda/cccl/headers/include/cuda/std/__type_traits/is_layout_compatible.h +45 -0
  1046. cuda/cccl/headers/include/cuda/std/__type_traits/is_literal_type.h +42 -0
  1047. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_function_pointer.h +79 -0
  1048. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_object_pointer.h +57 -0
  1049. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_pointer.h +57 -0
  1050. cuda/cccl/headers/include/cuda/std/__type_traits/is_move_assignable.h +43 -0
  1051. cuda/cccl/headers/include/cuda/std/__type_traits/is_move_constructible.h +42 -0
  1052. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_assignable.h +70 -0
  1053. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_constructible.h +84 -0
  1054. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_convertible.h +59 -0
  1055. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_copy_assignable.h +60 -0
  1056. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_copy_constructible.h +43 -0
  1057. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_default_constructible.h +54 -0
  1058. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_destructible.h +82 -0
  1059. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_move_assignable.h +60 -0
  1060. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_move_constructible.h +42 -0
  1061. cuda/cccl/headers/include/cuda/std/__type_traits/is_null_pointer.h +43 -0
  1062. cuda/cccl/headers/include/cuda/std/__type_traits/is_object.h +57 -0
  1063. cuda/cccl/headers/include/cuda/std/__type_traits/is_one_of.h +37 -0
  1064. cuda/cccl/headers/include/cuda/std/__type_traits/is_pod.h +42 -0
  1065. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer.h +60 -0
  1066. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer_interconvertible_base_of.h +84 -0
  1067. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer_interconvertible_with_class.h +42 -0
  1068. cuda/cccl/headers/include/cuda/std/__type_traits/is_polymorphic.h +42 -0
  1069. cuda/cccl/headers/include/cuda/std/__type_traits/is_primary_template.h +121 -0
  1070. cuda/cccl/headers/include/cuda/std/__type_traits/is_reference.h +95 -0
  1071. cuda/cccl/headers/include/cuda/std/__type_traits/is_reference_wrapper.h +50 -0
  1072. cuda/cccl/headers/include/cuda/std/__type_traits/is_referenceable.h +55 -0
  1073. cuda/cccl/headers/include/cuda/std/__type_traits/is_same.h +88 -0
  1074. cuda/cccl/headers/include/cuda/std/__type_traits/is_scalar.h +60 -0
  1075. cuda/cccl/headers/include/cuda/std/__type_traits/is_scoped_enum.h +49 -0
  1076. cuda/cccl/headers/include/cuda/std/__type_traits/is_signed.h +65 -0
  1077. cuda/cccl/headers/include/cuda/std/__type_traits/is_signed_integer.h +59 -0
  1078. cuda/cccl/headers/include/cuda/std/__type_traits/is_standard_layout.h +42 -0
  1079. cuda/cccl/headers/include/cuda/std/__type_traits/is_swappable.h +202 -0
  1080. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivial.h +42 -0
  1081. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_assignable.h +43 -0
  1082. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_constructible.h +43 -0
  1083. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_assignable.h +46 -0
  1084. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_constructible.h +45 -0
  1085. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copyable.h +42 -0
  1086. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_default_constructible.h +42 -0
  1087. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_destructible.h +58 -0
  1088. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_assignable.h +45 -0
  1089. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_constructible.h +44 -0
  1090. cuda/cccl/headers/include/cuda/std/__type_traits/is_unbounded_array.h +43 -0
  1091. cuda/cccl/headers/include/cuda/std/__type_traits/is_union.h +42 -0
  1092. cuda/cccl/headers/include/cuda/std/__type_traits/is_unsigned.h +66 -0
  1093. cuda/cccl/headers/include/cuda/std/__type_traits/is_unsigned_integer.h +59 -0
  1094. cuda/cccl/headers/include/cuda/std/__type_traits/is_valid_expansion.h +41 -0
  1095. cuda/cccl/headers/include/cuda/std/__type_traits/is_void.h +55 -0
  1096. cuda/cccl/headers/include/cuda/std/__type_traits/is_volatile.h +56 -0
  1097. cuda/cccl/headers/include/cuda/std/__type_traits/lazy.h +35 -0
  1098. cuda/cccl/headers/include/cuda/std/__type_traits/make_const_lvalue_ref.h +36 -0
  1099. cuda/cccl/headers/include/cuda/std/__type_traits/make_nbit_int.h +107 -0
  1100. cuda/cccl/headers/include/cuda/std/__type_traits/make_signed.h +140 -0
  1101. cuda/cccl/headers/include/cuda/std/__type_traits/make_unsigned.h +151 -0
  1102. cuda/cccl/headers/include/cuda/std/__type_traits/maybe_const.h +36 -0
  1103. cuda/cccl/headers/include/cuda/std/__type_traits/nat.h +39 -0
  1104. cuda/cccl/headers/include/cuda/std/__type_traits/negation.h +44 -0
  1105. cuda/cccl/headers/include/cuda/std/__type_traits/num_bits.h +122 -0
  1106. cuda/cccl/headers/include/cuda/std/__type_traits/promote.h +163 -0
  1107. cuda/cccl/headers/include/cuda/std/__type_traits/rank.h +60 -0
  1108. cuda/cccl/headers/include/cuda/std/__type_traits/reference_constructs_from_temporary.h +57 -0
  1109. cuda/cccl/headers/include/cuda/std/__type_traits/reference_converts_from_temporary.h +56 -0
  1110. cuda/cccl/headers/include/cuda/std/__type_traits/remove_all_extents.h +66 -0
  1111. cuda/cccl/headers/include/cuda/std/__type_traits/remove_const.h +59 -0
  1112. cuda/cccl/headers/include/cuda/std/__type_traits/remove_const_ref.h +37 -0
  1113. cuda/cccl/headers/include/cuda/std/__type_traits/remove_cv.h +57 -0
  1114. cuda/cccl/headers/include/cuda/std/__type_traits/remove_cvref.h +57 -0
  1115. cuda/cccl/headers/include/cuda/std/__type_traits/remove_extent.h +65 -0
  1116. cuda/cccl/headers/include/cuda/std/__type_traits/remove_pointer.h +73 -0
  1117. cuda/cccl/headers/include/cuda/std/__type_traits/remove_reference.h +72 -0
  1118. cuda/cccl/headers/include/cuda/std/__type_traits/remove_volatile.h +58 -0
  1119. cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +47 -0
  1120. cuda/cccl/headers/include/cuda/std/__type_traits/type_identity.h +40 -0
  1121. cuda/cccl/headers/include/cuda/std/__type_traits/type_list.h +1067 -0
  1122. cuda/cccl/headers/include/cuda/std/__type_traits/type_set.h +131 -0
  1123. cuda/cccl/headers/include/cuda/std/__type_traits/underlying_type.h +52 -0
  1124. cuda/cccl/headers/include/cuda/std/__type_traits/void_t.h +34 -0
  1125. cuda/cccl/headers/include/cuda/std/__utility/as_const.h +52 -0
  1126. cuda/cccl/headers/include/cuda/std/__utility/auto_cast.h +34 -0
  1127. cuda/cccl/headers/include/cuda/std/__utility/cmp.h +116 -0
  1128. cuda/cccl/headers/include/cuda/std/__utility/convert_to_integral.h +101 -0
  1129. cuda/cccl/headers/include/cuda/std/__utility/declval.h +76 -0
  1130. cuda/cccl/headers/include/cuda/std/__utility/exception_guard.h +161 -0
  1131. cuda/cccl/headers/include/cuda/std/__utility/exchange.h +46 -0
  1132. cuda/cccl/headers/include/cuda/std/__utility/forward.h +59 -0
  1133. cuda/cccl/headers/include/cuda/std/__utility/forward_like.h +55 -0
  1134. cuda/cccl/headers/include/cuda/std/__utility/in_place.h +86 -0
  1135. cuda/cccl/headers/include/cuda/std/__utility/integer_sequence.h +251 -0
  1136. cuda/cccl/headers/include/cuda/std/__utility/monostate.h +99 -0
  1137. cuda/cccl/headers/include/cuda/std/__utility/move.h +74 -0
  1138. cuda/cccl/headers/include/cuda/std/__utility/pair.h +791 -0
  1139. cuda/cccl/headers/include/cuda/std/__utility/piecewise_construct.h +37 -0
  1140. cuda/cccl/headers/include/cuda/std/__utility/pod_tuple.h +527 -0
  1141. cuda/cccl/headers/include/cuda/std/__utility/priority_tag.h +40 -0
  1142. cuda/cccl/headers/include/cuda/std/__utility/rel_ops.h +63 -0
  1143. cuda/cccl/headers/include/cuda/std/__utility/swap.h +64 -0
  1144. cuda/cccl/headers/include/cuda/std/__utility/to_underlying.h +40 -0
  1145. cuda/cccl/headers/include/cuda/std/__utility/typeid.h +421 -0
  1146. cuda/cccl/headers/include/cuda/std/__utility/undefined.h +34 -0
  1147. cuda/cccl/headers/include/cuda/std/__utility/unreachable.h +37 -0
  1148. cuda/cccl/headers/include/cuda/std/array +518 -0
  1149. cuda/cccl/headers/include/cuda/std/atomic +810 -0
  1150. cuda/cccl/headers/include/cuda/std/barrier +42 -0
  1151. cuda/cccl/headers/include/cuda/std/bit +35 -0
  1152. cuda/cccl/headers/include/cuda/std/bitset +994 -0
  1153. cuda/cccl/headers/include/cuda/std/cassert +28 -0
  1154. cuda/cccl/headers/include/cuda/std/ccomplex +15 -0
  1155. cuda/cccl/headers/include/cuda/std/cfloat +59 -0
  1156. cuda/cccl/headers/include/cuda/std/chrono +26 -0
  1157. cuda/cccl/headers/include/cuda/std/climits +61 -0
  1158. cuda/cccl/headers/include/cuda/std/cmath +87 -0
  1159. cuda/cccl/headers/include/cuda/std/complex +50 -0
  1160. cuda/cccl/headers/include/cuda/std/concepts +48 -0
  1161. cuda/cccl/headers/include/cuda/std/cstddef +28 -0
  1162. cuda/cccl/headers/include/cuda/std/cstdint +178 -0
  1163. cuda/cccl/headers/include/cuda/std/cstdlib +30 -0
  1164. cuda/cccl/headers/include/cuda/std/cstring +110 -0
  1165. cuda/cccl/headers/include/cuda/std/ctime +154 -0
  1166. cuda/cccl/headers/include/cuda/std/detail/__config +45 -0
  1167. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +207 -0
  1168. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/algorithm +1721 -0
  1169. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/chrono +2509 -0
  1170. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/iosfwd +128 -0
  1171. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/stdexcept +120 -0
  1172. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/tuple +1365 -0
  1173. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/variant +2144 -0
  1174. cuda/cccl/headers/include/cuda/std/execution +29 -0
  1175. cuda/cccl/headers/include/cuda/std/expected +30 -0
  1176. cuda/cccl/headers/include/cuda/std/functional +56 -0
  1177. cuda/cccl/headers/include/cuda/std/initializer_list +44 -0
  1178. cuda/cccl/headers/include/cuda/std/inplace_vector +2170 -0
  1179. cuda/cccl/headers/include/cuda/std/iterator +70 -0
  1180. cuda/cccl/headers/include/cuda/std/latch +34 -0
  1181. cuda/cccl/headers/include/cuda/std/limits +28 -0
  1182. cuda/cccl/headers/include/cuda/std/linalg +30 -0
  1183. cuda/cccl/headers/include/cuda/std/mdspan +38 -0
  1184. cuda/cccl/headers/include/cuda/std/memory +39 -0
  1185. cuda/cccl/headers/include/cuda/std/numbers +346 -0
  1186. cuda/cccl/headers/include/cuda/std/numeric +41 -0
  1187. cuda/cccl/headers/include/cuda/std/optional +31 -0
  1188. cuda/cccl/headers/include/cuda/std/ranges +69 -0
  1189. cuda/cccl/headers/include/cuda/std/ratio +416 -0
  1190. cuda/cccl/headers/include/cuda/std/semaphore +31 -0
  1191. cuda/cccl/headers/include/cuda/std/source_location +83 -0
  1192. cuda/cccl/headers/include/cuda/std/span +628 -0
  1193. cuda/cccl/headers/include/cuda/std/string_view +925 -0
  1194. cuda/cccl/headers/include/cuda/std/tuple +26 -0
  1195. cuda/cccl/headers/include/cuda/std/type_traits +177 -0
  1196. cuda/cccl/headers/include/cuda/std/utility +70 -0
  1197. cuda/cccl/headers/include/cuda/std/variant +25 -0
  1198. cuda/cccl/headers/include/cuda/std/version +240 -0
  1199. cuda/cccl/headers/include/cuda/stream +31 -0
  1200. cuda/cccl/headers/include/cuda/stream_ref +59 -0
  1201. cuda/cccl/headers/include/cuda/type_traits +27 -0
  1202. cuda/cccl/headers/include/cuda/utility +28 -0
  1203. cuda/cccl/headers/include/cuda/version +16 -0
  1204. cuda/cccl/headers/include/cuda/warp +28 -0
  1205. cuda/cccl/headers/include/cuda/work_stealing +26 -0
  1206. cuda/cccl/headers/include/nv/detail/__preprocessor +169 -0
  1207. cuda/cccl/headers/include/nv/detail/__target_macros +718 -0
  1208. cuda/cccl/headers/include/nv/target +240 -0
  1209. cuda/cccl/headers/include/thrust/addressof.h +22 -0
  1210. cuda/cccl/headers/include/thrust/adjacent_difference.h +254 -0
  1211. cuda/cccl/headers/include/thrust/advance.h +57 -0
  1212. cuda/cccl/headers/include/thrust/allocate_unique.h +299 -0
  1213. cuda/cccl/headers/include/thrust/binary_search.h +1910 -0
  1214. cuda/cccl/headers/include/thrust/complex.h +858 -0
  1215. cuda/cccl/headers/include/thrust/copy.h +506 -0
  1216. cuda/cccl/headers/include/thrust/count.h +245 -0
  1217. cuda/cccl/headers/include/thrust/detail/adjacent_difference.inl +95 -0
  1218. cuda/cccl/headers/include/thrust/detail/alignment.h +81 -0
  1219. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +626 -0
  1220. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +192 -0
  1221. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +96 -0
  1222. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +81 -0
  1223. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +78 -0
  1224. cuda/cccl/headers/include/thrust/detail/allocator/no_throw_allocator.h +76 -0
  1225. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +115 -0
  1226. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +116 -0
  1227. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.h +77 -0
  1228. cuda/cccl/headers/include/thrust/detail/allocator_aware_execution_policy.h +99 -0
  1229. cuda/cccl/headers/include/thrust/detail/binary_search.inl +525 -0
  1230. cuda/cccl/headers/include/thrust/detail/caching_allocator.h +47 -0
  1231. cuda/cccl/headers/include/thrust/detail/complex/arithmetic.h +255 -0
  1232. cuda/cccl/headers/include/thrust/detail/complex/c99math.h +64 -0
  1233. cuda/cccl/headers/include/thrust/detail/complex/catrig.h +875 -0
  1234. cuda/cccl/headers/include/thrust/detail/complex/catrigf.h +589 -0
  1235. cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +233 -0
  1236. cuda/cccl/headers/include/thrust/detail/complex/ccoshf.h +161 -0
  1237. cuda/cccl/headers/include/thrust/detail/complex/cexp.h +195 -0
  1238. cuda/cccl/headers/include/thrust/detail/complex/cexpf.h +173 -0
  1239. cuda/cccl/headers/include/thrust/detail/complex/clog.h +223 -0
  1240. cuda/cccl/headers/include/thrust/detail/complex/clogf.h +210 -0
  1241. cuda/cccl/headers/include/thrust/detail/complex/complex.inl +263 -0
  1242. cuda/cccl/headers/include/thrust/detail/complex/cpow.h +50 -0
  1243. cuda/cccl/headers/include/thrust/detail/complex/cproj.h +81 -0
  1244. cuda/cccl/headers/include/thrust/detail/complex/csinh.h +228 -0
  1245. cuda/cccl/headers/include/thrust/detail/complex/csinhf.h +168 -0
  1246. cuda/cccl/headers/include/thrust/detail/complex/csqrt.h +178 -0
  1247. cuda/cccl/headers/include/thrust/detail/complex/csqrtf.h +174 -0
  1248. cuda/cccl/headers/include/thrust/detail/complex/ctanh.h +208 -0
  1249. cuda/cccl/headers/include/thrust/detail/complex/ctanhf.h +133 -0
  1250. cuda/cccl/headers/include/thrust/detail/complex/math_private.h +138 -0
  1251. cuda/cccl/headers/include/thrust/detail/complex/stream.h +73 -0
  1252. cuda/cccl/headers/include/thrust/detail/config/compiler.h +38 -0
  1253. cuda/cccl/headers/include/thrust/detail/config/config.h +43 -0
  1254. cuda/cccl/headers/include/thrust/detail/config/cpp_dialect.h +78 -0
  1255. cuda/cccl/headers/include/thrust/detail/config/device_system.h +55 -0
  1256. cuda/cccl/headers/include/thrust/detail/config/host_system.h +48 -0
  1257. cuda/cccl/headers/include/thrust/detail/config/memory_resource.h +41 -0
  1258. cuda/cccl/headers/include/thrust/detail/config/namespace.h +162 -0
  1259. cuda/cccl/headers/include/thrust/detail/config/simple_defines.h +48 -0
  1260. cuda/cccl/headers/include/thrust/detail/config.h +36 -0
  1261. cuda/cccl/headers/include/thrust/detail/contiguous_storage.h +228 -0
  1262. cuda/cccl/headers/include/thrust/detail/contiguous_storage.inl +273 -0
  1263. cuda/cccl/headers/include/thrust/detail/copy.h +72 -0
  1264. cuda/cccl/headers/include/thrust/detail/copy.inl +129 -0
  1265. cuda/cccl/headers/include/thrust/detail/copy_if.h +62 -0
  1266. cuda/cccl/headers/include/thrust/detail/copy_if.inl +102 -0
  1267. cuda/cccl/headers/include/thrust/detail/count.h +55 -0
  1268. cuda/cccl/headers/include/thrust/detail/count.inl +89 -0
  1269. cuda/cccl/headers/include/thrust/detail/device_ptr.inl +48 -0
  1270. cuda/cccl/headers/include/thrust/detail/equal.inl +93 -0
  1271. cuda/cccl/headers/include/thrust/detail/event_error.h +160 -0
  1272. cuda/cccl/headers/include/thrust/detail/execute_with_allocator.h +81 -0
  1273. cuda/cccl/headers/include/thrust/detail/execute_with_allocator_fwd.h +61 -0
  1274. cuda/cccl/headers/include/thrust/detail/execution_policy.h +120 -0
  1275. cuda/cccl/headers/include/thrust/detail/extrema.inl +184 -0
  1276. cuda/cccl/headers/include/thrust/detail/fill.inl +86 -0
  1277. cuda/cccl/headers/include/thrust/detail/find.inl +113 -0
  1278. cuda/cccl/headers/include/thrust/detail/for_each.inl +84 -0
  1279. cuda/cccl/headers/include/thrust/detail/function.h +49 -0
  1280. cuda/cccl/headers/include/thrust/detail/functional/actor.h +214 -0
  1281. cuda/cccl/headers/include/thrust/detail/functional/operators.h +386 -0
  1282. cuda/cccl/headers/include/thrust/detail/gather.inl +173 -0
  1283. cuda/cccl/headers/include/thrust/detail/generate.inl +86 -0
  1284. cuda/cccl/headers/include/thrust/detail/get_iterator_value.h +62 -0
  1285. cuda/cccl/headers/include/thrust/detail/inner_product.inl +118 -0
  1286. cuda/cccl/headers/include/thrust/detail/internal_functional.h +328 -0
  1287. cuda/cccl/headers/include/thrust/detail/logical.inl +113 -0
  1288. cuda/cccl/headers/include/thrust/detail/malloc_and_free.h +77 -0
  1289. cuda/cccl/headers/include/thrust/detail/malloc_and_free_fwd.h +45 -0
  1290. cuda/cccl/headers/include/thrust/detail/memory_algorithms.h +209 -0
  1291. cuda/cccl/headers/include/thrust/detail/merge.inl +276 -0
  1292. cuda/cccl/headers/include/thrust/detail/mismatch.inl +94 -0
  1293. cuda/cccl/headers/include/thrust/detail/overlapped_copy.h +124 -0
  1294. cuda/cccl/headers/include/thrust/detail/partition.inl +378 -0
  1295. cuda/cccl/headers/include/thrust/detail/pointer.h +309 -0
  1296. cuda/cccl/headers/include/thrust/detail/preprocessor.h +652 -0
  1297. cuda/cccl/headers/include/thrust/detail/random_bijection.h +177 -0
  1298. cuda/cccl/headers/include/thrust/detail/range/head_flags.h +116 -0
  1299. cuda/cccl/headers/include/thrust/detail/range/tail_flags.h +130 -0
  1300. cuda/cccl/headers/include/thrust/detail/raw_pointer_cast.h +52 -0
  1301. cuda/cccl/headers/include/thrust/detail/raw_reference_cast.h +192 -0
  1302. cuda/cccl/headers/include/thrust/detail/reduce.inl +377 -0
  1303. cuda/cccl/headers/include/thrust/detail/reference.h +494 -0
  1304. cuda/cccl/headers/include/thrust/detail/reference_forward_declaration.h +35 -0
  1305. cuda/cccl/headers/include/thrust/detail/remove.inl +213 -0
  1306. cuda/cccl/headers/include/thrust/detail/replace.inl +231 -0
  1307. cuda/cccl/headers/include/thrust/detail/reverse.inl +88 -0
  1308. cuda/cccl/headers/include/thrust/detail/scan.inl +518 -0
  1309. cuda/cccl/headers/include/thrust/detail/scatter.inl +157 -0
  1310. cuda/cccl/headers/include/thrust/detail/seq.h +66 -0
  1311. cuda/cccl/headers/include/thrust/detail/sequence.inl +109 -0
  1312. cuda/cccl/headers/include/thrust/detail/set_operations.inl +981 -0
  1313. cuda/cccl/headers/include/thrust/detail/shuffle.inl +86 -0
  1314. cuda/cccl/headers/include/thrust/detail/sort.inl +373 -0
  1315. cuda/cccl/headers/include/thrust/detail/static_assert.h +58 -0
  1316. cuda/cccl/headers/include/thrust/detail/static_map.h +167 -0
  1317. cuda/cccl/headers/include/thrust/detail/swap_ranges.inl +65 -0
  1318. cuda/cccl/headers/include/thrust/detail/tabulate.inl +62 -0
  1319. cuda/cccl/headers/include/thrust/detail/temporary_array.h +153 -0
  1320. cuda/cccl/headers/include/thrust/detail/temporary_array.inl +120 -0
  1321. cuda/cccl/headers/include/thrust/detail/temporary_buffer.h +81 -0
  1322. cuda/cccl/headers/include/thrust/detail/transform_reduce.inl +69 -0
  1323. cuda/cccl/headers/include/thrust/detail/transform_scan.inl +161 -0
  1324. cuda/cccl/headers/include/thrust/detail/trivial_sequence.h +130 -0
  1325. cuda/cccl/headers/include/thrust/detail/tuple_meta_transform.h +61 -0
  1326. cuda/cccl/headers/include/thrust/detail/type_deduction.h +62 -0
  1327. cuda/cccl/headers/include/thrust/detail/type_traits/has_member_function.h +47 -0
  1328. cuda/cccl/headers/include/thrust/detail/type_traits/has_nested_type.h +43 -0
  1329. cuda/cccl/headers/include/thrust/detail/type_traits/is_call_possible.h +167 -0
  1330. cuda/cccl/headers/include/thrust/detail/type_traits/is_commutative.h +69 -0
  1331. cuda/cccl/headers/include/thrust/detail/type_traits/is_metafunction_defined.h +39 -0
  1332. cuda/cccl/headers/include/thrust/detail/type_traits/is_thrust_pointer.h +59 -0
  1333. cuda/cccl/headers/include/thrust/detail/type_traits/iterator/is_output_iterator.h +46 -0
  1334. cuda/cccl/headers/include/thrust/detail/type_traits/minimum_type.h +89 -0
  1335. cuda/cccl/headers/include/thrust/detail/type_traits/pointer_traits.h +332 -0
  1336. cuda/cccl/headers/include/thrust/detail/type_traits.h +136 -0
  1337. cuda/cccl/headers/include/thrust/detail/uninitialized_copy.inl +90 -0
  1338. cuda/cccl/headers/include/thrust/detail/uninitialized_fill.inl +86 -0
  1339. cuda/cccl/headers/include/thrust/detail/unique.inl +373 -0
  1340. cuda/cccl/headers/include/thrust/detail/use_default.h +34 -0
  1341. cuda/cccl/headers/include/thrust/detail/vector_base.h +613 -0
  1342. cuda/cccl/headers/include/thrust/detail/vector_base.inl +1210 -0
  1343. cuda/cccl/headers/include/thrust/device_allocator.h +134 -0
  1344. cuda/cccl/headers/include/thrust/device_delete.h +74 -0
  1345. cuda/cccl/headers/include/thrust/device_free.h +85 -0
  1346. cuda/cccl/headers/include/thrust/device_make_unique.h +56 -0
  1347. cuda/cccl/headers/include/thrust/device_malloc.h +84 -0
  1348. cuda/cccl/headers/include/thrust/device_malloc_allocator.h +190 -0
  1349. cuda/cccl/headers/include/thrust/device_new.h +112 -0
  1350. cuda/cccl/headers/include/thrust/device_new_allocator.h +179 -0
  1351. cuda/cccl/headers/include/thrust/device_ptr.h +196 -0
  1352. cuda/cccl/headers/include/thrust/device_reference.h +983 -0
  1353. cuda/cccl/headers/include/thrust/device_vector.h +576 -0
  1354. cuda/cccl/headers/include/thrust/distance.h +43 -0
  1355. cuda/cccl/headers/include/thrust/equal.h +247 -0
  1356. cuda/cccl/headers/include/thrust/execution_policy.h +251 -0
  1357. cuda/cccl/headers/include/thrust/extrema.h +657 -0
  1358. cuda/cccl/headers/include/thrust/fill.h +200 -0
  1359. cuda/cccl/headers/include/thrust/find.h +382 -0
  1360. cuda/cccl/headers/include/thrust/for_each.h +261 -0
  1361. cuda/cccl/headers/include/thrust/functional.h +395 -0
  1362. cuda/cccl/headers/include/thrust/gather.h +464 -0
  1363. cuda/cccl/headers/include/thrust/generate.h +193 -0
  1364. cuda/cccl/headers/include/thrust/host_vector.h +576 -0
  1365. cuda/cccl/headers/include/thrust/inner_product.h +264 -0
  1366. cuda/cccl/headers/include/thrust/iterator/constant_iterator.h +221 -0
  1367. cuda/cccl/headers/include/thrust/iterator/counting_iterator.h +335 -0
  1368. cuda/cccl/headers/include/thrust/iterator/detail/any_assign.h +48 -0
  1369. cuda/cccl/headers/include/thrust/iterator/detail/any_system_tag.h +43 -0
  1370. cuda/cccl/headers/include/thrust/iterator/detail/device_system_tag.h +38 -0
  1371. cuda/cccl/headers/include/thrust/iterator/detail/host_system_tag.h +38 -0
  1372. cuda/cccl/headers/include/thrust/iterator/detail/iterator_adaptor_base.h +81 -0
  1373. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_system.h +60 -0
  1374. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_traversal.h +65 -0
  1375. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h +57 -0
  1376. cuda/cccl/headers/include/thrust/iterator/detail/iterator_facade_category.h +182 -0
  1377. cuda/cccl/headers/include/thrust/iterator/detail/minimum_system.h +58 -0
  1378. cuda/cccl/headers/include/thrust/iterator/detail/normal_iterator.h +69 -0
  1379. cuda/cccl/headers/include/thrust/iterator/detail/retag.h +104 -0
  1380. cuda/cccl/headers/include/thrust/iterator/detail/tagged_iterator.h +81 -0
  1381. cuda/cccl/headers/include/thrust/iterator/detail/tuple_of_iterator_references.h +174 -0
  1382. cuda/cccl/headers/include/thrust/iterator/discard_iterator.h +163 -0
  1383. cuda/cccl/headers/include/thrust/iterator/iterator_adaptor.h +251 -0
  1384. cuda/cccl/headers/include/thrust/iterator/iterator_categories.h +211 -0
  1385. cuda/cccl/headers/include/thrust/iterator/iterator_facade.h +659 -0
  1386. cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +334 -0
  1387. cuda/cccl/headers/include/thrust/iterator/iterator_traversal_tags.h +64 -0
  1388. cuda/cccl/headers/include/thrust/iterator/offset_iterator.h +194 -0
  1389. cuda/cccl/headers/include/thrust/iterator/permutation_iterator.h +204 -0
  1390. cuda/cccl/headers/include/thrust/iterator/retag.h +72 -0
  1391. cuda/cccl/headers/include/thrust/iterator/reverse_iterator.h +51 -0
  1392. cuda/cccl/headers/include/thrust/iterator/shuffle_iterator.h +185 -0
  1393. cuda/cccl/headers/include/thrust/iterator/strided_iterator.h +152 -0
  1394. cuda/cccl/headers/include/thrust/iterator/tabulate_output_iterator.h +152 -0
  1395. cuda/cccl/headers/include/thrust/iterator/transform_input_output_iterator.h +226 -0
  1396. cuda/cccl/headers/include/thrust/iterator/transform_iterator.h +351 -0
  1397. cuda/cccl/headers/include/thrust/iterator/transform_output_iterator.h +190 -0
  1398. cuda/cccl/headers/include/thrust/iterator/zip_iterator.h +359 -0
  1399. cuda/cccl/headers/include/thrust/logical.h +290 -0
  1400. cuda/cccl/headers/include/thrust/memory.h +299 -0
  1401. cuda/cccl/headers/include/thrust/merge.h +725 -0
  1402. cuda/cccl/headers/include/thrust/mismatch.h +261 -0
  1403. cuda/cccl/headers/include/thrust/mr/allocator.h +229 -0
  1404. cuda/cccl/headers/include/thrust/mr/device_memory_resource.h +41 -0
  1405. cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +528 -0
  1406. cuda/cccl/headers/include/thrust/mr/disjoint_sync_pool.h +118 -0
  1407. cuda/cccl/headers/include/thrust/mr/disjoint_tls_pool.h +67 -0
  1408. cuda/cccl/headers/include/thrust/mr/fancy_pointer_resource.h +67 -0
  1409. cuda/cccl/headers/include/thrust/mr/host_memory_resource.h +38 -0
  1410. cuda/cccl/headers/include/thrust/mr/memory_resource.h +217 -0
  1411. cuda/cccl/headers/include/thrust/mr/new.h +100 -0
  1412. cuda/cccl/headers/include/thrust/mr/polymorphic_adaptor.h +63 -0
  1413. cuda/cccl/headers/include/thrust/mr/pool.h +528 -0
  1414. cuda/cccl/headers/include/thrust/mr/pool_options.h +174 -0
  1415. cuda/cccl/headers/include/thrust/mr/sync_pool.h +114 -0
  1416. cuda/cccl/headers/include/thrust/mr/tls_pool.h +64 -0
  1417. cuda/cccl/headers/include/thrust/mr/universal_memory_resource.h +29 -0
  1418. cuda/cccl/headers/include/thrust/mr/validator.h +56 -0
  1419. cuda/cccl/headers/include/thrust/pair.h +99 -0
  1420. cuda/cccl/headers/include/thrust/partition.h +1391 -0
  1421. cuda/cccl/headers/include/thrust/per_device_resource.h +98 -0
  1422. cuda/cccl/headers/include/thrust/random/detail/discard_block_engine.inl +184 -0
  1423. cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine.inl +155 -0
  1424. cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine_discard.h +104 -0
  1425. cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine.inl +151 -0
  1426. cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h +53 -0
  1427. cuda/cccl/headers/include/thrust/random/detail/mod.h +101 -0
  1428. cuda/cccl/headers/include/thrust/random/detail/normal_distribution.inl +187 -0
  1429. cuda/cccl/headers/include/thrust/random/detail/normal_distribution_base.h +160 -0
  1430. cuda/cccl/headers/include/thrust/random/detail/random_core_access.h +63 -0
  1431. cuda/cccl/headers/include/thrust/random/detail/subtract_with_carry_engine.inl +201 -0
  1432. cuda/cccl/headers/include/thrust/random/detail/uniform_int_distribution.inl +198 -0
  1433. cuda/cccl/headers/include/thrust/random/detail/uniform_real_distribution.inl +200 -0
  1434. cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine.inl +183 -0
  1435. cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine_max.h +187 -0
  1436. cuda/cccl/headers/include/thrust/random/discard_block_engine.h +240 -0
  1437. cuda/cccl/headers/include/thrust/random/linear_congruential_engine.h +289 -0
  1438. cuda/cccl/headers/include/thrust/random/linear_feedback_shift_engine.h +217 -0
  1439. cuda/cccl/headers/include/thrust/random/normal_distribution.h +257 -0
  1440. cuda/cccl/headers/include/thrust/random/subtract_with_carry_engine.h +247 -0
  1441. cuda/cccl/headers/include/thrust/random/uniform_int_distribution.h +261 -0
  1442. cuda/cccl/headers/include/thrust/random/uniform_real_distribution.h +258 -0
  1443. cuda/cccl/headers/include/thrust/random/xor_combine_engine.h +255 -0
  1444. cuda/cccl/headers/include/thrust/random.h +120 -0
  1445. cuda/cccl/headers/include/thrust/reduce.h +1113 -0
  1446. cuda/cccl/headers/include/thrust/remove.h +768 -0
  1447. cuda/cccl/headers/include/thrust/replace.h +826 -0
  1448. cuda/cccl/headers/include/thrust/reverse.h +215 -0
  1449. cuda/cccl/headers/include/thrust/scan.h +1671 -0
  1450. cuda/cccl/headers/include/thrust/scatter.h +446 -0
  1451. cuda/cccl/headers/include/thrust/sequence.h +277 -0
  1452. cuda/cccl/headers/include/thrust/set_operations.h +3026 -0
  1453. cuda/cccl/headers/include/thrust/shuffle.h +182 -0
  1454. cuda/cccl/headers/include/thrust/sort.h +1320 -0
  1455. cuda/cccl/headers/include/thrust/swap.h +147 -0
  1456. cuda/cccl/headers/include/thrust/system/cpp/detail/adjacent_difference.h +30 -0
  1457. cuda/cccl/headers/include/thrust/system/cpp/detail/assign_value.h +30 -0
  1458. cuda/cccl/headers/include/thrust/system/cpp/detail/binary_search.h +32 -0
  1459. cuda/cccl/headers/include/thrust/system/cpp/detail/copy.h +30 -0
  1460. cuda/cccl/headers/include/thrust/system/cpp/detail/copy_if.h +30 -0
  1461. cuda/cccl/headers/include/thrust/system/cpp/detail/count.h +29 -0
  1462. cuda/cccl/headers/include/thrust/system/cpp/detail/equal.h +29 -0
  1463. cuda/cccl/headers/include/thrust/system/cpp/detail/execution_policy.h +109 -0
  1464. cuda/cccl/headers/include/thrust/system/cpp/detail/extrema.h +30 -0
  1465. cuda/cccl/headers/include/thrust/system/cpp/detail/fill.h +29 -0
  1466. cuda/cccl/headers/include/thrust/system/cpp/detail/find.h +30 -0
  1467. cuda/cccl/headers/include/thrust/system/cpp/detail/for_each.h +30 -0
  1468. cuda/cccl/headers/include/thrust/system/cpp/detail/gather.h +29 -0
  1469. cuda/cccl/headers/include/thrust/system/cpp/detail/generate.h +29 -0
  1470. cuda/cccl/headers/include/thrust/system/cpp/detail/get_value.h +30 -0
  1471. cuda/cccl/headers/include/thrust/system/cpp/detail/inner_product.h +29 -0
  1472. cuda/cccl/headers/include/thrust/system/cpp/detail/iter_swap.h +30 -0
  1473. cuda/cccl/headers/include/thrust/system/cpp/detail/logical.h +29 -0
  1474. cuda/cccl/headers/include/thrust/system/cpp/detail/malloc_and_free.h +30 -0
  1475. cuda/cccl/headers/include/thrust/system/cpp/detail/memory.inl +60 -0
  1476. cuda/cccl/headers/include/thrust/system/cpp/detail/merge.h +30 -0
  1477. cuda/cccl/headers/include/thrust/system/cpp/detail/mismatch.h +29 -0
  1478. cuda/cccl/headers/include/thrust/system/cpp/detail/partition.h +30 -0
  1479. cuda/cccl/headers/include/thrust/system/cpp/detail/per_device_resource.h +29 -0
  1480. cuda/cccl/headers/include/thrust/system/cpp/detail/reduce.h +30 -0
  1481. cuda/cccl/headers/include/thrust/system/cpp/detail/reduce_by_key.h +30 -0
  1482. cuda/cccl/headers/include/thrust/system/cpp/detail/remove.h +30 -0
  1483. cuda/cccl/headers/include/thrust/system/cpp/detail/replace.h +29 -0
  1484. cuda/cccl/headers/include/thrust/system/cpp/detail/reverse.h +29 -0
  1485. cuda/cccl/headers/include/thrust/system/cpp/detail/scan.h +30 -0
  1486. cuda/cccl/headers/include/thrust/system/cpp/detail/scan_by_key.h +30 -0
  1487. cuda/cccl/headers/include/thrust/system/cpp/detail/scatter.h +29 -0
  1488. cuda/cccl/headers/include/thrust/system/cpp/detail/sequence.h +29 -0
  1489. cuda/cccl/headers/include/thrust/system/cpp/detail/set_operations.h +30 -0
  1490. cuda/cccl/headers/include/thrust/system/cpp/detail/sort.h +30 -0
  1491. cuda/cccl/headers/include/thrust/system/cpp/detail/swap_ranges.h +29 -0
  1492. cuda/cccl/headers/include/thrust/system/cpp/detail/tabulate.h +29 -0
  1493. cuda/cccl/headers/include/thrust/system/cpp/detail/temporary_buffer.h +29 -0
  1494. cuda/cccl/headers/include/thrust/system/cpp/detail/transform.h +29 -0
  1495. cuda/cccl/headers/include/thrust/system/cpp/detail/transform_reduce.h +29 -0
  1496. cuda/cccl/headers/include/thrust/system/cpp/detail/transform_scan.h +29 -0
  1497. cuda/cccl/headers/include/thrust/system/cpp/detail/uninitialized_copy.h +29 -0
  1498. cuda/cccl/headers/include/thrust/system/cpp/detail/uninitialized_fill.h +29 -0
  1499. cuda/cccl/headers/include/thrust/system/cpp/detail/unique.h +30 -0
  1500. cuda/cccl/headers/include/thrust/system/cpp/detail/unique_by_key.h +30 -0
  1501. cuda/cccl/headers/include/thrust/system/cpp/execution_policy.h +63 -0
  1502. cuda/cccl/headers/include/thrust/system/cpp/memory.h +106 -0
  1503. cuda/cccl/headers/include/thrust/system/cpp/memory_resource.h +72 -0
  1504. cuda/cccl/headers/include/thrust/system/cpp/pointer.h +120 -0
  1505. cuda/cccl/headers/include/thrust/system/cpp/vector.h +96 -0
  1506. cuda/cccl/headers/include/thrust/system/cuda/config.h +126 -0
  1507. cuda/cccl/headers/include/thrust/system/cuda/detail/adjacent_difference.h +219 -0
  1508. cuda/cccl/headers/include/thrust/system/cuda/detail/assign_value.h +124 -0
  1509. cuda/cccl/headers/include/thrust/system/cuda/detail/binary_search.h +29 -0
  1510. cuda/cccl/headers/include/thrust/system/cuda/detail/cdp_dispatch.h +72 -0
  1511. cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +273 -0
  1512. cuda/cccl/headers/include/thrust/system/cuda/detail/copy_if.h +255 -0
  1513. cuda/cccl/headers/include/thrust/system/cuda/detail/core/agent_launcher.h +289 -0
  1514. cuda/cccl/headers/include/thrust/system/cuda/detail/core/triple_chevron_launch.h +191 -0
  1515. cuda/cccl/headers/include/thrust/system/cuda/detail/core/util.h +593 -0
  1516. cuda/cccl/headers/include/thrust/system/cuda/detail/count.h +75 -0
  1517. cuda/cccl/headers/include/thrust/system/cuda/detail/cross_system.h +243 -0
  1518. cuda/cccl/headers/include/thrust/system/cuda/detail/dispatch.h +233 -0
  1519. cuda/cccl/headers/include/thrust/system/cuda/detail/equal.h +64 -0
  1520. cuda/cccl/headers/include/thrust/system/cuda/detail/error.inl +96 -0
  1521. cuda/cccl/headers/include/thrust/system/cuda/detail/execution_policy.h +264 -0
  1522. cuda/cccl/headers/include/thrust/system/cuda/detail/extrema.h +476 -0
  1523. cuda/cccl/headers/include/thrust/system/cuda/detail/fill.h +100 -0
  1524. cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +170 -0
  1525. cuda/cccl/headers/include/thrust/system/cuda/detail/for_each.h +83 -0
  1526. cuda/cccl/headers/include/thrust/system/cuda/detail/gather.h +91 -0
  1527. cuda/cccl/headers/include/thrust/system/cuda/detail/generate.h +60 -0
  1528. cuda/cccl/headers/include/thrust/system/cuda/detail/get_value.h +65 -0
  1529. cuda/cccl/headers/include/thrust/system/cuda/detail/inner_product.h +75 -0
  1530. cuda/cccl/headers/include/thrust/system/cuda/detail/iter_swap.h +80 -0
  1531. cuda/cccl/headers/include/thrust/system/cuda/detail/logical.h +29 -0
  1532. cuda/cccl/headers/include/thrust/system/cuda/detail/make_unsigned_special.h +61 -0
  1533. cuda/cccl/headers/include/thrust/system/cuda/detail/malloc_and_free.h +121 -0
  1534. cuda/cccl/headers/include/thrust/system/cuda/detail/memory.inl +57 -0
  1535. cuda/cccl/headers/include/thrust/system/cuda/detail/merge.h +228 -0
  1536. cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +223 -0
  1537. cuda/cccl/headers/include/thrust/system/cuda/detail/parallel_for.h +81 -0
  1538. cuda/cccl/headers/include/thrust/system/cuda/detail/partition.h +405 -0
  1539. cuda/cccl/headers/include/thrust/system/cuda/detail/per_device_resource.h +72 -0
  1540. cuda/cccl/headers/include/thrust/system/cuda/detail/reduce.h +785 -0
  1541. cuda/cccl/headers/include/thrust/system/cuda/detail/reduce_by_key.h +1001 -0
  1542. cuda/cccl/headers/include/thrust/system/cuda/detail/remove.h +107 -0
  1543. cuda/cccl/headers/include/thrust/system/cuda/detail/replace.h +122 -0
  1544. cuda/cccl/headers/include/thrust/system/cuda/detail/reverse.h +87 -0
  1545. cuda/cccl/headers/include/thrust/system/cuda/detail/scan.h +341 -0
  1546. cuda/cccl/headers/include/thrust/system/cuda/detail/scan_by_key.h +414 -0
  1547. cuda/cccl/headers/include/thrust/system/cuda/detail/scatter.h +91 -0
  1548. cuda/cccl/headers/include/thrust/system/cuda/detail/sequence.h +29 -0
  1549. cuda/cccl/headers/include/thrust/system/cuda/detail/set_operations.h +1734 -0
  1550. cuda/cccl/headers/include/thrust/system/cuda/detail/sort.h +469 -0
  1551. cuda/cccl/headers/include/thrust/system/cuda/detail/swap_ranges.h +98 -0
  1552. cuda/cccl/headers/include/thrust/system/cuda/detail/tabulate.h +61 -0
  1553. cuda/cccl/headers/include/thrust/system/cuda/detail/temporary_buffer.h +132 -0
  1554. cuda/cccl/headers/include/thrust/system/cuda/detail/terminate.h +53 -0
  1555. cuda/cccl/headers/include/thrust/system/cuda/detail/transform.h +429 -0
  1556. cuda/cccl/headers/include/thrust/system/cuda/detail/transform_reduce.h +143 -0
  1557. cuda/cccl/headers/include/thrust/system/cuda/detail/transform_scan.h +119 -0
  1558. cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_copy.h +117 -0
  1559. cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_fill.h +105 -0
  1560. cuda/cccl/headers/include/thrust/system/cuda/detail/unique.h +289 -0
  1561. cuda/cccl/headers/include/thrust/system/cuda/detail/unique_by_key.h +310 -0
  1562. cuda/cccl/headers/include/thrust/system/cuda/detail/util.h +253 -0
  1563. cuda/cccl/headers/include/thrust/system/cuda/error.h +168 -0
  1564. cuda/cccl/headers/include/thrust/system/cuda/execution_policy.h +15 -0
  1565. cuda/cccl/headers/include/thrust/system/cuda/memory.h +122 -0
  1566. cuda/cccl/headers/include/thrust/system/cuda/memory_resource.h +122 -0
  1567. cuda/cccl/headers/include/thrust/system/cuda/pointer.h +160 -0
  1568. cuda/cccl/headers/include/thrust/system/cuda/vector.h +108 -0
  1569. cuda/cccl/headers/include/thrust/system/detail/adl/adjacent_difference.h +51 -0
  1570. cuda/cccl/headers/include/thrust/system/detail/adl/assign_value.h +51 -0
  1571. cuda/cccl/headers/include/thrust/system/detail/adl/binary_search.h +51 -0
  1572. cuda/cccl/headers/include/thrust/system/detail/adl/copy.h +51 -0
  1573. cuda/cccl/headers/include/thrust/system/detail/adl/copy_if.h +52 -0
  1574. cuda/cccl/headers/include/thrust/system/detail/adl/count.h +51 -0
  1575. cuda/cccl/headers/include/thrust/system/detail/adl/equal.h +51 -0
  1576. cuda/cccl/headers/include/thrust/system/detail/adl/extrema.h +51 -0
  1577. cuda/cccl/headers/include/thrust/system/detail/adl/fill.h +51 -0
  1578. cuda/cccl/headers/include/thrust/system/detail/adl/find.h +51 -0
  1579. cuda/cccl/headers/include/thrust/system/detail/adl/for_each.h +51 -0
  1580. cuda/cccl/headers/include/thrust/system/detail/adl/gather.h +51 -0
  1581. cuda/cccl/headers/include/thrust/system/detail/adl/generate.h +51 -0
  1582. cuda/cccl/headers/include/thrust/system/detail/adl/get_value.h +51 -0
  1583. cuda/cccl/headers/include/thrust/system/detail/adl/inner_product.h +51 -0
  1584. cuda/cccl/headers/include/thrust/system/detail/adl/iter_swap.h +51 -0
  1585. cuda/cccl/headers/include/thrust/system/detail/adl/logical.h +51 -0
  1586. cuda/cccl/headers/include/thrust/system/detail/adl/malloc_and_free.h +51 -0
  1587. cuda/cccl/headers/include/thrust/system/detail/adl/merge.h +51 -0
  1588. cuda/cccl/headers/include/thrust/system/detail/adl/mismatch.h +51 -0
  1589. cuda/cccl/headers/include/thrust/system/detail/adl/partition.h +51 -0
  1590. cuda/cccl/headers/include/thrust/system/detail/adl/per_device_resource.h +51 -0
  1591. cuda/cccl/headers/include/thrust/system/detail/adl/reduce.h +51 -0
  1592. cuda/cccl/headers/include/thrust/system/detail/adl/reduce_by_key.h +51 -0
  1593. cuda/cccl/headers/include/thrust/system/detail/adl/remove.h +51 -0
  1594. cuda/cccl/headers/include/thrust/system/detail/adl/replace.h +51 -0
  1595. cuda/cccl/headers/include/thrust/system/detail/adl/reverse.h +51 -0
  1596. cuda/cccl/headers/include/thrust/system/detail/adl/scan.h +51 -0
  1597. cuda/cccl/headers/include/thrust/system/detail/adl/scan_by_key.h +51 -0
  1598. cuda/cccl/headers/include/thrust/system/detail/adl/scatter.h +51 -0
  1599. cuda/cccl/headers/include/thrust/system/detail/adl/sequence.h +51 -0
  1600. cuda/cccl/headers/include/thrust/system/detail/adl/set_operations.h +51 -0
  1601. cuda/cccl/headers/include/thrust/system/detail/adl/sort.h +51 -0
  1602. cuda/cccl/headers/include/thrust/system/detail/adl/swap_ranges.h +51 -0
  1603. cuda/cccl/headers/include/thrust/system/detail/adl/tabulate.h +51 -0
  1604. cuda/cccl/headers/include/thrust/system/detail/adl/temporary_buffer.h +51 -0
  1605. cuda/cccl/headers/include/thrust/system/detail/adl/transform.h +51 -0
  1606. cuda/cccl/headers/include/thrust/system/detail/adl/transform_reduce.h +51 -0
  1607. cuda/cccl/headers/include/thrust/system/detail/adl/transform_scan.h +51 -0
  1608. cuda/cccl/headers/include/thrust/system/detail/adl/uninitialized_copy.h +51 -0
  1609. cuda/cccl/headers/include/thrust/system/detail/adl/uninitialized_fill.h +51 -0
  1610. cuda/cccl/headers/include/thrust/system/detail/adl/unique.h +51 -0
  1611. cuda/cccl/headers/include/thrust/system/detail/adl/unique_by_key.h +51 -0
  1612. cuda/cccl/headers/include/thrust/system/detail/bad_alloc.h +61 -0
  1613. cuda/cccl/headers/include/thrust/system/detail/errno.h +120 -0
  1614. cuda/cccl/headers/include/thrust/system/detail/error_category.inl +302 -0
  1615. cuda/cccl/headers/include/thrust/system/detail/error_code.inl +173 -0
  1616. cuda/cccl/headers/include/thrust/system/detail/error_condition.inl +121 -0
  1617. cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.h +53 -0
  1618. cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.inl +79 -0
  1619. cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.h +161 -0
  1620. cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.inl +384 -0
  1621. cuda/cccl/headers/include/thrust/system/detail/generic/copy.h +45 -0
  1622. cuda/cccl/headers/include/thrust/system/detail/generic/copy.inl +64 -0
  1623. cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.h +58 -0
  1624. cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.inl +146 -0
  1625. cuda/cccl/headers/include/thrust/system/detail/generic/count.h +48 -0
  1626. cuda/cccl/headers/include/thrust/system/detail/generic/count.inl +84 -0
  1627. cuda/cccl/headers/include/thrust/system/detail/generic/equal.h +49 -0
  1628. cuda/cccl/headers/include/thrust/system/detail/generic/equal.inl +60 -0
  1629. cuda/cccl/headers/include/thrust/system/detail/generic/extrema.h +66 -0
  1630. cuda/cccl/headers/include/thrust/system/detail/generic/extrema.inl +252 -0
  1631. cuda/cccl/headers/include/thrust/system/detail/generic/fill.h +54 -0
  1632. cuda/cccl/headers/include/thrust/system/detail/generic/find.h +49 -0
  1633. cuda/cccl/headers/include/thrust/system/detail/generic/find.inl +137 -0
  1634. cuda/cccl/headers/include/thrust/system/detail/generic/for_each.h +58 -0
  1635. cuda/cccl/headers/include/thrust/system/detail/generic/gather.h +73 -0
  1636. cuda/cccl/headers/include/thrust/system/detail/generic/gather.inl +96 -0
  1637. cuda/cccl/headers/include/thrust/system/detail/generic/generate.h +45 -0
  1638. cuda/cccl/headers/include/thrust/system/detail/generic/generate.inl +63 -0
  1639. cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.h +60 -0
  1640. cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.inl +72 -0
  1641. cuda/cccl/headers/include/thrust/system/detail/generic/logical.h +59 -0
  1642. cuda/cccl/headers/include/thrust/system/detail/generic/memory.h +64 -0
  1643. cuda/cccl/headers/include/thrust/system/detail/generic/memory.inl +86 -0
  1644. cuda/cccl/headers/include/thrust/system/detail/generic/merge.h +99 -0
  1645. cuda/cccl/headers/include/thrust/system/detail/generic/merge.inl +148 -0
  1646. cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.h +49 -0
  1647. cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.inl +68 -0
  1648. cuda/cccl/headers/include/thrust/system/detail/generic/partition.h +129 -0
  1649. cuda/cccl/headers/include/thrust/system/detail/generic/partition.inl +207 -0
  1650. cuda/cccl/headers/include/thrust/system/detail/generic/per_device_resource.h +43 -0
  1651. cuda/cccl/headers/include/thrust/system/detail/generic/reduce.h +71 -0
  1652. cuda/cccl/headers/include/thrust/system/detail/generic/reduce.inl +100 -0
  1653. cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.h +83 -0
  1654. cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.inl +186 -0
  1655. cuda/cccl/headers/include/thrust/system/detail/generic/remove.h +86 -0
  1656. cuda/cccl/headers/include/thrust/system/detail/generic/remove.inl +121 -0
  1657. cuda/cccl/headers/include/thrust/system/detail/generic/replace.h +95 -0
  1658. cuda/cccl/headers/include/thrust/system/detail/generic/replace.inl +175 -0
  1659. cuda/cccl/headers/include/thrust/system/detail/generic/reverse.h +48 -0
  1660. cuda/cccl/headers/include/thrust/system/detail/generic/reverse.inl +67 -0
  1661. cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.h +63 -0
  1662. cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.inl +126 -0
  1663. cuda/cccl/headers/include/thrust/system/detail/generic/scan.h +72 -0
  1664. cuda/cccl/headers/include/thrust/system/detail/generic/scan.inl +85 -0
  1665. cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.h +126 -0
  1666. cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.inl +232 -0
  1667. cuda/cccl/headers/include/thrust/system/detail/generic/scatter.h +73 -0
  1668. cuda/cccl/headers/include/thrust/system/detail/generic/scatter.inl +85 -0
  1669. cuda/cccl/headers/include/thrust/system/detail/generic/select_system.h +104 -0
  1670. cuda/cccl/headers/include/thrust/system/detail/generic/sequence.h +70 -0
  1671. cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.h +282 -0
  1672. cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.inl +476 -0
  1673. cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.h +54 -0
  1674. cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.inl +125 -0
  1675. cuda/cccl/headers/include/thrust/system/detail/generic/sort.h +113 -0
  1676. cuda/cccl/headers/include/thrust/system/detail/generic/sort.inl +175 -0
  1677. cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.h +44 -0
  1678. cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.inl +76 -0
  1679. cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.h +41 -0
  1680. cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.inl +54 -0
  1681. cuda/cccl/headers/include/thrust/system/detail/generic/tag.h +47 -0
  1682. cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.h +54 -0
  1683. cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.inl +82 -0
  1684. cuda/cccl/headers/include/thrust/system/detail/generic/transform.h +395 -0
  1685. cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.h +50 -0
  1686. cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.inl +56 -0
  1687. cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.h +80 -0
  1688. cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.inl +113 -0
  1689. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.h +45 -0
  1690. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.inl +166 -0
  1691. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.h +45 -0
  1692. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.inl +115 -0
  1693. cuda/cccl/headers/include/thrust/system/detail/generic/unique.h +71 -0
  1694. cuda/cccl/headers/include/thrust/system/detail/generic/unique.inl +113 -0
  1695. cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.h +81 -0
  1696. cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.inl +126 -0
  1697. cuda/cccl/headers/include/thrust/system/detail/internal/decompose.h +117 -0
  1698. cuda/cccl/headers/include/thrust/system/detail/sequential/adjacent_difference.h +70 -0
  1699. cuda/cccl/headers/include/thrust/system/detail/sequential/assign_value.h +42 -0
  1700. cuda/cccl/headers/include/thrust/system/detail/sequential/binary_search.h +136 -0
  1701. cuda/cccl/headers/include/thrust/system/detail/sequential/copy.h +49 -0
  1702. cuda/cccl/headers/include/thrust/system/detail/sequential/copy.inl +119 -0
  1703. cuda/cccl/headers/include/thrust/system/detail/sequential/copy_backward.h +49 -0
  1704. cuda/cccl/headers/include/thrust/system/detail/sequential/copy_if.h +71 -0
  1705. cuda/cccl/headers/include/thrust/system/detail/sequential/count.h +29 -0
  1706. cuda/cccl/headers/include/thrust/system/detail/sequential/equal.h +29 -0
  1707. cuda/cccl/headers/include/thrust/system/detail/sequential/execution_policy.h +52 -0
  1708. cuda/cccl/headers/include/thrust/system/detail/sequential/extrema.h +110 -0
  1709. cuda/cccl/headers/include/thrust/system/detail/sequential/fill.h +29 -0
  1710. cuda/cccl/headers/include/thrust/system/detail/sequential/find.h +62 -0
  1711. cuda/cccl/headers/include/thrust/system/detail/sequential/for_each.h +74 -0
  1712. cuda/cccl/headers/include/thrust/system/detail/sequential/gather.h +29 -0
  1713. cuda/cccl/headers/include/thrust/system/detail/sequential/general_copy.h +123 -0
  1714. cuda/cccl/headers/include/thrust/system/detail/sequential/generate.h +29 -0
  1715. cuda/cccl/headers/include/thrust/system/detail/sequential/get_value.h +43 -0
  1716. cuda/cccl/headers/include/thrust/system/detail/sequential/inner_product.h +29 -0
  1717. cuda/cccl/headers/include/thrust/system/detail/sequential/insertion_sort.h +141 -0
  1718. cuda/cccl/headers/include/thrust/system/detail/sequential/iter_swap.h +45 -0
  1719. cuda/cccl/headers/include/thrust/system/detail/sequential/logical.h +29 -0
  1720. cuda/cccl/headers/include/thrust/system/detail/sequential/malloc_and_free.h +50 -0
  1721. cuda/cccl/headers/include/thrust/system/detail/sequential/merge.h +75 -0
  1722. cuda/cccl/headers/include/thrust/system/detail/sequential/merge.inl +145 -0
  1723. cuda/cccl/headers/include/thrust/system/detail/sequential/mismatch.h +29 -0
  1724. cuda/cccl/headers/include/thrust/system/detail/sequential/partition.h +301 -0
  1725. cuda/cccl/headers/include/thrust/system/detail/sequential/per_device_resource.h +29 -0
  1726. cuda/cccl/headers/include/thrust/system/detail/sequential/reduce.h +64 -0
  1727. cuda/cccl/headers/include/thrust/system/detail/sequential/reduce_by_key.h +98 -0
  1728. cuda/cccl/headers/include/thrust/system/detail/sequential/remove.h +179 -0
  1729. cuda/cccl/headers/include/thrust/system/detail/sequential/replace.h +29 -0
  1730. cuda/cccl/headers/include/thrust/system/detail/sequential/reverse.h +29 -0
  1731. cuda/cccl/headers/include/thrust/system/detail/sequential/scan.h +154 -0
  1732. cuda/cccl/headers/include/thrust/system/detail/sequential/scan_by_key.h +145 -0
  1733. cuda/cccl/headers/include/thrust/system/detail/sequential/scatter.h +29 -0
  1734. cuda/cccl/headers/include/thrust/system/detail/sequential/sequence.h +29 -0
  1735. cuda/cccl/headers/include/thrust/system/detail/sequential/set_operations.h +206 -0
  1736. cuda/cccl/headers/include/thrust/system/detail/sequential/sort.h +59 -0
  1737. cuda/cccl/headers/include/thrust/system/detail/sequential/sort.inl +116 -0
  1738. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.h +55 -0
  1739. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.inl +356 -0
  1740. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.h +48 -0
  1741. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.inl +124 -0
  1742. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.h +48 -0
  1743. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.inl +586 -0
  1744. cuda/cccl/headers/include/thrust/system/detail/sequential/swap_ranges.h +29 -0
  1745. cuda/cccl/headers/include/thrust/system/detail/sequential/tabulate.h +29 -0
  1746. cuda/cccl/headers/include/thrust/system/detail/sequential/temporary_buffer.h +29 -0
  1747. cuda/cccl/headers/include/thrust/system/detail/sequential/transform.h +29 -0
  1748. cuda/cccl/headers/include/thrust/system/detail/sequential/transform_reduce.h +29 -0
  1749. cuda/cccl/headers/include/thrust/system/detail/sequential/transform_scan.h +29 -0
  1750. cuda/cccl/headers/include/thrust/system/detail/sequential/trivial_copy.h +58 -0
  1751. cuda/cccl/headers/include/thrust/system/detail/sequential/uninitialized_copy.h +29 -0
  1752. cuda/cccl/headers/include/thrust/system/detail/sequential/uninitialized_fill.h +29 -0
  1753. cuda/cccl/headers/include/thrust/system/detail/sequential/unique.h +115 -0
  1754. cuda/cccl/headers/include/thrust/system/detail/sequential/unique_by_key.h +106 -0
  1755. cuda/cccl/headers/include/thrust/system/detail/system_error.inl +108 -0
  1756. cuda/cccl/headers/include/thrust/system/error_code.h +512 -0
  1757. cuda/cccl/headers/include/thrust/system/omp/detail/adjacent_difference.h +54 -0
  1758. cuda/cccl/headers/include/thrust/system/omp/detail/assign_value.h +30 -0
  1759. cuda/cccl/headers/include/thrust/system/omp/detail/binary_search.h +77 -0
  1760. cuda/cccl/headers/include/thrust/system/omp/detail/copy.h +50 -0
  1761. cuda/cccl/headers/include/thrust/system/omp/detail/copy.inl +74 -0
  1762. cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.h +56 -0
  1763. cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.inl +59 -0
  1764. cuda/cccl/headers/include/thrust/system/omp/detail/count.h +30 -0
  1765. cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.h +50 -0
  1766. cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.inl +65 -0
  1767. cuda/cccl/headers/include/thrust/system/omp/detail/equal.h +30 -0
  1768. cuda/cccl/headers/include/thrust/system/omp/detail/execution_policy.h +127 -0
  1769. cuda/cccl/headers/include/thrust/system/omp/detail/extrema.h +66 -0
  1770. cuda/cccl/headers/include/thrust/system/omp/detail/fill.h +30 -0
  1771. cuda/cccl/headers/include/thrust/system/omp/detail/find.h +53 -0
  1772. cuda/cccl/headers/include/thrust/system/omp/detail/for_each.h +56 -0
  1773. cuda/cccl/headers/include/thrust/system/omp/detail/for_each.inl +87 -0
  1774. cuda/cccl/headers/include/thrust/system/omp/detail/gather.h +30 -0
  1775. cuda/cccl/headers/include/thrust/system/omp/detail/generate.h +30 -0
  1776. cuda/cccl/headers/include/thrust/system/omp/detail/get_value.h +30 -0
  1777. cuda/cccl/headers/include/thrust/system/omp/detail/inner_product.h +30 -0
  1778. cuda/cccl/headers/include/thrust/system/omp/detail/iter_swap.h +30 -0
  1779. cuda/cccl/headers/include/thrust/system/omp/detail/logical.h +30 -0
  1780. cuda/cccl/headers/include/thrust/system/omp/detail/malloc_and_free.h +30 -0
  1781. cuda/cccl/headers/include/thrust/system/omp/detail/memory.inl +93 -0
  1782. cuda/cccl/headers/include/thrust/system/omp/detail/merge.h +30 -0
  1783. cuda/cccl/headers/include/thrust/system/omp/detail/mismatch.h +30 -0
  1784. cuda/cccl/headers/include/thrust/system/omp/detail/partition.h +88 -0
  1785. cuda/cccl/headers/include/thrust/system/omp/detail/partition.inl +102 -0
  1786. cuda/cccl/headers/include/thrust/system/omp/detail/per_device_resource.h +29 -0
  1787. cuda/cccl/headers/include/thrust/system/omp/detail/pragma_omp.h +54 -0
  1788. cuda/cccl/headers/include/thrust/system/omp/detail/reduce.h +54 -0
  1789. cuda/cccl/headers/include/thrust/system/omp/detail/reduce.inl +78 -0
  1790. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.h +64 -0
  1791. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.inl +65 -0
  1792. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.h +59 -0
  1793. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.inl +103 -0
  1794. cuda/cccl/headers/include/thrust/system/omp/detail/remove.h +72 -0
  1795. cuda/cccl/headers/include/thrust/system/omp/detail/remove.inl +87 -0
  1796. cuda/cccl/headers/include/thrust/system/omp/detail/replace.h +30 -0
  1797. cuda/cccl/headers/include/thrust/system/omp/detail/reverse.h +30 -0
  1798. cuda/cccl/headers/include/thrust/system/omp/detail/scan.h +73 -0
  1799. cuda/cccl/headers/include/thrust/system/omp/detail/scan.inl +172 -0
  1800. cuda/cccl/headers/include/thrust/system/omp/detail/scan_by_key.h +36 -0
  1801. cuda/cccl/headers/include/thrust/system/omp/detail/scatter.h +30 -0
  1802. cuda/cccl/headers/include/thrust/system/omp/detail/sequence.h +30 -0
  1803. cuda/cccl/headers/include/thrust/system/omp/detail/set_operations.h +30 -0
  1804. cuda/cccl/headers/include/thrust/system/omp/detail/sort.h +60 -0
  1805. cuda/cccl/headers/include/thrust/system/omp/detail/sort.inl +265 -0
  1806. cuda/cccl/headers/include/thrust/system/omp/detail/swap_ranges.h +30 -0
  1807. cuda/cccl/headers/include/thrust/system/omp/detail/tabulate.h +30 -0
  1808. cuda/cccl/headers/include/thrust/system/omp/detail/temporary_buffer.h +29 -0
  1809. cuda/cccl/headers/include/thrust/system/omp/detail/transform.h +30 -0
  1810. cuda/cccl/headers/include/thrust/system/omp/detail/transform_reduce.h +30 -0
  1811. cuda/cccl/headers/include/thrust/system/omp/detail/transform_scan.h +30 -0
  1812. cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_copy.h +30 -0
  1813. cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_fill.h +30 -0
  1814. cuda/cccl/headers/include/thrust/system/omp/detail/unique.h +60 -0
  1815. cuda/cccl/headers/include/thrust/system/omp/detail/unique.inl +71 -0
  1816. cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.h +67 -0
  1817. cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.inl +75 -0
  1818. cuda/cccl/headers/include/thrust/system/omp/execution_policy.h +62 -0
  1819. cuda/cccl/headers/include/thrust/system/omp/memory.h +111 -0
  1820. cuda/cccl/headers/include/thrust/system/omp/memory_resource.h +75 -0
  1821. cuda/cccl/headers/include/thrust/system/omp/pointer.h +124 -0
  1822. cuda/cccl/headers/include/thrust/system/omp/vector.h +99 -0
  1823. cuda/cccl/headers/include/thrust/system/system_error.h +185 -0
  1824. cuda/cccl/headers/include/thrust/system/tbb/detail/adjacent_difference.h +54 -0
  1825. cuda/cccl/headers/include/thrust/system/tbb/detail/assign_value.h +30 -0
  1826. cuda/cccl/headers/include/thrust/system/tbb/detail/binary_search.h +30 -0
  1827. cuda/cccl/headers/include/thrust/system/tbb/detail/copy.h +50 -0
  1828. cuda/cccl/headers/include/thrust/system/tbb/detail/copy.inl +73 -0
  1829. cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.h +47 -0
  1830. cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.inl +136 -0
  1831. cuda/cccl/headers/include/thrust/system/tbb/detail/count.h +30 -0
  1832. cuda/cccl/headers/include/thrust/system/tbb/detail/equal.h +30 -0
  1833. cuda/cccl/headers/include/thrust/system/tbb/detail/execution_policy.h +109 -0
  1834. cuda/cccl/headers/include/thrust/system/tbb/detail/extrema.h +66 -0
  1835. cuda/cccl/headers/include/thrust/system/tbb/detail/fill.h +30 -0
  1836. cuda/cccl/headers/include/thrust/system/tbb/detail/find.h +49 -0
  1837. cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.h +51 -0
  1838. cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.inl +91 -0
  1839. cuda/cccl/headers/include/thrust/system/tbb/detail/gather.h +30 -0
  1840. cuda/cccl/headers/include/thrust/system/tbb/detail/generate.h +30 -0
  1841. cuda/cccl/headers/include/thrust/system/tbb/detail/get_value.h +30 -0
  1842. cuda/cccl/headers/include/thrust/system/tbb/detail/inner_product.h +30 -0
  1843. cuda/cccl/headers/include/thrust/system/tbb/detail/iter_swap.h +30 -0
  1844. cuda/cccl/headers/include/thrust/system/tbb/detail/logical.h +30 -0
  1845. cuda/cccl/headers/include/thrust/system/tbb/detail/malloc_and_free.h +30 -0
  1846. cuda/cccl/headers/include/thrust/system/tbb/detail/memory.inl +94 -0
  1847. cuda/cccl/headers/include/thrust/system/tbb/detail/merge.h +77 -0
  1848. cuda/cccl/headers/include/thrust/system/tbb/detail/merge.inl +327 -0
  1849. cuda/cccl/headers/include/thrust/system/tbb/detail/mismatch.h +30 -0
  1850. cuda/cccl/headers/include/thrust/system/tbb/detail/partition.h +84 -0
  1851. cuda/cccl/headers/include/thrust/system/tbb/detail/partition.inl +98 -0
  1852. cuda/cccl/headers/include/thrust/system/tbb/detail/per_device_resource.h +29 -0
  1853. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.h +54 -0
  1854. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.inl +137 -0
  1855. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.h +61 -0
  1856. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.inl +400 -0
  1857. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_intervals.h +140 -0
  1858. cuda/cccl/headers/include/thrust/system/tbb/detail/remove.h +76 -0
  1859. cuda/cccl/headers/include/thrust/system/tbb/detail/remove.inl +87 -0
  1860. cuda/cccl/headers/include/thrust/system/tbb/detail/replace.h +30 -0
  1861. cuda/cccl/headers/include/thrust/system/tbb/detail/reverse.h +30 -0
  1862. cuda/cccl/headers/include/thrust/system/tbb/detail/scan.h +59 -0
  1863. cuda/cccl/headers/include/thrust/system/tbb/detail/scan.inl +312 -0
  1864. cuda/cccl/headers/include/thrust/system/tbb/detail/scan_by_key.h +33 -0
  1865. cuda/cccl/headers/include/thrust/system/tbb/detail/scatter.h +30 -0
  1866. cuda/cccl/headers/include/thrust/system/tbb/detail/sequence.h +30 -0
  1867. cuda/cccl/headers/include/thrust/system/tbb/detail/set_operations.h +30 -0
  1868. cuda/cccl/headers/include/thrust/system/tbb/detail/sort.h +60 -0
  1869. cuda/cccl/headers/include/thrust/system/tbb/detail/sort.inl +295 -0
  1870. cuda/cccl/headers/include/thrust/system/tbb/detail/swap_ranges.h +30 -0
  1871. cuda/cccl/headers/include/thrust/system/tbb/detail/tabulate.h +30 -0
  1872. cuda/cccl/headers/include/thrust/system/tbb/detail/temporary_buffer.h +29 -0
  1873. cuda/cccl/headers/include/thrust/system/tbb/detail/transform.h +30 -0
  1874. cuda/cccl/headers/include/thrust/system/tbb/detail/transform_reduce.h +30 -0
  1875. cuda/cccl/headers/include/thrust/system/tbb/detail/transform_scan.h +30 -0
  1876. cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_copy.h +30 -0
  1877. cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_fill.h +30 -0
  1878. cuda/cccl/headers/include/thrust/system/tbb/detail/unique.h +60 -0
  1879. cuda/cccl/headers/include/thrust/system/tbb/detail/unique.inl +71 -0
  1880. cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.h +67 -0
  1881. cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.inl +75 -0
  1882. cuda/cccl/headers/include/thrust/system/tbb/execution_policy.h +62 -0
  1883. cuda/cccl/headers/include/thrust/system/tbb/memory.h +111 -0
  1884. cuda/cccl/headers/include/thrust/system/tbb/memory_resource.h +75 -0
  1885. cuda/cccl/headers/include/thrust/system/tbb/pointer.h +124 -0
  1886. cuda/cccl/headers/include/thrust/system/tbb/vector.h +99 -0
  1887. cuda/cccl/headers/include/thrust/system_error.h +57 -0
  1888. cuda/cccl/headers/include/thrust/tabulate.h +125 -0
  1889. cuda/cccl/headers/include/thrust/transform.h +1045 -0
  1890. cuda/cccl/headers/include/thrust/transform_reduce.h +190 -0
  1891. cuda/cccl/headers/include/thrust/transform_scan.h +442 -0
  1892. cuda/cccl/headers/include/thrust/tuple.h +139 -0
  1893. cuda/cccl/headers/include/thrust/type_traits/integer_sequence.h +261 -0
  1894. cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +154 -0
  1895. cuda/cccl/headers/include/thrust/type_traits/is_execution_policy.h +65 -0
  1896. cuda/cccl/headers/include/thrust/type_traits/is_operator_less_or_greater_function_object.h +184 -0
  1897. cuda/cccl/headers/include/thrust/type_traits/is_operator_plus_function_object.h +116 -0
  1898. cuda/cccl/headers/include/thrust/type_traits/is_trivially_relocatable.h +336 -0
  1899. cuda/cccl/headers/include/thrust/type_traits/logical_metafunctions.h +42 -0
  1900. cuda/cccl/headers/include/thrust/type_traits/unwrap_contiguous_iterator.h +63 -0
  1901. cuda/cccl/headers/include/thrust/uninitialized_copy.h +300 -0
  1902. cuda/cccl/headers/include/thrust/uninitialized_fill.h +268 -0
  1903. cuda/cccl/headers/include/thrust/unique.h +1088 -0
  1904. cuda/cccl/headers/include/thrust/universal_allocator.h +93 -0
  1905. cuda/cccl/headers/include/thrust/universal_ptr.h +34 -0
  1906. cuda/cccl/headers/include/thrust/universal_vector.h +71 -0
  1907. cuda/cccl/headers/include/thrust/version.h +93 -0
  1908. cuda/cccl/headers/include/thrust/zip_function.h +176 -0
  1909. cuda/cccl/headers/include_paths.py +51 -0
  1910. cuda/cccl/parallel/__init__.py +9 -0
  1911. cuda/cccl/parallel/experimental/__init__.py +24 -0
  1912. cuda/cccl/py.typed +0 -0
  1913. cuda/compute/__init__.py +79 -0
  1914. cuda/compute/_bindings.py +79 -0
  1915. cuda/compute/_bindings.pyi +475 -0
  1916. cuda/compute/_bindings_impl.pyx +2273 -0
  1917. cuda/compute/_caching.py +71 -0
  1918. cuda/compute/_cccl_interop.py +422 -0
  1919. cuda/compute/_utils/__init__.py +0 -0
  1920. cuda/compute/_utils/protocols.py +132 -0
  1921. cuda/compute/_utils/temp_storage_buffer.py +86 -0
  1922. cuda/compute/algorithms/__init__.py +54 -0
  1923. cuda/compute/algorithms/_histogram.py +243 -0
  1924. cuda/compute/algorithms/_merge_sort.py +225 -0
  1925. cuda/compute/algorithms/_radix_sort.py +312 -0
  1926. cuda/compute/algorithms/_reduce.py +182 -0
  1927. cuda/compute/algorithms/_scan.py +331 -0
  1928. cuda/compute/algorithms/_segmented_reduce.py +257 -0
  1929. cuda/compute/algorithms/_three_way_partition.py +261 -0
  1930. cuda/compute/algorithms/_transform.py +329 -0
  1931. cuda/compute/algorithms/_unique_by_key.py +252 -0
  1932. cuda/compute/cccl/.gitkeep +0 -0
  1933. cuda/compute/cu12/_bindings_impl.cp313-win_amd64.pyd +0 -0
  1934. cuda/compute/cu12/cccl/cccl.c.parallel.dll +0 -0
  1935. cuda/compute/cu12/cccl/cccl.c.parallel.lib +0 -0
  1936. cuda/compute/cu13/_bindings_impl.cp313-win_amd64.pyd +0 -0
  1937. cuda/compute/cu13/cccl/cccl.c.parallel.dll +0 -0
  1938. cuda/compute/cu13/cccl/cccl.c.parallel.lib +0 -0
  1939. cuda/compute/iterators/__init__.py +21 -0
  1940. cuda/compute/iterators/_factories.py +219 -0
  1941. cuda/compute/iterators/_iterators.py +817 -0
  1942. cuda/compute/iterators/_zip_iterator.py +199 -0
  1943. cuda/compute/numba_utils.py +53 -0
  1944. cuda/compute/op.py +3 -0
  1945. cuda/compute/struct.py +272 -0
  1946. cuda/compute/typing.py +37 -0
  1947. cuda/coop/__init__.py +8 -0
  1948. cuda/coop/_caching.py +48 -0
  1949. cuda/coop/_common.py +275 -0
  1950. cuda/coop/_nvrtc.py +92 -0
  1951. cuda/coop/_scan_op.py +181 -0
  1952. cuda/coop/_types.py +937 -0
  1953. cuda/coop/_typing.py +107 -0
  1954. cuda/coop/block/__init__.py +39 -0
  1955. cuda/coop/block/_block_exchange.py +251 -0
  1956. cuda/coop/block/_block_load_store.py +215 -0
  1957. cuda/coop/block/_block_merge_sort.py +125 -0
  1958. cuda/coop/block/_block_radix_sort.py +214 -0
  1959. cuda/coop/block/_block_reduce.py +294 -0
  1960. cuda/coop/block/_block_scan.py +983 -0
  1961. cuda/coop/warp/__init__.py +9 -0
  1962. cuda/coop/warp/_warp_merge_sort.py +92 -0
  1963. cuda/coop/warp/_warp_reduce.py +153 -0
  1964. cuda/coop/warp/_warp_scan.py +78 -0
  1965. cuda_cccl-0.3.3.dist-info/METADATA +41 -0
  1966. cuda_cccl-0.3.3.dist-info/RECORD +1968 -0
  1967. cuda_cccl-0.3.3.dist-info/WHEEL +5 -0
  1968. cuda_cccl-0.3.3.dist-info/licenses/LICENSE +1 -0
@@ -0,0 +1,3437 @@
1
+ /******************************************************************************
2
+ * Copyright (c) 2011, Duane Merrill. All rights reserved.
3
+ * Copyright (c) 2011-2025, NVIDIA CORPORATION. All rights reserved.
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ * * Redistributions of source code must retain the above copyright
8
+ * notice, this list of conditions and the following disclaimer.
9
+ * * Redistributions in binary form must reproduce the above copyright
10
+ * notice, this list of conditions and the following disclaimer in the
11
+ * documentation and/or other materials provided with the distribution.
12
+ * * Neither the name of the NVIDIA CORPORATION nor the
13
+ * names of its contributors may be used to endorse or promote products
14
+ * derived from this software without specific prior written permission.
15
+ *
16
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
+ *
27
+ ******************************************************************************/
28
+
29
+ //! @file
30
+ //! cub::DeviceRadixSort provides device-wide, parallel operations for computing a radix sort across a sequence of data
31
+ //! items residing within device-accessible memory.
32
+
33
+ #pragma once
34
+
35
+ #include <cub/config.cuh>
36
+
37
+ #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
38
+ # pragma GCC system_header
39
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
40
+ # pragma clang system_header
41
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
42
+ # pragma system_header
43
+ #endif // no system header
44
+
45
+ #include <cub/detail/choose_offset.cuh>
46
+ #include <cub/device/dispatch/dispatch_radix_sort.cuh>
47
+
48
+ #include <cuda/std/__type_traits/enable_if.h>
49
+ #include <cuda/std/__type_traits/integral_constant.h>
50
+ #include <cuda/std/__type_traits/is_convertible.h>
51
+
52
+ CUB_NAMESPACE_BEGIN
53
+
54
+ //! @rst
55
+ //! DeviceRadixSort provides device-wide, parallel operations for
56
+ //! computing a radix sort across a sequence of data items residing
57
+ //! within device-accessible memory.
58
+ //!
59
+ //! .. image:: ../../img/sorting_logo.png
60
+ //! :align: center
61
+ //!
62
+ //! Overview
63
+ //! --------------------------------------------------
64
+ //!
65
+ //! The `radix sorting method <http://en.wikipedia.org/wiki/Radix_sort>`_
66
+ //! arranges items into ascending (or descending) order. The algorithm relies
67
+ //! upon a positional representation for keys, i.e., each key is composed of an
68
+ //! ordered sequence of symbols (e.g., digits, characters, etc.) specified from
69
+ //! least-significant to most-significant. For a given input sequence of keys
70
+ //! and a set of rules specifying a total ordering of the symbolic alphabet, the
71
+ //! radix sorting method produces a lexicographic ordering of those keys.
72
+ //!
73
+ //! @rowmajor
74
+ //!
75
+ //! Supported Types
76
+ //! --------------------------------------------------
77
+ //!
78
+ //! DeviceRadixSort can sort all of the built-in C++ numeric primitive types
79
+ //! (``unsigned char``, ``int``, ``double``, etc.) as well as CUDA's ``__half``
80
+ //! and ``__nv_bfloat16`` 16-bit floating-point types. User-defined types are
81
+ //! supported as long as a decomposer object is provided.
82
+ //!
83
+ //! Floating-Point Special Cases
84
+ //! --------------------------------------------------
85
+ //!
86
+ //! - Positive and negative zeros are considered equivalent, and will be treated
87
+ //! as such in the output.
88
+ //! - No special handling is implemented for NaN values; these are sorted
89
+ //! according to their bit representations after any transformations.
90
+ //!
91
+ //! Transformations
92
+ //! --------------------------------------------------
93
+ //!
94
+ //! Although the direct radix sorting method can only be applied to unsigned
95
+ //! integral types, DeviceRadixSort is able to sort signed and floating-point
96
+ //! types via simple bit-wise transformations that ensure lexicographic key
97
+ //! ordering. Additional transformations occur for descending sorts. These
98
+ //! transformations must be considered when restricting the
99
+ //! ``[begin_bit, end_bit)`` range, as the bitwise transformations will occur
100
+ //! before the bit-range truncation.
101
+ //!
102
+ //! Any transformations applied to the keys prior to sorting are reversed
103
+ //! while writing to the final output buffer.
104
+ //!
105
+ //! Type Specific Bitwise Transformations
106
+ //! --------------------------------------------------
107
+ //!
108
+ //! To convert the input values into a radix-sortable bitwise representation,
109
+ //! the following transformations take place prior to sorting:
110
+ //!
111
+ //! - For unsigned integral values, the keys are used directly.
112
+ //! - For signed integral values, the sign bit is inverted.
113
+ //! - For positive floating point values, the sign bit is inverted.
114
+ //! - For negative floating point values, the full key is inverted.
115
+ //!
116
+ //! For floating point types, positive and negative zero are a special case and
117
+ //! will be considered equivalent during sorting.
118
+ //!
119
+ //! Descending Sort Bitwise Transformations
120
+ //! --------------------------------------------------
121
+ //!
122
+ //! If descending sort is used, the keys are inverted after performing any
123
+ //! type-specific transformations, and the resulting keys are sorted in ascending
124
+ //! order.
125
+ //!
126
+ //! Stability
127
+ //! --------------------------------------------------
128
+ //!
129
+ //! DeviceRadixSort is stable. For floating-point types, ``-0.0`` and ``+0.0`` are
130
+ //! considered equal and appear in the result in the same order as they appear in
131
+ //! the input.
132
+ //!
133
+ //! Usage Considerations
134
+ //! --------------------------------------------------
135
+ //!
136
+ //! @cdp_class{DeviceRadixSort}
137
+ //!
138
+ //! Performance
139
+ //! --------------------------------------------------
140
+ //!
141
+ //! @linear_performance{radix sort}
142
+ //!
143
+ //! @endrst
144
+ struct DeviceRadixSort
145
+ {
146
+ private:
147
+ template <SortOrder Order, typename KeyT, typename ValueT, typename NumItemsT, typename DecomposerT>
148
+ CUB_RUNTIME_FUNCTION static cudaError_t custom_radix_sort( // private helper, false_type-tagged overload taking an explicit bit range; declaration only
149
+ ::cuda::std::false_type, // unnamed tag parameter: distinguishes this overload from the true_type one
150
+ void* d_temp_storage,
151
+ size_t& temp_storage_bytes,
152
+ bool is_overwrite_okay,
153
+ DoubleBuffer<KeyT>& d_keys,
154
+ DoubleBuffer<ValueT>& d_values,
155
+ NumItemsT num_items,
156
+ DecomposerT decomposer,
157
+ int begin_bit,
158
+ int end_bit,
159
+ cudaStream_t stream); // NOTE(review): no body is visible in this part of the file; presumably left undefined on purpose as the rejection path -- confirm against callers
160
+
161
+ template <SortOrder Order, typename KeyT, typename ValueT, typename OffsetT, typename DecomposerT>
162
+ CUB_RUNTIME_FUNCTION static cudaError_t custom_radix_sort( // private helper, true_type-tagged overload: forwards an explicit begin_bit/end_bit request to DispatchRadixSort
163
+ ::cuda::std::true_type, // unnamed tag parameter: selects this (defined) overload
164
+ void* d_temp_storage,
165
+ size_t& temp_storage_bytes,
166
+ bool is_overwrite_okay,
167
+ DoubleBuffer<KeyT>& d_keys,
168
+ DoubleBuffer<ValueT>& d_values,
169
+ OffsetT num_items,
170
+ DecomposerT decomposer,
171
+ int begin_bit,
172
+ int end_bit,
173
+ cudaStream_t stream)
174
+ {
175
+ return DispatchRadixSort<Order, KeyT, ValueT, OffsetT, DecomposerT>::Dispatch( // the result of Dispatch is returned to the caller as-is
176
+ d_temp_storage,
177
+ temp_storage_bytes,
178
+ d_keys,
179
+ d_values,
180
+ static_cast<OffsetT>(num_items), // num_items is already declared as OffsetT in this overload, so the cast does not change its type
181
+ begin_bit,
182
+ end_bit,
183
+ is_overwrite_okay, // argument order differs from this helper's own parameter list: Dispatch receives the flag after the bit range
184
+ stream,
185
+ decomposer);
186
+ }
187
+
188
+ template <SortOrder Order, typename KeyT, typename ValueT, typename NumItemsT, typename DecomposerT>
189
+ CUB_RUNTIME_FUNCTION static cudaError_t custom_radix_sort( // private helper, false_type-tagged overload without a bit range; declaration only
190
+ ::cuda::std::false_type, // unnamed tag parameter: distinguishes this overload from the true_type one
191
+ void* d_temp_storage,
192
+ size_t& temp_storage_bytes,
193
+ bool is_overwrite_okay,
194
+ DoubleBuffer<KeyT>& d_keys,
195
+ DoubleBuffer<ValueT>& d_values,
196
+ NumItemsT num_items,
197
+ DecomposerT decomposer,
198
+ cudaStream_t stream); // NOTE(review): no body is visible in this part of the file; presumably left undefined on purpose as the rejection path -- confirm against callers
199
+
200
+ template <SortOrder Order, typename KeyT, typename ValueT, typename OffsetT, typename DecomposerT>
201
+ CUB_RUNTIME_FUNCTION static cudaError_t custom_radix_sort( // private helper, true_type-tagged overload without a bit range: supplies a default range and delegates to the bit-range overload
202
+ ::cuda::std::true_type, // unnamed tag parameter: selects this (defined) overload
203
+ void* d_temp_storage,
204
+ size_t& temp_storage_bytes,
205
+ bool is_overwrite_okay,
206
+ DoubleBuffer<KeyT>& d_keys,
207
+ DoubleBuffer<ValueT>& d_values,
208
+ OffsetT num_items,
209
+ DecomposerT decomposer,
210
+ cudaStream_t stream)
211
+ {
212
+ constexpr int begin_bit = 0; // the default range starts at bit 0
213
+ const int end_bit = detail::radix::traits_t<KeyT>::default_end_bit(decomposer); // upper bound is obtained from the key traits, which are handed the decomposer
214
+
215
+ return DeviceRadixSort::custom_radix_sort<Order>( // only Order is given explicitly; the remaining template arguments are deduced from the call
216
+ ::cuda::std::true_type{}, // pass the true_type tag so the overload taking begin_bit/end_bit that has a body is chosen
217
+ d_temp_storage,
218
+ temp_storage_bytes,
219
+ is_overwrite_okay,
220
+ d_keys,
221
+ d_values,
222
+ num_items,
223
+ decomposer,
224
+ begin_bit,
225
+ end_bit,
226
+ stream);
227
+ }
228
+
229
+ // Name reported for NVTX ranges; always returns the string literal "cub::DeviceRadixSort".
230
+ _CCCL_HOST_DEVICE static constexpr auto GetName() -> const char*
231
+ {
232
+ return "cub::DeviceRadixSort";
233
+ }
234
+
235
+ public:
236
+ //! @name Key-value pairs
237
+ //! @{
238
+
239
+ //! @rst
240
+ //! Sorts key-value pairs into ascending order using :math:`\approx 2N` auxiliary storage.
241
+ //!
242
+ //! - The contents of the input data are not altered by the sorting operation.
243
+ //! - Pointers to contiguous memory must be used; iterators are not currently
244
+ //! supported.
245
+ //! - In-place operations are not supported. There must be no overlap between
246
+ //! any of the provided ranges:
247
+ //!
248
+ //! - ``[d_keys_in, d_keys_in + num_items)``
249
+ //! - ``[d_keys_out, d_keys_out + num_items)``
250
+ //! - ``[d_values_in, d_values_in + num_items)``
251
+ //! - ``[d_values_out, d_values_out + num_items)``
252
+ //! - An optional bit subrange ``[begin_bit, end_bit)`` of differentiating key
253
+ //! bits can be specified. This can reduce overall sorting overhead and
254
+ //! yield a corresponding performance improvement.
255
+ //! - @devicestorageNP For sorting using only ``O(P)`` temporary storage, see
256
+ //! the sorting interface using DoubleBuffer wrappers below.
257
+ //! - @devicestorage
258
+ //!
259
+ //! Snippet
260
+ //! --------------------------------------------------
261
+ //!
262
+ //! The code snippet below illustrates the sorting of a device vector of ``int``
263
+ //! keys with associated vector of ``int`` values.
264
+ //! @endrst
265
+ //!
266
+ //! @code{.cpp}
267
+ //! #include <cub/cub.cuh>
268
+ //! // or equivalently <cub/device/device_radix_sort.cuh>
269
+ //!
270
+ //! // Declare, allocate, and initialize device-accessible pointers
271
+ //! // for sorting data
272
+ //! int num_items; // e.g., 7
273
+ //! int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
274
+ //! int *d_keys_out; // e.g., [ ... ]
275
+ //! int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6]
276
+ //! int *d_values_out; // e.g., [ ... ]
277
+ //! ...
278
+ //!
279
+ //! // Determine temporary device storage requirements
280
+ //! void *d_temp_storage = nullptr;
281
+ //! size_t temp_storage_bytes = 0;
282
+ //! cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes,
283
+ //! d_keys_in, d_keys_out, d_values_in, d_values_out, num_items);
284
+ //!
285
+ //! // Allocate temporary storage
286
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
287
+ //!
288
+ //! // Run sorting operation
289
+ //! cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes,
290
+ //! d_keys_in, d_keys_out, d_values_in, d_values_out, num_items);
291
+ //!
292
+ //! // d_keys_out <-- [0, 3, 5, 6, 7, 8, 9]
293
+ //! // d_values_out <-- [5, 4, 3, 1, 2, 0, 6]
294
+ //! @endcode
295
+ //!
296
+ //! @tparam KeyT
297
+ //! **[inferred]** Key type
298
+ //!
299
+ //! @tparam ValueT
300
+ //! **[inferred]** Value type
301
+ //!
302
+ //! @tparam NumItemsT
303
+ //! **[inferred]** Type of num_items
304
+ //!
305
+ //! @param[in] d_temp_storage
306
+ //! Device-accessible allocation of temporary storage. When ``nullptr``, the
307
+ //! required allocation size is written to ``temp_storage_bytes`` and no work
308
+ //! is done.
309
+ //!
310
+ //! @param[in,out] temp_storage_bytes
311
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
312
+ //!
313
+ //! @param[in] d_keys_in
314
+ //! Pointer to the input sequence of key data to sort
315
+ //!
316
+ //! @param[out] d_keys_out
317
+ //! Pointer to the sorted output sequence of key data
318
+ //!
319
+ //! @param[in] d_values_in
320
+ //! Pointer to the corresponding input sequence of associated value items
321
+ //!
322
+ //! @param[out] d_values_out
323
+ //! Pointer to the correspondingly-reordered output sequence of associated
324
+ //! value items
325
+ //!
326
+ //! @param[in] num_items
327
+ //! Number of items to sort
328
+ //!
329
+ //! @param[in] begin_bit
330
+ //! **[optional]** The least-significant bit index (inclusive) needed for
331
+ //! key comparison
332
+ //!
333
+ //! @param[in] end_bit
334
+ //! **[optional]** The most-significant bit index (exclusive) needed for key
335
+ //! comparison (e.g., ``sizeof(unsigned int) * 8``)
336
+ //!
337
+ //! @param[in] stream
338
+ //! **[optional]** CUDA stream to launch kernels within.
339
+ //! Default is stream<sub>0</sub>.
340
+ template <typename KeyT, typename ValueT, typename NumItemsT>
341
+ CUB_RUNTIME_FUNCTION static cudaError_t SortPairs(
342
+ void* d_temp_storage,
343
+ size_t& temp_storage_bytes,
344
+ const KeyT* d_keys_in,
345
+ KeyT* d_keys_out,
346
+ const ValueT* d_values_in,
347
+ ValueT* d_values_out,
348
+ NumItemsT num_items,
349
+ int begin_bit = 0,
350
+ int end_bit = sizeof(KeyT) * 8,
351
+ cudaStream_t stream = 0)
352
+ {
353
+   _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());
354
+   // TODO: overloads that take no decomposer should also static_assert that
355
+   // the key type is fundamental.
356
+
357
+   // Global offsets use the unsigned integer type selected for NumItemsT.
358
+   using offset_type = detail::choose_offset_t<NumItemsT>;
359
+   using dispatch_type = DispatchRadixSort<SortOrder::Ascending, KeyT, ValueT, offset_type>;
360
+
361
+   // const is cast away only so the inputs fit into DoubleBuffers; they are *not*
362
+   // written to. With the overwrite flag cleared, ``DispatchRadixSort::Dispatch``
363
+   // allocates temporary storage and creates a new double-buffer internally.
364
+   constexpr bool may_overwrite_inputs = false;
365
+   DoubleBuffer<KeyT> keys_buffer(const_cast<KeyT*>(d_keys_in), d_keys_out);
366
+   DoubleBuffer<ValueT> values_buffer(const_cast<ValueT*>(d_values_in), d_values_out);
367
+
368
+   return dispatch_type::Dispatch(
369
+     d_temp_storage,
370
+     temp_storage_bytes,
371
+     keys_buffer,
372
+     values_buffer,
373
+     static_cast<offset_type>(num_items),
374
+     begin_bit,
375
+     end_bit,
376
+     may_overwrite_inputs,
377
+     stream);
378
+ }
379
+
380
+ //! @rst
381
+ //! Sorts key-value pairs into ascending order using :math:`\approx 2N` auxiliary storage.
382
+ //!
383
+ //! * The contents of the input data are not altered by the sorting operation.
384
+ //! * Pointers to contiguous memory must be used; iterators are not currently
385
+ //! supported.
386
+ //! * In-place operations are not supported. There must be no overlap between
387
+ //! any of the provided ranges:
388
+ //!
389
+ //! * ``[d_keys_in, d_keys_in + num_items)``
390
+ //! * ``[d_keys_out, d_keys_out + num_items)``
391
+ //! * ``[d_values_in, d_values_in + num_items)``
392
+ //! * ``[d_values_out, d_values_out + num_items)``
393
+ //!
394
+ //! * A bit subrange ``[begin_bit, end_bit)`` is provided to specify
395
+ //! differentiating key bits. This can reduce overall sorting overhead and
396
+ //! yield a corresponding performance improvement.
397
+ //! * @devicestorageNP For sorting using only :math:`O(P)` temporary storage, see
398
+ //! the sorting interface using DoubleBuffer wrappers below.
399
+ //! * @devicestorage
400
+ //!
401
+ //! Snippet
402
+ //! --------------------------------------------------
403
+ //!
404
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
405
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
406
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
407
+ //! tuple of references to relevant members of the key.
408
+ //!
409
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
410
+ //! :language: c++
411
+ //! :dedent:
412
+ //! :start-after: example-begin custom-type
413
+ //! :end-before: example-end custom-type
414
+ //!
415
+ //! The following snippet shows how to sort an array of ``custom_t`` objects
416
+ //! using ``cub::DeviceRadixSort::SortPairs``:
417
+ //!
418
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
419
+ //! :language: c++
420
+ //! :dedent:
421
+ //! :start-after: example-begin pairs-bits
422
+ //! :end-before: example-end pairs-bits
423
+ //!
424
+ //! @endrst
425
+ //!
426
+ //! @tparam KeyT
427
+ //! **[inferred]** Key type
428
+ //!
429
+ //! @tparam ValueT
430
+ //! **[inferred]** Value type
431
+ //!
432
+ //! @tparam NumItemsT
433
+ //! **[inferred]** Type of num_items
434
+ //!
435
+ //! @tparam DecomposerT
436
+ //! **[inferred]** Type of a callable object responsible for decomposing a
437
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
438
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
439
+ //! The leftmost element of the tuple is considered the most significant.
440
+ //! The call operator must not modify members of the key.
441
+ //!
442
+ //! @param[in] d_temp_storage
443
+ //! Device-accessible allocation of temporary storage. When ``nullptr``, the
444
+ //! required allocation size is written to ``temp_storage_bytes`` and no work
445
+ //! is done.
446
+ //!
447
+ //! @param[in,out] temp_storage_bytes
448
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
449
+ //!
450
+ //! @param[in] d_keys_in
451
+ //! Pointer to the input sequence of key data to sort
452
+ //!
453
+ //! @param[out] d_keys_out
454
+ //! Pointer to the sorted output sequence of key data
455
+ //!
456
+ //! @param[in] d_values_in
457
+ //! Pointer to the corresponding input sequence of associated value items
458
+ //!
459
+ //! @param[out] d_values_out
460
+ //! Pointer to the correspondingly-reordered output sequence of associated
461
+ //! value items
462
+ //!
463
+ //! @param[in] num_items
464
+ //! Number of items to sort
465
+ //!
466
+ //! @param[in] decomposer
467
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
468
+ //! references to its constituent arithmetic types. The leftmost element of
469
+ //! the tuple is considered the most significant. The call operator must not
470
+ //! modify members of the key.
471
+ //!
472
+ //! @param[in] begin_bit
473
+ //! The least-significant bit index (inclusive) needed for
474
+ //! key comparison
475
+ //!
476
+ //! @param[in] end_bit
477
+ //! The most-significant bit index (exclusive) needed for key
478
+ //! comparison (e.g., ``(sizeof(float) + sizeof(long long int)) * 8``)
479
+ //!
480
+ //! @param[in] stream
481
+ //! **[optional]** CUDA stream to launch kernels within.
482
+ //! Default is stream<sub>0</sub>.
483
+ template <typename KeyT, typename ValueT, typename NumItemsT, typename DecomposerT>
484
+ CUB_RUNTIME_FUNCTION static //
485
+ ::cuda::std::enable_if_t< //
486
+ !::cuda::std::is_convertible_v<DecomposerT, int>, //
487
+ cudaError_t>
488
+ SortPairs(void* d_temp_storage,
489
+ size_t& temp_storage_bytes,
490
+ const KeyT* d_keys_in,
491
+ KeyT* d_keys_out,
492
+ const ValueT* d_values_in,
493
+ ValueT* d_values_out,
494
+ NumItemsT num_items,
495
+ DecomposerT decomposer,
496
+ int begin_bit,
497
+ int end_bit,
498
+ cudaStream_t stream = 0)
499
+ {
500
+   _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());
501
+   // Reject unsuitable decomposers at compile time.
502
+   using decomposer_validity = detail::radix::decomposer_check_t<KeyT, DecomposerT>;
503
+   static_assert(decomposer_validity::value,
504
+                 "DecomposerT must be a callable object returning a tuple of references to "
505
+                 "arithmetic types");
506
+
507
+   // Global offsets use the unsigned integer type selected for NumItemsT.
508
+   using offset_type = detail::choose_offset_t<NumItemsT>;
509
+
510
+   // const is cast away only so the inputs fit into DoubleBuffers; they are *not*
511
+   // written to. With the overwrite flag cleared, the dispatch layer is expected to
512
+   // allocate temporary storage and create its own double-buffer internally.
513
+   constexpr bool may_overwrite_inputs = false;
514
+   DoubleBuffer<KeyT> keys_buffer(const_cast<KeyT*>(d_keys_in), d_keys_out);
515
+   DoubleBuffer<ValueT> values_buffer(const_cast<ValueT*>(d_values_in), d_values_out);
516
+
517
+   return DeviceRadixSort::custom_radix_sort<SortOrder::Ascending>(
518
+     decomposer_validity{},
519
+     d_temp_storage,
520
+     temp_storage_bytes,
521
+     may_overwrite_inputs,
522
+     keys_buffer,
523
+     values_buffer,
524
+     static_cast<offset_type>(num_items),
525
+     decomposer,
526
+     begin_bit,
527
+     end_bit,
528
+     stream);
529
+ }
530
+
531
+ //! @rst
532
+ //! Sorts key-value pairs into ascending order using :math:`\approx 2N` auxiliary storage.
533
+ //!
534
+ //! * The contents of the input data are not altered by the sorting operation.
535
+ //! * Pointers to contiguous memory must be used; iterators are not currently
536
+ //! supported.
537
+ //! * In-place operations are not supported. There must be no overlap between
538
+ //! any of the provided ranges:
539
+ //!
540
+ //! * ``[d_keys_in, d_keys_in + num_items)``
541
+ //! * ``[d_keys_out, d_keys_out + num_items)``
542
+ //! * ``[d_values_in, d_values_in + num_items)``
543
+ //! * ``[d_values_out, d_values_out + num_items)``
544
+ //!
545
+ //! * @devicestorageNP For sorting using only :math:`O(P)` temporary storage, see
546
+ //! the sorting interface using DoubleBuffer wrappers below.
547
+ //! * @devicestorage
548
+ //!
549
+ //! Snippet
550
+ //! --------------------------------------------------
551
+ //!
552
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
553
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
554
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
555
+ //! tuple of references to relevant members of the key.
556
+ //!
557
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
558
+ //! :language: c++
559
+ //! :dedent:
560
+ //! :start-after: example-begin custom-type
561
+ //! :end-before: example-end custom-type
562
+ //!
563
+ //! The following snippet shows how to sort an array of ``custom_t`` objects
564
+ //! using ``cub::DeviceRadixSort::SortPairs``:
565
+ //!
566
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
567
+ //! :language: c++
568
+ //! :dedent:
569
+ //! :start-after: example-begin pairs
570
+ //! :end-before: example-end pairs
571
+ //!
572
+ //! @endrst
573
+ //!
574
+ //! @tparam KeyT
575
+ //! **[inferred]** Key type
576
+ //!
577
+ //! @tparam ValueT
578
+ //! **[inferred]** Value type
579
+ //!
580
+ //! @tparam NumItemsT
581
+ //! **[inferred]** Type of num_items
582
+ //!
583
+ //! @tparam DecomposerT
584
+ //! **[inferred]** Type of a callable object responsible for decomposing a
585
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
586
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
587
+ //! The leftmost element of the tuple is considered the most significant.
588
+ //! The call operator must not modify members of the key.
589
+ //!
590
+ //! @param[in] d_temp_storage
591
+ //! Device-accessible allocation of temporary storage. When ``nullptr``, the
592
+ //! required allocation size is written to ``temp_storage_bytes`` and no work
593
+ //! is done.
594
+ //!
595
+ //! @param[in,out] temp_storage_bytes
596
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
597
+ //!
598
+ //! @param[in] d_keys_in
599
+ //! Pointer to the input sequence of key data to sort
600
+ //!
601
+ //! @param[out] d_keys_out
602
+ //! Pointer to the sorted output sequence of key data
603
+ //!
604
+ //! @param[in] d_values_in
605
+ //! Pointer to the corresponding input sequence of associated value items
606
+ //!
607
+ //! @param[out] d_values_out
608
+ //! Pointer to the correspondingly-reordered output sequence of associated
609
+ //! value items
610
+ //!
611
+ //! @param[in] num_items
612
+ //! Number of items to sort
613
+ //!
614
+ //! @param[in] decomposer
615
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
616
+ //! references to its constituent arithmetic types. The leftmost element of
617
+ //! the tuple is considered the most significant. The call operator must not
618
+ //! modify members of the key.
619
+ //!
620
+ //! @param[in] stream
621
+ //! **[optional]** CUDA stream to launch kernels within.
622
+ //! Default is stream<sub>0</sub>.
623
+ template <typename KeyT, typename ValueT, typename NumItemsT, typename DecomposerT>
624
+ CUB_RUNTIME_FUNCTION static //
625
+ ::cuda::std::enable_if_t< //
626
+ !::cuda::std::is_convertible_v<DecomposerT, int>, //
627
+ cudaError_t>
628
+ SortPairs(void* d_temp_storage,
629
+ size_t& temp_storage_bytes,
630
+ const KeyT* d_keys_in,
631
+ KeyT* d_keys_out,
632
+ const ValueT* d_values_in,
633
+ ValueT* d_values_out,
634
+ NumItemsT num_items,
635
+ DecomposerT decomposer,
636
+ cudaStream_t stream = 0)
637
+ {
638
+   _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());
639
+   // Reject unsuitable decomposers at compile time.
640
+   using decomposer_validity = detail::radix::decomposer_check_t<KeyT, DecomposerT>;
641
+   static_assert(decomposer_validity::value,
642
+                 "DecomposerT must be a callable object returning a tuple of references to "
643
+                 "arithmetic types");
644
+
645
+   // Global offsets use the unsigned integer type selected for NumItemsT.
646
+   using offset_type = detail::choose_offset_t<NumItemsT>;
647
+
648
+   // const is cast away only so the inputs fit into DoubleBuffers; they are *not*
649
+   // written to. With the overwrite flag cleared, the dispatch layer is expected to
650
+   // allocate temporary storage and create its own double-buffer internally.
651
+   constexpr bool may_overwrite_inputs = false;
652
+   DoubleBuffer<KeyT> keys_buffer(const_cast<KeyT*>(d_keys_in), d_keys_out);
653
+   DoubleBuffer<ValueT> values_buffer(const_cast<ValueT*>(d_values_in), d_values_out);
654
+
655
+   return DeviceRadixSort::custom_radix_sort<SortOrder::Ascending>(
656
+     decomposer_validity{},
657
+     d_temp_storage,
658
+     temp_storage_bytes,
659
+     may_overwrite_inputs,
660
+     keys_buffer,
661
+     values_buffer,
662
+     static_cast<offset_type>(num_items),
663
+     decomposer,
664
+     stream);
665
+ }
666
+
667
+ //! @rst
668
+ //! Sorts key-value pairs into ascending order using :math:`\approx N` auxiliary storage.
669
+ //!
670
+ //! - The sorting operation is given a pair of key buffers and a corresponding
671
+ //! pair of associated value buffers. Each pair is managed by a DoubleBuffer
672
+ //! structure that indicates which of the two buffers is "current" (and thus
673
+ //! contains the input data to be sorted).
674
+ //! - The contents of both buffers within each pair may be altered by the
675
+ //! sorting operation.
676
+ //! - In-place operations are not supported. There must be no overlap between
677
+ //! any of the provided ranges:
678
+ //!
679
+ //! - ``[d_keys.Current(), d_keys.Current() + num_items)``
680
+ //! - ``[d_keys.Alternate(), d_keys.Alternate() + num_items)``
681
+ //! - ``[d_values.Current(), d_values.Current() + num_items)``
682
+ //! - ``[d_values.Alternate(), d_values.Alternate() + num_items)``
683
+ //!
684
+ //! - Upon completion, the sorting operation will update the "current"
685
+ //! indicator within each DoubleBuffer wrapper to reference which of the two
686
+ //! buffers now contains the sorted output sequence (a function of the
687
+ //! number of key bits specified and the targeted device architecture).
688
+ //! - An optional bit subrange ``[begin_bit, end_bit)`` of differentiating key
689
+ //! bits can be specified. This can reduce overall sorting overhead and
690
+ //! yield a corresponding performance improvement.
691
+ //! - @devicestorageP
692
+ //! - @devicestorage
693
+ //!
694
+ //! Snippet
695
+ //! --------------------------------------------------
696
+ //!
697
+ //! The code snippet below illustrates the sorting of a device vector of ``int``
698
+ //! keys with associated vector of ``int`` values.
699
+ //! @endrst
700
+ //!
701
+ //! @code
702
+ //! #include <cub/cub.cuh>
703
+ //! // or equivalently <cub/device/device_radix_sort.cuh>
704
+ //!
705
+ //! // Declare, allocate, and initialize device-accessible pointers for
706
+ //! // sorting data
707
+ //! int num_items; // e.g., 7
708
+ //! int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9]
709
+ //! int *d_key_alt_buf; // e.g., [ ... ]
710
+ //! int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6]
711
+ //! int *d_value_alt_buf; // e.g., [ ... ]
712
+ //! ...
713
+ //!
714
+ //! // Create a set of DoubleBuffers to wrap pairs of device pointers
715
+ //! cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
716
+ //! cub::DoubleBuffer<int> d_values(d_value_buf, d_value_alt_buf);
717
+ //!
718
+ //! // Determine temporary device storage requirements
719
+ //! void *d_temp_storage = nullptr;
720
+ //! size_t temp_storage_bytes = 0;
721
+ //! cub::DeviceRadixSort::SortPairs(
722
+ //! d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items);
723
+ //!
724
+ //! // Allocate temporary storage
725
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
726
+ //!
727
+ //! // Run sorting operation
728
+ //! cub::DeviceRadixSort::SortPairs(
729
+ //! d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items);
730
+ //!
731
+ //! // d_keys.Current() <-- [0, 3, 5, 6, 7, 8, 9]
732
+ //! // d_values.Current() <-- [5, 4, 3, 1, 2, 0, 6]
733
+ //!
734
+ //! @endcode
735
+ //!
736
+ //! @tparam KeyT
737
+ //! **[inferred]** Key type
738
+ //!
739
+ //! @tparam ValueT
740
+ //! **[inferred]** Value type
741
+ //!
742
+ //! @tparam NumItemsT
743
+ //! **[inferred]** Type of num_items
744
+ //!
745
+ //! @param[in] d_temp_storage
746
+ //! Device-accessible allocation of temporary storage. When ``nullptr``, the
747
+ //! required allocation size is written to ``temp_storage_bytes`` and no work is done.
748
+ //!
749
+ //! @param[in,out] temp_storage_bytes
750
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
751
+ //!
752
+ //! @param[in,out] d_keys
753
+ //! Reference to the double-buffer of keys whose "current" device-accessible
754
+ //! buffer contains the unsorted input keys and, upon return, is updated to
755
+ //! point to the sorted output keys
756
+ //!
757
+ //! @param[in,out] d_values
758
+ //! Double-buffer of values whose "current" device-accessible buffer
759
+ //! contains the unsorted input values and, upon return, is updated to point
760
+ //! to the sorted output values
761
+ //!
762
+ //! @param[in] num_items
763
+ //! Number of items to sort
764
+ //!
765
+ //! @param[in] begin_bit
766
+ //! **[optional]** The least-significant bit index (inclusive) needed for
767
+ //! key comparison
768
+ //!
769
+ //! @param[in] end_bit
770
+ //! **[optional]** The most-significant bit index (exclusive) needed for key
771
+ //! comparison (e.g., ``sizeof(unsigned int) * 8``)
772
+ //!
773
+ //! @param[in] stream
774
+ //! **[optional]** CUDA stream to launch kernels within.
775
+ //! Default is stream<sub>0</sub>.
776
+ template <typename KeyT, typename ValueT, typename NumItemsT>
777
+ CUB_RUNTIME_FUNCTION static cudaError_t SortPairs(
778
+ void* d_temp_storage,
779
+ size_t& temp_storage_bytes,
780
+ DoubleBuffer<KeyT>& d_keys,
781
+ DoubleBuffer<ValueT>& d_values,
782
+ NumItemsT num_items,
783
+ int begin_bit = 0,
784
+ int end_bit = sizeof(KeyT) * 8,
785
+ cudaStream_t stream = 0)
786
+ {
787
+   _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());
788
+   // Unsigned integer type for global offsets.
789
+   using OffsetT = detail::choose_offset_t<NumItemsT>;
790
+   // The caller-provided DoubleBuffers may be overwritten by the sort.
791
+   constexpr bool is_overwrite_okay = true;
792
+   // Convert num_items to OffsetT explicitly, as the other SortPairs overloads do.
793
+   return DispatchRadixSort<SortOrder::Ascending, KeyT, ValueT, OffsetT>::Dispatch(
794
+     d_temp_storage, temp_storage_bytes, d_keys, d_values,
795
+     static_cast<OffsetT>(num_items), begin_bit, end_bit, is_overwrite_okay, stream);
796
+ }
797
+
798
+ //! @rst
799
+ //! Sorts key-value pairs into ascending order using :math:`\approx N` auxiliary storage.
800
+ //!
801
+ //! * The sorting operation is given a pair of key buffers and a corresponding
802
+ //! pair of associated value buffers. Each pair is managed by a DoubleBuffer
803
+ //! structure that indicates which of the two buffers is "current" (and thus
804
+ //! contains the input data to be sorted).
805
+ //! * The contents of both buffers within each pair may be altered by the
806
+ //! sorting operation.
807
+ //! * In-place operations are not supported. There must be no overlap between
808
+ //! any of the provided ranges:
809
+ //!
810
+ //! - ``[d_keys.Current(), d_keys.Current() + num_items)``
811
+ //! - ``[d_keys.Alternate(), d_keys.Alternate() + num_items)``
812
+ //! - ``[d_values.Current(), d_values.Current() + num_items)``
813
+ //! - ``[d_values.Alternate(), d_values.Alternate() + num_items)``
814
+ //!
815
+ //! - Upon completion, the sorting operation will update the "current"
816
+ //! indicator within each DoubleBuffer wrapper to reference which of the two
817
+ //! buffers now contains the sorted output sequence (a function of the
818
+ //! number of key bits specified and the targeted device architecture).
819
+ //! - @devicestorageP
820
+ //! - @devicestorage
821
+ //!
822
+ //! Snippet
823
+ //! --------------------------------------------------
824
+ //!
825
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
826
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
827
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
828
+ //! tuple of references to relevant members of the key.
829
+ //!
830
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
831
+ //! :language: c++
832
+ //! :dedent:
833
+ //! :start-after: example-begin custom-type
834
+ //! :end-before: example-end custom-type
835
+ //!
836
+ //! The following snippet shows how to sort an array of ``custom_t`` objects
837
+ //! using ``cub::DeviceRadixSort::SortPairs``:
838
+ //!
839
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
840
+ //! :language: c++
841
+ //! :dedent:
842
+ //! :start-after: example-begin pairs-db
843
+ //! :end-before: example-end pairs-db
844
+ //!
845
+ //! @endrst
846
+ //!
847
+ //! @tparam KeyT
848
+ //! **[inferred]** Key type
849
+ //!
850
+ //! @tparam ValueT
851
+ //! **[inferred]** Value type
852
+ //!
853
+ //! @tparam NumItemsT
854
+ //! **[inferred]** Type of num_items
855
+ //!
856
+ //! @tparam DecomposerT
857
+ //! **[inferred]** Type of a callable object responsible for decomposing a
858
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
859
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
860
+ //! The leftmost element of the tuple is considered the most significant.
861
+ //! The call operator must not modify members of the key.
862
+ //!
863
+ //! @param[in] d_temp_storage
864
+ //! Device-accessible allocation of temporary storage. When ``nullptr``, the
865
+ //! required allocation size is written to ``temp_storage_bytes`` and no work
866
+ //! is done.
867
+ //!
868
+ //! @param[in,out] temp_storage_bytes
869
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
870
+ //!
871
+ //! @param[in,out] d_keys
872
+ //! Reference to the double-buffer of keys whose "current" device-accessible
873
+ //! buffer contains the unsorted input keys and, upon return, is updated to
874
+ //! point to the sorted output keys
875
+ //!
876
+ //! @param[in,out] d_values
877
+ //! Double-buffer of values whose "current" device-accessible buffer
878
+ //! contains the unsorted input values and, upon return, is updated to point
879
+ //! to the sorted output values
880
+ //!
881
+ //! @param[in] num_items
882
+ //! Number of items to sort
883
+ //!
884
+ //! @param[in] decomposer
885
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
886
+ //! references to its constituent arithmetic types. The leftmost element of
887
+ //! the tuple is considered the most significant. The call operator must not
888
+ //! modify members of the key.
889
+ //!
890
+ //! @param[in] stream
891
+ //! **[optional]** CUDA stream to launch kernels within.
892
+ //! Default is stream<sub>0</sub>.
893
+ template <typename KeyT, typename ValueT, typename NumItemsT, typename DecomposerT>
894
+ CUB_RUNTIME_FUNCTION static //
895
+ ::cuda::std::enable_if_t< //
896
+ !::cuda::std::is_convertible_v<DecomposerT, int>, //
897
+ cudaError_t>
898
+ SortPairs(void* d_temp_storage,
899
+ size_t& temp_storage_bytes,
900
+ DoubleBuffer<KeyT>& d_keys,
901
+ DoubleBuffer<ValueT>& d_values,
902
+ NumItemsT num_items,
903
+ DecomposerT decomposer,
904
+ cudaStream_t stream = 0)
905
+ {
906
+   _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());
907
+   // Reject unsuitable decomposers at compile time.
908
+   using decomposer_validity = detail::radix::decomposer_check_t<KeyT, DecomposerT>;
909
+   static_assert(decomposer_validity::value,
910
+                 "DecomposerT must be a callable object returning a tuple of references to "
911
+                 "arithmetic types");
912
+
913
+   // Global offsets use the unsigned integer type selected for NumItemsT.
914
+   using offset_type = detail::choose_offset_t<NumItemsT>;
915
+   // The caller-provided DoubleBuffers may be overwritten by the sort.
916
+   constexpr bool may_overwrite_inputs = true;
917
+
918
+   return DeviceRadixSort::custom_radix_sort<SortOrder::Ascending>(
919
+     decomposer_validity{},
920
+     d_temp_storage,
921
+     temp_storage_bytes,
922
+     may_overwrite_inputs,
923
+     d_keys,
924
+     d_values,
925
+     static_cast<offset_type>(num_items),
926
+     decomposer,
927
+     stream);
928
+ }
929
+
930
+ //! @rst
931
+ //! Sorts key-value pairs into ascending order using :math:`\approx N` auxiliary storage.
932
+ //!
933
+ //! * The sorting operation is given a pair of key buffers and a corresponding
934
+ //! pair of associated value buffers. Each pair is managed by a DoubleBuffer
935
+ //! structure that indicates which of the two buffers is "current" (and thus
936
+ //! contains the input data to be sorted).
937
+ //! * The contents of both buffers within each pair may be altered by the
938
+ //! sorting operation.
939
+ //! * In-place operations are not supported. There must be no overlap between
940
+ //! any of the provided ranges:
941
+ //!
942
+ //! - ``[d_keys.Current(), d_keys.Current() + num_items)``
943
+ //! - ``[d_keys.Alternate(), d_keys.Alternate() + num_items)``
944
+ //! - ``[d_values.Current(), d_values.Current() + num_items)``
945
+ //! - ``[d_values.Alternate(), d_values.Alternate() + num_items)``
946
+ //!
947
+ //! - Upon completion, the sorting operation will update the "current"
948
+ //! indicator within each DoubleBuffer wrapper to reference which of the two
949
+ //! buffers now contains the sorted output sequence (a function of the
950
+ //! number of key bits specified and the targeted device architecture).
951
+ //! - An optional bit subrange ``[begin_bit, end_bit)`` of differentiating key
952
+ //! bits can be specified. This can reduce overall sorting overhead and
953
+ //! yield a corresponding performance improvement.
954
+ //! - @devicestorageP
955
+ //! - @devicestorage
956
+ //!
957
+ //! Snippet
958
+ //! --------------------------------------------------
959
+ //!
960
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
961
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
962
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
963
+ //! tuple of references to relevant members of the key.
964
+ //!
965
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
966
+ //! :language: c++
967
+ //! :dedent:
968
+ //! :start-after: example-begin custom-type
969
+ //! :end-before: example-end custom-type
970
+ //!
971
+ //! The following snippet shows how to sort an array of ``custom_t`` objects
972
+ //! using ``cub::DeviceRadixSort::SortPairs``:
973
+ //!
974
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
975
+ //! :language: c++
976
+ //! :dedent:
977
+ //! :start-after: example-begin pairs-bits-db
978
+ //! :end-before: example-end pairs-bits-db
979
+ //!
980
+ //! @endrst
981
+ //!
982
+ //! @tparam KeyT
983
+ //! **[inferred]** Key type
984
+ //!
985
+ //! @tparam ValueT
986
+ //! **[inferred]** Value type
987
+ //!
988
+ //! @tparam NumItemsT
989
+ //! **[inferred]** Type of num_items
990
+ //!
991
+ //! @tparam DecomposerT
992
+ //! **[inferred]** Type of a callable object responsible for decomposing a
993
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
994
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
995
+ //! The leftmost element of the tuple is considered the most significant.
996
+ //! The call operator must not modify members of the key.
997
+ //!
998
+ //! @param[in] d_temp_storage
999
+ //! Device-accessible allocation of temporary storage. When ``nullptr``, the
1000
+ //! required allocation size is written to ``temp_storage_bytes`` and no work
1001
+ //! is done.
1002
+ //!
1003
+ //! @param[in,out] temp_storage_bytes
1004
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
1005
+ //!
1006
+ //! @param[in,out] d_keys
1007
+ //! Reference to the double-buffer of keys whose "current" device-accessible
1008
+ //! buffer contains the unsorted input keys and, upon return, is updated to
1009
+ //! point to the sorted output keys
1010
+ //!
1011
+ //! @param[in,out] d_values
1012
+ //! Double-buffer of values whose "current" device-accessible buffer
1013
+ //! contains the unsorted input values and, upon return, is updated to point
1014
+ //! to the sorted output values
1015
+ //!
1016
+ //! @param[in] num_items
1017
+ //! Number of items to sort
1018
+ //!
1019
+ //! @param[in] decomposer
1020
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
1021
+ //! references to its constituent arithmetic types. The leftmost element of
1022
+ //! the tuple is considered the most significant. The call operator must not
1023
+ //! modify members of the key.
1024
+ //!
1025
+ //! @param[in] begin_bit
1026
+ //! The least-significant bit index (inclusive) needed for
1027
+ //! key comparison
1028
+ //!
1029
+ //! @param[in] end_bit
1030
+ //! The most-significant bit index (exclusive) needed for key
1031
+ //! comparison (e.g., ``(sizeof(float) + sizeof(long long int)) * 8``)
1032
+ //!
1033
+ //! @param[in] stream
1034
+ //! **[optional]** CUDA stream to launch kernels within.
1035
+ //! Default is stream<sub>0</sub>.
1036
+ template <typename KeyT, typename ValueT, typename NumItemsT, typename DecomposerT>
1037
+ CUB_RUNTIME_FUNCTION static // return type below is SFINAE-constrained
1038
+ ::cuda::std::enable_if_t< //
1039
+ !::cuda::std::is_convertible_v<DecomposerT, int>, // overload is disabled when DecomposerT converts to int
1040
+ cudaError_t> // result type when the overload is enabled
1041
+ SortPairs(void* d_temp_storage,
1042
+ size_t& temp_storage_bytes,
1043
+ DoubleBuffer<KeyT>& d_keys,
1044
+ DoubleBuffer<ValueT>& d_values,
1045
+ NumItemsT num_items,
1046
+ DecomposerT decomposer, // taken by value and forwarded to custom_radix_sort
1047
+ int begin_bit, // no default value in this overload
1048
+ int end_bit, // no default value in this overload
1049
+ cudaStream_t stream = 0)
1050
+ {
1051
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); // NOTE(review): presumably an NVTX range conditional on d_temp_storage - confirm against the macro
1052
+
1053
+ // unsigned integer type for global offsets, selected from NumItemsT
1054
+ using offset_t = detail::choose_offset_t<NumItemsT>;
1055
+ using decomposer_check_t = detail::radix::decomposer_check_t<KeyT, DecomposerT>; // its ::value is asserted below to validate DecomposerT
1056
+
1057
+ static_assert(decomposer_check_t::value,
1058
+ "DecomposerT must be a callable object returning a tuple of references to "
1059
+ "arithmetic types");
1060
+
1061
+ constexpr bool is_overwrite_okay = true; // DoubleBuffer overloads pass true; the pointer-based overloads pass false
1062
+
1063
+ return DeviceRadixSort::custom_radix_sort<SortOrder::Ascending>(
1064
+ decomposer_check_t{}, // default-constructed instance of the check type
1065
+ d_temp_storage,
1066
+ temp_storage_bytes,
1067
+ is_overwrite_okay,
1068
+ d_keys,
1069
+ d_values,
1070
+ static_cast<offset_t>(num_items), // convert NumItemsT to the selected offset type
1071
+ decomposer,
1072
+ begin_bit,
1073
+ end_bit,
1074
+ stream);
1075
+ }
1076
+
1077
+ //! @rst
1078
+ //! Sorts key-value pairs into descending order using :math:`\approx 2N` auxiliary storage.
1079
+ //!
1080
+ //! - The contents of the input data are not altered by the sorting operation.
1081
+ //! - Pointers to contiguous memory must be used; iterators are not currently
1082
+ //! supported.
1083
+ //! - In-place operations are not supported. There must be no overlap between
1084
+ //! any of the provided ranges:
1085
+ //!
1086
+ //! - ``[d_keys_in, d_keys_in + num_items)``
1087
+ //! - ``[d_keys_out, d_keys_out + num_items)``
1088
+ //! - ``[d_values_in, d_values_in + num_items)``
1089
+ //! - ``[d_values_out, d_values_out + num_items)``
1090
+ //!
1091
+ //! - An optional bit subrange ``[begin_bit, end_bit)`` of differentiating key
1092
+ //! bits can be specified. This can reduce overall sorting overhead and
1093
+ //! yield a corresponding performance improvement.
1094
+ //! - @devicestorageNP For sorting using only ``O(P)`` temporary storage, see
1095
+ //! the sorting interface using DoubleBuffer wrappers below.
1096
+ //! - @devicestorage
1097
+ //!
1098
+ //! Snippet
1099
+ //! --------------------------------------------------
1100
+ //!
1101
+ //! The code snippet below illustrates the sorting of a device vector of ``int``
1102
+ //! keys with associated vector of ``int`` values.
1103
+ //! @endrst
1104
+ //!
1105
+ //! @code{.cpp}
1106
+ //! #include <cub/cub.cuh>
1107
+ //! // or equivalently <cub/device/device_radix_sort.cuh>
1108
+ //!
1109
+ //! // Declare, allocate, and initialize device-accessible pointers
1110
+ //! // for sorting data
1111
+ //! int num_items; // e.g., 7
1112
+ //! int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
1113
+ //! int *d_keys_out; // e.g., [ ... ]
1114
+ //! int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6]
1115
+ //! int *d_values_out; // e.g., [ ... ]
1116
+ //! ...
1117
+ //!
1118
+ //! // Determine temporary device storage requirements
1119
+ //! void *d_temp_storage = nullptr;
1120
+ //! size_t temp_storage_bytes = 0;
1121
+ //! cub::DeviceRadixSort::SortPairsDescending(
1122
+ //! d_temp_storage, temp_storage_bytes,
1123
+ //! d_keys_in, d_keys_out, d_values_in, d_values_out, num_items);
1124
+ //!
1125
+ //! // Allocate temporary storage
1126
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
1127
+ //!
1128
+ //! // Run sorting operation
1129
+ //! cub::DeviceRadixSort::SortPairsDescending(
1130
+ //! d_temp_storage, temp_storage_bytes,
1131
+ //! d_keys_in, d_keys_out, d_values_in, d_values_out, num_items);
1132
+ //!
1133
+ //! // d_keys_out <-- [9, 8, 7, 6, 5, 3, 0]
1134
+ //! // d_values_out <-- [6, 0, 2, 1, 3, 4, 5]
1135
+ //! @endcode
1136
+ //!
1137
+ //! @tparam KeyT
1138
+ //! **[inferred]** KeyT type
1139
+ //!
1140
+ //! @tparam ValueT
1141
+ //! **[inferred]** ValueT type
1142
+ //!
1143
+ //! @tparam NumItemsT
1144
+ //! **[inferred]** Type of num_items
1145
+ //!
1146
+ //! @param[in] d_temp_storage
1147
+ //! Device-accessible allocation of temporary storage. When ``nullptr``, the
1148
+ //! required allocation size is written to ``temp_storage_bytes`` and no work
1149
+ //! is done.
1150
+ //!
1151
+ //! @param[in,out] temp_storage_bytes
1152
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
1153
+ //!
1154
+ //! @param[in] d_keys_in
1155
+ //! Pointer to the input sequence of key data to sort
1156
+ //!
1157
+ //! @param[out] d_keys_out
1158
+ //! Pointer to the sorted output sequence of key data
1159
+ //!
1160
+ //! @param[in] d_values_in
1161
+ //! Pointer to the corresponding input sequence of associated value items
1162
+ //!
1163
+ //! @param[out] d_values_out
1164
+ //! Pointer to the correspondingly-reordered output sequence of associated
1165
+ //! value items
1166
+ //!
1167
+ //! @param[in] num_items
1168
+ //! Number of items to sort
1169
+ //!
1170
+ //! @param[in] begin_bit
1171
+ //! **[optional]** The least-significant bit index (inclusive) needed for
1172
+ //! key comparison
1173
+ //!
1174
+ //! @param[in] end_bit
1175
+ //! **[optional]** The most-significant bit index (exclusive) needed for key
1176
+ //! comparison (e.g., ``sizeof(unsigned int) * 8``)
1177
+ //!
1178
+ //! @param[in] stream
1179
+ //! **[optional]** CUDA stream to launch kernels within.
1180
+ //! Default is stream<sub>0</sub>.
1181
+ template <typename KeyT, typename ValueT, typename NumItemsT>
1182
+ CUB_RUNTIME_FUNCTION static cudaError_t SortPairsDescending(
1183
+ void* d_temp_storage,
1184
+ size_t& temp_storage_bytes,
1185
+ const KeyT* d_keys_in,
1186
+ KeyT* d_keys_out,
1187
+ const ValueT* d_values_in,
1188
+ ValueT* d_values_out,
1189
+ NumItemsT num_items,
1190
+ int begin_bit = 0, // defaults to bit 0
1191
+ int end_bit = sizeof(KeyT) * 8, // defaults to sizeof(KeyT) * 8, i.e. the full key width in bits
1192
+ cudaStream_t stream = 0)
1193
+ {
1194
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); // NOTE(review): presumably an NVTX range conditional on d_temp_storage - confirm against the macro
1195
+
1196
+ // Unsigned integer type for global offsets, selected from NumItemsT.
1197
+ using OffsetT = detail::choose_offset_t<NumItemsT>;
1198
+
1199
+ // The const_cast below is only needed to build DoubleBuffer wrappers; the
1200
+ // input arrays are *not* written. ``DispatchRadixSort::Dispatch`` will
1201
+ // allocate temporary storage and create a new double-buffer internally
1202
+ // when the ``is_overwrite_okay`` flag is not set.
1203
+ constexpr bool is_overwrite_okay = false;
1204
+ DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out); // input pointer first, output pointer second
1205
+ DoubleBuffer<ValueT> d_values(const_cast<ValueT*>(d_values_in), d_values_out); // same wrapping for the values
1206
+
1207
+ return DispatchRadixSort<SortOrder::Descending, KeyT, ValueT, OffsetT>::Dispatch(
1208
+ d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, begin_bit, end_bit, is_overwrite_okay, stream);
1209
+ }
1210
+
1211
+ //! @rst
1212
+ //! Sorts key-value pairs into descending order using :math:`\approx 2N` auxiliary storage.
1213
+ //!
1214
+ //! * The contents of the input data are not altered by the sorting operation.
1215
+ //! * Pointers to contiguous memory must be used; iterators are not currently
1216
+ //! supported.
1217
+ //! * In-place operations are not supported. There must be no overlap between
1218
+ //! any of the provided ranges:
1219
+ //!
1220
+ //! * ``[d_keys_in, d_keys_in + num_items)``
1221
+ //! * ``[d_keys_out, d_keys_out + num_items)``
1222
+ //! * ``[d_values_in, d_values_in + num_items)``
1223
+ //! * ``[d_values_out, d_values_out + num_items)``
1224
+ //!
1225
+ //! * A bit subrange ``[begin_bit, end_bit)`` is provided to specify
1226
+ //! differentiating key bits. This can reduce overall sorting overhead and
1227
+ //! yield a corresponding performance improvement.
1228
+ //! * @devicestorageNP For sorting using only :math:`O(P)` temporary storage, see
1229
+ //! the sorting interface using DoubleBuffer wrappers below.
1230
+ //! * @devicestorage
1231
+ //!
1232
+ //! Snippet
1233
+ //! --------------------------------------------------
1234
+ //!
1235
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
1236
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
1237
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
1238
+ //! tuple of references to relevant members of the key.
1239
+ //!
1240
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
1241
+ //! :language: c++
1242
+ //! :dedent:
1243
+ //! :start-after: example-begin custom-type
1244
+ //! :end-before: example-end custom-type
1245
+ //!
1246
+ //! The following snippet shows how to sort an array of ``custom_t`` objects
1247
+ //! using ``cub::DeviceRadixSort::SortPairsDescending``:
1248
+ //!
1249
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
1250
+ //! :language: c++
1251
+ //! :dedent:
1252
+ //! :start-after: example-begin pairs-descending-bits
1253
+ //! :end-before: example-end pairs-descending-bits
1254
+ //!
1255
+ //! @endrst
1256
+ //!
1257
+ //! @tparam KeyT
1258
+ //! **[inferred]** KeyT type
1259
+ //!
1260
+ //! @tparam ValueT
1261
+ //! **[inferred]** ValueT type
1262
+ //!
1263
+ //! @tparam NumItemsT
1264
+ //! **[inferred]** Type of num_items
1265
+ //!
1266
+ //! @tparam DecomposerT
1267
+ //! **[inferred]** Type of a callable object responsible for decomposing a
1268
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
1269
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
1270
+ //! The leftmost element of the tuple is considered the most significant.
1271
+ //! The call operator must not modify members of the key.
1272
+ //!
1273
+ //! @param[in] d_temp_storage
1274
+ //! Device-accessible allocation of temporary storage. When ``nullptr``, the
1275
+ //! required allocation size is written to ``temp_storage_bytes`` and no work
1276
+ //! is done.
1277
+ //!
1278
+ //! @param[in,out] temp_storage_bytes
1279
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
1280
+ //!
1281
+ //! @param[in] d_keys_in
1282
+ //! Pointer to the input sequence of key data to sort
1283
+ //!
1284
+ //! @param[out] d_keys_out
1285
+ //! Pointer to the sorted output sequence of key data
1286
+ //!
1287
+ //! @param[in] d_values_in
1288
+ //! Pointer to the corresponding input sequence of associated value items
1289
+ //!
1290
+ //! @param[out] d_values_out
1291
+ //! Pointer to the correspondingly-reordered output sequence of associated
1292
+ //! value items
1293
+ //!
1294
+ //! @param[in] num_items
1295
+ //! Number of items to sort
1296
+ //!
1297
+ //! @param[in] decomposer
1298
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
1299
+ //! references to its constituent arithmetic types. The leftmost element of
1300
+ //! the tuple is considered the most significant. The call operator must not
1301
+ //! modify members of the key.
1302
+ //!
1303
+ //! @param[in] begin_bit
1304
+ //! The least-significant bit index (inclusive) needed for
1305
+ //! key comparison
1306
+ //!
1307
+ //! @param[in] end_bit
1308
+ //! The most-significant bit index (exclusive) needed for key
1309
+ //! comparison (e.g., ``(sizeof(float) + sizeof(long long int)) * 8``)
1310
+ //!
1311
+ //! @param[in] stream
1312
+ //! **[optional]** CUDA stream to launch kernels within.
1313
+ //! Default is stream<sub>0</sub>.
1314
+ template <typename KeyT, typename ValueT, typename NumItemsT, typename DecomposerT>
1315
+ CUB_RUNTIME_FUNCTION static // return type below is SFINAE-constrained
1316
+ ::cuda::std::enable_if_t< //
1317
+ !::cuda::std::is_convertible_v<DecomposerT, int>, // overload is disabled when DecomposerT converts to int
1318
+ cudaError_t> // result type when the overload is enabled
1319
+ SortPairsDescending(
1320
+ void* d_temp_storage,
1321
+ size_t& temp_storage_bytes,
1322
+ const KeyT* d_keys_in,
1323
+ KeyT* d_keys_out,
1324
+ const ValueT* d_values_in,
1325
+ ValueT* d_values_out,
1326
+ NumItemsT num_items,
1327
+ DecomposerT decomposer, // taken by value and forwarded to custom_radix_sort
1328
+ int begin_bit, // no default value in this overload
1329
+ int end_bit, // no default value in this overload
1330
+ cudaStream_t stream = 0)
1331
+ {
1332
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); // NOTE(review): presumably an NVTX range conditional on d_temp_storage - confirm against the macro
1333
+
1334
+ // unsigned integer type for global offsets, selected from NumItemsT
1335
+ using offset_t = detail::choose_offset_t<NumItemsT>;
1336
+ using decomposer_check_t = detail::radix::decomposer_check_t<KeyT, DecomposerT>; // its ::value is asserted below to validate DecomposerT
1337
+
1338
+ static_assert(decomposer_check_t::value,
1339
+ "DecomposerT must be a callable object returning a tuple of references to "
1340
+ "arithmetic types");
1341
+
1342
+ // The const_cast below is only needed to build DoubleBuffer wrappers; the
1343
+ // input arrays are *not* written. Per the original note, the dispatch layer
1344
+ // allocates temporary storage and creates a new double-buffer internally when
1345
+ // ``is_overwrite_okay`` is false (presumably reached via custom_radix_sort - confirm).
1346
+ constexpr bool is_overwrite_okay = false;
1347
+ DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out); // input pointer first, output pointer second
1348
+ DoubleBuffer<ValueT> d_values(const_cast<ValueT*>(d_values_in), d_values_out); // same wrapping for the values
1349
+
1350
+ return DeviceRadixSort::custom_radix_sort<SortOrder::Descending>(
1351
+ decomposer_check_t{}, // default-constructed instance of the check type
1352
+ d_temp_storage,
1353
+ temp_storage_bytes,
1354
+ is_overwrite_okay,
1355
+ d_keys,
1356
+ d_values,
1357
+ static_cast<offset_t>(num_items), // convert NumItemsT to the selected offset type
1358
+ decomposer,
1359
+ begin_bit,
1360
+ end_bit,
1361
+ stream);
1362
+ }
1363
+
1364
+ //! @rst
1365
+ //! Sorts key-value pairs into descending order using :math:`\approx 2N` auxiliary storage.
1366
+ //!
1367
+ //! * The contents of the input data are not altered by the sorting operation.
1368
+ //! * Pointers to contiguous memory must be used; iterators are not currently
1369
+ //! supported.
1370
+ //! * In-place operations are not supported. There must be no overlap between
1371
+ //! any of the provided ranges:
1372
+ //!
1373
+ //! * ``[d_keys_in, d_keys_in + num_items)``
1374
+ //! * ``[d_keys_out, d_keys_out + num_items)``
1375
+ //! * ``[d_values_in, d_values_in + num_items)``
1376
+ //! * ``[d_values_out, d_values_out + num_items)``
1377
+ //!
1378
+ //! * @devicestorageNP For sorting using only :math:`O(P)` temporary storage, see
1379
+ //! the sorting interface using DoubleBuffer wrappers below.
1380
+ //! * @devicestorage
1381
+ //!
1382
+ //! Snippet
1383
+ //! --------------------------------------------------
1384
+ //!
1385
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
1386
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
1387
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
1388
+ //! tuple of references to relevant members of the key.
1389
+ //!
1390
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
1391
+ //! :language: c++
1392
+ //! :dedent:
1393
+ //! :start-after: example-begin custom-type
1394
+ //! :end-before: example-end custom-type
1395
+ //!
1396
+ //! The following snippet shows how to sort an array of ``custom_t`` objects
1397
+ //! using ``cub::DeviceRadixSort::SortPairsDescending``:
1398
+ //!
1399
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
1400
+ //! :language: c++
1401
+ //! :dedent:
1402
+ //! :start-after: example-begin pairs-descending
1403
+ //! :end-before: example-end pairs-descending
1404
+ //!
1405
+ //! @endrst
1406
+ //!
1407
+ //! @tparam KeyT
1408
+ //! **[inferred]** KeyT type
1409
+ //!
1410
+ //! @tparam ValueT
1411
+ //! **[inferred]** ValueT type
1412
+ //!
1413
+ //! @tparam NumItemsT
1414
+ //! **[inferred]** Type of num_items
1415
+ //!
1416
+ //! @tparam DecomposerT
1417
+ //! **[inferred]** Type of a callable object responsible for decomposing a
1418
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
1419
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
1420
+ //! The leftmost element of the tuple is considered the most significant.
1421
+ //! The call operator must not modify members of the key.
1422
+ //!
1423
+ //! @param[in] d_temp_storage
1424
+ //! Device-accessible allocation of temporary storage. When ``nullptr``, the
1425
+ //! required allocation size is written to ``temp_storage_bytes`` and no work
1426
+ //! is done.
1427
+ //!
1428
+ //! @param[in,out] temp_storage_bytes
1429
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
1430
+ //!
1431
+ //! @param[in] d_keys_in
1432
+ //! Pointer to the input sequence of key data to sort
1433
+ //!
1434
+ //! @param[out] d_keys_out
1435
+ //! Pointer to the sorted output sequence of key data
1436
+ //!
1437
+ //! @param[in] d_values_in
1438
+ //! Pointer to the corresponding input sequence of associated value items
1439
+ //!
1440
+ //! @param[out] d_values_out
1441
+ //! Pointer to the correspondingly-reordered output sequence of associated
1442
+ //! value items
1443
+ //!
1444
+ //! @param[in] num_items
1445
+ //! Number of items to sort
1446
+ //!
1447
+ //! @param[in] decomposer
1448
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
1449
+ //! references to its constituent arithmetic types. The leftmost element of
1450
+ //! the tuple is considered the most significant. The call operator must not
1451
+ //! modify members of the key.
1452
+ //!
1453
+ //! @param[in] stream
1454
+ //! **[optional]** CUDA stream to launch kernels within.
1455
+ //! Default is stream<sub>0</sub>.
1456
+ template <typename KeyT, typename ValueT, typename NumItemsT, typename DecomposerT>
1457
+ CUB_RUNTIME_FUNCTION static // return type below is SFINAE-constrained
1458
+ ::cuda::std::enable_if_t< //
1459
+ !::cuda::std::is_convertible_v<DecomposerT, int>, // overload is disabled when DecomposerT converts to int
1460
+ cudaError_t> // result type when the overload is enabled
1461
+ SortPairsDescending(
1462
+ void* d_temp_storage,
1463
+ size_t& temp_storage_bytes,
1464
+ const KeyT* d_keys_in,
1465
+ KeyT* d_keys_out,
1466
+ const ValueT* d_values_in,
1467
+ ValueT* d_values_out,
1468
+ NumItemsT num_items,
1469
+ DecomposerT decomposer, // taken by value and forwarded to custom_radix_sort
1470
+ cudaStream_t stream = 0)
1471
+ {
1472
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); // NOTE(review): presumably an NVTX range conditional on d_temp_storage - confirm against the macro
1473
+
1474
+ // unsigned integer type for global offsets, selected from NumItemsT
1475
+ using offset_t = detail::choose_offset_t<NumItemsT>;
1476
+ using decomposer_check_t = detail::radix::decomposer_check_t<KeyT, DecomposerT>; // its ::value is asserted below to validate DecomposerT
1477
+
1478
+ static_assert(decomposer_check_t::value,
1479
+ "DecomposerT must be a callable object returning a tuple of references to "
1480
+ "arithmetic types");
1481
+
1482
+ // The const_cast below is only needed to build DoubleBuffer wrappers; the
1483
+ // input arrays are *not* written. Per the original note, the dispatch layer
1484
+ // allocates temporary storage and creates a new double-buffer internally when
1485
+ // ``is_overwrite_okay`` is false (presumably reached via custom_radix_sort - confirm).
1486
+ constexpr bool is_overwrite_okay = false;
1487
+ DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out); // input pointer first, output pointer second
1488
+ DoubleBuffer<ValueT> d_values(const_cast<ValueT*>(d_values_in), d_values_out); // same wrapping for the values
1489
+
1490
+ return DeviceRadixSort::custom_radix_sort<SortOrder::Descending>(
1491
+ decomposer_check_t{}, // default-constructed instance of the check type
1492
+ d_temp_storage,
1493
+ temp_storage_bytes,
1494
+ is_overwrite_okay,
1495
+ d_keys,
1496
+ d_values,
1497
+ static_cast<offset_t>(num_items), // convert NumItemsT to the selected offset type
1498
+ decomposer,
1499
+ stream); // no bit range is passed in this overload
1500
+ }
1501
+
1502
+ //! @rst
1503
+ //! Sorts key-value pairs into descending order using :math:`\approx N` auxiliary storage.
1504
+ //!
1505
+ //! - The sorting operation is given a pair of key buffers and a corresponding
1506
+ //! pair of associated value buffers. Each pair is managed by a DoubleBuffer
1507
+ //! structure that indicates which of the two buffers is "current" (and thus
1508
+ //! contains the input data to be sorted).
1509
+ //! - The contents of both buffers within each pair may be altered by the
1510
+ //! sorting operation.
1511
+ //! - In-place operations are not supported. There must be no overlap between
1512
+ //! any of the provided ranges:
1513
+ //!
1514
+ //! - ``[d_keys.Current(), d_keys.Current() + num_items)``
1515
+ //! - ``[d_keys.Alternate(), d_keys.Alternate() + num_items)``
1516
+ //! - ``[d_values.Current(), d_values.Current() + num_items)``
1517
+ //! - ``[d_values.Alternate(), d_values.Alternate() + num_items)``
1518
+ //!
1519
+ //! - Upon completion, the sorting operation will update the "current"
1520
+ //! indicator within each DoubleBuffer wrapper to reference which of the two
1521
+ //! buffers now contains the sorted output sequence (a function of the number
1522
+ //! of key bits specified and the targeted device architecture).
1523
+ //! - An optional bit subrange ``[begin_bit, end_bit)`` of differentiating key
1524
+ //! bits can be specified. This can reduce overall sorting overhead and
1525
+ //! yield a corresponding performance improvement.
1526
+ //! - @devicestorageP
1527
+ //! - @devicestorage
1528
+ //!
1529
+ //! Snippet
1530
+ //! --------------------------------------------------
1531
+ //!
1532
+ //! The code snippet below illustrates the sorting of a device vector of ``int``
1533
+ //! keys with associated vector of ``int`` values.
1534
+ //! @endrst
1535
+ //!
1536
+ //! @code{.cpp}
1537
+ //! #include <cub/cub.cuh>
1538
+ //! // or equivalently <cub/device/device_radix_sort.cuh>
1539
+ //!
1540
+ //! // Declare, allocate, and initialize device-accessible pointers
1541
+ //! // for sorting data
1542
+ //! int num_items; // e.g., 7
1543
+ //! int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9]
1544
+ //! int *d_key_alt_buf; // e.g., [ ... ]
1545
+ //! int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6]
1546
+ //! int *d_value_alt_buf; // e.g., [ ... ]
1547
+ //! ...
1548
+ //!
1549
+ //! // Create a set of DoubleBuffers to wrap pairs of device pointers
1550
+ //! cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
1551
+ //! cub::DoubleBuffer<int> d_values(d_value_buf, d_value_alt_buf);
1552
+ //!
1553
+ //! // Determine temporary device storage requirements
1554
+ //! void *d_temp_storage = nullptr;
1555
+ //! size_t temp_storage_bytes = 0;
1556
+ //! cub::DeviceRadixSort::SortPairsDescending(
1557
+ //! d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items);
1558
+ //!
1559
+ //! // Allocate temporary storage
1560
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
1561
+ //!
1562
+ //! // Run sorting operation
1563
+ //! cub::DeviceRadixSort::SortPairsDescending(
1564
+ //! d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items);
1565
+ //!
1566
+ //! // d_keys.Current() <-- [9, 8, 7, 6, 5, 3, 0]
1567
+ //! // d_values.Current() <-- [6, 0, 2, 1, 3, 4, 5]
1568
+ //! @endcode
1569
+ //!
1570
+ //! @tparam KeyT
1571
+ //! **[inferred]** KeyT type
1572
+ //!
1573
+ //! @tparam ValueT
1574
+ //! **[inferred]** ValueT type
1575
+ //!
1576
+ //! @tparam NumItemsT
1577
+ //! **[inferred]** Type of num_items
1578
+ //!
1579
+ //! @param[in] d_temp_storage
1580
+ //! Device-accessible allocation of temporary storage. When ``nullptr``, the
1581
+ //! required allocation size is written to ``temp_storage_bytes`` and no work
1582
+ //! is done.
1583
+ //!
1584
+ //! @param[in,out] temp_storage_bytes
1585
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
1586
+ //!
1587
+ //! @param[in,out] d_keys
1588
+ //! Reference to the double-buffer of keys whose "current" device-accessible
1589
+ //! buffer contains the unsorted input keys and, upon return, is updated to
1590
+ //! point to the sorted output keys
1591
+ //!
1592
+ //! @param[in,out] d_values
1593
+ //! Double-buffer of values whose "current" device-accessible buffer
1594
+ //! contains the unsorted input values and, upon return, is updated to point
1595
+ //! to the sorted output values
1596
+ //!
1597
+ //! @param[in] num_items
1598
+ //! Number of items to sort
1599
+ //!
1600
+ //! @param[in] begin_bit
1601
+ //! **[optional]** The least-significant bit index (inclusive) needed for
1602
+ //! key comparison
1603
+ //!
1604
+ //! @param[in] end_bit
1605
+ //! **[optional]** The most-significant bit index (exclusive) needed for key
1606
+ //! comparison (e.g., ``sizeof(unsigned int) * 8``)
1607
+ //!
1608
+ //! @param[in] stream
1609
+ //! **[optional]** CUDA stream to launch kernels within.
1610
+ //! Default is stream<sub>0</sub>.
1611
+ template <typename KeyT, typename ValueT, typename NumItemsT>
1612
+ CUB_RUNTIME_FUNCTION static cudaError_t SortPairsDescending(
1613
+ void* d_temp_storage,
1614
+ size_t& temp_storage_bytes,
1615
+ DoubleBuffer<KeyT>& d_keys,
1616
+ DoubleBuffer<ValueT>& d_values,
1617
+ NumItemsT num_items,
1618
+ int begin_bit = 0, // defaults to bit 0
1619
+ int end_bit = sizeof(KeyT) * 8, // defaults to sizeof(KeyT) * 8, i.e. the full key width in bits
1620
+ cudaStream_t stream = 0)
1621
+ {
1622
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); // NOTE(review): presumably an NVTX range conditional on d_temp_storage - confirm against the macro
1623
+
1624
+ // Unsigned integer type for global offsets, selected from NumItemsT.
1625
+ using OffsetT = detail::choose_offset_t<NumItemsT>;
1626
+
1627
+ constexpr bool is_overwrite_okay = true; // DoubleBuffer overloads pass true; the pointer-based overloads pass false
1628
+
1629
+ return DispatchRadixSort<SortOrder::Descending, KeyT, ValueT, OffsetT>::Dispatch(
1630
+ d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, begin_bit, end_bit, is_overwrite_okay, stream);
1631
+ }
1632
+
1633
+ //! @rst
1634
+ //! Sorts key-value pairs into descending order using :math:`\approx N` auxiliary storage.
1635
+ //!
1636
+ //! * The sorting operation is given a pair of key buffers and a corresponding
1637
+ //! pair of associated value buffers. Each pair is managed by a DoubleBuffer
1638
+ //! structure that indicates which of the two buffers is "current" (and thus
1639
+ //! contains the input data to be sorted).
1640
+ //! * The contents of both buffers within each pair may be altered by the
1641
+ //! sorting operation.
1642
+ //! * In-place operations are not supported. There must be no overlap between
1643
+ //! any of the provided ranges:
1644
+ //!
1645
+ //! - ``[d_keys.Current(), d_keys.Current() + num_items)``
1646
+ //! - ``[d_keys.Alternate(), d_keys.Alternate() + num_items)``
1647
+ //! - ``[d_values.Current(), d_values.Current() + num_items)``
1648
+ //! - ``[d_values.Alternate(), d_values.Alternate() + num_items)``
1649
+ //!
1650
+ //! * Upon completion, the sorting operation will update the "current"
1651
+ //! indicator within each DoubleBuffer wrapper to reference which of the two
1652
+ //! buffers now contains the sorted output sequence (a function of the
1653
+ //! number of key bits specified and the targeted device architecture).
1654
+ //! * @devicestorageP
1654
+ //! * @devicestorage
1656
+ //!
1657
+ //! Snippet
1658
+ //! --------------------------------------------------
1659
+ //!
1660
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
1661
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
1662
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
1663
+ //! tuple of references to relevant members of the key.
1664
+ //!
1665
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
1666
+ //! :language: c++
1667
+ //! :dedent:
1668
+ //! :start-after: example-begin custom-type
1669
+ //! :end-before: example-end custom-type
1670
+ //!
1671
+ //! The following snippet shows how to sort an array of ``custom_t`` objects
1672
+ //! using ``cub::DeviceRadixSort::SortPairsDescending``:
1673
+ //!
1674
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
1675
+ //! :language: c++
1676
+ //! :dedent:
1677
+ //! :start-after: example-begin pairs-descending-db
1678
+ //! :end-before: example-end pairs-descending-db
1679
+ //!
1680
+ //! @endrst
1681
+ //!
1682
+ //! @tparam KeyT
1683
+ //! **[inferred]** KeyT type
1684
+ //!
1685
+ //! @tparam ValueT
1686
+ //! **[inferred]** ValueT type
1687
+ //!
1688
+ //! @tparam NumItemsT
1689
+ //! **[inferred]** Type of num_items
1690
+ //!
1691
+ //! @tparam DecomposerT
1692
+ //! **[inferred]** Type of a callable object responsible for decomposing a
1693
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
1694
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
1695
+ //! The leftmost element of the tuple is considered the most significant.
1696
+ //! The call operator must not modify members of the key.
1697
+ //!
1698
+ //! @param[in] d_temp_storage
1699
+ //! Device-accessible allocation of temporary storage. When ``nullptr``, the
1700
+ //! required allocation size is written to ``temp_storage_bytes`` and no work
1701
+ //! is done.
1702
+ //!
1703
+ //! @param[in,out] temp_storage_bytes
1704
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
1705
+ //!
1706
+ //! @param[in,out] d_keys
1707
+ //! Reference to the double-buffer of keys whose "current" device-accessible
1708
+ //! buffer contains the unsorted input keys and, upon return, is updated to
1709
+ //! point to the sorted output keys
1710
+ //!
1711
+ //! @param[in,out] d_values
1712
+ //! Double-buffer of values whose "current" device-accessible buffer
1713
+ //! contains the unsorted input values and, upon return, is updated to point
1714
+ //! to the sorted output values
1715
+ //!
1716
+ //! @param[in] num_items
1717
+ //! Number of items to sort
1718
+ //!
1719
+ //! @param[in] decomposer
1720
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
1721
+ //! references to its constituent arithmetic types. The leftmost element of
1722
+ //! the tuple is considered the most significant. The call operator must not
1723
+ //! modify members of the key.
1724
+ //!
1725
+ //! @param[in] stream
1726
+ //! **[optional]** CUDA stream to launch kernels within.
1727
+ //! Default is stream<sub>0</sub>.
1728
+ template <typename KeyT, typename ValueT, typename NumItemsT, typename DecomposerT>
1729
+ CUB_RUNTIME_FUNCTION static // return type below is SFINAE-constrained
1730
+ ::cuda::std::enable_if_t< //
1731
+ !::cuda::std::is_convertible_v<DecomposerT, int>, // overload is disabled when DecomposerT converts to int
1732
+ cudaError_t> // result type when the overload is enabled
1733
+ SortPairsDescending(
1734
+ void* d_temp_storage,
1735
+ size_t& temp_storage_bytes,
1736
+ DoubleBuffer<KeyT>& d_keys,
1737
+ DoubleBuffer<ValueT>& d_values,
1738
+ NumItemsT num_items,
1739
+ DecomposerT decomposer, // taken by value and forwarded to custom_radix_sort
1740
+ cudaStream_t stream = 0)
1741
+ {
1742
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); // NOTE(review): presumably an NVTX range conditional on d_temp_storage - confirm against the macro
1743
+
1744
+ // unsigned integer type for global offsets, selected from NumItemsT
1745
+ using offset_t = detail::choose_offset_t<NumItemsT>;
1746
+ using decomposer_check_t = detail::radix::decomposer_check_t<KeyT, DecomposerT>; // its ::value is asserted below to validate DecomposerT
1747
+
1748
+ static_assert(decomposer_check_t::value,
1749
+ "DecomposerT must be a callable object returning a tuple of references to "
1750
+ "arithmetic types");
1751
+
1752
+ constexpr bool is_overwrite_okay = true; // DoubleBuffer overloads pass true; the pointer-based overloads pass false
1753
+
1754
+ return DeviceRadixSort::custom_radix_sort<SortOrder::Descending>(
1755
+ decomposer_check_t{}, // default-constructed instance of the check type
1756
+ d_temp_storage,
1757
+ temp_storage_bytes,
1758
+ is_overwrite_okay,
1759
+ d_keys,
1760
+ d_values,
1761
+ static_cast<offset_t>(num_items), // convert NumItemsT to the selected offset type
1762
+ decomposer,
1763
+ stream); // no bit range is passed in this overload
1764
+ }
1765
+
1766
+ //! @rst
1767
+ //! Sorts key-value pairs into descending order using :math:`\approx N` auxiliary storage.
1768
+ //!
1769
+ //! * The sorting operation is given a pair of key buffers and a corresponding
1770
+ //! pair of associated value buffers. Each pair is managed by a DoubleBuffer
1771
+ //! structure that indicates which of the two buffers is "current" (and thus
1772
+ //! contains the input data to be sorted).
1773
+ //! * The contents of both buffers within each pair may be altered by the
1774
+ //! sorting operation.
1775
+ //! * In-place operations are not supported. There must be no overlap between
1776
+ //! any of the provided ranges:
1777
+ //!
1778
+ //! - ``[d_keys.Current(), d_keys.Current() + num_items)``
1779
+ //! - ``[d_keys.Alternate(), d_keys.Alternate() + num_items)``
1780
+ //! - ``[d_values.Current(), d_values.Current() + num_items)``
1781
+ //! - ``[d_values.Alternate(), d_values.Alternate() + num_items)``
1782
+ //!
1783
+ //! - Upon completion, the sorting operation will update the "current"
1784
+ //! indicator within each DoubleBuffer wrapper to reference which of the two
1785
+ //! buffers now contains the sorted output sequence (a function of the
1786
+ //! number of key bits specified and the targeted device architecture).
1787
+ //! - A bit subrange ``[begin_bit, end_bit)`` is provided to specify
1788
+ //! differentiating key bits. This can reduce overall sorting overhead and
1789
+ //! yield a corresponding performance improvement.
1790
+ //! - @devicestorageP
1791
+ //! - @devicestorage
1792
+ //!
1793
+ //! Snippet
1794
+ //! --------------------------------------------------
1795
+ //!
1796
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
1797
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
1798
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
1799
+ //! tuple of references to relevant members of the key.
1800
+ //!
1801
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
1802
+ //! :language: c++
1803
+ //! :dedent:
1804
+ //! :start-after: example-begin custom-type
1805
+ //! :end-before: example-end custom-type
1806
+ //!
1807
+ //! The following snippet shows how to sort an array of ``custom_t`` objects
1808
+ //! using ``cub::DeviceRadixSort::SortPairsDescending``:
1809
+ //!
1810
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
1811
+ //! :language: c++
1812
+ //! :dedent:
1813
+ //! :start-after: example-begin pairs-descending-bits-db
1814
+ //! :end-before: example-end pairs-descending-bits-db
1815
+ //!
1816
+ //! @endrst
1817
+ //!
1818
+ //! @tparam KeyT
1819
+ //! **[inferred]** KeyT type
1820
+ //!
1821
+ //! @tparam ValueT
1822
+ //! **[inferred]** ValueT type
1823
+ //!
1824
+ //! @tparam NumItemsT
1825
+ //! **[inferred]** Type of num_items
1826
+ //!
1827
+ //! @tparam DecomposerT
1828
+ //! **[inferred]** Type of a callable object responsible for decomposing a
1829
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
1830
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
1831
+ //! The leftmost element of the tuple is considered the most significant.
1832
+ //! The call operator must not modify members of the key.
1833
+ //!
1834
+ //! @param[in] d_temp_storage
1835
+ //! Device-accessible allocation of temporary storage. When ``nullptr``, the
1836
+ //! required allocation size is written to ``temp_storage_bytes`` and no work
1837
+ //! is done.
1838
+ //!
1839
+ //! @param[in,out] temp_storage_bytes
1840
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
1841
+ //!
1842
+ //! @param[in,out] d_keys
1843
+ //! Reference to the double-buffer of keys whose "current" device-accessible
1844
+ //! buffer contains the unsorted input keys and, upon return, is updated to
1845
+ //! point to the sorted output keys
1846
+ //!
1847
+ //! @param[in,out] d_values
1848
+ //! Double-buffer of values whose "current" device-accessible buffer
1849
+ //! contains the unsorted input values and, upon return, is updated to point
1850
+ //! to the sorted output values
1851
+ //!
1852
+ //! @param[in] num_items
1853
+ //! Number of items to sort
1854
+ //!
1855
+ //! @param decomposer
1856
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
1857
+ //! references to its constituent arithmetic types. The leftmost element of
1858
+ //! the tuple is considered the most significant. The call operator must not
1859
+ //! modify members of the key.
1860
+ //!
1861
+ //! @param[in] begin_bit
1862
+ //! The least-significant bit index (inclusive) needed for
1863
+ //! key comparison
1864
+ //!
1865
+ //! @param[in] end_bit
1866
+ //! The most-significant bit index (exclusive) needed for key
1867
+ //! comparison (e.g., ``(sizeof(float) + sizeof(long long int)) * 8``)
1868
+ //!
1869
+ //! @param[in] stream
1870
+ //! **[optional]** CUDA stream to launch kernels within.
1871
+ //! Default is stream<sub>0</sub>.
1872
+ template <typename KeyT, typename ValueT, typename NumItemsT, typename DecomposerT>
1873
+ CUB_RUNTIME_FUNCTION static //
1874
+ ::cuda::std::enable_if_t< //
1875
+ !::cuda::std::is_convertible_v<DecomposerT, int>, //
1876
+ cudaError_t>
1877
+ SortPairsDescending(
1878
+ void* d_temp_storage,
1879
+ size_t& temp_storage_bytes,
1880
+ DoubleBuffer<KeyT>& d_keys,
1881
+ DoubleBuffer<ValueT>& d_values,
1882
+ NumItemsT num_items,
1883
+ DecomposerT decomposer,
1884
+ int begin_bit,
1885
+ int end_bit,
1886
+ cudaStream_t stream = 0)
1887
+ {
1888
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());
1889
+
1890
+ // unsigned integer type for global offsets
1891
+ using offset_t = detail::choose_offset_t<NumItemsT>;
1892
+ using decomposer_check_t = detail::radix::decomposer_check_t<KeyT, DecomposerT>;
1893
+
1894
+ static_assert(decomposer_check_t::value,
1895
+ "DecomposerT must be a callable object returning a tuple of references to "
1896
+ "arithmetic types");
1897
+
1898
+ constexpr bool is_overwrite_okay = true;
1899
+
1900
+ return DeviceRadixSort::custom_radix_sort<SortOrder::Descending>(
1901
+ decomposer_check_t{},
1902
+ d_temp_storage,
1903
+ temp_storage_bytes,
1904
+ is_overwrite_okay,
1905
+ d_keys,
1906
+ d_values,
1907
+ static_cast<offset_t>(num_items),
1908
+ decomposer,
1909
+ begin_bit,
1910
+ end_bit,
1911
+ stream);
1912
+ }
1913
+
1914
+ //! @} end member group
1915
+ //! @name Keys-only
1916
+ //! @{
1917
+
1918
+ //! @rst
1919
+ //! Sorts keys into ascending order using :math:`\approx 2N` auxiliary storage.
1920
+ //!
1921
+ //! - The contents of the input data are not altered by the sorting operation.
1922
+ //! - Pointers to contiguous memory must be used; iterators are not currently
1923
+ //! supported.
1924
+ //! - In-place operations are not supported. There must be no overlap between
1925
+ //! any of the provided ranges:
1926
+ //!
1927
+ //! - ``[d_keys_in, d_keys_in + num_items)``
1928
+ //! - ``[d_keys_out, d_keys_out + num_items)``
1929
+ //!
1930
+ //! - An optional bit subrange ``[begin_bit, end_bit)`` of differentiating key
1931
+ //! bits can be specified. This can reduce overall sorting overhead and
1932
+ //! yield a corresponding performance improvement.
1933
+ //! - @devicestorageNP For sorting using only ``O(P)`` temporary storage, see
1934
+ //! the sorting interface using DoubleBuffer wrappers below.
1935
+ //! - @devicestorage
1936
+ //!
1937
+ //! Snippet
1938
+ //! --------------------------------------------------
1939
+ //!
1940
+ //! The code snippet below illustrates the sorting of a device vector of
1941
+ //! ``int`` keys.
1942
+ //! @endrst
1943
+ //!
1944
+ //! @code{.cpp}
1945
+ //! #include <cub/cub.cuh>
1946
+ //! // or equivalently <cub/device/device_radix_sort.cuh>
1947
+ //!
1948
+ //! // Declare, allocate, and initialize device-accessible pointers
1949
+ //! // for sorting data
1950
+ //! int num_items; // e.g., 7
1951
+ //! int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
1952
+ //! int *d_keys_out; // e.g., [ ... ]
1953
+ //! ...
1954
+ //!
1955
+ //! // Determine temporary device storage requirements
1956
+ //! void *d_temp_storage = nullptr;
1957
+ //! size_t temp_storage_bytes = 0;
1958
+ //! cub::DeviceRadixSort::SortKeys(
1959
+ //! d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items);
1960
+ //!
1961
+ //! // Allocate temporary storage
1962
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
1963
+ //!
1964
+ //! // Run sorting operation
1965
+ //! cub::DeviceRadixSort::SortKeys(
1966
+ //! d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items);
1967
+ //!
1968
+ //! // d_keys_out <-- [0, 3, 5, 6, 7, 8, 9]
1969
+ //! @endcode
1970
+ //!
1971
+ //! @tparam KeyT
1972
+ //! **[inferred]** KeyT type
1973
+ //!
1974
+ //! @tparam NumItemsT
1975
+ //! **[inferred]** Type of num_items
1976
+ //!
1980
+ //! @param[in] d_temp_storage
1981
+ //! Device-accessible allocation of temporary storage. When ``nullptr``, the
1982
+ //! required allocation size is written to ``temp_storage_bytes`` and no work
1983
+ //! is done.
1984
+ //!
1985
+ //! @param[in,out] temp_storage_bytes
1986
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
1987
+ //!
1988
+ //! @param[in] d_keys_in
1989
+ //! Pointer to the input data of key data to sort
1990
+ //!
1991
+ //! @param[out] d_keys_out
1992
+ //! Pointer to the sorted output sequence of key data
1993
+ //!
1994
+ //! @param[in] num_items
1995
+ //! Number of items to sort
1996
+ //!
1997
+ //! @param[in] begin_bit
1998
+ //! **[optional]** The least-significant bit index (inclusive) needed for
1999
+ //! key comparison
2000
+ //!
2001
+ //! @param[in] end_bit
2002
+ //! **[optional]** The most-significant bit index (exclusive) needed for key
2003
+ //! comparison (e.g., ``sizeof(unsigned int) * 8``)
2004
+ //!
2005
+ //! @param[in] stream
2006
+ //! **[optional]** CUDA stream to launch kernels within.
2007
+ //! Default is stream<sub>0</sub>.
2008
+ template <typename KeyT, typename NumItemsT>
2009
+ CUB_RUNTIME_FUNCTION static cudaError_t SortKeys(
2010
+ void* d_temp_storage,
2011
+ size_t& temp_storage_bytes,
2012
+ const KeyT* d_keys_in,
2013
+ KeyT* d_keys_out,
2014
+ NumItemsT num_items,
2015
+ int begin_bit = 0,
2016
+ int end_bit = sizeof(KeyT) * 8,
2017
+ cudaStream_t stream = 0)
2018
+ {
2019
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());
2020
+
2021
+ // Unsigned integer type for global offsets.
2022
+ using OffsetT = detail::choose_offset_t<NumItemsT>;
2023
+
2024
+ // We cast away const-ness, but will *not* write to these arrays.
2025
+ // ``DispatchRadixSort::Dispatch`` will allocate temporary storage and
2026
+ // create a new double-buffer internally when the ``is_overwrite_ok`` flag
2027
+ // is not set.
2028
+ constexpr bool is_overwrite_okay = false;
2029
+ DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
2030
+ // Null value type
2031
+ DoubleBuffer<NullType> d_values;
2032
+
2033
+ return DispatchRadixSort<SortOrder::Ascending, KeyT, NullType, OffsetT>::Dispatch(
2034
+ d_temp_storage,
2035
+ temp_storage_bytes,
2036
+ d_keys,
2037
+ d_values,
2038
+ static_cast<OffsetT>(num_items),
2039
+ begin_bit,
2040
+ end_bit,
2041
+ is_overwrite_okay,
2042
+ stream);
2043
+ }
2044
+
2045
+ //! @rst
2046
+ //! Sorts keys into ascending order using :math:`\approx 2N` auxiliary storage.
2047
+ //!
2048
+ //! * The contents of the input data are not altered by the sorting operation.
2049
+ //! * Pointers to contiguous memory must be used; iterators are not currently
2050
+ //! supported.
2051
+ //! * In-place operations are not supported. There must be no overlap between
2052
+ //! any of the provided ranges:
2053
+ //!
2054
+ //! * ``[d_keys_in, d_keys_in + num_items)``
2055
+ //! * ``[d_keys_out, d_keys_out + num_items)``
2056
+ //!
2057
+ //! * A bit subrange ``[begin_bit, end_bit)`` is provided to specify
2058
+ //! differentiating key bits. This can reduce overall sorting overhead and
2059
+ //! yield a corresponding performance improvement.
2060
+ //! * @devicestorageNP For sorting using only :math:`O(P)` temporary storage, see
2061
+ //! the sorting interface using DoubleBuffer wrappers below.
2062
+ //! * @devicestorage
2063
+ //!
2064
+ //! Snippet
2065
+ //! --------------------------------------------------
2066
+ //!
2067
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
2068
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
2069
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
2070
+ //! tuple of references to relevant members of the key.
2071
+ //!
2072
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
2073
+ //! :language: c++
2074
+ //! :dedent:
2075
+ //! :start-after: example-begin custom-type
2076
+ //! :end-before: example-end custom-type
2077
+ //!
2078
+ //! The following snippet shows how to sort an array of ``custom_t`` objects
2079
+ //! using ``cub::DeviceRadixSort::SortKeys``:
2080
+ //!
2081
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
2082
+ //! :language: c++
2083
+ //! :dedent:
2084
+ //! :start-after: example-begin keys-bits
2085
+ //! :end-before: example-end keys-bits
2086
+ //!
2087
+ //! @endrst
2088
+ //!
2089
+ //! @tparam KeyT
2090
+ //! **[inferred]** KeyT type
2091
+ //!
2092
+ //! @tparam NumItemsT
2093
+ //! **[inferred]** Type of num_items
2094
+ //!
2095
+ //! @tparam DecomposerT
2096
+ //! **[inferred]** Type of a callable object responsible for decomposing a
2097
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
2098
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
2099
+ //! The leftmost element of the tuple is considered the most significant.
2100
+ //! The call operator must not modify members of the key.
2101
+ //!
2102
+ //! @param[in] d_temp_storage
2103
+ //! Device-accessible allocation of temporary storage. When ``nullptr``, the
2104
+ //! required allocation size is written to ``temp_storage_bytes`` and no work
2105
+ //! is done.
2106
+ //!
2107
+ //! @param[in,out] temp_storage_bytes
2108
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
2109
+ //!
2110
+ //! @param[in] d_keys_in
2111
+ //! Pointer to the input data of key data to sort
2112
+ //!
2113
+ //! @param[out] d_keys_out
2114
+ //! Pointer to the sorted output sequence of key data
2115
+ //!
2116
+ //! @param[in] num_items
2117
+ //! Number of items to sort
2118
+ //!
2119
+ //! @param decomposer
2120
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
2121
+ //! references to its constituent arithmetic types. The leftmost element of
2122
+ //! the tuple is considered the most significant. The call operator must not
2123
+ //! modify members of the key.
2124
+ //!
2125
+ //! @param[in] begin_bit
2126
+ //! The least-significant bit index (inclusive) needed for
2127
+ //! key comparison
2128
+ //!
2129
+ //! @param[in] end_bit
2130
+ //! The most-significant bit index (exclusive) needed for key
2131
+ //! comparison (e.g., ``(sizeof(float) + sizeof(long long int)) * 8``)
2132
+ //!
2133
+ //! @param[in] stream
2134
+ //! **[optional]** CUDA stream to launch kernels within.
2135
+ //! Default is stream<sub>0</sub>.
2136
+ template <typename KeyT, typename NumItemsT, typename DecomposerT>
2137
+ CUB_RUNTIME_FUNCTION static //
2138
+ ::cuda::std::enable_if_t< //
2139
+ !::cuda::std::is_convertible_v<DecomposerT, int>, //
2140
+ cudaError_t>
2141
+ SortKeys(void* d_temp_storage,
2142
+ size_t& temp_storage_bytes,
2143
+ const KeyT* d_keys_in,
2144
+ KeyT* d_keys_out,
2145
+ NumItemsT num_items,
2146
+ DecomposerT decomposer,
2147
+ int begin_bit,
2148
+ int end_bit,
2149
+ cudaStream_t stream = 0)
2150
+ {
2151
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());
2152
+
2153
+ // unsigned integer type for global offsets
2154
+ using offset_t = detail::choose_offset_t<NumItemsT>;
2155
+ using decomposer_check_t = detail::radix::decomposer_check_t<KeyT, DecomposerT>;
2156
+
2157
+ static_assert(decomposer_check_t::value,
2158
+ "DecomposerT must be a callable object returning a tuple of references to "
2159
+ "arithmetic types");
2160
+
2161
+ // We cast away const-ness, but will *not* write to these arrays.
2162
+ // ``DispatchRadixSort::Dispatch`` will allocate temporary storage and
2163
+ // create a new double-buffer internally when the ``is_overwrite_ok`` flag
2164
+ // is not set.
2165
+ constexpr bool is_overwrite_okay = false;
2166
+ DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
2167
+ DoubleBuffer<NullType> d_values;
2168
+
2169
+ return DeviceRadixSort::custom_radix_sort<SortOrder::Ascending>(
2170
+ decomposer_check_t{},
2171
+ d_temp_storage,
2172
+ temp_storage_bytes,
2173
+ is_overwrite_okay,
2174
+ d_keys,
2175
+ d_values,
2176
+ static_cast<offset_t>(num_items),
2177
+ decomposer,
2178
+ begin_bit,
2179
+ end_bit,
2180
+ stream);
2181
+ }
2182
+
2183
+ //! @rst
2184
+ //! Sorts keys into ascending order using :math:`\approx 2N` auxiliary storage.
2185
+ //!
2186
+ //! * The contents of the input data are not altered by the sorting operation.
2187
+ //! * Pointers to contiguous memory must be used; iterators are not currently
2188
+ //! supported.
2189
+ //! * In-place operations are not supported. There must be no overlap between
2190
+ //! any of the provided ranges:
2191
+ //!
2192
+ //! * ``[d_keys_in, d_keys_in + num_items)``
2193
+ //! * ``[d_keys_out, d_keys_out + num_items)``
2194
+ //!
2195
+ //! * This overload takes no bit subrange. A subrange ``[begin_bit, end_bit)`` of
2196
+ //! differentiating key bits, which can reduce overall sorting overhead, can be
2197
+ //! given through the overload that accepts ``begin_bit`` and ``end_bit``.
2198
+ //! * @devicestorageNP For sorting using only :math:`O(P)` temporary storage, see
2199
+ //! the sorting interface using DoubleBuffer wrappers below.
2200
+ //! * @devicestorage
2201
+ //!
2202
+ //! Snippet
2203
+ //! --------------------------------------------------
2204
+ //!
2205
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
2206
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
2207
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
2208
+ //! tuple of references to relevant members of the key.
2209
+ //!
2210
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
2211
+ //! :language: c++
2212
+ //! :dedent:
2213
+ //! :start-after: example-begin custom-type
2214
+ //! :end-before: example-end custom-type
2215
+ //!
2216
+ //! The following snippet shows how to sort an array of ``custom_t`` objects
2217
+ //! using ``cub::DeviceRadixSort::SortKeys``:
2218
+ //!
2219
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
2220
+ //! :language: c++
2221
+ //! :dedent:
2222
+ //! :start-after: example-begin keys
2223
+ //! :end-before: example-end keys
2224
+ //!
2225
+ //! @endrst
2226
+ //!
2227
+ //! @tparam KeyT
2228
+ //! **[inferred]** KeyT type
2229
+ //!
2230
+ //! @tparam NumItemsT
2231
+ //! **[inferred]** Type of num_items
2232
+ //!
2233
+ //! @tparam DecomposerT
2234
+ //! **[inferred]** Type of a callable object responsible for decomposing a
2235
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
2236
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
2237
+ //! The leftmost element of the tuple is considered the most significant.
2238
+ //! The call operator must not modify members of the key.
2239
+ //!
2240
+ //! @param[in] d_temp_storage
2241
+ //! Device-accessible allocation of temporary storage. When ``nullptr``, the
2242
+ //! required allocation size is written to ``temp_storage_bytes`` and no work
2243
+ //! is done.
2244
+ //!
2245
+ //! @param[in,out] temp_storage_bytes
2246
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
2247
+ //!
2248
+ //! @param[in] d_keys_in
2249
+ //! Pointer to the input data of key data to sort
2250
+ //!
2251
+ //! @param[out] d_keys_out
2252
+ //! Pointer to the sorted output sequence of key data
2253
+ //!
2254
+ //! @param[in] num_items
2255
+ //! Number of items to sort
2256
+ //!
2257
+ //! @param decomposer
2258
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
2259
+ //! references to its constituent arithmetic types. The leftmost element of
2260
+ //! the tuple is considered the most significant. The call operator must not
2261
+ //! modify members of the key.
2262
+ //!
2263
+ //! @param[in] stream
2264
+ //! **[optional]** CUDA stream to launch kernels within.
2265
+ //! Default is stream<sub>0</sub>.
2266
+ template <typename KeyT, typename NumItemsT, typename DecomposerT>
2267
+ CUB_RUNTIME_FUNCTION static //
2268
+ ::cuda::std::enable_if_t< //
2269
+ !::cuda::std::is_convertible_v<DecomposerT, int>, //
2270
+ cudaError_t>
2271
+ SortKeys(void* d_temp_storage,
2272
+ size_t& temp_storage_bytes,
2273
+ const KeyT* d_keys_in,
2274
+ KeyT* d_keys_out,
2275
+ NumItemsT num_items,
2276
+ DecomposerT decomposer,
2277
+ cudaStream_t stream = 0)
2278
+ {
2279
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());
2280
+
2281
+ // unsigned integer type for global offsets
2282
+ using offset_t = detail::choose_offset_t<NumItemsT>;
2283
+ using decomposer_check_t = detail::radix::decomposer_check_t<KeyT, DecomposerT>;
2284
+
2285
+ static_assert(decomposer_check_t::value,
2286
+ "DecomposerT must be a callable object returning a tuple of references to "
2287
+ "arithmetic types");
2288
+
2289
+ // We cast away const-ness, but will *not* write to these arrays.
2290
+ // ``DispatchRadixSort::Dispatch`` will allocate temporary storage and
2291
+ // create a new double-buffer internally when the ``is_overwrite_ok`` flag
2292
+ // is not set.
2293
+ constexpr bool is_overwrite_okay = false;
2294
+ DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
2295
+ DoubleBuffer<NullType> d_values;
2296
+
2297
+ return DeviceRadixSort::custom_radix_sort<SortOrder::Ascending>(
2298
+ decomposer_check_t{},
2299
+ d_temp_storage,
2300
+ temp_storage_bytes,
2301
+ is_overwrite_okay,
2302
+ d_keys,
2303
+ d_values,
2304
+ static_cast<offset_t>(num_items),
2305
+ decomposer,
2306
+ stream);
2307
+ }
2308
+
2309
+ //! @rst
2310
+ //! Sorts keys into ascending order using :math:`\approx N` auxiliary storage.
2311
+ //!
2312
+ //! - The sorting operation is given a pair of key buffers managed by a
2313
+ //! DoubleBuffer structure that indicates which of the two buffers is
2314
+ //! "current" (and thus contains the input data to be sorted).
2315
+ //! - The contents of both buffers may be altered by the sorting operation.
2316
+ //! - In-place operations are not supported. There must be no overlap between
2317
+ //! any of the provided ranges:
2318
+ //!
2319
+ //! - ``[d_keys.Current(), d_keys.Current() + num_items)``
2320
+ //! - ``[d_keys.Alternate(), d_keys.Alternate() + num_items)``
2321
+ //!
2322
+ //! - Upon completion, the sorting operation will update the "current"
2323
+ //! indicator within the DoubleBuffer wrapper to reference which of the two
2324
+ //! buffers now contains the sorted output sequence (a function of the
2325
+ //! number of key bits specified and the targeted device architecture).
2326
+ //! - An optional bit subrange ``[begin_bit, end_bit)`` of differentiating key
2327
+ //! bits can be specified. This can reduce overall sorting overhead and
2328
+ //! yield a corresponding performance improvement.
2329
+ //! - @devicestorageP
2330
+ //! - @devicestorage
2331
+ //!
2332
+ //! Snippet
2333
+ //! --------------------------------------------------
2334
+ //!
2335
+ //! The code snippet below illustrates the sorting of a device vector of
2336
+ //! ``int`` keys.
2337
+ //! @endrst
2338
+ //!
2339
+ //! @code{.cpp}
2340
+ //! #include <cub/cub.cuh>
2341
+ //! // or equivalently <cub/device/device_radix_sort.cuh>
2342
+ //!
2343
+ //! // Declare, allocate, and initialize device-accessible pointers
2344
+ //! // for sorting data
2345
+ //! int num_items; // e.g., 7
2346
+ //! int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9]
2347
+ //! int *d_key_alt_buf; // e.g., [ ... ]
2348
+ //! ...
2349
+ //!
2350
+ //! // Create a DoubleBuffer to wrap the pair of device pointers
2351
+ //! cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
2352
+ //!
2353
+ //! // Determine temporary device storage requirements
2354
+ //! void *d_temp_storage = nullptr;
2355
+ //! size_t temp_storage_bytes = 0;
2356
+ //! cub::DeviceRadixSort::SortKeys(
2357
+ //! d_temp_storage, temp_storage_bytes, d_keys, num_items);
2358
+ //!
2359
+ //! // Allocate temporary storage
2360
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
2361
+ //!
2362
+ //! // Run sorting operation
2363
+ //! cub::DeviceRadixSort::SortKeys(
2364
+ //! d_temp_storage, temp_storage_bytes, d_keys, num_items);
2365
+ //!
2366
+ //! // d_keys.Current() <-- [0, 3, 5, 6, 7, 8, 9]
2367
+ //! @endcode
2368
+ //!
2369
+ //! @tparam KeyT
2370
+ //! **[inferred]** KeyT type
2371
+ //!
2372
+ //! @tparam NumItemsT
2373
+ //! **[inferred]** Type of num_items
2374
+ //!
2375
+ //! @param[in] d_temp_storage
2376
+ //! Device-accessible allocation of temporary storage. When ``nullptr``, the
2377
+ //! required allocation size is written to ``temp_storage_bytes`` and no work
2378
+ //! is done.
2379
+ //!
2380
+ //! @param[in,out] temp_storage_bytes
2381
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
2382
+ //!
2383
+ //! @param[in,out] d_keys
2384
+ //! Reference to the double-buffer of keys whose "current" device-accessible
2385
+ //! buffer contains the unsorted input keys and, upon return, is updated to
2386
+ //! point to the sorted output keys
2387
+ //!
2388
+ //! @param[in] num_items
2389
+ //! Number of items to sort
2390
+ //!
2391
+ //! @param[in] begin_bit
2392
+ //! **[optional]** The least-significant bit index (inclusive) needed for
2393
+ //! key comparison
2394
+ //!
2395
+ //! @param[in] end_bit
2396
+ //! **[optional]** The most-significant bit index (exclusive) needed for key
2397
+ //! comparison (e.g., ``sizeof(unsigned int) * 8``)
2398
+ //!
2399
+ //! @param[in] stream
2400
+ //! **[optional]** CUDA stream to launch kernels within.
2401
+ //! Default is stream<sub>0</sub>.
2402
+ template <typename KeyT, typename NumItemsT>
2403
+ CUB_RUNTIME_FUNCTION static cudaError_t SortKeys(
2404
+ void* d_temp_storage,
2405
+ size_t& temp_storage_bytes,
2406
+ DoubleBuffer<KeyT>& d_keys,
2407
+ NumItemsT num_items,
2408
+ int begin_bit = 0,
2409
+ int end_bit = sizeof(KeyT) * 8,
2410
+ cudaStream_t stream = 0)
2411
+ {
2412
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());
2413
+
2414
+ // Unsigned integer type for global offsets.
2415
+ using OffsetT = detail::choose_offset_t<NumItemsT>;
2416
+
2417
+ constexpr bool is_overwrite_okay = true;
2418
+
2419
+ // Null value type
2420
+ DoubleBuffer<NullType> d_values;
2421
+
2422
+ return DispatchRadixSort<SortOrder::Ascending, KeyT, NullType, OffsetT>::Dispatch(
2423
+ d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, begin_bit, end_bit, is_overwrite_okay, stream);
2424
+ }
2425
+
2426
+ //! @rst
2427
+ //! Sorts keys into ascending order using :math:`\approx N` auxiliary storage.
2428
+ //!
2429
+ //! * The sorting operation is given a pair of key buffers managed by a
2430
+ //! DoubleBuffer structure that indicates which of the two buffers is
2431
+ //! "current" (and thus contains the input data to be sorted).
2432
+ //! * The contents of both buffers may be altered by the sorting operation.
2433
+ //! * In-place operations are not supported. There must be no overlap between
2434
+ //! any of the provided ranges:
2435
+ //!
2436
+ //! * ``[d_keys.Current(), d_keys.Current() + num_items)``
2437
+ //! * ``[d_keys.Alternate(), d_keys.Alternate() + num_items)``
2438
+ //!
2439
+ //! * Upon completion, the sorting operation will update the "current"
2440
+ //! indicator within the DoubleBuffer wrapper to reference which of the two
2441
+ //! buffers now contains the sorted output sequence (a function of the
2442
+ //! number of key bits specified and the targeted device architecture).
2443
+ //! * @devicestorageP
2444
+ //! * @devicestorage
2445
+ //!
2446
+ //! Snippet
2447
+ //! --------------------------------------------------
2448
+ //!
2449
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
2450
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
2451
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
2452
+ //! tuple of references to relevant members of the key.
2453
+ //!
2454
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
2455
+ //! :language: c++
2456
+ //! :dedent:
2457
+ //! :start-after: example-begin custom-type
2458
+ //! :end-before: example-end custom-type
2459
+ //!
2460
+ //! The following snippet shows how to sort an array of ``custom_t`` objects
2461
+ //! using ``cub::DeviceRadixSort::SortKeys``:
2462
+ //!
2463
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
2464
+ //! :language: c++
2465
+ //! :dedent:
2466
+ //! :start-after: example-begin keys-db
2467
+ //! :end-before: example-end keys-db
2468
+ //!
2469
+ //! @endrst
2470
+ //!
2471
+ //! @tparam KeyT
2472
+ //! **[inferred]** KeyT type
2473
+ //!
2474
+ //! @tparam NumItemsT
2475
+ //! **[inferred]** Type of num_items
2476
+ //!
2477
+ //! @tparam DecomposerT
2478
+ //! **[inferred]** Type of a callable object responsible for decomposing a
2479
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
2480
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
2481
+ //! The leftmost element of the tuple is considered the most significant.
2482
+ //! The call operator must not modify members of the key.
2483
+ //!
2484
+ //! @param[in] d_temp_storage
2485
+ //! Device-accessible allocation of temporary storage. When ``nullptr``, the
2486
+ //! required allocation size is written to ``temp_storage_bytes`` and no work
2487
+ //! is done.
2488
+ //!
2489
+ //! @param[in,out] temp_storage_bytes
2490
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
2491
+ //!
2492
+ //! @param[in,out] d_keys
2493
+ //! Reference to the double-buffer of keys whose "current" device-accessible
2494
+ //! buffer contains the unsorted input keys and, upon return, is updated to
2495
+ //! point to the sorted output keys
2496
+ //!
2497
+ //! @param[in] num_items
2498
+ //! Number of items to sort
2499
+ //!
2500
+ //! @param decomposer
2501
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
2502
+ //! references to its constituent arithmetic types. The leftmost element of
2503
+ //! the tuple is considered the most significant. The call operator must not
2504
+ //! modify members of the key.
2505
+ //!
2506
+ //! @param[in] stream
2507
+ //! **[optional]** CUDA stream to launch kernels within.
2508
+ //! Default is stream<sub>0</sub>.
2509
+ template <typename KeyT, typename NumItemsT, typename DecomposerT>
2510
+ CUB_RUNTIME_FUNCTION static //
2511
+ ::cuda::std::enable_if_t< //
2512
+ !::cuda::std::is_convertible_v<DecomposerT, int>, // SFINAE: this overload is viable only when DecomposerT is not convertible to int
2513
+ cudaError_t>
2514
+ SortKeys(void* d_temp_storage,
2515
+ size_t& temp_storage_bytes,
2516
+ DoubleBuffer<KeyT>& d_keys,
2517
+ NumItemsT num_items,
2518
+ DecomposerT decomposer,
2519
+ cudaStream_t stream = 0)
2520
+ {
2521
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());
2522
+
2523
+ // Unsigned integer type for global offsets, selected from NumItemsT; num_items is cast to it below
2524
+ using offset_t = detail::choose_offset_t<NumItemsT>;
2525
+ using decomposer_check_t = detail::radix::decomposer_check_t<KeyT, DecomposerT>;
2526
+
2527
+ static_assert(decomposer_check_t::value, // compile-time validation of DecomposerT against KeyT
2528
+ "DecomposerT must be a callable object returning a tuple of references to "
2529
+ "arithmetic types");
2530
+
2531
+ constexpr bool is_overwrite_okay = true; // forwarded as-is; presumably lets the sort write to both d_keys buffers -- confirm in custom_radix_sort
2532
+ DoubleBuffer<NullType> d_values; // keys-only sort: default-constructed NullType value buffers
2533
+
2534
+ return DeviceRadixSort::custom_radix_sort<SortOrder::Ascending>( // ascending, decomposer-based sort; no bit range is passed
2535
+ decomposer_check_t{},
2536
+ d_temp_storage,
2537
+ temp_storage_bytes,
2538
+ is_overwrite_okay,
2539
+ d_keys,
2540
+ d_values,
2541
+ static_cast<offset_t>(num_items),
2542
+ decomposer,
2543
+ stream);
2544
+ }
2545
+
2546
+ //! @rst
2547
+ //! Sorts keys into ascending order using :math:`\approx N` auxiliary storage.
2548
+ //!
2549
+ //! * The sorting operation is given a pair of key buffers managed by a
2550
+ //! DoubleBuffer structure that indicates which of the two buffers is
2551
+ //! "current" (and thus contains the input data to be sorted).
2552
+ //! * The contents of both buffers may be altered by the sorting operation.
2553
+ //! * In-place operations are not supported. There must be no overlap between
2554
+ //! any of the provided ranges:
2555
+ //!
2556
+ //! * ``[d_keys.Current(), d_keys.Current() + num_items)``
2557
+ //! * ``[d_keys.Alternate(), d_keys.Alternate() + num_items)``
2558
+ //!
2559
+ //! * A bit subrange ``[begin_bit, end_bit)`` is provided to specify
2560
+ //! differentiating key bits. This can reduce overall sorting overhead and
2561
+ //! yield a corresponding performance improvement.
2562
+ //! * Upon completion, the sorting operation will update the "current"
2563
+ //! indicator within the DoubleBuffer wrapper to reference which of the two
2564
+ //! buffers now contains the sorted output sequence (a function of the
2565
+ //! number of key bits specified and the targeted device architecture).
2566
+ //! * @devicestorageP
2567
+ //! * @devicestorage
2568
+ //!
2569
+ //! Snippet
2570
+ //! --------------------------------------------------
2571
+ //!
2572
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
2573
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
2574
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
2575
+ //! tuple of references to relevant members of the key.
2576
+ //!
2577
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
2578
+ //! :language: c++
2579
+ //! :dedent:
2580
+ //! :start-after: example-begin custom-type
2581
+ //! :end-before: example-end custom-type
2582
+ //!
2583
+ //! The following snippet shows how to sort an array of ``custom_t`` objects
2584
+ //! using ``cub::DeviceRadixSort::SortKeys``:
2585
+ //!
2586
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
2587
+ //! :language: c++
2588
+ //! :dedent:
2589
+ //! :start-after: example-begin keys-bits-db
2590
+ //! :end-before: example-end keys-bits-db
2591
+ //!
2592
+ //! @endrst
2593
+ //!
2594
+ //! @tparam KeyT
2595
+ //! **[inferred]** KeyT type
2596
+ //!
2597
+ //! @tparam NumItemsT
2598
+ //! **[inferred]** Type of num_items
2599
+ //!
2600
+ //! @tparam DecomposerT
2601
+ //! **[inferred]** Type of a callable object responsible for decomposing a
2602
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
2603
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
2604
+ //! The leftmost element of the tuple is considered the most significant.
2605
+ //! The call operator must not modify members of the key.
2606
+ //!
2607
+ //! @param[in] d_temp_storage
2608
+ //! Device-accessible allocation of temporary storage. When ``nullptr``, the
2609
+ //! required allocation size is written to ``temp_storage_bytes`` and no work
2610
+ //! is done.
2611
+ //!
2612
+ //! @param[in,out] temp_storage_bytes
2613
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
2614
+ //!
2615
+ //! @param[in,out] d_keys
2616
+ //! Reference to the double-buffer of keys whose "current" device-accessible
2617
+ //! buffer contains the unsorted input keys and, upon return, is updated to
2618
+ //! point to the sorted output keys
2619
+ //!
2620
+ //! @param[in] num_items
2621
+ //! Number of items to sort
2622
+ //!
2623
+ //! @param decomposer
2624
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
2625
+ //! references to its constituent arithmetic types. The leftmost element of
2626
+ //! the tuple is considered the most significant. The call operator must not
2627
+ //! modify members of the key.
2628
+ //!
2629
+ //! @param[in] begin_bit
2630
+ //! The least-significant bit index (inclusive) needed for
2631
+ //! key comparison
2632
+ //!
2633
+ //! @param[in] end_bit
2634
+ //! The most-significant bit index (exclusive) needed for key
2635
+ //! comparison (e.g., ``(sizeof(float) + sizeof(long long int)) * 8``)
2636
+ //!
2637
+ //! @param[in] stream
2638
+ //! **[optional]** CUDA stream to launch kernels within.
2639
+ //! Default is stream<sub>0</sub>.
2640
+ template <typename KeyT, typename NumItemsT, typename DecomposerT>
2641
+ CUB_RUNTIME_FUNCTION static //
2642
+ ::cuda::std::enable_if_t< //
2643
+ !::cuda::std::is_convertible_v<DecomposerT, int>, // SFINAE: this overload is viable only when DecomposerT is not convertible to int
2644
+ cudaError_t>
2645
+ SortKeys(void* d_temp_storage,
2646
+ size_t& temp_storage_bytes,
2647
+ DoubleBuffer<KeyT>& d_keys,
2648
+ NumItemsT num_items,
2649
+ DecomposerT decomposer,
2650
+ int begin_bit, // required here: no default value
2651
+ int end_bit, // required here: no default value
2652
+ cudaStream_t stream = 0)
2653
+ {
2654
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());
2655
+
2656
+ // Unsigned integer type for global offsets, selected from NumItemsT; num_items is cast to it below
2657
+ using offset_t = detail::choose_offset_t<NumItemsT>;
2658
+ using decomposer_check_t = detail::radix::decomposer_check_t<KeyT, DecomposerT>;
2659
+
2660
+ static_assert(decomposer_check_t::value, // compile-time validation of DecomposerT against KeyT
2661
+ "DecomposerT must be a callable object returning a tuple of references to "
2662
+ "arithmetic types");
2663
+
2664
+ constexpr bool is_overwrite_okay = true; // forwarded as-is; presumably lets the sort write to both d_keys buffers -- confirm in custom_radix_sort
2665
+ DoubleBuffer<NullType> d_values; // keys-only sort: default-constructed NullType value buffers
2666
+
2667
+ return DeviceRadixSort::custom_radix_sort<SortOrder::Ascending>( // ascending, decomposer-based sort restricted to [begin_bit, end_bit)
2668
+ decomposer_check_t{},
2669
+ d_temp_storage,
2670
+ temp_storage_bytes,
2671
+ is_overwrite_okay,
2672
+ d_keys,
2673
+ d_values,
2674
+ static_cast<offset_t>(num_items),
2675
+ decomposer,
2676
+ begin_bit,
2677
+ end_bit,
2678
+ stream);
2679
+ }
2680
+
2681
+ //! @rst Sorts keys into descending order using :math:`\approx 2N` auxiliary storage.
2682
+ //!
2683
+ //! - The contents of the input data are not altered by the sorting operation.
2684
+ //! - Pointers to contiguous memory must be used; iterators are not currently
2685
+ //! supported.
2686
+ //! - In-place operations are not supported. There must be no overlap between
2687
+ //! any of the provided ranges:
2688
+ //!
2689
+ //! - ``[d_keys_in, d_keys_in + num_items)``
2690
+ //! - ``[d_keys_out, d_keys_out + num_items)``
2691
+ //!
2692
+ //! - An optional bit subrange ``[begin_bit, end_bit)`` of differentiating key
2693
+ //! bits can be specified. This can reduce overall sorting overhead and
2694
+ //! yield a corresponding performance improvement.
2695
+ //! - @devicestorageNP For sorting using only :math:`O(P)` temporary storage, see
2696
+ //! the sorting interface using DoubleBuffer wrappers below.
2697
+ //! - @devicestorage
2698
+ //!
2699
+ //! Snippet
2700
+ //! --------------------------------------------------
2701
+ //!
2702
+ //! The code snippet below illustrates the sorting of a device vector of
2703
+ //! ``int`` keys.
2704
+ //! @endrst
2705
+ //!
2706
+ //! @code{.cpp}
2707
+ //! #include <cub/cub.cuh>
2708
+ //! // or equivalently <cub/device/device_radix_sort.cuh>
2709
+ //!
2710
+ //! // Declare, allocate, and initialize device-accessible pointers
2711
+ //! // for sorting data
2712
+ //! int num_items; // e.g., 7
2713
+ //! int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
2714
+ //! int *d_keys_out; // e.g., [ ... ]
2715
+ //! ...
2716
+ //!
2717
+ //! // No DoubleBuffer is needed here: this overload takes the input and
2718
+ //! // output pointers (d_keys_in, d_keys_out) directly
2719
+ //!
2720
+ //! // Determine temporary device storage requirements
2721
+ //! void *d_temp_storage = nullptr;
2722
+ //! size_t temp_storage_bytes = 0;
2723
+ //! cub::DeviceRadixSort::SortKeysDescending(
2724
+ //! d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items);
2725
+ //!
2726
+ //! // Allocate temporary storage
2727
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
2728
+ //!
2729
+ //! // Run sorting operation
2730
+ //! cub::DeviceRadixSort::SortKeysDescending(
2731
+ //! d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items);
2732
+ //!
2733
+ //! // d_keys_out <-- [9, 8, 7, 6, 5, 3, 0]
2734
+ //! @endcode
2735
+ //!
2736
+ //! @tparam KeyT
2737
+ //! **[inferred]** KeyT type
2738
+ //!
2739
+ //! @tparam NumItemsT
2740
+ //! **[inferred]** Type of num_items
2741
+ //!
2742
+ //! @param[in] d_temp_storage
2743
+ //! Device-accessible allocation of temporary storage. When ``nullptr``, the
2744
+ //! required allocation size is written to ``temp_storage_bytes`` and no work
2745
+ //! is done.
2746
+ //!
2747
+ //! @param[in,out] temp_storage_bytes
2748
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
2749
+ //!
2750
+ //! @param[in] d_keys_in
2751
+ //! Pointer to the input data of key data to sort
2752
+ //!
2753
+ //! @param[out] d_keys_out
2754
+ //! Pointer to the sorted output sequence of key data
2755
+ //!
2756
+ //! @param[in] num_items
2757
+ //! Number of items to sort
2758
+ //!
2759
+ //! @param[in] begin_bit
2760
+ //! **[optional]** The least-significant bit index (inclusive) needed for
2761
+ //! key comparison
2762
+ //!
2763
+ //! @param[in] end_bit
2764
+ //! **[optional]** The most-significant bit index (exclusive) needed for key
2765
+ //! comparison (e.g., ``sizeof(unsigned int) * 8``)
2766
+ //!
2767
+ //! @param[in] stream
2768
+ //! **[optional]** CUDA stream to launch kernels within.
2769
+ //! Default is stream<sub>0</sub>.
2770
+ template <typename KeyT, typename NumItemsT>
2771
+ CUB_RUNTIME_FUNCTION static cudaError_t SortKeysDescending(
2772
+ void* d_temp_storage,
2773
+ size_t& temp_storage_bytes,
2774
+ const KeyT* d_keys_in,
2775
+ KeyT* d_keys_out,
2776
+ NumItemsT num_items,
2777
+ int begin_bit = 0, // default: start at bit 0
2778
+ int end_bit = sizeof(KeyT) * 8, // default: one past the last bit of KeyT
2779
+ cudaStream_t stream = 0)
2780
+ {
2781
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());
2782
+
2783
+ // Unsigned integer type for global offsets, selected from NumItemsT.
2784
+ using OffsetT = detail::choose_offset_t<NumItemsT>;
2785
+
2786
+ // const is cast away from d_keys_in only so it can be wrapped in a DoubleBuffer;
2787
+ // the input array is *not* written to. Per the original note,
2788
+ // ``DispatchRadixSort::Dispatch`` allocates temporary storage and creates a new
2789
+ // double-buffer internally when ``is_overwrite_okay`` is false.
2790
+ constexpr bool is_overwrite_okay = false;
2791
+ DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out); // wraps d_keys_in and d_keys_out, in that order
2792
+ DoubleBuffer<NullType> d_values; // keys-only sort: default-constructed NullType value buffers
2793
+
2794
+ return DispatchRadixSort<SortOrder::Descending, KeyT, NullType, OffsetT>::Dispatch(
2795
+ d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, begin_bit, end_bit, is_overwrite_okay, stream);
2796
+ }
2797
+
2798
+ //! @rst
2799
+ //! Sorts keys into descending order using :math:`\approx 2N` auxiliary storage.
2800
+ //!
2801
+ //! * The contents of the input data are not altered by the sorting operation.
2802
+ //! * Pointers to contiguous memory must be used; iterators are not currently
2803
+ //! supported.
2804
+ //! * In-place operations are not supported. There must be no overlap between
2805
+ //! any of the provided ranges:
2806
+ //!
2807
+ //! * ``[d_keys_in, d_keys_in + num_items)``
2808
+ //! * ``[d_keys_out, d_keys_out + num_items)``
2809
+ //!
2810
+ //! * A bit subrange ``[begin_bit, end_bit)`` of differentiating key
2811
+ //! bits is specified. This can reduce overall sorting overhead and
2812
+ //! yield a corresponding performance improvement.
2813
+ //! * @devicestorageNP For sorting using only :math:`O(P)` temporary storage, see
2814
+ //! the sorting interface using DoubleBuffer wrappers below.
2815
+ //! * @devicestorage
2816
+ //!
2817
+ //! Snippet
2818
+ //! --------------------------------------------------
2819
+ //!
2820
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
2821
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
2822
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
2823
+ //! tuple of references to relevant members of the key.
2824
+ //!
2825
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
2826
+ //! :language: c++
2827
+ //! :dedent:
2828
+ //! :start-after: example-begin custom-type
2829
+ //! :end-before: example-end custom-type
2830
+ //!
2831
+ //! The following snippet shows how to sort an array of ``custom_t`` objects
2832
+ //! using ``cub::DeviceRadixSort::SortKeysDescending``:
2833
+ //!
2834
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
2835
+ //! :language: c++
2836
+ //! :dedent:
2837
+ //! :start-after: example-begin keys-descending-bits
2838
+ //! :end-before: example-end keys-descending-bits
2839
+ //!
2840
+ //! @endrst
2841
+ //!
2842
+ //! @tparam KeyT
2843
+ //! **[inferred]** KeyT type
2844
+ //!
2845
+ //! @tparam NumItemsT
2846
+ //! **[inferred]** Type of num_items
2847
+ //!
2848
+ //! @tparam DecomposerT
2849
+ //! **[inferred]** Type of a callable object responsible for decomposing a
2850
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
2851
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
2852
+ //! The leftmost element of the tuple is considered the most significant.
2853
+ //! The call operator must not modify members of the key.
2854
+ //!
2855
+ //! @param[in] d_temp_storage
2856
+ //! Device-accessible allocation of temporary storage. When ``nullptr``, the
2857
+ //! required allocation size is written to ``temp_storage_bytes`` and no work
2858
+ //! is done.
2859
+ //!
2860
+ //! @param[in,out] temp_storage_bytes
2861
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
2862
+ //!
2863
+ //! @param[in] d_keys_in
2864
+ //! Pointer to the input data of key data to sort
2865
+ //!
2866
+ //! @param[out] d_keys_out
2867
+ //! Pointer to the sorted output sequence of key data
2868
+ //!
2869
+ //! @param[in] num_items
2870
+ //! Number of items to sort
2871
+ //!
2872
+ //! @param decomposer
2873
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
2874
+ //! references to its constituent arithmetic types. The leftmost element of
2875
+ //! the tuple is considered the most significant. The call operator must not
2876
+ //! modify members of the key.
2877
+ //!
2878
+ //! @param[in] begin_bit
2879
+ //! The least-significant bit index (inclusive) needed for
2880
+ //! key comparison
2881
+ //!
2882
+ //! @param[in] end_bit
2883
+ //! The most-significant bit index (exclusive) needed for key
2884
+ //! comparison (e.g., ``(sizeof(float) + sizeof(long long int)) * 8``)
2885
+ //!
2886
+ //! @param[in] stream
2887
+ //! **[optional]** CUDA stream to launch kernels within.
2888
+ //! Default is stream<sub>0</sub>.
2889
+ template <typename KeyT, typename NumItemsT, typename DecomposerT>
2890
+ CUB_RUNTIME_FUNCTION static //
2891
+ ::cuda::std::enable_if_t< //
2892
+ !::cuda::std::is_convertible_v<DecomposerT, int>, // SFINAE: this overload is viable only when DecomposerT is not convertible to int
2893
+ cudaError_t>
2894
+ SortKeysDescending(
2895
+ void* d_temp_storage,
2896
+ size_t& temp_storage_bytes,
2897
+ const KeyT* d_keys_in,
2898
+ KeyT* d_keys_out,
2899
+ NumItemsT num_items,
2900
+ DecomposerT decomposer,
2901
+ int begin_bit, // required here: no default value
2902
+ int end_bit, // required here: no default value
2903
+ cudaStream_t stream = 0)
2904
+ {
2905
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());
2906
+
2907
+ // Unsigned integer type for global offsets, selected from NumItemsT; num_items is cast to it below
2908
+ using offset_t = detail::choose_offset_t<NumItemsT>;
2909
+ using decomposer_check_t = detail::radix::decomposer_check_t<KeyT, DecomposerT>;
2910
+
2911
+ static_assert(decomposer_check_t::value, // compile-time validation of DecomposerT against KeyT
2912
+ "DecomposerT must be a callable object returning a tuple of references to "
2913
+ "arithmetic types");
2914
+
2915
+ // const is cast away from d_keys_in only so it can be wrapped in a DoubleBuffer;
2916
+ // the input array is *not* written to. Per the original note,
2917
+ // ``DispatchRadixSort::Dispatch`` allocates temporary storage and creates a new
2918
+ // double-buffer internally when ``is_overwrite_okay`` is false.
2919
+ constexpr bool is_overwrite_okay = false;
2920
+ DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out); // wraps d_keys_in and d_keys_out, in that order
2921
+ DoubleBuffer<NullType> d_values; // keys-only sort: default-constructed NullType value buffers
2922
+
2923
+ return DeviceRadixSort::custom_radix_sort<SortOrder::Descending>( // descending, decomposer-based sort restricted to [begin_bit, end_bit)
2924
+ decomposer_check_t{},
2925
+ d_temp_storage,
2926
+ temp_storage_bytes,
2927
+ is_overwrite_okay,
2928
+ d_keys,
2929
+ d_values,
2930
+ static_cast<offset_t>(num_items),
2931
+ decomposer,
2932
+ begin_bit,
2933
+ end_bit,
2934
+ stream);
2935
+ }
2936
+
2937
+ //! @rst
2938
+ //! Sorts keys into descending order using :math:`\approx 2N` auxiliary storage.
2939
+ //!
2940
+ //! * The contents of the input data are not altered by the sorting operation.
2941
+ //! * Pointers to contiguous memory must be used; iterators are not currently
2942
+ //! supported.
2943
+ //! * In-place operations are not supported. There must be no overlap between
2944
+ //! any of the provided ranges:
2945
+ //!
2946
+ //! * ``[d_keys_in, d_keys_in + num_items)``
2947
+ //! * ``[d_keys_out, d_keys_out + num_items)``
2948
+ //!
2949
+ //! * @devicestorageNP For sorting using only :math:`O(P)` temporary storage, see
2950
+ //! the sorting interface using DoubleBuffer wrappers below.
2951
+ //! * @devicestorage
2952
+ //!
2953
+ //! Snippet
2954
+ //! --------------------------------------------------
2955
+ //!
2956
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
2957
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
2958
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
2959
+ //! tuple of references to relevant members of the key.
2960
+ //!
2961
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
2962
+ //! :language: c++
2963
+ //! :dedent:
2964
+ //! :start-after: example-begin custom-type
2965
+ //! :end-before: example-end custom-type
2966
+ //!
2967
+ //! The following snippet shows how to sort an array of ``custom_t`` objects
2968
+ //! using ``cub::DeviceRadixSort::SortKeysDescending``:
2969
+ //!
2970
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
2971
+ //! :language: c++
2972
+ //! :dedent:
2973
+ //! :start-after: example-begin keys-descending
2974
+ //! :end-before: example-end keys-descending
2975
+ //!
2976
+ //! @endrst
2977
+ //!
2978
+ //! @tparam KeyT
2979
+ //! **[inferred]** KeyT type
2980
+ //!
2981
+ //! @tparam NumItemsT
2982
+ //! **[inferred]** Type of num_items
2983
+ //!
2984
+ //! @tparam DecomposerT
2985
+ //! **[inferred]** Type of a callable object responsible for decomposing a
2986
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
2987
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
2988
+ //! The leftmost element of the tuple is considered the most significant.
2989
+ //! The call operator must not modify members of the key.
2990
+ //!
2991
+ //! @param[in] d_temp_storage
2992
+ //! Device-accessible allocation of temporary storage. When ``nullptr``, the
2993
+ //! required allocation size is written to ``temp_storage_bytes`` and no work
2994
+ //! is done.
2995
+ //!
2996
+ //! @param[in,out] temp_storage_bytes
2997
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
2998
+ //!
2999
+ //! @param[in] d_keys_in
3000
+ //! Pointer to the input data of key data to sort
3001
+ //!
3002
+ //! @param[out] d_keys_out
3003
+ //! Pointer to the sorted output sequence of key data
3004
+ //!
3005
+ //! @param[in] num_items
3006
+ //! Number of items to sort
3007
+ //!
3008
+ //! @param decomposer
3009
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
3010
+ //! references to its constituent arithmetic types. The leftmost element of
3011
+ //! the tuple is considered the most significant. The call operator must not
3012
+ //! modify members of the key.
3013
+ //!
3014
+ //! @param[in] stream
3015
+ //! **[optional]** CUDA stream to launch kernels within.
3016
+ //! Default is stream<sub>0</sub>.
3017
+ template <typename KeyT, typename NumItemsT, typename DecomposerT>
3018
+ CUB_RUNTIME_FUNCTION static //
3019
+ ::cuda::std::enable_if_t< //
3020
+ !::cuda::std::is_convertible_v<DecomposerT, int>, // SFINAE: this overload is viable only when DecomposerT is not convertible to int
3021
+ cudaError_t>
3022
+ SortKeysDescending(
3023
+ void* d_temp_storage,
3024
+ size_t& temp_storage_bytes,
3025
+ const KeyT* d_keys_in,
3026
+ KeyT* d_keys_out,
3027
+ NumItemsT num_items,
3028
+ DecomposerT decomposer,
3029
+ cudaStream_t stream = 0)
3030
+ {
3031
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());
3032
+
3033
+ // Unsigned integer type for global offsets, selected from NumItemsT; num_items is cast to it below
3034
+ using offset_t = detail::choose_offset_t<NumItemsT>;
3035
+ using decomposer_check_t = detail::radix::decomposer_check_t<KeyT, DecomposerT>;
3036
+
3037
+ static_assert(decomposer_check_t::value, // compile-time validation of DecomposerT against KeyT
3038
+ "DecomposerT must be a callable object returning a tuple of references to "
3039
+ "arithmetic types");
3040
+
3041
+ // const is cast away from d_keys_in only so it can be wrapped in a DoubleBuffer;
3042
+ // the input array is *not* written to. Per the original note,
3043
+ // ``DispatchRadixSort::Dispatch`` allocates temporary storage and creates a new
3044
+ // double-buffer internally when ``is_overwrite_okay`` is false.
3045
+ constexpr bool is_overwrite_okay = false;
3046
+ DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out); // wraps d_keys_in and d_keys_out, in that order
3047
+ DoubleBuffer<NullType> d_values; // keys-only sort: default-constructed NullType value buffers
3048
+
3049
+ return DeviceRadixSort::custom_radix_sort<SortOrder::Descending>( // descending, decomposer-based sort; no bit range is passed
3050
+ decomposer_check_t{},
3051
+ d_temp_storage,
3052
+ temp_storage_bytes,
3053
+ is_overwrite_okay,
3054
+ d_keys,
3055
+ d_values,
3056
+ static_cast<offset_t>(num_items),
3057
+ decomposer,
3058
+ stream);
3059
+ }
3060
+
3061
+ //! @rst
3062
+ //! Sorts keys into descending order using :math:`\approx N` auxiliary storage.
3063
+ //!
3064
+ //! - The sorting operation is given a pair of key buffers managed by a
3065
+ //! DoubleBuffer structure that indicates which of the two buffers is
3066
+ //! "current" (and thus contains the input data to be sorted).
3067
+ //! - The contents of both buffers may be altered by the sorting operation.
3068
+ //! - In-place operations are not supported. There must be no overlap between
3069
+ //! any of the provided ranges:
3070
+ //!
3071
+ //! - ``[d_keys.Current(), d_keys.Current() + num_items)``
3072
+ //! - ``[d_keys.Alternate(), d_keys.Alternate() + num_items)``
3073
+ //!
3074
+ //! - Upon completion, the sorting operation will update the "current"
3075
+ //! indicator within the DoubleBuffer wrapper to reference which of the two
3076
+ //! buffers now contains the sorted output sequence (a function of the
3077
+ //! number of key bits specified and the targeted device architecture).
3078
+ //! - An optional bit subrange ``[begin_bit, end_bit)`` of differentiating key
3079
+ //! bits can be specified. This can reduce overall sorting overhead and
3080
+ //! yield a corresponding performance improvement.
3081
+ //! - @devicestorageP
3082
+ //! - @devicestorage
3083
+ //!
3084
+ //! Snippet
3085
+ //! --------------------------------------------------
3086
+ //!
3087
+ //! The code snippet below illustrates the sorting of a device vector of ``int`` keys.
3088
+ //! @endrst
3089
+ //!
3090
+ //! @code{.cpp}
3091
+ //! #include <cub/cub.cuh>
3092
+ //! // or equivalently <cub/device/device_radix_sort.cuh>
3093
+ //!
3094
+ //! // Declare, allocate, and initialize device-accessible pointers
3095
+ //! // for sorting data
3096
+ //! int num_items; // e.g., 7
3097
+ //! int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9]
3098
+ //! int *d_key_alt_buf; // e.g., [ ... ]
3099
+ //! ...
3100
+ //!
3101
+ //! // Create a DoubleBuffer to wrap the pair of device pointers
3102
+ //! cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
3103
+ //!
3104
+ //! // Determine temporary device storage requirements
3105
+ //! void *d_temp_storage = nullptr;
3106
+ //! size_t temp_storage_bytes = 0;
3107
+ //! cub::DeviceRadixSort::SortKeysDescending(
3108
+ //! d_temp_storage, temp_storage_bytes, d_keys, num_items);
3109
+ //!
3110
+ //! // Allocate temporary storage
3111
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
3112
+ //!
3113
+ //! // Run sorting operation
3114
+ //! cub::DeviceRadixSort::SortKeysDescending(
3115
+ //! d_temp_storage, temp_storage_bytes, d_keys, num_items);
3116
+ //!
3117
+ //! // d_keys.Current() <-- [9, 8, 7, 6, 5, 3, 0]
3118
+ //! @endcode
3119
+ //!
3120
+ //! @tparam KeyT
3121
+ //! **[inferred]** KeyT type
3122
+ //!
3123
+ //! @tparam NumItemsT
3124
+ //! **[inferred]** Type of num_items
3125
+ //!
3126
+ //! @param[in] d_temp_storage
3127
+ //! Device-accessible allocation of temporary storage. When ``nullptr``, the
3128
+ //! required allocation size is written to ``temp_storage_bytes`` and no work
3129
+ //! is done.
3130
+ //!
3131
+ //! @param[in,out] temp_storage_bytes
3132
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
3133
+ //!
3134
+ //! @param[in,out] d_keys
3135
+ //! Reference to the double-buffer of keys whose "current" device-accessible
3136
+ //! buffer contains the unsorted input keys and, upon return, is updated to
3137
+ //! point to the sorted output keys
3138
+ //!
3139
+ //! @param[in] num_items
3140
+ //! Number of items to sort
3141
+ //!
3142
+ //! @param[in] begin_bit
3143
+ //! **[optional]** The least-significant bit index (inclusive) needed for
3144
+ //! key comparison
3145
+ //!
3146
+ //! @param[in] end_bit
3147
+ //! **[optional]** The most-significant bit index (exclusive) needed for key
3148
+ //! comparison (e.g., ``sizeof(unsigned int) * 8``)
3149
+ //!
3150
+ //! @param[in] stream
3151
+ //! **[optional]** CUDA stream to launch kernels within.
3152
+ //! Default is stream<sub>0</sub>.
3153
+ template <typename KeyT, typename NumItemsT>
3154
+ CUB_RUNTIME_FUNCTION static cudaError_t SortKeysDescending(
3155
+ void* d_temp_storage,
3156
+ size_t& temp_storage_bytes,
3157
+ DoubleBuffer<KeyT>& d_keys,
3158
+ NumItemsT num_items,
3159
+ int begin_bit = 0, // default: start at bit 0
3160
+ int end_bit = sizeof(KeyT) * 8, // default: one past the last bit of KeyT
3161
+ cudaStream_t stream = 0)
3162
+ {
3163
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());
3164
+
3165
+ // Unsigned integer type for global offsets, selected from NumItemsT.
3166
+ using OffsetT = detail::choose_offset_t<NumItemsT>;
3167
+
3168
+ constexpr bool is_overwrite_okay = true; // forwarded as-is; presumably lets the sort write to both d_keys buffers -- confirm in DispatchRadixSort::Dispatch
3169
+
3170
+ // Keys-only sort: default-constructed NullType value buffers stand in for the values
3171
+ DoubleBuffer<NullType> d_values;
3172
+
3173
+ return DispatchRadixSort<SortOrder::Descending, KeyT, NullType, OffsetT>::Dispatch(
3174
+ d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, begin_bit, end_bit, is_overwrite_okay, stream);
3175
+ }
3176
+
3177
+ //! @rst
3178
+ //! Sorts keys into descending order using :math:`\approx N` auxiliary storage.
3179
+ //!
3180
+ //! * The sorting operation is given a pair of key buffers managed by a
3181
+ //! DoubleBuffer structure that indicates which of the two buffers is
3182
+ //! "current" (and thus contains the input data to be sorted).
3183
+ //! * The contents of both buffers may be altered by the sorting operation.
3184
+ //! * In-place operations are not supported. There must be no overlap between
3185
+ //! any of the provided ranges:
3186
+ //!
3187
+ //! * ``[d_keys.Current(), d_keys.Current() + num_items)``
3188
+ //! * ``[d_keys.Alternate(), d_keys.Alternate() + num_items)``
3189
+ //!
3190
+ //! * Upon completion, the sorting operation will update the "current"
3191
+ //! indicator within the DoubleBuffer wrapper to reference which of the two
3192
+ //! buffers now contains the sorted output sequence (a function of the
3193
+ //! number of key bits specified and the targeted device architecture).
3194
+ //! * @devicestorageP
3195
+ //! * @devicestorage
3196
+ //!
3197
+ //! Snippet
3198
+ //! --------------------------------------------------
3199
+ //!
3200
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
3201
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
3202
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
3203
+ //! tuple of references to relevant members of the key.
3204
+ //!
3205
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
3206
+ //! :language: c++
3207
+ //! :dedent:
3208
+ //! :start-after: example-begin custom-type
3209
+ //! :end-before: example-end custom-type
3210
+ //!
3211
+ //! The following snippet shows how to sort an array of ``custom_t`` objects
3212
+ //! using ``cub::DeviceRadixSort::SortKeysDescending``:
3213
+ //!
3214
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
3215
+ //! :language: c++
3216
+ //! :dedent:
3217
+ //! :start-after: example-begin keys-descending-db
3218
+ //! :end-before: example-end keys-descending-db
3219
+ //!
3220
+ //! @endrst
3221
+ //!
3222
+ //! @tparam KeyT
3223
+ //! **[inferred]** KeyT type
3224
+ //!
3225
+ //! @tparam NumItemsT
3226
+ //! **[inferred]** Type of num_items
3227
+ //!
3228
+ //! @tparam DecomposerT
3229
+ //! **[inferred]** Type of a callable object responsible for decomposing a
3230
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
3231
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
3232
+ //! The leftmost element of the tuple is considered the most significant.
3233
+ //! The call operator must not modify members of the key.
3234
+ //!
3235
+ //! @param[in] d_temp_storage
3236
+ //! Device-accessible allocation of temporary storage. When ``nullptr``, the
3237
+ //! required allocation size is written to ``temp_storage_bytes`` and no work
3238
+ //! is done.
3239
+ //!
3240
+ //! @param[in,out] temp_storage_bytes
3241
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
3242
+ //!
3243
+ //! @param[in,out] d_keys
3244
+ //! Reference to the double-buffer of keys whose "current" device-accessible
3245
+ //! buffer contains the unsorted input keys and, upon return, is updated to
3246
+ //! point to the sorted output keys
3247
+ //!
3248
+ //! @param[in] num_items
3249
+ //! Number of items to sort
3250
+ //!
3251
+ //! @param decomposer
3252
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
3253
+ //! references to its constituent arithmetic types. The leftmost element of
3254
+ //! the tuple is considered the most significant. The call operator must not
3255
+ //! modify members of the key.
3256
+ //!
3257
+ //! @param[in] stream
3258
+ //! **[optional]** CUDA stream to launch kernels within.
3259
+ //! Default is stream<sub>0</sub>.
3260
+ template <typename KeyT, typename NumItemsT, typename DecomposerT>
3261
+ CUB_RUNTIME_FUNCTION static //
3262
+ ::cuda::std::enable_if_t< //
3263
+ !::cuda::std::is_convertible_v<DecomposerT, int>, // SFINAE: this overload is viable only when DecomposerT is not convertible to int
3264
+ cudaError_t>
3265
+ SortKeysDescending(
3266
+ void* d_temp_storage,
3267
+ size_t& temp_storage_bytes,
3268
+ DoubleBuffer<KeyT>& d_keys,
3269
+ NumItemsT num_items,
3270
+ DecomposerT decomposer,
3271
+ cudaStream_t stream = 0)
3272
+ {
3273
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());
3274
+
3275
+ // Unsigned integer type for global offsets, selected from NumItemsT; num_items is cast to it below
3276
+ using offset_t = detail::choose_offset_t<NumItemsT>;
3277
+ using decomposer_check_t = detail::radix::decomposer_check_t<KeyT, DecomposerT>;
3278
+
3279
+ static_assert(decomposer_check_t::value, // compile-time validation of DecomposerT against KeyT
3280
+ "DecomposerT must be a callable object returning a tuple of references to "
3281
+ "arithmetic types");
3282
+
3283
+ constexpr bool is_overwrite_okay = true; // forwarded as-is; presumably lets the sort write to both d_keys buffers -- confirm in custom_radix_sort
3284
+ DoubleBuffer<NullType> d_values; // keys-only sort: default-constructed NullType value buffers
3285
+
3286
+ return DeviceRadixSort::custom_radix_sort<SortOrder::Descending>( // descending, decomposer-based sort; no bit range is passed
3287
+ decomposer_check_t{},
3288
+ d_temp_storage,
3289
+ temp_storage_bytes,
3290
+ is_overwrite_okay,
3291
+ d_keys,
3292
+ d_values,
3293
+ static_cast<offset_t>(num_items),
3294
+ decomposer,
3295
+ stream);
3296
+ }
3297
+
3298
+ //! @rst
3299
+ //! Sorts keys into descending order using :math:`\approx N` auxiliary storage.
3300
+ //!
3301
+ //! * The sorting operation is given a pair of key buffers managed by a
3302
+ //! DoubleBuffer structure that indicates which of the two buffers is
3303
+ //! "current" (and thus contains the input data to be sorted).
3304
+ //! * The contents of both buffers may be altered by the sorting operation.
3305
+ //! * In-place operations are not supported. There must be no overlap between
3306
+ //! any of the provided ranges:
3307
+ //!
3308
+ //! * ``[d_keys.Current(), d_keys.Current() + num_items)``
3309
+ //! * ``[d_keys.Alternate(), d_keys.Alternate() + num_items)``
3310
+ //!
3311
+ //! * A bit subrange ``[begin_bit, end_bit)`` is provided to specify
3312
+ //! differentiating key bits. This can reduce overall sorting overhead and
3313
+ //! yield a corresponding performance improvement.
3314
+ //! * Upon completion, the sorting operation will update the "current"
3315
+ //! indicator within the DoubleBuffer wrapper to reference which of the two
3316
+ //! buffers now contains the sorted output sequence (a function of the
3317
+ //! number of key bits specified and the targeted device architecture).
3318
+ //! * @devicestorageP
3319
+ //! * @devicestorage
3320
+ //!
3321
+ //! Snippet
3322
+ //! --------------------------------------------------
3323
+ //!
3324
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
3325
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
3326
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
3327
+ //! tuple of references to relevant members of the key.
3328
+ //!
3329
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
3330
+ //! :language: c++
3331
+ //! :dedent:
3332
+ //! :start-after: example-begin custom-type
3333
+ //! :end-before: example-end custom-type
3334
+ //!
3335
+ //! The following snippet shows how to sort an array of ``custom_t`` objects
3336
+ //! using ``cub::DeviceRadixSort::SortKeysDescending``:
3337
+ //!
3338
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
3339
+ //! :language: c++
3340
+ //! :dedent:
3341
+ //! :start-after: example-begin keys-descending-bits-db
3342
+ //! :end-before: example-end keys-descending-bits-db
3343
+ //!
3344
+ //! @endrst
3345
+ //!
3346
+ //! @tparam KeyT
3347
+ //! **[inferred]** Type of the keys being sorted
3348
+ //!
3349
+ //! @tparam NumItemsT
3350
+ //! **[inferred]** Type of num_items
3351
+ //!
3352
+ //! @tparam DecomposerT
3353
+ //! **[inferred]** Type of a callable object responsible for decomposing a
3354
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
3355
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
3356
+ //! The leftmost element of the tuple is considered the most significant.
3357
+ //! The call operator must not modify members of the key.
3358
+ //!
3359
+ //! @param[in] d_temp_storage
3360
+ //! Device-accessible allocation of temporary storage. When ``nullptr``, the
3361
+ //! required allocation size is written to ``temp_storage_bytes`` and no work
3362
+ //! is done.
3363
+ //!
3364
+ //! @param[in,out] temp_storage_bytes
3365
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
3366
+ //!
3367
+ //! @param[in,out] d_keys
3368
+ //! Reference to the double-buffer of keys whose "current" device-accessible
3369
+ //! buffer contains the unsorted input keys and, upon return, is updated to
3370
+ //! point to the sorted output keys
3371
+ //!
3372
+ //! @param[in] num_items
3373
+ //! Number of items to sort
3374
+ //!
3375
+ //! @param[in] decomposer
3376
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
3377
+ //! references to its constituent arithmetic types. The leftmost element of
3378
+ //! the tuple is considered the most significant. The call operator must not
3379
+ //! modify members of the key.
3380
+ //!
3381
+ //! @param[in] begin_bit
3382
+ //! **[optional]** The least-significant bit index (inclusive) needed for
3383
+ //! key comparison
3384
+ //!
3385
+ //! @param[in] end_bit
3386
+ //! **[optional]** The most-significant bit index (exclusive) needed for key
3387
+ //! comparison (e.g., ``(sizeof(float) + sizeof(long long int)) * 8``)
3388
+ //!
3389
+ //! @param[in] stream
3390
+ //! **[optional]** CUDA stream to launch kernels within.
3391
+ //! Default is stream<sub>0</sub>.
3392
+ template <typename KeyT, typename NumItemsT, typename DecomposerT>
3393
+ CUB_RUNTIME_FUNCTION static //
3394
+ ::cuda::std::enable_if_t< //
3395
+ !::cuda::std::is_convertible_v<DecomposerT, int>, //
3396
+ cudaError_t>
3397
+ SortKeysDescending(
3398
+ void* d_temp_storage,
3399
+ size_t& temp_storage_bytes,
3400
+ DoubleBuffer<KeyT>& d_keys,
3401
+ NumItemsT num_items,
3402
+ DecomposerT decomposer,
3403
+ int begin_bit,
3404
+ int end_bit,
3405
+ cudaStream_t stream = 0)
3406
+ {
3407
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());
3408
+
3409
+ // unsigned integer type for global offsets
3410
+ using offset_t = detail::choose_offset_t<NumItemsT>;
3411
+ using decomposer_check_t = detail::radix::decomposer_check_t<KeyT, DecomposerT>;
3412
+
3413
+ static_assert(decomposer_check_t::value,
3414
+ "DecomposerT must be a callable object returning a tuple of references to "
3415
+ "arithmetic types");
3416
+
3417
+ constexpr bool is_overwrite_okay = true;
3418
+ DoubleBuffer<NullType> d_values;
3419
+
3420
+ return DeviceRadixSort::custom_radix_sort<SortOrder::Descending>(
3421
+ decomposer_check_t{},
3422
+ d_temp_storage,
3423
+ temp_storage_bytes,
3424
+ is_overwrite_okay,
3425
+ d_keys,
3426
+ d_values,
3427
+ static_cast<offset_t>(num_items),
3428
+ decomposer,
3429
+ begin_bit,
3430
+ end_bit,
3431
+ stream);
3432
+ }
3433
+
3434
+ //! @} end member group
3435
+ };
3436
+
3437
+ CUB_NAMESPACE_END