cuda-cccl 0.3.4__cp311-cp311-manylinux_2_24_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cuda-cccl might be problematic. Click here for more details.

Files changed (1926):
  1. cuda/cccl/__init__.py +27 -0
  2. cuda/cccl/_cuda_version_utils.py +24 -0
  3. cuda/cccl/cooperative/__init__.py +9 -0
  4. cuda/cccl/cooperative/experimental/__init__.py +24 -0
  5. cuda/cccl/headers/__init__.py +7 -0
  6. cuda/cccl/headers/include/__init__.py +1 -0
  7. cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +233 -0
  8. cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +1158 -0
  9. cuda/cccl/headers/include/cub/agent/agent_for.cuh +55 -0
  10. cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +677 -0
  11. cuda/cccl/headers/include/cub/agent/agent_merge.cuh +234 -0
  12. cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +722 -0
  13. cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +761 -0
  14. cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +282 -0
  15. cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +702 -0
  16. cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +552 -0
  17. cuda/cccl/headers/include/cub/agent/agent_reduce.cuh +592 -0
  18. cuda/cccl/headers/include/cub/agent/agent_reduce_by_key.cuh +780 -0
  19. cuda/cccl/headers/include/cub/agent/agent_rle.cuh +1095 -0
  20. cuda/cccl/headers/include/cub/agent/agent_scan.cuh +562 -0
  21. cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +448 -0
  22. cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +263 -0
  23. cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +1088 -0
  24. cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +320 -0
  25. cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +584 -0
  26. cuda/cccl/headers/include/cub/agent/agent_topk.cuh +762 -0
  27. cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +605 -0
  28. cuda/cccl/headers/include/cub/agent/single_pass_scan_operators.cuh +1399 -0
  29. cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +939 -0
  30. cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +1203 -0
  31. cuda/cccl/headers/include/cub/block/block_exchange.cuh +1279 -0
  32. cuda/cccl/headers/include/cub/block/block_histogram.cuh +400 -0
  33. cuda/cccl/headers/include/cub/block/block_load.cuh +1242 -0
  34. cuda/cccl/headers/include/cub/block/block_load_to_shared.cuh +416 -0
  35. cuda/cccl/headers/include/cub/block/block_merge_sort.cuh +771 -0
  36. cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +1203 -0
  37. cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +2132 -0
  38. cuda/cccl/headers/include/cub/block/block_raking_layout.cuh +126 -0
  39. cuda/cccl/headers/include/cub/block/block_reduce.cuh +642 -0
  40. cuda/cccl/headers/include/cub/block/block_run_length_decode.cuh +406 -0
  41. cuda/cccl/headers/include/cub/block/block_scan.cuh +2287 -0
  42. cuda/cccl/headers/include/cub/block/block_shuffle.cuh +322 -0
  43. cuda/cccl/headers/include/cub/block/block_store.cuh +1223 -0
  44. cuda/cccl/headers/include/cub/block/radix_rank_sort_operations.cuh +597 -0
  45. cuda/cccl/headers/include/cub/block/specializations/block_histogram_atomic.cuh +62 -0
  46. cuda/cccl/headers/include/cub/block/specializations/block_histogram_sort.cuh +216 -0
  47. cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking.cuh +230 -0
  48. cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking_commutative_only.cuh +214 -0
  49. cuda/cccl/headers/include/cub/block/specializations/block_reduce_warp_reductions.cuh +257 -0
  50. cuda/cccl/headers/include/cub/block/specializations/block_scan_raking.cuh +766 -0
  51. cuda/cccl/headers/include/cub/block/specializations/block_scan_warp_scans.cuh +514 -0
  52. cuda/cccl/headers/include/cub/config.cuh +29 -0
  53. cuda/cccl/headers/include/cub/cub.cuh +96 -0
  54. cuda/cccl/headers/include/cub/detail/array_utils.cuh +54 -0
  55. cuda/cccl/headers/include/cub/detail/choose_offset.cuh +135 -0
  56. cuda/cccl/headers/include/cub/detail/detect_cuda_runtime.cuh +50 -0
  57. cuda/cccl/headers/include/cub/detail/device_double_buffer.cuh +94 -0
  58. cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +60 -0
  59. cuda/cccl/headers/include/cub/detail/fast_modulo_division.cuh +227 -0
  60. cuda/cccl/headers/include/cub/detail/integer_utils.cuh +86 -0
  61. cuda/cccl/headers/include/cub/detail/launcher/cuda_driver.cuh +140 -0
  62. cuda/cccl/headers/include/cub/detail/launcher/cuda_runtime.cuh +98 -0
  63. cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +112 -0
  64. cuda/cccl/headers/include/cub/detail/ptx-json/README.md +66 -0
  65. cuda/cccl/headers/include/cub/detail/ptx-json/array.h +41 -0
  66. cuda/cccl/headers/include/cub/detail/ptx-json/json.h +39 -0
  67. cuda/cccl/headers/include/cub/detail/ptx-json/object.h +71 -0
  68. cuda/cccl/headers/include/cub/detail/ptx-json/string.h +79 -0
  69. cuda/cccl/headers/include/cub/detail/ptx-json/value.h +95 -0
  70. cuda/cccl/headers/include/cub/detail/ptx-json-parser.h +39 -0
  71. cuda/cccl/headers/include/cub/detail/rfa.cuh +706 -0
  72. cuda/cccl/headers/include/cub/detail/strong_load.cuh +163 -0
  73. cuda/cccl/headers/include/cub/detail/strong_store.cuh +194 -0
  74. cuda/cccl/headers/include/cub/detail/temporary_storage.cuh +377 -0
  75. cuda/cccl/headers/include/cub/detail/type_traits.cuh +185 -0
  76. cuda/cccl/headers/include/cub/detail/uninitialized_copy.cuh +48 -0
  77. cuda/cccl/headers/include/cub/detail/unsafe_bitcast.cuh +33 -0
  78. cuda/cccl/headers/include/cub/device/device_adjacent_difference.cuh +572 -0
  79. cuda/cccl/headers/include/cub/device/device_copy.cuh +276 -0
  80. cuda/cccl/headers/include/cub/device/device_for.cuh +1061 -0
  81. cuda/cccl/headers/include/cub/device/device_histogram.cuh +1485 -0
  82. cuda/cccl/headers/include/cub/device/device_memcpy.cuh +171 -0
  83. cuda/cccl/headers/include/cub/device/device_merge.cuh +203 -0
  84. cuda/cccl/headers/include/cub/device/device_merge_sort.cuh +955 -0
  85. cuda/cccl/headers/include/cub/device/device_partition.cuh +644 -0
  86. cuda/cccl/headers/include/cub/device/device_radix_sort.cuh +3413 -0
  87. cuda/cccl/headers/include/cub/device/device_reduce.cuh +2497 -0
  88. cuda/cccl/headers/include/cub/device/device_run_length_encode.cuh +346 -0
  89. cuda/cccl/headers/include/cub/device/device_scan.cuh +2187 -0
  90. cuda/cccl/headers/include/cub/device/device_segmented_radix_sort.cuh +1472 -0
  91. cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +1406 -0
  92. cuda/cccl/headers/include/cub/device/device_segmented_sort.cuh +2787 -0
  93. cuda/cccl/headers/include/cub/device/device_select.cuh +1204 -0
  94. cuda/cccl/headers/include/cub/device/device_topk.cuh +511 -0
  95. cuda/cccl/headers/include/cub/device/device_transform.cuh +668 -0
  96. cuda/cccl/headers/include/cub/device/dispatch/dispatch_adjacent_difference.cuh +289 -0
  97. cuda/cccl/headers/include/cub/device/dispatch/dispatch_batch_memcpy.cuh +694 -0
  98. cuda/cccl/headers/include/cub/device/dispatch/dispatch_common.cuh +43 -0
  99. cuda/cccl/headers/include/cub/device/dispatch/dispatch_copy_mdspan.cuh +77 -0
  100. cuda/cccl/headers/include/cub/device/dispatch/dispatch_for.cuh +172 -0
  101. cuda/cccl/headers/include/cub/device/dispatch/dispatch_histogram.cuh +1026 -0
  102. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +303 -0
  103. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge_sort.cuh +449 -0
  104. cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +1719 -0
  105. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +1283 -0
  106. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_by_key.cuh +629 -0
  107. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +504 -0
  108. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +312 -0
  109. cuda/cccl/headers/include/cub/device/dispatch/dispatch_rle.cuh +603 -0
  110. cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan.cuh +491 -0
  111. cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan_by_key.cuh +577 -0
  112. cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +951 -0
  113. cuda/cccl/headers/include/cub/device/dispatch/dispatch_select_if.cuh +818 -0
  114. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +339 -0
  115. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +455 -0
  116. cuda/cccl/headers/include/cub/device/dispatch/dispatch_three_way_partition.cuh +364 -0
  117. cuda/cccl/headers/include/cub/device/dispatch/dispatch_topk.cuh +626 -0
  118. cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +541 -0
  119. cuda/cccl/headers/include/cub/device/dispatch/dispatch_unique_by_key.cuh +521 -0
  120. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_for_each.cuh +259 -0
  121. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_histogram.cuh +497 -0
  122. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_merge_sort.cuh +332 -0
  123. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_radix_sort.cuh +801 -0
  124. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_reduce.cuh +557 -0
  125. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_scan.cuh +163 -0
  126. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_segmented_reduce.cuh +295 -0
  127. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_segmented_sort.cuh +521 -0
  128. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_three_way_partition.cuh +200 -0
  129. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_transform.cuh +1028 -0
  130. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_unique_by_key.cuh +175 -0
  131. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +43 -0
  132. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +94 -0
  133. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +34 -0
  134. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +255 -0
  135. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +52 -0
  136. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +100 -0
  137. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +1063 -0
  138. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce.cuh +468 -0
  139. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +918 -0
  140. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +647 -0
  141. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +594 -0
  142. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +986 -0
  143. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +373 -0
  144. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_select_if.cuh +1563 -0
  145. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +415 -0
  146. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_topk.cuh +84 -0
  147. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +456 -0
  148. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +858 -0
  149. cuda/cccl/headers/include/cub/grid/grid_even_share.cuh +203 -0
  150. cuda/cccl/headers/include/cub/grid/grid_mapping.cuh +82 -0
  151. cuda/cccl/headers/include/cub/grid/grid_queue.cuh +178 -0
  152. cuda/cccl/headers/include/cub/iterator/arg_index_input_iterator.cuh +230 -0
  153. cuda/cccl/headers/include/cub/iterator/cache_modified_input_iterator.cuh +235 -0
  154. cuda/cccl/headers/include/cub/iterator/cache_modified_output_iterator.cuh +226 -0
  155. cuda/cccl/headers/include/cub/iterator/tex_obj_input_iterator.cuh +296 -0
  156. cuda/cccl/headers/include/cub/thread/thread_load.cuh +324 -0
  157. cuda/cccl/headers/include/cub/thread/thread_operators.cuh +664 -0
  158. cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +525 -0
  159. cuda/cccl/headers/include/cub/thread/thread_scan.cuh +472 -0
  160. cuda/cccl/headers/include/cub/thread/thread_search.cuh +175 -0
  161. cuda/cccl/headers/include/cub/thread/thread_simd.cuh +456 -0
  162. cuda/cccl/headers/include/cub/thread/thread_sort.cuh +78 -0
  163. cuda/cccl/headers/include/cub/thread/thread_store.cuh +341 -0
  164. cuda/cccl/headers/include/cub/util_allocator.cuh +897 -0
  165. cuda/cccl/headers/include/cub/util_arch.cuh +141 -0
  166. cuda/cccl/headers/include/cub/util_cpp_dialect.cuh +71 -0
  167. cuda/cccl/headers/include/cub/util_debug.cuh +183 -0
  168. cuda/cccl/headers/include/cub/util_device.cuh +759 -0
  169. cuda/cccl/headers/include/cub/util_macro.cuh +73 -0
  170. cuda/cccl/headers/include/cub/util_math.cuh +92 -0
  171. cuda/cccl/headers/include/cub/util_namespace.cuh +151 -0
  172. cuda/cccl/headers/include/cub/util_policy_wrapper_t.cuh +31 -0
  173. cuda/cccl/headers/include/cub/util_ptx.cuh +489 -0
  174. cuda/cccl/headers/include/cub/util_temporary_storage.cuh +96 -0
  175. cuda/cccl/headers/include/cub/util_type.cuh +1093 -0
  176. cuda/cccl/headers/include/cub/util_vsmem.cuh +227 -0
  177. cuda/cccl/headers/include/cub/version.cuh +65 -0
  178. cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_shfl.cuh +304 -0
  179. cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_smem.cuh +152 -0
  180. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +713 -0
  181. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +378 -0
  182. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +928 -0
  183. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_smem.cuh +691 -0
  184. cuda/cccl/headers/include/cub/warp/warp_exchange.cuh +381 -0
  185. cuda/cccl/headers/include/cub/warp/warp_load.cuh +591 -0
  186. cuda/cccl/headers/include/cub/warp/warp_merge_sort.cuh +145 -0
  187. cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +810 -0
  188. cuda/cccl/headers/include/cub/warp/warp_scan.cuh +1866 -0
  189. cuda/cccl/headers/include/cub/warp/warp_store.cuh +498 -0
  190. cuda/cccl/headers/include/cub/warp/warp_utils.cuh +59 -0
  191. cuda/cccl/headers/include/cuda/__algorithm/common.h +68 -0
  192. cuda/cccl/headers/include/cuda/__algorithm/copy.h +196 -0
  193. cuda/cccl/headers/include/cuda/__algorithm/fill.h +107 -0
  194. cuda/cccl/headers/include/cuda/__annotated_ptr/access_property.h +165 -0
  195. cuda/cccl/headers/include/cuda/__annotated_ptr/access_property_encoding.h +172 -0
  196. cuda/cccl/headers/include/cuda/__annotated_ptr/annotated_ptr.h +217 -0
  197. cuda/cccl/headers/include/cuda/__annotated_ptr/annotated_ptr_base.h +100 -0
  198. cuda/cccl/headers/include/cuda/__annotated_ptr/apply_access_property.h +83 -0
  199. cuda/cccl/headers/include/cuda/__annotated_ptr/associate_access_property.h +128 -0
  200. cuda/cccl/headers/include/cuda/__annotated_ptr/createpolicy.h +210 -0
  201. cuda/cccl/headers/include/cuda/__atomic/atomic.h +145 -0
  202. cuda/cccl/headers/include/cuda/__barrier/async_contract_fulfillment.h +39 -0
  203. cuda/cccl/headers/include/cuda/__barrier/barrier.h +65 -0
  204. cuda/cccl/headers/include/cuda/__barrier/barrier_arrive_tx.h +101 -0
  205. cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +529 -0
  206. cuda/cccl/headers/include/cuda/__barrier/barrier_expect_tx.h +73 -0
  207. cuda/cccl/headers/include/cuda/__barrier/barrier_thread_scope.h +60 -0
  208. cuda/cccl/headers/include/cuda/__bit/bit_reverse.h +171 -0
  209. cuda/cccl/headers/include/cuda/__bit/bitfield.h +122 -0
  210. cuda/cccl/headers/include/cuda/__bit/bitmask.h +90 -0
  211. cuda/cccl/headers/include/cuda/__cccl_config +37 -0
  212. cuda/cccl/headers/include/cuda/__cmath/ceil_div.h +124 -0
  213. cuda/cccl/headers/include/cuda/__cmath/fast_modulo_division.h +178 -0
  214. cuda/cccl/headers/include/cuda/__cmath/ilog.h +195 -0
  215. cuda/cccl/headers/include/cuda/__cmath/ipow.h +107 -0
  216. cuda/cccl/headers/include/cuda/__cmath/isqrt.h +80 -0
  217. cuda/cccl/headers/include/cuda/__cmath/mul_hi.h +146 -0
  218. cuda/cccl/headers/include/cuda/__cmath/neg.h +47 -0
  219. cuda/cccl/headers/include/cuda/__cmath/pow2.h +74 -0
  220. cuda/cccl/headers/include/cuda/__cmath/round_down.h +102 -0
  221. cuda/cccl/headers/include/cuda/__cmath/round_up.h +104 -0
  222. cuda/cccl/headers/include/cuda/__cmath/uabs.h +57 -0
  223. cuda/cccl/headers/include/cuda/__complex/complex.h +238 -0
  224. cuda/cccl/headers/include/cuda/__complex/get_real_imag.h +89 -0
  225. cuda/cccl/headers/include/cuda/__complex/traits.h +64 -0
  226. cuda/cccl/headers/include/cuda/__complex_ +28 -0
  227. cuda/cccl/headers/include/cuda/__device/all_devices.h +140 -0
  228. cuda/cccl/headers/include/cuda/__device/arch_id.h +176 -0
  229. cuda/cccl/headers/include/cuda/__device/arch_traits.h +537 -0
  230. cuda/cccl/headers/include/cuda/__device/attributes.h +772 -0
  231. cuda/cccl/headers/include/cuda/__device/compute_capability.h +171 -0
  232. cuda/cccl/headers/include/cuda/__device/device_ref.h +156 -0
  233. cuda/cccl/headers/include/cuda/__device/physical_device.h +172 -0
  234. cuda/cccl/headers/include/cuda/__driver/driver_api.h +848 -0
  235. cuda/cccl/headers/include/cuda/__event/event.h +171 -0
  236. cuda/cccl/headers/include/cuda/__event/event_ref.h +157 -0
  237. cuda/cccl/headers/include/cuda/__event/timed_event.h +120 -0
  238. cuda/cccl/headers/include/cuda/__execution/determinism.h +89 -0
  239. cuda/cccl/headers/include/cuda/__execution/output_ordering.h +87 -0
  240. cuda/cccl/headers/include/cuda/__execution/require.h +75 -0
  241. cuda/cccl/headers/include/cuda/__execution/tune.h +70 -0
  242. cuda/cccl/headers/include/cuda/__functional/address_stability.h +131 -0
  243. cuda/cccl/headers/include/cuda/__functional/for_each_canceled.h +321 -0
  244. cuda/cccl/headers/include/cuda/__functional/maximum.h +76 -0
  245. cuda/cccl/headers/include/cuda/__functional/minimum.h +76 -0
  246. cuda/cccl/headers/include/cuda/__functional/minimum_maximum_common.h +52 -0
  247. cuda/cccl/headers/include/cuda/__functional/proclaim_return_type.h +106 -0
  248. cuda/cccl/headers/include/cuda/__fwd/barrier.h +38 -0
  249. cuda/cccl/headers/include/cuda/__fwd/complex.h +48 -0
  250. cuda/cccl/headers/include/cuda/__fwd/devices.h +44 -0
  251. cuda/cccl/headers/include/cuda/__fwd/get_stream.h +38 -0
  252. cuda/cccl/headers/include/cuda/__fwd/pipeline.h +37 -0
  253. cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +58 -0
  254. cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +315 -0
  255. cuda/cccl/headers/include/cuda/__iterator/counting_iterator.h +492 -0
  256. cuda/cccl/headers/include/cuda/__iterator/discard_iterator.h +324 -0
  257. cuda/cccl/headers/include/cuda/__iterator/permutation_iterator.h +456 -0
  258. cuda/cccl/headers/include/cuda/__iterator/shuffle_iterator.h +334 -0
  259. cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +418 -0
  260. cuda/cccl/headers/include/cuda/__iterator/tabulate_output_iterator.h +367 -0
  261. cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +528 -0
  262. cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +527 -0
  263. cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +486 -0
  264. cuda/cccl/headers/include/cuda/__iterator/zip_common.h +148 -0
  265. cuda/cccl/headers/include/cuda/__iterator/zip_function.h +114 -0
  266. cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +557 -0
  267. cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +592 -0
  268. cuda/cccl/headers/include/cuda/__latch/latch.h +44 -0
  269. cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +532 -0
  270. cuda/cccl/headers/include/cuda/__mdspan/host_device_mdspan.h +238 -0
  271. cuda/cccl/headers/include/cuda/__mdspan/restrict_accessor.h +152 -0
  272. cuda/cccl/headers/include/cuda/__mdspan/restrict_mdspan.h +117 -0
  273. cuda/cccl/headers/include/cuda/__memcpy_async/check_preconditions.h +79 -0
  274. cuda/cccl/headers/include/cuda/__memcpy_async/completion_mechanism.h +47 -0
  275. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_bulk_shared_global.h +81 -0
  276. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_fallback.h +72 -0
  277. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_shared_global.h +148 -0
  278. cuda/cccl/headers/include/cuda/__memcpy_async/dispatch_memcpy_async.h +165 -0
  279. cuda/cccl/headers/include/cuda/__memcpy_async/elect_one.h +52 -0
  280. cuda/cccl/headers/include/cuda/__memcpy_async/is_local_smem_barrier.h +53 -0
  281. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async.h +179 -0
  282. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_barrier.h +99 -0
  283. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_tx.h +103 -0
  284. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_completion.h +164 -0
  285. cuda/cccl/headers/include/cuda/__memcpy_async/try_get_barrier_handle.h +58 -0
  286. cuda/cccl/headers/include/cuda/__memory/address_space.h +227 -0
  287. cuda/cccl/headers/include/cuda/__memory/align_down.h +56 -0
  288. cuda/cccl/headers/include/cuda/__memory/align_up.h +56 -0
  289. cuda/cccl/headers/include/cuda/__memory/aligned_size.h +61 -0
  290. cuda/cccl/headers/include/cuda/__memory/check_address.h +111 -0
  291. cuda/cccl/headers/include/cuda/__memory/discard_memory.h +64 -0
  292. cuda/cccl/headers/include/cuda/__memory/get_device_address.h +58 -0
  293. cuda/cccl/headers/include/cuda/__memory/is_aligned.h +47 -0
  294. cuda/cccl/headers/include/cuda/__memory/ptr_in_range.h +93 -0
  295. cuda/cccl/headers/include/cuda/__memory/ptr_rebind.h +75 -0
  296. cuda/cccl/headers/include/cuda/__memory/ranges_overlap.h +126 -0
  297. cuda/cccl/headers/include/cuda/__memory_resource/any_resource.h +898 -0
  298. cuda/cccl/headers/include/cuda/__memory_resource/device_memory_pool.h +149 -0
  299. cuda/cccl/headers/include/cuda/__memory_resource/get_memory_resource.h +82 -0
  300. cuda/cccl/headers/include/cuda/__memory_resource/get_property.h +153 -0
  301. cuda/cccl/headers/include/cuda/__memory_resource/legacy_managed_memory_resource.h +148 -0
  302. cuda/cccl/headers/include/cuda/__memory_resource/legacy_pinned_memory_resource.h +139 -0
  303. cuda/cccl/headers/include/cuda/__memory_resource/managed_memory_pool.h +146 -0
  304. cuda/cccl/headers/include/cuda/__memory_resource/memory_resource_base.h +578 -0
  305. cuda/cccl/headers/include/cuda/__memory_resource/pinned_memory_pool.h +188 -0
  306. cuda/cccl/headers/include/cuda/__memory_resource/properties.h +113 -0
  307. cuda/cccl/headers/include/cuda/__memory_resource/resource.h +159 -0
  308. cuda/cccl/headers/include/cuda/__numeric/add_overflow.h +316 -0
  309. cuda/cccl/headers/include/cuda/__numeric/div_overflow.h +150 -0
  310. cuda/cccl/headers/include/cuda/__numeric/narrow.h +108 -0
  311. cuda/cccl/headers/include/cuda/__numeric/overflow_cast.h +59 -0
  312. cuda/cccl/headers/include/cuda/__numeric/overflow_result.h +43 -0
  313. cuda/cccl/headers/include/cuda/__numeric/sub_overflow.h +344 -0
  314. cuda/cccl/headers/include/cuda/__nvtx/nvtx.h +120 -0
  315. cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +2977 -0
  316. cuda/cccl/headers/include/cuda/__ptx/instructions/barrier_cluster.h +43 -0
  317. cuda/cccl/headers/include/cuda/__ptx/instructions/bfind.h +41 -0
  318. cuda/cccl/headers/include/cuda/__ptx/instructions/bmsk.h +41 -0
  319. cuda/cccl/headers/include/cuda/__ptx/instructions/clusterlaunchcontrol.h +41 -0
  320. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk.h +44 -0
  321. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_commit_group.h +43 -0
  322. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h +45 -0
  323. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_wait_group.h +43 -0
  324. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_mbarrier_arrive.h +42 -0
  325. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_reduce_async_bulk.h +60 -0
  326. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_reduce_async_bulk_tensor.h +43 -0
  327. cuda/cccl/headers/include/cuda/__ptx/instructions/elect_sync.h +41 -0
  328. cuda/cccl/headers/include/cuda/__ptx/instructions/exit.h +41 -0
  329. cuda/cccl/headers/include/cuda/__ptx/instructions/fence.h +49 -0
  330. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/barrier_cluster.h +115 -0
  331. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/bfind.h +190 -0
  332. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/bmsk.h +54 -0
  333. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/clusterlaunchcontrol.h +242 -0
  334. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk.h +197 -0
  335. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.h +25 -0
  336. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.h +54 -0
  337. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.h +997 -0
  338. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_gather_scatter.h +318 -0
  339. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.h +671 -0
  340. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.h +46 -0
  341. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive.h +26 -0
  342. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive_noinc.h +26 -0
  343. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.h +1470 -0
  344. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.h +132 -0
  345. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.h +132 -0
  346. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.h +601 -0
  347. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/elect_sync.h +36 -0
  348. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/exit.h +25 -0
  349. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence.h +208 -0
  350. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.h +31 -0
  351. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_alias.h +25 -0
  352. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_async.h +58 -0
  353. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_async_generic_sync_restrict.h +64 -0
  354. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.h +102 -0
  355. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_sync_restrict.h +64 -0
  356. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/get_sreg.h +949 -0
  357. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/getctarank.h +32 -0
  358. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/ld.h +5542 -0
  359. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive.h +399 -0
  360. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.h +184 -0
  361. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.h +34 -0
  362. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_expect_tx.h +102 -0
  363. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_init.h +27 -0
  364. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.h +143 -0
  365. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.h +144 -0
  366. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.h +286 -0
  367. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.h +290 -0
  368. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_ld_reduce.h +2202 -0
  369. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_red.h +1362 -0
  370. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_st.h +236 -0
  371. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/prmt.h +230 -0
  372. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/red_async.h +460 -0
  373. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/shl.h +96 -0
  374. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/shr.h +168 -0
  375. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st.h +1490 -0
  376. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st_async.h +123 -0
  377. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st_bulk.h +31 -0
  378. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_alloc.h +132 -0
  379. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_commit.h +99 -0
  380. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_cp.h +765 -0
  381. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_fence.h +58 -0
  382. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_ld.h +4927 -0
  383. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_mma.h +4291 -0
  384. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_mma_ws.h +7110 -0
  385. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_shift.h +42 -0
  386. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_st.h +5063 -0
  387. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_wait.h +56 -0
  388. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.h +71 -0
  389. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tensormap_replace.h +1030 -0
  390. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/trap.h +25 -0
  391. cuda/cccl/headers/include/cuda/__ptx/instructions/get_sreg.h +43 -0
  392. cuda/cccl/headers/include/cuda/__ptx/instructions/getctarank.h +43 -0
  393. cuda/cccl/headers/include/cuda/__ptx/instructions/ld.h +41 -0
  394. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_arrive.h +45 -0
  395. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_expect_tx.h +41 -0
  396. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_init.h +43 -0
  397. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_wait.h +46 -0
  398. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_ld_reduce.h +41 -0
  399. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_red.h +41 -0
  400. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_st.h +41 -0
  401. cuda/cccl/headers/include/cuda/__ptx/instructions/prmt.h +41 -0
  402. cuda/cccl/headers/include/cuda/__ptx/instructions/red_async.h +43 -0
  403. cuda/cccl/headers/include/cuda/__ptx/instructions/shfl_sync.h +244 -0
  404. cuda/cccl/headers/include/cuda/__ptx/instructions/shl.h +41 -0
  405. cuda/cccl/headers/include/cuda/__ptx/instructions/shr.h +41 -0
  406. cuda/cccl/headers/include/cuda/__ptx/instructions/st.h +41 -0
  407. cuda/cccl/headers/include/cuda/__ptx/instructions/st_async.h +43 -0
  408. cuda/cccl/headers/include/cuda/__ptx/instructions/st_bulk.h +41 -0
  409. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_alloc.h +41 -0
  410. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_commit.h +41 -0
  411. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_cp.h +41 -0
  412. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_fence.h +41 -0
  413. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_ld.h +41 -0
  414. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_mma.h +41 -0
  415. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_mma_ws.h +41 -0
  416. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_shift.h +41 -0
  417. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_st.h +41 -0
  418. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_wait.h +41 -0
  419. cuda/cccl/headers/include/cuda/__ptx/instructions/tensormap_cp_fenceproxy.h +43 -0
  420. cuda/cccl/headers/include/cuda/__ptx/instructions/tensormap_replace.h +43 -0
  421. cuda/cccl/headers/include/cuda/__ptx/instructions/trap.h +41 -0
  422. cuda/cccl/headers/include/cuda/__ptx/pragmas/enable_smem_spilling.h +47 -0
  423. cuda/cccl/headers/include/cuda/__ptx/ptx_dot_variants.h +230 -0
  424. cuda/cccl/headers/include/cuda/__ptx/ptx_helper_functions.h +176 -0
  425. cuda/cccl/headers/include/cuda/__random/feistel_bijection.h +105 -0
  426. cuda/cccl/headers/include/cuda/__random/random_bijection.h +88 -0
  427. cuda/cccl/headers/include/cuda/__runtime/api_wrapper.h +62 -0
  428. cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +99 -0
  429. cuda/cccl/headers/include/cuda/__runtime/types.h +41 -0
  430. cuda/cccl/headers/include/cuda/__semaphore/counting_semaphore.h +53 -0
  431. cuda/cccl/headers/include/cuda/__stream/get_stream.h +109 -0
  432. cuda/cccl/headers/include/cuda/__stream/internal_streams.h +44 -0
  433. cuda/cccl/headers/include/cuda/__stream/stream.h +141 -0
  434. cuda/cccl/headers/include/cuda/__stream/stream_ref.h +303 -0
  435. cuda/cccl/headers/include/cuda/__type_traits/is_floating_point.h +47 -0
  436. cuda/cccl/headers/include/cuda/__type_traits/is_specialization_of.h +37 -0
  437. cuda/cccl/headers/include/cuda/__utility/__basic_any/access.h +88 -0
  438. cuda/cccl/headers/include/cuda/__utility/__basic_any/any_cast.h +83 -0
  439. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_base.h +148 -0
  440. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_from.h +96 -0
  441. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_fwd.h +128 -0
  442. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_ptr.h +304 -0
  443. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_ref.h +337 -0
  444. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_value.h +591 -0
  445. cuda/cccl/headers/include/cuda/__utility/__basic_any/conversions.h +169 -0
  446. cuda/cccl/headers/include/cuda/__utility/__basic_any/dynamic_any_cast.h +107 -0
  447. cuda/cccl/headers/include/cuda/__utility/__basic_any/interfaces.h +359 -0
  448. cuda/cccl/headers/include/cuda/__utility/__basic_any/iset.h +142 -0
  449. cuda/cccl/headers/include/cuda/__utility/__basic_any/overrides.h +64 -0
  450. cuda/cccl/headers/include/cuda/__utility/__basic_any/rtti.h +257 -0
  451. cuda/cccl/headers/include/cuda/__utility/__basic_any/semiregular.h +323 -0
  452. cuda/cccl/headers/include/cuda/__utility/__basic_any/storage.h +79 -0
  453. cuda/cccl/headers/include/cuda/__utility/__basic_any/tagged_ptr.h +58 -0
  454. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtcall.h +163 -0
  455. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_functions.h +184 -0
  456. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_ptrs.h +80 -0
  457. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_tables.h +155 -0
  458. cuda/cccl/headers/include/cuda/__utility/basic_any.h +507 -0
  459. cuda/cccl/headers/include/cuda/__utility/immovable.h +50 -0
  460. cuda/cccl/headers/include/cuda/__utility/in_range.h +65 -0
  461. cuda/cccl/headers/include/cuda/__utility/inherit.h +36 -0
  462. cuda/cccl/headers/include/cuda/__utility/no_init.h +29 -0
  463. cuda/cccl/headers/include/cuda/__utility/static_for.h +79 -0
  464. cuda/cccl/headers/include/cuda/__warp/lane_mask.h +326 -0
  465. cuda/cccl/headers/include/cuda/__warp/warp_match_all.h +65 -0
  466. cuda/cccl/headers/include/cuda/__warp/warp_shuffle.h +251 -0
  467. cuda/cccl/headers/include/cuda/access_property +26 -0
  468. cuda/cccl/headers/include/cuda/algorithm +27 -0
  469. cuda/cccl/headers/include/cuda/annotated_ptr +29 -0
  470. cuda/cccl/headers/include/cuda/atomic +27 -0
  471. cuda/cccl/headers/include/cuda/barrier +293 -0
  472. cuda/cccl/headers/include/cuda/bit +29 -0
  473. cuda/cccl/headers/include/cuda/cmath +37 -0
  474. cuda/cccl/headers/include/cuda/devices +33 -0
  475. cuda/cccl/headers/include/cuda/discard_memory +32 -0
  476. cuda/cccl/headers/include/cuda/functional +32 -0
  477. cuda/cccl/headers/include/cuda/iterator +39 -0
  478. cuda/cccl/headers/include/cuda/latch +27 -0
  479. cuda/cccl/headers/include/cuda/mdspan +28 -0
  480. cuda/cccl/headers/include/cuda/memory +36 -0
  481. cuda/cccl/headers/include/cuda/memory_resource +40 -0
  482. cuda/cccl/headers/include/cuda/numeric +31 -0
  483. cuda/cccl/headers/include/cuda/pipeline +580 -0
  484. cuda/cccl/headers/include/cuda/ptx +129 -0
  485. cuda/cccl/headers/include/cuda/semaphore +31 -0
  486. cuda/cccl/headers/include/cuda/std/__algorithm/adjacent_find.h +59 -0
  487. cuda/cccl/headers/include/cuda/std/__algorithm/all_of.h +45 -0
  488. cuda/cccl/headers/include/cuda/std/__algorithm/any_of.h +45 -0
  489. cuda/cccl/headers/include/cuda/std/__algorithm/binary_search.h +53 -0
  490. cuda/cccl/headers/include/cuda/std/__algorithm/clamp.h +48 -0
  491. cuda/cccl/headers/include/cuda/std/__algorithm/comp.h +58 -0
  492. cuda/cccl/headers/include/cuda/std/__algorithm/comp_ref_type.h +85 -0
  493. cuda/cccl/headers/include/cuda/std/__algorithm/copy.h +142 -0
  494. cuda/cccl/headers/include/cuda/std/__algorithm/copy_backward.h +80 -0
  495. cuda/cccl/headers/include/cuda/std/__algorithm/copy_if.h +47 -0
  496. cuda/cccl/headers/include/cuda/std/__algorithm/copy_n.h +73 -0
  497. cuda/cccl/headers/include/cuda/std/__algorithm/count.h +49 -0
  498. cuda/cccl/headers/include/cuda/std/__algorithm/count_if.h +49 -0
  499. cuda/cccl/headers/include/cuda/std/__algorithm/equal.h +128 -0
  500. cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +101 -0
  501. cuda/cccl/headers/include/cuda/std/__algorithm/fill.h +58 -0
  502. cuda/cccl/headers/include/cuda/std/__algorithm/fill_n.h +51 -0
  503. cuda/cccl/headers/include/cuda/std/__algorithm/find.h +62 -0
  504. cuda/cccl/headers/include/cuda/std/__algorithm/find_end.h +225 -0
  505. cuda/cccl/headers/include/cuda/std/__algorithm/find_first_of.h +73 -0
  506. cuda/cccl/headers/include/cuda/std/__algorithm/find_if.h +46 -0
  507. cuda/cccl/headers/include/cuda/std/__algorithm/find_if_not.h +46 -0
  508. cuda/cccl/headers/include/cuda/std/__algorithm/for_each.h +42 -0
  509. cuda/cccl/headers/include/cuda/std/__algorithm/for_each_n.h +48 -0
  510. cuda/cccl/headers/include/cuda/std/__algorithm/generate.h +41 -0
  511. cuda/cccl/headers/include/cuda/std/__algorithm/generate_n.h +46 -0
  512. cuda/cccl/headers/include/cuda/std/__algorithm/half_positive.h +49 -0
  513. cuda/cccl/headers/include/cuda/std/__algorithm/in_fun_result.h +55 -0
  514. cuda/cccl/headers/include/cuda/std/__algorithm/includes.h +90 -0
  515. cuda/cccl/headers/include/cuda/std/__algorithm/is_heap.h +50 -0
  516. cuda/cccl/headers/include/cuda/std/__algorithm/is_heap_until.h +83 -0
  517. cuda/cccl/headers/include/cuda/std/__algorithm/is_partitioned.h +57 -0
  518. cuda/cccl/headers/include/cuda/std/__algorithm/is_permutation.h +252 -0
  519. cuda/cccl/headers/include/cuda/std/__algorithm/is_sorted.h +49 -0
  520. cuda/cccl/headers/include/cuda/std/__algorithm/is_sorted_until.h +68 -0
  521. cuda/cccl/headers/include/cuda/std/__algorithm/iter_swap.h +82 -0
  522. cuda/cccl/headers/include/cuda/std/__algorithm/iterator_operations.h +185 -0
  523. cuda/cccl/headers/include/cuda/std/__algorithm/lexicographical_compare.h +68 -0
  524. cuda/cccl/headers/include/cuda/std/__algorithm/lower_bound.h +82 -0
  525. cuda/cccl/headers/include/cuda/std/__algorithm/make_heap.h +70 -0
  526. cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +88 -0
  527. cuda/cccl/headers/include/cuda/std/__algorithm/max.h +62 -0
  528. cuda/cccl/headers/include/cuda/std/__algorithm/max_element.h +67 -0
  529. cuda/cccl/headers/include/cuda/std/__algorithm/merge.h +89 -0
  530. cuda/cccl/headers/include/cuda/std/__algorithm/min.h +62 -0
  531. cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +87 -0
  532. cuda/cccl/headers/include/cuda/std/__algorithm/minmax.h +66 -0
  533. cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +139 -0
  534. cuda/cccl/headers/include/cuda/std/__algorithm/mismatch.h +83 -0
  535. cuda/cccl/headers/include/cuda/std/__algorithm/move.h +86 -0
  536. cuda/cccl/headers/include/cuda/std/__algorithm/move_backward.h +84 -0
  537. cuda/cccl/headers/include/cuda/std/__algorithm/next_permutation.h +88 -0
  538. cuda/cccl/headers/include/cuda/std/__algorithm/none_of.h +45 -0
  539. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort.h +102 -0
  540. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort_copy.h +122 -0
  541. cuda/cccl/headers/include/cuda/std/__algorithm/partition.h +120 -0
  542. cuda/cccl/headers/include/cuda/std/__algorithm/partition_copy.h +59 -0
  543. cuda/cccl/headers/include/cuda/std/__algorithm/partition_point.h +61 -0
  544. cuda/cccl/headers/include/cuda/std/__algorithm/pop_heap.h +93 -0
  545. cuda/cccl/headers/include/cuda/std/__algorithm/prev_permutation.h +88 -0
  546. cuda/cccl/headers/include/cuda/std/__algorithm/push_heap.h +100 -0
  547. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_for_each.h +84 -0
  548. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_for_each_n.h +68 -0
  549. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_iterator_concept.h +65 -0
  550. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_min.h +98 -0
  551. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_min_element.h +68 -0
  552. cuda/cccl/headers/include/cuda/std/__algorithm/remove.h +55 -0
  553. cuda/cccl/headers/include/cuda/std/__algorithm/remove_copy.h +47 -0
  554. cuda/cccl/headers/include/cuda/std/__algorithm/remove_copy_if.h +47 -0
  555. cuda/cccl/headers/include/cuda/std/__algorithm/remove_if.h +56 -0
  556. cuda/cccl/headers/include/cuda/std/__algorithm/replace.h +45 -0
  557. cuda/cccl/headers/include/cuda/std/__algorithm/replace_copy.h +54 -0
  558. cuda/cccl/headers/include/cuda/std/__algorithm/replace_copy_if.h +50 -0
  559. cuda/cccl/headers/include/cuda/std/__algorithm/replace_if.h +45 -0
  560. cuda/cccl/headers/include/cuda/std/__algorithm/reverse.h +81 -0
  561. cuda/cccl/headers/include/cuda/std/__algorithm/reverse_copy.h +43 -0
  562. cuda/cccl/headers/include/cuda/std/__algorithm/rotate.h +261 -0
  563. cuda/cccl/headers/include/cuda/std/__algorithm/rotate_copy.h +40 -0
  564. cuda/cccl/headers/include/cuda/std/__algorithm/search.h +185 -0
  565. cuda/cccl/headers/include/cuda/std/__algorithm/search_n.h +163 -0
  566. cuda/cccl/headers/include/cuda/std/__algorithm/set_difference.h +95 -0
  567. cuda/cccl/headers/include/cuda/std/__algorithm/set_intersection.h +122 -0
  568. cuda/cccl/headers/include/cuda/std/__algorithm/set_symmetric_difference.h +134 -0
  569. cuda/cccl/headers/include/cuda/std/__algorithm/set_union.h +128 -0
  570. cuda/cccl/headers/include/cuda/std/__algorithm/shift_left.h +84 -0
  571. cuda/cccl/headers/include/cuda/std/__algorithm/shift_right.h +144 -0
  572. cuda/cccl/headers/include/cuda/std/__algorithm/sift_down.h +139 -0
  573. cuda/cccl/headers/include/cuda/std/__algorithm/sort_heap.h +70 -0
  574. cuda/cccl/headers/include/cuda/std/__algorithm/swap_ranges.h +78 -0
  575. cuda/cccl/headers/include/cuda/std/__algorithm/transform.h +59 -0
  576. cuda/cccl/headers/include/cuda/std/__algorithm/unique.h +76 -0
  577. cuda/cccl/headers/include/cuda/std/__algorithm/unique_copy.h +153 -0
  578. cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_iter.h +95 -0
  579. cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_range.h +126 -0
  580. cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +83 -0
  581. cuda/cccl/headers/include/cuda/std/__algorithm_ +26 -0
  582. cuda/cccl/headers/include/cuda/std/__atomic/api/common.h +192 -0
  583. cuda/cccl/headers/include/cuda/std/__atomic/api/owned.h +136 -0
  584. cuda/cccl/headers/include/cuda/std/__atomic/api/reference.h +118 -0
  585. cuda/cccl/headers/include/cuda/std/__atomic/functions/common.h +58 -0
  586. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_local.h +208 -0
  587. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_derived.h +458 -0
  588. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated.h +4437 -0
  589. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h +184 -0
  590. cuda/cccl/headers/include/cuda/std/__atomic/functions/host.h +242 -0
  591. cuda/cccl/headers/include/cuda/std/__atomic/functions.h +33 -0
  592. cuda/cccl/headers/include/cuda/std/__atomic/order.h +159 -0
  593. cuda/cccl/headers/include/cuda/std/__atomic/platform/msvc_to_builtins.h +654 -0
  594. cuda/cccl/headers/include/cuda/std/__atomic/platform.h +93 -0
  595. cuda/cccl/headers/include/cuda/std/__atomic/scopes.h +105 -0
  596. cuda/cccl/headers/include/cuda/std/__atomic/types/base.h +249 -0
  597. cuda/cccl/headers/include/cuda/std/__atomic/types/common.h +104 -0
  598. cuda/cccl/headers/include/cuda/std/__atomic/types/locked.h +225 -0
  599. cuda/cccl/headers/include/cuda/std/__atomic/types/reference.h +72 -0
  600. cuda/cccl/headers/include/cuda/std/__atomic/types/small.h +238 -0
  601. cuda/cccl/headers/include/cuda/std/__atomic/types.h +51 -0
  602. cuda/cccl/headers/include/cuda/std/__atomic/wait/notify_wait.h +95 -0
  603. cuda/cccl/headers/include/cuda/std/__atomic/wait/polling.h +65 -0
  604. cuda/cccl/headers/include/cuda/std/__barrier/barrier.h +227 -0
  605. cuda/cccl/headers/include/cuda/std/__barrier/empty_completion.h +37 -0
  606. cuda/cccl/headers/include/cuda/std/__barrier/poll_tester.h +82 -0
  607. cuda/cccl/headers/include/cuda/std/__bit/bit_cast.h +76 -0
  608. cuda/cccl/headers/include/cuda/std/__bit/byteswap.h +209 -0
  609. cuda/cccl/headers/include/cuda/std/__bit/countl.h +174 -0
  610. cuda/cccl/headers/include/cuda/std/__bit/countr.h +185 -0
  611. cuda/cccl/headers/include/cuda/std/__bit/endian.h +39 -0
  612. cuda/cccl/headers/include/cuda/std/__bit/has_single_bit.h +43 -0
  613. cuda/cccl/headers/include/cuda/std/__bit/integral.h +126 -0
  614. cuda/cccl/headers/include/cuda/std/__bit/popcount.h +154 -0
  615. cuda/cccl/headers/include/cuda/std/__bit/reference.h +1272 -0
  616. cuda/cccl/headers/include/cuda/std/__bit/rotate.h +94 -0
  617. cuda/cccl/headers/include/cuda/std/__cccl/algorithm_wrapper.h +36 -0
  618. cuda/cccl/headers/include/cuda/std/__cccl/architecture.h +78 -0
  619. cuda/cccl/headers/include/cuda/std/__cccl/assert.h +161 -0
  620. cuda/cccl/headers/include/cuda/std/__cccl/attributes.h +206 -0
  621. cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +645 -0
  622. cuda/cccl/headers/include/cuda/std/__cccl/compiler.h +217 -0
  623. cuda/cccl/headers/include/cuda/std/__cccl/cuda_capabilities.h +51 -0
  624. cuda/cccl/headers/include/cuda/std/__cccl/cuda_toolkit.h +56 -0
  625. cuda/cccl/headers/include/cuda/std/__cccl/deprecated.h +88 -0
  626. cuda/cccl/headers/include/cuda/std/__cccl/diagnostic.h +131 -0
  627. cuda/cccl/headers/include/cuda/std/__cccl/dialect.h +130 -0
  628. cuda/cccl/headers/include/cuda/std/__cccl/epilogue.h +354 -0
  629. cuda/cccl/headers/include/cuda/std/__cccl/exceptions.h +36 -0
  630. cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +74 -0
  631. cuda/cccl/headers/include/cuda/std/__cccl/extended_data_types.h +160 -0
  632. cuda/cccl/headers/include/cuda/std/__cccl/host_std_lib.h +52 -0
  633. cuda/cccl/headers/include/cuda/std/__cccl/is_non_narrowing_convertible.h +71 -0
  634. cuda/cccl/headers/include/cuda/std/__cccl/memory_wrapper.h +36 -0
  635. cuda/cccl/headers/include/cuda/std/__cccl/numeric_wrapper.h +36 -0
  636. cuda/cccl/headers/include/cuda/std/__cccl/os.h +54 -0
  637. cuda/cccl/headers/include/cuda/std/__cccl/preprocessor.h +1286 -0
  638. cuda/cccl/headers/include/cuda/std/__cccl/prologue.h +289 -0
  639. cuda/cccl/headers/include/cuda/std/__cccl/ptx_isa.h +253 -0
  640. cuda/cccl/headers/include/cuda/std/__cccl/rtti.h +72 -0
  641. cuda/cccl/headers/include/cuda/std/__cccl/sequence_access.h +87 -0
  642. cuda/cccl/headers/include/cuda/std/__cccl/system_header.h +38 -0
  643. cuda/cccl/headers/include/cuda/std/__cccl/unreachable.h +31 -0
  644. cuda/cccl/headers/include/cuda/std/__cccl/version.h +26 -0
  645. cuda/cccl/headers/include/cuda/std/__cccl/visibility.h +171 -0
  646. cuda/cccl/headers/include/cuda/std/__charconv/chars_format.h +81 -0
  647. cuda/cccl/headers/include/cuda/std/__charconv/from_chars.h +154 -0
  648. cuda/cccl/headers/include/cuda/std/__charconv/from_chars_result.h +56 -0
  649. cuda/cccl/headers/include/cuda/std/__charconv/to_chars.h +148 -0
  650. cuda/cccl/headers/include/cuda/std/__charconv/to_chars_result.h +56 -0
  651. cuda/cccl/headers/include/cuda/std/__charconv_ +31 -0
  652. cuda/cccl/headers/include/cuda/std/__chrono/calendar.h +52 -0
  653. cuda/cccl/headers/include/cuda/std/__chrono/day.h +160 -0
  654. cuda/cccl/headers/include/cuda/std/__chrono/duration.h +499 -0
  655. cuda/cccl/headers/include/cuda/std/__chrono/file_clock.h +53 -0
  656. cuda/cccl/headers/include/cuda/std/__chrono/high_resolution_clock.h +44 -0
  657. cuda/cccl/headers/include/cuda/std/__chrono/month.h +185 -0
  658. cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +58 -0
  659. cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +78 -0
  660. cuda/cccl/headers/include/cuda/std/__chrono/time_point.h +255 -0
  661. cuda/cccl/headers/include/cuda/std/__chrono/year.h +184 -0
  662. cuda/cccl/headers/include/cuda/std/__cmath/abs.h +127 -0
  663. cuda/cccl/headers/include/cuda/std/__cmath/copysign.h +88 -0
  664. cuda/cccl/headers/include/cuda/std/__cmath/error_functions.h +204 -0
  665. cuda/cccl/headers/include/cuda/std/__cmath/exponential_functions.h +783 -0
  666. cuda/cccl/headers/include/cuda/std/__cmath/fdim.h +122 -0
  667. cuda/cccl/headers/include/cuda/std/__cmath/fma.h +129 -0
  668. cuda/cccl/headers/include/cuda/std/__cmath/fpclassify.h +230 -0
  669. cuda/cccl/headers/include/cuda/std/__cmath/gamma.h +204 -0
  670. cuda/cccl/headers/include/cuda/std/__cmath/hyperbolic_functions.h +285 -0
  671. cuda/cccl/headers/include/cuda/std/__cmath/hypot.h +220 -0
  672. cuda/cccl/headers/include/cuda/std/__cmath/inverse_hyperbolic_functions.h +285 -0
  673. cuda/cccl/headers/include/cuda/std/__cmath/inverse_trigonometric_functions.h +370 -0
  674. cuda/cccl/headers/include/cuda/std/__cmath/isfinite.h +166 -0
  675. cuda/cccl/headers/include/cuda/std/__cmath/isinf.h +204 -0
  676. cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +185 -0
  677. cuda/cccl/headers/include/cuda/std/__cmath/isnormal.h +138 -0
  678. cuda/cccl/headers/include/cuda/std/__cmath/lerp.h +101 -0
  679. cuda/cccl/headers/include/cuda/std/__cmath/logarithms.h +533 -0
  680. cuda/cccl/headers/include/cuda/std/__cmath/min_max.h +287 -0
  681. cuda/cccl/headers/include/cuda/std/__cmath/modulo.h +208 -0
  682. cuda/cccl/headers/include/cuda/std/__cmath/nan.h +54 -0
  683. cuda/cccl/headers/include/cuda/std/__cmath/remainder.h +210 -0
  684. cuda/cccl/headers/include/cuda/std/__cmath/roots.h +198 -0
  685. cuda/cccl/headers/include/cuda/std/__cmath/rounding_functions.h +983 -0
  686. cuda/cccl/headers/include/cuda/std/__cmath/signbit.h +56 -0
  687. cuda/cccl/headers/include/cuda/std/__cmath/traits.h +242 -0
  688. cuda/cccl/headers/include/cuda/std/__cmath/trigonometric_functions.h +327 -0
  689. cuda/cccl/headers/include/cuda/std/__complex/arg.h +84 -0
  690. cuda/cccl/headers/include/cuda/std/__complex/complex.h +669 -0
  691. cuda/cccl/headers/include/cuda/std/__complex/exponential_functions.h +411 -0
  692. cuda/cccl/headers/include/cuda/std/__complex/hyperbolic_functions.h +117 -0
  693. cuda/cccl/headers/include/cuda/std/__complex/inverse_hyperbolic_functions.h +216 -0
  694. cuda/cccl/headers/include/cuda/std/__complex/inverse_trigonometric_functions.h +131 -0
  695. cuda/cccl/headers/include/cuda/std/__complex/literals.h +86 -0
  696. cuda/cccl/headers/include/cuda/std/__complex/logarithms.h +303 -0
  697. cuda/cccl/headers/include/cuda/std/__complex/math.h +159 -0
  698. cuda/cccl/headers/include/cuda/std/__complex/nvbf16.h +323 -0
  699. cuda/cccl/headers/include/cuda/std/__complex/nvfp16.h +322 -0
  700. cuda/cccl/headers/include/cuda/std/__complex/roots.h +214 -0
  701. cuda/cccl/headers/include/cuda/std/__complex/trigonometric_functions.h +61 -0
  702. cuda/cccl/headers/include/cuda/std/__complex/tuple.h +107 -0
  703. cuda/cccl/headers/include/cuda/std/__complex/vector_support.h +130 -0
  704. cuda/cccl/headers/include/cuda/std/__concepts/arithmetic.h +56 -0
  705. cuda/cccl/headers/include/cuda/std/__concepts/assignable.h +64 -0
  706. cuda/cccl/headers/include/cuda/std/__concepts/boolean_testable.h +63 -0
  707. cuda/cccl/headers/include/cuda/std/__concepts/class_or_enum.h +45 -0
  708. cuda/cccl/headers/include/cuda/std/__concepts/common_reference_with.h +69 -0
  709. cuda/cccl/headers/include/cuda/std/__concepts/common_with.h +82 -0
  710. cuda/cccl/headers/include/cuda/std/__concepts/concept_macros.h +367 -0
  711. cuda/cccl/headers/include/cuda/std/__concepts/constructible.h +174 -0
  712. cuda/cccl/headers/include/cuda/std/__concepts/convertible_to.h +70 -0
  713. cuda/cccl/headers/include/cuda/std/__concepts/copyable.h +60 -0
  714. cuda/cccl/headers/include/cuda/std/__concepts/derived_from.h +56 -0
  715. cuda/cccl/headers/include/cuda/std/__concepts/destructible.h +76 -0
  716. cuda/cccl/headers/include/cuda/std/__concepts/different_from.h +38 -0
  717. cuda/cccl/headers/include/cuda/std/__concepts/equality_comparable.h +98 -0
  718. cuda/cccl/headers/include/cuda/std/__concepts/invocable.h +80 -0
  719. cuda/cccl/headers/include/cuda/std/__concepts/movable.h +58 -0
  720. cuda/cccl/headers/include/cuda/std/__concepts/predicate.h +54 -0
  721. cuda/cccl/headers/include/cuda/std/__concepts/regular.h +54 -0
  722. cuda/cccl/headers/include/cuda/std/__concepts/relation.h +77 -0
  723. cuda/cccl/headers/include/cuda/std/__concepts/same_as.h +39 -0
  724. cuda/cccl/headers/include/cuda/std/__concepts/semiregular.h +54 -0
  725. cuda/cccl/headers/include/cuda/std/__concepts/swappable.h +206 -0
  726. cuda/cccl/headers/include/cuda/std/__concepts/totally_ordered.h +101 -0
  727. cuda/cccl/headers/include/cuda/std/__cstddef/byte.h +113 -0
  728. cuda/cccl/headers/include/cuda/std/__cstddef/types.h +52 -0
  729. cuda/cccl/headers/include/cuda/std/__cstdlib/abs.h +57 -0
  730. cuda/cccl/headers/include/cuda/std/__cstdlib/aligned_alloc.h +66 -0
  731. cuda/cccl/headers/include/cuda/std/__cstdlib/div.h +96 -0
  732. cuda/cccl/headers/include/cuda/std/__cstdlib/malloc.h +70 -0
  733. cuda/cccl/headers/include/cuda/std/__cstring/memcpy.h +61 -0
  734. cuda/cccl/headers/include/cuda/std/__cstring/memset.h +46 -0
  735. cuda/cccl/headers/include/cuda/std/__exception/cuda_error.h +118 -0
  736. cuda/cccl/headers/include/cuda/std/__exception/exception_macros.h +93 -0
  737. cuda/cccl/headers/include/cuda/std/__exception/terminate.h +73 -0
  738. cuda/cccl/headers/include/cuda/std/__exception/throw_error.h +120 -0
  739. cuda/cccl/headers/include/cuda/std/__execution/env.h +455 -0
  740. cuda/cccl/headers/include/cuda/std/__execution/policy.h +88 -0
  741. cuda/cccl/headers/include/cuda/std/__expected/bad_expected_access.h +127 -0
  742. cuda/cccl/headers/include/cuda/std/__expected/expected.h +1941 -0
  743. cuda/cccl/headers/include/cuda/std/__expected/expected_base.h +1050 -0
  744. cuda/cccl/headers/include/cuda/std/__expected/unexpect.h +37 -0
  745. cuda/cccl/headers/include/cuda/std/__expected/unexpected.h +164 -0
  746. cuda/cccl/headers/include/cuda/std/__floating_point/arithmetic.h +56 -0
  747. cuda/cccl/headers/include/cuda/std/__floating_point/cast.h +812 -0
  748. cuda/cccl/headers/include/cuda/std/__floating_point/cccl_fp.h +125 -0
  749. cuda/cccl/headers/include/cuda/std/__floating_point/common_type.h +48 -0
  750. cuda/cccl/headers/include/cuda/std/__floating_point/constants.h +376 -0
  751. cuda/cccl/headers/include/cuda/std/__floating_point/conversion_rank_order.h +124 -0
  752. cuda/cccl/headers/include/cuda/std/__floating_point/cuda_fp_types.h +116 -0
  753. cuda/cccl/headers/include/cuda/std/__floating_point/decompose.h +69 -0
  754. cuda/cccl/headers/include/cuda/std/__floating_point/format.h +162 -0
  755. cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +40 -0
  756. cuda/cccl/headers/include/cuda/std/__floating_point/mask.h +78 -0
  757. cuda/cccl/headers/include/cuda/std/__floating_point/native_type.h +81 -0
  758. cuda/cccl/headers/include/cuda/std/__floating_point/overflow_handler.h +139 -0
  759. cuda/cccl/headers/include/cuda/std/__floating_point/properties.h +229 -0
  760. cuda/cccl/headers/include/cuda/std/__floating_point/storage.h +248 -0
  761. cuda/cccl/headers/include/cuda/std/__floating_point/traits.h +172 -0
  762. cuda/cccl/headers/include/cuda/std/__format/buffer.h +48 -0
  763. cuda/cccl/headers/include/cuda/std/__format/concepts.h +69 -0
  764. cuda/cccl/headers/include/cuda/std/__format/format_arg.h +282 -0
  765. cuda/cccl/headers/include/cuda/std/__format/format_arg_store.h +279 -0
  766. cuda/cccl/headers/include/cuda/std/__format/format_args.h +122 -0
  767. cuda/cccl/headers/include/cuda/std/__format/format_context.h +92 -0
  768. cuda/cccl/headers/include/cuda/std/__format/format_error.h +76 -0
  769. cuda/cccl/headers/include/cuda/std/__format/format_integral.h +237 -0
  770. cuda/cccl/headers/include/cuda/std/__format/format_parse_context.h +124 -0
  771. cuda/cccl/headers/include/cuda/std/__format/format_spec_parser.h +1230 -0
  772. cuda/cccl/headers/include/cuda/std/__format/formatter.h +59 -0
  773. cuda/cccl/headers/include/cuda/std/__format/formatters/bool.h +101 -0
  774. cuda/cccl/headers/include/cuda/std/__format/formatters/char.h +124 -0
  775. cuda/cccl/headers/include/cuda/std/__format/formatters/fp.h +101 -0
  776. cuda/cccl/headers/include/cuda/std/__format/formatters/int.h +174 -0
  777. cuda/cccl/headers/include/cuda/std/__format/formatters/ptr.h +104 -0
  778. cuda/cccl/headers/include/cuda/std/__format/formatters/str.h +178 -0
  779. cuda/cccl/headers/include/cuda/std/__format/output_utils.h +272 -0
  780. cuda/cccl/headers/include/cuda/std/__format/parse_arg_id.h +138 -0
  781. cuda/cccl/headers/include/cuda/std/__format_ +45 -0
  782. cuda/cccl/headers/include/cuda/std/__functional/binary_function.h +63 -0
  783. cuda/cccl/headers/include/cuda/std/__functional/binary_negate.h +65 -0
  784. cuda/cccl/headers/include/cuda/std/__functional/bind.h +334 -0
  785. cuda/cccl/headers/include/cuda/std/__functional/bind_back.h +80 -0
  786. cuda/cccl/headers/include/cuda/std/__functional/bind_front.h +73 -0
  787. cuda/cccl/headers/include/cuda/std/__functional/binder1st.h +74 -0
  788. cuda/cccl/headers/include/cuda/std/__functional/binder2nd.h +74 -0
  789. cuda/cccl/headers/include/cuda/std/__functional/compose.h +68 -0
  790. cuda/cccl/headers/include/cuda/std/__functional/default_searcher.h +75 -0
  791. cuda/cccl/headers/include/cuda/std/__functional/function.h +1271 -0
  792. cuda/cccl/headers/include/cuda/std/__functional/hash.h +649 -0
  793. cuda/cccl/headers/include/cuda/std/__functional/identity.h +57 -0
  794. cuda/cccl/headers/include/cuda/std/__functional/invoke.h +296 -0
  795. cuda/cccl/headers/include/cuda/std/__functional/is_transparent.h +41 -0
  796. cuda/cccl/headers/include/cuda/std/__functional/mem_fn.h +66 -0
  797. cuda/cccl/headers/include/cuda/std/__functional/mem_fun_ref.h +211 -0
  798. cuda/cccl/headers/include/cuda/std/__functional/not_fn.h +120 -0
  799. cuda/cccl/headers/include/cuda/std/__functional/operations.h +534 -0
  800. cuda/cccl/headers/include/cuda/std/__functional/perfect_forward.h +128 -0
  801. cuda/cccl/headers/include/cuda/std/__functional/pointer_to_binary_function.h +64 -0
  802. cuda/cccl/headers/include/cuda/std/__functional/pointer_to_unary_function.h +63 -0
  803. cuda/cccl/headers/include/cuda/std/__functional/ranges_operations.h +113 -0
  804. cuda/cccl/headers/include/cuda/std/__functional/reference_wrapper.h +113 -0
  805. cuda/cccl/headers/include/cuda/std/__functional/unary_function.h +62 -0
  806. cuda/cccl/headers/include/cuda/std/__functional/unary_negate.h +65 -0
  807. cuda/cccl/headers/include/cuda/std/__functional/unwrap_ref.h +56 -0
  808. cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +262 -0
  809. cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +53 -0
  810. cuda/cccl/headers/include/cuda/std/__fwd/array.h +42 -0
  811. cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +74 -0
  812. cuda/cccl/headers/include/cuda/std/__fwd/complex.h +75 -0
  813. cuda/cccl/headers/include/cuda/std/__fwd/expected.h +46 -0
  814. cuda/cccl/headers/include/cuda/std/__fwd/format.h +84 -0
  815. cuda/cccl/headers/include/cuda/std/__fwd/fp.h +37 -0
  816. cuda/cccl/headers/include/cuda/std/__fwd/get.h +122 -0
  817. cuda/cccl/headers/include/cuda/std/__fwd/hash.h +34 -0
  818. cuda/cccl/headers/include/cuda/std/__fwd/ios.h +123 -0
  819. cuda/cccl/headers/include/cuda/std/__fwd/iterator.h +43 -0
  820. cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +122 -0
  821. cuda/cccl/headers/include/cuda/std/__fwd/memory_resource.h +37 -0
  822. cuda/cccl/headers/include/cuda/std/__fwd/optional.h +39 -0
  823. cuda/cccl/headers/include/cuda/std/__fwd/pair.h +47 -0
  824. cuda/cccl/headers/include/cuda/std/__fwd/reference_wrapper.h +39 -0
  825. cuda/cccl/headers/include/cuda/std/__fwd/span.h +45 -0
  826. cuda/cccl/headers/include/cuda/std/__fwd/string.h +112 -0
  827. cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +91 -0
  828. cuda/cccl/headers/include/cuda/std/__fwd/subrange.h +55 -0
  829. cuda/cccl/headers/include/cuda/std/__fwd/tuple.h +37 -0
  830. cuda/cccl/headers/include/cuda/std/__fwd/unexpected.h +40 -0
  831. cuda/cccl/headers/include/cuda/std/__fwd/variant.h +51 -0
  832. cuda/cccl/headers/include/cuda/std/__internal/cpp_dialect.h +44 -0
  833. cuda/cccl/headers/include/cuda/std/__internal/features.h +86 -0
  834. cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +143 -0
  835. cuda/cccl/headers/include/cuda/std/__iterator/access.h +128 -0
  836. cuda/cccl/headers/include/cuda/std/__iterator/advance.h +228 -0
  837. cuda/cccl/headers/include/cuda/std/__iterator/back_insert_iterator.h +163 -0
  838. cuda/cccl/headers/include/cuda/std/__iterator/bounded_iter.h +253 -0
  839. cuda/cccl/headers/include/cuda/std/__iterator/concepts.h +645 -0
  840. cuda/cccl/headers/include/cuda/std/__iterator/counted_iterator.h +464 -0
  841. cuda/cccl/headers/include/cuda/std/__iterator/data.h +61 -0
  842. cuda/cccl/headers/include/cuda/std/__iterator/default_sentinel.h +36 -0
  843. cuda/cccl/headers/include/cuda/std/__iterator/distance.h +126 -0
  844. cuda/cccl/headers/include/cuda/std/__iterator/empty.h +53 -0
  845. cuda/cccl/headers/include/cuda/std/__iterator/erase_if_container.h +53 -0
  846. cuda/cccl/headers/include/cuda/std/__iterator/front_insert_iterator.h +99 -0
  847. cuda/cccl/headers/include/cuda/std/__iterator/incrementable_traits.h +143 -0
  848. cuda/cccl/headers/include/cuda/std/__iterator/indirectly_comparable.h +55 -0
  849. cuda/cccl/headers/include/cuda/std/__iterator/insert_iterator.h +107 -0
  850. cuda/cccl/headers/include/cuda/std/__iterator/istream_iterator.h +146 -0
  851. cuda/cccl/headers/include/cuda/std/__iterator/istreambuf_iterator.h +161 -0
  852. cuda/cccl/headers/include/cuda/std/__iterator/iter_move.h +161 -0
  853. cuda/cccl/headers/include/cuda/std/__iterator/iter_swap.h +181 -0
  854. cuda/cccl/headers/include/cuda/std/__iterator/iterator.h +44 -0
  855. cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +847 -0
  856. cuda/cccl/headers/include/cuda/std/__iterator/mergeable.h +72 -0
  857. cuda/cccl/headers/include/cuda/std/__iterator/move_iterator.h +432 -0
  858. cuda/cccl/headers/include/cuda/std/__iterator/move_sentinel.h +73 -0
  859. cuda/cccl/headers/include/cuda/std/__iterator/next.h +101 -0
  860. cuda/cccl/headers/include/cuda/std/__iterator/ostream_iterator.h +95 -0
  861. cuda/cccl/headers/include/cuda/std/__iterator/ostreambuf_iterator.h +100 -0
  862. cuda/cccl/headers/include/cuda/std/__iterator/permutable.h +54 -0
  863. cuda/cccl/headers/include/cuda/std/__iterator/prev.h +90 -0
  864. cuda/cccl/headers/include/cuda/std/__iterator/projected.h +61 -0
  865. cuda/cccl/headers/include/cuda/std/__iterator/readable_traits.h +156 -0
  866. cuda/cccl/headers/include/cuda/std/__iterator/reverse_access.h +142 -0
  867. cuda/cccl/headers/include/cuda/std/__iterator/reverse_iterator.h +366 -0
  868. cuda/cccl/headers/include/cuda/std/__iterator/size.h +69 -0
  869. cuda/cccl/headers/include/cuda/std/__iterator/sortable.h +55 -0
  870. cuda/cccl/headers/include/cuda/std/__iterator/unreachable_sentinel.h +84 -0
  871. cuda/cccl/headers/include/cuda/std/__iterator/wrap_iter.h +245 -0
  872. cuda/cccl/headers/include/cuda/std/__latch/latch.h +88 -0
  873. cuda/cccl/headers/include/cuda/std/__limits/numeric_limits.h +603 -0
  874. cuda/cccl/headers/include/cuda/std/__limits/numeric_limits_ext.h +753 -0
  875. cuda/cccl/headers/include/cuda/std/__linalg/conj_if_needed.h +77 -0
  876. cuda/cccl/headers/include/cuda/std/__linalg/conjugate_transposed.h +52 -0
  877. cuda/cccl/headers/include/cuda/std/__linalg/conjugated.h +137 -0
  878. cuda/cccl/headers/include/cuda/std/__linalg/scaled.h +128 -0
  879. cuda/cccl/headers/include/cuda/std/__linalg/transposed.h +316 -0
  880. cuda/cccl/headers/include/cuda/std/__mdspan/aligned_accessor.h +97 -0
  881. cuda/cccl/headers/include/cuda/std/__mdspan/concepts.h +137 -0
  882. cuda/cccl/headers/include/cuda/std/__mdspan/default_accessor.h +73 -0
  883. cuda/cccl/headers/include/cuda/std/__mdspan/empty_base.h +352 -0
  884. cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +753 -0
  885. cuda/cccl/headers/include/cuda/std/__mdspan/layout_left.h +314 -0
  886. cuda/cccl/headers/include/cuda/std/__mdspan/layout_right.h +307 -0
  887. cuda/cccl/headers/include/cuda/std/__mdspan/layout_stride.h +603 -0
  888. cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +512 -0
  889. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_extents.h +193 -0
  890. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_helper.h +189 -0
  891. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_mapping.h +344 -0
  892. cuda/cccl/headers/include/cuda/std/__memory/addressof.h +67 -0
  893. cuda/cccl/headers/include/cuda/std/__memory/align.h +67 -0
  894. cuda/cccl/headers/include/cuda/std/__memory/allocate_at_least.h +81 -0
  895. cuda/cccl/headers/include/cuda/std/__memory/allocation_guard.h +100 -0
  896. cuda/cccl/headers/include/cuda/std/__memory/allocator.h +320 -0
  897. cuda/cccl/headers/include/cuda/std/__memory/allocator_arg_t.h +85 -0
  898. cuda/cccl/headers/include/cuda/std/__memory/allocator_destructor.h +59 -0
  899. cuda/cccl/headers/include/cuda/std/__memory/allocator_traits.h +526 -0
  900. cuda/cccl/headers/include/cuda/std/__memory/assume_aligned.h +60 -0
  901. cuda/cccl/headers/include/cuda/std/__memory/builtin_new_allocator.h +87 -0
  902. cuda/cccl/headers/include/cuda/std/__memory/compressed_pair.h +225 -0
  903. cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +242 -0
  904. cuda/cccl/headers/include/cuda/std/__memory/destruct_n.h +91 -0
  905. cuda/cccl/headers/include/cuda/std/__memory/is_sufficiently_aligned.h +51 -0
  906. cuda/cccl/headers/include/cuda/std/__memory/pointer_traits.h +246 -0
  907. cuda/cccl/headers/include/cuda/std/__memory/runtime_assume_aligned.h +62 -0
  908. cuda/cccl/headers/include/cuda/std/__memory/temporary_buffer.h +92 -0
  909. cuda/cccl/headers/include/cuda/std/__memory/uninitialized_algorithms.h +679 -0
  910. cuda/cccl/headers/include/cuda/std/__memory/unique_ptr.h +765 -0
  911. cuda/cccl/headers/include/cuda/std/__memory/uses_allocator.h +59 -0
  912. cuda/cccl/headers/include/cuda/std/__memory/voidify.h +41 -0
  913. cuda/cccl/headers/include/cuda/std/__memory_ +34 -0
  914. cuda/cccl/headers/include/cuda/std/__new/allocate.h +131 -0
  915. cuda/cccl/headers/include/cuda/std/__new/bad_alloc.h +57 -0
  916. cuda/cccl/headers/include/cuda/std/__new/device_new.h +30 -0
  917. cuda/cccl/headers/include/cuda/std/__new/launder.h +53 -0
  918. cuda/cccl/headers/include/cuda/std/__new_ +30 -0
  919. cuda/cccl/headers/include/cuda/std/__numeric/accumulate.h +56 -0
  920. cuda/cccl/headers/include/cuda/std/__numeric/adjacent_difference.h +72 -0
  921. cuda/cccl/headers/include/cuda/std/__numeric/exclusive_scan.h +66 -0
  922. cuda/cccl/headers/include/cuda/std/__numeric/gcd_lcm.h +78 -0
  923. cuda/cccl/headers/include/cuda/std/__numeric/inclusive_scan.h +73 -0
  924. cuda/cccl/headers/include/cuda/std/__numeric/inner_product.h +62 -0
  925. cuda/cccl/headers/include/cuda/std/__numeric/iota.h +42 -0
  926. cuda/cccl/headers/include/cuda/std/__numeric/midpoint.h +97 -0
  927. cuda/cccl/headers/include/cuda/std/__numeric/partial_sum.h +69 -0
  928. cuda/cccl/headers/include/cuda/std/__numeric/reduce.h +60 -0
  929. cuda/cccl/headers/include/cuda/std/__numeric/transform_exclusive_scan.h +51 -0
  930. cuda/cccl/headers/include/cuda/std/__numeric/transform_inclusive_scan.h +65 -0
  931. cuda/cccl/headers/include/cuda/std/__numeric/transform_reduce.h +72 -0
  932. cuda/cccl/headers/include/cuda/std/__optional/bad_optional_access.h +74 -0
  933. cuda/cccl/headers/include/cuda/std/__optional/hash.h +53 -0
  934. cuda/cccl/headers/include/cuda/std/__optional/make_optional.h +61 -0
  935. cuda/cccl/headers/include/cuda/std/__optional/nullopt.h +43 -0
  936. cuda/cccl/headers/include/cuda/std/__optional/optional.h +860 -0
  937. cuda/cccl/headers/include/cuda/std/__optional/optional_base.h +433 -0
  938. cuda/cccl/headers/include/cuda/std/__optional/optional_ref.h +324 -0
  939. cuda/cccl/headers/include/cuda/std/__random/generate_canonical.h +56 -0
  940. cuda/cccl/headers/include/cuda/std/__random/is_seed_sequence.h +39 -0
  941. cuda/cccl/headers/include/cuda/std/__random/is_valid.h +106 -0
  942. cuda/cccl/headers/include/cuda/std/__random/linear_congruential_engine.h +398 -0
  943. cuda/cccl/headers/include/cuda/std/__random/philox_engine.h +562 -0
  944. cuda/cccl/headers/include/cuda/std/__random/seed_seq.h +204 -0
  945. cuda/cccl/headers/include/cuda/std/__random/uniform_int_distribution.h +335 -0
  946. cuda/cccl/headers/include/cuda/std/__random/uniform_real_distribution.h +183 -0
  947. cuda/cccl/headers/include/cuda/std/__random_ +31 -0
  948. cuda/cccl/headers/include/cuda/std/__ranges/access.h +303 -0
  949. cuda/cccl/headers/include/cuda/std/__ranges/all.h +98 -0
  950. cuda/cccl/headers/include/cuda/std/__ranges/compressed_movable_box.h +892 -0
  951. cuda/cccl/headers/include/cuda/std/__ranges/concepts.h +302 -0
  952. cuda/cccl/headers/include/cuda/std/__ranges/counted.h +90 -0
  953. cuda/cccl/headers/include/cuda/std/__ranges/dangling.h +54 -0
  954. cuda/cccl/headers/include/cuda/std/__ranges/data.h +136 -0
  955. cuda/cccl/headers/include/cuda/std/__ranges/empty.h +109 -0
  956. cuda/cccl/headers/include/cuda/std/__ranges/empty_view.h +77 -0
  957. cuda/cccl/headers/include/cuda/std/__ranges/enable_borrowed_range.h +41 -0
  958. cuda/cccl/headers/include/cuda/std/__ranges/enable_view.h +78 -0
  959. cuda/cccl/headers/include/cuda/std/__ranges/from_range.h +36 -0
  960. cuda/cccl/headers/include/cuda/std/__ranges/iota_view.h +266 -0
  961. cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +408 -0
  962. cuda/cccl/headers/include/cuda/std/__ranges/owning_view.h +162 -0
  963. cuda/cccl/headers/include/cuda/std/__ranges/range_adaptor.h +110 -0
  964. cuda/cccl/headers/include/cuda/std/__ranges/rbegin.h +175 -0
  965. cuda/cccl/headers/include/cuda/std/__ranges/ref_view.h +121 -0
  966. cuda/cccl/headers/include/cuda/std/__ranges/rend.h +182 -0
  967. cuda/cccl/headers/include/cuda/std/__ranges/repeat_view.h +345 -0
  968. cuda/cccl/headers/include/cuda/std/__ranges/single_view.h +155 -0
  969. cuda/cccl/headers/include/cuda/std/__ranges/size.h +201 -0
  970. cuda/cccl/headers/include/cuda/std/__ranges/subrange.h +513 -0
  971. cuda/cccl/headers/include/cuda/std/__ranges/take_view.h +476 -0
  972. cuda/cccl/headers/include/cuda/std/__ranges/take_while_view.h +259 -0
  973. cuda/cccl/headers/include/cuda/std/__ranges/transform_view.h +522 -0
  974. cuda/cccl/headers/include/cuda/std/__ranges/unwrap_end.h +53 -0
  975. cuda/cccl/headers/include/cuda/std/__ranges/view_interface.h +183 -0
  976. cuda/cccl/headers/include/cuda/std/__ranges/views.h +38 -0
  977. cuda/cccl/headers/include/cuda/std/__semaphore/atomic_semaphore.h +234 -0
  978. cuda/cccl/headers/include/cuda/std/__semaphore/counting_semaphore.h +51 -0
  979. cuda/cccl/headers/include/cuda/std/__string/char_traits.h +191 -0
  980. cuda/cccl/headers/include/cuda/std/__string/constexpr_c_functions.h +581 -0
  981. cuda/cccl/headers/include/cuda/std/__string/helper_functions.h +296 -0
  982. cuda/cccl/headers/include/cuda/std/__string/string_view.h +244 -0
  983. cuda/cccl/headers/include/cuda/std/__string_ +29 -0
  984. cuda/cccl/headers/include/cuda/std/__system_error/errc.h +51 -0
  985. cuda/cccl/headers/include/cuda/std/__system_error_ +26 -0
  986. cuda/cccl/headers/include/cuda/std/__thread/threading_support.h +106 -0
  987. cuda/cccl/headers/include/cuda/std/__thread/threading_support_cuda.h +47 -0
  988. cuda/cccl/headers/include/cuda/std/__thread/threading_support_external.h +41 -0
  989. cuda/cccl/headers/include/cuda/std/__thread/threading_support_pthread.h +143 -0
  990. cuda/cccl/headers/include/cuda/std/__thread/threading_support_win32.h +87 -0
  991. cuda/cccl/headers/include/cuda/std/__tuple_dir/apply.h +82 -0
  992. cuda/cccl/headers/include/cuda/std/__tuple_dir/get.h +122 -0
  993. cuda/cccl/headers/include/cuda/std/__tuple_dir/ignore.h +51 -0
  994. cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +120 -0
  995. cuda/cccl/headers/include/cuda/std/__tuple_dir/sfinae_helpers.h +100 -0
  996. cuda/cccl/headers/include/cuda/std/__tuple_dir/structured_bindings.h +212 -0
  997. cuda/cccl/headers/include/cuda/std/__tuple_dir/tie.h +55 -0
  998. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple.h +457 -0
  999. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_cat.h +158 -0
  1000. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_constraints.h +286 -0
  1001. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_element.h +77 -0
  1002. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_indices.h +44 -0
  1003. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_leaf.h +452 -0
  1004. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +83 -0
  1005. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +68 -0
  1006. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_size.h +79 -0
  1007. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_types.h +35 -0
  1008. cuda/cccl/headers/include/cuda/std/__tuple_dir/vector_types.h +290 -0
  1009. cuda/cccl/headers/include/cuda/std/__type_traits/add_const.h +40 -0
  1010. cuda/cccl/headers/include/cuda/std/__type_traits/add_cv.h +40 -0
  1011. cuda/cccl/headers/include/cuda/std/__type_traits/add_lvalue_reference.h +62 -0
  1012. cuda/cccl/headers/include/cuda/std/__type_traits/add_pointer.h +65 -0
  1013. cuda/cccl/headers/include/cuda/std/__type_traits/add_rvalue_reference.h +62 -0
  1014. cuda/cccl/headers/include/cuda/std/__type_traits/add_volatile.h +40 -0
  1015. cuda/cccl/headers/include/cuda/std/__type_traits/aligned_storage.h +149 -0
  1016. cuda/cccl/headers/include/cuda/std/__type_traits/aligned_union.h +62 -0
  1017. cuda/cccl/headers/include/cuda/std/__type_traits/alignment_of.h +41 -0
  1018. cuda/cccl/headers/include/cuda/std/__type_traits/always_false.h +35 -0
  1019. cuda/cccl/headers/include/cuda/std/__type_traits/can_extract_key.h +68 -0
  1020. cuda/cccl/headers/include/cuda/std/__type_traits/common_reference.h +262 -0
  1021. cuda/cccl/headers/include/cuda/std/__type_traits/common_type.h +173 -0
  1022. cuda/cccl/headers/include/cuda/std/__type_traits/conditional.h +65 -0
  1023. cuda/cccl/headers/include/cuda/std/__type_traits/conjunction.h +67 -0
  1024. cuda/cccl/headers/include/cuda/std/__type_traits/copy_cv.h +50 -0
  1025. cuda/cccl/headers/include/cuda/std/__type_traits/copy_cvref.h +148 -0
  1026. cuda/cccl/headers/include/cuda/std/__type_traits/decay.h +83 -0
  1027. cuda/cccl/headers/include/cuda/std/__type_traits/dependent_type.h +35 -0
  1028. cuda/cccl/headers/include/cuda/std/__type_traits/disjunction.h +77 -0
  1029. cuda/cccl/headers/include/cuda/std/__type_traits/enable_if.h +43 -0
  1030. cuda/cccl/headers/include/cuda/std/__type_traits/extent.h +68 -0
  1031. cuda/cccl/headers/include/cuda/std/__type_traits/fold.h +47 -0
  1032. cuda/cccl/headers/include/cuda/std/__type_traits/has_unique_object_representation.h +46 -0
  1033. cuda/cccl/headers/include/cuda/std/__type_traits/has_virtual_destructor.h +42 -0
  1034. cuda/cccl/headers/include/cuda/std/__type_traits/integral_constant.h +62 -0
  1035. cuda/cccl/headers/include/cuda/std/__type_traits/is_abstract.h +42 -0
  1036. cuda/cccl/headers/include/cuda/std/__type_traits/is_aggregate.h +42 -0
  1037. cuda/cccl/headers/include/cuda/std/__type_traits/is_allocator.h +46 -0
  1038. cuda/cccl/headers/include/cuda/std/__type_traits/is_arithmetic.h +42 -0
  1039. cuda/cccl/headers/include/cuda/std/__type_traits/is_array.h +62 -0
  1040. cuda/cccl/headers/include/cuda/std/__type_traits/is_assignable.h +78 -0
  1041. cuda/cccl/headers/include/cuda/std/__type_traits/is_base_of.h +42 -0
  1042. cuda/cccl/headers/include/cuda/std/__type_traits/is_bounded_array.h +44 -0
  1043. cuda/cccl/headers/include/cuda/std/__type_traits/is_callable.h +60 -0
  1044. cuda/cccl/headers/include/cuda/std/__type_traits/is_char_like_type.h +38 -0
  1045. cuda/cccl/headers/include/cuda/std/__type_traits/is_class.h +42 -0
  1046. cuda/cccl/headers/include/cuda/std/__type_traits/is_comparable.h +78 -0
  1047. cuda/cccl/headers/include/cuda/std/__type_traits/is_compound.h +58 -0
  1048. cuda/cccl/headers/include/cuda/std/__type_traits/is_const.h +56 -0
  1049. cuda/cccl/headers/include/cuda/std/__type_traits/is_constant_evaluated.h +51 -0
  1050. cuda/cccl/headers/include/cuda/std/__type_traits/is_constructible.h +174 -0
  1051. cuda/cccl/headers/include/cuda/std/__type_traits/is_convertible.h +211 -0
  1052. cuda/cccl/headers/include/cuda/std/__type_traits/is_copy_assignable.h +43 -0
  1053. cuda/cccl/headers/include/cuda/std/__type_traits/is_copy_constructible.h +43 -0
  1054. cuda/cccl/headers/include/cuda/std/__type_traits/is_core_convertible.h +47 -0
  1055. cuda/cccl/headers/include/cuda/std/__type_traits/is_corresponding_member.h +42 -0
  1056. cuda/cccl/headers/include/cuda/std/__type_traits/is_default_constructible.h +40 -0
  1057. cuda/cccl/headers/include/cuda/std/__type_traits/is_destructible.h +115 -0
  1058. cuda/cccl/headers/include/cuda/std/__type_traits/is_empty.h +42 -0
  1059. cuda/cccl/headers/include/cuda/std/__type_traits/is_enum.h +42 -0
  1060. cuda/cccl/headers/include/cuda/std/__type_traits/is_execution_policy.h +81 -0
  1061. cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_arithmetic.h +38 -0
  1062. cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_floating_point.h +79 -0
  1063. cuda/cccl/headers/include/cuda/std/__type_traits/is_final.h +42 -0
  1064. cuda/cccl/headers/include/cuda/std/__type_traits/is_floating_point.h +53 -0
  1065. cuda/cccl/headers/include/cuda/std/__type_traits/is_fully_bounded_array.h +47 -0
  1066. cuda/cccl/headers/include/cuda/std/__type_traits/is_function.h +61 -0
  1067. cuda/cccl/headers/include/cuda/std/__type_traits/is_fundamental.h +56 -0
  1068. cuda/cccl/headers/include/cuda/std/__type_traits/is_implicitly_default_constructible.h +57 -0
  1069. cuda/cccl/headers/include/cuda/std/__type_traits/is_integer.h +45 -0
  1070. cuda/cccl/headers/include/cuda/std/__type_traits/is_integral.h +123 -0
  1071. cuda/cccl/headers/include/cuda/std/__type_traits/is_layout_compatible.h +45 -0
  1072. cuda/cccl/headers/include/cuda/std/__type_traits/is_literal_type.h +42 -0
  1073. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_function_pointer.h +79 -0
  1074. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_object_pointer.h +57 -0
  1075. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_pointer.h +57 -0
  1076. cuda/cccl/headers/include/cuda/std/__type_traits/is_move_assignable.h +43 -0
  1077. cuda/cccl/headers/include/cuda/std/__type_traits/is_move_constructible.h +42 -0
  1078. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_assignable.h +70 -0
  1079. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_constructible.h +84 -0
  1080. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_convertible.h +59 -0
  1081. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_copy_assignable.h +60 -0
  1082. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_copy_constructible.h +43 -0
  1083. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_default_constructible.h +54 -0
  1084. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_destructible.h +82 -0
  1085. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_move_assignable.h +60 -0
  1086. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_move_constructible.h +42 -0
  1087. cuda/cccl/headers/include/cuda/std/__type_traits/is_null_pointer.h +43 -0
  1088. cuda/cccl/headers/include/cuda/std/__type_traits/is_object.h +57 -0
  1089. cuda/cccl/headers/include/cuda/std/__type_traits/is_one_of.h +37 -0
  1090. cuda/cccl/headers/include/cuda/std/__type_traits/is_pod.h +42 -0
  1091. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer.h +60 -0
  1092. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer_interconvertible_base_of.h +84 -0
  1093. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer_interconvertible_with_class.h +42 -0
  1094. cuda/cccl/headers/include/cuda/std/__type_traits/is_polymorphic.h +42 -0
  1095. cuda/cccl/headers/include/cuda/std/__type_traits/is_primary_template.h +121 -0
  1096. cuda/cccl/headers/include/cuda/std/__type_traits/is_reference.h +95 -0
  1097. cuda/cccl/headers/include/cuda/std/__type_traits/is_referenceable.h +55 -0
  1098. cuda/cccl/headers/include/cuda/std/__type_traits/is_same.h +88 -0
  1099. cuda/cccl/headers/include/cuda/std/__type_traits/is_scalar.h +60 -0
  1100. cuda/cccl/headers/include/cuda/std/__type_traits/is_scoped_enum.h +49 -0
  1101. cuda/cccl/headers/include/cuda/std/__type_traits/is_signed.h +65 -0
  1102. cuda/cccl/headers/include/cuda/std/__type_traits/is_signed_integer.h +59 -0
  1103. cuda/cccl/headers/include/cuda/std/__type_traits/is_standard_layout.h +42 -0
  1104. cuda/cccl/headers/include/cuda/std/__type_traits/is_swappable.h +200 -0
  1105. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivial.h +42 -0
  1106. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_assignable.h +43 -0
  1107. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_constructible.h +43 -0
  1108. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_assignable.h +46 -0
  1109. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_constructible.h +45 -0
  1110. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copyable.h +42 -0
  1111. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_default_constructible.h +42 -0
  1112. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_destructible.h +58 -0
  1113. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_assignable.h +45 -0
  1114. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_constructible.h +44 -0
  1115. cuda/cccl/headers/include/cuda/std/__type_traits/is_unbounded_array.h +43 -0
  1116. cuda/cccl/headers/include/cuda/std/__type_traits/is_union.h +42 -0
  1117. cuda/cccl/headers/include/cuda/std/__type_traits/is_unsigned.h +66 -0
  1118. cuda/cccl/headers/include/cuda/std/__type_traits/is_unsigned_integer.h +59 -0
  1119. cuda/cccl/headers/include/cuda/std/__type_traits/is_valid_expansion.h +41 -0
  1120. cuda/cccl/headers/include/cuda/std/__type_traits/is_void.h +55 -0
  1121. cuda/cccl/headers/include/cuda/std/__type_traits/is_volatile.h +56 -0
  1122. cuda/cccl/headers/include/cuda/std/__type_traits/lazy.h +35 -0
  1123. cuda/cccl/headers/include/cuda/std/__type_traits/make_const_lvalue_ref.h +36 -0
  1124. cuda/cccl/headers/include/cuda/std/__type_traits/make_nbit_int.h +107 -0
  1125. cuda/cccl/headers/include/cuda/std/__type_traits/make_signed.h +140 -0
  1126. cuda/cccl/headers/include/cuda/std/__type_traits/make_unsigned.h +151 -0
  1127. cuda/cccl/headers/include/cuda/std/__type_traits/maybe_const.h +36 -0
  1128. cuda/cccl/headers/include/cuda/std/__type_traits/nat.h +39 -0
  1129. cuda/cccl/headers/include/cuda/std/__type_traits/negation.h +44 -0
  1130. cuda/cccl/headers/include/cuda/std/__type_traits/num_bits.h +122 -0
  1131. cuda/cccl/headers/include/cuda/std/__type_traits/promote.h +163 -0
  1132. cuda/cccl/headers/include/cuda/std/__type_traits/rank.h +60 -0
  1133. cuda/cccl/headers/include/cuda/std/__type_traits/reference_constructs_from_temporary.h +57 -0
  1134. cuda/cccl/headers/include/cuda/std/__type_traits/reference_converts_from_temporary.h +56 -0
  1135. cuda/cccl/headers/include/cuda/std/__type_traits/remove_all_extents.h +66 -0
  1136. cuda/cccl/headers/include/cuda/std/__type_traits/remove_const.h +59 -0
  1137. cuda/cccl/headers/include/cuda/std/__type_traits/remove_const_ref.h +37 -0
  1138. cuda/cccl/headers/include/cuda/std/__type_traits/remove_cv.h +57 -0
  1139. cuda/cccl/headers/include/cuda/std/__type_traits/remove_cvref.h +57 -0
  1140. cuda/cccl/headers/include/cuda/std/__type_traits/remove_extent.h +65 -0
  1141. cuda/cccl/headers/include/cuda/std/__type_traits/remove_pointer.h +73 -0
  1142. cuda/cccl/headers/include/cuda/std/__type_traits/remove_reference.h +72 -0
  1143. cuda/cccl/headers/include/cuda/std/__type_traits/remove_volatile.h +58 -0
  1144. cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +47 -0
  1145. cuda/cccl/headers/include/cuda/std/__type_traits/type_identity.h +40 -0
  1146. cuda/cccl/headers/include/cuda/std/__type_traits/type_list.h +1067 -0
  1147. cuda/cccl/headers/include/cuda/std/__type_traits/type_set.h +131 -0
  1148. cuda/cccl/headers/include/cuda/std/__type_traits/underlying_type.h +52 -0
  1149. cuda/cccl/headers/include/cuda/std/__type_traits/void_t.h +34 -0
  1150. cuda/cccl/headers/include/cuda/std/__utility/as_const.h +52 -0
  1151. cuda/cccl/headers/include/cuda/std/__utility/auto_cast.h +34 -0
  1152. cuda/cccl/headers/include/cuda/std/__utility/cmp.h +116 -0
  1153. cuda/cccl/headers/include/cuda/std/__utility/convert_to_integral.h +101 -0
  1154. cuda/cccl/headers/include/cuda/std/__utility/declval.h +76 -0
  1155. cuda/cccl/headers/include/cuda/std/__utility/exception_guard.h +161 -0
  1156. cuda/cccl/headers/include/cuda/std/__utility/exchange.h +46 -0
  1157. cuda/cccl/headers/include/cuda/std/__utility/forward.h +59 -0
  1158. cuda/cccl/headers/include/cuda/std/__utility/forward_like.h +55 -0
  1159. cuda/cccl/headers/include/cuda/std/__utility/in_place.h +86 -0
  1160. cuda/cccl/headers/include/cuda/std/__utility/integer_sequence.h +249 -0
  1161. cuda/cccl/headers/include/cuda/std/__utility/monostate.h +99 -0
  1162. cuda/cccl/headers/include/cuda/std/__utility/move.h +74 -0
  1163. cuda/cccl/headers/include/cuda/std/__utility/pair.h +791 -0
  1164. cuda/cccl/headers/include/cuda/std/__utility/piecewise_construct.h +37 -0
  1165. cuda/cccl/headers/include/cuda/std/__utility/pod_tuple.h +527 -0
  1166. cuda/cccl/headers/include/cuda/std/__utility/priority_tag.h +40 -0
  1167. cuda/cccl/headers/include/cuda/std/__utility/rel_ops.h +61 -0
  1168. cuda/cccl/headers/include/cuda/std/__utility/swap.h +64 -0
  1169. cuda/cccl/headers/include/cuda/std/__utility/to_underlying.h +40 -0
  1170. cuda/cccl/headers/include/cuda/std/__utility/typeid.h +421 -0
  1171. cuda/cccl/headers/include/cuda/std/__utility/undefined.h +34 -0
  1172. cuda/cccl/headers/include/cuda/std/__utility/unreachable.h +37 -0
  1173. cuda/cccl/headers/include/cuda/std/__variant/bad_variant_access.h +74 -0
  1174. cuda/cccl/headers/include/cuda/std/__variant/comparison.h +207 -0
  1175. cuda/cccl/headers/include/cuda/std/__variant/get.h +192 -0
  1176. cuda/cccl/headers/include/cuda/std/__variant/hash.h +82 -0
  1177. cuda/cccl/headers/include/cuda/std/__variant/sfinae_helpers.h +89 -0
  1178. cuda/cccl/headers/include/cuda/std/__variant/variant.h +250 -0
  1179. cuda/cccl/headers/include/cuda/std/__variant/variant_access.h +70 -0
  1180. cuda/cccl/headers/include/cuda/std/__variant/variant_base.h +683 -0
  1181. cuda/cccl/headers/include/cuda/std/__variant/variant_constraints.h +135 -0
  1182. cuda/cccl/headers/include/cuda/std/__variant/variant_match.h +126 -0
  1183. cuda/cccl/headers/include/cuda/std/__variant/variant_traits.h +184 -0
  1184. cuda/cccl/headers/include/cuda/std/__variant/variant_visit.h +225 -0
  1185. cuda/cccl/headers/include/cuda/std/__variant/visit.h +148 -0
  1186. cuda/cccl/headers/include/cuda/std/array +518 -0
  1187. cuda/cccl/headers/include/cuda/std/atomic +810 -0
  1188. cuda/cccl/headers/include/cuda/std/barrier +42 -0
  1189. cuda/cccl/headers/include/cuda/std/bit +35 -0
  1190. cuda/cccl/headers/include/cuda/std/bitset +986 -0
  1191. cuda/cccl/headers/include/cuda/std/cassert +28 -0
  1192. cuda/cccl/headers/include/cuda/std/ccomplex +15 -0
  1193. cuda/cccl/headers/include/cuda/std/cfloat +59 -0
  1194. cuda/cccl/headers/include/cuda/std/chrono +26 -0
  1195. cuda/cccl/headers/include/cuda/std/climits +61 -0
  1196. cuda/cccl/headers/include/cuda/std/cmath +87 -0
  1197. cuda/cccl/headers/include/cuda/std/complex +50 -0
  1198. cuda/cccl/headers/include/cuda/std/concepts +48 -0
  1199. cuda/cccl/headers/include/cuda/std/cstddef +28 -0
  1200. cuda/cccl/headers/include/cuda/std/cstdint +178 -0
  1201. cuda/cccl/headers/include/cuda/std/cstdlib +30 -0
  1202. cuda/cccl/headers/include/cuda/std/cstring +110 -0
  1203. cuda/cccl/headers/include/cuda/std/ctime +154 -0
  1204. cuda/cccl/headers/include/cuda/std/detail/__config +45 -0
  1205. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +207 -0
  1206. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/algorithm +1718 -0
  1207. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/chrono +2506 -0
  1208. cuda/cccl/headers/include/cuda/std/execution +29 -0
  1209. cuda/cccl/headers/include/cuda/std/expected +30 -0
  1210. cuda/cccl/headers/include/cuda/std/functional +56 -0
  1211. cuda/cccl/headers/include/cuda/std/initializer_list +44 -0
  1212. cuda/cccl/headers/include/cuda/std/inplace_vector +2171 -0
  1213. cuda/cccl/headers/include/cuda/std/iterator +70 -0
  1214. cuda/cccl/headers/include/cuda/std/latch +34 -0
  1215. cuda/cccl/headers/include/cuda/std/limits +28 -0
  1216. cuda/cccl/headers/include/cuda/std/linalg +30 -0
  1217. cuda/cccl/headers/include/cuda/std/mdspan +38 -0
  1218. cuda/cccl/headers/include/cuda/std/memory +39 -0
  1219. cuda/cccl/headers/include/cuda/std/numbers +344 -0
  1220. cuda/cccl/headers/include/cuda/std/numeric +41 -0
  1221. cuda/cccl/headers/include/cuda/std/optional +31 -0
  1222. cuda/cccl/headers/include/cuda/std/ranges +69 -0
  1223. cuda/cccl/headers/include/cuda/std/ratio +416 -0
  1224. cuda/cccl/headers/include/cuda/std/semaphore +31 -0
  1225. cuda/cccl/headers/include/cuda/std/source_location +83 -0
  1226. cuda/cccl/headers/include/cuda/std/span +628 -0
  1227. cuda/cccl/headers/include/cuda/std/string_view +923 -0
  1228. cuda/cccl/headers/include/cuda/std/tuple +43 -0
  1229. cuda/cccl/headers/include/cuda/std/type_traits +176 -0
  1230. cuda/cccl/headers/include/cuda/std/utility +70 -0
  1231. cuda/cccl/headers/include/cuda/std/variant +32 -0
  1232. cuda/cccl/headers/include/cuda/std/version +240 -0
  1233. cuda/cccl/headers/include/cuda/stream +31 -0
  1234. cuda/cccl/headers/include/cuda/stream_ref +59 -0
  1235. cuda/cccl/headers/include/cuda/type_traits +27 -0
  1236. cuda/cccl/headers/include/cuda/utility +28 -0
  1237. cuda/cccl/headers/include/cuda/version +16 -0
  1238. cuda/cccl/headers/include/cuda/warp +28 -0
  1239. cuda/cccl/headers/include/cuda/work_stealing +26 -0
  1240. cuda/cccl/headers/include/nv/detail/__preprocessor +169 -0
  1241. cuda/cccl/headers/include/nv/detail/__target_macros +718 -0
  1242. cuda/cccl/headers/include/nv/target +236 -0
  1243. cuda/cccl/headers/include/thrust/addressof.h +22 -0
  1244. cuda/cccl/headers/include/thrust/adjacent_difference.h +254 -0
  1245. cuda/cccl/headers/include/thrust/advance.h +57 -0
  1246. cuda/cccl/headers/include/thrust/allocate_unique.h +299 -0
  1247. cuda/cccl/headers/include/thrust/binary_search.h +1910 -0
  1248. cuda/cccl/headers/include/thrust/complex.h +858 -0
  1249. cuda/cccl/headers/include/thrust/copy.h +506 -0
  1250. cuda/cccl/headers/include/thrust/count.h +245 -0
  1251. cuda/cccl/headers/include/thrust/detail/adjacent_difference.inl +108 -0
  1252. cuda/cccl/headers/include/thrust/detail/alignment.h +81 -0
  1253. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +624 -0
  1254. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +191 -0
  1255. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +95 -0
  1256. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +79 -0
  1257. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +76 -0
  1258. cuda/cccl/headers/include/thrust/detail/allocator/no_throw_allocator.h +74 -0
  1259. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +113 -0
  1260. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +114 -0
  1261. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.h +77 -0
  1262. cuda/cccl/headers/include/thrust/detail/allocator_aware_execution_policy.h +95 -0
  1263. cuda/cccl/headers/include/thrust/detail/binary_search.inl +537 -0
  1264. cuda/cccl/headers/include/thrust/detail/caching_allocator.h +47 -0
  1265. cuda/cccl/headers/include/thrust/detail/complex/arithmetic.h +250 -0
  1266. cuda/cccl/headers/include/thrust/detail/complex/c99math.h +58 -0
  1267. cuda/cccl/headers/include/thrust/detail/complex/catrig.h +869 -0
  1268. cuda/cccl/headers/include/thrust/detail/complex/catrigf.h +583 -0
  1269. cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +227 -0
  1270. cuda/cccl/headers/include/thrust/detail/complex/ccoshf.h +155 -0
  1271. cuda/cccl/headers/include/thrust/detail/complex/cexp.h +190 -0
  1272. cuda/cccl/headers/include/thrust/detail/complex/cexpf.h +167 -0
  1273. cuda/cccl/headers/include/thrust/detail/complex/clog.h +217 -0
  1274. cuda/cccl/headers/include/thrust/detail/complex/clogf.h +204 -0
  1275. cuda/cccl/headers/include/thrust/detail/complex/complex.inl +263 -0
  1276. cuda/cccl/headers/include/thrust/detail/complex/cpow.h +50 -0
  1277. cuda/cccl/headers/include/thrust/detail/complex/cproj.h +76 -0
  1278. cuda/cccl/headers/include/thrust/detail/complex/csinh.h +222 -0
  1279. cuda/cccl/headers/include/thrust/detail/complex/csinhf.h +162 -0
  1280. cuda/cccl/headers/include/thrust/detail/complex/csqrt.h +172 -0
  1281. cuda/cccl/headers/include/thrust/detail/complex/csqrtf.h +168 -0
  1282. cuda/cccl/headers/include/thrust/detail/complex/ctanh.h +202 -0
  1283. cuda/cccl/headers/include/thrust/detail/complex/ctanhf.h +127 -0
  1284. cuda/cccl/headers/include/thrust/detail/complex/math_private.h +132 -0
  1285. cuda/cccl/headers/include/thrust/detail/complex/stream.h +73 -0
  1286. cuda/cccl/headers/include/thrust/detail/config/compiler.h +38 -0
  1287. cuda/cccl/headers/include/thrust/detail/config/config.h +43 -0
  1288. cuda/cccl/headers/include/thrust/detail/config/cpp_dialect.h +78 -0
  1289. cuda/cccl/headers/include/thrust/detail/config/device_system.h +57 -0
  1290. cuda/cccl/headers/include/thrust/detail/config/host_system.h +50 -0
  1291. cuda/cccl/headers/include/thrust/detail/config/memory_resource.h +41 -0
  1292. cuda/cccl/headers/include/thrust/detail/config/namespace.h +161 -0
  1293. cuda/cccl/headers/include/thrust/detail/config/simple_defines.h +48 -0
  1294. cuda/cccl/headers/include/thrust/detail/config.h +36 -0
  1295. cuda/cccl/headers/include/thrust/detail/contiguous_storage.h +226 -0
  1296. cuda/cccl/headers/include/thrust/detail/contiguous_storage.inl +271 -0
  1297. cuda/cccl/headers/include/thrust/detail/copy.h +70 -0
  1298. cuda/cccl/headers/include/thrust/detail/copy.inl +139 -0
  1299. cuda/cccl/headers/include/thrust/detail/copy_if.h +62 -0
  1300. cuda/cccl/headers/include/thrust/detail/copy_if.inl +114 -0
  1301. cuda/cccl/headers/include/thrust/detail/count.h +55 -0
  1302. cuda/cccl/headers/include/thrust/detail/count.inl +101 -0
  1303. cuda/cccl/headers/include/thrust/detail/device_ptr.inl +48 -0
  1304. cuda/cccl/headers/include/thrust/detail/equal.inl +105 -0
  1305. cuda/cccl/headers/include/thrust/detail/event_error.h +160 -0
  1306. cuda/cccl/headers/include/thrust/detail/execute_with_allocator.h +79 -0
  1307. cuda/cccl/headers/include/thrust/detail/execute_with_allocator_fwd.h +61 -0
  1308. cuda/cccl/headers/include/thrust/detail/execution_policy.h +120 -0
  1309. cuda/cccl/headers/include/thrust/detail/extrema.inl +196 -0
  1310. cuda/cccl/headers/include/thrust/detail/fill.inl +98 -0
  1311. cuda/cccl/headers/include/thrust/detail/find.inl +125 -0
  1312. cuda/cccl/headers/include/thrust/detail/for_each.inl +96 -0
  1313. cuda/cccl/headers/include/thrust/detail/function.h +49 -0
  1314. cuda/cccl/headers/include/thrust/detail/functional/actor.h +211 -0
  1315. cuda/cccl/headers/include/thrust/detail/functional/operators.h +383 -0
  1316. cuda/cccl/headers/include/thrust/detail/gather.inl +185 -0
  1317. cuda/cccl/headers/include/thrust/detail/generate.inl +98 -0
  1318. cuda/cccl/headers/include/thrust/detail/get_iterator_value.h +60 -0
  1319. cuda/cccl/headers/include/thrust/detail/inner_product.inl +130 -0
  1320. cuda/cccl/headers/include/thrust/detail/internal_functional.h +329 -0
  1321. cuda/cccl/headers/include/thrust/detail/logical.inl +125 -0
  1322. cuda/cccl/headers/include/thrust/detail/malloc_and_free.h +89 -0
  1323. cuda/cccl/headers/include/thrust/detail/malloc_and_free_fwd.h +45 -0
  1324. cuda/cccl/headers/include/thrust/detail/memory_algorithms.h +209 -0
  1325. cuda/cccl/headers/include/thrust/detail/merge.inl +288 -0
  1326. cuda/cccl/headers/include/thrust/detail/mismatch.inl +106 -0
  1327. cuda/cccl/headers/include/thrust/detail/overlapped_copy.h +120 -0
  1328. cuda/cccl/headers/include/thrust/detail/partition.inl +390 -0
  1329. cuda/cccl/headers/include/thrust/detail/pointer.h +309 -0
  1330. cuda/cccl/headers/include/thrust/detail/preprocessor.h +652 -0
  1331. cuda/cccl/headers/include/thrust/detail/random_bijection.h +175 -0
  1332. cuda/cccl/headers/include/thrust/detail/range/head_flags.h +114 -0
  1333. cuda/cccl/headers/include/thrust/detail/range/tail_flags.h +128 -0
  1334. cuda/cccl/headers/include/thrust/detail/raw_pointer_cast.h +52 -0
  1335. cuda/cccl/headers/include/thrust/detail/raw_reference_cast.h +186 -0
  1336. cuda/cccl/headers/include/thrust/detail/reduce.inl +395 -0
  1337. cuda/cccl/headers/include/thrust/detail/reference.h +518 -0
  1338. cuda/cccl/headers/include/thrust/detail/reference_forward_declaration.h +35 -0
  1339. cuda/cccl/headers/include/thrust/detail/remove.inl +225 -0
  1340. cuda/cccl/headers/include/thrust/detail/replace.inl +243 -0
  1341. cuda/cccl/headers/include/thrust/detail/reverse.inl +100 -0
  1342. cuda/cccl/headers/include/thrust/detail/scan.inl +536 -0
  1343. cuda/cccl/headers/include/thrust/detail/scatter.inl +169 -0
  1344. cuda/cccl/headers/include/thrust/detail/seq.h +66 -0
  1345. cuda/cccl/headers/include/thrust/detail/sequence.inl +121 -0
  1346. cuda/cccl/headers/include/thrust/detail/set_operations.inl +993 -0
  1347. cuda/cccl/headers/include/thrust/detail/shuffle.inl +86 -0
  1348. cuda/cccl/headers/include/thrust/detail/sort.inl +385 -0
  1349. cuda/cccl/headers/include/thrust/detail/static_assert.h +56 -0
  1350. cuda/cccl/headers/include/thrust/detail/static_map.h +164 -0
  1351. cuda/cccl/headers/include/thrust/detail/swap_ranges.inl +77 -0
  1352. cuda/cccl/headers/include/thrust/detail/tabulate.inl +74 -0
  1353. cuda/cccl/headers/include/thrust/detail/temporary_array.h +149 -0
  1354. cuda/cccl/headers/include/thrust/detail/temporary_array.inl +119 -0
  1355. cuda/cccl/headers/include/thrust/detail/temporary_buffer.h +92 -0
  1356. cuda/cccl/headers/include/thrust/detail/transform_reduce.inl +81 -0
  1357. cuda/cccl/headers/include/thrust/detail/transform_scan.inl +173 -0
  1358. cuda/cccl/headers/include/thrust/detail/trivial_sequence.h +128 -0
  1359. cuda/cccl/headers/include/thrust/detail/tuple_meta_transform.h +59 -0
  1360. cuda/cccl/headers/include/thrust/detail/type_deduction.h +62 -0
  1361. cuda/cccl/headers/include/thrust/detail/type_traits/has_member_function.h +47 -0
  1362. cuda/cccl/headers/include/thrust/detail/type_traits/has_nested_type.h +43 -0
  1363. cuda/cccl/headers/include/thrust/detail/type_traits/is_call_possible.h +162 -0
  1364. cuda/cccl/headers/include/thrust/detail/type_traits/is_commutative.h +67 -0
  1365. cuda/cccl/headers/include/thrust/detail/type_traits/is_metafunction_defined.h +39 -0
  1366. cuda/cccl/headers/include/thrust/detail/type_traits/is_thrust_pointer.h +55 -0
  1367. cuda/cccl/headers/include/thrust/detail/type_traits/iterator/is_output_iterator.h +46 -0
  1368. cuda/cccl/headers/include/thrust/detail/type_traits/minimum_type.h +89 -0
  1369. cuda/cccl/headers/include/thrust/detail/type_traits/pointer_traits.h +328 -0
  1370. cuda/cccl/headers/include/thrust/detail/type_traits.h +136 -0
  1371. cuda/cccl/headers/include/thrust/detail/uninitialized_copy.inl +102 -0
  1372. cuda/cccl/headers/include/thrust/detail/uninitialized_fill.inl +98 -0
  1373. cuda/cccl/headers/include/thrust/detail/unique.inl +391 -0
  1374. cuda/cccl/headers/include/thrust/detail/use_default.h +34 -0
  1375. cuda/cccl/headers/include/thrust/detail/vector_base.h +611 -0
  1376. cuda/cccl/headers/include/thrust/detail/vector_base.inl +1208 -0
  1377. cuda/cccl/headers/include/thrust/device_allocator.h +134 -0
  1378. cuda/cccl/headers/include/thrust/device_delete.h +74 -0
  1379. cuda/cccl/headers/include/thrust/device_free.h +85 -0
  1380. cuda/cccl/headers/include/thrust/device_make_unique.h +56 -0
  1381. cuda/cccl/headers/include/thrust/device_malloc.h +84 -0
  1382. cuda/cccl/headers/include/thrust/device_malloc_allocator.h +190 -0
  1383. cuda/cccl/headers/include/thrust/device_new.h +112 -0
  1384. cuda/cccl/headers/include/thrust/device_new_allocator.h +179 -0
  1385. cuda/cccl/headers/include/thrust/device_ptr.h +196 -0
  1386. cuda/cccl/headers/include/thrust/device_reference.h +983 -0
  1387. cuda/cccl/headers/include/thrust/device_vector.h +576 -0
  1388. cuda/cccl/headers/include/thrust/distance.h +43 -0
  1389. cuda/cccl/headers/include/thrust/equal.h +247 -0
  1390. cuda/cccl/headers/include/thrust/execution_policy.h +252 -0
  1391. cuda/cccl/headers/include/thrust/extrema.h +657 -0
  1392. cuda/cccl/headers/include/thrust/fill.h +200 -0
  1393. cuda/cccl/headers/include/thrust/find.h +382 -0
  1394. cuda/cccl/headers/include/thrust/for_each.h +261 -0
  1395. cuda/cccl/headers/include/thrust/functional.h +393 -0
  1396. cuda/cccl/headers/include/thrust/gather.h +464 -0
  1397. cuda/cccl/headers/include/thrust/generate.h +193 -0
  1398. cuda/cccl/headers/include/thrust/host_vector.h +576 -0
  1399. cuda/cccl/headers/include/thrust/inner_product.h +264 -0
  1400. cuda/cccl/headers/include/thrust/iterator/constant_iterator.h +221 -0
  1401. cuda/cccl/headers/include/thrust/iterator/counting_iterator.h +335 -0
  1402. cuda/cccl/headers/include/thrust/iterator/detail/any_assign.h +48 -0
  1403. cuda/cccl/headers/include/thrust/iterator/detail/any_system_tag.h +43 -0
  1404. cuda/cccl/headers/include/thrust/iterator/detail/device_system_tag.h +43 -0
  1405. cuda/cccl/headers/include/thrust/iterator/detail/host_system_tag.h +42 -0
  1406. cuda/cccl/headers/include/thrust/iterator/detail/iterator_adaptor_base.h +80 -0
  1407. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_system.h +60 -0
  1408. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_traversal.h +65 -0
  1409. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h +56 -0
  1410. cuda/cccl/headers/include/thrust/iterator/detail/iterator_facade_category.h +181 -0
  1411. cuda/cccl/headers/include/thrust/iterator/detail/minimum_system.h +57 -0
  1412. cuda/cccl/headers/include/thrust/iterator/detail/normal_iterator.h +69 -0
  1413. cuda/cccl/headers/include/thrust/iterator/detail/retag.h +104 -0
  1414. cuda/cccl/headers/include/thrust/iterator/detail/tagged_iterator.h +80 -0
  1415. cuda/cccl/headers/include/thrust/iterator/detail/tuple_of_iterator_references.h +170 -0
  1416. cuda/cccl/headers/include/thrust/iterator/discard_iterator.h +163 -0
  1417. cuda/cccl/headers/include/thrust/iterator/iterator_adaptor.h +251 -0
  1418. cuda/cccl/headers/include/thrust/iterator/iterator_categories.h +211 -0
  1419. cuda/cccl/headers/include/thrust/iterator/iterator_facade.h +659 -0
  1420. cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +334 -0
  1421. cuda/cccl/headers/include/thrust/iterator/iterator_traversal_tags.h +64 -0
  1422. cuda/cccl/headers/include/thrust/iterator/offset_iterator.h +194 -0
  1423. cuda/cccl/headers/include/thrust/iterator/permutation_iterator.h +204 -0
  1424. cuda/cccl/headers/include/thrust/iterator/retag.h +72 -0
  1425. cuda/cccl/headers/include/thrust/iterator/reverse_iterator.h +51 -0
  1426. cuda/cccl/headers/include/thrust/iterator/shuffle_iterator.h +185 -0
  1427. cuda/cccl/headers/include/thrust/iterator/strided_iterator.h +152 -0
  1428. cuda/cccl/headers/include/thrust/iterator/tabulate_output_iterator.h +152 -0
  1429. cuda/cccl/headers/include/thrust/iterator/transform_input_output_iterator.h +225 -0
  1430. cuda/cccl/headers/include/thrust/iterator/transform_iterator.h +349 -0
  1431. cuda/cccl/headers/include/thrust/iterator/transform_output_iterator.h +190 -0
  1432. cuda/cccl/headers/include/thrust/iterator/zip_iterator.h +359 -0
  1433. cuda/cccl/headers/include/thrust/logical.h +290 -0
  1434. cuda/cccl/headers/include/thrust/memory.h +299 -0
  1435. cuda/cccl/headers/include/thrust/merge.h +725 -0
  1436. cuda/cccl/headers/include/thrust/mismatch.h +261 -0
  1437. cuda/cccl/headers/include/thrust/mr/allocator.h +227 -0
  1438. cuda/cccl/headers/include/thrust/mr/device_memory_resource.h +46 -0
  1439. cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +526 -0
  1440. cuda/cccl/headers/include/thrust/mr/disjoint_sync_pool.h +116 -0
  1441. cuda/cccl/headers/include/thrust/mr/disjoint_tls_pool.h +65 -0
  1442. cuda/cccl/headers/include/thrust/mr/fancy_pointer_resource.h +65 -0
  1443. cuda/cccl/headers/include/thrust/mr/host_memory_resource.h +42 -0
  1444. cuda/cccl/headers/include/thrust/mr/memory_resource.h +215 -0
  1445. cuda/cccl/headers/include/thrust/mr/new.h +98 -0
  1446. cuda/cccl/headers/include/thrust/mr/polymorphic_adaptor.h +61 -0
  1447. cuda/cccl/headers/include/thrust/mr/pool.h +526 -0
  1448. cuda/cccl/headers/include/thrust/mr/pool_options.h +172 -0
  1449. cuda/cccl/headers/include/thrust/mr/sync_pool.h +112 -0
  1450. cuda/cccl/headers/include/thrust/mr/tls_pool.h +62 -0
  1451. cuda/cccl/headers/include/thrust/mr/universal_memory_resource.h +29 -0
  1452. cuda/cccl/headers/include/thrust/mr/validator.h +54 -0
  1453. cuda/cccl/headers/include/thrust/pair.h +99 -0
  1454. cuda/cccl/headers/include/thrust/partition.h +1391 -0
  1455. cuda/cccl/headers/include/thrust/per_device_resource.h +110 -0
  1456. cuda/cccl/headers/include/thrust/random/detail/discard_block_engine.inl +182 -0
  1457. cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine.inl +153 -0
  1458. cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine_discard.h +97 -0
  1459. cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine.inl +149 -0
  1460. cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h +46 -0
  1461. cuda/cccl/headers/include/thrust/random/detail/mod.h +94 -0
  1462. cuda/cccl/headers/include/thrust/random/detail/normal_distribution.inl +185 -0
  1463. cuda/cccl/headers/include/thrust/random/detail/normal_distribution_base.h +155 -0
  1464. cuda/cccl/headers/include/thrust/random/detail/random_core_access.h +56 -0
  1465. cuda/cccl/headers/include/thrust/random/detail/subtract_with_carry_engine.inl +199 -0
  1466. cuda/cccl/headers/include/thrust/random/detail/uniform_int_distribution.inl +196 -0
  1467. cuda/cccl/headers/include/thrust/random/detail/uniform_real_distribution.inl +198 -0
  1468. cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine.inl +181 -0
  1469. cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine_max.h +187 -0
  1470. cuda/cccl/headers/include/thrust/random/discard_block_engine.h +238 -0
  1471. cuda/cccl/headers/include/thrust/random/linear_congruential_engine.h +287 -0
  1472. cuda/cccl/headers/include/thrust/random/linear_feedback_shift_engine.h +215 -0
  1473. cuda/cccl/headers/include/thrust/random/normal_distribution.h +255 -0
  1474. cuda/cccl/headers/include/thrust/random/subtract_with_carry_engine.h +245 -0
  1475. cuda/cccl/headers/include/thrust/random/uniform_int_distribution.h +259 -0
  1476. cuda/cccl/headers/include/thrust/random/uniform_real_distribution.h +256 -0
  1477. cuda/cccl/headers/include/thrust/random/xor_combine_engine.h +253 -0
  1478. cuda/cccl/headers/include/thrust/random.h +118 -0
  1479. cuda/cccl/headers/include/thrust/reduce.h +1113 -0
  1480. cuda/cccl/headers/include/thrust/remove.h +768 -0
  1481. cuda/cccl/headers/include/thrust/replace.h +826 -0
  1482. cuda/cccl/headers/include/thrust/reverse.h +215 -0
  1483. cuda/cccl/headers/include/thrust/scan.h +1671 -0
  1484. cuda/cccl/headers/include/thrust/scatter.h +446 -0
  1485. cuda/cccl/headers/include/thrust/sequence.h +277 -0
  1486. cuda/cccl/headers/include/thrust/set_operations.h +3026 -0
  1487. cuda/cccl/headers/include/thrust/shuffle.h +182 -0
  1488. cuda/cccl/headers/include/thrust/sort.h +1320 -0
  1489. cuda/cccl/headers/include/thrust/swap.h +147 -0
  1490. cuda/cccl/headers/include/thrust/system/cpp/detail/adjacent_difference.h +30 -0
  1491. cuda/cccl/headers/include/thrust/system/cpp/detail/assign_value.h +30 -0
  1492. cuda/cccl/headers/include/thrust/system/cpp/detail/binary_search.h +32 -0
  1493. cuda/cccl/headers/include/thrust/system/cpp/detail/copy.h +30 -0
  1494. cuda/cccl/headers/include/thrust/system/cpp/detail/copy_if.h +30 -0
  1495. cuda/cccl/headers/include/thrust/system/cpp/detail/count.h +29 -0
  1496. cuda/cccl/headers/include/thrust/system/cpp/detail/equal.h +29 -0
  1497. cuda/cccl/headers/include/thrust/system/cpp/detail/execution_policy.h +113 -0
  1498. cuda/cccl/headers/include/thrust/system/cpp/detail/extrema.h +30 -0
  1499. cuda/cccl/headers/include/thrust/system/cpp/detail/fill.h +29 -0
  1500. cuda/cccl/headers/include/thrust/system/cpp/detail/find.h +30 -0
  1501. cuda/cccl/headers/include/thrust/system/cpp/detail/for_each.h +30 -0
  1502. cuda/cccl/headers/include/thrust/system/cpp/detail/gather.h +29 -0
  1503. cuda/cccl/headers/include/thrust/system/cpp/detail/generate.h +29 -0
  1504. cuda/cccl/headers/include/thrust/system/cpp/detail/get_value.h +30 -0
  1505. cuda/cccl/headers/include/thrust/system/cpp/detail/inner_product.h +29 -0
  1506. cuda/cccl/headers/include/thrust/system/cpp/detail/iter_swap.h +30 -0
  1507. cuda/cccl/headers/include/thrust/system/cpp/detail/logical.h +29 -0
  1508. cuda/cccl/headers/include/thrust/system/cpp/detail/malloc_and_free.h +30 -0
  1509. cuda/cccl/headers/include/thrust/system/cpp/detail/memory.inl +55 -0
  1510. cuda/cccl/headers/include/thrust/system/cpp/detail/merge.h +30 -0
  1511. cuda/cccl/headers/include/thrust/system/cpp/detail/mismatch.h +29 -0
  1512. cuda/cccl/headers/include/thrust/system/cpp/detail/partition.h +30 -0
  1513. cuda/cccl/headers/include/thrust/system/cpp/detail/per_device_resource.h +29 -0
  1514. cuda/cccl/headers/include/thrust/system/cpp/detail/reduce.h +30 -0
  1515. cuda/cccl/headers/include/thrust/system/cpp/detail/reduce_by_key.h +30 -0
  1516. cuda/cccl/headers/include/thrust/system/cpp/detail/remove.h +30 -0
  1517. cuda/cccl/headers/include/thrust/system/cpp/detail/replace.h +29 -0
  1518. cuda/cccl/headers/include/thrust/system/cpp/detail/reverse.h +29 -0
  1519. cuda/cccl/headers/include/thrust/system/cpp/detail/scan.h +30 -0
  1520. cuda/cccl/headers/include/thrust/system/cpp/detail/scan_by_key.h +30 -0
  1521. cuda/cccl/headers/include/thrust/system/cpp/detail/scatter.h +29 -0
  1522. cuda/cccl/headers/include/thrust/system/cpp/detail/sequence.h +29 -0
  1523. cuda/cccl/headers/include/thrust/system/cpp/detail/set_operations.h +30 -0
  1524. cuda/cccl/headers/include/thrust/system/cpp/detail/sort.h +30 -0
  1525. cuda/cccl/headers/include/thrust/system/cpp/detail/swap_ranges.h +29 -0
  1526. cuda/cccl/headers/include/thrust/system/cpp/detail/tabulate.h +29 -0
  1527. cuda/cccl/headers/include/thrust/system/cpp/detail/temporary_buffer.h +29 -0
  1528. cuda/cccl/headers/include/thrust/system/cpp/detail/transform.h +29 -0
  1529. cuda/cccl/headers/include/thrust/system/cpp/detail/transform_reduce.h +29 -0
  1530. cuda/cccl/headers/include/thrust/system/cpp/detail/transform_scan.h +29 -0
  1531. cuda/cccl/headers/include/thrust/system/cpp/detail/uninitialized_copy.h +29 -0
  1532. cuda/cccl/headers/include/thrust/system/cpp/detail/uninitialized_fill.h +29 -0
  1533. cuda/cccl/headers/include/thrust/system/cpp/detail/unique.h +30 -0
  1534. cuda/cccl/headers/include/thrust/system/cpp/detail/unique_by_key.h +30 -0
  1535. cuda/cccl/headers/include/thrust/system/cpp/execution_policy.h +63 -0
  1536. cuda/cccl/headers/include/thrust/system/cpp/memory.h +105 -0
  1537. cuda/cccl/headers/include/thrust/system/cpp/memory_resource.h +70 -0
  1538. cuda/cccl/headers/include/thrust/system/cpp/pointer.h +118 -0
  1539. cuda/cccl/headers/include/thrust/system/cpp/vector.h +95 -0
  1540. cuda/cccl/headers/include/thrust/system/cuda/config.h +126 -0
  1541. cuda/cccl/headers/include/thrust/system/cuda/detail/adjacent_difference.h +215 -0
  1542. cuda/cccl/headers/include/thrust/system/cuda/detail/assign_value.h +124 -0
  1543. cuda/cccl/headers/include/thrust/system/cuda/detail/binary_search.h +29 -0
  1544. cuda/cccl/headers/include/thrust/system/cuda/detail/cdp_dispatch.h +72 -0
  1545. cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +272 -0
  1546. cuda/cccl/headers/include/thrust/system/cuda/detail/copy_if.h +251 -0
  1547. cuda/cccl/headers/include/thrust/system/cuda/detail/core/agent_launcher.h +282 -0
  1548. cuda/cccl/headers/include/thrust/system/cuda/detail/core/triple_chevron_launch.h +163 -0
  1549. cuda/cccl/headers/include/thrust/system/cuda/detail/core/util.h +586 -0
  1550. cuda/cccl/headers/include/thrust/system/cuda/detail/count.h +73 -0
  1551. cuda/cccl/headers/include/thrust/system/cuda/detail/cross_system.h +241 -0
  1552. cuda/cccl/headers/include/thrust/system/cuda/detail/dispatch.h +231 -0
  1553. cuda/cccl/headers/include/thrust/system/cuda/detail/equal.h +62 -0
  1554. cuda/cccl/headers/include/thrust/system/cuda/detail/error.inl +87 -0
  1555. cuda/cccl/headers/include/thrust/system/cuda/detail/execution_policy.h +266 -0
  1556. cuda/cccl/headers/include/thrust/system/cuda/detail/extrema.h +472 -0
  1557. cuda/cccl/headers/include/thrust/system/cuda/detail/fill.h +99 -0
  1558. cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +165 -0
  1559. cuda/cccl/headers/include/thrust/system/cuda/detail/for_each.h +82 -0
  1560. cuda/cccl/headers/include/thrust/system/cuda/detail/gather.h +89 -0
  1561. cuda/cccl/headers/include/thrust/system/cuda/detail/generate.h +58 -0
  1562. cuda/cccl/headers/include/thrust/system/cuda/detail/get_value.h +65 -0
  1563. cuda/cccl/headers/include/thrust/system/cuda/detail/inner_product.h +75 -0
  1564. cuda/cccl/headers/include/thrust/system/cuda/detail/iter_swap.h +79 -0
  1565. cuda/cccl/headers/include/thrust/system/cuda/detail/logical.h +29 -0
  1566. cuda/cccl/headers/include/thrust/system/cuda/detail/make_unsigned_special.h +55 -0
  1567. cuda/cccl/headers/include/thrust/system/cuda/detail/malloc_and_free.h +119 -0
  1568. cuda/cccl/headers/include/thrust/system/cuda/detail/memory.inl +55 -0
  1569. cuda/cccl/headers/include/thrust/system/cuda/detail/merge.h +204 -0
  1570. cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +88 -0
  1571. cuda/cccl/headers/include/thrust/system/cuda/detail/parallel_for.h +79 -0
  1572. cuda/cccl/headers/include/thrust/system/cuda/detail/partition.h +401 -0
  1573. cuda/cccl/headers/include/thrust/system/cuda/detail/per_device_resource.h +70 -0
  1574. cuda/cccl/headers/include/thrust/system/cuda/detail/reduce.h +780 -0
  1575. cuda/cccl/headers/include/thrust/system/cuda/detail/reduce_by_key.h +997 -0
  1576. cuda/cccl/headers/include/thrust/system/cuda/detail/remove.h +105 -0
  1577. cuda/cccl/headers/include/thrust/system/cuda/detail/replace.h +121 -0
  1578. cuda/cccl/headers/include/thrust/system/cuda/detail/reverse.h +83 -0
  1579. cuda/cccl/headers/include/thrust/system/cuda/detail/scan.h +338 -0
  1580. cuda/cccl/headers/include/thrust/system/cuda/detail/scan_by_key.h +411 -0
  1581. cuda/cccl/headers/include/thrust/system/cuda/detail/scatter.h +89 -0
  1582. cuda/cccl/headers/include/thrust/system/cuda/detail/sequence.h +29 -0
  1583. cuda/cccl/headers/include/thrust/system/cuda/detail/set_operations.h +1732 -0
  1584. cuda/cccl/headers/include/thrust/system/cuda/detail/sort.h +468 -0
  1585. cuda/cccl/headers/include/thrust/system/cuda/detail/swap_ranges.h +98 -0
  1586. cuda/cccl/headers/include/thrust/system/cuda/detail/tabulate.h +61 -0
  1587. cuda/cccl/headers/include/thrust/system/cuda/detail/temporary_buffer.h +130 -0
  1588. cuda/cccl/headers/include/thrust/system/cuda/detail/terminate.h +53 -0
  1589. cuda/cccl/headers/include/thrust/system/cuda/detail/transform.h +428 -0
  1590. cuda/cccl/headers/include/thrust/system/cuda/detail/transform_reduce.h +139 -0
  1591. cuda/cccl/headers/include/thrust/system/cuda/detail/transform_scan.h +117 -0
  1592. cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_copy.h +111 -0
  1593. cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_fill.h +100 -0
  1594. cuda/cccl/headers/include/thrust/system/cuda/detail/unique.h +286 -0
  1595. cuda/cccl/headers/include/thrust/system/cuda/detail/unique_by_key.h +306 -0
  1596. cuda/cccl/headers/include/thrust/system/cuda/detail/util.h +253 -0
  1597. cuda/cccl/headers/include/thrust/system/cuda/error.h +159 -0
  1598. cuda/cccl/headers/include/thrust/system/cuda/execution_policy.h +15 -0
  1599. cuda/cccl/headers/include/thrust/system/cuda/memory.h +118 -0
  1600. cuda/cccl/headers/include/thrust/system/cuda/memory_resource.h +115 -0
  1601. cuda/cccl/headers/include/thrust/system/cuda/pointer.h +155 -0
  1602. cuda/cccl/headers/include/thrust/system/cuda/vector.h +104 -0
  1603. cuda/cccl/headers/include/thrust/system/detail/bad_alloc.h +59 -0
  1604. cuda/cccl/headers/include/thrust/system/detail/errno.h +118 -0
  1605. cuda/cccl/headers/include/thrust/system/detail/error_category.inl +298 -0
  1606. cuda/cccl/headers/include/thrust/system/detail/error_code.inl +171 -0
  1607. cuda/cccl/headers/include/thrust/system/detail/error_condition.inl +119 -0
  1608. cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.h +51 -0
  1609. cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.inl +77 -0
  1610. cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.h +159 -0
  1611. cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.inl +381 -0
  1612. cuda/cccl/headers/include/thrust/system/detail/generic/copy.h +43 -0
  1613. cuda/cccl/headers/include/thrust/system/detail/generic/copy.inl +62 -0
  1614. cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.h +56 -0
  1615. cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.inl +143 -0
  1616. cuda/cccl/headers/include/thrust/system/detail/generic/count.h +46 -0
  1617. cuda/cccl/headers/include/thrust/system/detail/generic/count.inl +82 -0
  1618. cuda/cccl/headers/include/thrust/system/detail/generic/equal.h +47 -0
  1619. cuda/cccl/headers/include/thrust/system/detail/generic/equal.inl +58 -0
  1620. cuda/cccl/headers/include/thrust/system/detail/generic/extrema.h +64 -0
  1621. cuda/cccl/headers/include/thrust/system/detail/generic/extrema.inl +249 -0
  1622. cuda/cccl/headers/include/thrust/system/detail/generic/fill.h +52 -0
  1623. cuda/cccl/headers/include/thrust/system/detail/generic/find.h +47 -0
  1624. cuda/cccl/headers/include/thrust/system/detail/generic/find.inl +135 -0
  1625. cuda/cccl/headers/include/thrust/system/detail/generic/for_each.h +56 -0
  1626. cuda/cccl/headers/include/thrust/system/detail/generic/gather.h +71 -0
  1627. cuda/cccl/headers/include/thrust/system/detail/generic/gather.inl +94 -0
  1628. cuda/cccl/headers/include/thrust/system/detail/generic/generate.h +43 -0
  1629. cuda/cccl/headers/include/thrust/system/detail/generic/generate.inl +63 -0
  1630. cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.h +58 -0
  1631. cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.inl +70 -0
  1632. cuda/cccl/headers/include/thrust/system/detail/generic/logical.h +57 -0
  1633. cuda/cccl/headers/include/thrust/system/detail/generic/memory.h +62 -0
  1634. cuda/cccl/headers/include/thrust/system/detail/generic/memory.inl +83 -0
  1635. cuda/cccl/headers/include/thrust/system/detail/generic/merge.h +97 -0
  1636. cuda/cccl/headers/include/thrust/system/detail/generic/merge.inl +146 -0
  1637. cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.h +47 -0
  1638. cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.inl +66 -0
  1639. cuda/cccl/headers/include/thrust/system/detail/generic/partition.h +127 -0
  1640. cuda/cccl/headers/include/thrust/system/detail/generic/partition.inl +205 -0
  1641. cuda/cccl/headers/include/thrust/system/detail/generic/per_device_resource.h +41 -0
  1642. cuda/cccl/headers/include/thrust/system/detail/generic/reduce.h +69 -0
  1643. cuda/cccl/headers/include/thrust/system/detail/generic/reduce.inl +98 -0
  1644. cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.h +81 -0
  1645. cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.inl +183 -0
  1646. cuda/cccl/headers/include/thrust/system/detail/generic/remove.h +84 -0
  1647. cuda/cccl/headers/include/thrust/system/detail/generic/remove.inl +119 -0
  1648. cuda/cccl/headers/include/thrust/system/detail/generic/replace.h +93 -0
  1649. cuda/cccl/headers/include/thrust/system/detail/generic/replace.inl +172 -0
  1650. cuda/cccl/headers/include/thrust/system/detail/generic/reverse.h +46 -0
  1651. cuda/cccl/headers/include/thrust/system/detail/generic/reverse.inl +65 -0
  1652. cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.h +61 -0
  1653. cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.inl +124 -0
  1654. cuda/cccl/headers/include/thrust/system/detail/generic/scan.h +86 -0
  1655. cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.h +124 -0
  1656. cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.inl +229 -0
  1657. cuda/cccl/headers/include/thrust/system/detail/generic/scatter.h +71 -0
  1658. cuda/cccl/headers/include/thrust/system/detail/generic/scatter.inl +83 -0
  1659. cuda/cccl/headers/include/thrust/system/detail/generic/select_system.h +103 -0
  1660. cuda/cccl/headers/include/thrust/system/detail/generic/sequence.h +70 -0
  1661. cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.h +280 -0
  1662. cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.inl +474 -0
  1663. cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.h +52 -0
  1664. cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.inl +123 -0
  1665. cuda/cccl/headers/include/thrust/system/detail/generic/sort.h +111 -0
  1666. cuda/cccl/headers/include/thrust/system/detail/generic/sort.inl +173 -0
  1667. cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.h +42 -0
  1668. cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.inl +73 -0
  1669. cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.h +39 -0
  1670. cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.inl +52 -0
  1671. cuda/cccl/headers/include/thrust/system/detail/generic/tag.h +45 -0
  1672. cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.h +52 -0
  1673. cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.inl +80 -0
  1674. cuda/cccl/headers/include/thrust/system/detail/generic/transform.h +393 -0
  1675. cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.h +48 -0
  1676. cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.inl +54 -0
  1677. cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.h +78 -0
  1678. cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.inl +111 -0
  1679. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.h +43 -0
  1680. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.inl +164 -0
  1681. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.h +43 -0
  1682. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.inl +112 -0
  1683. cuda/cccl/headers/include/thrust/system/detail/generic/unique.h +69 -0
  1684. cuda/cccl/headers/include/thrust/system/detail/generic/unique.inl +111 -0
  1685. cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.h +79 -0
  1686. cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.inl +124 -0
  1687. cuda/cccl/headers/include/thrust/system/detail/internal/decompose.h +115 -0
  1688. cuda/cccl/headers/include/thrust/system/detail/sequential/adjacent_difference.h +68 -0
  1689. cuda/cccl/headers/include/thrust/system/detail/sequential/assign_value.h +40 -0
  1690. cuda/cccl/headers/include/thrust/system/detail/sequential/binary_search.h +134 -0
  1691. cuda/cccl/headers/include/thrust/system/detail/sequential/copy.h +120 -0
  1692. cuda/cccl/headers/include/thrust/system/detail/sequential/copy_backward.h +47 -0
  1693. cuda/cccl/headers/include/thrust/system/detail/sequential/copy_if.h +69 -0
  1694. cuda/cccl/headers/include/thrust/system/detail/sequential/count.h +29 -0
  1695. cuda/cccl/headers/include/thrust/system/detail/sequential/equal.h +29 -0
  1696. cuda/cccl/headers/include/thrust/system/detail/sequential/execution_policy.h +52 -0
  1697. cuda/cccl/headers/include/thrust/system/detail/sequential/extrema.h +108 -0
  1698. cuda/cccl/headers/include/thrust/system/detail/sequential/fill.h +29 -0
  1699. cuda/cccl/headers/include/thrust/system/detail/sequential/find.h +60 -0
  1700. cuda/cccl/headers/include/thrust/system/detail/sequential/for_each.h +72 -0
  1701. cuda/cccl/headers/include/thrust/system/detail/sequential/gather.h +29 -0
  1702. cuda/cccl/headers/include/thrust/system/detail/sequential/general_copy.h +120 -0
  1703. cuda/cccl/headers/include/thrust/system/detail/sequential/generate.h +29 -0
  1704. cuda/cccl/headers/include/thrust/system/detail/sequential/get_value.h +41 -0
  1705. cuda/cccl/headers/include/thrust/system/detail/sequential/inner_product.h +29 -0
  1706. cuda/cccl/headers/include/thrust/system/detail/sequential/insertion_sort.h +139 -0
  1707. cuda/cccl/headers/include/thrust/system/detail/sequential/iter_swap.h +43 -0
  1708. cuda/cccl/headers/include/thrust/system/detail/sequential/logical.h +29 -0
  1709. cuda/cccl/headers/include/thrust/system/detail/sequential/malloc_and_free.h +48 -0
  1710. cuda/cccl/headers/include/thrust/system/detail/sequential/merge.h +147 -0
  1711. cuda/cccl/headers/include/thrust/system/detail/sequential/mismatch.h +29 -0
  1712. cuda/cccl/headers/include/thrust/system/detail/sequential/partition.h +297 -0
  1713. cuda/cccl/headers/include/thrust/system/detail/sequential/per_device_resource.h +29 -0
  1714. cuda/cccl/headers/include/thrust/system/detail/sequential/reduce.h +62 -0
  1715. cuda/cccl/headers/include/thrust/system/detail/sequential/reduce_by_key.h +96 -0
  1716. cuda/cccl/headers/include/thrust/system/detail/sequential/remove.h +177 -0
  1717. cuda/cccl/headers/include/thrust/system/detail/sequential/replace.h +29 -0
  1718. cuda/cccl/headers/include/thrust/system/detail/sequential/reverse.h +29 -0
  1719. cuda/cccl/headers/include/thrust/system/detail/sequential/scan.h +152 -0
  1720. cuda/cccl/headers/include/thrust/system/detail/sequential/scan_by_key.h +143 -0
  1721. cuda/cccl/headers/include/thrust/system/detail/sequential/scatter.h +29 -0
  1722. cuda/cccl/headers/include/thrust/system/detail/sequential/sequence.h +29 -0
  1723. cuda/cccl/headers/include/thrust/system/detail/sequential/set_operations.h +204 -0
  1724. cuda/cccl/headers/include/thrust/system/detail/sequential/sort.h +120 -0
  1725. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.h +354 -0
  1726. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.h +121 -0
  1727. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.h +584 -0
  1728. cuda/cccl/headers/include/thrust/system/detail/sequential/swap_ranges.h +29 -0
  1729. cuda/cccl/headers/include/thrust/system/detail/sequential/tabulate.h +29 -0
  1730. cuda/cccl/headers/include/thrust/system/detail/sequential/temporary_buffer.h +29 -0
  1731. cuda/cccl/headers/include/thrust/system/detail/sequential/transform.h +29 -0
  1732. cuda/cccl/headers/include/thrust/system/detail/sequential/transform_reduce.h +29 -0
  1733. cuda/cccl/headers/include/thrust/system/detail/sequential/transform_scan.h +29 -0
  1734. cuda/cccl/headers/include/thrust/system/detail/sequential/trivial_copy.h +56 -0
  1735. cuda/cccl/headers/include/thrust/system/detail/sequential/uninitialized_copy.h +29 -0
  1736. cuda/cccl/headers/include/thrust/system/detail/sequential/uninitialized_fill.h +29 -0
  1737. cuda/cccl/headers/include/thrust/system/detail/sequential/unique.h +113 -0
  1738. cuda/cccl/headers/include/thrust/system/detail/sequential/unique_by_key.h +104 -0
  1739. cuda/cccl/headers/include/thrust/system/detail/system_error.inl +106 -0
  1740. cuda/cccl/headers/include/thrust/system/error_code.h +508 -0
  1741. cuda/cccl/headers/include/thrust/system/omp/detail/adjacent_difference.h +34 -0
  1742. cuda/cccl/headers/include/thrust/system/omp/detail/assign_value.h +17 -0
  1743. cuda/cccl/headers/include/thrust/system/omp/detail/binary_search.h +57 -0
  1744. cuda/cccl/headers/include/thrust/system/omp/detail/copy.h +61 -0
  1745. cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.h +39 -0
  1746. cuda/cccl/headers/include/thrust/system/omp/detail/count.h +17 -0
  1747. cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.h +48 -0
  1748. cuda/cccl/headers/include/thrust/system/omp/detail/equal.h +17 -0
  1749. cuda/cccl/headers/include/thrust/system/omp/detail/execution_policy.h +132 -0
  1750. cuda/cccl/headers/include/thrust/system/omp/detail/extrema.h +46 -0
  1751. cuda/cccl/headers/include/thrust/system/omp/detail/fill.h +17 -0
  1752. cuda/cccl/headers/include/thrust/system/omp/detail/find.h +33 -0
  1753. cuda/cccl/headers/include/thrust/system/omp/detail/for_each.h +73 -0
  1754. cuda/cccl/headers/include/thrust/system/omp/detail/gather.h +17 -0
  1755. cuda/cccl/headers/include/thrust/system/omp/detail/generate.h +17 -0
  1756. cuda/cccl/headers/include/thrust/system/omp/detail/get_value.h +17 -0
  1757. cuda/cccl/headers/include/thrust/system/omp/detail/inner_product.h +17 -0
  1758. cuda/cccl/headers/include/thrust/system/omp/detail/iter_swap.h +17 -0
  1759. cuda/cccl/headers/include/thrust/system/omp/detail/logical.h +17 -0
  1760. cuda/cccl/headers/include/thrust/system/omp/detail/malloc_and_free.h +17 -0
  1761. cuda/cccl/headers/include/thrust/system/omp/detail/merge.h +17 -0
  1762. cuda/cccl/headers/include/thrust/system/omp/detail/mismatch.h +17 -0
  1763. cuda/cccl/headers/include/thrust/system/omp/detail/partition.h +83 -0
  1764. cuda/cccl/headers/include/thrust/system/omp/detail/per_device_resource.h +16 -0
  1765. cuda/cccl/headers/include/thrust/system/omp/detail/pragma_omp.h +30 -0
  1766. cuda/cccl/headers/include/thrust/system/omp/detail/reduce.h +62 -0
  1767. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.h +49 -0
  1768. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.h +87 -0
  1769. cuda/cccl/headers/include/thrust/system/omp/detail/remove.h +67 -0
  1770. cuda/cccl/headers/include/thrust/system/omp/detail/replace.h +17 -0
  1771. cuda/cccl/headers/include/thrust/system/omp/detail/reverse.h +17 -0
  1772. cuda/cccl/headers/include/thrust/system/omp/detail/scan.h +189 -0
  1773. cuda/cccl/headers/include/thrust/system/omp/detail/scan_by_key.h +23 -0
  1774. cuda/cccl/headers/include/thrust/system/omp/detail/scatter.h +17 -0
  1775. cuda/cccl/headers/include/thrust/system/omp/detail/sequence.h +17 -0
  1776. cuda/cccl/headers/include/thrust/system/omp/detail/set_operations.h +17 -0
  1777. cuda/cccl/headers/include/thrust/system/omp/detail/sort.h +245 -0
  1778. cuda/cccl/headers/include/thrust/system/omp/detail/swap_ranges.h +17 -0
  1779. cuda/cccl/headers/include/thrust/system/omp/detail/tabulate.h +17 -0
  1780. cuda/cccl/headers/include/thrust/system/omp/detail/temporary_buffer.h +16 -0
  1781. cuda/cccl/headers/include/thrust/system/omp/detail/transform.h +17 -0
  1782. cuda/cccl/headers/include/thrust/system/omp/detail/transform_reduce.h +17 -0
  1783. cuda/cccl/headers/include/thrust/system/omp/detail/transform_scan.h +17 -0
  1784. cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_copy.h +17 -0
  1785. cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_fill.h +17 -0
  1786. cuda/cccl/headers/include/thrust/system/omp/detail/unique.h +51 -0
  1787. cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.h +55 -0
  1788. cuda/cccl/headers/include/thrust/system/omp/execution_policy.h +62 -0
  1789. cuda/cccl/headers/include/thrust/system/omp/memory.h +153 -0
  1790. cuda/cccl/headers/include/thrust/system/omp/memory_resource.h +71 -0
  1791. cuda/cccl/headers/include/thrust/system/omp/pointer.h +120 -0
  1792. cuda/cccl/headers/include/thrust/system/omp/vector.h +96 -0
  1793. cuda/cccl/headers/include/thrust/system/system_error.h +183 -0
  1794. cuda/cccl/headers/include/thrust/system/tbb/detail/adjacent_difference.h +33 -0
  1795. cuda/cccl/headers/include/thrust/system/tbb/detail/assign_value.h +17 -0
  1796. cuda/cccl/headers/include/thrust/system/tbb/detail/binary_search.h +17 -0
  1797. cuda/cccl/headers/include/thrust/system/tbb/detail/copy.h +59 -0
  1798. cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.h +114 -0
  1799. cuda/cccl/headers/include/thrust/system/tbb/detail/count.h +17 -0
  1800. cuda/cccl/headers/include/thrust/system/tbb/detail/equal.h +17 -0
  1801. cuda/cccl/headers/include/thrust/system/tbb/detail/execution_policy.h +113 -0
  1802. cuda/cccl/headers/include/thrust/system/tbb/detail/extrema.h +45 -0
  1803. cuda/cccl/headers/include/thrust/system/tbb/detail/fill.h +17 -0
  1804. cuda/cccl/headers/include/thrust/system/tbb/detail/find.h +28 -0
  1805. cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.h +70 -0
  1806. cuda/cccl/headers/include/thrust/system/tbb/detail/gather.h +17 -0
  1807. cuda/cccl/headers/include/thrust/system/tbb/detail/generate.h +17 -0
  1808. cuda/cccl/headers/include/thrust/system/tbb/detail/get_value.h +17 -0
  1809. cuda/cccl/headers/include/thrust/system/tbb/detail/inner_product.h +17 -0
  1810. cuda/cccl/headers/include/thrust/system/tbb/detail/iter_swap.h +17 -0
  1811. cuda/cccl/headers/include/thrust/system/tbb/detail/logical.h +17 -0
  1812. cuda/cccl/headers/include/thrust/system/tbb/detail/malloc_and_free.h +17 -0
  1813. cuda/cccl/headers/include/thrust/system/tbb/detail/merge.h +302 -0
  1814. cuda/cccl/headers/include/thrust/system/tbb/detail/mismatch.h +17 -0
  1815. cuda/cccl/headers/include/thrust/system/tbb/detail/partition.h +78 -0
  1816. cuda/cccl/headers/include/thrust/system/tbb/detail/per_device_resource.h +16 -0
  1817. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.h +120 -0
  1818. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.h +378 -0
  1819. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_intervals.h +118 -0
  1820. cuda/cccl/headers/include/thrust/system/tbb/detail/remove.h +66 -0
  1821. cuda/cccl/headers/include/thrust/system/tbb/detail/replace.h +17 -0
  1822. cuda/cccl/headers/include/thrust/system/tbb/detail/reverse.h +17 -0
  1823. cuda/cccl/headers/include/thrust/system/tbb/detail/scan.h +294 -0
  1824. cuda/cccl/headers/include/thrust/system/tbb/detail/scan_by_key.h +20 -0
  1825. cuda/cccl/headers/include/thrust/system/tbb/detail/scatter.h +17 -0
  1826. cuda/cccl/headers/include/thrust/system/tbb/detail/sequence.h +17 -0
  1827. cuda/cccl/headers/include/thrust/system/tbb/detail/set_operations.h +17 -0
  1828. cuda/cccl/headers/include/thrust/system/tbb/detail/sort.h +272 -0
  1829. cuda/cccl/headers/include/thrust/system/tbb/detail/swap_ranges.h +17 -0
  1830. cuda/cccl/headers/include/thrust/system/tbb/detail/tabulate.h +17 -0
  1831. cuda/cccl/headers/include/thrust/system/tbb/detail/temporary_buffer.h +16 -0
  1832. cuda/cccl/headers/include/thrust/system/tbb/detail/transform.h +17 -0
  1833. cuda/cccl/headers/include/thrust/system/tbb/detail/transform_reduce.h +17 -0
  1834. cuda/cccl/headers/include/thrust/system/tbb/detail/transform_scan.h +17 -0
  1835. cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_copy.h +17 -0
  1836. cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_fill.h +17 -0
  1837. cuda/cccl/headers/include/thrust/system/tbb/detail/unique.h +50 -0
  1838. cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.h +54 -0
  1839. cuda/cccl/headers/include/thrust/system/tbb/execution_policy.h +62 -0
  1840. cuda/cccl/headers/include/thrust/system/tbb/memory.h +139 -0
  1841. cuda/cccl/headers/include/thrust/system/tbb/memory_resource.h +57 -0
  1842. cuda/cccl/headers/include/thrust/system/tbb/pointer.h +106 -0
  1843. cuda/cccl/headers/include/thrust/system/tbb/vector.h +82 -0
  1844. cuda/cccl/headers/include/thrust/system_error.h +57 -0
  1845. cuda/cccl/headers/include/thrust/tabulate.h +125 -0
  1846. cuda/cccl/headers/include/thrust/transform.h +1056 -0
  1847. cuda/cccl/headers/include/thrust/transform_reduce.h +190 -0
  1848. cuda/cccl/headers/include/thrust/transform_scan.h +442 -0
  1849. cuda/cccl/headers/include/thrust/tuple.h +139 -0
  1850. cuda/cccl/headers/include/thrust/type_traits/integer_sequence.h +257 -0
  1851. cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +153 -0
  1852. cuda/cccl/headers/include/thrust/type_traits/is_execution_policy.h +65 -0
  1853. cuda/cccl/headers/include/thrust/type_traits/is_operator_less_or_greater_function_object.h +180 -0
  1854. cuda/cccl/headers/include/thrust/type_traits/is_operator_plus_function_object.h +112 -0
  1855. cuda/cccl/headers/include/thrust/type_traits/is_trivially_relocatable.h +332 -0
  1856. cuda/cccl/headers/include/thrust/type_traits/logical_metafunctions.h +42 -0
  1857. cuda/cccl/headers/include/thrust/type_traits/unwrap_contiguous_iterator.h +63 -0
  1858. cuda/cccl/headers/include/thrust/uninitialized_copy.h +300 -0
  1859. cuda/cccl/headers/include/thrust/uninitialized_fill.h +268 -0
  1860. cuda/cccl/headers/include/thrust/unique.h +1088 -0
  1861. cuda/cccl/headers/include/thrust/universal_allocator.h +101 -0
  1862. cuda/cccl/headers/include/thrust/universal_ptr.h +34 -0
  1863. cuda/cccl/headers/include/thrust/universal_vector.h +80 -0
  1864. cuda/cccl/headers/include/thrust/version.h +93 -0
  1865. cuda/cccl/headers/include/thrust/zip_function.h +150 -0
  1866. cuda/cccl/headers/include_paths.py +51 -0
  1867. cuda/cccl/parallel/__init__.py +9 -0
  1868. cuda/cccl/parallel/experimental/__init__.py +24 -0
  1869. cuda/cccl/py.typed +0 -0
  1870. cuda/compute/__init__.py +83 -0
  1871. cuda/compute/_bindings.py +79 -0
  1872. cuda/compute/_bindings.pyi +498 -0
  1873. cuda/compute/_bindings_impl.pyx +2415 -0
  1874. cuda/compute/_caching.py +71 -0
  1875. cuda/compute/_cccl_interop.py +422 -0
  1876. cuda/compute/_utils/__init__.py +0 -0
  1877. cuda/compute/_utils/protocols.py +132 -0
  1878. cuda/compute/_utils/temp_storage_buffer.py +86 -0
  1879. cuda/compute/algorithms/__init__.py +58 -0
  1880. cuda/compute/algorithms/_histogram.py +243 -0
  1881. cuda/compute/algorithms/_reduce.py +182 -0
  1882. cuda/compute/algorithms/_scan.py +331 -0
  1883. cuda/compute/algorithms/_segmented_reduce.py +257 -0
  1884. cuda/compute/algorithms/_sort/__init__.py +23 -0
  1885. cuda/compute/algorithms/_sort/_merge_sort.py +225 -0
  1886. cuda/compute/algorithms/_sort/_radix_sort.py +263 -0
  1887. cuda/compute/algorithms/_sort/_segmented_sort.py +288 -0
  1888. cuda/compute/algorithms/_sort/_sort_common.py +52 -0
  1889. cuda/compute/algorithms/_three_way_partition.py +261 -0
  1890. cuda/compute/algorithms/_transform.py +329 -0
  1891. cuda/compute/algorithms/_unique_by_key.py +252 -0
  1892. cuda/compute/cccl/.gitkeep +0 -0
  1893. cuda/compute/cu12/_bindings_impl.cpython-311-aarch64-linux-gnu.so +0 -0
  1894. cuda/compute/cu12/cccl/libcccl.c.parallel.so +0 -0
  1895. cuda/compute/cu13/_bindings_impl.cpython-311-aarch64-linux-gnu.so +0 -0
  1896. cuda/compute/cu13/cccl/libcccl.c.parallel.so +0 -0
  1897. cuda/compute/iterators/__init__.py +21 -0
  1898. cuda/compute/iterators/_factories.py +219 -0
  1899. cuda/compute/iterators/_iterators.py +817 -0
  1900. cuda/compute/iterators/_zip_iterator.py +199 -0
  1901. cuda/compute/numba_utils.py +53 -0
  1902. cuda/compute/op.py +3 -0
  1903. cuda/compute/struct.py +272 -0
  1904. cuda/compute/typing.py +37 -0
  1905. cuda/coop/__init__.py +8 -0
  1906. cuda/coop/_caching.py +48 -0
  1907. cuda/coop/_common.py +275 -0
  1908. cuda/coop/_nvrtc.py +92 -0
  1909. cuda/coop/_scan_op.py +181 -0
  1910. cuda/coop/_types.py +937 -0
  1911. cuda/coop/_typing.py +107 -0
  1912. cuda/coop/block/__init__.py +39 -0
  1913. cuda/coop/block/_block_exchange.py +251 -0
  1914. cuda/coop/block/_block_load_store.py +215 -0
  1915. cuda/coop/block/_block_merge_sort.py +125 -0
  1916. cuda/coop/block/_block_radix_sort.py +214 -0
  1917. cuda/coop/block/_block_reduce.py +294 -0
  1918. cuda/coop/block/_block_scan.py +983 -0
  1919. cuda/coop/warp/__init__.py +9 -0
  1920. cuda/coop/warp/_warp_merge_sort.py +92 -0
  1921. cuda/coop/warp/_warp_reduce.py +153 -0
  1922. cuda/coop/warp/_warp_scan.py +78 -0
  1923. cuda_cccl-0.3.4.dist-info/METADATA +78 -0
  1924. cuda_cccl-0.3.4.dist-info/RECORD +1926 -0
  1925. cuda_cccl-0.3.4.dist-info/WHEEL +5 -0
  1926. cuda_cccl-0.3.4.dist-info/licenses/LICENSE +1 -0
@@ -0,0 +1,3413 @@
1
+ // SPDX-FileCopyrightText: Copyright (c) 2011, Duane Merrill. All rights reserved.
2
+ // SPDX-FileCopyrightText: Copyright (c) 2011-2025, NVIDIA CORPORATION. All rights reserved.
3
+ // SPDX-License-Identifier: BSD-3
4
+
5
+ //! @file
6
+ //! cub::DeviceRadixSort provides device-wide, parallel operations for computing a radix sort across a sequence of data
7
+ //! items residing within device-accessible memory.
8
+
9
+ #pragma once
10
+
11
+ #include <cub/config.cuh>
12
+
13
+ #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
14
+ # pragma GCC system_header
15
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
16
+ # pragma clang system_header
17
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
18
+ # pragma system_header
19
+ #endif // no system header
20
+
21
+ #include <cub/detail/choose_offset.cuh>
22
+ #include <cub/device/dispatch/dispatch_radix_sort.cuh>
23
+
24
+ #include <cuda/std/__type_traits/enable_if.h>
25
+ #include <cuda/std/__type_traits/integral_constant.h>
26
+ #include <cuda/std/__type_traits/is_convertible.h>
27
+
28
+ CUB_NAMESPACE_BEGIN
29
+
30
+ //! @rst
31
+ //! DeviceRadixSort provides device-wide, parallel operations for
32
+ //! computing a radix sort across a sequence of data items residing
33
+ //! within device-accessible memory.
34
+ //!
35
+ //! .. image:: ../../img/sorting_logo.png
36
+ //! :align: center
37
+ //!
38
+ //! Overview
39
+ //! --------------------------------------------------
40
+ //!
41
+ //! The `radix sorting method <http://en.wikipedia.org/wiki/Radix_sort>`_
42
+ //! arranges items into ascending (or descending) order. The algorithm relies
43
+ //! upon a positional representation for keys, i.e., each key is comprised of an
44
+ //! ordered sequence of symbols (e.g., digits, characters, etc.) specified from
45
+ //! least-significant to most-significant. For a given input sequence of keys
46
+ //! and a set of rules specifying a total ordering of the symbolic alphabet, the
47
+ //! radix sorting method produces a lexicographic ordering of those keys.
48
+ //!
49
+ //! @rowmajor
50
+ //!
51
+ //! Supported Types
52
+ //! --------------------------------------------------
53
+ //!
54
+ //! DeviceRadixSort can sort all of the built-in C++ numeric primitive types
55
+ //! (``unsigned char``, ``int``, ``double``, etc.) as well as CUDA's ``__half``
56
+ //! and ``__nv_bfloat16`` 16-bit floating-point types. User-defined types are
57
+ //! supported as long as a decomposer object is provided.
58
+ //!
59
+ //! Floating-Point Special Cases
60
+ //! --------------------------------------------------
61
+ //!
62
+ //! - Positive and negative zeros are considered equivalent, and will be treated
63
+ //! as such in the output.
64
+ //! - No special handling is implemented for NaN values; these are sorted
65
+ //! according to their bit representations after any transformations.
66
+ //!
67
+ //! Transformations
68
+ //! --------------------------------------------------
69
+ //!
70
+ //! Although the direct radix sorting method can only be applied to unsigned
71
+ //! integral types, DeviceRadixSort is able to sort signed and floating-point
72
+ //! types via simple bit-wise transformations that ensure lexicographic key
73
+ //! ordering. Additional transformations occur for descending sorts. These
74
+ //! transformations must be considered when restricting the
75
+ //! ``[begin_bit, end_bit)`` range, as the bitwise transformations will occur
76
+ //! before the bit-range truncation.
77
+ //!
78
+ //! Any transformations applied to the keys prior to sorting are reversed
79
+ //! while writing to the final output buffer.
80
+ //!
81
+ //! Type Specific Bitwise Transformations
82
+ //! --------------------------------------------------
83
+ //!
84
+ //! To convert the input values into a radix-sortable bitwise representation,
85
+ //! the following transformations take place prior to sorting:
86
+ //!
87
+ //! - For unsigned integral values, the keys are used directly.
88
+ //! - For signed integral values, the sign bit is inverted.
89
+ //! - For positive floating point values, the sign bit is inverted.
90
+ //! - For negative floating point values, the full key is inverted.
91
+ //!
92
+ //! For floating point types, positive and negative zero are a special case and
93
+ //! will be considered equivalent during sorting.
94
+ //!
95
+ //! Descending Sort Bitwise Transformations
96
+ //! --------------------------------------------------
97
+ //!
98
+ //! If descending sort is used, the keys are inverted after performing any
99
+ //! type-specific transformations, and the resulting keys are sorted in ascending
100
+ //! order.
101
+ //!
102
+ //! Stability
103
+ //! --------------------------------------------------
104
+ //!
105
+ //! DeviceRadixSort is stable. For floating-point types, ``-0.0`` and ``+0.0`` are
106
+ //! considered equal and appear in the result in the same order as they appear in
107
+ //! the input.
108
+ //!
109
+ //! Usage Considerations
110
+ //! --------------------------------------------------
111
+ //!
112
+ //! @cdp_class{DeviceRadixSort}
113
+ //!
114
+ //! Performance
115
+ //! --------------------------------------------------
116
+ //!
117
+ //! @linear_performance{radix sort}
118
+ //!
119
+ //! @endrst
120
+ struct DeviceRadixSort
121
+ {
122
+ private:
123
+ template <SortOrder Order, typename KeyT, typename ValueT, typename NumItemsT, typename DecomposerT>
124
+ CUB_RUNTIME_FUNCTION static cudaError_t custom_radix_sort( // declaration only: no body for this overload is visible in this part of the file
125
+ ::cuda::std::false_type, // tag type that distinguishes this overload from the true_type one defined next
126
+ void* d_temp_storage,
127
+ size_t& temp_storage_bytes,
128
+ bool is_overwrite_okay,
129
+ DoubleBuffer<KeyT>& d_keys,
130
+ DoubleBuffer<ValueT>& d_values,
131
+ NumItemsT num_items,
132
+ DecomposerT decomposer,
133
+ int begin_bit,
134
+ int end_bit,
135
+ cudaStream_t stream);
136
+
137
+ template <SortOrder Order, typename KeyT, typename ValueT, typename OffsetT, typename DecomposerT>
138
+ CUB_RUNTIME_FUNCTION static cudaError_t custom_radix_sort( // true_type overload: forwards all of its arguments to DispatchRadixSort<...>::Dispatch and returns its cudaError_t
139
+ ::cuda::std::true_type, // unnamed tag parameter; used only for overload selection
140
+ void* d_temp_storage,
141
+ size_t& temp_storage_bytes,
142
+ bool is_overwrite_okay,
143
+ DoubleBuffer<KeyT>& d_keys,
144
+ DoubleBuffer<ValueT>& d_values,
145
+ OffsetT num_items,
146
+ DecomposerT decomposer,
147
+ int begin_bit,
148
+ int end_bit,
149
+ cudaStream_t stream)
150
+ {
151
+ return DispatchRadixSort<Order, KeyT, ValueT, OffsetT, DecomposerT>::Dispatch( // argument order differs from this function's parameter order: the bit range precedes is_overwrite_okay and decomposer goes last
152
+ d_temp_storage,
153
+ temp_storage_bytes,
154
+ d_keys,
155
+ d_values,
156
+ static_cast<OffsetT>(num_items), // num_items is already declared as OffsetT in this overload
157
+ begin_bit,
158
+ end_bit,
159
+ is_overwrite_okay,
160
+ stream,
161
+ decomposer);
162
+ }
163
+
164
+ template <SortOrder Order, typename KeyT, typename ValueT, typename NumItemsT, typename DecomposerT>
165
+ CUB_RUNTIME_FUNCTION static cudaError_t custom_radix_sort( // declaration only: no body for this overload is visible in this part of the file
166
+ ::cuda::std::false_type, // tag type that distinguishes this overload from the true_type one defined next
167
+ void* d_temp_storage,
168
+ size_t& temp_storage_bytes,
169
+ bool is_overwrite_okay,
170
+ DoubleBuffer<KeyT>& d_keys,
171
+ DoubleBuffer<ValueT>& d_values,
172
+ NumItemsT num_items,
173
+ DecomposerT decomposer,
174
+ cudaStream_t stream);
175
+
176
+ template <SortOrder Order, typename KeyT, typename ValueT, typename OffsetT, typename DecomposerT>
177
+ CUB_RUNTIME_FUNCTION static cudaError_t custom_radix_sort( // true_type overload without a caller-supplied bit range: computes one and delegates to the bit-range overload
178
+ ::cuda::std::true_type, // unnamed tag parameter; used only for overload selection
179
+ void* d_temp_storage,
180
+ size_t& temp_storage_bytes,
181
+ bool is_overwrite_okay,
182
+ DoubleBuffer<KeyT>& d_keys,
183
+ DoubleBuffer<ValueT>& d_values,
184
+ OffsetT num_items,
185
+ DecomposerT decomposer,
186
+ cudaStream_t stream)
187
+ {
188
+ constexpr int begin_bit = 0; // the bit range always starts at bit 0 here
189
+ const int end_bit = detail::radix::traits_t<KeyT>::default_end_bit(decomposer); // upper bound is obtained from the key traits and the decomposer; default_end_bit itself is not shown in this part of the file
190
+
191
+ return DeviceRadixSort::custom_radix_sort<Order>( // only Order is given explicitly; the remaining template arguments are deduced from the call
192
+ ::cuda::std::true_type{}, // select the true_type form of the bit-range overload
193
+ d_temp_storage,
194
+ temp_storage_bytes,
195
+ is_overwrite_okay,
196
+ d_keys,
197
+ d_values,
198
+ num_items,
199
+ decomposer,
200
+ begin_bit,
201
+ end_bit,
202
+ stream);
203
+ }
204
+
205
+ // Name reported for NVTX ranges
206
+ _CCCL_HOST_DEVICE static constexpr auto GetName() -> const char* // returns a fixed string literal; takes no arguments and reads no state
207
+ {
208
+ return "cub::DeviceRadixSort"; // SortPairs passes this to _CCCL_NVTX_RANGE_SCOPE_IF as the range name
209
+ }
210
+
211
+ public:
212
+ //! @name Key-value pairs
213
+ //! @{
214
+
215
+ //! @rst
216
+ //! Sorts key-value pairs into ascending order using :math:`\approx 2N` auxiliary storage.
217
+ //!
218
+ //! - The contents of the input data are not altered by the sorting operation.
219
+ //! - Pointers to contiguous memory must be used; iterators are not currently
220
+ //! supported.
221
+ //! - In-place operations are not supported. There must be no overlap between
222
+ //! any of the provided ranges:
223
+ //!
224
+ //! - ``[d_keys_in, d_keys_in + num_items)``
225
+ //! - ``[d_keys_out, d_keys_out + num_items)``
226
+ //! - ``[d_values_in, d_values_in + num_items)``
227
+ //! - ``[d_values_out, d_values_out + num_items)``
228
+ //! - An optional bit subrange ``[begin_bit, end_bit)`` of differentiating key
229
+ //! bits can be specified. This can reduce overall sorting overhead and
230
+ //! yield a corresponding performance improvement.
231
+ //! - @devicestorageNP For sorting using only ``O(P)`` temporary storage, see
232
+ //! the sorting interface using DoubleBuffer wrappers below.
233
+ //! - @devicestorage
234
+ //!
235
+ //! Snippet
236
+ //! --------------------------------------------------
237
+ //!
238
+ //! The code snippet below illustrates the sorting of a device vector of ``int``
239
+ //! keys with associated vector of ``int`` values.
240
+ //! @endrst
241
+ //!
242
+ //! @code{.cpp}
243
+ //! #include <cub/cub.cuh>
244
+ //! // or equivalently <cub/device/device_radix_sort.cuh>
245
+ //!
246
+ //! // Declare, allocate, and initialize device-accessible pointers
247
+ //! // for sorting data
248
+ //! int num_items; // e.g., 7
249
+ //! int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
250
+ //! int *d_keys_out; // e.g., [ ... ]
251
+ //! int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6]
252
+ //! int *d_values_out; // e.g., [ ... ]
253
+ //! ...
254
+ //!
255
+ //! // Determine temporary device storage requirements
256
+ //! void *d_temp_storage = nullptr;
257
+ //! size_t temp_storage_bytes = 0;
258
+ //! cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes,
259
+ //! d_keys_in, d_keys_out, d_values_in, d_values_out, num_items);
260
+ //!
261
+ //! // Allocate temporary storage
262
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
263
+ //!
264
+ //! // Run sorting operation
265
+ //! cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes,
266
+ //! d_keys_in, d_keys_out, d_values_in, d_values_out, num_items);
267
+ //!
268
+ //! // d_keys_out <-- [0, 3, 5, 6, 7, 8, 9]
269
+ //! // d_values_out <-- [5, 4, 3, 1, 2, 0, 6]
270
+ //! @endcode
271
+ //!
272
+ //! @tparam KeyT
273
+ //! **[inferred]** Key type
274
+ //!
275
+ //! @tparam ValueT
276
+ //! **[inferred]** Value type
277
+ //!
278
+ //! @tparam NumItemsT
279
+ //! **[inferred]** Type of num_items
280
+ //!
281
+ //! @param[in] d_temp_storage
282
+ //! Device-accessible allocation of temporary storage. When ``nullptr``, the
283
+ //! required allocation size is written to ``temp_storage_bytes`` and no work
284
+ //! is done.
285
+ //!
286
+ //! @param[in,out] temp_storage_bytes
287
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
288
+ //!
289
+ //! @param[in] d_keys_in
290
+ //! Pointer to the input sequence of key data to sort
291
+ //!
292
+ //! @param[out] d_keys_out
293
+ //! Pointer to the sorted output sequence of key data
294
+ //!
295
+ //! @param[in] d_values_in
296
+ //! Pointer to the corresponding input sequence of associated value items
297
+ //!
298
+ //! @param[out] d_values_out
299
+ //! Pointer to the correspondingly-reordered output sequence of associated
300
+ //! value items
301
+ //!
302
+ //! @param[in] num_items
303
+ //! Number of items to sort
304
+ //!
305
+ //! @param[in] begin_bit
306
+ //! **[optional]** The least-significant bit index (inclusive) needed for
307
+ //! key comparison
308
+ //!
309
+ //! @param[in] end_bit
310
+ //! **[optional]** The most-significant bit index (exclusive) needed for key
311
+ //! comparison (e.g., ``sizeof(unsigned int) * 8``)
312
+ //!
313
+ //! @param[in] stream
314
+ //! **[optional]** CUDA stream to launch kernels within.
315
+ //! Default is stream<sub>0</sub>.
316
+ template <typename KeyT, typename ValueT, typename NumItemsT>
317
+ CUB_RUNTIME_FUNCTION static cudaError_t SortPairs(
318
+ void* d_temp_storage,
319
+ size_t& temp_storage_bytes,
320
+ const KeyT* d_keys_in,
321
+ KeyT* d_keys_out,
322
+ const ValueT* d_values_in,
323
+ ValueT* d_values_out,
324
+ NumItemsT num_items,
325
+ int begin_bit = 0,
326
+ int end_bit = sizeof(KeyT) * 8, // default spans every bit of KeyT
327
+ cudaStream_t stream = 0)
328
+ {
329
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); // NOTE(review): the range appears to be conditional on d_temp_storage; confirm against the macro definition
330
+ // Unsigned integer type for global offsets.
331
+ using OffsetT = detail::choose_offset_t<NumItemsT>;
332
+
333
+ // TODO API that doesn't accept decomposer should also contain a static
334
+ // assert that the key type is fundamental.
335
+
336
+ // We cast away const-ness, but will *not* write to these arrays.
337
+ // ``DispatchRadixSort::Dispatch`` will allocate temporary storage and
338
+ // create a new double-buffer internally when the ``is_overwrite_okay`` flag
339
+ // is not set.
340
+ constexpr bool is_overwrite_okay = false;
341
+ DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out); // wraps the input and output key pointers in one DoubleBuffer
342
+ DoubleBuffer<ValueT> d_values(const_cast<ValueT*>(d_values_in), d_values_out); // same pairing for the value pointers
343
+
344
+ return DispatchRadixSort<SortOrder::Ascending, KeyT, ValueT, OffsetT>::Dispatch( // ascending order is fixed here; no decomposer type or object is passed by this overload
345
+ d_temp_storage,
346
+ temp_storage_bytes,
347
+ d_keys,
348
+ d_values,
349
+ static_cast<OffsetT>(num_items), // convert the caller's NumItemsT to the offset type selected above
350
+ begin_bit,
351
+ end_bit,
352
+ is_overwrite_okay,
353
+ stream);
354
+ }
355
+
356
+ //! @rst
357
+ //! Sorts key-value pairs into ascending order using :math:`\approx 2N` auxiliary storage.
358
+ //!
359
+ //! * The contents of the input data are not altered by the sorting operation.
360
+ //! * Pointers to contiguous memory must be used; iterators are not currently
361
+ //! supported.
362
+ //! * In-place operations are not supported. There must be no overlap between
363
+ //! any of the provided ranges:
364
+ //!
365
+ //! * ``[d_keys_in, d_keys_in + num_items)``
366
+ //! * ``[d_keys_out, d_keys_out + num_items)``
367
+ //! * ``[d_values_in, d_values_in + num_items)``
368
+ //! * ``[d_values_out, d_values_out + num_items)``
369
+ //!
370
+ //! * A bit subrange ``[begin_bit, end_bit)`` is provided to specify
371
+ //! differentiating key bits. This can reduce overall sorting overhead and
372
+ //! yield a corresponding performance improvement.
373
+ //! * @devicestorageNP For sorting using only :math:`O(P)` temporary storage, see
374
+ //! the sorting interface using DoubleBuffer wrappers below.
375
+ //! * @devicestorage
376
+ //!
377
+ //! Snippet
378
+ //! --------------------------------------------------
379
+ //!
380
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
381
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
382
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
383
+ //! tuple of references to relevant members of the key.
384
+ //!
385
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
386
+ //! :language: c++
387
+ //! :dedent:
388
+ //! :start-after: example-begin custom-type
389
+ //! :end-before: example-end custom-type
390
+ //!
391
+ //! The following snippet shows how to sort an array of ``custom_t`` objects
392
+ //! using ``cub::DeviceRadixSort::SortPairs``:
393
+ //!
394
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
395
+ //! :language: c++
396
+ //! :dedent:
397
+ //! :start-after: example-begin pairs-bits
398
+ //! :end-before: example-end pairs-bits
399
+ //!
400
+ //! @endrst
401
+ //!
402
+ //! @tparam KeyT
403
+ //! **[inferred]** KeyT type
404
+ //!
405
+ //! @tparam ValueT
406
+ //! **[inferred]** ValueT type
407
+ //!
408
+ //! @tparam NumItemsT
409
+ //! **[inferred]** Type of num_items
410
+ //!
411
+ //! @tparam DecomposerT
412
+ //! **[inferred]** Type of a callable object responsible for decomposing a
413
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
414
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
415
+ //! The leftmost element of the tuple is considered the most significant.
416
+ //! The call operator must not modify members of the key.
417
+ //!
418
+ //! @param[in] d_temp_storage
419
+ //! Device-accessible allocation of temporary storage. When ``nullptr``, the
420
+ //! required allocation size is written to ``temp_storage_bytes`` and no work
421
+ //! is done.
422
+ //!
423
+ //! @param[in,out] temp_storage_bytes
424
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
425
+ //!
426
+ //! @param[in] d_keys_in
427
+ //! Pointer to the input sequence of key data to sort
428
+ //!
429
+ //! @param[out] d_keys_out
430
+ //! Pointer to the sorted output sequence of key data
431
+ //!
432
+ //! @param[in] d_values_in
433
+ //! Pointer to the corresponding input sequence of associated value items
434
+ //!
435
+ //! @param[out] d_values_out
436
+ //! Pointer to the correspondingly-reordered output sequence of associated
437
+ //! value items
438
+ //!
439
+ //! @param[in] num_items
440
+ //! Number of items to sort
441
+ //!
442
+ //! @param decomposer
443
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
444
+ //! references to its constituent arithmetic types. The leftmost element of
445
+ //! the tuple is considered the most significant. The call operator must not
446
+ //! modify members of the key.
447
+ //!
448
+ //! @param[in] begin_bit
449
+ //! The least-significant bit index (inclusive) needed for
450
+ //! key comparison
451
+ //!
452
+ //! @param[in] end_bit
453
+ //! The most-significant bit index (exclusive) needed for key
454
+ //! comparison (e.g., ``(sizeof(float) + sizeof(long long int)) * 8``)
455
+ //!
456
+ //! @param[in] stream
457
+ //! **[optional]** CUDA stream to launch kernels within.
458
+ //! Default is stream<sub>0</sub>.
459
+ template <typename KeyT, typename ValueT, typename NumItemsT, typename DecomposerT>
460
+ CUB_RUNTIME_FUNCTION static //
461
+ ::cuda::std::enable_if_t< //
462
+ !::cuda::std::is_convertible_v<DecomposerT, int>, //
463
+ cudaError_t>
464
+ SortPairs(void* d_temp_storage,
465
+ size_t& temp_storage_bytes,
466
+ const KeyT* d_keys_in,
467
+ KeyT* d_keys_out,
468
+ const ValueT* d_values_in,
469
+ ValueT* d_values_out,
470
+ NumItemsT num_items,
471
+ DecomposerT decomposer,
472
+ int begin_bit,
473
+ int end_bit,
474
+ cudaStream_t stream = 0)
475
+ {
476
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());
477
+ // unsigned integer type for global offsets
478
+ using offset_t = detail::choose_offset_t<NumItemsT>;
479
+ using decomposer_check_t = detail::radix::decomposer_check_t<KeyT, DecomposerT>;
480
+
481
+ static_assert(decomposer_check_t::value,
482
+ "DecomposerT must be a callable object returning a tuple of references to "
483
+ "arithmetic types");
484
+
485
+ // We cast away const-ness, but will *not* write to these arrays.
486
+ // ``DispatchRadixSort::Dispatch`` will allocate temporary storage and
487
+ // create a new double-buffer internally when the ``is_overwrite_ok`` flag
488
+ // is not set.
489
+ constexpr bool is_overwrite_okay = false;
490
+ DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
491
+ DoubleBuffer<ValueT> d_values(const_cast<ValueT*>(d_values_in), d_values_out);
492
+
493
+ return DeviceRadixSort::custom_radix_sort<SortOrder::Ascending>(
494
+ decomposer_check_t{},
495
+ d_temp_storage,
496
+ temp_storage_bytes,
497
+ is_overwrite_okay,
498
+ d_keys,
499
+ d_values,
500
+ static_cast<offset_t>(num_items),
501
+ decomposer,
502
+ begin_bit,
503
+ end_bit,
504
+ stream);
505
+ }
506
+
507
+ //! @rst
508
+ //! Sorts key-value pairs into ascending order using :math:`\approx 2N` auxiliary storage.
509
+ //!
510
+ //! * The contents of the input data are not altered by the sorting operation.
511
+ //! * Pointers to contiguous memory must be used; iterators are not currently
512
+ //! supported.
513
+ //! * In-place operations are not supported. There must be no overlap between
514
+ //! any of the provided ranges:
515
+ //!
516
+ //! * ``[d_keys_in, d_keys_in + num_items)``
517
+ //! * ``[d_keys_out, d_keys_out + num_items)``
518
+ //! * ``[d_values_in, d_values_in + num_items)``
519
+ //! * ``[d_values_out, d_values_out + num_items)``
520
+ //!
521
+ //! * @devicestorageNP For sorting using only :math:`O(P)` temporary storage, see
522
+ //! the sorting interface using DoubleBuffer wrappers below.
523
+ //! * @devicestorage
524
+ //!
525
+ //! Snippet
526
+ //! --------------------------------------------------
527
+ //!
528
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
529
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
530
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
531
+ //! tuple of references to relevant members of the key.
532
+ //!
533
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
534
+ //! :language: c++
535
+ //! :dedent:
536
+ //! :start-after: example-begin custom-type
537
+ //! :end-before: example-end custom-type
538
+ //!
539
+ //! The following snippet shows how to sort an array of ``custom_t`` objects
540
+ //! using ``cub::DeviceRadixSort::SortPairs``:
541
+ //!
542
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
543
+ //! :language: c++
544
+ //! :dedent:
545
+ //! :start-after: example-begin pairs
546
+ //! :end-before: example-end pairs
547
+ //!
548
+ //! @endrst
549
+ //!
550
+ //! @tparam KeyT
551
+ //! **[inferred]** KeyT type
552
+ //!
553
+ //! @tparam ValueT
554
+ //! **[inferred]** ValueT type
555
+ //!
556
+ //! @tparam NumItemsT
557
+ //! **[inferred]** Type of num_items
558
+ //!
559
+ //! @tparam DecomposerT
560
+ //! **[inferred]** Type of a callable object responsible for decomposing a
561
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
562
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
563
+ //! The leftmost element of the tuple is considered the most significant.
564
+ //! The call operator must not modify members of the key.
565
+ //!
566
+ //! @param[in] d_temp_storage
567
+ //! Device-accessible allocation of temporary storage. When ``nullptr``, the
568
+ //! required allocation size is written to ``temp_storage_bytes`` and no work
569
+ //! is done.
570
+ //!
571
+ //! @param[in,out] temp_storage_bytes
572
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
573
+ //!
574
+ //! @param[in] d_keys_in
575
+ //! Pointer to the input sequence of key data to sort
576
+ //!
577
+ //! @param[out] d_keys_out
578
+ //! Pointer to the sorted output sequence of key data
579
+ //!
580
+ //! @param[in] d_values_in
581
+ //! Pointer to the corresponding input sequence of associated value items
582
+ //!
583
+ //! @param[out] d_values_out
584
+ //! Pointer to the correspondingly-reordered output sequence of associated
585
+ //! value items
586
+ //!
587
+ //! @param[in] num_items
588
+ //! Number of items to sort
589
+ //!
590
+ //! @param decomposer
591
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
592
+ //! references to its constituent arithmetic types. The leftmost element of
593
+ //! the tuple is considered the most significant. The call operator must not
594
+ //! modify members of the key.
595
+ //!
596
+ //! @param[in] stream
597
+ //! **[optional]** CUDA stream to launch kernels within.
598
+ //! Default is stream<sub>0</sub>.
599
+ template <typename KeyT, typename ValueT, typename NumItemsT, typename DecomposerT>
600
+ CUB_RUNTIME_FUNCTION static //
601
+ ::cuda::std::enable_if_t< //
602
+ !::cuda::std::is_convertible_v<DecomposerT, int>, //
603
+ cudaError_t>
604
+ SortPairs(void* d_temp_storage,
605
+ size_t& temp_storage_bytes,
606
+ const KeyT* d_keys_in,
607
+ KeyT* d_keys_out,
608
+ const ValueT* d_values_in,
609
+ ValueT* d_values_out,
610
+ NumItemsT num_items,
611
+ DecomposerT decomposer,
612
+ cudaStream_t stream = 0)
613
+ {
614
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());
615
+ // unsigned integer type for global offsets
616
+ using offset_t = detail::choose_offset_t<NumItemsT>;
617
+ using decomposer_check_t = detail::radix::decomposer_check_t<KeyT, DecomposerT>;
618
+
619
+ static_assert(decomposer_check_t::value,
620
+ "DecomposerT must be a callable object returning a tuple of references to "
621
+ "arithmetic types");
622
+
623
+ // We cast away const-ness, but will *not* write to these arrays.
624
+ // ``DispatchRadixSort::Dispatch`` will allocate temporary storage and
625
+ // create a new double-buffer internally when the ``is_overwrite_ok`` flag
626
+ // is not set.
627
+ constexpr bool is_overwrite_okay = false;
628
+ DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
629
+ DoubleBuffer<ValueT> d_values(const_cast<ValueT*>(d_values_in), d_values_out);
630
+
631
+ return DeviceRadixSort::custom_radix_sort<SortOrder::Ascending>(
632
+ decomposer_check_t{},
633
+ d_temp_storage,
634
+ temp_storage_bytes,
635
+ is_overwrite_okay,
636
+ d_keys,
637
+ d_values,
638
+ static_cast<offset_t>(num_items),
639
+ decomposer,
640
+ stream);
641
+ }
642
+
643
+ //! @rst
644
+ //! Sorts key-value pairs into ascending order using :math:`\approx N` auxiliary storage.
645
+ //!
646
+ //! - The sorting operation is given a pair of key buffers and a corresponding
647
+ //! pair of associated value buffers. Each pair is managed by a DoubleBuffer
648
+ //! structure that indicates which of the two buffers is "current" (and thus
649
+ //! contains the input data to be sorted).
650
+ //! - The contents of both buffers within each pair may be altered by the
651
+ //! sorting operation.
652
+ //! - In-place operations are not supported. There must be no overlap between
653
+ //! any of the provided ranges:
654
+ //!
655
+ //! - ``[d_keys.Current(), d_keys.Current() + num_items)``
656
+ //! - ``[d_keys.Alternate(), d_keys.Alternate() + num_items)``
657
+ //! - ``[d_values.Current(), d_values.Current() + num_items)``
658
+ //! - ``[d_values.Alternate(), d_values.Alternate() + num_items)``
659
+ //!
660
+ //! - Upon completion, the sorting operation will update the "current"
661
+ //! indicator within each DoubleBuffer wrapper to reference which of the two
662
+ //! buffers now contains the sorted output sequence (a function of the
663
+ //! number of key bits specified and the targeted device architecture).
664
+ //! - An optional bit subrange ``[begin_bit, end_bit)`` of differentiating key
665
+ //! bits can be specified. This can reduce overall sorting overhead and
666
+ //! yield a corresponding performance improvement.
667
+ //! - @devicestorageP
668
+ //! - @devicestorage
669
+ //!
670
+ //! Snippet
671
+ //! --------------------------------------------------
672
+ //!
673
+ //! The code snippet below illustrates the sorting of a device vector of ``int``
674
+ //! keys with associated vector of ``int`` values.
675
+ //! @endrst
676
+ //!
677
+ //! @code
678
+ //! #include <cub/cub.cuh>
679
+ //! // or equivalently <cub/device/device_radix_sort.cuh>
680
+ //!
681
+ //! // Declare, allocate, and initialize device-accessible pointers for
682
+ //! // sorting data
683
+ //! int num_items; // e.g., 7
684
+ //! int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9]
685
+ //! int *d_key_alt_buf; // e.g., [ ... ]
686
+ //! int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6]
687
+ //! int *d_value_alt_buf; // e.g., [ ... ]
688
+ //! ...
689
+ //!
690
+ //! // Create a set of DoubleBuffers to wrap pairs of device pointers
691
+ //! cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
692
+ //! cub::DoubleBuffer<int> d_values(d_value_buf, d_value_alt_buf);
693
+ //!
694
+ //! // Determine temporary device storage requirements
695
+ //! void *d_temp_storage = nullptr;
696
+ //! size_t temp_storage_bytes = 0;
697
+ //! cub::DeviceRadixSort::SortPairs(
698
+ //! d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items);
699
+ //!
700
+ //! // Allocate temporary storage
701
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
702
+ //!
703
+ //! // Run sorting operation
704
+ //! cub::DeviceRadixSort::SortPairs(
705
+ //! d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items);
706
+ //!
707
+ //! // d_keys.Current() <-- [0, 3, 5, 6, 7, 8, 9]
708
+ //! // d_values.Current() <-- [5, 4, 3, 1, 2, 0, 6]
709
+ //!
710
+ //! @endcode
711
+ //!
712
+ //! @tparam KeyT
713
+ //! **[inferred]** KeyT type
714
+ //!
715
+ //! @tparam ValueT
716
+ //! **[inferred]** ValueT type
717
+ //!
718
+ //! @tparam NumItemsT
719
+ //! **[inferred]** Type of num_items
720
+ //!
721
+ //! @param[in] d_temp_storage
722
+ //! Device-accessible allocation of temporary storage. When ``nullptr``, the
723
+ //! required allocation size is written to ``temp_storage_bytes`` and no work is done.
724
+ //!
725
+ //! @param[in,out] temp_storage_bytes
726
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
727
+ //!
728
+ //! @param[in,out] d_keys
729
+ //! Reference to the double-buffer of keys whose "current" device-accessible
730
+ //! buffer contains the unsorted input keys and, upon return, is updated to
731
+ //! point to the sorted output keys
732
+ //!
733
+ //! @param[in,out] d_values
734
+ //! Double-buffer of values whose "current" device-accessible buffer
735
+ //! contains the unsorted input values and, upon return, is updated to point
736
+ //! to the sorted output values
737
+ //!
738
+ //! @param[in] num_items
739
+ //! Number of items to sort
740
+ //!
741
+ //! @param[in] begin_bit
742
+ //! **[optional]** The least-significant bit index (inclusive) needed for
743
+ //! key comparison
744
+ //!
745
+ //! @param[in] end_bit
746
+ //! **[optional]** The most-significant bit index (exclusive) needed for key
747
+ //! comparison (e.g., ``sizeof(unsigned int) * 8``)
748
+ //!
749
+ //! @param[in] stream
750
+ //! **[optional]** CUDA stream to launch kernels within.
751
+ //! Default is stream<sub>0</sub>.
752
+ template <typename KeyT, typename ValueT, typename NumItemsT>
753
+ CUB_RUNTIME_FUNCTION static cudaError_t SortPairs(
754
+ void* d_temp_storage,
755
+ size_t& temp_storage_bytes,
756
+ DoubleBuffer<KeyT>& d_keys,
757
+ DoubleBuffer<ValueT>& d_values,
758
+ NumItemsT num_items,
759
+ int begin_bit = 0,
760
+ int end_bit = sizeof(KeyT) * 8,
761
+ cudaStream_t stream = 0)
762
+ {
763
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());
764
+
765
+ // Unsigned integer type for global offsets.
766
+ using OffsetT = detail::choose_offset_t<NumItemsT>;
767
+
768
+ constexpr bool is_overwrite_okay = true;
769
+
770
+ return DispatchRadixSort<SortOrder::Ascending, KeyT, ValueT, OffsetT>::Dispatch(
771
+ d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, begin_bit, end_bit, is_overwrite_okay, stream);
772
+ }
773
+
774
+ //! @rst
775
+ //! Sorts key-value pairs into ascending order using :math:`\approx N` auxiliary storage.
776
+ //!
777
+ //! * The sorting operation is given a pair of key buffers and a corresponding
778
+ //! pair of associated value buffers. Each pair is managed by a DoubleBuffer
779
+ //! structure that indicates which of the two buffers is "current" (and thus
780
+ //! contains the input data to be sorted).
781
+ //! * The contents of both buffers within each pair may be altered by the
782
+ //! sorting operation.
783
+ //! * In-place operations are not supported. There must be no overlap between
784
+ //! any of the provided ranges:
785
+ //!
786
+ //! - ``[d_keys.Current(), d_keys.Current() + num_items)``
787
+ //! - ``[d_keys.Alternate(), d_keys.Alternate() + num_items)``
788
+ //! - ``[d_values.Current(), d_values.Current() + num_items)``
789
+ //! - ``[d_values.Alternate(), d_values.Alternate() + num_items)``
790
+ //!
791
+ //! * Upon completion, the sorting operation will update the "current"
791
+ //! indicator within each DoubleBuffer wrapper to reference which of the two
792
+ //! buffers now contains the sorted output sequence (a function of the
793
+ //! number of key bits specified and the targeted device architecture).
794
+ //! * @devicestorageP
795
+ //! * @devicestorage
797
+ //!
798
+ //! Snippet
799
+ //! --------------------------------------------------
800
+ //!
801
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
802
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
803
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
804
+ //! tuple of references to relevant members of the key.
805
+ //!
806
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
807
+ //! :language: c++
808
+ //! :dedent:
809
+ //! :start-after: example-begin custom-type
810
+ //! :end-before: example-end custom-type
811
+ //!
812
+ //! The following snippet shows how to sort an array of ``custom_t`` objects
813
+ //! using ``cub::DeviceRadixSort::SortPairs``:
814
+ //!
815
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
816
+ //! :language: c++
817
+ //! :dedent:
818
+ //! :start-after: example-begin pairs-db
819
+ //! :end-before: example-end pairs-db
820
+ //!
821
+ //! @endrst
822
+ //!
823
+ //! @tparam KeyT
824
+ //! **[inferred]** KeyT type
825
+ //!
826
+ //! @tparam ValueT
827
+ //! **[inferred]** ValueT type
828
+ //!
829
+ //! @tparam NumItemsT
830
+ //! **[inferred]** Type of num_items
831
+ //!
832
+ //! @tparam DecomposerT
833
+ //! **[inferred]** Type of a callable object responsible for decomposing a
834
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
835
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
836
+ //! The leftmost element of the tuple is considered the most significant.
837
+ //! The call operator must not modify members of the key.
838
+ //!
839
+ //! @param[in] d_temp_storage
840
+ //! Device-accessible allocation of temporary storage. When ``nullptr``, the
841
+ //! required allocation size is written to ``temp_storage_bytes`` and no work
842
+ //! is done.
843
+ //!
844
+ //! @param[in,out] temp_storage_bytes
845
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
846
+ //!
847
+ //! @param[in,out] d_keys
848
+ //! Reference to the double-buffer of keys whose "current" device-accessible
849
+ //! buffer contains the unsorted input keys and, upon return, is updated to
850
+ //! point to the sorted output keys
851
+ //!
852
+ //! @param[in,out] d_values
853
+ //! Double-buffer of values whose "current" device-accessible buffer
854
+ //! contains the unsorted input values and, upon return, is updated to point
855
+ //! to the sorted output values
856
+ //!
857
+ //! @param[in] num_items
858
+ //! Number of items to sort
859
+ //!
860
+ //! @param decomposer
861
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
862
+ //! references to its constituent arithmetic types. The leftmost element of
863
+ //! the tuple is considered the most significant. The call operator must not
864
+ //! modify members of the key.
865
+ //!
866
+ //! @param[in] stream
867
+ //! **[optional]** CUDA stream to launch kernels within.
868
+ //! Default is stream<sub>0</sub>.
869
+ template <typename KeyT, typename ValueT, typename NumItemsT, typename DecomposerT>
870
+ CUB_RUNTIME_FUNCTION static //
871
+ ::cuda::std::enable_if_t< //
872
+ !::cuda::std::is_convertible_v<DecomposerT, int>, //
873
+ cudaError_t>
874
+ SortPairs(void* d_temp_storage,
875
+ size_t& temp_storage_bytes,
876
+ DoubleBuffer<KeyT>& d_keys,
877
+ DoubleBuffer<ValueT>& d_values,
878
+ NumItemsT num_items,
879
+ DecomposerT decomposer,
880
+ cudaStream_t stream = 0)
881
+ {
882
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());
883
+
884
+ // unsigned integer type for global offsets
885
+ using offset_t = detail::choose_offset_t<NumItemsT>;
886
+ using decomposer_check_t = detail::radix::decomposer_check_t<KeyT, DecomposerT>;
887
+
888
+ static_assert(decomposer_check_t::value,
889
+ "DecomposerT must be a callable object returning a tuple of references to "
890
+ "arithmetic types");
891
+
892
+ constexpr bool is_overwrite_okay = true;
893
+
894
+ return DeviceRadixSort::custom_radix_sort<SortOrder::Ascending>(
895
+ decomposer_check_t{},
896
+ d_temp_storage,
897
+ temp_storage_bytes,
898
+ is_overwrite_okay,
899
+ d_keys,
900
+ d_values,
901
+ static_cast<offset_t>(num_items),
902
+ decomposer,
903
+ stream);
904
+ }
905
+
906
+ //! @rst
907
+ //! Sorts key-value pairs into ascending order using :math:`\approx N` auxiliary storage.
908
+ //!
909
+ //! * The sorting operation is given a pair of key buffers and a corresponding
910
+ //! pair of associated value buffers. Each pair is managed by a DoubleBuffer
911
+ //! structure that indicates which of the two buffers is "current" (and thus
912
+ //! contains the input data to be sorted).
913
+ //! * The contents of both buffers within each pair may be altered by the
914
+ //! sorting operation.
915
+ //! * In-place operations are not supported. There must be no overlap between
916
+ //! any of the provided ranges:
917
+ //!
918
+ //! - ``[d_keys.Current(), d_keys.Current() + num_items)``
919
+ //! - ``[d_keys.Alternate(), d_keys.Alternate() + num_items)``
920
+ //! - ``[d_values.Current(), d_values.Current() + num_items)``
921
+ //! - ``[d_values.Alternate(), d_values.Alternate() + num_items)``
922
+ //!
923
+ //! * Upon completion, the sorting operation will update the "current"
923
+ //! indicator within each DoubleBuffer wrapper to reference which of the two
924
+ //! buffers now contains the sorted output sequence (a function of the
925
+ //! number of key bits specified and the targeted device architecture).
926
+ //! * A bit subrange ``[begin_bit, end_bit)`` is provided to specify
927
+ //! differentiating key bits. This can reduce overall sorting overhead and
928
+ //! yield a corresponding performance improvement.
929
+ //! * @devicestorageP
930
+ //! * @devicestorage
932
+ //!
933
+ //! Snippet
934
+ //! --------------------------------------------------
935
+ //!
936
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
937
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
938
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
939
+ //! tuple of references to relevant members of the key.
940
+ //!
941
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
942
+ //! :language: c++
943
+ //! :dedent:
944
+ //! :start-after: example-begin custom-type
945
+ //! :end-before: example-end custom-type
946
+ //!
947
+ //! The following snippet shows how to sort an array of ``custom_t`` objects
948
+ //! using ``cub::DeviceRadixSort::SortPairs``:
949
+ //!
950
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
951
+ //! :language: c++
952
+ //! :dedent:
953
+ //! :start-after: example-begin pairs-bits-db
954
+ //! :end-before: example-end pairs-bits-db
955
+ //!
956
+ //! @endrst
957
+ //!
958
+ //! @tparam KeyT
959
+ //! **[inferred]** KeyT type
960
+ //!
961
+ //! @tparam ValueT
962
+ //! **[inferred]** ValueT type
963
+ //!
964
+ //! @tparam NumItemsT
965
+ //! **[inferred]** Type of num_items
966
+ //!
967
+ //! @tparam DecomposerT
968
+ //! **[inferred]** Type of a callable object responsible for decomposing a
969
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
970
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
971
+ //! The leftmost element of the tuple is considered the most significant.
972
+ //! The call operator must not modify members of the key.
973
+ //!
974
+ //! @param[in] d_temp_storage
975
+ //! Device-accessible allocation of temporary storage. When ``nullptr``, the
976
+ //! required allocation size is written to ``temp_storage_bytes`` and no work
977
+ //! is done.
978
+ //!
979
+ //! @param[in,out] temp_storage_bytes
980
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
981
+ //!
982
+ //! @param[in,out] d_keys
983
+ //! Reference to the double-buffer of keys whose "current" device-accessible
984
+ //! buffer contains the unsorted input keys and, upon return, is updated to
985
+ //! point to the sorted output keys
986
+ //!
987
+ //! @param[in,out] d_values
988
+ //! Double-buffer of values whose "current" device-accessible buffer
989
+ //! contains the unsorted input values and, upon return, is updated to point
990
+ //! to the sorted output values
991
+ //!
992
+ //! @param[in] num_items
993
+ //! Number of items to sort
994
+ //!
995
+ //! @param decomposer
996
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
997
+ //! references to its constituent arithmetic types. The leftmost element of
998
+ //! the tuple is considered the most significant. The call operator must not
999
+ //! modify members of the key.
1000
+ //!
1001
+ //! @param[in] begin_bit
1002
+ //! The least-significant bit index (inclusive) needed for
1003
+ //! key comparison
1004
+ //!
1005
+ //! @param[in] end_bit
1006
+ //! The most-significant bit index (exclusive) needed for key
1007
+ //! comparison (e.g., ``(sizeof(float) + sizeof(long long int)) * 8``)
1008
+ //!
1009
+ //! @param[in] stream
1010
+ //! **[optional]** CUDA stream to launch kernels within.
1011
+ //! Default is stream<sub>0</sub>.
1012
+ template <typename KeyT, typename ValueT, typename NumItemsT, typename DecomposerT>
1013
+ CUB_RUNTIME_FUNCTION static //
1014
+ ::cuda::std::enable_if_t< //
1015
+ !::cuda::std::is_convertible_v<DecomposerT, int>, //
1016
+ cudaError_t>
1017
+ SortPairs(void* d_temp_storage,
1018
+ size_t& temp_storage_bytes,
1019
+ DoubleBuffer<KeyT>& d_keys,
1020
+ DoubleBuffer<ValueT>& d_values,
1021
+ NumItemsT num_items,
1022
+ DecomposerT decomposer,
1023
+ int begin_bit,
1024
+ int end_bit,
1025
+ cudaStream_t stream = 0)
1026
+ {
1027
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());
1028
+
1029
+ // unsigned integer type for global offsets
1030
+ using offset_t = detail::choose_offset_t<NumItemsT>;
1031
+ using decomposer_check_t = detail::radix::decomposer_check_t<KeyT, DecomposerT>;
1032
+
1033
+ static_assert(decomposer_check_t::value,
1034
+ "DecomposerT must be a callable object returning a tuple of references to "
1035
+ "arithmetic types");
1036
+
1037
+ constexpr bool is_overwrite_okay = true;
1038
+
1039
+ return DeviceRadixSort::custom_radix_sort<SortOrder::Ascending>(
1040
+ decomposer_check_t{},
1041
+ d_temp_storage,
1042
+ temp_storage_bytes,
1043
+ is_overwrite_okay,
1044
+ d_keys,
1045
+ d_values,
1046
+ static_cast<offset_t>(num_items),
1047
+ decomposer,
1048
+ begin_bit,
1049
+ end_bit,
1050
+ stream);
1051
+ }
1052
+
1053
+ //! @rst
1054
+ //! Sorts key-value pairs into descending order using :math:`\approx 2N` auxiliary storage.
1055
+ //!
1056
+ //! - The contents of the input data are not altered by the sorting operation.
1057
+ //! - Pointers to contiguous memory must be used; iterators are not currently
1058
+ //! supported.
1059
+ //! - In-place operations are not supported. There must be no overlap between
1060
+ //! any of the provided ranges:
1061
+ //!
1062
+ //! - ``[d_keys_in, d_keys_in + num_items)``
1063
+ //! - ``[d_keys_out, d_keys_out + num_items)``
1064
+ //! - ``[d_values_in, d_values_in + num_items)``
1065
+ //! - ``[d_values_out, d_values_out + num_items)``
1066
+ //!
1067
+ //! - An optional bit subrange ``[begin_bit, end_bit)`` of differentiating key
1068
+ //! bits can be specified. This can reduce overall sorting overhead and
1069
+ //! yield a corresponding performance improvement.
1070
+ //! - @devicestorageNP For sorting using only ``O(P)`` temporary storage, see
1071
+ //! the sorting interface using DoubleBuffer wrappers below.
1072
+ //! - @devicestorage
1073
+ //!
1074
+ //! Snippet
1075
+ //! --------------------------------------------------
1076
+ //!
1077
+ //! The code snippet below illustrates the sorting of a device vector of ``int``
1078
+ //! keys with associated vector of ``int`` values.
1079
+ //! @endrst
1080
+ //!
1081
+ //! @code{.cpp}
1082
+ //! #include <cub/cub.cuh>
1083
+ //! // or equivalently <cub/device/device_radix_sort.cuh>
1084
+ //!
1085
+ //! // Declare, allocate, and initialize device-accessible pointers
1086
+ //! // for sorting data
1087
+ //! int num_items; // e.g., 7
1088
+ //! int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
1089
+ //! int *d_keys_out; // e.g., [ ... ]
1090
+ //! int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6]
1091
+ //! int *d_values_out; // e.g., [ ... ]
1092
+ //! ...
1093
+ //!
1094
+ //! // Determine temporary device storage requirements
1095
+ //! void *d_temp_storage = nullptr;
1096
+ //! size_t temp_storage_bytes = 0;
1097
+ //! cub::DeviceRadixSort::SortPairsDescending(
1098
+ //! d_temp_storage, temp_storage_bytes,
1099
+ //! d_keys_in, d_keys_out, d_values_in, d_values_out, num_items);
1100
+ //!
1101
+ //! // Allocate temporary storage
1102
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
1103
+ //!
1104
+ //! // Run sorting operation
1105
+ //! cub::DeviceRadixSort::SortPairsDescending(
1106
+ //! d_temp_storage, temp_storage_bytes,
1107
+ //! d_keys_in, d_keys_out, d_values_in, d_values_out, num_items);
1108
+ //!
1109
+ //! // d_keys_out <-- [9, 8, 7, 6, 5, 3, 0]
1110
+ //! // d_values_out <-- [6, 0, 2, 1, 3, 4, 5]
1111
+ //! @endcode
1112
+ //!
1113
+ //! @tparam KeyT
1114
+ //! **[inferred]** KeyT type
1115
+ //!
1116
+ //! @tparam ValueT
1117
+ //! **[inferred]** ValueT type
1118
+ //!
1119
+ //! @tparam NumItemsT
1120
+ //! **[inferred]** Type of num_items
1121
+ //!
1122
+ //! @param[in] d_temp_storage
1123
+ //! Device-accessible allocation of temporary storage. When ``nullptr``, the
1124
+ //! required allocation size is written to ``temp_storage_bytes`` and no work
1125
+ //! is done.
1126
+ //!
1127
+ //! @param[in,out] temp_storage_bytes
1128
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
1129
+ //!
1130
+ //! @param[in] d_keys_in
1131
+ //! Pointer to the input sequence of key data to sort
1132
+ //!
1133
+ //! @param[out] d_keys_out
1134
+ //! Pointer to the sorted output sequence of key data
1135
+ //!
1136
+ //! @param[in] d_values_in
1137
+ //! Pointer to the corresponding input sequence of associated value items
1138
+ //!
1139
+ //! @param[out] d_values_out
1140
+ //! Pointer to the correspondingly-reordered output sequence of associated
1141
+ //! value items
1142
+ //!
1143
+ //! @param[in] num_items
1144
+ //! Number of items to sort
1145
+ //!
1146
+ //! @param[in] begin_bit
1147
+ //! **[optional]** The least-significant bit index (inclusive) needed for
1148
+ //! key comparison
1149
+ //!
1150
+ //! @param[in] end_bit
1151
+ //! **[optional]** The most-significant bit index (exclusive) needed for key
1152
+ //! comparison (e.g., ``sizeof(unsigned int) * 8``)
1153
+ //!
1154
+ //! @param[in] stream
1155
+ //! **[optional]** CUDA stream to launch kernels within.
1156
+ //! Default is stream<sub>0</sub>.
1157
+ template <typename KeyT, typename ValueT, typename NumItemsT>
1158
+ CUB_RUNTIME_FUNCTION static cudaError_t SortPairsDescending(
1159
+ void* d_temp_storage,
1160
+ size_t& temp_storage_bytes,
1161
+ const KeyT* d_keys_in,
1162
+ KeyT* d_keys_out,
1163
+ const ValueT* d_values_in,
1164
+ ValueT* d_values_out,
1165
+ NumItemsT num_items,
1166
+ int begin_bit = 0,
1167
+ int end_bit = sizeof(KeyT) * 8, // default: size of KeyT in bytes, times 8
1168
+ cudaStream_t stream = 0)
1169
+ {
1170
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); // NOTE(review): presumably opens the NVTX range only when d_temp_storage is non-null - confirm against the macro
1171
+
1172
+ // Unsigned integer type for global offsets, selected from NumItemsT.
1173
+ using OffsetT = detail::choose_offset_t<NumItemsT>;
1174
+
1175
+ // We cast away const-ness, but will *not* write to these arrays.
1176
+ // ``DispatchRadixSort::Dispatch`` will allocate temporary storage and
1177
+ // create a new double-buffer internally when the ``is_overwrite_okay`` flag
1178
+ // is not set.
1179
+ constexpr bool is_overwrite_okay = false; // the input pointers are const, so overwriting is not allowed
1180
+ DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out); // (input, output) key pointers
1181
+ DoubleBuffer<ValueT> d_values(const_cast<ValueT*>(d_values_in), d_values_out); // (input, output) value pointers
1182
+
1183
+ return DispatchRadixSort<SortOrder::Descending, KeyT, ValueT, OffsetT>::Dispatch(
1184
+ d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, begin_bit, end_bit, is_overwrite_okay, stream); // NOTE(review): num_items is forwarded as NumItemsT here, while the decomposer overloads cast it to the offset type explicitly - confirm the implicit conversion is intended
1185
+ }
1186
+
1187
+ //! @rst
1188
+ //! Sorts key-value pairs into descending order using :math:`\approx 2N` auxiliary storage.
1189
+ //!
1190
+ //! * The contents of the input data are not altered by the sorting operation.
1191
+ //! * Pointers to contiguous memory must be used; iterators are not currently
1192
+ //! supported.
1193
+ //! * In-place operations are not supported. There must be no overlap between
1194
+ //! any of the provided ranges:
1195
+ //!
1196
+ //! * ``[d_keys_in, d_keys_in + num_items)``
1197
+ //! * ``[d_keys_out, d_keys_out + num_items)``
1198
+ //! * ``[d_values_in, d_values_in + num_items)``
1199
+ //! * ``[d_values_out, d_values_out + num_items)``
1200
+ //!
1201
+ //! * A bit subrange ``[begin_bit, end_bit)`` is provided to specify
1202
+ //! differentiating key bits. This can reduce overall sorting overhead and
1203
+ //! yield a corresponding performance improvement.
1204
+ //! * @devicestorageNP For sorting using only :math:`O(P)` temporary storage, see
1205
+ //! the sorting interface using DoubleBuffer wrappers below.
1206
+ //! * @devicestorage
1207
+ //!
1208
+ //! Snippet
1209
+ //! --------------------------------------------------
1210
+ //!
1211
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
1212
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
1213
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
1214
+ //! tuple of references to relevant members of the key.
1215
+ //!
1216
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
1217
+ //! :language: c++
1218
+ //! :dedent:
1219
+ //! :start-after: example-begin custom-type
1220
+ //! :end-before: example-end custom-type
1221
+ //!
1222
+ //! The following snippet shows how to sort an array of ``custom_t`` objects
1223
+ //! using ``cub::DeviceRadixSort::SortPairsDescending``:
1224
+ //!
1225
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
1226
+ //! :language: c++
1227
+ //! :dedent:
1228
+ //! :start-after: example-begin pairs-descending-bits
1229
+ //! :end-before: example-end pairs-descending-bits
1230
+ //!
1231
+ //! @endrst
1232
+ //!
1233
+ //! @tparam KeyT
1234
+ //! **[inferred]** KeyT type
1235
+ //!
1236
+ //! @tparam ValueT
1237
+ //! **[inferred]** ValueT type
1238
+ //!
1239
+ //! @tparam NumItemsT
1240
+ //! **[inferred]** Type of num_items
1241
+ //!
1242
+ //! @tparam DecomposerT
1243
+ //! **[inferred]** Type of a callable object responsible for decomposing a
1244
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
1245
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
1246
+ //! The leftmost element of the tuple is considered the most significant.
1247
+ //! The call operator must not modify members of the key.
1248
+ //!
1249
+ //! @param[in] d_temp_storage
1250
+ //! Device-accessible allocation of temporary storage. When ``nullptr``, the
1251
+ //! required allocation size is written to ``temp_storage_bytes`` and no work
1252
+ //! is done.
1253
+ //!
1254
+ //! @param[in,out] temp_storage_bytes
1255
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
1256
+ //!
1257
+ //! @param[in] d_keys_in
1258
+ //! Pointer to the input sequence of key data to sort
1259
+ //!
1260
+ //! @param[out] d_keys_out
1261
+ //! Pointer to the sorted output sequence of key data
1262
+ //!
1263
+ //! @param[in] d_values_in
1264
+ //! Pointer to the corresponding input sequence of associated value items
1265
+ //!
1266
+ //! @param[out] d_values_out
1267
+ //! Pointer to the correspondingly-reordered output sequence of associated
1268
+ //! value items
1269
+ //!
1270
+ //! @param[in] num_items
1271
+ //! Number of items to sort
1272
+ //!
1273
+ //! @param[in] decomposer
1274
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
1275
+ //! references to its constituent arithmetic types. The leftmost element of
1276
+ //! the tuple is considered the most significant. The call operator must not
1277
+ //! modify members of the key.
1278
+ //!
1279
+ //! @param[in] begin_bit
1280
+ //! **[optional]** The least-significant bit index (inclusive) needed for
1281
+ //! key comparison
1282
+ //!
1283
+ //! @param[in] end_bit
1284
+ //! **[optional]** The most-significant bit index (exclusive) needed for key
1285
+ //! comparison (e.g., ``(sizeof(float) + sizeof(long long int)) * 8``)
1286
+ //!
1287
+ //! @param[in] stream
1288
+ //! **[optional]** CUDA stream to launch kernels within.
1289
+ //! Default is stream<sub>0</sub>.
1290
+ template <typename KeyT, typename ValueT, typename NumItemsT, typename DecomposerT>
1291
+ CUB_RUNTIME_FUNCTION static //
1292
+ ::cuda::std::enable_if_t< //
1293
+ !::cuda::std::is_convertible_v<DecomposerT, int>, // overload is removed when DecomposerT converts to int
1294
+ cudaError_t>
1295
+ SortPairsDescending(
1296
+ void* d_temp_storage,
1297
+ size_t& temp_storage_bytes,
1298
+ const KeyT* d_keys_in,
1299
+ KeyT* d_keys_out,
1300
+ const ValueT* d_values_in,
1301
+ ValueT* d_values_out,
1302
+ NumItemsT num_items,
1303
+ DecomposerT decomposer,
1304
+ int begin_bit,
1305
+ int end_bit,
1306
+ cudaStream_t stream = 0)
1307
+ {
1308
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); // NOTE(review): presumably opens the NVTX range only when d_temp_storage is non-null - confirm against the macro
1309
+
1310
+ // unsigned integer type for global offsets, selected from NumItemsT
1311
+ using offset_t = detail::choose_offset_t<NumItemsT>;
1312
+ using decomposer_check_t = detail::radix::decomposer_check_t<KeyT, DecomposerT>;
1313
+
1314
+ static_assert(decomposer_check_t::value,
1315
+ "DecomposerT must be a callable object returning a tuple of references to "
1316
+ "arithmetic types");
1317
+
1318
+ // We cast away const-ness, but will *not* write to these arrays.
1319
+ // ``DispatchRadixSort::Dispatch`` will allocate temporary storage and
1320
+ // create a new double-buffer internally when the ``is_overwrite_okay`` flag
1321
+ // is not set. NOTE(review): this overload calls ``custom_radix_sort``; presumably it reaches that dispatch - confirm.
1322
+ constexpr bool is_overwrite_okay = false; // the input pointers are const, so overwriting is not allowed
1323
+ DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out); // (input, output) key pointers
1324
+ DoubleBuffer<ValueT> d_values(const_cast<ValueT*>(d_values_in), d_values_out); // (input, output) value pointers
1325
+
1326
+ return DeviceRadixSort::custom_radix_sort<SortOrder::Descending>(
1327
+ decomposer_check_t{},
1328
+ d_temp_storage,
1329
+ temp_storage_bytes,
1330
+ is_overwrite_okay,
1331
+ d_keys,
1332
+ d_values,
1333
+ static_cast<offset_t>(num_items), // item count converted to the offset type
1334
+ decomposer,
1335
+ begin_bit,
1336
+ end_bit,
1337
+ stream);
1338
+ }
1339
+
1340
+ //! @rst
1341
+ //! Sorts key-value pairs into descending order using :math:`\approx 2N` auxiliary storage.
1342
+ //!
1343
+ //! * The contents of the input data are not altered by the sorting operation.
1344
+ //! * Pointers to contiguous memory must be used; iterators are not currently
1345
+ //! supported.
1346
+ //! * In-place operations are not supported. There must be no overlap between
1347
+ //! any of the provided ranges:
1348
+ //!
1349
+ //! * ``[d_keys_in, d_keys_in + num_items)``
1350
+ //! * ``[d_keys_out, d_keys_out + num_items)``
1351
+ //! * ``[d_values_in, d_values_in + num_items)``
1352
+ //! * ``[d_values_out, d_values_out + num_items)``
1353
+ //!
1354
+ //! * @devicestorageNP For sorting using only :math:`O(P)` temporary storage, see
1355
+ //! the sorting interface using DoubleBuffer wrappers below.
1356
+ //! * @devicestorage
1357
+ //!
1358
+ //! Snippet
1359
+ //! --------------------------------------------------
1360
+ //!
1361
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
1362
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
1363
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
1364
+ //! tuple of references to relevant members of the key.
1365
+ //!
1366
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
1367
+ //! :language: c++
1368
+ //! :dedent:
1369
+ //! :start-after: example-begin custom-type
1370
+ //! :end-before: example-end custom-type
1371
+ //!
1372
+ //! The following snippet shows how to sort an array of ``custom_t`` objects
1373
+ //! using ``cub::DeviceRadixSort::SortPairsDescending``:
1374
+ //!
1375
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
1376
+ //! :language: c++
1377
+ //! :dedent:
1378
+ //! :start-after: example-begin pairs-descending
1379
+ //! :end-before: example-end pairs-descending
1380
+ //!
1381
+ //! @endrst
1382
+ //!
1383
+ //! @tparam KeyT
1384
+ //! **[inferred]** KeyT type
1385
+ //!
1386
+ //! @tparam ValueT
1387
+ //! **[inferred]** ValueT type
1388
+ //!
1389
+ //! @tparam NumItemsT
1390
+ //! **[inferred]** Type of num_items
1391
+ //!
1392
+ //! @tparam DecomposerT
1393
+ //! **[inferred]** Type of a callable object responsible for decomposing a
1394
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
1395
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
1396
+ //! The leftmost element of the tuple is considered the most significant.
1397
+ //! The call operator must not modify members of the key.
1398
+ //!
1399
+ //! @param[in] d_temp_storage
1400
+ //! Device-accessible allocation of temporary storage. When ``nullptr``, the
1401
+ //! required allocation size is written to ``temp_storage_bytes`` and no work
1402
+ //! is done.
1403
+ //!
1404
+ //! @param[in,out] temp_storage_bytes
1405
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
1406
+ //!
1407
+ //! @param[in] d_keys_in
1408
+ //! Pointer to the input sequence of key data to sort
1409
+ //!
1410
+ //! @param[out] d_keys_out
1411
+ //! Pointer to the sorted output sequence of key data
1412
+ //!
1413
+ //! @param[in] d_values_in
1414
+ //! Pointer to the corresponding input sequence of associated value items
1415
+ //!
1416
+ //! @param[out] d_values_out
1417
+ //! Pointer to the correspondingly-reordered output sequence of associated
1418
+ //! value items
1419
+ //!
1420
+ //! @param[in] num_items
1421
+ //! Number of items to sort
1422
+ //!
1423
+ //! @param[in] decomposer
1424
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
1425
+ //! references to its constituent arithmetic types. The leftmost element of
1426
+ //! the tuple is considered the most significant. The call operator must not
1427
+ //! modify members of the key.
1428
+ //!
1429
+ //! @param[in] stream
1430
+ //! **[optional]** CUDA stream to launch kernels within.
1431
+ //! Default is stream<sub>0</sub>.
1432
+ template <typename KeyT, typename ValueT, typename NumItemsT, typename DecomposerT>
1433
+ CUB_RUNTIME_FUNCTION static //
1434
+ ::cuda::std::enable_if_t< //
1435
+ !::cuda::std::is_convertible_v<DecomposerT, int>, // overload is removed when DecomposerT converts to int
1436
+ cudaError_t>
1437
+ SortPairsDescending(
1438
+ void* d_temp_storage,
1439
+ size_t& temp_storage_bytes,
1440
+ const KeyT* d_keys_in,
1441
+ KeyT* d_keys_out,
1442
+ const ValueT* d_values_in,
1443
+ ValueT* d_values_out,
1444
+ NumItemsT num_items,
1445
+ DecomposerT decomposer,
1446
+ cudaStream_t stream = 0)
1447
+ {
1448
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); // NOTE(review): presumably opens the NVTX range only when d_temp_storage is non-null - confirm against the macro
1449
+
1450
+ // unsigned integer type for global offsets, selected from NumItemsT
1451
+ using offset_t = detail::choose_offset_t<NumItemsT>;
1452
+ using decomposer_check_t = detail::radix::decomposer_check_t<KeyT, DecomposerT>;
1453
+
1454
+ static_assert(decomposer_check_t::value,
1455
+ "DecomposerT must be a callable object returning a tuple of references to "
1456
+ "arithmetic types");
1457
+
1458
+ // We cast away const-ness, but will *not* write to these arrays.
1459
+ // ``DispatchRadixSort::Dispatch`` will allocate temporary storage and
1460
+ // create a new double-buffer internally when the ``is_overwrite_okay`` flag
1461
+ // is not set. NOTE(review): this overload calls ``custom_radix_sort``; presumably it reaches that dispatch - confirm.
1462
+ constexpr bool is_overwrite_okay = false; // the input pointers are const, so overwriting is not allowed
1463
+ DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out); // (input, output) key pointers
1464
+ DoubleBuffer<ValueT> d_values(const_cast<ValueT*>(d_values_in), d_values_out); // (input, output) value pointers
1465
+
1466
+ return DeviceRadixSort::custom_radix_sort<SortOrder::Descending>(
1467
+ decomposer_check_t{},
1468
+ d_temp_storage,
1469
+ temp_storage_bytes,
1470
+ is_overwrite_okay,
1471
+ d_keys,
1472
+ d_values,
1473
+ static_cast<offset_t>(num_items), // item count converted to the offset type
1474
+ decomposer,
1475
+ stream);
1476
+ }
1477
+
1478
+ //! @rst
1479
+ //! Sorts key-value pairs into descending order using :math:`\approx N` auxiliary storage.
1480
+ //!
1481
+ //! - The sorting operation is given a pair of key buffers and a corresponding
1482
+ //! pair of associated value buffers. Each pair is managed by a DoubleBuffer
1483
+ //! structure that indicates which of the two buffers is "current" (and thus
1484
+ //! contains the input data to be sorted).
1485
+ //! - The contents of both buffers within each pair may be altered by the
1486
+ //! sorting operation.
1487
+ //! - In-place operations are not supported. There must be no overlap between
1488
+ //! any of the provided ranges:
1489
+ //!
1490
+ //! - ``[d_keys.Current(), d_keys.Current() + num_items)``
1491
+ //! - ``[d_keys.Alternate(), d_keys.Alternate() + num_items)``
1492
+ //! - ``[d_values.Current(), d_values.Current() + num_items)``
1493
+ //! - ``[d_values.Alternate(), d_values.Alternate() + num_items)``
1494
+ //!
1495
+ //! - Upon completion, the sorting operation will update the "current"
1496
+ //! indicator within each DoubleBuffer wrapper to reference which of the two
1497
+ //! buffers now contains the sorted output sequence (a function of the number
1498
+ //! of key bits specified and the targeted device architecture).
1499
+ //! - An optional bit subrange ``[begin_bit, end_bit)`` of differentiating key
1500
+ //! bits can be specified. This can reduce overall sorting overhead and
1501
+ //! yield a corresponding performance improvement.
1502
+ //! - @devicestorageP
1503
+ //! - @devicestorage
1504
+ //!
1505
+ //! Snippet
1506
+ //! --------------------------------------------------
1507
+ //!
1508
+ //! The code snippet below illustrates the sorting of a device vector of ``int``
1509
+ //! keys with associated vector of ``int`` values.
1510
+ //! @endrst
1511
+ //!
1512
+ //! @code{.cpp}
1513
+ //! #include <cub/cub.cuh>
1514
+ //! // or equivalently <cub/device/device_radix_sort.cuh>
1515
+ //!
1516
+ //! // Declare, allocate, and initialize device-accessible pointers
1517
+ //! // for sorting data
1518
+ //! int num_items; // e.g., 7
1519
+ //! int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9]
1520
+ //! int *d_key_alt_buf; // e.g., [ ... ]
1521
+ //! int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6]
1522
+ //! int *d_value_alt_buf; // e.g., [ ... ]
1523
+ //! ...
1524
+ //!
1525
+ //! // Create a set of DoubleBuffers to wrap pairs of device pointers
1526
+ //! cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
1527
+ //! cub::DoubleBuffer<int> d_values(d_value_buf, d_value_alt_buf);
1528
+ //!
1529
+ //! // Determine temporary device storage requirements
1530
+ //! void *d_temp_storage = nullptr;
1531
+ //! size_t temp_storage_bytes = 0;
1532
+ //! cub::DeviceRadixSort::SortPairsDescending(
1533
+ //! d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items);
1534
+ //!
1535
+ //! // Allocate temporary storage
1536
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
1537
+ //!
1538
+ //! // Run sorting operation
1539
+ //! cub::DeviceRadixSort::SortPairsDescending(
1540
+ //! d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items);
1541
+ //!
1542
+ //! // d_keys.Current() <-- [9, 8, 7, 6, 5, 3, 0]
1543
+ //! // d_values.Current() <-- [6, 0, 2, 1, 3, 4, 5]
1544
+ //! @endcode
1545
+ //!
1546
+ //! @tparam KeyT
1547
+ //! **[inferred]** KeyT type
1548
+ //!
1549
+ //! @tparam ValueT
1550
+ //! **[inferred]** ValueT type
1551
+ //!
1552
+ //! @tparam NumItemsT
1553
+ //! **[inferred]** Type of num_items
1554
+ //!
1555
+ //! @param[in] d_temp_storage
1556
+ //! Device-accessible allocation of temporary storage. When ``nullptr``, the
1557
+ //! required allocation size is written to ``temp_storage_bytes`` and no work
1558
+ //! is done.
1559
+ //!
1560
+ //! @param[in,out] temp_storage_bytes
1561
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
1562
+ //!
1563
+ //! @param[in,out] d_keys
1564
+ //! Reference to the double-buffer of keys whose "current" device-accessible
1565
+ //! buffer contains the unsorted input keys and, upon return, is updated to
1566
+ //! point to the sorted output keys
1567
+ //!
1568
+ //! @param[in,out] d_values
1569
+ //! Double-buffer of values whose "current" device-accessible buffer
1570
+ //! contains the unsorted input values and, upon return, is updated to point
1571
+ //! to the sorted output values
1572
+ //!
1573
+ //! @param[in] num_items
1574
+ //! Number of items to sort
1575
+ //!
1576
+ //! @param[in] begin_bit
1577
+ //! **[optional]** The least-significant bit index (inclusive) needed for
1578
+ //! key comparison
1579
+ //!
1580
+ //! @param[in] end_bit
1581
+ //! **[optional]** The most-significant bit index (exclusive) needed for key
1582
+ //! comparison (e.g., ``sizeof(unsigned int) * 8``)
1583
+ //!
1584
+ //! @param[in] stream
1585
+ //! **[optional]** CUDA stream to launch kernels within.
1586
+ //! Default is stream<sub>0</sub>.
1587
+ template <typename KeyT, typename ValueT, typename NumItemsT>
1588
+ CUB_RUNTIME_FUNCTION static cudaError_t SortPairsDescending(
1589
+ void* d_temp_storage,
1590
+ size_t& temp_storage_bytes,
1591
+ DoubleBuffer<KeyT>& d_keys,
1592
+ DoubleBuffer<ValueT>& d_values,
1593
+ NumItemsT num_items,
1594
+ int begin_bit = 0,
1595
+ int end_bit = sizeof(KeyT) * 8, // default: size of KeyT in bytes, times 8
1596
+ cudaStream_t stream = 0)
1597
+ {
1598
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); // NOTE(review): presumably opens the NVTX range only when d_temp_storage is non-null - confirm against the macro
1599
+
1600
+ // Unsigned integer type for global offsets, selected from NumItemsT.
1601
+ using OffsetT = detail::choose_offset_t<NumItemsT>;
1602
+
1603
+ constexpr bool is_overwrite_okay = true; // caller-owned double buffers: both halves may be altered by the sort
1604
+
1605
+ return DispatchRadixSort<SortOrder::Descending, KeyT, ValueT, OffsetT>::Dispatch(
1606
+ d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, begin_bit, end_bit, is_overwrite_okay, stream); // NOTE(review): num_items is forwarded as NumItemsT here, while the decomposer overloads cast it to the offset type explicitly - confirm the implicit conversion is intended
1607
+ }
1608
+
1609
+ //! @rst
1610
+ //! Sorts key-value pairs into descending order using :math:`\approx N` auxiliary storage.
1611
+ //!
1612
+ //! - The sorting operation is given a pair of key buffers and a corresponding
1612
+ //! pair of associated value buffers. Each pair is managed by a DoubleBuffer
1613
+ //! structure that indicates which of the two buffers is "current" (and thus
1614
+ //! contains the input data to be sorted).
1615
+ //! - The contents of both buffers within each pair may be altered by the
1616
+ //! sorting operation.
1617
+ //! - In-place operations are not supported. There must be no overlap between
1619
+ //! any of the provided ranges:
1620
+ //!
1621
+ //! - ``[d_keys.Current(), d_keys.Current() + num_items)``
1622
+ //! - ``[d_keys.Alternate(), d_keys.Alternate() + num_items)``
1623
+ //! - ``[d_values.Current(), d_values.Current() + num_items)``
1624
+ //! - ``[d_values.Alternate(), d_values.Alternate() + num_items)``
1625
+ //!
1626
+ //! - Upon completion, the sorting operation will update the "current"
1627
+ //! indicator within each DoubleBuffer wrapper to reference which of the two
1628
+ //! buffers now contains the sorted output sequence (a function of the
1629
+ //! number of key bits specified and the targeted device architecture).
1630
+ //! - @devicestorageP
1631
+ //! - @devicestorage
1632
+ //!
1633
+ //! Snippet
1634
+ //! --------------------------------------------------
1635
+ //!
1636
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
1637
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
1638
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
1639
+ //! tuple of references to relevant members of the key.
1640
+ //!
1641
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
1642
+ //! :language: c++
1643
+ //! :dedent:
1644
+ //! :start-after: example-begin custom-type
1645
+ //! :end-before: example-end custom-type
1646
+ //!
1647
+ //! The following snippet shows how to sort an array of ``custom_t`` objects
1648
+ //! using ``cub::DeviceRadixSort::SortPairsDescending``:
1649
+ //!
1650
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
1651
+ //! :language: c++
1652
+ //! :dedent:
1653
+ //! :start-after: example-begin pairs-descending-db
1654
+ //! :end-before: example-end pairs-descending-db
1655
+ //!
1656
+ //! @endrst
1657
+ //!
1658
+ //! @tparam KeyT
1659
+ //! **[inferred]** KeyT type
1660
+ //!
1661
+ //! @tparam ValueT
1662
+ //! **[inferred]** ValueT type
1663
+ //!
1664
+ //! @tparam NumItemsT
1665
+ //! **[inferred]** Type of num_items
1666
+ //!
1667
+ //! @tparam DecomposerT
1668
+ //! **[inferred]** Type of a callable object responsible for decomposing a
1669
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
1670
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
1671
+ //! The leftmost element of the tuple is considered the most significant.
1672
+ //! The call operator must not modify members of the key.
1673
+ //!
1674
+ //! @param[in] d_temp_storage
1675
+ //! Device-accessible allocation of temporary storage. When ``nullptr``, the
1676
+ //! required allocation size is written to ``temp_storage_bytes`` and no work
1677
+ //! is done.
1678
+ //!
1679
+ //! @param[in,out] temp_storage_bytes
1680
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
1681
+ //!
1682
+ //! @param[in,out] d_keys
1683
+ //! Reference to the double-buffer of keys whose "current" device-accessible
1684
+ //! buffer contains the unsorted input keys and, upon return, is updated to
1685
+ //! point to the sorted output keys
1686
+ //!
1687
+ //! @param[in,out] d_values
1688
+ //! Double-buffer of values whose "current" device-accessible buffer
1689
+ //! contains the unsorted input values and, upon return, is updated to point
1690
+ //! to the sorted output values
1691
+ //!
1692
+ //! @param[in] num_items
1693
+ //! Number of items to sort
1694
+ //!
1695
+ //! @param[in] decomposer
1696
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
1697
+ //! references to its constituent arithmetic types. The leftmost element of
1698
+ //! the tuple is considered the most significant. The call operator must not
1699
+ //! modify members of the key.
1700
+ //!
1701
+ //! @param[in] stream
1702
+ //! **[optional]** CUDA stream to launch kernels within.
1703
+ //! Default is stream<sub>0</sub>.
1704
+ template <typename KeyT, typename ValueT, typename NumItemsT, typename DecomposerT>
1705
+ CUB_RUNTIME_FUNCTION static //
1706
+ ::cuda::std::enable_if_t< //
1707
+ !::cuda::std::is_convertible_v<DecomposerT, int>, // overload is removed when DecomposerT converts to int
1708
+ cudaError_t>
1709
+ SortPairsDescending(
1710
+ void* d_temp_storage,
1711
+ size_t& temp_storage_bytes,
1712
+ DoubleBuffer<KeyT>& d_keys,
1713
+ DoubleBuffer<ValueT>& d_values,
1714
+ NumItemsT num_items,
1715
+ DecomposerT decomposer,
1716
+ cudaStream_t stream = 0)
1717
+ {
1718
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); // NOTE(review): presumably opens the NVTX range only when d_temp_storage is non-null - confirm against the macro
1719
+
1720
+ // unsigned integer type for global offsets, selected from NumItemsT
1721
+ using offset_t = detail::choose_offset_t<NumItemsT>;
1722
+ using decomposer_check_t = detail::radix::decomposer_check_t<KeyT, DecomposerT>;
1723
+
1724
+ static_assert(decomposer_check_t::value,
1725
+ "DecomposerT must be a callable object returning a tuple of references to "
1726
+ "arithmetic types");
1727
+
1728
+ constexpr bool is_overwrite_okay = true; // caller-owned double buffers: both halves may be altered by the sort
1729
+
1730
+ return DeviceRadixSort::custom_radix_sort<SortOrder::Descending>(
1731
+ decomposer_check_t{},
1732
+ d_temp_storage,
1733
+ temp_storage_bytes,
1734
+ is_overwrite_okay,
1735
+ d_keys,
1736
+ d_values,
1737
+ static_cast<offset_t>(num_items), // item count converted to the offset type
1738
+ decomposer,
1739
+ stream);
1740
+ }
1741
+
1742
+ //! @rst
1743
+ //! Sorts key-value pairs into descending order using :math:`\approx N` auxiliary storage.
1744
+ //!
1745
+ //! - The sorting operation is given a pair of key buffers and a corresponding
1745
+ //! pair of associated value buffers. Each pair is managed by a DoubleBuffer
1746
+ //! structure that indicates which of the two buffers is "current" (and thus
1747
+ //! contains the input data to be sorted).
1748
+ //! - The contents of both buffers within each pair may be altered by the
1749
+ //! sorting operation.
1750
+ //! - In-place operations are not supported. There must be no overlap between
1752
+ //! any of the provided ranges:
1753
+ //!
1754
+ //! - ``[d_keys.Current(), d_keys.Current() + num_items)``
1755
+ //! - ``[d_keys.Alternate(), d_keys.Alternate() + num_items)``
1756
+ //! - ``[d_values.Current(), d_values.Current() + num_items)``
1757
+ //! - ``[d_values.Alternate(), d_values.Alternate() + num_items)``
1758
+ //!
1759
+ //! - Upon completion, the sorting operation will update the "current"
1760
+ //! indicator within each DoubleBuffer wrapper to reference which of the two
1761
+ //! buffers now contains the sorted output sequence (a function of the
1762
+ //! number of key bits specified and the targeted device architecture).
1763
+ //! - An optional bit subrange ``[begin_bit, end_bit)`` of differentiating key
1764
+ //! bits can be specified. This can reduce overall sorting overhead and
1765
+ //! yield a corresponding performance improvement.
1766
+ //! - @devicestorageP
1767
+ //! - @devicestorage
1768
+ //!
1769
+ //! Snippet
1770
+ //! --------------------------------------------------
1771
+ //!
1772
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
1773
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
1774
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
1775
+ //! tuple of references to relevant members of the key.
1776
+ //!
1777
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
1778
+ //! :language: c++
1779
+ //! :dedent:
1780
+ //! :start-after: example-begin custom-type
1781
+ //! :end-before: example-end custom-type
1782
+ //!
1783
+ //! The following snippet shows how to sort an array of ``custom_t`` objects
1784
+ //! using ``cub::DeviceRadixSort::SortPairsDescending``:
1785
+ //!
1786
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
1787
+ //! :language: c++
1788
+ //! :dedent:
1789
+ //! :start-after: example-begin pairs-descending-bits-db
1790
+ //! :end-before: example-end pairs-descending-bits-db
1791
+ //!
1792
+ //! @endrst
1793
+ //!
1794
+ //! @tparam KeyT
1795
+ //! **[inferred]** KeyT type
1796
+ //!
1797
+ //! @tparam ValueT
1798
+ //! **[inferred]** ValueT type
1799
+ //!
1800
+ //! @tparam NumItemsT
1801
+ //! **[inferred]** Type of num_items
1802
+ //!
1803
+ //! @tparam DecomposerT
1804
+ //! **[inferred]** Type of a callable object responsible for decomposing a
1805
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
1806
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
1807
+ //! The leftmost element of the tuple is considered the most significant.
1808
+ //! The call operator must not modify members of the key.
1809
+ //!
1810
+ //! @param[in] d_temp_storage
1811
+ //! Device-accessible allocation of temporary storage. When ``nullptr``, the
1812
+ //! required allocation size is written to ``temp_storage_bytes`` and no work
1813
+ //! is done.
1814
+ //!
1815
+ //! @param[in,out] temp_storage_bytes
1816
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
1817
+ //!
1818
+ //! @param[in,out] d_keys
1819
+ //! Reference to the double-buffer of keys whose "current" device-accessible
1820
+ //! buffer contains the unsorted input keys and, upon return, is updated to
1821
+ //! point to the sorted output keys
1822
+ //!
1823
+ //! @param[in,out] d_values
1824
+ //! Double-buffer of values whose "current" device-accessible buffer
1825
+ //! contains the unsorted input values and, upon return, is updated to point
1826
+ //! to the sorted output values
1827
+ //!
1828
+ //! @param[in] num_items
1829
+ //! Number of items to sort
1830
+ //!
1831
+ //! @param[in] decomposer
1832
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
1833
+ //! references to its constituent arithmetic types. The leftmost element of
1834
+ //! the tuple is considered the most significant. The call operator must not
1835
+ //! modify members of the key.
1836
+ //!
1837
+ //! @param[in] begin_bit
1838
+ //! **[optional]** The least-significant bit index (inclusive) needed for
1839
+ //! key comparison
1840
+ //!
1841
+ //! @param[in] end_bit
1842
+ //! **[optional]** The most-significant bit index (exclusive) needed for key
1843
+ //! comparison (e.g., ``(sizeof(float) + sizeof(long long int)) * 8``)
1844
+ //!
1845
+ //! @param[in] stream
1846
+ //! **[optional]** CUDA stream to launch kernels within.
1847
+ //! Default is stream<sub>0</sub>.
1848
+ template <typename KeyT, typename ValueT, typename NumItemsT, typename DecomposerT>
1849
+ CUB_RUNTIME_FUNCTION static //
1850
+ ::cuda::std::enable_if_t< //
1851
+ !::cuda::std::is_convertible_v<DecomposerT, int>, // overload is removed when DecomposerT converts to int
1852
+ cudaError_t>
1853
+ SortPairsDescending(
1854
+ void* d_temp_storage,
1855
+ size_t& temp_storage_bytes,
1856
+ DoubleBuffer<KeyT>& d_keys,
1857
+ DoubleBuffer<ValueT>& d_values,
1858
+ NumItemsT num_items,
1859
+ DecomposerT decomposer,
1860
+ int begin_bit,
1861
+ int end_bit,
1862
+ cudaStream_t stream = 0)
1863
+ {
1864
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); // NOTE(review): presumably opens the NVTX range only when d_temp_storage is non-null - confirm against the macro
1865
+
1866
+ // unsigned integer type for global offsets, selected from NumItemsT
1867
+ using offset_t = detail::choose_offset_t<NumItemsT>;
1868
+ using decomposer_check_t = detail::radix::decomposer_check_t<KeyT, DecomposerT>;
1869
+
1870
+ static_assert(decomposer_check_t::value,
1871
+ "DecomposerT must be a callable object returning a tuple of references to "
1872
+ "arithmetic types");
1873
+
1874
+ constexpr bool is_overwrite_okay = true; // caller-owned double buffers: both halves may be altered by the sort
1875
+
1876
+ return DeviceRadixSort::custom_radix_sort<SortOrder::Descending>(
1877
+ decomposer_check_t{},
1878
+ d_temp_storage,
1879
+ temp_storage_bytes,
1880
+ is_overwrite_okay,
1881
+ d_keys,
1882
+ d_values,
1883
+ static_cast<offset_t>(num_items), // item count converted to the offset type
1884
+ decomposer,
1885
+ begin_bit,
1886
+ end_bit,
1887
+ stream);
1888
+ }
1889
+
1890
+ //! @} end member group
1891
+ //! @name Keys-only
1892
+ //! @{
1893
+
1894
+ //! @rst
1895
+ //! Sorts keys into ascending order using :math:`\approx 2N` auxiliary storage.
1896
+ //!
1897
+ //! - The contents of the input data are not altered by the sorting operation.
1898
+ //! - Pointers to contiguous memory must be used; iterators are not currently
1899
+ //! supported.
1900
+ //! - In-place operations are not supported. There must be no overlap between
1901
+ //! any of the provided ranges:
1902
+ //!
1903
+ //! - ``[d_keys_in, d_keys_in + num_items)``
1904
+ //! - ``[d_keys_out, d_keys_out + num_items)``
1905
+ //!
1906
+ //! - An optional bit subrange ``[begin_bit, end_bit)`` of differentiating key
1907
+ //! bits can be specified. This can reduce overall sorting overhead and
1908
+ //! yield a corresponding performance improvement.
1909
+ //! - @devicestorageNP For sorting using only ``O(P)`` temporary storage, see
1910
+ //! the sorting interface using DoubleBuffer wrappers below.
1911
+ //! - @devicestorage
1912
+ //!
1913
+ //! Snippet
1914
+ //! --------------------------------------------------
1915
+ //!
1916
+ //! The code snippet below illustrates the sorting of a device vector of
1917
+ //! ``int`` keys.
1918
+ //! @endrst
1919
+ //!
1920
+ //! @code{.cpp}
1921
+ //! #include <cub/cub.cuh>
1922
+ //! // or equivalently <cub/device/device_radix_sort.cuh>
1923
+ //!
1924
+ //! // Declare, allocate, and initialize device-accessible pointers
1925
+ //! // for sorting data
1926
+ //! int num_items; // e.g., 7
1927
+ //! int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
1928
+ //! int *d_keys_out; // e.g., [ ... ]
1929
+ //! ...
1930
+ //!
1931
+ //! // Determine temporary device storage requirements
1932
+ //! void *d_temp_storage = nullptr;
1933
+ //! size_t temp_storage_bytes = 0;
1934
+ //! cub::DeviceRadixSort::SortKeys(
1935
+ //! d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items);
1936
+ //!
1937
+ //! // Allocate temporary storage
1938
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
1939
+ //!
1940
+ //! // Run sorting operation
1941
+ //! cub::DeviceRadixSort::SortKeys(
1942
+ //! d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items);
1943
+ //!
1944
+ //! // d_keys_out <-- [0, 3, 5, 6, 7, 8, 9]
1945
+ //! @endcode
1946
+ //!
1947
+ //! @tparam KeyT
1948
+ //! **[inferred]** KeyT type
1949
+ //!
1950
+ //! @tparam NumItemsT
1951
+ //! **[inferred]** Type of num_items
1952
+ //!
1956
+ //! @param[in] d_temp_storage
1957
+ //! Device-accessible allocation of temporary storage. When ``nullptr``, the
1958
+ //! required allocation size is written to ``temp_storage_bytes`` and no work
1959
+ //! is done.
1960
+ //!
1961
+ //! @param[in,out] temp_storage_bytes
1962
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
1963
+ //!
1964
+ //! @param[in] d_keys_in
1965
+ //! Pointer to the input data of key data to sort
1966
+ //!
1967
+ //! @param[out] d_keys_out
1968
+ //! Pointer to the sorted output sequence of key data
1969
+ //!
1970
+ //! @param[in] num_items
1971
+ //! Number of items to sort
1972
+ //!
1973
+ //! @param[in] begin_bit
1974
+ //! **[optional]** The least-significant bit index (inclusive) needed for
1975
+ //! key comparison
1976
+ //!
1977
+ //! @param[in] end_bit
1978
+ //! **[optional]** The most-significant bit index (exclusive) needed for key
1979
+ //! comparison (e.g., ``sizeof(unsigned int) * 8``)
1980
+ //!
1981
+ //! @param[in] stream
1982
+ //! **[optional]** CUDA stream to launch kernels within.
1983
+ //! Default is stream<sub>0</sub>.
1984
+ template <typename KeyT, typename NumItemsT>
1985
+ CUB_RUNTIME_FUNCTION static cudaError_t SortKeys(
1986
+ void* d_temp_storage,
1987
+ size_t& temp_storage_bytes,
1988
+ const KeyT* d_keys_in,
1989
+ KeyT* d_keys_out,
1990
+ NumItemsT num_items,
1991
+ int begin_bit = 0,
1992
+ int end_bit = sizeof(KeyT) * 8,
1993
+ cudaStream_t stream = 0)
1994
+ {
1995
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());
1996
+
1997
+ // Unsigned integer type for global offsets.
1998
+ using OffsetT = detail::choose_offset_t<NumItemsT>;
1999
+
2000
+ // We cast away const-ness, but will *not* write to these arrays.
2001
+ // ``DispatchRadixSort::Dispatch`` will allocate temporary storage and
2002
+ // create a new double-buffer internally when the ``is_overwrite_ok`` flag
2003
+ // is not set.
2004
+ constexpr bool is_overwrite_okay = false;
2005
+ DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
2006
+ // Null value type
2007
+ DoubleBuffer<NullType> d_values;
2008
+
2009
+ return DispatchRadixSort<SortOrder::Ascending, KeyT, NullType, OffsetT>::Dispatch(
2010
+ d_temp_storage,
2011
+ temp_storage_bytes,
2012
+ d_keys,
2013
+ d_values,
2014
+ static_cast<OffsetT>(num_items),
2015
+ begin_bit,
2016
+ end_bit,
2017
+ is_overwrite_okay,
2018
+ stream);
2019
+ }
2020
+
2021
+ //! @rst
2022
+ //! Sorts keys into ascending order using :math:`\approx 2N` auxiliary storage.
2023
+ //!
2024
+ //! * The contents of the input data are not altered by the sorting operation.
2025
+ //! * Pointers to contiguous memory must be used; iterators are not currently
2026
+ //! supported.
2027
+ //! * In-place operations are not supported. There must be no overlap between
2028
+ //! any of the provided ranges:
2029
+ //!
2030
+ //! * ``[d_keys_in, d_keys_in + num_items)``
2031
+ //! * ``[d_keys_out, d_keys_out + num_items)``
2032
+ //!
2033
+ //! * A bit subrange ``[begin_bit, end_bit)`` is provided to specify
2034
+ //! differentiating key bits. This can reduce overall sorting overhead and
2035
+ //! yield a corresponding performance improvement.
2036
+ //! * @devicestorageNP For sorting using only :math:`O(P)` temporary storage, see
2037
+ //! the sorting interface using DoubleBuffer wrappers below.
2038
+ //! * @devicestorage
2039
+ //!
2040
+ //! Snippet
2041
+ //! --------------------------------------------------
2042
+ //!
2043
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
2044
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
2045
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
2046
+ //! tuple of references to relevant members of the key.
2047
+ //!
2048
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
2049
+ //! :language: c++
2050
+ //! :dedent:
2051
+ //! :start-after: example-begin custom-type
2052
+ //! :end-before: example-end custom-type
2053
+ //!
2054
+ //! The following snippet shows how to sort an array of ``custom_t`` objects
2055
+ //! using ``cub::DeviceRadixSort::SortKeys``:
2056
+ //!
2057
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
2058
+ //! :language: c++
2059
+ //! :dedent:
2060
+ //! :start-after: example-begin keys-bits
2061
+ //! :end-before: example-end keys-bits
2062
+ //!
2063
+ //! @endrst
2064
+ //!
2065
+ //! @tparam KeyT
2066
+ //! **[inferred]** KeyT type
2067
+ //!
2068
+ //! @tparam NumItemsT
2069
+ //! **[inferred]** Type of num_items
2070
+ //!
2071
+ //! @tparam DecomposerT
2072
+ //! **[inferred]** Type of a callable object responsible for decomposing a
2073
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
2074
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
2075
+ //! The leftmost element of the tuple is considered the most significant.
2076
+ //! The call operator must not modify members of the key.
2077
+ //!
2078
+ //! @param[in] d_temp_storage
2079
+ //! Device-accessible allocation of temporary storage. When ``nullptr``, the
2080
+ //! required allocation size is written to ``temp_storage_bytes`` and no work
2081
+ //! is done.
2082
+ //!
2083
+ //! @param[in,out] temp_storage_bytes
2084
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
2085
+ //!
2086
+ //! @param[in] d_keys_in
2087
+ //! Pointer to the input data of key data to sort
2088
+ //!
2089
+ //! @param[out] d_keys_out
2090
+ //! Pointer to the sorted output sequence of key data
2091
+ //!
2092
+ //! @param[in] num_items
2093
+ //! Number of items to sort
2094
+ //!
2095
+ //! @param[in] decomposer
2096
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
2097
+ //! references to its constituent arithmetic types. The leftmost element of
2098
+ //! the tuple is considered the most significant. The call operator must not
2099
+ //! modify members of the key.
2100
+ //!
2101
+ //! @param[in] begin_bit
2102
+ //! The least-significant bit index (inclusive) needed for
2103
+ //! key comparison
2104
+ //!
2105
+ //! @param[in] end_bit
2106
+ //! The most-significant bit index (exclusive) needed for key
2107
+ //! comparison (e.g., ``(sizeof(float) + sizeof(long long int)) * 8``)
2108
+ //!
2109
+ //! @param[in] stream
2110
+ //! **[optional]** CUDA stream to launch kernels within.
2111
+ //! Default is stream<sub>0</sub>.
2112
+ template <typename KeyT, typename NumItemsT, typename DecomposerT>
2113
+ CUB_RUNTIME_FUNCTION static //
2114
+ ::cuda::std::enable_if_t< //
2115
+ !::cuda::std::is_convertible_v<DecomposerT, int>, //
2116
+ cudaError_t>
2117
+ SortKeys(void* d_temp_storage,
2118
+ size_t& temp_storage_bytes,
2119
+ const KeyT* d_keys_in,
2120
+ KeyT* d_keys_out,
2121
+ NumItemsT num_items,
2122
+ DecomposerT decomposer,
2123
+ int begin_bit,
2124
+ int end_bit,
2125
+ cudaStream_t stream = 0)
2126
+ {
2127
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());
2128
+
2129
+ // unsigned integer type for global offsets
2130
+ using offset_t = detail::choose_offset_t<NumItemsT>;
2131
+ using decomposer_check_t = detail::radix::decomposer_check_t<KeyT, DecomposerT>;
2132
+
2133
+ static_assert(decomposer_check_t::value,
2134
+ "DecomposerT must be a callable object returning a tuple of references to "
2135
+ "arithmetic types");
2136
+
2137
+ // We cast away const-ness, but will *not* write to these arrays.
2138
+ // ``DispatchRadixSort::Dispatch`` will allocate temporary storage and
2139
+ // create a new double-buffer internally when the ``is_overwrite_ok`` flag
2140
+ // is not set.
2141
+ constexpr bool is_overwrite_okay = false;
2142
+ DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
2143
+ DoubleBuffer<NullType> d_values;
2144
+
2145
+ return DeviceRadixSort::custom_radix_sort<SortOrder::Ascending>(
2146
+ decomposer_check_t{},
2147
+ d_temp_storage,
2148
+ temp_storage_bytes,
2149
+ is_overwrite_okay,
2150
+ d_keys,
2151
+ d_values,
2152
+ static_cast<offset_t>(num_items),
2153
+ decomposer,
2154
+ begin_bit,
2155
+ end_bit,
2156
+ stream);
2157
+ }
2158
+
2159
+ //! @rst
2160
+ //! Sorts keys into ascending order using :math:`\approx 2N` auxiliary storage.
2161
+ //!
2162
+ //! * The contents of the input data are not altered by the sorting operation.
2163
+ //! * Pointers to contiguous memory must be used; iterators are not currently
2164
+ //! supported.
2165
+ //! * In-place operations are not supported. There must be no overlap between
2166
+ //! any of the provided ranges:
2167
+ //!
2168
+ //! * ``[d_keys_in, d_keys_in + num_items)``
2169
+ //! * ``[d_keys_out, d_keys_out + num_items)``
2170
+ //!
2171
+ //! * An optional bit subrange ``[begin_bit, end_bit)`` of differentiating key
2172
+ //! bits can be specified. This can reduce overall sorting overhead and
2173
+ //! yield a corresponding performance improvement.
2174
+ //! * @devicestorageNP For sorting using only :math:`O(P)` temporary storage, see
2175
+ //! the sorting interface using DoubleBuffer wrappers below.
2176
+ //! * @devicestorage
2177
+ //!
2178
+ //! Snippet
2179
+ //! --------------------------------------------------
2180
+ //!
2181
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
2182
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
2183
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
2184
+ //! tuple of references to relevant members of the key.
2185
+ //!
2186
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
2187
+ //! :language: c++
2188
+ //! :dedent:
2189
+ //! :start-after: example-begin custom-type
2190
+ //! :end-before: example-end custom-type
2191
+ //!
2192
+ //! The following snippet shows how to sort an array of ``custom_t`` objects
2193
+ //! using ``cub::DeviceRadixSort::SortKeys``:
2194
+ //!
2195
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
2196
+ //! :language: c++
2197
+ //! :dedent:
2198
+ //! :start-after: example-begin keys
2199
+ //! :end-before: example-end keys
2200
+ //!
2201
+ //! @endrst
2202
+ //!
2203
+ //! @tparam KeyT
2204
+ //! **[inferred]** KeyT type
2205
+ //!
2206
+ //! @tparam NumItemsT
2207
+ //! **[inferred]** Type of num_items
2208
+ //!
2209
+ //! @tparam DecomposerT
2210
+ //! **[inferred]** Type of a callable object responsible for decomposing a
2211
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
2212
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
2213
+ //! The leftmost element of the tuple is considered the most significant.
2214
+ //! The call operator must not modify members of the key.
2215
+ //!
2216
+ //! @param[in] d_temp_storage
2217
+ //! Device-accessible allocation of temporary storage. When ``nullptr``, the
2218
+ //! required allocation size is written to ``temp_storage_bytes`` and no work
2219
+ //! is done.
2220
+ //!
2221
+ //! @param[in,out] temp_storage_bytes
2222
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
2223
+ //!
2224
+ //! @param[in] d_keys_in
2225
+ //! Pointer to the input data of key data to sort
2226
+ //!
2227
+ //! @param[out] d_keys_out
2228
+ //! Pointer to the sorted output sequence of key data
2229
+ //!
2230
+ //! @param[in] num_items
2231
+ //! Number of items to sort
2232
+ //!
2233
+ //! @param[in] decomposer
2234
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
2235
+ //! references to its constituent arithmetic types. The leftmost element of
2236
+ //! the tuple is considered the most significant. The call operator must not
2237
+ //! modify members of the key.
2238
+ //!
2239
+ //! @param[in] stream
2240
+ //! **[optional]** CUDA stream to launch kernels within.
2241
+ //! Default is stream<sub>0</sub>.
2242
+ template <typename KeyT, typename NumItemsT, typename DecomposerT>
2243
+ CUB_RUNTIME_FUNCTION static //
2244
+ ::cuda::std::enable_if_t< //
2245
+ !::cuda::std::is_convertible_v<DecomposerT, int>, //
2246
+ cudaError_t>
2247
+ SortKeys(void* d_temp_storage,
2248
+ size_t& temp_storage_bytes,
2249
+ const KeyT* d_keys_in,
2250
+ KeyT* d_keys_out,
2251
+ NumItemsT num_items,
2252
+ DecomposerT decomposer,
2253
+ cudaStream_t stream = 0)
2254
+ {
2255
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());
2256
+
2257
+ // unsigned integer type for global offsets
2258
+ using offset_t = detail::choose_offset_t<NumItemsT>;
2259
+ using decomposer_check_t = detail::radix::decomposer_check_t<KeyT, DecomposerT>;
2260
+
2261
+ static_assert(decomposer_check_t::value,
2262
+ "DecomposerT must be a callable object returning a tuple of references to "
2263
+ "arithmetic types");
2264
+
2265
+ // We cast away const-ness, but will *not* write to these arrays.
2266
+ // ``DispatchRadixSort::Dispatch`` will allocate temporary storage and
2267
+ // create a new double-buffer internally when the ``is_overwrite_ok`` flag
2268
+ // is not set.
2269
+ constexpr bool is_overwrite_okay = false;
2270
+ DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
2271
+ DoubleBuffer<NullType> d_values;
2272
+
2273
+ return DeviceRadixSort::custom_radix_sort<SortOrder::Ascending>(
2274
+ decomposer_check_t{},
2275
+ d_temp_storage,
2276
+ temp_storage_bytes,
2277
+ is_overwrite_okay,
2278
+ d_keys,
2279
+ d_values,
2280
+ static_cast<offset_t>(num_items),
2281
+ decomposer,
2282
+ stream);
2283
+ }
2284
+
2285
+ //! @rst
2286
+ //! Sorts keys into ascending order using :math:`\approx N` auxiliary storage.
2287
+ //!
2288
+ //! - The sorting operation is given a pair of key buffers managed by a
2289
+ //! DoubleBuffer structure that indicates which of the two buffers is
2290
+ //! "current" (and thus contains the input data to be sorted).
2291
+ //! - The contents of both buffers may be altered by the sorting operation.
2292
+ //! - In-place operations are not supported. There must be no overlap between
2293
+ //! any of the provided ranges:
2294
+ //!
2295
+ //! - ``[d_keys.Current(), d_keys.Current() + num_items)``
2296
+ //! - ``[d_keys.Alternate(), d_keys.Alternate() + num_items)``
2297
+ //!
2298
+ //! - Upon completion, the sorting operation will update the "current"
2299
+ //! indicator within the DoubleBuffer wrapper to reference which of the two
2300
+ //! buffers now contains the sorted output sequence (a function of the
2301
+ //! number of key bits specified and the targeted device architecture).
2302
+ //! - An optional bit subrange ``[begin_bit, end_bit)`` of differentiating key
2303
+ //! bits can be specified. This can reduce overall sorting overhead and
2304
+ //! yield a corresponding performance improvement.
2305
+ //! - @devicestorageP
2306
+ //! - @devicestorage
2307
+ //!
2308
+ //! Snippet
2309
+ //! --------------------------------------------------
2310
+ //!
2311
+ //! The code snippet below illustrates the sorting of a device vector of
2312
+ //! ``int`` keys.
2313
+ //! @endrst
2314
+ //!
2315
+ //! @code{.cpp}
2316
+ //! #include <cub/cub.cuh>
2317
+ //! // or equivalently <cub/device/device_radix_sort.cuh>
2318
+ //!
2319
+ //! // Declare, allocate, and initialize device-accessible pointers
2320
+ //! // for sorting data
2321
+ //! int num_items; // e.g., 7
2322
+ //! int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9]
2323
+ //! int *d_key_alt_buf; // e.g., [ ... ]
2324
+ //! ...
2325
+ //!
2326
+ //! // Create a DoubleBuffer to wrap the pair of device pointers
2327
+ //! cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
2328
+ //!
2329
+ //! // Determine temporary device storage requirements
2330
+ //! void *d_temp_storage = nullptr;
2331
+ //! size_t temp_storage_bytes = 0;
2332
+ //! cub::DeviceRadixSort::SortKeys(
2333
+ //! d_temp_storage, temp_storage_bytes, d_keys, num_items);
2334
+ //!
2335
+ //! // Allocate temporary storage
2336
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
2337
+ //!
2338
+ //! // Run sorting operation
2339
+ //! cub::DeviceRadixSort::SortKeys(
2340
+ //! d_temp_storage, temp_storage_bytes, d_keys, num_items);
2341
+ //!
2342
+ //! // d_keys.Current() <-- [0, 3, 5, 6, 7, 8, 9]
2343
+ //! @endcode
2344
+ //!
2345
+ //! @tparam KeyT
2346
+ //! **[inferred]** KeyT type
2347
+ //!
2348
+ //! @tparam NumItemsT
2349
+ //! **[inferred]** Type of num_items
2350
+ //!
2351
+ //! @param[in] d_temp_storage
2352
+ //! Device-accessible allocation of temporary storage. When ``nullptr``, the
2353
+ //! required allocation size is written to ``temp_storage_bytes`` and no work
2354
+ //! is done.
2355
+ //!
2356
+ //! @param[in,out] temp_storage_bytes
2357
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
2358
+ //!
2359
+ //! @param[in,out] d_keys
2360
+ //! Reference to the double-buffer of keys whose "current" device-accessible
2361
+ //! buffer contains the unsorted input keys and, upon return, is updated to
2362
+ //! point to the sorted output keys
2363
+ //!
2364
+ //! @param[in] num_items
2365
+ //! Number of items to sort
2366
+ //!
2367
+ //! @param[in] begin_bit
2368
+ //! **[optional]** The least-significant bit index (inclusive) needed for
2369
+ //! key comparison
2370
+ //!
2371
+ //! @param[in] end_bit
2372
+ //! **[optional]** The most-significant bit index (exclusive) needed for key
2373
+ //! comparison (e.g., ``sizeof(unsigned int) * 8``)
2374
+ //!
2375
+ //! @param[in] stream
2376
+ //! **[optional]** CUDA stream to launch kernels within.
2377
+ //! Default is stream<sub>0</sub>.
2378
+ template <typename KeyT, typename NumItemsT>
2379
+ CUB_RUNTIME_FUNCTION static cudaError_t SortKeys(
2380
+ void* d_temp_storage,
2381
+ size_t& temp_storage_bytes,
2382
+ DoubleBuffer<KeyT>& d_keys,
2383
+ NumItemsT num_items,
2384
+ int begin_bit = 0,
2385
+ int end_bit = sizeof(KeyT) * 8,
2386
+ cudaStream_t stream = 0)
2387
+ {
2388
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());
2389
+
2390
+ // Unsigned integer type for global offsets.
2391
+ using OffsetT = detail::choose_offset_t<NumItemsT>;
2392
+
2393
+ constexpr bool is_overwrite_okay = true;
2394
+
2395
+ // Null value type
2396
+ DoubleBuffer<NullType> d_values;
2397
+
2398
+ return DispatchRadixSort<SortOrder::Ascending, KeyT, NullType, OffsetT>::Dispatch(
2399
+ d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, begin_bit, end_bit, is_overwrite_okay, stream);
2400
+ }
2401
+
2402
+ //! @rst
2403
+ //! Sorts keys into ascending order using :math:`\approx N` auxiliary storage.
2404
+ //!
2405
+ //! * The sorting operation is given a pair of key buffers managed by a
2406
+ //! DoubleBuffer structure that indicates which of the two buffers is
2407
+ //! "current" (and thus contains the input data to be sorted).
2408
+ //! * The contents of both buffers may be altered by the sorting operation.
2409
+ //! * In-place operations are not supported. There must be no overlap between
2410
+ //! any of the provided ranges:
2411
+ //!
2412
+ //! * ``[d_keys.Current(), d_keys.Current() + num_items)``
2413
+ //! * ``[d_keys.Alternate(), d_keys.Alternate() + num_items)``
2414
+ //!
2415
+ //! * Upon completion, the sorting operation will update the "current"
2416
+ //! indicator within the DoubleBuffer wrapper to reference which of the two
2417
+ //! buffers now contains the sorted output sequence (a function of the
2418
+ //! number of key bits specified and the targeted device architecture).
2419
+ //! * @devicestorageP
2420
+ //! * @devicestorage
2421
+ //!
2422
+ //! Snippet
2423
+ //! --------------------------------------------------
2424
+ //!
2425
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
2426
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
2427
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
2428
+ //! tuple of references to relevant members of the key.
2429
+ //!
2430
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
2431
+ //! :language: c++
2432
+ //! :dedent:
2433
+ //! :start-after: example-begin custom-type
2434
+ //! :end-before: example-end custom-type
2435
+ //!
2436
+ //! The following snippet shows how to sort an array of ``custom_t`` objects
2437
+ //! using ``cub::DeviceRadixSort::SortKeys``:
2438
+ //!
2439
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
2440
+ //! :language: c++
2441
+ //! :dedent:
2442
+ //! :start-after: example-begin keys-db
2443
+ //! :end-before: example-end keys-db
2444
+ //!
2445
+ //! @endrst
2446
+ //!
2447
+ //! @tparam KeyT
2448
+ //! **[inferred]** KeyT type
2449
+ //!
2450
+ //! @tparam NumItemsT
2451
+ //! **[inferred]** Type of num_items
2452
+ //!
2453
+ //! @tparam DecomposerT
2454
+ //! **[inferred]** Type of a callable object responsible for decomposing a
2455
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
2456
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
2457
+ //! The leftmost element of the tuple is considered the most significant.
2458
+ //! The call operator must not modify members of the key.
2459
+ //!
2460
+ //! @param[in] d_temp_storage
2461
+ //! Device-accessible allocation of temporary storage. When ``nullptr``, the
2462
+ //! required allocation size is written to ``temp_storage_bytes`` and no work
2463
+ //! is done.
2464
+ //!
2465
+ //! @param[in,out] temp_storage_bytes
2466
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
2467
+ //!
2468
+ //! @param[in,out] d_keys
2469
+ //! Reference to the double-buffer of keys whose "current" device-accessible
2470
+ //! buffer contains the unsorted input keys and, upon return, is updated to
2471
+ //! point to the sorted output keys
2472
+ //!
2473
+ //! @param[in] num_items
2474
+ //! Number of items to sort
2475
+ //!
2476
+ //! @param[in] decomposer
2477
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
2478
+ //! references to its constituent arithmetic types. The leftmost element of
2479
+ //! the tuple is considered the most significant. The call operator must not
2480
+ //! modify members of the key.
2481
+ //!
2482
+ //! @param[in] stream
2483
+ //! **[optional]** CUDA stream to launch kernels within.
2484
+ //! Default is stream<sub>0</sub>.
2485
+ template <typename KeyT, typename NumItemsT, typename DecomposerT>
2486
+ CUB_RUNTIME_FUNCTION static //
2487
+ ::cuda::std::enable_if_t< //
2488
+ !::cuda::std::is_convertible_v<DecomposerT, int>, //
2489
+ cudaError_t>
2490
+ SortKeys(void* d_temp_storage,
2491
+ size_t& temp_storage_bytes,
2492
+ DoubleBuffer<KeyT>& d_keys,
2493
+ NumItemsT num_items,
2494
+ DecomposerT decomposer,
2495
+ cudaStream_t stream = 0)
2496
+ {
2497
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());
2498
+
2499
+ // unsigned integer type for global offsets
2500
+ using offset_t = detail::choose_offset_t<NumItemsT>;
2501
+ using decomposer_check_t = detail::radix::decomposer_check_t<KeyT, DecomposerT>;
2502
+
2503
+ static_assert(decomposer_check_t::value,
2504
+ "DecomposerT must be a callable object returning a tuple of references to "
2505
+ "arithmetic types");
2506
+
2507
+ constexpr bool is_overwrite_okay = true;
2508
+ DoubleBuffer<NullType> d_values;
2509
+
2510
+ return DeviceRadixSort::custom_radix_sort<SortOrder::Ascending>(
2511
+ decomposer_check_t{},
2512
+ d_temp_storage,
2513
+ temp_storage_bytes,
2514
+ is_overwrite_okay,
2515
+ d_keys,
2516
+ d_values,
2517
+ static_cast<offset_t>(num_items),
2518
+ decomposer,
2519
+ stream);
2520
+ }
2521
+
2522
+ //! @rst
2523
+ //! Sorts keys into ascending order using :math:`\approx N` auxiliary storage.
2524
+ //!
2525
+ //! * The sorting operation is given a pair of key buffers managed by a
2526
+ //! DoubleBuffer structure that indicates which of the two buffers is
2527
+ //! "current" (and thus contains the input data to be sorted).
2528
+ //! * The contents of both buffers may be altered by the sorting operation.
2529
+ //! * In-place operations are not supported. There must be no overlap between
2530
+ //! any of the provided ranges:
2531
+ //!
2532
+ //! * ``[d_keys.Current(), d_keys.Current() + num_items)``
2533
+ //! * ``[d_keys.Alternate(), d_keys.Alternate() + num_items)``
2534
+ //!
2535
+ //! * A bit subrange ``[begin_bit, end_bit)`` is provided to specify
2536
+ //! differentiating key bits. This can reduce overall sorting overhead and
2537
+ //! yield a corresponding performance improvement.
2538
+ //! * Upon completion, the sorting operation will update the "current"
2539
+ //! indicator within the DoubleBuffer wrapper to reference which of the two
2540
+ //! buffers now contains the sorted output sequence (a function of the
2541
+ //! number of key bits specified and the targeted device architecture).
2542
+ //! * @devicestorageP
2543
+ //! * @devicestorage
2544
+ //!
2545
+ //! Snippet
2546
+ //! --------------------------------------------------
2547
+ //!
2548
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
2549
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
2550
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
2551
+ //! tuple of references to relevant members of the key.
2552
+ //!
2553
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
2554
+ //! :language: c++
2555
+ //! :dedent:
2556
+ //! :start-after: example-begin custom-type
2557
+ //! :end-before: example-end custom-type
2558
+ //!
2559
+ //! The following snippet shows how to sort an array of ``custom_t`` objects
2560
+ //! using ``cub::DeviceRadixSort::SortKeys``:
2561
+ //!
2562
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
2563
+ //! :language: c++
2564
+ //! :dedent:
2565
+ //! :start-after: example-begin keys-bits-db
2566
+ //! :end-before: example-end keys-bits-db
2567
+ //!
2568
+ //! @endrst
2569
+ //!
2570
+ //! @tparam KeyT
2571
+ //! **[inferred]** KeyT type
2572
+ //!
2573
+ //! @tparam NumItemsT
2574
+ //! **[inferred]** Type of num_items
2575
+ //!
2576
+ //! @tparam DecomposerT
2577
+ //! **[inferred]** Type of a callable object responsible for decomposing a
2578
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
2579
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
2580
+ //! The leftmost element of the tuple is considered the most significant.
2581
+ //! The call operator must not modify members of the key.
2582
+ //!
2583
+ //! @param[in] d_temp_storage
2584
+ //! Device-accessible allocation of temporary storage. When ``nullptr``, the
2585
+ //! required allocation size is written to ``temp_storage_bytes`` and no work
2586
+ //! is done.
2587
+ //!
2588
+ //! @param[in,out] temp_storage_bytes
2589
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
2590
+ //!
2591
+ //! @param[in,out] d_keys
2592
+ //! Reference to the double-buffer of keys whose "current" device-accessible
2593
+ //! buffer contains the unsorted input keys and, upon return, is updated to
2594
+ //! point to the sorted output keys
2595
+ //!
2596
+ //! @param[in] num_items
2597
+ //! Number of items to sort
2598
+ //!
2599
+ //! @param[in] decomposer
2600
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
2601
+ //! references to its constituent arithmetic types. The leftmost element of
2602
+ //! the tuple is considered the most significant. The call operator must not
2603
+ //! modify members of the key.
2604
+ //!
2605
+ //! @param[in] begin_bit
2606
+ //! The least-significant bit index (inclusive) needed for
2607
+ //! key comparison
2608
+ //!
2609
+ //! @param[in] end_bit
2610
+ //! The most-significant bit index (exclusive) needed for key
2611
+ //! comparison (e.g., ``(sizeof(float) + sizeof(long long int)) * 8``)
2612
+ //!
2613
+ //! @param[in] stream
2614
+ //! **[optional]** CUDA stream to launch kernels within.
2615
+ //! Default is stream<sub>0</sub>.
2616
template <typename KeyT, typename NumItemsT, typename DecomposerT>
CUB_RUNTIME_FUNCTION static ::cuda::std::enable_if_t<!::cuda::std::is_convertible_v<DecomposerT, int>, cudaError_t>
SortKeys(void* d_temp_storage,
         size_t& temp_storage_bytes,
         DoubleBuffer<KeyT>& d_keys,
         NumItemsT num_items,
         DecomposerT decomposer,
         int begin_bit,
         int end_bit,
         cudaStream_t stream = 0)
{
  _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());

  // Validate the decomposer at compile time. An instance of the check type is
  // also forwarded as the first argument of custom_radix_sort below.
  using check_t = detail::radix::decomposer_check_t<KeyT, DecomposerT>;
  static_assert(check_t::value,
                "DecomposerT must be a callable object returning a tuple of references to "
                "arithmetic types");

  // Unsigned integer type used for global offsets; num_items is cast to it
  // before being handed to the implementation.
  using global_offset_t = detail::choose_offset_t<NumItemsT>;

  // Keys-only sort: the value side is a default-constructed NullType buffer.
  DoubleBuffer<NullType> no_values;

  // Double-buffer overload: overwriting the provided buffers is permitted.
  constexpr bool may_overwrite = true;

  return DeviceRadixSort::custom_radix_sort<SortOrder::Ascending>(
    check_t{},
    d_temp_storage,
    temp_storage_bytes,
    may_overwrite,
    d_keys,
    no_values,
    static_cast<global_offset_t>(num_items),
    decomposer,
    begin_bit,
    end_bit,
    stream);
}
2656
+
2657
+ //! @rst Sorts keys into descending order using :math:`\approx 2N` auxiliary storage.
2658
+ //!
2659
+ //! - The contents of the input data are not altered by the sorting operation.
2660
+ //! - Pointers to contiguous memory must be used; iterators are not currently
2661
+ //! supported.
2662
+ //! - In-place operations are not supported. There must be no overlap between
2663
+ //! any of the provided ranges:
2664
+ //!
2665
+ //! - ``[d_keys_in, d_keys_in + num_items)``
2666
+ //! - ``[d_keys_out, d_keys_out + num_items)``
2667
+ //!
2668
+ //! - An optional bit subrange ``[begin_bit, end_bit)`` of differentiating key
2669
+ //! bits can be specified. This can reduce overall sorting overhead and
2670
+ //! yield a corresponding performance improvement.
2671
+ //! - @devicestorageNP For sorting using only ``O(P)`` temporary storage, see
2672
+ //! the sorting interface using DoubleBuffer wrappers below.
2673
+ //! - @devicestorage
2674
+ //!
2675
+ //! Snippet
2676
+ //! --------------------------------------------------
2677
+ //!
2678
+ //! The code snippet below illustrates the sorting of a device vector of
2679
+ //! ``int`` keys.
2680
+ //! @endrst
2681
+ //!
2682
+ //! @code{.cpp}
2683
+ //! #include <cub/cub.cuh>
2684
+ //! // or equivalently <cub/device/device_radix_sort.cuh>
2685
+ //!
2686
+ //! // Declare, allocate, and initialize device-accessible pointers
2687
+ //! // for sorting data
2688
+ //! int num_items; // e.g., 7
2689
+ //! int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
2690
+ //! int *d_keys_out; // e.g., [ ... ]
2691
+ //! ...
2692
+ //!
2693
+ //! // This overload takes the raw device pointers directly;
2694
+ //! // no cub::DoubleBuffer wrapper is required.
2695
+ //!
2696
+ //! // Determine temporary device storage requirements
2697
+ //! void *d_temp_storage = nullptr;
2698
+ //! size_t temp_storage_bytes = 0;
2699
+ //! cub::DeviceRadixSort::SortKeysDescending(
2700
+ //! d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items);
2701
+ //!
2702
+ //! // Allocate temporary storage
2703
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
2704
+ //!
2705
+ //! // Run sorting operation
2706
+ //! cub::DeviceRadixSort::SortKeysDescending(
2707
+ //! d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items);
2708
+ //!
2709
+ //! // d_keys_out <-- [9, 8, 7, 6, 5, 3, 0]
2710
+ //! @endcode
2711
+ //!
2712
+ //! @tparam KeyT
2713
+ //! **[inferred]** KeyT type
2714
+ //!
2715
+ //! @tparam NumItemsT
2716
+ //! **[inferred]** Type of num_items
2717
+ //!
2718
+ //! @param[in] d_temp_storage
2719
+ //! Device-accessible allocation of temporary storage. When ``nullptr``, the
2720
+ //! required allocation size is written to ``temp_storage_bytes`` and no work
2721
+ //! is done.
2722
+ //!
2723
+ //! @param[in,out] temp_storage_bytes
2724
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
2725
+ //!
2726
+ //! @param[in] d_keys_in
2727
+ //! Pointer to the input data of key data to sort
2728
+ //!
2729
+ //! @param[out] d_keys_out
2730
+ //! Pointer to the sorted output sequence of key data
2731
+ //!
2732
+ //! @param[in] num_items
2733
+ //! Number of items to sort
2734
+ //!
2735
+ //! @param[in] begin_bit
2736
+ //! **[optional]** The least-significant bit index (inclusive) needed for
2737
+ //! key comparison
2738
+ //!
2739
+ //! @param[in] end_bit
2740
+ //! **[optional]** The most-significant bit index (exclusive) needed for key
2741
+ //! comparison (e.g., ``sizeof(unsigned int) * 8``)
2742
+ //!
2743
+ //! @param[in] stream
2744
+ //! **[optional]** CUDA stream to launch kernels within.
2745
+ //! Default is stream<sub>0</sub>.
2746
template <typename KeyT, typename NumItemsT>
CUB_RUNTIME_FUNCTION static cudaError_t SortKeysDescending(
  void* d_temp_storage,
  size_t& temp_storage_bytes,
  const KeyT* d_keys_in,
  KeyT* d_keys_out,
  NumItemsT num_items,
  int begin_bit       = 0,
  int end_bit         = sizeof(KeyT) * 8,
  cudaStream_t stream = 0)
{
  _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());

  // Unsigned integer type used for global offsets, and the keys-only
  // descending dispatcher instantiated with it.
  using global_offset_t = detail::choose_offset_t<NumItemsT>;
  using dispatch_t      = DispatchRadixSort<SortOrder::Descending, KeyT, NullType, global_offset_t>;

  // Pair the input and output ranges in a double-buffer. const is cast away
  // only to build the wrapper; the input is *not* written to. With the
  // overwrite flag unset, ``DispatchRadixSort::Dispatch`` allocates temporary
  // storage and creates a new double-buffer internally.
  DoubleBuffer<KeyT> key_buffers(const_cast<KeyT*>(d_keys_in), d_keys_out);

  // Keys-only sort: the value side is a default-constructed NullType buffer.
  DoubleBuffer<NullType> no_values;

  constexpr bool may_overwrite = false;

  return dispatch_t::Dispatch(
    d_temp_storage,
    temp_storage_bytes,
    key_buffers,
    no_values,
    num_items,
    begin_bit,
    end_bit,
    may_overwrite,
    stream);
}
2773
+
2774
+ //! @rst
2775
+ //! Sorts keys into descending order using :math:`\approx 2N` auxiliary storage.
2776
+ //!
2777
+ //! * The contents of the input data are not altered by the sorting operation.
2778
+ //! * Pointers to contiguous memory must be used; iterators are not currently
2779
+ //! supported.
2780
+ //! * In-place operations are not supported. There must be no overlap between
2781
+ //! any of the provided ranges:
2782
+ //!
2783
+ //! * ``[d_keys_in, d_keys_in + num_items)``
2784
+ //! * ``[d_keys_out, d_keys_out + num_items)``
2785
+ //!
2786
+ //! * A bit subrange ``[begin_bit, end_bit)`` is provided to specify
2787
+ //! differentiating key bits. This can reduce overall sorting overhead and
2788
+ //! yield a corresponding performance improvement.
2789
+ //! * @devicestorageNP For sorting using only :math:`O(P)` temporary storage, see
2790
+ //! the sorting interface using DoubleBuffer wrappers below.
2791
+ //! * @devicestorage
2792
+ //!
2793
+ //! Snippet
2794
+ //! --------------------------------------------------
2795
+ //!
2796
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
2797
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
2798
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
2799
+ //! tuple of references to relevant members of the key.
2800
+ //!
2801
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
2802
+ //! :language: c++
2803
+ //! :dedent:
2804
+ //! :start-after: example-begin custom-type
2805
+ //! :end-before: example-end custom-type
2806
+ //!
2807
+ //! The following snippet shows how to sort an array of ``custom_t`` objects
2808
+ //! using ``cub::DeviceRadixSort::SortKeysDescending``:
2809
+ //!
2810
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
2811
+ //! :language: c++
2812
+ //! :dedent:
2813
+ //! :start-after: example-begin keys-descending-bits
2814
+ //! :end-before: example-end keys-descending-bits
2815
+ //!
2816
+ //! @endrst
2817
+ //!
2818
+ //! @tparam KeyT
2819
+ //! **[inferred]** KeyT type
2820
+ //!
2821
+ //! @tparam NumItemsT
2822
+ //! **[inferred]** Type of num_items
2823
+ //!
2824
+ //! @tparam DecomposerT
2825
+ //! **[inferred]** Type of a callable object responsible for decomposing a
2826
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
2827
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
2828
+ //! The leftmost element of the tuple is considered the most significant.
2829
+ //! The call operator must not modify members of the key.
2830
+ //!
2831
+ //! @param[in] d_temp_storage
2832
+ //! Device-accessible allocation of temporary storage. When ``nullptr``, the
2833
+ //! required allocation size is written to ``temp_storage_bytes`` and no work
2834
+ //! is done.
2835
+ //!
2836
+ //! @param[in,out] temp_storage_bytes
2837
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
2838
+ //!
2839
+ //! @param[in] d_keys_in
2840
+ //! Pointer to the input data of key data to sort
2841
+ //!
2842
+ //! @param[out] d_keys_out
2843
+ //! Pointer to the sorted output sequence of key data
2844
+ //!
2845
+ //! @param[in] num_items
2846
+ //! Number of items to sort
2847
+ //!
2848
+ //! @param decomposer
2849
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
2850
+ //! references to its constituent arithmetic types. The leftmost element of
2851
+ //! the tuple is considered the most significant. The call operator must not
2852
+ //! modify members of the key.
2853
+ //!
2854
+ //! @param[in] begin_bit
2855
+ //! The least-significant bit index (inclusive) needed for
2856
+ //! key comparison
2857
+ //!
2858
+ //! @param[in] end_bit
2859
+ //! The most-significant bit index (exclusive) needed for key
2860
+ //! comparison (e.g., ``(sizeof(float) + sizeof(long long int)) * 8``)
2861
+ //!
2862
+ //! @param[in] stream
2863
+ //! **[optional]** CUDA stream to launch kernels within.
2864
+ //! Default is stream<sub>0</sub>.
2865
template <typename KeyT, typename NumItemsT, typename DecomposerT>
CUB_RUNTIME_FUNCTION static ::cuda::std::enable_if_t<!::cuda::std::is_convertible_v<DecomposerT, int>, cudaError_t>
SortKeysDescending(
  void* d_temp_storage,
  size_t& temp_storage_bytes,
  const KeyT* d_keys_in,
  KeyT* d_keys_out,
  NumItemsT num_items,
  DecomposerT decomposer,
  int begin_bit,
  int end_bit,
  cudaStream_t stream = 0)
{
  _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());

  // Validate the decomposer at compile time. An instance of the check type is
  // also forwarded as the first argument of custom_radix_sort below.
  using check_t = detail::radix::decomposer_check_t<KeyT, DecomposerT>;
  static_assert(check_t::value,
                "DecomposerT must be a callable object returning a tuple of references to "
                "arithmetic types");

  // Unsigned integer type used for global offsets.
  using global_offset_t = detail::choose_offset_t<NumItemsT>;

  // Pair the input and output ranges in a double-buffer. const is cast away
  // only to build the wrapper; the input is *not* written to. Per the original
  // note, ``DispatchRadixSort::Dispatch`` allocates temporary storage and
  // creates a new double-buffer internally when the overwrite flag is unset.
  DoubleBuffer<KeyT> key_buffers(const_cast<KeyT*>(d_keys_in), d_keys_out);

  // Keys-only sort: the value side is a default-constructed NullType buffer.
  DoubleBuffer<NullType> no_values;

  constexpr bool may_overwrite = false;

  return DeviceRadixSort::custom_radix_sort<SortOrder::Descending>(
    check_t{},
    d_temp_storage,
    temp_storage_bytes,
    may_overwrite,
    key_buffers,
    no_values,
    static_cast<global_offset_t>(num_items),
    decomposer,
    begin_bit,
    end_bit,
    stream);
}
2912
+
2913
+ //! @rst
2914
+ //! Sorts keys into descending order using :math:`\approx 2N` auxiliary storage.
2915
+ //!
2916
+ //! * The contents of the input data are not altered by the sorting operation.
2917
+ //! * Pointers to contiguous memory must be used; iterators are not currently
2918
+ //! supported.
2919
+ //! * In-place operations are not supported. There must be no overlap between
2920
+ //! any of the provided ranges:
2921
+ //!
2922
+ //! * ``[d_keys_in, d_keys_in + num_items)``
2923
+ //! * ``[d_keys_out, d_keys_out + num_items)``
2924
+ //!
2925
+ //! * @devicestorageNP For sorting using only :math:`O(P)` temporary storage, see
2926
+ //! the sorting interface using DoubleBuffer wrappers below.
2927
+ //! * @devicestorage
2928
+ //!
2929
+ //! Snippet
2930
+ //! --------------------------------------------------
2931
+ //!
2932
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
2933
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
2934
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
2935
+ //! tuple of references to relevant members of the key.
2936
+ //!
2937
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
2938
+ //! :language: c++
2939
+ //! :dedent:
2940
+ //! :start-after: example-begin custom-type
2941
+ //! :end-before: example-end custom-type
2942
+ //!
2943
+ //! The following snippet shows how to sort an array of ``custom_t`` objects
2944
+ //! using ``cub::DeviceRadixSort::SortKeysDescending``:
2945
+ //!
2946
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
2947
+ //! :language: c++
2948
+ //! :dedent:
2949
+ //! :start-after: example-begin keys-descending
2950
+ //! :end-before: example-end keys-descending
2951
+ //!
2952
+ //! @endrst
2953
+ //!
2954
+ //! @tparam KeyT
2955
+ //! **[inferred]** KeyT type
2956
+ //!
2957
+ //! @tparam NumItemsT
2958
+ //! **[inferred]** Type of num_items
2959
+ //!
2960
+ //! @tparam DecomposerT
2961
+ //! **[inferred]** Type of a callable object responsible for decomposing a
2962
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
2963
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
2964
+ //! The leftmost element of the tuple is considered the most significant.
2965
+ //! The call operator must not modify members of the key.
2966
+ //!
2967
+ //! @param[in] d_temp_storage
2968
+ //! Device-accessible allocation of temporary storage. When ``nullptr``, the
2969
+ //! required allocation size is written to ``temp_storage_bytes`` and no work
2970
+ //! is done.
2971
+ //!
2972
+ //! @param[in,out] temp_storage_bytes
2973
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
2974
+ //!
2975
+ //! @param[in] d_keys_in
2976
+ //! Pointer to the input data of key data to sort
2977
+ //!
2978
+ //! @param[out] d_keys_out
2979
+ //! Pointer to the sorted output sequence of key data
2980
+ //!
2981
+ //! @param[in] num_items
2982
+ //! Number of items to sort
2983
+ //!
2984
+ //! @param decomposer
2985
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
2986
+ //! references to its constituent arithmetic types. The leftmost element of
2987
+ //! the tuple is considered the most significant. The call operator must not
2988
+ //! modify members of the key.
2989
+ //!
2990
+ //! @param[in] stream
2991
+ //! **[optional]** CUDA stream to launch kernels within.
2992
+ //! Default is stream<sub>0</sub>.
2993
template <typename KeyT, typename NumItemsT, typename DecomposerT>
CUB_RUNTIME_FUNCTION static ::cuda::std::enable_if_t<!::cuda::std::is_convertible_v<DecomposerT, int>, cudaError_t>
SortKeysDescending(
  void* d_temp_storage,
  size_t& temp_storage_bytes,
  const KeyT* d_keys_in,
  KeyT* d_keys_out,
  NumItemsT num_items,
  DecomposerT decomposer,
  cudaStream_t stream = 0)
{
  _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());

  // Validate the decomposer at compile time. An instance of the check type is
  // also forwarded as the first argument of custom_radix_sort below.
  using check_t = detail::radix::decomposer_check_t<KeyT, DecomposerT>;
  static_assert(check_t::value,
                "DecomposerT must be a callable object returning a tuple of references to "
                "arithmetic types");

  // Unsigned integer type used for global offsets.
  using global_offset_t = detail::choose_offset_t<NumItemsT>;

  // Pair the input and output ranges in a double-buffer. const is cast away
  // only to build the wrapper; the input is *not* written to. Per the original
  // note, ``DispatchRadixSort::Dispatch`` allocates temporary storage and
  // creates a new double-buffer internally when the overwrite flag is unset.
  DoubleBuffer<KeyT> key_buffers(const_cast<KeyT*>(d_keys_in), d_keys_out);

  // Keys-only sort: the value side is a default-constructed NullType buffer.
  DoubleBuffer<NullType> no_values;

  constexpr bool may_overwrite = false;

  // No bit range is passed in this overload.
  return DeviceRadixSort::custom_radix_sort<SortOrder::Descending>(
    check_t{},
    d_temp_storage,
    temp_storage_bytes,
    may_overwrite,
    key_buffers,
    no_values,
    static_cast<global_offset_t>(num_items),
    decomposer,
    stream);
}
3036
+
3037
+ //! @rst
3038
+ //! Sorts keys into descending order using :math:`\approx N` auxiliary storage.
3039
+ //!
3040
+ //! - The sorting operation is given a pair of key buffers managed by a
3041
+ //! DoubleBuffer structure that indicates which of the two buffers is
3042
+ //! "current" (and thus contains the input data to be sorted).
3043
+ //! - The contents of both buffers may be altered by the sorting operation.
3044
+ //! - In-place operations are not supported. There must be no overlap between
3045
+ //! any of the provided ranges:
3046
+ //!
3047
+ //! - ``[d_keys.Current(), d_keys.Current() + num_items)``
3048
+ //! - ``[d_keys.Alternate(), d_keys.Alternate() + num_items)``
3049
+ //!
3050
+ //! - Upon completion, the sorting operation will update the "current"
3051
+ //! indicator within the DoubleBuffer wrapper to reference which of the two
3052
+ //! buffers now contains the sorted output sequence (a function of the
3053
+ //! number of key bits specified and the targeted device architecture).
3054
+ //! - An optional bit subrange ``[begin_bit, end_bit)`` of differentiating key
3055
+ //! bits can be specified. This can reduce overall sorting overhead and
3056
+ //! yield a corresponding performance improvement.
3057
+ //! - @devicestorageP
3058
+ //! - @devicestorage
3059
+ //!
3060
+ //! Snippet
3061
+ //! --------------------------------------------------
3062
+ //!
3063
+ //! The code snippet below illustrates the sorting of a device vector of ``int`` keys.
3064
+ //! @endrst
3065
+ //!
3066
+ //! @code{.cpp}
3067
+ //! #include <cub/cub.cuh>
3068
+ //! // or equivalently <cub/device/device_radix_sort.cuh>
3069
+ //!
3070
+ //! // Declare, allocate, and initialize device-accessible pointers
3071
+ //! // for sorting data
3072
+ //! int num_items; // e.g., 7
3073
+ //! int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9]
3074
+ //! int *d_key_alt_buf; // e.g., [ ... ]
3075
+ //! ...
3076
+ //!
3077
+ //! // Create a DoubleBuffer to wrap the pair of device pointers
3078
+ //! cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
3079
+ //!
3080
+ //! // Determine temporary device storage requirements
3081
+ //! void *d_temp_storage = nullptr;
3082
+ //! size_t temp_storage_bytes = 0;
3083
+ //! cub::DeviceRadixSort::SortKeysDescending(
3084
+ //! d_temp_storage, temp_storage_bytes, d_keys, num_items);
3085
+ //!
3086
+ //! // Allocate temporary storage
3087
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
3088
+ //!
3089
+ //! // Run sorting operation
3090
+ //! cub::DeviceRadixSort::SortKeysDescending(
3091
+ //! d_temp_storage, temp_storage_bytes, d_keys, num_items);
3092
+ //!
3093
+ //! // d_keys.Current() <-- [9, 8, 7, 6, 5, 3, 0]
3094
+ //! @endcode
3095
+ //!
3096
+ //! @tparam KeyT
3097
+ //! **[inferred]** KeyT type
3098
+ //!
3099
+ //! @tparam NumItemsT
3100
+ //! **[inferred]** Type of num_items
3101
+ //!
3102
+ //! @param[in] d_temp_storage
3103
+ //! Device-accessible allocation of temporary storage. When ``nullptr``, the
3104
+ //! required allocation size is written to ``temp_storage_bytes`` and no work
3105
+ //! is done.
3106
+ //!
3107
+ //! @param[in,out] temp_storage_bytes
3108
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
3109
+ //!
3110
+ //! @param[in,out] d_keys
3111
+ //! Reference to the double-buffer of keys whose "current" device-accessible
3112
+ //! buffer contains the unsorted input keys and, upon return, is updated to
3113
+ //! point to the sorted output keys
3114
+ //!
3115
+ //! @param[in] num_items
3116
+ //! Number of items to sort
3117
+ //!
3118
+ //! @param[in] begin_bit
3119
+ //! **[optional]** The least-significant bit index (inclusive) needed for
3120
+ //! key comparison
3121
+ //!
3122
+ //! @param[in] end_bit
3123
+ //! **[optional]** The most-significant bit index (exclusive) needed for key
3124
+ //! comparison (e.g., ``sizeof(unsigned int) * 8``)
3125
+ //!
3126
+ //! @param[in] stream
3127
+ //! **[optional]** CUDA stream to launch kernels within.
3128
+ //! Default is stream<sub>0</sub>.
3129
template <typename KeyT, typename NumItemsT>
CUB_RUNTIME_FUNCTION static cudaError_t SortKeysDescending(
  void* d_temp_storage,
  size_t& temp_storage_bytes,
  DoubleBuffer<KeyT>& d_keys,
  NumItemsT num_items,
  int begin_bit       = 0,
  int end_bit         = sizeof(KeyT) * 8,
  cudaStream_t stream = 0)
{
  _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());

  // Unsigned integer type used for global offsets, and the keys-only
  // descending dispatcher instantiated with it.
  using global_offset_t = detail::choose_offset_t<NumItemsT>;
  using dispatch_t      = DispatchRadixSort<SortOrder::Descending, KeyT, NullType, global_offset_t>;

  // Keys-only sort: the value side is a default-constructed NullType buffer.
  DoubleBuffer<NullType> no_values;

  // Double-buffer overload: overwriting the provided buffers is permitted.
  constexpr bool may_overwrite = true;

  return dispatch_t::Dispatch(
    d_temp_storage,
    temp_storage_bytes,
    d_keys,
    no_values,
    num_items,
    begin_bit,
    end_bit,
    may_overwrite,
    stream);
}
3152
+
3153
+ //! @rst
3154
+ //! Sorts keys into descending order using :math:`\approx N` auxiliary storage.
3155
+ //!
3156
+ //! * The sorting operation is given a pair of key buffers managed by a
3157
+ //! DoubleBuffer structure that indicates which of the two buffers is
3158
+ //! "current" (and thus contains the input data to be sorted).
3159
+ //! * The contents of both buffers may be altered by the sorting operation.
3160
+ //! * In-place operations are not supported. There must be no overlap between
3161
+ //! any of the provided ranges:
3162
+ //!
3163
+ //! * ``[d_keys.Current(), d_keys.Current() + num_items)``
3164
+ //! * ``[d_keys.Alternate(), d_keys.Alternate() + num_items)``
3165
+ //!
3166
+ //! * Upon completion, the sorting operation will update the "current"
3167
+ //! indicator within the DoubleBuffer wrapper to reference which of the two
3168
+ //! buffers now contains the sorted output sequence (a function of the
3169
+ //! number of key bits specified and the targeted device architecture).
3170
+ //! * @devicestorageP
3171
+ //! * @devicestorage
3172
+ //!
3173
+ //! Snippet
3174
+ //! --------------------------------------------------
3175
+ //!
3176
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
3177
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
3178
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
3179
+ //! tuple of references to relevant members of the key.
3180
+ //!
3181
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
3182
+ //! :language: c++
3183
+ //! :dedent:
3184
+ //! :start-after: example-begin custom-type
3185
+ //! :end-before: example-end custom-type
3186
+ //!
3187
+ //! The following snippet shows how to sort an array of ``custom_t`` objects
3188
+ //! using ``cub::DeviceRadixSort::SortKeysDescending``:
3189
+ //!
3190
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
3191
+ //! :language: c++
3192
+ //! :dedent:
3193
+ //! :start-after: example-begin keys-descending-db
3194
+ //! :end-before: example-end keys-descending-db
3195
+ //!
3196
+ //! @endrst
3197
+ //!
3198
+ //! @tparam KeyT
3199
+ //! **[inferred]** KeyT type
3200
+ //!
3201
+ //! @tparam NumItemsT
3202
+ //! **[inferred]** Type of num_items
3203
+ //!
3204
+ //! @tparam DecomposerT
3205
+ //! **[inferred]** Type of a callable object responsible for decomposing a
3206
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
3207
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
3208
+ //! The leftmost element of the tuple is considered the most significant.
3209
+ //! The call operator must not modify members of the key.
3210
+ //!
3211
+ //! @param[in] d_temp_storage
3212
+ //! Device-accessible allocation of temporary storage. When ``nullptr``, the
3213
+ //! required allocation size is written to ``temp_storage_bytes`` and no work
3214
+ //! is done.
3215
+ //!
3216
+ //! @param[in,out] temp_storage_bytes
3217
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
3218
+ //!
3219
+ //! @param[in,out] d_keys
3220
+ //! Reference to the double-buffer of keys whose "current" device-accessible
3221
+ //! buffer contains the unsorted input keys and, upon return, is updated to
3222
+ //! point to the sorted output keys
3223
+ //!
3224
+ //! @param[in] num_items
3225
+ //! Number of items to sort
3226
+ //!
3227
+ //! @param decomposer
3228
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
3229
+ //! references to its constituent arithmetic types. The leftmost element of
3230
+ //! the tuple is considered the most significant. The call operator must not
3231
+ //! modify members of the key.
3232
+ //!
3233
+ //! @param[in] stream
3234
+ //! **[optional]** CUDA stream to launch kernels within.
3235
+ //! Default is stream<sub>0</sub>.
3236
template <typename KeyT, typename NumItemsT, typename DecomposerT>
CUB_RUNTIME_FUNCTION static ::cuda::std::enable_if_t<!::cuda::std::is_convertible_v<DecomposerT, int>, cudaError_t>
SortKeysDescending(
  void* d_temp_storage,
  size_t& temp_storage_bytes,
  DoubleBuffer<KeyT>& d_keys,
  NumItemsT num_items,
  DecomposerT decomposer,
  cudaStream_t stream = 0)
{
  _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());

  // Validate the decomposer at compile time. An instance of the check type is
  // also forwarded as the first argument of custom_radix_sort below.
  using check_t = detail::radix::decomposer_check_t<KeyT, DecomposerT>;
  static_assert(check_t::value,
                "DecomposerT must be a callable object returning a tuple of references to "
                "arithmetic types");

  // Unsigned integer type used for global offsets.
  using global_offset_t = detail::choose_offset_t<NumItemsT>;

  // Keys-only sort: the value side is a default-constructed NullType buffer.
  DoubleBuffer<NullType> no_values;

  // Double-buffer overload: overwriting the provided buffers is permitted.
  constexpr bool may_overwrite = true;

  // No bit range is passed in this overload.
  return DeviceRadixSort::custom_radix_sort<SortOrder::Descending>(
    check_t{},
    d_temp_storage,
    temp_storage_bytes,
    may_overwrite,
    d_keys,
    no_values,
    static_cast<global_offset_t>(num_items),
    decomposer,
    stream);
}
3273
+
3274
+ //! @rst
3275
+ //! Sorts keys into descending order using :math:`\approx N` auxiliary storage.
3276
+ //!
3277
+ //! * The sorting operation is given a pair of key buffers managed by a
3278
+ //! DoubleBuffer structure that indicates which of the two buffers is
3279
+ //! "current" (and thus contains the input data to be sorted).
3280
+ //! * The contents of both buffers may be altered by the sorting operation.
3281
+ //! * In-place operations are not supported. There must be no overlap between
3282
+ //! any of the provided ranges:
3283
+ //!
3284
+ //! * ``[d_keys.Current(), d_keys.Current() + num_items)``
3285
+ //! * ``[d_keys.Alternate(), d_keys.Alternate() + num_items)``
3286
+ //!
3287
+ //! * A bit subrange ``[begin_bit, end_bit)`` is provided to specify
3288
+ //! differentiating key bits. This can reduce overall sorting overhead and
3289
+ //! yield a corresponding performance improvement.
3290
+ //! * Upon completion, the sorting operation will update the "current"
3291
+ //! indicator within the DoubleBuffer wrapper to reference which of the two
3292
+ //! buffers now contains the sorted output sequence (a function of the
3293
+ //! number of key bits specified and the targeted device architecture).
3294
+ //! * @devicestorageP
3295
+ //! * @devicestorage
3296
+ //!
3297
+ //! Snippet
3298
+ //! --------------------------------------------------
3299
+ //!
3300
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
3301
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
3302
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
3303
+ //! tuple of references to relevant members of the key.
3304
+ //!
3305
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
3306
+ //! :language: c++
3307
+ //! :dedent:
3308
+ //! :start-after: example-begin custom-type
3309
+ //! :end-before: example-end custom-type
3310
+ //!
3311
+ //! The following snippet shows how to sort an array of ``custom_t`` objects
3312
+ //! using ``cub::DeviceRadixSort::SortKeysDescending``:
3313
+ //!
3314
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
3315
+ //! :language: c++
3316
+ //! :dedent:
3317
+ //! :start-after: example-begin keys-descending-bits-db
3318
+ //! :end-before: example-end keys-descending-bits-db
3319
+ //!
3320
+ //! @endrst
3321
+ //!
3322
+ //! @tparam KeyT
3323
+ //! **[inferred]** KeyT type
3324
+ //!
3325
+ //! @tparam NumItemsT
3326
+ //! **[inferred]** Type of num_items
3327
+ //!
3328
+ //! @tparam DecomposerT
3329
+ //! **[inferred]** Type of a callable object responsible for decomposing a
3330
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
3331
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
3332
+ //! The leftmost element of the tuple is considered the most significant.
3333
+ //! The call operator must not modify members of the key.
3334
+ //!
3335
+ //! @param[in] d_temp_storage
3336
+ //! Device-accessible allocation of temporary storage. When ``nullptr``, the
3337
+ //! required allocation size is written to ``temp_storage_bytes`` and no work
3338
+ //! is done.
3339
+ //!
3340
+ //! @param[in,out] temp_storage_bytes
3341
+ //! Reference to size in bytes of ``d_temp_storage`` allocation
3342
+ //!
3343
+ //! @param[in,out] d_keys
3344
+ //! Reference to the double-buffer of keys whose "current" device-accessible
3345
+ //! buffer contains the unsorted input keys and, upon return, is updated to
3346
+ //! point to the sorted output keys
3347
+ //!
3348
+ //! @param[in] num_items
3349
+ //! Number of items to sort
3350
+ //!
3351
+ //! @param decomposer
3352
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
3353
+ //! references to its constituent arithmetic types. The leftmost element of
3354
+ //! the tuple is considered the most significant. The call operator must not
3355
+ //! modify members of the key.
3356
+ //!
3357
+ //! @param[in] begin_bit
3358
+ //! The least-significant bit index (inclusive) needed for
3359
+ //! key comparison
3360
+ //!
3361
+ //! @param[in] end_bit
3362
+ //! The most-significant bit index (exclusive) needed for key
3363
+ //! comparison (e.g., ``(sizeof(float) + sizeof(long long int)) * 8``)
3364
+ //!
3365
+ //! @param[in] stream
3366
+ //! **[optional]** CUDA stream to launch kernels within.
3367
+ //! Default is stream<sub>0</sub>.
3368
template <typename KeyT, typename NumItemsT, typename DecomposerT>
CUB_RUNTIME_FUNCTION static ::cuda::std::enable_if_t<!::cuda::std::is_convertible_v<DecomposerT, int>, cudaError_t>
SortKeysDescending(
  void* d_temp_storage,
  size_t& temp_storage_bytes,
  DoubleBuffer<KeyT>& d_keys,
  NumItemsT num_items,
  DecomposerT decomposer,
  int begin_bit,
  int end_bit,
  cudaStream_t stream = 0)
{
  _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());

  // Validate the decomposer at compile time. An instance of the check type is
  // also forwarded as the first argument of custom_radix_sort below.
  using check_t = detail::radix::decomposer_check_t<KeyT, DecomposerT>;
  static_assert(check_t::value,
                "DecomposerT must be a callable object returning a tuple of references to "
                "arithmetic types");

  // Unsigned integer type used for global offsets.
  using global_offset_t = detail::choose_offset_t<NumItemsT>;

  // Keys-only sort: the value side is a default-constructed NullType buffer.
  DoubleBuffer<NullType> no_values;

  // Double-buffer overload: overwriting the provided buffers is permitted.
  constexpr bool may_overwrite = true;

  return DeviceRadixSort::custom_radix_sort<SortOrder::Descending>(
    check_t{},
    d_temp_storage,
    temp_storage_bytes,
    may_overwrite,
    d_keys,
    no_values,
    static_cast<global_offset_t>(num_items),
    decomposer,
    begin_bit,
    end_bit,
    stream);
}
3409
+
3410
+ //! @} end member group
3411
+ };
3412
+
3413
+ CUB_NAMESPACE_END