cuda-cccl 0.1.3.1.0.dev1486__cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cuda-cccl might be problematic. Click here for more details.

Files changed (1819)
  1. cuda/cccl/__init__.py +14 -0
  2. cuda/cccl/cooperative/__init__.py +3 -0
  3. cuda/cccl/cooperative/experimental/__init__.py +8 -0
  4. cuda/cccl/cooperative/experimental/_caching.py +48 -0
  5. cuda/cccl/cooperative/experimental/_common.py +276 -0
  6. cuda/cccl/cooperative/experimental/_nvrtc.py +91 -0
  7. cuda/cccl/cooperative/experimental/_scan_op.py +181 -0
  8. cuda/cccl/cooperative/experimental/_types.py +953 -0
  9. cuda/cccl/cooperative/experimental/_typing.py +107 -0
  10. cuda/cccl/cooperative/experimental/block/__init__.py +33 -0
  11. cuda/cccl/cooperative/experimental/block/_block_load_store.py +215 -0
  12. cuda/cccl/cooperative/experimental/block/_block_merge_sort.py +125 -0
  13. cuda/cccl/cooperative/experimental/block/_block_radix_sort.py +214 -0
  14. cuda/cccl/cooperative/experimental/block/_block_reduce.py +294 -0
  15. cuda/cccl/cooperative/experimental/block/_block_scan.py +983 -0
  16. cuda/cccl/cooperative/experimental/warp/__init__.py +9 -0
  17. cuda/cccl/cooperative/experimental/warp/_warp_merge_sort.py +98 -0
  18. cuda/cccl/cooperative/experimental/warp/_warp_reduce.py +153 -0
  19. cuda/cccl/cooperative/experimental/warp/_warp_scan.py +78 -0
  20. cuda/cccl/headers/__init__.py +7 -0
  21. cuda/cccl/headers/include/__init__.py +1 -0
  22. cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +261 -0
  23. cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +1181 -0
  24. cuda/cccl/headers/include/cub/agent/agent_for.cuh +84 -0
  25. cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +919 -0
  26. cuda/cccl/headers/include/cub/agent/agent_merge.cuh +227 -0
  27. cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +752 -0
  28. cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +766 -0
  29. cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +286 -0
  30. cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +704 -0
  31. cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +557 -0
  32. cuda/cccl/headers/include/cub/agent/agent_reduce.cuh +678 -0
  33. cuda/cccl/headers/include/cub/agent/agent_reduce_by_key.cuh +804 -0
  34. cuda/cccl/headers/include/cub/agent/agent_rle.cuh +997 -0
  35. cuda/cccl/headers/include/cub/agent/agent_scan.cuh +561 -0
  36. cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +473 -0
  37. cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +292 -0
  38. cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +1032 -0
  39. cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +342 -0
  40. cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +592 -0
  41. cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +614 -0
  42. cuda/cccl/headers/include/cub/agent/single_pass_scan_operators.cuh +1346 -0
  43. cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +965 -0
  44. cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +1217 -0
  45. cuda/cccl/headers/include/cub/block/block_exchange.cuh +1306 -0
  46. cuda/cccl/headers/include/cub/block/block_histogram.cuh +420 -0
  47. cuda/cccl/headers/include/cub/block/block_load.cuh +1259 -0
  48. cuda/cccl/headers/include/cub/block/block_merge_sort.cuh +787 -0
  49. cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +1218 -0
  50. cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +2193 -0
  51. cuda/cccl/headers/include/cub/block/block_raking_layout.cuh +150 -0
  52. cuda/cccl/headers/include/cub/block/block_reduce.cuh +629 -0
  53. cuda/cccl/headers/include/cub/block/block_run_length_decode.cuh +437 -0
  54. cuda/cccl/headers/include/cub/block/block_scan.cuh +2600 -0
  55. cuda/cccl/headers/include/cub/block/block_shuffle.cuh +346 -0
  56. cuda/cccl/headers/include/cub/block/block_store.cuh +1246 -0
  57. cuda/cccl/headers/include/cub/block/radix_rank_sort_operations.cuh +620 -0
  58. cuda/cccl/headers/include/cub/block/specializations/block_histogram_atomic.cuh +86 -0
  59. cuda/cccl/headers/include/cub/block/specializations/block_histogram_sort.cuh +240 -0
  60. cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking.cuh +252 -0
  61. cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking_commutative_only.cuh +238 -0
  62. cuda/cccl/headers/include/cub/block/specializations/block_reduce_warp_reductions.cuh +259 -0
  63. cuda/cccl/headers/include/cub/block/specializations/block_scan_raking.cuh +790 -0
  64. cuda/cccl/headers/include/cub/block/specializations/block_scan_warp_scans.cuh +538 -0
  65. cuda/cccl/headers/include/cub/config.cuh +60 -0
  66. cuda/cccl/headers/include/cub/cub.cuh +112 -0
  67. cuda/cccl/headers/include/cub/detail/array_utils.cuh +77 -0
  68. cuda/cccl/headers/include/cub/detail/choose_offset.cuh +155 -0
  69. cuda/cccl/headers/include/cub/detail/detect_cuda_runtime.cuh +93 -0
  70. cuda/cccl/headers/include/cub/detail/device_double_buffer.cuh +96 -0
  71. cuda/cccl/headers/include/cub/detail/fast_modulo_division.cuh +246 -0
  72. cuda/cccl/headers/include/cub/detail/launcher/cuda_driver.cuh +120 -0
  73. cuda/cccl/headers/include/cub/detail/launcher/cuda_runtime.cuh +74 -0
  74. cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +118 -0
  75. cuda/cccl/headers/include/cub/detail/ptx-json/README.md +71 -0
  76. cuda/cccl/headers/include/cub/detail/ptx-json/array.h +68 -0
  77. cuda/cccl/headers/include/cub/detail/ptx-json/json.h +61 -0
  78. cuda/cccl/headers/include/cub/detail/ptx-json/object.h +100 -0
  79. cuda/cccl/headers/include/cub/detail/ptx-json/string.h +71 -0
  80. cuda/cccl/headers/include/cub/detail/ptx-json/value.h +93 -0
  81. cuda/cccl/headers/include/cub/detail/ptx-json-parser.h +63 -0
  82. cuda/cccl/headers/include/cub/detail/rfa.cuh +724 -0
  83. cuda/cccl/headers/include/cub/detail/strong_load.cuh +189 -0
  84. cuda/cccl/headers/include/cub/detail/strong_store.cuh +220 -0
  85. cuda/cccl/headers/include/cub/detail/temporary_storage.cuh +355 -0
  86. cuda/cccl/headers/include/cub/detail/type_traits.cuh +206 -0
  87. cuda/cccl/headers/include/cub/detail/uninitialized_copy.cuh +72 -0
  88. cuda/cccl/headers/include/cub/detail/unsafe_bitcast.cuh +56 -0
  89. cuda/cccl/headers/include/cub/device/device_adjacent_difference.cuh +596 -0
  90. cuda/cccl/headers/include/cub/device/device_copy.cuh +187 -0
  91. cuda/cccl/headers/include/cub/device/device_for.cuh +994 -0
  92. cuda/cccl/headers/include/cub/device/device_histogram.cuh +1507 -0
  93. cuda/cccl/headers/include/cub/device/device_memcpy.cuh +195 -0
  94. cuda/cccl/headers/include/cub/device/device_merge.cuh +202 -0
  95. cuda/cccl/headers/include/cub/device/device_merge_sort.cuh +979 -0
  96. cuda/cccl/headers/include/cub/device/device_partition.cuh +664 -0
  97. cuda/cccl/headers/include/cub/device/device_radix_sort.cuh +3431 -0
  98. cuda/cccl/headers/include/cub/device/device_reduce.cuh +1387 -0
  99. cuda/cccl/headers/include/cub/device/device_run_length_encode.cuh +368 -0
  100. cuda/cccl/headers/include/cub/device/device_scan.cuh +1901 -0
  101. cuda/cccl/headers/include/cub/device/device_segmented_radix_sort.cuh +1496 -0
  102. cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +1512 -0
  103. cuda/cccl/headers/include/cub/device/device_segmented_sort.cuh +2811 -0
  104. cuda/cccl/headers/include/cub/device/device_select.cuh +1224 -0
  105. cuda/cccl/headers/include/cub/device/device_transform.cuh +313 -0
  106. cuda/cccl/headers/include/cub/device/dispatch/dispatch_adjacent_difference.cuh +314 -0
  107. cuda/cccl/headers/include/cub/device/dispatch/dispatch_advance_iterators.cuh +109 -0
  108. cuda/cccl/headers/include/cub/device/dispatch/dispatch_batch_memcpy.cuh +718 -0
  109. cuda/cccl/headers/include/cub/device/dispatch/dispatch_common.cuh +45 -0
  110. cuda/cccl/headers/include/cub/device/dispatch/dispatch_for.cuh +197 -0
  111. cuda/cccl/headers/include/cub/device/dispatch/dispatch_histogram.cuh +1051 -0
  112. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +305 -0
  113. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge_sort.cuh +473 -0
  114. cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +1748 -0
  115. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +1316 -0
  116. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_by_key.cuh +625 -0
  117. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +502 -0
  118. cuda/cccl/headers/include/cub/device/dispatch/dispatch_rle.cuh +548 -0
  119. cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan.cuh +497 -0
  120. cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan_by_key.cuh +598 -0
  121. cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +1374 -0
  122. cuda/cccl/headers/include/cub/device/dispatch/dispatch_select_if.cuh +838 -0
  123. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +341 -0
  124. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +439 -0
  125. cuda/cccl/headers/include/cub/device/dispatch/dispatch_three_way_partition.cuh +552 -0
  126. cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +397 -0
  127. cuda/cccl/headers/include/cub/device/dispatch/dispatch_unique_by_key.cuh +543 -0
  128. cuda/cccl/headers/include/cub/device/dispatch/kernels/for_each.cuh +218 -0
  129. cuda/cccl/headers/include/cub/device/dispatch/kernels/histogram.cuh +505 -0
  130. cuda/cccl/headers/include/cub/device/dispatch/kernels/merge_sort.cuh +338 -0
  131. cuda/cccl/headers/include/cub/device/dispatch/kernels/radix_sort.cuh +799 -0
  132. cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +523 -0
  133. cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +194 -0
  134. cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +330 -0
  135. cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +437 -0
  136. cuda/cccl/headers/include/cub/device/dispatch/kernels/unique_by_key.cuh +176 -0
  137. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +70 -0
  138. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +121 -0
  139. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +63 -0
  140. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +278 -0
  141. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +91 -0
  142. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +118 -0
  143. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +1068 -0
  144. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce.cuh +397 -0
  145. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +945 -0
  146. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +675 -0
  147. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +555 -0
  148. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +1013 -0
  149. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +249 -0
  150. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_select_if.cuh +1587 -0
  151. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +407 -0
  152. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +283 -0
  153. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +874 -0
  154. cuda/cccl/headers/include/cub/grid/grid_even_share.cuh +215 -0
  155. cuda/cccl/headers/include/cub/grid/grid_mapping.cuh +106 -0
  156. cuda/cccl/headers/include/cub/grid/grid_queue.cuh +202 -0
  157. cuda/cccl/headers/include/cub/iterator/arg_index_input_iterator.cuh +256 -0
  158. cuda/cccl/headers/include/cub/iterator/cache_modified_input_iterator.cuh +238 -0
  159. cuda/cccl/headers/include/cub/iterator/cache_modified_output_iterator.cuh +252 -0
  160. cuda/cccl/headers/include/cub/iterator/tex_obj_input_iterator.cuh +322 -0
  161. cuda/cccl/headers/include/cub/thread/thread_load.cuh +347 -0
  162. cuda/cccl/headers/include/cub/thread/thread_operators.cuh +629 -0
  163. cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +504 -0
  164. cuda/cccl/headers/include/cub/thread/thread_scan.cuh +340 -0
  165. cuda/cccl/headers/include/cub/thread/thread_search.cuh +198 -0
  166. cuda/cccl/headers/include/cub/thread/thread_simd.cuh +406 -0
  167. cuda/cccl/headers/include/cub/thread/thread_sort.cuh +101 -0
  168. cuda/cccl/headers/include/cub/thread/thread_store.cuh +364 -0
  169. cuda/cccl/headers/include/cub/util_allocator.cuh +921 -0
  170. cuda/cccl/headers/include/cub/util_arch.cuh +163 -0
  171. cuda/cccl/headers/include/cub/util_cpp_dialect.cuh +95 -0
  172. cuda/cccl/headers/include/cub/util_debug.cuh +207 -0
  173. cuda/cccl/headers/include/cub/util_device.cuh +779 -0
  174. cuda/cccl/headers/include/cub/util_macro.cuh +91 -0
  175. cuda/cccl/headers/include/cub/util_math.cuh +115 -0
  176. cuda/cccl/headers/include/cub/util_namespace.cuh +176 -0
  177. cuda/cccl/headers/include/cub/util_policy_wrapper_t.cuh +55 -0
  178. cuda/cccl/headers/include/cub/util_ptx.cuh +513 -0
  179. cuda/cccl/headers/include/cub/util_temporary_storage.cuh +122 -0
  180. cuda/cccl/headers/include/cub/util_type.cuh +1111 -0
  181. cuda/cccl/headers/include/cub/util_vsmem.cuh +251 -0
  182. cuda/cccl/headers/include/cub/version.cuh +89 -0
  183. cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_shfl.cuh +329 -0
  184. cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_smem.cuh +177 -0
  185. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +729 -0
  186. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +405 -0
  187. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +688 -0
  188. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_smem.cuh +437 -0
  189. cuda/cccl/headers/include/cub/warp/warp_exchange.cuh +405 -0
  190. cuda/cccl/headers/include/cub/warp/warp_load.cuh +614 -0
  191. cuda/cccl/headers/include/cub/warp/warp_merge_sort.cuh +169 -0
  192. cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +822 -0
  193. cuda/cccl/headers/include/cub/warp/warp_scan.cuh +1156 -0
  194. cuda/cccl/headers/include/cub/warp/warp_store.cuh +520 -0
  195. cuda/cccl/headers/include/cuda/__annotated_ptr/access_property.h +169 -0
  196. cuda/cccl/headers/include/cuda/__annotated_ptr/access_property_encoding.h +172 -0
  197. cuda/cccl/headers/include/cuda/__annotated_ptr/annotated_ptr.h +210 -0
  198. cuda/cccl/headers/include/cuda/__annotated_ptr/annotated_ptr_base.h +100 -0
  199. cuda/cccl/headers/include/cuda/__annotated_ptr/apply_access_property.h +84 -0
  200. cuda/cccl/headers/include/cuda/__annotated_ptr/associate_access_property.h +127 -0
  201. cuda/cccl/headers/include/cuda/__annotated_ptr/createpolicy.h +209 -0
  202. cuda/cccl/headers/include/cuda/__atomic/atomic.h +145 -0
  203. cuda/cccl/headers/include/cuda/__barrier/aligned_size.h +61 -0
  204. cuda/cccl/headers/include/cuda/__barrier/async_contract_fulfillment.h +39 -0
  205. cuda/cccl/headers/include/cuda/__barrier/barrier.h +66 -0
  206. cuda/cccl/headers/include/cuda/__barrier/barrier_arrive_tx.h +100 -0
  207. cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +454 -0
  208. cuda/cccl/headers/include/cuda/__barrier/barrier_expect_tx.h +72 -0
  209. cuda/cccl/headers/include/cuda/__barrier/barrier_native_handle.h +45 -0
  210. cuda/cccl/headers/include/cuda/__barrier/barrier_thread_scope.h +61 -0
  211. cuda/cccl/headers/include/cuda/__bit/bit_reverse.h +171 -0
  212. cuda/cccl/headers/include/cuda/__bit/bitfield.h +122 -0
  213. cuda/cccl/headers/include/cuda/__bit/bitmask.h +88 -0
  214. cuda/cccl/headers/include/cuda/__cccl_config +36 -0
  215. cuda/cccl/headers/include/cuda/__cmath/ceil_div.h +126 -0
  216. cuda/cccl/headers/include/cuda/__cmath/ilog.h +195 -0
  217. cuda/cccl/headers/include/cuda/__cmath/ipow.h +107 -0
  218. cuda/cccl/headers/include/cuda/__cmath/isqrt.h +80 -0
  219. cuda/cccl/headers/include/cuda/__cmath/neg.h +47 -0
  220. cuda/cccl/headers/include/cuda/__cmath/pow2.h +74 -0
  221. cuda/cccl/headers/include/cuda/__cmath/round_down.h +104 -0
  222. cuda/cccl/headers/include/cuda/__cmath/round_up.h +106 -0
  223. cuda/cccl/headers/include/cuda/__cmath/uabs.h +57 -0
  224. cuda/cccl/headers/include/cuda/__execution/determinism.h +90 -0
  225. cuda/cccl/headers/include/cuda/__execution/require.h +67 -0
  226. cuda/cccl/headers/include/cuda/__execution/tune.h +62 -0
  227. cuda/cccl/headers/include/cuda/__functional/address_stability.h +131 -0
  228. cuda/cccl/headers/include/cuda/__functional/for_each_canceled.h +279 -0
  229. cuda/cccl/headers/include/cuda/__functional/get_device_address.h +58 -0
  230. cuda/cccl/headers/include/cuda/__functional/maximum.h +58 -0
  231. cuda/cccl/headers/include/cuda/__functional/minimum.h +58 -0
  232. cuda/cccl/headers/include/cuda/__functional/proclaim_return_type.h +108 -0
  233. cuda/cccl/headers/include/cuda/__fwd/barrier.h +38 -0
  234. cuda/cccl/headers/include/cuda/__fwd/barrier_native_handle.h +42 -0
  235. cuda/cccl/headers/include/cuda/__fwd/get_stream.h +38 -0
  236. cuda/cccl/headers/include/cuda/__fwd/pipeline.h +37 -0
  237. cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +261 -0
  238. cuda/cccl/headers/include/cuda/__iterator/counting_iterator.h +407 -0
  239. cuda/cccl/headers/include/cuda/__iterator/discard_iterator.h +314 -0
  240. cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +323 -0
  241. cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +481 -0
  242. cuda/cccl/headers/include/cuda/__latch/latch.h +44 -0
  243. cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +457 -0
  244. cuda/cccl/headers/include/cuda/__mdspan/host_device_mdspan.h +63 -0
  245. cuda/cccl/headers/include/cuda/__mdspan/restrict_accessor.h +123 -0
  246. cuda/cccl/headers/include/cuda/__mdspan/restrict_mdspan.h +51 -0
  247. cuda/cccl/headers/include/cuda/__memcpy_async/check_preconditions.h +79 -0
  248. cuda/cccl/headers/include/cuda/__memcpy_async/completion_mechanism.h +47 -0
  249. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_bulk_shared_global.h +60 -0
  250. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_fallback.h +72 -0
  251. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_shared_global.h +98 -0
  252. cuda/cccl/headers/include/cuda/__memcpy_async/dispatch_memcpy_async.h +162 -0
  253. cuda/cccl/headers/include/cuda/__memcpy_async/is_local_smem_barrier.h +49 -0
  254. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async.h +179 -0
  255. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_barrier.h +99 -0
  256. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_tx.h +99 -0
  257. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_completion.h +170 -0
  258. cuda/cccl/headers/include/cuda/__memcpy_async/try_get_barrier_handle.h +59 -0
  259. cuda/cccl/headers/include/cuda/__memory/address_space.h +86 -0
  260. cuda/cccl/headers/include/cuda/__memory_resource/get_memory_resource.h +94 -0
  261. cuda/cccl/headers/include/cuda/__memory_resource/get_property.h +158 -0
  262. cuda/cccl/headers/include/cuda/__memory_resource/properties.h +73 -0
  263. cuda/cccl/headers/include/cuda/__memory_resource/resource.h +129 -0
  264. cuda/cccl/headers/include/cuda/__memory_resource/resource_ref.h +653 -0
  265. cuda/cccl/headers/include/cuda/__numeric/narrow.h +108 -0
  266. cuda/cccl/headers/include/cuda/__numeric/overflow_cast.h +57 -0
  267. cuda/cccl/headers/include/cuda/__numeric/overflow_result.h +43 -0
  268. cuda/cccl/headers/include/cuda/__nvtx/nvtx.h +101 -0
  269. cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +2982 -0
  270. cuda/cccl/headers/include/cuda/__ptx/instructions/barrier_cluster.h +43 -0
  271. cuda/cccl/headers/include/cuda/__ptx/instructions/bfind.h +41 -0
  272. cuda/cccl/headers/include/cuda/__ptx/instructions/bmsk.h +41 -0
  273. cuda/cccl/headers/include/cuda/__ptx/instructions/clusterlaunchcontrol.h +41 -0
  274. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk.h +44 -0
  275. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_commit_group.h +43 -0
  276. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h +45 -0
  277. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_wait_group.h +43 -0
  278. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_mbarrier_arrive.h +42 -0
  279. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_reduce_async_bulk.h +60 -0
  280. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_reduce_async_bulk_tensor.h +43 -0
  281. cuda/cccl/headers/include/cuda/__ptx/instructions/elect_sync.h +41 -0
  282. cuda/cccl/headers/include/cuda/__ptx/instructions/exit.h +41 -0
  283. cuda/cccl/headers/include/cuda/__ptx/instructions/fence.h +49 -0
  284. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/barrier_cluster.h +115 -0
  285. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/bfind.h +190 -0
  286. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/bmsk.h +54 -0
  287. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/clusterlaunchcontrol.h +240 -0
  288. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk.h +193 -0
  289. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.h +25 -0
  290. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.h +52 -0
  291. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.h +957 -0
  292. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_gather_scatter.h +288 -0
  293. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.h +596 -0
  294. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.h +46 -0
  295. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive.h +26 -0
  296. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive_noinc.h +26 -0
  297. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.h +1445 -0
  298. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.h +132 -0
  299. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.h +117 -0
  300. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.h +601 -0
  301. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/elect_sync.h +36 -0
  302. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/exit.h +25 -0
  303. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence.h +208 -0
  304. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.h +31 -0
  305. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_alias.h +25 -0
  306. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_async.h +58 -0
  307. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_async_generic_sync_restrict.h +62 -0
  308. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.h +101 -0
  309. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_sync_restrict.h +62 -0
  310. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/get_sreg.h +949 -0
  311. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/getctarank.h +32 -0
  312. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/ld.h +15074 -0
  313. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive.h +385 -0
  314. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.h +176 -0
  315. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.h +34 -0
  316. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_expect_tx.h +94 -0
  317. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_init.h +27 -0
  318. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.h +137 -0
  319. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.h +138 -0
  320. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.h +280 -0
  321. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.h +282 -0
  322. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_ld_reduce.h +2148 -0
  323. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_red.h +1272 -0
  324. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_st.h +228 -0
  325. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/prmt.h +230 -0
  326. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/red_async.h +430 -0
  327. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/shl.h +96 -0
  328. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/shr.h +168 -0
  329. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st.h +1830 -0
  330. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st_async.h +123 -0
  331. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st_bulk.h +31 -0
  332. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_alloc.h +105 -0
  333. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_commit.h +81 -0
  334. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_cp.h +612 -0
  335. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_fence.h +44 -0
  336. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_ld.h +4446 -0
  337. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_mma.h +4061 -0
  338. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_mma_ws.h +6438 -0
  339. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_shift.h +36 -0
  340. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_st.h +4582 -0
  341. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_wait.h +44 -0
  342. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.h +67 -0
  343. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tensormap_replace.h +750 -0
  344. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/trap.h +25 -0
  345. cuda/cccl/headers/include/cuda/__ptx/instructions/get_sreg.h +43 -0
  346. cuda/cccl/headers/include/cuda/__ptx/instructions/getctarank.h +43 -0
  347. cuda/cccl/headers/include/cuda/__ptx/instructions/ld.h +41 -0
  348. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_arrive.h +45 -0
  349. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_expect_tx.h +41 -0
  350. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_init.h +43 -0
  351. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_wait.h +46 -0
  352. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_ld_reduce.h +41 -0
  353. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_red.h +41 -0
  354. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_st.h +41 -0
  355. cuda/cccl/headers/include/cuda/__ptx/instructions/prmt.h +41 -0
  356. cuda/cccl/headers/include/cuda/__ptx/instructions/red_async.h +43 -0
  357. cuda/cccl/headers/include/cuda/__ptx/instructions/shfl_sync.h +275 -0
  358. cuda/cccl/headers/include/cuda/__ptx/instructions/shl.h +41 -0
  359. cuda/cccl/headers/include/cuda/__ptx/instructions/shr.h +41 -0
  360. cuda/cccl/headers/include/cuda/__ptx/instructions/st.h +41 -0
  361. cuda/cccl/headers/include/cuda/__ptx/instructions/st_async.h +43 -0
  362. cuda/cccl/headers/include/cuda/__ptx/instructions/st_bulk.h +41 -0
  363. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_alloc.h +41 -0
  364. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_commit.h +41 -0
  365. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_cp.h +41 -0
  366. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_fence.h +41 -0
  367. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_ld.h +41 -0
  368. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_mma.h +41 -0
  369. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_mma_ws.h +41 -0
  370. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_shift.h +41 -0
  371. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_st.h +41 -0
  372. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_wait.h +41 -0
  373. cuda/cccl/headers/include/cuda/__ptx/instructions/tensormap_cp_fenceproxy.h +43 -0
  374. cuda/cccl/headers/include/cuda/__ptx/instructions/tensormap_replace.h +43 -0
  375. cuda/cccl/headers/include/cuda/__ptx/instructions/trap.h +41 -0
  376. cuda/cccl/headers/include/cuda/__ptx/ptx_dot_variants.h +230 -0
  377. cuda/cccl/headers/include/cuda/__ptx/ptx_helper_functions.h +151 -0
  378. cuda/cccl/headers/include/cuda/__semaphore/counting_semaphore.h +53 -0
  379. cuda/cccl/headers/include/cuda/__stream/get_stream.h +97 -0
  380. cuda/cccl/headers/include/cuda/__stream/stream_ref.h +165 -0
  381. cuda/cccl/headers/include/cuda/__type_traits/is_floating_point.h +47 -0
  382. cuda/cccl/headers/include/cuda/__warp/lane_mask.h +326 -0
  383. cuda/cccl/headers/include/cuda/__warp/warp_match_all.h +66 -0
  384. cuda/cccl/headers/include/cuda/__warp/warp_shuffle.h +249 -0
  385. cuda/cccl/headers/include/cuda/access_property +26 -0
  386. cuda/cccl/headers/include/cuda/annotated_ptr +29 -0
  387. cuda/cccl/headers/include/cuda/atomic +27 -0
  388. cuda/cccl/headers/include/cuda/barrier +262 -0
  389. cuda/cccl/headers/include/cuda/bit +29 -0
  390. cuda/cccl/headers/include/cuda/cmath +35 -0
  391. cuda/cccl/headers/include/cuda/discard_memory +61 -0
  392. cuda/cccl/headers/include/cuda/functional +31 -0
  393. cuda/cccl/headers/include/cuda/iterator +31 -0
  394. cuda/cccl/headers/include/cuda/latch +27 -0
  395. cuda/cccl/headers/include/cuda/mdspan +28 -0
  396. cuda/cccl/headers/include/cuda/memory +28 -0
  397. cuda/cccl/headers/include/cuda/memory_resource +41 -0
  398. cuda/cccl/headers/include/cuda/numeric +28 -0
  399. cuda/cccl/headers/include/cuda/pipeline +579 -0
  400. cuda/cccl/headers/include/cuda/ptx +118 -0
  401. cuda/cccl/headers/include/cuda/semaphore +31 -0
  402. cuda/cccl/headers/include/cuda/std/__algorithm/adjacent_find.h +60 -0
  403. cuda/cccl/headers/include/cuda/std/__algorithm/all_of.h +46 -0
  404. cuda/cccl/headers/include/cuda/std/__algorithm/any_of.h +46 -0
  405. cuda/cccl/headers/include/cuda/std/__algorithm/binary_search.h +52 -0
  406. cuda/cccl/headers/include/cuda/std/__algorithm/clamp.h +48 -0
  407. cuda/cccl/headers/include/cuda/std/__algorithm/comp.h +64 -0
  408. cuda/cccl/headers/include/cuda/std/__algorithm/comp_ref_type.h +85 -0
  409. cuda/cccl/headers/include/cuda/std/__algorithm/copy.h +143 -0
  410. cuda/cccl/headers/include/cuda/std/__algorithm/copy_backward.h +79 -0
  411. cuda/cccl/headers/include/cuda/std/__algorithm/copy_if.h +47 -0
  412. cuda/cccl/headers/include/cuda/std/__algorithm/copy_n.h +74 -0
  413. cuda/cccl/headers/include/cuda/std/__algorithm/count.h +49 -0
  414. cuda/cccl/headers/include/cuda/std/__algorithm/count_if.h +49 -0
  415. cuda/cccl/headers/include/cuda/std/__algorithm/equal.h +129 -0
  416. cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +101 -0
  417. cuda/cccl/headers/include/cuda/std/__algorithm/fill.h +58 -0
  418. cuda/cccl/headers/include/cuda/std/__algorithm/fill_n.h +51 -0
  419. cuda/cccl/headers/include/cuda/std/__algorithm/find.h +64 -0
  420. cuda/cccl/headers/include/cuda/std/__algorithm/find_end.h +225 -0
  421. cuda/cccl/headers/include/cuda/std/__algorithm/find_first_of.h +73 -0
  422. cuda/cccl/headers/include/cuda/std/__algorithm/find_if.h +46 -0
  423. cuda/cccl/headers/include/cuda/std/__algorithm/find_if_not.h +46 -0
  424. cuda/cccl/headers/include/cuda/std/__algorithm/for_each.h +42 -0
  425. cuda/cccl/headers/include/cuda/std/__algorithm/for_each_n.h +48 -0
  426. cuda/cccl/headers/include/cuda/std/__algorithm/generate.h +41 -0
  427. cuda/cccl/headers/include/cuda/std/__algorithm/generate_n.h +46 -0
  428. cuda/cccl/headers/include/cuda/std/__algorithm/half_positive.h +49 -0
  429. cuda/cccl/headers/include/cuda/std/__algorithm/includes.h +92 -0
  430. cuda/cccl/headers/include/cuda/std/__algorithm/is_heap.h +51 -0
  431. cuda/cccl/headers/include/cuda/std/__algorithm/is_heap_until.h +83 -0
  432. cuda/cccl/headers/include/cuda/std/__algorithm/is_partitioned.h +58 -0
  433. cuda/cccl/headers/include/cuda/std/__algorithm/is_permutation.h +252 -0
  434. cuda/cccl/headers/include/cuda/std/__algorithm/is_sorted.h +50 -0
  435. cuda/cccl/headers/include/cuda/std/__algorithm/is_sorted_until.h +69 -0
  436. cuda/cccl/headers/include/cuda/std/__algorithm/iter_swap.h +82 -0
  437. cuda/cccl/headers/include/cuda/std/__algorithm/iterator_operations.h +188 -0
  438. cuda/cccl/headers/include/cuda/std/__algorithm/lexicographical_compare.h +68 -0
  439. cuda/cccl/headers/include/cuda/std/__algorithm/lower_bound.h +83 -0
  440. cuda/cccl/headers/include/cuda/std/__algorithm/make_heap.h +72 -0
  441. cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +96 -0
  442. cuda/cccl/headers/include/cuda/std/__algorithm/max.h +62 -0
  443. cuda/cccl/headers/include/cuda/std/__algorithm/max_element.h +70 -0
  444. cuda/cccl/headers/include/cuda/std/__algorithm/merge.h +89 -0
  445. cuda/cccl/headers/include/cuda/std/__algorithm/min.h +62 -0
  446. cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +88 -0
  447. cuda/cccl/headers/include/cuda/std/__algorithm/minmax.h +71 -0
  448. cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +141 -0
  449. cuda/cccl/headers/include/cuda/std/__algorithm/mismatch.h +83 -0
  450. cuda/cccl/headers/include/cuda/std/__algorithm/move.h +88 -0
  451. cuda/cccl/headers/include/cuda/std/__algorithm/move_backward.h +84 -0
  452. cuda/cccl/headers/include/cuda/std/__algorithm/next_permutation.h +89 -0
  453. cuda/cccl/headers/include/cuda/std/__algorithm/none_of.h +46 -0
  454. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort.h +102 -0
  455. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort_copy.h +122 -0
  456. cuda/cccl/headers/include/cuda/std/__algorithm/partition.h +121 -0
  457. cuda/cccl/headers/include/cuda/std/__algorithm/partition_copy.h +59 -0
  458. cuda/cccl/headers/include/cuda/std/__algorithm/partition_point.h +61 -0
  459. cuda/cccl/headers/include/cuda/std/__algorithm/pop_heap.h +95 -0
  460. cuda/cccl/headers/include/cuda/std/__algorithm/prev_permutation.h +89 -0
  461. cuda/cccl/headers/include/cuda/std/__algorithm/push_heap.h +103 -0
  462. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_iterator_concept.h +65 -0
  463. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_min.h +99 -0
  464. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_min_element.h +69 -0
  465. cuda/cccl/headers/include/cuda/std/__algorithm/remove.h +55 -0
  466. cuda/cccl/headers/include/cuda/std/__algorithm/remove_copy.h +47 -0
  467. cuda/cccl/headers/include/cuda/std/__algorithm/remove_copy_if.h +47 -0
  468. cuda/cccl/headers/include/cuda/std/__algorithm/remove_if.h +56 -0
  469. cuda/cccl/headers/include/cuda/std/__algorithm/replace.h +45 -0
  470. cuda/cccl/headers/include/cuda/std/__algorithm/replace_copy.h +54 -0
  471. cuda/cccl/headers/include/cuda/std/__algorithm/replace_copy_if.h +50 -0
  472. cuda/cccl/headers/include/cuda/std/__algorithm/replace_if.h +45 -0
  473. cuda/cccl/headers/include/cuda/std/__algorithm/reverse.h +81 -0
  474. cuda/cccl/headers/include/cuda/std/__algorithm/reverse_copy.h +43 -0
  475. cuda/cccl/headers/include/cuda/std/__algorithm/rotate.h +264 -0
  476. cuda/cccl/headers/include/cuda/std/__algorithm/rotate_copy.h +40 -0
  477. cuda/cccl/headers/include/cuda/std/__algorithm/search.h +185 -0
  478. cuda/cccl/headers/include/cuda/std/__algorithm/search_n.h +163 -0
  479. cuda/cccl/headers/include/cuda/std/__algorithm/set_difference.h +95 -0
  480. cuda/cccl/headers/include/cuda/std/__algorithm/set_intersection.h +123 -0
  481. cuda/cccl/headers/include/cuda/std/__algorithm/set_symmetric_difference.h +135 -0
  482. cuda/cccl/headers/include/cuda/std/__algorithm/set_union.h +129 -0
  483. cuda/cccl/headers/include/cuda/std/__algorithm/shift_left.h +84 -0
  484. cuda/cccl/headers/include/cuda/std/__algorithm/shift_right.h +144 -0
  485. cuda/cccl/headers/include/cuda/std/__algorithm/sift_down.h +139 -0
  486. cuda/cccl/headers/include/cuda/std/__algorithm/sort_heap.h +72 -0
  487. cuda/cccl/headers/include/cuda/std/__algorithm/swap_ranges.h +78 -0
  488. cuda/cccl/headers/include/cuda/std/__algorithm/transform.h +59 -0
  489. cuda/cccl/headers/include/cuda/std/__algorithm/unique.h +77 -0
  490. cuda/cccl/headers/include/cuda/std/__algorithm/unique_copy.h +156 -0
  491. cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_iter.h +96 -0
  492. cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_range.h +127 -0
  493. cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +83 -0
  494. cuda/cccl/headers/include/cuda/std/__algorithm_ +26 -0
  495. cuda/cccl/headers/include/cuda/std/__atomic/api/common.h +192 -0
  496. cuda/cccl/headers/include/cuda/std/__atomic/api/owned.h +138 -0
  497. cuda/cccl/headers/include/cuda/std/__atomic/api/reference.h +118 -0
  498. cuda/cccl/headers/include/cuda/std/__atomic/functions/common.h +58 -0
  499. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_local.h +218 -0
  500. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_derived.h +401 -0
  501. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated.h +3971 -0
  502. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h +177 -0
  503. cuda/cccl/headers/include/cuda/std/__atomic/functions/host.h +211 -0
  504. cuda/cccl/headers/include/cuda/std/__atomic/functions.h +33 -0
  505. cuda/cccl/headers/include/cuda/std/__atomic/order.h +159 -0
  506. cuda/cccl/headers/include/cuda/std/__atomic/platform/msvc_to_builtins.h +654 -0
  507. cuda/cccl/headers/include/cuda/std/__atomic/platform.h +93 -0
  508. cuda/cccl/headers/include/cuda/std/__atomic/scopes.h +105 -0
  509. cuda/cccl/headers/include/cuda/std/__atomic/types/base.h +250 -0
  510. cuda/cccl/headers/include/cuda/std/__atomic/types/common.h +105 -0
  511. cuda/cccl/headers/include/cuda/std/__atomic/types/locked.h +225 -0
  512. cuda/cccl/headers/include/cuda/std/__atomic/types/reference.h +73 -0
  513. cuda/cccl/headers/include/cuda/std/__atomic/types/small.h +228 -0
  514. cuda/cccl/headers/include/cuda/std/__atomic/types.h +52 -0
  515. cuda/cccl/headers/include/cuda/std/__atomic/wait/notify_wait.h +95 -0
  516. cuda/cccl/headers/include/cuda/std/__atomic/wait/polling.h +65 -0
  517. cuda/cccl/headers/include/cuda/std/__barrier/barrier.h +227 -0
  518. cuda/cccl/headers/include/cuda/std/__barrier/empty_completion.h +37 -0
  519. cuda/cccl/headers/include/cuda/std/__barrier/poll_tester.h +84 -0
  520. cuda/cccl/headers/include/cuda/std/__bit/bit_cast.h +77 -0
  521. cuda/cccl/headers/include/cuda/std/__bit/byteswap.h +183 -0
  522. cuda/cccl/headers/include/cuda/std/__bit/countl.h +167 -0
  523. cuda/cccl/headers/include/cuda/std/__bit/countr.h +185 -0
  524. cuda/cccl/headers/include/cuda/std/__bit/endian.h +39 -0
  525. cuda/cccl/headers/include/cuda/std/__bit/has_single_bit.h +43 -0
  526. cuda/cccl/headers/include/cuda/std/__bit/integral.h +124 -0
  527. cuda/cccl/headers/include/cuda/std/__bit/popcount.h +154 -0
  528. cuda/cccl/headers/include/cuda/std/__bit/reference.h +1274 -0
  529. cuda/cccl/headers/include/cuda/std/__bit/rotate.h +94 -0
  530. cuda/cccl/headers/include/cuda/std/__cccl/architecture.h +78 -0
  531. cuda/cccl/headers/include/cuda/std/__cccl/assert.h +146 -0
  532. cuda/cccl/headers/include/cuda/std/__cccl/attributes.h +207 -0
  533. cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +1343 -0
  534. cuda/cccl/headers/include/cuda/std/__cccl/compiler.h +216 -0
  535. cuda/cccl/headers/include/cuda/std/__cccl/cuda_capabilities.h +43 -0
  536. cuda/cccl/headers/include/cuda/std/__cccl/cuda_toolkit.h +53 -0
  537. cuda/cccl/headers/include/cuda/std/__cccl/deprecated.h +69 -0
  538. cuda/cccl/headers/include/cuda/std/__cccl/diagnostic.h +129 -0
  539. cuda/cccl/headers/include/cuda/std/__cccl/dialect.h +124 -0
  540. cuda/cccl/headers/include/cuda/std/__cccl/epilogue.h +326 -0
  541. cuda/cccl/headers/include/cuda/std/__cccl/exceptions.h +35 -0
  542. cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +68 -0
  543. cuda/cccl/headers/include/cuda/std/__cccl/extended_data_types.h +129 -0
  544. cuda/cccl/headers/include/cuda/std/__cccl/is_non_narrowing_convertible.h +73 -0
  545. cuda/cccl/headers/include/cuda/std/__cccl/os.h +48 -0
  546. cuda/cccl/headers/include/cuda/std/__cccl/preprocessor.h +1234 -0
  547. cuda/cccl/headers/include/cuda/std/__cccl/prologue.h +267 -0
  548. cuda/cccl/headers/include/cuda/std/__cccl/ptx_isa.h +176 -0
  549. cuda/cccl/headers/include/cuda/std/__cccl/rtti.h +72 -0
  550. cuda/cccl/headers/include/cuda/std/__cccl/sequence_access.h +87 -0
  551. cuda/cccl/headers/include/cuda/std/__cccl/system_header.h +38 -0
  552. cuda/cccl/headers/include/cuda/std/__cccl/unreachable.h +31 -0
  553. cuda/cccl/headers/include/cuda/std/__cccl/version.h +26 -0
  554. cuda/cccl/headers/include/cuda/std/__cccl/visibility.h +112 -0
  555. cuda/cccl/headers/include/cuda/std/__charconv/chars_format.h +81 -0
  556. cuda/cccl/headers/include/cuda/std/__charconv/from_chars_result.h +56 -0
  557. cuda/cccl/headers/include/cuda/std/__charconv/to_chars.h +148 -0
  558. cuda/cccl/headers/include/cuda/std/__charconv/to_chars_result.h +56 -0
  559. cuda/cccl/headers/include/cuda/std/__charconv_ +30 -0
  560. cuda/cccl/headers/include/cuda/std/__cmath/abs.h +240 -0
  561. cuda/cccl/headers/include/cuda/std/__cmath/copysign.h +187 -0
  562. cuda/cccl/headers/include/cuda/std/__cmath/exponential_functions.h +620 -0
  563. cuda/cccl/headers/include/cuda/std/__cmath/fpclassify.h +207 -0
  564. cuda/cccl/headers/include/cuda/std/__cmath/gamma.h +181 -0
  565. cuda/cccl/headers/include/cuda/std/__cmath/hyperbolic_functions.h +250 -0
  566. cuda/cccl/headers/include/cuda/std/__cmath/hypot.h +213 -0
  567. cuda/cccl/headers/include/cuda/std/__cmath/inverse_hyperbolic_functions.h +250 -0
  568. cuda/cccl/headers/include/cuda/std/__cmath/inverse_trigonometric_functions.h +323 -0
  569. cuda/cccl/headers/include/cuda/std/__cmath/isfinite.h +163 -0
  570. cuda/cccl/headers/include/cuda/std/__cmath/isinf.h +201 -0
  571. cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +176 -0
  572. cuda/cccl/headers/include/cuda/std/__cmath/isnormal.h +129 -0
  573. cuda/cccl/headers/include/cuda/std/__cmath/lerp.h +106 -0
  574. cuda/cccl/headers/include/cuda/std/__cmath/logarithms.h +503 -0
  575. cuda/cccl/headers/include/cuda/std/__cmath/min_max.h +236 -0
  576. cuda/cccl/headers/include/cuda/std/__cmath/nvbf16.h +58 -0
  577. cuda/cccl/headers/include/cuda/std/__cmath/nvfp16.h +58 -0
  578. cuda/cccl/headers/include/cuda/std/__cmath/roots.h +180 -0
  579. cuda/cccl/headers/include/cuda/std/__cmath/rounding_functions.h +877 -0
  580. cuda/cccl/headers/include/cuda/std/__cmath/signbit.h +155 -0
  581. cuda/cccl/headers/include/cuda/std/__cmath/traits.h +170 -0
  582. cuda/cccl/headers/include/cuda/std/__cmath/trigonometric_functions.h +292 -0
  583. cuda/cccl/headers/include/cuda/std/__complex/nvbf16.h +351 -0
  584. cuda/cccl/headers/include/cuda/std/__complex/nvfp16.h +350 -0
  585. cuda/cccl/headers/include/cuda/std/__complex/vector_support.h +135 -0
  586. cuda/cccl/headers/include/cuda/std/__concepts/arithmetic.h +56 -0
  587. cuda/cccl/headers/include/cuda/std/__concepts/assignable.h +64 -0
  588. cuda/cccl/headers/include/cuda/std/__concepts/boolean_testable.h +63 -0
  589. cuda/cccl/headers/include/cuda/std/__concepts/class_or_enum.h +46 -0
  590. cuda/cccl/headers/include/cuda/std/__concepts/common_reference_with.h +69 -0
  591. cuda/cccl/headers/include/cuda/std/__concepts/common_with.h +82 -0
  592. cuda/cccl/headers/include/cuda/std/__concepts/concept_macros.h +274 -0
  593. cuda/cccl/headers/include/cuda/std/__concepts/constructible.h +107 -0
  594. cuda/cccl/headers/include/cuda/std/__concepts/convertible_to.h +71 -0
  595. cuda/cccl/headers/include/cuda/std/__concepts/copyable.h +60 -0
  596. cuda/cccl/headers/include/cuda/std/__concepts/derived_from.h +57 -0
  597. cuda/cccl/headers/include/cuda/std/__concepts/destructible.h +76 -0
  598. cuda/cccl/headers/include/cuda/std/__concepts/different_from.h +38 -0
  599. cuda/cccl/headers/include/cuda/std/__concepts/equality_comparable.h +100 -0
  600. cuda/cccl/headers/include/cuda/std/__concepts/invocable.h +80 -0
  601. cuda/cccl/headers/include/cuda/std/__concepts/movable.h +58 -0
  602. cuda/cccl/headers/include/cuda/std/__concepts/predicate.h +54 -0
  603. cuda/cccl/headers/include/cuda/std/__concepts/regular.h +54 -0
  604. cuda/cccl/headers/include/cuda/std/__concepts/relation.h +77 -0
  605. cuda/cccl/headers/include/cuda/std/__concepts/same_as.h +42 -0
  606. cuda/cccl/headers/include/cuda/std/__concepts/semiregular.h +54 -0
  607. cuda/cccl/headers/include/cuda/std/__concepts/swappable.h +206 -0
  608. cuda/cccl/headers/include/cuda/std/__concepts/totally_ordered.h +101 -0
  609. cuda/cccl/headers/include/cuda/std/__cstddef/byte.h +113 -0
  610. cuda/cccl/headers/include/cuda/std/__cstddef/types.h +52 -0
  611. cuda/cccl/headers/include/cuda/std/__cstdlib/abs.h +57 -0
  612. cuda/cccl/headers/include/cuda/std/__cstdlib/aligned_alloc.h +66 -0
  613. cuda/cccl/headers/include/cuda/std/__cstdlib/div.h +96 -0
  614. cuda/cccl/headers/include/cuda/std/__cstdlib/malloc.h +69 -0
  615. cuda/cccl/headers/include/cuda/std/__cuda/api_wrapper.h +62 -0
  616. cuda/cccl/headers/include/cuda/std/__cuda/ensure_current_device.h +72 -0
  617. cuda/cccl/headers/include/cuda/std/__exception/cuda_error.h +143 -0
  618. cuda/cccl/headers/include/cuda/std/__exception/terminate.h +73 -0
  619. cuda/cccl/headers/include/cuda/std/__execution/env.h +436 -0
  620. cuda/cccl/headers/include/cuda/std/__expected/bad_expected_access.h +127 -0
  621. cuda/cccl/headers/include/cuda/std/__expected/expected.h +2002 -0
  622. cuda/cccl/headers/include/cuda/std/__expected/expected_base.h +1078 -0
  623. cuda/cccl/headers/include/cuda/std/__expected/unexpect.h +37 -0
  624. cuda/cccl/headers/include/cuda/std/__expected/unexpected.h +178 -0
  625. cuda/cccl/headers/include/cuda/std/__floating_point/arithmetic.h +56 -0
  626. cuda/cccl/headers/include/cuda/std/__floating_point/cast.h +809 -0
  627. cuda/cccl/headers/include/cuda/std/__floating_point/cccl_fp.h +125 -0
  628. cuda/cccl/headers/include/cuda/std/__floating_point/common_type.h +48 -0
  629. cuda/cccl/headers/include/cuda/std/__floating_point/constants.h +172 -0
  630. cuda/cccl/headers/include/cuda/std/__floating_point/conversion_rank_order.h +103 -0
  631. cuda/cccl/headers/include/cuda/std/__floating_point/format.h +162 -0
  632. cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +39 -0
  633. cuda/cccl/headers/include/cuda/std/__floating_point/mask.h +64 -0
  634. cuda/cccl/headers/include/cuda/std/__floating_point/native_type.h +81 -0
  635. cuda/cccl/headers/include/cuda/std/__floating_point/nvfp_types.h +58 -0
  636. cuda/cccl/headers/include/cuda/std/__floating_point/overflow_handler.h +139 -0
  637. cuda/cccl/headers/include/cuda/std/__floating_point/properties.h +229 -0
  638. cuda/cccl/headers/include/cuda/std/__floating_point/storage.h +248 -0
  639. cuda/cccl/headers/include/cuda/std/__floating_point/traits.h +172 -0
  640. cuda/cccl/headers/include/cuda/std/__functional/binary_function.h +63 -0
  641. cuda/cccl/headers/include/cuda/std/__functional/binary_negate.h +65 -0
  642. cuda/cccl/headers/include/cuda/std/__functional/bind.h +352 -0
  643. cuda/cccl/headers/include/cuda/std/__functional/bind_back.h +88 -0
  644. cuda/cccl/headers/include/cuda/std/__functional/bind_front.h +73 -0
  645. cuda/cccl/headers/include/cuda/std/__functional/binder1st.h +75 -0
  646. cuda/cccl/headers/include/cuda/std/__functional/binder2nd.h +75 -0
  647. cuda/cccl/headers/include/cuda/std/__functional/compose.h +69 -0
  648. cuda/cccl/headers/include/cuda/std/__functional/default_searcher.h +75 -0
  649. cuda/cccl/headers/include/cuda/std/__functional/function.h +1277 -0
  650. cuda/cccl/headers/include/cuda/std/__functional/hash.h +650 -0
  651. cuda/cccl/headers/include/cuda/std/__functional/identity.h +61 -0
  652. cuda/cccl/headers/include/cuda/std/__functional/invoke.h +560 -0
  653. cuda/cccl/headers/include/cuda/std/__functional/is_transparent.h +43 -0
  654. cuda/cccl/headers/include/cuda/std/__functional/mem_fn.h +67 -0
  655. cuda/cccl/headers/include/cuda/std/__functional/mem_fun_ref.h +214 -0
  656. cuda/cccl/headers/include/cuda/std/__functional/not_fn.h +121 -0
  657. cuda/cccl/headers/include/cuda/std/__functional/operations.h +534 -0
  658. cuda/cccl/headers/include/cuda/std/__functional/perfect_forward.h +127 -0
  659. cuda/cccl/headers/include/cuda/std/__functional/pointer_to_binary_function.h +65 -0
  660. cuda/cccl/headers/include/cuda/std/__functional/pointer_to_unary_function.h +64 -0
  661. cuda/cccl/headers/include/cuda/std/__functional/ranges_operations.h +113 -0
  662. cuda/cccl/headers/include/cuda/std/__functional/reference_wrapper.h +113 -0
  663. cuda/cccl/headers/include/cuda/std/__functional/unary_function.h +62 -0
  664. cuda/cccl/headers/include/cuda/std/__functional/unary_negate.h +67 -0
  665. cuda/cccl/headers/include/cuda/std/__functional/unwrap_ref.h +56 -0
  666. cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +278 -0
  667. cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +35 -0
  668. cuda/cccl/headers/include/cuda/std/__fwd/array.h +36 -0
  669. cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +49 -0
  670. cuda/cccl/headers/include/cuda/std/__fwd/complex.h +34 -0
  671. cuda/cccl/headers/include/cuda/std/__fwd/fp.h +37 -0
  672. cuda/cccl/headers/include/cuda/std/__fwd/get.h +123 -0
  673. cuda/cccl/headers/include/cuda/std/__fwd/hash.h +34 -0
  674. cuda/cccl/headers/include/cuda/std/__fwd/iterator_traits.h +40 -0
  675. cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +73 -0
  676. cuda/cccl/headers/include/cuda/std/__fwd/memory_resource.h +37 -0
  677. cuda/cccl/headers/include/cuda/std/__fwd/pair.h +34 -0
  678. cuda/cccl/headers/include/cuda/std/__fwd/reference_wrapper.h +34 -0
  679. cuda/cccl/headers/include/cuda/std/__fwd/span.h +38 -0
  680. cuda/cccl/headers/include/cuda/std/__fwd/string.h +83 -0
  681. cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +59 -0
  682. cuda/cccl/headers/include/cuda/std/__fwd/subrange.h +55 -0
  683. cuda/cccl/headers/include/cuda/std/__fwd/tuple.h +34 -0
  684. cuda/cccl/headers/include/cuda/std/__internal/cpp_dialect.h +44 -0
  685. cuda/cccl/headers/include/cuda/std/__internal/features.h +71 -0
  686. cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +102 -0
  687. cuda/cccl/headers/include/cuda/std/__iterator/access.h +132 -0
  688. cuda/cccl/headers/include/cuda/std/__iterator/advance.h +230 -0
  689. cuda/cccl/headers/include/cuda/std/__iterator/back_insert_iterator.h +103 -0
  690. cuda/cccl/headers/include/cuda/std/__iterator/bounded_iter.h +264 -0
  691. cuda/cccl/headers/include/cuda/std/__iterator/concepts.h +608 -0
  692. cuda/cccl/headers/include/cuda/std/__iterator/counted_iterator.h +469 -0
  693. cuda/cccl/headers/include/cuda/std/__iterator/data.h +63 -0
  694. cuda/cccl/headers/include/cuda/std/__iterator/default_sentinel.h +36 -0
  695. cuda/cccl/headers/include/cuda/std/__iterator/distance.h +126 -0
  696. cuda/cccl/headers/include/cuda/std/__iterator/empty.h +54 -0
  697. cuda/cccl/headers/include/cuda/std/__iterator/erase_if_container.h +53 -0
  698. cuda/cccl/headers/include/cuda/std/__iterator/front_insert_iterator.h +98 -0
  699. cuda/cccl/headers/include/cuda/std/__iterator/incrementable_traits.h +152 -0
  700. cuda/cccl/headers/include/cuda/std/__iterator/indirectly_comparable.h +55 -0
  701. cuda/cccl/headers/include/cuda/std/__iterator/insert_iterator.h +105 -0
  702. cuda/cccl/headers/include/cuda/std/__iterator/istream_iterator.h +141 -0
  703. cuda/cccl/headers/include/cuda/std/__iterator/istreambuf_iterator.h +161 -0
  704. cuda/cccl/headers/include/cuda/std/__iterator/iter_move.h +161 -0
  705. cuda/cccl/headers/include/cuda/std/__iterator/iter_swap.h +163 -0
  706. cuda/cccl/headers/include/cuda/std/__iterator/iterator.h +44 -0
  707. cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +935 -0
  708. cuda/cccl/headers/include/cuda/std/__iterator/mergeable.h +72 -0
  709. cuda/cccl/headers/include/cuda/std/__iterator/move_iterator.h +401 -0
  710. cuda/cccl/headers/include/cuda/std/__iterator/move_sentinel.h +73 -0
  711. cuda/cccl/headers/include/cuda/std/__iterator/next.h +102 -0
  712. cuda/cccl/headers/include/cuda/std/__iterator/ostream_iterator.h +99 -0
  713. cuda/cccl/headers/include/cuda/std/__iterator/ostreambuf_iterator.h +101 -0
  714. cuda/cccl/headers/include/cuda/std/__iterator/permutable.h +54 -0
  715. cuda/cccl/headers/include/cuda/std/__iterator/prev.h +92 -0
  716. cuda/cccl/headers/include/cuda/std/__iterator/projected.h +61 -0
  717. cuda/cccl/headers/include/cuda/std/__iterator/readable_traits.h +185 -0
  718. cuda/cccl/headers/include/cuda/std/__iterator/reverse_access.h +146 -0
  719. cuda/cccl/headers/include/cuda/std/__iterator/reverse_iterator.h +615 -0
  720. cuda/cccl/headers/include/cuda/std/__iterator/size.h +69 -0
  721. cuda/cccl/headers/include/cuda/std/__iterator/sortable.h +55 -0
  722. cuda/cccl/headers/include/cuda/std/__iterator/unreachable_sentinel.h +88 -0
  723. cuda/cccl/headers/include/cuda/std/__iterator/wrap_iter.h +259 -0
  724. cuda/cccl/headers/include/cuda/std/__latch/latch.h +88 -0
  725. cuda/cccl/headers/include/cuda/std/__limits/numeric_limits.h +617 -0
  726. cuda/cccl/headers/include/cuda/std/__limits/numeric_limits_ext.h +781 -0
  727. cuda/cccl/headers/include/cuda/std/__linalg/conj_if_needed.h +78 -0
  728. cuda/cccl/headers/include/cuda/std/__linalg/conjugate_transposed.h +55 -0
  729. cuda/cccl/headers/include/cuda/std/__linalg/conjugated.h +140 -0
  730. cuda/cccl/headers/include/cuda/std/__linalg/scaled.h +134 -0
  731. cuda/cccl/headers/include/cuda/std/__linalg/transposed.h +328 -0
  732. cuda/cccl/headers/include/cuda/std/__mdspan/aligned_accessor.h +100 -0
  733. cuda/cccl/headers/include/cuda/std/__mdspan/concepts.h +139 -0
  734. cuda/cccl/headers/include/cuda/std/__mdspan/default_accessor.h +74 -0
  735. cuda/cccl/headers/include/cuda/std/__mdspan/empty_base.h +363 -0
  736. cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +765 -0
  737. cuda/cccl/headers/include/cuda/std/__mdspan/layout_left.h +317 -0
  738. cuda/cccl/headers/include/cuda/std/__mdspan/layout_right.h +310 -0
  739. cuda/cccl/headers/include/cuda/std/__mdspan/layout_stride.h +615 -0
  740. cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +512 -0
  741. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_extents.h +193 -0
  742. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_helper.h +190 -0
  743. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_mapping.h +347 -0
  744. cuda/cccl/headers/include/cuda/std/__memory/addressof.h +64 -0
  745. cuda/cccl/headers/include/cuda/std/__memory/align.h +87 -0
  746. cuda/cccl/headers/include/cuda/std/__memory/allocate_at_least.h +81 -0
  747. cuda/cccl/headers/include/cuda/std/__memory/allocation_guard.h +100 -0
  748. cuda/cccl/headers/include/cuda/std/__memory/allocator.h +320 -0
  749. cuda/cccl/headers/include/cuda/std/__memory/allocator_arg_t.h +84 -0
  750. cuda/cccl/headers/include/cuda/std/__memory/allocator_destructor.h +59 -0
  751. cuda/cccl/headers/include/cuda/std/__memory/allocator_traits.h +569 -0
  752. cuda/cccl/headers/include/cuda/std/__memory/assume_aligned.h +60 -0
  753. cuda/cccl/headers/include/cuda/std/__memory/builtin_new_allocator.h +87 -0
  754. cuda/cccl/headers/include/cuda/std/__memory/compressed_pair.h +231 -0
  755. cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +248 -0
  756. cuda/cccl/headers/include/cuda/std/__memory/destruct_n.h +91 -0
  757. cuda/cccl/headers/include/cuda/std/__memory/is_sufficiently_aligned.h +43 -0
  758. cuda/cccl/headers/include/cuda/std/__memory/pointer_traits.h +260 -0
  759. cuda/cccl/headers/include/cuda/std/__memory/temporary_buffer.h +92 -0
  760. cuda/cccl/headers/include/cuda/std/__memory/uninitialized_algorithms.h +686 -0
  761. cuda/cccl/headers/include/cuda/std/__memory/unique_ptr.h +771 -0
  762. cuda/cccl/headers/include/cuda/std/__memory/uses_allocator.h +55 -0
  763. cuda/cccl/headers/include/cuda/std/__memory/voidify.h +41 -0
  764. cuda/cccl/headers/include/cuda/std/__memory_ +34 -0
  765. cuda/cccl/headers/include/cuda/std/__new/allocate.h +126 -0
  766. cuda/cccl/headers/include/cuda/std/__new/bad_alloc.h +57 -0
  767. cuda/cccl/headers/include/cuda/std/__new/launder.h +49 -0
  768. cuda/cccl/headers/include/cuda/std/__new_ +29 -0
  769. cuda/cccl/headers/include/cuda/std/__numeric/accumulate.h +57 -0
  770. cuda/cccl/headers/include/cuda/std/__numeric/adjacent_difference.h +72 -0
  771. cuda/cccl/headers/include/cuda/std/__numeric/exclusive_scan.h +66 -0
  772. cuda/cccl/headers/include/cuda/std/__numeric/gcd_lcm.h +80 -0
  773. cuda/cccl/headers/include/cuda/std/__numeric/inclusive_scan.h +73 -0
  774. cuda/cccl/headers/include/cuda/std/__numeric/inner_product.h +62 -0
  775. cuda/cccl/headers/include/cuda/std/__numeric/iota.h +42 -0
  776. cuda/cccl/headers/include/cuda/std/__numeric/midpoint.h +100 -0
  777. cuda/cccl/headers/include/cuda/std/__numeric/partial_sum.h +70 -0
  778. cuda/cccl/headers/include/cuda/std/__numeric/reduce.h +61 -0
  779. cuda/cccl/headers/include/cuda/std/__numeric/transform_exclusive_scan.h +51 -0
  780. cuda/cccl/headers/include/cuda/std/__numeric/transform_inclusive_scan.h +65 -0
  781. cuda/cccl/headers/include/cuda/std/__numeric/transform_reduce.h +72 -0
  782. cuda/cccl/headers/include/cuda/std/__ranges/access.h +304 -0
  783. cuda/cccl/headers/include/cuda/std/__ranges/all.h +97 -0
  784. cuda/cccl/headers/include/cuda/std/__ranges/concepts.h +313 -0
  785. cuda/cccl/headers/include/cuda/std/__ranges/counted.h +90 -0
  786. cuda/cccl/headers/include/cuda/std/__ranges/dangling.h +54 -0
  787. cuda/cccl/headers/include/cuda/std/__ranges/data.h +136 -0
  788. cuda/cccl/headers/include/cuda/std/__ranges/empty.h +111 -0
  789. cuda/cccl/headers/include/cuda/std/__ranges/empty_view.h +77 -0
  790. cuda/cccl/headers/include/cuda/std/__ranges/enable_borrowed_range.h +41 -0
  791. cuda/cccl/headers/include/cuda/std/__ranges/enable_view.h +77 -0
  792. cuda/cccl/headers/include/cuda/std/__ranges/from_range.h +36 -0
  793. cuda/cccl/headers/include/cuda/std/__ranges/iota_view.h +271 -0
  794. cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +410 -0
  795. cuda/cccl/headers/include/cuda/std/__ranges/owning_view.h +161 -0
  796. cuda/cccl/headers/include/cuda/std/__ranges/range_adaptor.h +114 -0
  797. cuda/cccl/headers/include/cuda/std/__ranges/rbegin.h +175 -0
  798. cuda/cccl/headers/include/cuda/std/__ranges/ref_view.h +121 -0
  799. cuda/cccl/headers/include/cuda/std/__ranges/rend.h +182 -0
  800. cuda/cccl/headers/include/cuda/std/__ranges/repeat_view.h +343 -0
  801. cuda/cccl/headers/include/cuda/std/__ranges/single_view.h +156 -0
  802. cuda/cccl/headers/include/cuda/std/__ranges/size.h +200 -0
  803. cuda/cccl/headers/include/cuda/std/__ranges/subrange.h +513 -0
  804. cuda/cccl/headers/include/cuda/std/__ranges/take_while_view.h +263 -0
  805. cuda/cccl/headers/include/cuda/std/__ranges/transform_view.h +531 -0
  806. cuda/cccl/headers/include/cuda/std/__ranges/unwrap_end.h +53 -0
  807. cuda/cccl/headers/include/cuda/std/__ranges/view_interface.h +181 -0
  808. cuda/cccl/headers/include/cuda/std/__ranges/views.h +38 -0
  809. cuda/cccl/headers/include/cuda/std/__semaphore/atomic_semaphore.h +233 -0
  810. cuda/cccl/headers/include/cuda/std/__semaphore/counting_semaphore.h +51 -0
  811. cuda/cccl/headers/include/cuda/std/__string/char_traits.h +191 -0
  812. cuda/cccl/headers/include/cuda/std/__string/constexpr_c_functions.h +591 -0
  813. cuda/cccl/headers/include/cuda/std/__string/helper_functions.h +299 -0
  814. cuda/cccl/headers/include/cuda/std/__string/string_view.h +244 -0
  815. cuda/cccl/headers/include/cuda/std/__string_ +29 -0
  816. cuda/cccl/headers/include/cuda/std/__system_error/errc.h +51 -0
  817. cuda/cccl/headers/include/cuda/std/__system_error_ +26 -0
  818. cuda/cccl/headers/include/cuda/std/__thread/threading_support.h +105 -0
  819. cuda/cccl/headers/include/cuda/std/__thread/threading_support_cuda.h +47 -0
  820. cuda/cccl/headers/include/cuda/std/__thread/threading_support_external.h +41 -0
  821. cuda/cccl/headers/include/cuda/std/__thread/threading_support_pthread.h +144 -0
  822. cuda/cccl/headers/include/cuda/std/__thread/threading_support_win32.h +87 -0
  823. cuda/cccl/headers/include/cuda/std/__tuple_dir/ignore.h +51 -0
  824. cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +98 -0
  825. cuda/cccl/headers/include/cuda/std/__tuple_dir/sfinae_helpers.h +236 -0
  826. cuda/cccl/headers/include/cuda/std/__tuple_dir/structured_bindings.h +216 -0
  827. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_element.h +70 -0
  828. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_indices.h +44 -0
  829. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +90 -0
  830. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +73 -0
  831. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_size.h +79 -0
  832. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_types.h +35 -0
  833. cuda/cccl/headers/include/cuda/std/__tuple_dir/vector_types.h +242 -0
  834. cuda/cccl/headers/include/cuda/std/__type_traits/add_const.h +40 -0
  835. cuda/cccl/headers/include/cuda/std/__type_traits/add_cv.h +40 -0
  836. cuda/cccl/headers/include/cuda/std/__type_traits/add_lvalue_reference.h +62 -0
  837. cuda/cccl/headers/include/cuda/std/__type_traits/add_pointer.h +65 -0
  838. cuda/cccl/headers/include/cuda/std/__type_traits/add_rvalue_reference.h +62 -0
  839. cuda/cccl/headers/include/cuda/std/__type_traits/add_volatile.h +40 -0
  840. cuda/cccl/headers/include/cuda/std/__type_traits/aligned_storage.h +149 -0
  841. cuda/cccl/headers/include/cuda/std/__type_traits/aligned_union.h +62 -0
  842. cuda/cccl/headers/include/cuda/std/__type_traits/alignment_of.h +41 -0
  843. cuda/cccl/headers/include/cuda/std/__type_traits/always_false.h +35 -0
  844. cuda/cccl/headers/include/cuda/std/__type_traits/can_extract_key.h +69 -0
  845. cuda/cccl/headers/include/cuda/std/__type_traits/common_reference.h +262 -0
  846. cuda/cccl/headers/include/cuda/std/__type_traits/common_type.h +174 -0
  847. cuda/cccl/headers/include/cuda/std/__type_traits/conditional.h +65 -0
  848. cuda/cccl/headers/include/cuda/std/__type_traits/conjunction.h +67 -0
  849. cuda/cccl/headers/include/cuda/std/__type_traits/copy_cv.h +50 -0
  850. cuda/cccl/headers/include/cuda/std/__type_traits/copy_cvref.h +148 -0
  851. cuda/cccl/headers/include/cuda/std/__type_traits/decay.h +83 -0
  852. cuda/cccl/headers/include/cuda/std/__type_traits/dependent_type.h +35 -0
  853. cuda/cccl/headers/include/cuda/std/__type_traits/disjunction.h +77 -0
  854. cuda/cccl/headers/include/cuda/std/__type_traits/enable_if.h +43 -0
  855. cuda/cccl/headers/include/cuda/std/__type_traits/extent.h +68 -0
  856. cuda/cccl/headers/include/cuda/std/__type_traits/fold.h +47 -0
  857. cuda/cccl/headers/include/cuda/std/__type_traits/has_unique_object_representation.h +47 -0
  858. cuda/cccl/headers/include/cuda/std/__type_traits/has_virtual_destructor.h +51 -0
  859. cuda/cccl/headers/include/cuda/std/__type_traits/integral_constant.h +62 -0
  860. cuda/cccl/headers/include/cuda/std/__type_traits/is_abstract.h +40 -0
  861. cuda/cccl/headers/include/cuda/std/__type_traits/is_aggregate.h +44 -0
  862. cuda/cccl/headers/include/cuda/std/__type_traits/is_allocator.h +46 -0
  863. cuda/cccl/headers/include/cuda/std/__type_traits/is_arithmetic.h +42 -0
  864. cuda/cccl/headers/include/cuda/std/__type_traits/is_array.h +62 -0
  865. cuda/cccl/headers/include/cuda/std/__type_traits/is_assignable.h +78 -0
  866. cuda/cccl/headers/include/cuda/std/__type_traits/is_base_of.h +83 -0
  867. cuda/cccl/headers/include/cuda/std/__type_traits/is_bounded_array.h +44 -0
  868. cuda/cccl/headers/include/cuda/std/__type_traits/is_callable.h +60 -0
  869. cuda/cccl/headers/include/cuda/std/__type_traits/is_char_like_type.h +38 -0
  870. cuda/cccl/headers/include/cuda/std/__type_traits/is_class.h +68 -0
  871. cuda/cccl/headers/include/cuda/std/__type_traits/is_compound.h +54 -0
  872. cuda/cccl/headers/include/cuda/std/__type_traits/is_const.h +56 -0
  873. cuda/cccl/headers/include/cuda/std/__type_traits/is_constant_evaluated.h +51 -0
  874. cuda/cccl/headers/include/cuda/std/__type_traits/is_constructible.h +174 -0
  875. cuda/cccl/headers/include/cuda/std/__type_traits/is_convertible.h +214 -0
  876. cuda/cccl/headers/include/cuda/std/__type_traits/is_copy_assignable.h +43 -0
  877. cuda/cccl/headers/include/cuda/std/__type_traits/is_copy_constructible.h +43 -0
  878. cuda/cccl/headers/include/cuda/std/__type_traits/is_core_convertible.h +47 -0
  879. cuda/cccl/headers/include/cuda/std/__type_traits/is_corresponding_member.h +43 -0
  880. cuda/cccl/headers/include/cuda/std/__type_traits/is_default_constructible.h +40 -0
  881. cuda/cccl/headers/include/cuda/std/__type_traits/is_destructible.h +115 -0
  882. cuda/cccl/headers/include/cuda/std/__type_traits/is_empty.h +73 -0
  883. cuda/cccl/headers/include/cuda/std/__type_traits/is_enum.h +68 -0
  884. cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_arithmetic.h +38 -0
  885. cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_floating_point.h +81 -0
  886. cuda/cccl/headers/include/cuda/std/__type_traits/is_final.h +56 -0
  887. cuda/cccl/headers/include/cuda/std/__type_traits/is_floating_point.h +53 -0
  888. cuda/cccl/headers/include/cuda/std/__type_traits/is_function.h +61 -0
  889. cuda/cccl/headers/include/cuda/std/__type_traits/is_fundamental.h +56 -0
  890. cuda/cccl/headers/include/cuda/std/__type_traits/is_implicitly_default_constructible.h +57 -0
  891. cuda/cccl/headers/include/cuda/std/__type_traits/is_integer.h +45 -0
  892. cuda/cccl/headers/include/cuda/std/__type_traits/is_integral.h +123 -0
  893. cuda/cccl/headers/include/cuda/std/__type_traits/is_layout_compatible.h +45 -0
  894. cuda/cccl/headers/include/cuda/std/__type_traits/is_literal_type.h +59 -0
  895. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_function_pointer.h +79 -0
  896. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_object_pointer.h +57 -0
  897. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_pointer.h +57 -0
  898. cuda/cccl/headers/include/cuda/std/__type_traits/is_move_assignable.h +43 -0
  899. cuda/cccl/headers/include/cuda/std/__type_traits/is_move_constructible.h +42 -0
  900. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_assignable.h +70 -0
  901. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_constructible.h +84 -0
  902. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_convertible.h +59 -0
  903. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_copy_assignable.h +60 -0
  904. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_copy_constructible.h +43 -0
  905. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_default_constructible.h +54 -0
  906. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_destructible.h +79 -0
  907. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_move_assignable.h +60 -0
  908. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_move_constructible.h +42 -0
  909. cuda/cccl/headers/include/cuda/std/__type_traits/is_null_pointer.h +43 -0
  910. cuda/cccl/headers/include/cuda/std/__type_traits/is_object.h +57 -0
  911. cuda/cccl/headers/include/cuda/std/__type_traits/is_one_of.h +37 -0
  912. cuda/cccl/headers/include/cuda/std/__type_traits/is_pod.h +62 -0
  913. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer.h +60 -0
  914. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer_interconvertible_base_of.h +87 -0
  915. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer_interconvertible_with_class.h +43 -0
  916. cuda/cccl/headers/include/cuda/std/__type_traits/is_polymorphic.h +63 -0
  917. cuda/cccl/headers/include/cuda/std/__type_traits/is_primary_template.h +119 -0
  918. cuda/cccl/headers/include/cuda/std/__type_traits/is_reference.h +95 -0
  919. cuda/cccl/headers/include/cuda/std/__type_traits/is_reference_wrapper.h +50 -0
  920. cuda/cccl/headers/include/cuda/std/__type_traits/is_referenceable.h +55 -0
  921. cuda/cccl/headers/include/cuda/std/__type_traits/is_same.h +84 -0
  922. cuda/cccl/headers/include/cuda/std/__type_traits/is_scalar.h +60 -0
  923. cuda/cccl/headers/include/cuda/std/__type_traits/is_scoped_enum.h +49 -0
  924. cuda/cccl/headers/include/cuda/std/__type_traits/is_signed.h +65 -0
  925. cuda/cccl/headers/include/cuda/std/__type_traits/is_signed_integer.h +59 -0
  926. cuda/cccl/headers/include/cuda/std/__type_traits/is_standard_layout.h +57 -0
  927. cuda/cccl/headers/include/cuda/std/__type_traits/is_swappable.h +203 -0
  928. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivial.h +56 -0
  929. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_assignable.h +70 -0
  930. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_constructible.h +82 -0
  931. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_assignable.h +60 -0
  932. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_constructible.h +61 -0
  933. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copyable.h +56 -0
  934. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_default_constructible.h +55 -0
  935. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_destructible.h +73 -0
  936. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_assignable.h +60 -0
  937. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_constructible.h +58 -0
  938. cuda/cccl/headers/include/cuda/std/__type_traits/is_unbounded_array.h +43 -0
  939. cuda/cccl/headers/include/cuda/std/__type_traits/is_union.h +57 -0
  940. cuda/cccl/headers/include/cuda/std/__type_traits/is_unsigned.h +66 -0
  941. cuda/cccl/headers/include/cuda/std/__type_traits/is_unsigned_integer.h +59 -0
  942. cuda/cccl/headers/include/cuda/std/__type_traits/is_valid_expansion.h +41 -0
  943. cuda/cccl/headers/include/cuda/std/__type_traits/is_void.h +55 -0
  944. cuda/cccl/headers/include/cuda/std/__type_traits/is_volatile.h +56 -0
  945. cuda/cccl/headers/include/cuda/std/__type_traits/lazy.h +35 -0
  946. cuda/cccl/headers/include/cuda/std/__type_traits/make_const_lvalue_ref.h +36 -0
  947. cuda/cccl/headers/include/cuda/std/__type_traits/make_nbit_int.h +107 -0
  948. cuda/cccl/headers/include/cuda/std/__type_traits/make_signed.h +140 -0
  949. cuda/cccl/headers/include/cuda/std/__type_traits/make_unsigned.h +151 -0
  950. cuda/cccl/headers/include/cuda/std/__type_traits/maybe_const.h +36 -0
  951. cuda/cccl/headers/include/cuda/std/__type_traits/nat.h +39 -0
  952. cuda/cccl/headers/include/cuda/std/__type_traits/negation.h +44 -0
  953. cuda/cccl/headers/include/cuda/std/__type_traits/num_bits.h +123 -0
  954. cuda/cccl/headers/include/cuda/std/__type_traits/promote.h +163 -0
  955. cuda/cccl/headers/include/cuda/std/__type_traits/rank.h +60 -0
  956. cuda/cccl/headers/include/cuda/std/__type_traits/reference_constructs_from_temporary.h +57 -0
  957. cuda/cccl/headers/include/cuda/std/__type_traits/reference_converts_from_temporary.h +56 -0
  958. cuda/cccl/headers/include/cuda/std/__type_traits/remove_all_extents.h +66 -0
  959. cuda/cccl/headers/include/cuda/std/__type_traits/remove_const.h +59 -0
  960. cuda/cccl/headers/include/cuda/std/__type_traits/remove_const_ref.h +37 -0
  961. cuda/cccl/headers/include/cuda/std/__type_traits/remove_cv.h +57 -0
  962. cuda/cccl/headers/include/cuda/std/__type_traits/remove_cvref.h +57 -0
  963. cuda/cccl/headers/include/cuda/std/__type_traits/remove_extent.h +65 -0
  964. cuda/cccl/headers/include/cuda/std/__type_traits/remove_pointer.h +73 -0
  965. cuda/cccl/headers/include/cuda/std/__type_traits/remove_reference.h +72 -0
  966. cuda/cccl/headers/include/cuda/std/__type_traits/remove_volatile.h +58 -0
  967. cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +47 -0
  968. cuda/cccl/headers/include/cuda/std/__type_traits/type_identity.h +40 -0
  969. cuda/cccl/headers/include/cuda/std/__type_traits/type_list.h +1069 -0
  970. cuda/cccl/headers/include/cuda/std/__type_traits/type_set.h +132 -0
  971. cuda/cccl/headers/include/cuda/std/__type_traits/underlying_type.h +66 -0
  972. cuda/cccl/headers/include/cuda/std/__type_traits/void_t.h +34 -0
  973. cuda/cccl/headers/include/cuda/std/__utility/as_const.h +52 -0
  974. cuda/cccl/headers/include/cuda/std/__utility/auto_cast.h +32 -0
  975. cuda/cccl/headers/include/cuda/std/__utility/cmp.h +116 -0
  976. cuda/cccl/headers/include/cuda/std/__utility/convert_to_integral.h +103 -0
  977. cuda/cccl/headers/include/cuda/std/__utility/declval.h +63 -0
  978. cuda/cccl/headers/include/cuda/std/__utility/exception_guard.h +162 -0
  979. cuda/cccl/headers/include/cuda/std/__utility/exchange.h +46 -0
  980. cuda/cccl/headers/include/cuda/std/__utility/forward.h +59 -0
  981. cuda/cccl/headers/include/cuda/std/__utility/forward_like.h +56 -0
  982. cuda/cccl/headers/include/cuda/std/__utility/in_place.h +77 -0
  983. cuda/cccl/headers/include/cuda/std/__utility/integer_sequence.h +251 -0
  984. cuda/cccl/headers/include/cuda/std/__utility/monostate.h +99 -0
  985. cuda/cccl/headers/include/cuda/std/__utility/move.h +75 -0
  986. cuda/cccl/headers/include/cuda/std/__utility/pair.h +808 -0
  987. cuda/cccl/headers/include/cuda/std/__utility/piecewise_construct.h +37 -0
  988. cuda/cccl/headers/include/cuda/std/__utility/pod_tuple.h +763 -0
  989. cuda/cccl/headers/include/cuda/std/__utility/priority_tag.h +40 -0
  990. cuda/cccl/headers/include/cuda/std/__utility/rel_ops.h +63 -0
  991. cuda/cccl/headers/include/cuda/std/__utility/swap.h +65 -0
  992. cuda/cccl/headers/include/cuda/std/__utility/to_underlying.h +40 -0
  993. cuda/cccl/headers/include/cuda/std/__utility/typeid.h +425 -0
  994. cuda/cccl/headers/include/cuda/std/__utility/unreachable.h +37 -0
  995. cuda/cccl/headers/include/cuda/std/array +527 -0
  996. cuda/cccl/headers/include/cuda/std/atomic +823 -0
  997. cuda/cccl/headers/include/cuda/std/barrier +43 -0
  998. cuda/cccl/headers/include/cuda/std/bit +35 -0
  999. cuda/cccl/headers/include/cuda/std/bitset +1026 -0
  1000. cuda/cccl/headers/include/cuda/std/cassert +28 -0
  1001. cuda/cccl/headers/include/cuda/std/ccomplex +15 -0
  1002. cuda/cccl/headers/include/cuda/std/cfloat +59 -0
  1003. cuda/cccl/headers/include/cuda/std/chrono +26 -0
  1004. cuda/cccl/headers/include/cuda/std/climits +61 -0
  1005. cuda/cccl/headers/include/cuda/std/cmath +25 -0
  1006. cuda/cccl/headers/include/cuda/std/complex +25 -0
  1007. cuda/cccl/headers/include/cuda/std/concepts +48 -0
  1008. cuda/cccl/headers/include/cuda/std/cstddef +28 -0
  1009. cuda/cccl/headers/include/cuda/std/cstdint +178 -0
  1010. cuda/cccl/headers/include/cuda/std/cstdlib +30 -0
  1011. cuda/cccl/headers/include/cuda/std/cstring +111 -0
  1012. cuda/cccl/headers/include/cuda/std/ctime +147 -0
  1013. cuda/cccl/headers/include/cuda/std/detail/__config +45 -0
  1014. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +258 -0
  1015. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/algorithm +2692 -0
  1016. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/chrono +3689 -0
  1017. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/cmath +685 -0
  1018. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/complex +1610 -0
  1019. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/iosfwd +128 -0
  1020. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/optional +1786 -0
  1021. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/stdexcept +120 -0
  1022. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/tuple +1378 -0
  1023. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/variant +2160 -0
  1024. cuda/cccl/headers/include/cuda/std/execution +27 -0
  1025. cuda/cccl/headers/include/cuda/std/expected +30 -0
  1026. cuda/cccl/headers/include/cuda/std/functional +56 -0
  1027. cuda/cccl/headers/include/cuda/std/initializer_list +36 -0
  1028. cuda/cccl/headers/include/cuda/std/inplace_vector +2171 -0
  1029. cuda/cccl/headers/include/cuda/std/iterator +70 -0
  1030. cuda/cccl/headers/include/cuda/std/latch +34 -0
  1031. cuda/cccl/headers/include/cuda/std/limits +28 -0
  1032. cuda/cccl/headers/include/cuda/std/linalg +30 -0
  1033. cuda/cccl/headers/include/cuda/std/mdspan +38 -0
  1034. cuda/cccl/headers/include/cuda/std/memory +39 -0
  1035. cuda/cccl/headers/include/cuda/std/numbers +335 -0
  1036. cuda/cccl/headers/include/cuda/std/numeric +41 -0
  1037. cuda/cccl/headers/include/cuda/std/optional +25 -0
  1038. cuda/cccl/headers/include/cuda/std/ranges +68 -0
  1039. cuda/cccl/headers/include/cuda/std/ratio +417 -0
  1040. cuda/cccl/headers/include/cuda/std/semaphore +31 -0
  1041. cuda/cccl/headers/include/cuda/std/source_location +83 -0
  1042. cuda/cccl/headers/include/cuda/std/span +640 -0
  1043. cuda/cccl/headers/include/cuda/std/string_view +814 -0
  1044. cuda/cccl/headers/include/cuda/std/tuple +26 -0
  1045. cuda/cccl/headers/include/cuda/std/type_traits +176 -0
  1046. cuda/cccl/headers/include/cuda/std/utility +70 -0
  1047. cuda/cccl/headers/include/cuda/std/variant +25 -0
  1048. cuda/cccl/headers/include/cuda/std/version +245 -0
  1049. cuda/cccl/headers/include/cuda/stream_ref +54 -0
  1050. cuda/cccl/headers/include/cuda/type_traits +27 -0
  1051. cuda/cccl/headers/include/cuda/version +16 -0
  1052. cuda/cccl/headers/include/cuda/warp +28 -0
  1053. cuda/cccl/headers/include/cuda/work_stealing +26 -0
  1054. cuda/cccl/headers/include/nv/detail/__preprocessor +169 -0
  1055. cuda/cccl/headers/include/nv/detail/__target_macros +599 -0
  1056. cuda/cccl/headers/include/nv/target +229 -0
  1057. cuda/cccl/headers/include/thrust/addressof.h +22 -0
  1058. cuda/cccl/headers/include/thrust/adjacent_difference.h +254 -0
  1059. cuda/cccl/headers/include/thrust/advance.h +59 -0
  1060. cuda/cccl/headers/include/thrust/allocate_unique.h +299 -0
  1061. cuda/cccl/headers/include/thrust/binary_search.h +1910 -0
  1062. cuda/cccl/headers/include/thrust/complex.h +859 -0
  1063. cuda/cccl/headers/include/thrust/copy.h +506 -0
  1064. cuda/cccl/headers/include/thrust/count.h +245 -0
  1065. cuda/cccl/headers/include/thrust/detail/adjacent_difference.inl +95 -0
  1066. cuda/cccl/headers/include/thrust/detail/algorithm_wrapper.h +37 -0
  1067. cuda/cccl/headers/include/thrust/detail/alignment.h +81 -0
  1068. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +350 -0
  1069. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.inl +371 -0
  1070. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +45 -0
  1071. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.inl +242 -0
  1072. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +39 -0
  1073. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.inl +137 -0
  1074. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +39 -0
  1075. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.inl +99 -0
  1076. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +53 -0
  1077. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.inl +68 -0
  1078. cuda/cccl/headers/include/thrust/detail/allocator/no_throw_allocator.h +76 -0
  1079. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +102 -0
  1080. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.inl +86 -0
  1081. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +79 -0
  1082. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.inl +81 -0
  1083. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.h +39 -0
  1084. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.inl +98 -0
  1085. cuda/cccl/headers/include/thrust/detail/allocator_aware_execution_policy.h +99 -0
  1086. cuda/cccl/headers/include/thrust/detail/binary_search.inl +525 -0
  1087. cuda/cccl/headers/include/thrust/detail/caching_allocator.h +47 -0
  1088. cuda/cccl/headers/include/thrust/detail/complex/arithmetic.h +255 -0
  1089. cuda/cccl/headers/include/thrust/detail/complex/c99math.h +64 -0
  1090. cuda/cccl/headers/include/thrust/detail/complex/catrig.h +875 -0
  1091. cuda/cccl/headers/include/thrust/detail/complex/catrigf.h +589 -0
  1092. cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +233 -0
  1093. cuda/cccl/headers/include/thrust/detail/complex/ccoshf.h +161 -0
  1094. cuda/cccl/headers/include/thrust/detail/complex/cexp.h +195 -0
  1095. cuda/cccl/headers/include/thrust/detail/complex/cexpf.h +173 -0
  1096. cuda/cccl/headers/include/thrust/detail/complex/clog.h +223 -0
  1097. cuda/cccl/headers/include/thrust/detail/complex/clogf.h +210 -0
  1098. cuda/cccl/headers/include/thrust/detail/complex/complex.inl +263 -0
  1099. cuda/cccl/headers/include/thrust/detail/complex/cpow.h +50 -0
  1100. cuda/cccl/headers/include/thrust/detail/complex/cproj.h +81 -0
  1101. cuda/cccl/headers/include/thrust/detail/complex/csinh.h +228 -0
  1102. cuda/cccl/headers/include/thrust/detail/complex/csinhf.h +168 -0
  1103. cuda/cccl/headers/include/thrust/detail/complex/csqrt.h +178 -0
  1104. cuda/cccl/headers/include/thrust/detail/complex/csqrtf.h +174 -0
  1105. cuda/cccl/headers/include/thrust/detail/complex/ctanh.h +208 -0
  1106. cuda/cccl/headers/include/thrust/detail/complex/ctanhf.h +133 -0
  1107. cuda/cccl/headers/include/thrust/detail/complex/math_private.h +138 -0
  1108. cuda/cccl/headers/include/thrust/detail/complex/stream.h +73 -0
  1109. cuda/cccl/headers/include/thrust/detail/config/compiler.h +38 -0
  1110. cuda/cccl/headers/include/thrust/detail/config/config.h +43 -0
  1111. cuda/cccl/headers/include/thrust/detail/config/cpp_dialect.h +78 -0
  1112. cuda/cccl/headers/include/thrust/detail/config/device_system.h +55 -0
  1113. cuda/cccl/headers/include/thrust/detail/config/host_system.h +48 -0
  1114. cuda/cccl/headers/include/thrust/detail/config/memory_resource.h +41 -0
  1115. cuda/cccl/headers/include/thrust/detail/config/namespace.h +162 -0
  1116. cuda/cccl/headers/include/thrust/detail/config/simple_defines.h +48 -0
  1117. cuda/cccl/headers/include/thrust/detail/config.h +36 -0
  1118. cuda/cccl/headers/include/thrust/detail/contiguous_storage.h +228 -0
  1119. cuda/cccl/headers/include/thrust/detail/contiguous_storage.inl +273 -0
  1120. cuda/cccl/headers/include/thrust/detail/copy.h +72 -0
  1121. cuda/cccl/headers/include/thrust/detail/copy.inl +129 -0
  1122. cuda/cccl/headers/include/thrust/detail/copy_if.h +62 -0
  1123. cuda/cccl/headers/include/thrust/detail/copy_if.inl +102 -0
  1124. cuda/cccl/headers/include/thrust/detail/count.h +55 -0
  1125. cuda/cccl/headers/include/thrust/detail/count.inl +89 -0
  1126. cuda/cccl/headers/include/thrust/detail/device_delete.inl +52 -0
  1127. cuda/cccl/headers/include/thrust/detail/device_free.inl +47 -0
  1128. cuda/cccl/headers/include/thrust/detail/device_malloc.inl +60 -0
  1129. cuda/cccl/headers/include/thrust/detail/device_new.inl +61 -0
  1130. cuda/cccl/headers/include/thrust/detail/device_ptr.inl +48 -0
  1131. cuda/cccl/headers/include/thrust/detail/equal.inl +93 -0
  1132. cuda/cccl/headers/include/thrust/detail/event_error.h +160 -0
  1133. cuda/cccl/headers/include/thrust/detail/execute_with_allocator.h +80 -0
  1134. cuda/cccl/headers/include/thrust/detail/execute_with_allocator_fwd.h +61 -0
  1135. cuda/cccl/headers/include/thrust/detail/execution_policy.h +80 -0
  1136. cuda/cccl/headers/include/thrust/detail/extrema.inl +184 -0
  1137. cuda/cccl/headers/include/thrust/detail/fill.inl +86 -0
  1138. cuda/cccl/headers/include/thrust/detail/find.inl +113 -0
  1139. cuda/cccl/headers/include/thrust/detail/for_each.inl +84 -0
  1140. cuda/cccl/headers/include/thrust/detail/function.h +49 -0
  1141. cuda/cccl/headers/include/thrust/detail/functional/actor.h +214 -0
  1142. cuda/cccl/headers/include/thrust/detail/functional/operators.h +386 -0
  1143. cuda/cccl/headers/include/thrust/detail/gather.inl +173 -0
  1144. cuda/cccl/headers/include/thrust/detail/generate.inl +86 -0
  1145. cuda/cccl/headers/include/thrust/detail/get_iterator_value.h +62 -0
  1146. cuda/cccl/headers/include/thrust/detail/inner_product.inl +118 -0
  1147. cuda/cccl/headers/include/thrust/detail/integer_math.h +130 -0
  1148. cuda/cccl/headers/include/thrust/detail/internal_functional.h +285 -0
  1149. cuda/cccl/headers/include/thrust/detail/logical.inl +113 -0
  1150. cuda/cccl/headers/include/thrust/detail/malloc_and_free.h +92 -0
  1151. cuda/cccl/headers/include/thrust/detail/malloc_and_free_fwd.h +45 -0
  1152. cuda/cccl/headers/include/thrust/detail/memory_algorithms.h +209 -0
  1153. cuda/cccl/headers/include/thrust/detail/memory_wrapper.h +40 -0
  1154. cuda/cccl/headers/include/thrust/detail/merge.inl +276 -0
  1155. cuda/cccl/headers/include/thrust/detail/mismatch.inl +94 -0
  1156. cuda/cccl/headers/include/thrust/detail/mpl/math.h +164 -0
  1157. cuda/cccl/headers/include/thrust/detail/numeric_wrapper.h +37 -0
  1158. cuda/cccl/headers/include/thrust/detail/overlapped_copy.h +124 -0
  1159. cuda/cccl/headers/include/thrust/detail/partition.inl +378 -0
  1160. cuda/cccl/headers/include/thrust/detail/pointer.h +217 -0
  1161. cuda/cccl/headers/include/thrust/detail/pointer.inl +172 -0
  1162. cuda/cccl/headers/include/thrust/detail/preprocessor.h +652 -0
  1163. cuda/cccl/headers/include/thrust/detail/random_bijection.h +177 -0
  1164. cuda/cccl/headers/include/thrust/detail/range/head_flags.h +116 -0
  1165. cuda/cccl/headers/include/thrust/detail/range/tail_flags.h +130 -0
  1166. cuda/cccl/headers/include/thrust/detail/raw_pointer_cast.h +52 -0
  1167. cuda/cccl/headers/include/thrust/detail/raw_reference_cast.h +189 -0
  1168. cuda/cccl/headers/include/thrust/detail/reduce.inl +377 -0
  1169. cuda/cccl/headers/include/thrust/detail/reference.h +500 -0
  1170. cuda/cccl/headers/include/thrust/detail/reference_forward_declaration.h +35 -0
  1171. cuda/cccl/headers/include/thrust/detail/remove.inl +213 -0
  1172. cuda/cccl/headers/include/thrust/detail/replace.inl +231 -0
  1173. cuda/cccl/headers/include/thrust/detail/reverse.inl +88 -0
  1174. cuda/cccl/headers/include/thrust/detail/scan.inl +518 -0
  1175. cuda/cccl/headers/include/thrust/detail/scatter.inl +157 -0
  1176. cuda/cccl/headers/include/thrust/detail/seq.h +54 -0
  1177. cuda/cccl/headers/include/thrust/detail/sequence.inl +109 -0
  1178. cuda/cccl/headers/include/thrust/detail/set_operations.inl +981 -0
  1179. cuda/cccl/headers/include/thrust/detail/shuffle.inl +86 -0
  1180. cuda/cccl/headers/include/thrust/detail/sort.inl +373 -0
  1181. cuda/cccl/headers/include/thrust/detail/static_assert.h +58 -0
  1182. cuda/cccl/headers/include/thrust/detail/static_map.h +167 -0
  1183. cuda/cccl/headers/include/thrust/detail/swap_ranges.inl +65 -0
  1184. cuda/cccl/headers/include/thrust/detail/tabulate.inl +62 -0
  1185. cuda/cccl/headers/include/thrust/detail/temporary_array.h +153 -0
  1186. cuda/cccl/headers/include/thrust/detail/temporary_array.inl +138 -0
  1187. cuda/cccl/headers/include/thrust/detail/temporary_buffer.h +81 -0
  1188. cuda/cccl/headers/include/thrust/detail/transform.inl +250 -0
  1189. cuda/cccl/headers/include/thrust/detail/transform_reduce.inl +69 -0
  1190. cuda/cccl/headers/include/thrust/detail/transform_scan.inl +161 -0
  1191. cuda/cccl/headers/include/thrust/detail/trivial_sequence.h +131 -0
  1192. cuda/cccl/headers/include/thrust/detail/tuple_meta_transform.h +61 -0
  1193. cuda/cccl/headers/include/thrust/detail/type_deduction.h +62 -0
  1194. cuda/cccl/headers/include/thrust/detail/type_traits/has_member_function.h +47 -0
  1195. cuda/cccl/headers/include/thrust/detail/type_traits/has_nested_type.h +43 -0
  1196. cuda/cccl/headers/include/thrust/detail/type_traits/is_call_possible.h +167 -0
  1197. cuda/cccl/headers/include/thrust/detail/type_traits/is_commutative.h +69 -0
  1198. cuda/cccl/headers/include/thrust/detail/type_traits/is_metafunction_defined.h +39 -0
  1199. cuda/cccl/headers/include/thrust/detail/type_traits/is_thrust_pointer.h +60 -0
  1200. cuda/cccl/headers/include/thrust/detail/type_traits/iterator/is_discard_iterator.h +44 -0
  1201. cuda/cccl/headers/include/thrust/detail/type_traits/iterator/is_output_iterator.h +46 -0
  1202. cuda/cccl/headers/include/thrust/detail/type_traits/minimum_type.h +89 -0
  1203. cuda/cccl/headers/include/thrust/detail/type_traits/pointer_traits.h +332 -0
  1204. cuda/cccl/headers/include/thrust/detail/type_traits.h +136 -0
  1205. cuda/cccl/headers/include/thrust/detail/uninitialized_copy.inl +90 -0
  1206. cuda/cccl/headers/include/thrust/detail/uninitialized_fill.inl +86 -0
  1207. cuda/cccl/headers/include/thrust/detail/unique.inl +373 -0
  1208. cuda/cccl/headers/include/thrust/detail/use_default.h +34 -0
  1209. cuda/cccl/headers/include/thrust/detail/util/align.h +59 -0
  1210. cuda/cccl/headers/include/thrust/detail/vector_base.h +630 -0
  1211. cuda/cccl/headers/include/thrust/detail/vector_base.inl +1242 -0
  1212. cuda/cccl/headers/include/thrust/device_allocator.h +134 -0
  1213. cuda/cccl/headers/include/thrust/device_delete.h +59 -0
  1214. cuda/cccl/headers/include/thrust/device_free.h +72 -0
  1215. cuda/cccl/headers/include/thrust/device_make_unique.h +56 -0
  1216. cuda/cccl/headers/include/thrust/device_malloc.h +108 -0
  1217. cuda/cccl/headers/include/thrust/device_malloc_allocator.h +190 -0
  1218. cuda/cccl/headers/include/thrust/device_new.h +91 -0
  1219. cuda/cccl/headers/include/thrust/device_new_allocator.h +179 -0
  1220. cuda/cccl/headers/include/thrust/device_ptr.h +202 -0
  1221. cuda/cccl/headers/include/thrust/device_reference.h +986 -0
  1222. cuda/cccl/headers/include/thrust/device_vector.h +574 -0
  1223. cuda/cccl/headers/include/thrust/distance.h +43 -0
  1224. cuda/cccl/headers/include/thrust/equal.h +247 -0
  1225. cuda/cccl/headers/include/thrust/execution_policy.h +384 -0
  1226. cuda/cccl/headers/include/thrust/extrema.h +657 -0
  1227. cuda/cccl/headers/include/thrust/fill.h +201 -0
  1228. cuda/cccl/headers/include/thrust/find.h +382 -0
  1229. cuda/cccl/headers/include/thrust/for_each.h +261 -0
  1230. cuda/cccl/headers/include/thrust/functional.h +396 -0
  1231. cuda/cccl/headers/include/thrust/gather.h +464 -0
  1232. cuda/cccl/headers/include/thrust/generate.h +193 -0
  1233. cuda/cccl/headers/include/thrust/host_vector.h +576 -0
  1234. cuda/cccl/headers/include/thrust/inner_product.h +264 -0
  1235. cuda/cccl/headers/include/thrust/iterator/constant_iterator.h +219 -0
  1236. cuda/cccl/headers/include/thrust/iterator/counting_iterator.h +335 -0
  1237. cuda/cccl/headers/include/thrust/iterator/detail/any_assign.h +48 -0
  1238. cuda/cccl/headers/include/thrust/iterator/detail/any_system_tag.h +43 -0
  1239. cuda/cccl/headers/include/thrust/iterator/detail/device_system_tag.h +38 -0
  1240. cuda/cccl/headers/include/thrust/iterator/detail/host_system_tag.h +38 -0
  1241. cuda/cccl/headers/include/thrust/iterator/detail/iterator_adaptor_base.h +81 -0
  1242. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_system.h +51 -0
  1243. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_traversal.h +62 -0
  1244. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h +57 -0
  1245. cuda/cccl/headers/include/thrust/iterator/detail/iterator_facade_category.h +199 -0
  1246. cuda/cccl/headers/include/thrust/iterator/detail/iterator_traversal_tags.h +50 -0
  1247. cuda/cccl/headers/include/thrust/iterator/detail/minimum_system.h +53 -0
  1248. cuda/cccl/headers/include/thrust/iterator/detail/normal_iterator.h +69 -0
  1249. cuda/cccl/headers/include/thrust/iterator/detail/retag.h +104 -0
  1250. cuda/cccl/headers/include/thrust/iterator/detail/tagged_iterator.h +81 -0
  1251. cuda/cccl/headers/include/thrust/iterator/detail/tuple_of_iterator_references.h +174 -0
  1252. cuda/cccl/headers/include/thrust/iterator/discard_iterator.h +164 -0
  1253. cuda/cccl/headers/include/thrust/iterator/iterator_adaptor.h +251 -0
  1254. cuda/cccl/headers/include/thrust/iterator/iterator_categories.h +215 -0
  1255. cuda/cccl/headers/include/thrust/iterator/iterator_facade.h +660 -0
  1256. cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +245 -0
  1257. cuda/cccl/headers/include/thrust/iterator/offset_iterator.h +192 -0
  1258. cuda/cccl/headers/include/thrust/iterator/permutation_iterator.h +204 -0
  1259. cuda/cccl/headers/include/thrust/iterator/retag.h +74 -0
  1260. cuda/cccl/headers/include/thrust/iterator/reverse_iterator.h +221 -0
  1261. cuda/cccl/headers/include/thrust/iterator/shuffle_iterator.h +184 -0
  1262. cuda/cccl/headers/include/thrust/iterator/strided_iterator.h +152 -0
  1263. cuda/cccl/headers/include/thrust/iterator/tabulate_output_iterator.h +149 -0
  1264. cuda/cccl/headers/include/thrust/iterator/transform_input_output_iterator.h +226 -0
  1265. cuda/cccl/headers/include/thrust/iterator/transform_iterator.h +351 -0
  1266. cuda/cccl/headers/include/thrust/iterator/transform_output_iterator.h +190 -0
  1267. cuda/cccl/headers/include/thrust/iterator/zip_iterator.h +357 -0
  1268. cuda/cccl/headers/include/thrust/logical.h +290 -0
  1269. cuda/cccl/headers/include/thrust/memory.h +395 -0
  1270. cuda/cccl/headers/include/thrust/merge.h +725 -0
  1271. cuda/cccl/headers/include/thrust/mismatch.h +261 -0
  1272. cuda/cccl/headers/include/thrust/mr/allocator.h +229 -0
  1273. cuda/cccl/headers/include/thrust/mr/device_memory_resource.h +41 -0
  1274. cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +526 -0
  1275. cuda/cccl/headers/include/thrust/mr/disjoint_sync_pool.h +118 -0
  1276. cuda/cccl/headers/include/thrust/mr/disjoint_tls_pool.h +68 -0
  1277. cuda/cccl/headers/include/thrust/mr/fancy_pointer_resource.h +67 -0
  1278. cuda/cccl/headers/include/thrust/mr/host_memory_resource.h +38 -0
  1279. cuda/cccl/headers/include/thrust/mr/memory_resource.h +217 -0
  1280. cuda/cccl/headers/include/thrust/mr/new.h +100 -0
  1281. cuda/cccl/headers/include/thrust/mr/polymorphic_adaptor.h +63 -0
  1282. cuda/cccl/headers/include/thrust/mr/pool.h +526 -0
  1283. cuda/cccl/headers/include/thrust/mr/pool_options.h +174 -0
  1284. cuda/cccl/headers/include/thrust/mr/sync_pool.h +114 -0
  1285. cuda/cccl/headers/include/thrust/mr/tls_pool.h +65 -0
  1286. cuda/cccl/headers/include/thrust/mr/universal_memory_resource.h +29 -0
  1287. cuda/cccl/headers/include/thrust/mr/validator.h +56 -0
  1288. cuda/cccl/headers/include/thrust/pair.h +102 -0
  1289. cuda/cccl/headers/include/thrust/partition.h +1383 -0
  1290. cuda/cccl/headers/include/thrust/per_device_resource.h +98 -0
  1291. cuda/cccl/headers/include/thrust/random/detail/discard_block_engine.inl +184 -0
  1292. cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine.inl +155 -0
  1293. cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine_discard.h +104 -0
  1294. cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine.inl +151 -0
  1295. cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h +53 -0
  1296. cuda/cccl/headers/include/thrust/random/detail/mod.h +101 -0
  1297. cuda/cccl/headers/include/thrust/random/detail/normal_distribution.inl +187 -0
  1298. cuda/cccl/headers/include/thrust/random/detail/normal_distribution_base.h +160 -0
  1299. cuda/cccl/headers/include/thrust/random/detail/random_core_access.h +63 -0
  1300. cuda/cccl/headers/include/thrust/random/detail/subtract_with_carry_engine.inl +201 -0
  1301. cuda/cccl/headers/include/thrust/random/detail/uniform_int_distribution.inl +198 -0
  1302. cuda/cccl/headers/include/thrust/random/detail/uniform_real_distribution.inl +198 -0
  1303. cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine.inl +183 -0
  1304. cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine_max.h +217 -0
  1305. cuda/cccl/headers/include/thrust/random/discard_block_engine.h +240 -0
  1306. cuda/cccl/headers/include/thrust/random/linear_congruential_engine.h +289 -0
  1307. cuda/cccl/headers/include/thrust/random/linear_feedback_shift_engine.h +217 -0
  1308. cuda/cccl/headers/include/thrust/random/normal_distribution.h +257 -0
  1309. cuda/cccl/headers/include/thrust/random/subtract_with_carry_engine.h +247 -0
  1310. cuda/cccl/headers/include/thrust/random/uniform_int_distribution.h +261 -0
  1311. cuda/cccl/headers/include/thrust/random/uniform_real_distribution.h +258 -0
  1312. cuda/cccl/headers/include/thrust/random/xor_combine_engine.h +255 -0
  1313. cuda/cccl/headers/include/thrust/random.h +120 -0
  1314. cuda/cccl/headers/include/thrust/reduce.h +1112 -0
  1315. cuda/cccl/headers/include/thrust/remove.h +768 -0
  1316. cuda/cccl/headers/include/thrust/replace.h +827 -0
  1317. cuda/cccl/headers/include/thrust/reverse.h +213 -0
  1318. cuda/cccl/headers/include/thrust/scan.h +1671 -0
  1319. cuda/cccl/headers/include/thrust/scatter.h +446 -0
  1320. cuda/cccl/headers/include/thrust/sequence.h +277 -0
  1321. cuda/cccl/headers/include/thrust/set_operations.h +3026 -0
  1322. cuda/cccl/headers/include/thrust/shuffle.h +182 -0
  1323. cuda/cccl/headers/include/thrust/sort.h +1320 -0
  1324. cuda/cccl/headers/include/thrust/swap.h +147 -0
  1325. cuda/cccl/headers/include/thrust/system/cpp/detail/adjacent_difference.h +30 -0
  1326. cuda/cccl/headers/include/thrust/system/cpp/detail/assign_value.h +30 -0
  1327. cuda/cccl/headers/include/thrust/system/cpp/detail/binary_search.h +32 -0
  1328. cuda/cccl/headers/include/thrust/system/cpp/detail/copy.h +30 -0
  1329. cuda/cccl/headers/include/thrust/system/cpp/detail/copy_if.h +30 -0
  1330. cuda/cccl/headers/include/thrust/system/cpp/detail/count.h +29 -0
  1331. cuda/cccl/headers/include/thrust/system/cpp/detail/equal.h +29 -0
  1332. cuda/cccl/headers/include/thrust/system/cpp/detail/execution_policy.h +90 -0
  1333. cuda/cccl/headers/include/thrust/system/cpp/detail/extrema.h +30 -0
  1334. cuda/cccl/headers/include/thrust/system/cpp/detail/fill.h +29 -0
  1335. cuda/cccl/headers/include/thrust/system/cpp/detail/find.h +30 -0
  1336. cuda/cccl/headers/include/thrust/system/cpp/detail/for_each.h +30 -0
  1337. cuda/cccl/headers/include/thrust/system/cpp/detail/gather.h +29 -0
  1338. cuda/cccl/headers/include/thrust/system/cpp/detail/generate.h +29 -0
  1339. cuda/cccl/headers/include/thrust/system/cpp/detail/get_value.h +30 -0
  1340. cuda/cccl/headers/include/thrust/system/cpp/detail/inner_product.h +29 -0
  1341. cuda/cccl/headers/include/thrust/system/cpp/detail/iter_swap.h +30 -0
  1342. cuda/cccl/headers/include/thrust/system/cpp/detail/logical.h +29 -0
  1343. cuda/cccl/headers/include/thrust/system/cpp/detail/malloc_and_free.h +30 -0
  1344. cuda/cccl/headers/include/thrust/system/cpp/detail/memory.inl +60 -0
  1345. cuda/cccl/headers/include/thrust/system/cpp/detail/merge.h +30 -0
  1346. cuda/cccl/headers/include/thrust/system/cpp/detail/mismatch.h +29 -0
  1347. cuda/cccl/headers/include/thrust/system/cpp/detail/par.h +62 -0
  1348. cuda/cccl/headers/include/thrust/system/cpp/detail/partition.h +30 -0
  1349. cuda/cccl/headers/include/thrust/system/cpp/detail/per_device_resource.h +29 -0
  1350. cuda/cccl/headers/include/thrust/system/cpp/detail/reduce.h +30 -0
  1351. cuda/cccl/headers/include/thrust/system/cpp/detail/reduce_by_key.h +30 -0
  1352. cuda/cccl/headers/include/thrust/system/cpp/detail/remove.h +30 -0
  1353. cuda/cccl/headers/include/thrust/system/cpp/detail/replace.h +29 -0
  1354. cuda/cccl/headers/include/thrust/system/cpp/detail/reverse.h +29 -0
  1355. cuda/cccl/headers/include/thrust/system/cpp/detail/scan.h +30 -0
  1356. cuda/cccl/headers/include/thrust/system/cpp/detail/scan_by_key.h +30 -0
  1357. cuda/cccl/headers/include/thrust/system/cpp/detail/scatter.h +29 -0
  1358. cuda/cccl/headers/include/thrust/system/cpp/detail/sequence.h +29 -0
  1359. cuda/cccl/headers/include/thrust/system/cpp/detail/set_operations.h +30 -0
  1360. cuda/cccl/headers/include/thrust/system/cpp/detail/sort.h +30 -0
  1361. cuda/cccl/headers/include/thrust/system/cpp/detail/swap_ranges.h +29 -0
  1362. cuda/cccl/headers/include/thrust/system/cpp/detail/tabulate.h +29 -0
  1363. cuda/cccl/headers/include/thrust/system/cpp/detail/temporary_buffer.h +29 -0
  1364. cuda/cccl/headers/include/thrust/system/cpp/detail/transform.h +29 -0
  1365. cuda/cccl/headers/include/thrust/system/cpp/detail/transform_reduce.h +29 -0
  1366. cuda/cccl/headers/include/thrust/system/cpp/detail/transform_scan.h +29 -0
  1367. cuda/cccl/headers/include/thrust/system/cpp/detail/uninitialized_copy.h +29 -0
  1368. cuda/cccl/headers/include/thrust/system/cpp/detail/uninitialized_fill.h +29 -0
  1369. cuda/cccl/headers/include/thrust/system/cpp/detail/unique.h +30 -0
  1370. cuda/cccl/headers/include/thrust/system/cpp/detail/unique_by_key.h +30 -0
  1371. cuda/cccl/headers/include/thrust/system/cpp/detail/vector.inl +130 -0
  1372. cuda/cccl/headers/include/thrust/system/cpp/execution_policy.h +161 -0
  1373. cuda/cccl/headers/include/thrust/system/cpp/memory.h +109 -0
  1374. cuda/cccl/headers/include/thrust/system/cpp/memory_resource.h +75 -0
  1375. cuda/cccl/headers/include/thrust/system/cpp/pointer.h +119 -0
  1376. cuda/cccl/headers/include/thrust/system/cpp/vector.h +99 -0
  1377. cuda/cccl/headers/include/thrust/system/cuda/config.h +123 -0
  1378. cuda/cccl/headers/include/thrust/system/cuda/detail/adjacent_difference.h +219 -0
  1379. cuda/cccl/headers/include/thrust/system/cuda/detail/assign_value.h +124 -0
  1380. cuda/cccl/headers/include/thrust/system/cuda/detail/binary_search.h +29 -0
  1381. cuda/cccl/headers/include/thrust/system/cuda/detail/cdp_dispatch.h +72 -0
  1382. cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +129 -0
  1383. cuda/cccl/headers/include/thrust/system/cuda/detail/copy_if.h +255 -0
  1384. cuda/cccl/headers/include/thrust/system/cuda/detail/core/agent_launcher.h +289 -0
  1385. cuda/cccl/headers/include/thrust/system/cuda/detail/core/load_iterator.h +58 -0
  1386. cuda/cccl/headers/include/thrust/system/cuda/detail/core/make_load_iterator.h +60 -0
  1387. cuda/cccl/headers/include/thrust/system/cuda/detail/core/triple_chevron_launch.h +191 -0
  1388. cuda/cccl/headers/include/thrust/system/cuda/detail/core/util.h +630 -0
  1389. cuda/cccl/headers/include/thrust/system/cuda/detail/count.h +75 -0
  1390. cuda/cccl/headers/include/thrust/system/cuda/detail/cross_system.h +243 -0
  1391. cuda/cccl/headers/include/thrust/system/cuda/detail/dispatch.h +210 -0
  1392. cuda/cccl/headers/include/thrust/system/cuda/detail/equal.h +64 -0
  1393. cuda/cccl/headers/include/thrust/system/cuda/detail/error.inl +96 -0
  1394. cuda/cccl/headers/include/thrust/system/cuda/detail/execution_policy.h +113 -0
  1395. cuda/cccl/headers/include/thrust/system/cuda/detail/extrema.h +476 -0
  1396. cuda/cccl/headers/include/thrust/system/cuda/detail/fill.h +82 -0
  1397. cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +272 -0
  1398. cuda/cccl/headers/include/thrust/system/cuda/detail/for_each.h +83 -0
  1399. cuda/cccl/headers/include/thrust/system/cuda/detail/gather.h +91 -0
  1400. cuda/cccl/headers/include/thrust/system/cuda/detail/generate.h +85 -0
  1401. cuda/cccl/headers/include/thrust/system/cuda/detail/get_value.h +65 -0
  1402. cuda/cccl/headers/include/thrust/system/cuda/detail/inner_product.h +75 -0
  1403. cuda/cccl/headers/include/thrust/system/cuda/detail/internal/copy_cross_system.h +204 -0
  1404. cuda/cccl/headers/include/thrust/system/cuda/detail/internal/copy_device_to_device.h +98 -0
  1405. cuda/cccl/headers/include/thrust/system/cuda/detail/iter_swap.h +69 -0
  1406. cuda/cccl/headers/include/thrust/system/cuda/detail/logical.h +29 -0
  1407. cuda/cccl/headers/include/thrust/system/cuda/detail/make_unsigned_special.h +61 -0
  1408. cuda/cccl/headers/include/thrust/system/cuda/detail/malloc_and_free.h +121 -0
  1409. cuda/cccl/headers/include/thrust/system/cuda/detail/memory.inl +57 -0
  1410. cuda/cccl/headers/include/thrust/system/cuda/detail/merge.h +228 -0
  1411. cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +217 -0
  1412. cuda/cccl/headers/include/thrust/system/cuda/detail/par.h +237 -0
  1413. cuda/cccl/headers/include/thrust/system/cuda/detail/par_to_seq.h +95 -0
  1414. cuda/cccl/headers/include/thrust/system/cuda/detail/parallel_for.h +81 -0
  1415. cuda/cccl/headers/include/thrust/system/cuda/detail/partition.h +405 -0
  1416. cuda/cccl/headers/include/thrust/system/cuda/detail/per_device_resource.h +72 -0
  1417. cuda/cccl/headers/include/thrust/system/cuda/detail/reduce.h +961 -0
  1418. cuda/cccl/headers/include/thrust/system/cuda/detail/reduce_by_key.h +1000 -0
  1419. cuda/cccl/headers/include/thrust/system/cuda/detail/remove.h +107 -0
  1420. cuda/cccl/headers/include/thrust/system/cuda/detail/replace.h +164 -0
  1421. cuda/cccl/headers/include/thrust/system/cuda/detail/reverse.h +88 -0
  1422. cuda/cccl/headers/include/thrust/system/cuda/detail/scan.h +342 -0
  1423. cuda/cccl/headers/include/thrust/system/cuda/detail/scan_by_key.h +415 -0
  1424. cuda/cccl/headers/include/thrust/system/cuda/detail/scatter.h +79 -0
  1425. cuda/cccl/headers/include/thrust/system/cuda/detail/sequence.h +29 -0
  1426. cuda/cccl/headers/include/thrust/system/cuda/detail/set_operations.h +1736 -0
  1427. cuda/cccl/headers/include/thrust/system/cuda/detail/sort.h +482 -0
  1428. cuda/cccl/headers/include/thrust/system/cuda/detail/swap_ranges.h +75 -0
  1429. cuda/cccl/headers/include/thrust/system/cuda/detail/tabulate.h +75 -0
  1430. cuda/cccl/headers/include/thrust/system/cuda/detail/temporary_buffer.h +132 -0
  1431. cuda/cccl/headers/include/thrust/system/cuda/detail/terminate.h +53 -0
  1432. cuda/cccl/headers/include/thrust/system/cuda/detail/transform.h +403 -0
  1433. cuda/cccl/headers/include/thrust/system/cuda/detail/transform_reduce.h +143 -0
  1434. cuda/cccl/headers/include/thrust/system/cuda/detail/transform_scan.h +119 -0
  1435. cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_copy.h +94 -0
  1436. cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_fill.h +91 -0
  1437. cuda/cccl/headers/include/thrust/system/cuda/detail/unique.h +648 -0
  1438. cuda/cccl/headers/include/thrust/system/cuda/detail/unique_by_key.h +311 -0
  1439. cuda/cccl/headers/include/thrust/system/cuda/detail/util.h +251 -0
  1440. cuda/cccl/headers/include/thrust/system/cuda/error.h +175 -0
  1441. cuda/cccl/headers/include/thrust/system/cuda/execution_policy.h +39 -0
  1442. cuda/cccl/headers/include/thrust/system/cuda/memory.h +122 -0
  1443. cuda/cccl/headers/include/thrust/system/cuda/memory_resource.h +122 -0
  1444. cuda/cccl/headers/include/thrust/system/cuda/pointer.h +140 -0
  1445. cuda/cccl/headers/include/thrust/system/cuda/vector.h +108 -0
  1446. cuda/cccl/headers/include/thrust/system/detail/adl/adjacent_difference.h +51 -0
  1447. cuda/cccl/headers/include/thrust/system/detail/adl/assign_value.h +51 -0
  1448. cuda/cccl/headers/include/thrust/system/detail/adl/binary_search.h +51 -0
  1449. cuda/cccl/headers/include/thrust/system/detail/adl/copy.h +51 -0
  1450. cuda/cccl/headers/include/thrust/system/detail/adl/copy_if.h +52 -0
  1451. cuda/cccl/headers/include/thrust/system/detail/adl/count.h +51 -0
  1452. cuda/cccl/headers/include/thrust/system/detail/adl/equal.h +51 -0
  1453. cuda/cccl/headers/include/thrust/system/detail/adl/extrema.h +51 -0
  1454. cuda/cccl/headers/include/thrust/system/detail/adl/fill.h +51 -0
  1455. cuda/cccl/headers/include/thrust/system/detail/adl/find.h +51 -0
  1456. cuda/cccl/headers/include/thrust/system/detail/adl/for_each.h +51 -0
  1457. cuda/cccl/headers/include/thrust/system/detail/adl/gather.h +51 -0
  1458. cuda/cccl/headers/include/thrust/system/detail/adl/generate.h +51 -0
  1459. cuda/cccl/headers/include/thrust/system/detail/adl/get_value.h +51 -0
  1460. cuda/cccl/headers/include/thrust/system/detail/adl/inner_product.h +51 -0
  1461. cuda/cccl/headers/include/thrust/system/detail/adl/iter_swap.h +51 -0
  1462. cuda/cccl/headers/include/thrust/system/detail/adl/logical.h +51 -0
  1463. cuda/cccl/headers/include/thrust/system/detail/adl/malloc_and_free.h +51 -0
  1464. cuda/cccl/headers/include/thrust/system/detail/adl/merge.h +51 -0
  1465. cuda/cccl/headers/include/thrust/system/detail/adl/mismatch.h +51 -0
  1466. cuda/cccl/headers/include/thrust/system/detail/adl/partition.h +51 -0
  1467. cuda/cccl/headers/include/thrust/system/detail/adl/per_device_resource.h +48 -0
  1468. cuda/cccl/headers/include/thrust/system/detail/adl/reduce.h +51 -0
  1469. cuda/cccl/headers/include/thrust/system/detail/adl/reduce_by_key.h +51 -0
  1470. cuda/cccl/headers/include/thrust/system/detail/adl/remove.h +51 -0
  1471. cuda/cccl/headers/include/thrust/system/detail/adl/replace.h +51 -0
  1472. cuda/cccl/headers/include/thrust/system/detail/adl/reverse.h +51 -0
  1473. cuda/cccl/headers/include/thrust/system/detail/adl/scan.h +51 -0
  1474. cuda/cccl/headers/include/thrust/system/detail/adl/scan_by_key.h +51 -0
  1475. cuda/cccl/headers/include/thrust/system/detail/adl/scatter.h +51 -0
  1476. cuda/cccl/headers/include/thrust/system/detail/adl/sequence.h +51 -0
  1477. cuda/cccl/headers/include/thrust/system/detail/adl/set_operations.h +51 -0
  1478. cuda/cccl/headers/include/thrust/system/detail/adl/sort.h +51 -0
  1479. cuda/cccl/headers/include/thrust/system/detail/adl/swap_ranges.h +51 -0
  1480. cuda/cccl/headers/include/thrust/system/detail/adl/tabulate.h +51 -0
  1481. cuda/cccl/headers/include/thrust/system/detail/adl/temporary_buffer.h +51 -0
  1482. cuda/cccl/headers/include/thrust/system/detail/adl/transform.h +51 -0
  1483. cuda/cccl/headers/include/thrust/system/detail/adl/transform_reduce.h +51 -0
  1484. cuda/cccl/headers/include/thrust/system/detail/adl/transform_scan.h +51 -0
  1485. cuda/cccl/headers/include/thrust/system/detail/adl/uninitialized_copy.h +51 -0
  1486. cuda/cccl/headers/include/thrust/system/detail/adl/uninitialized_fill.h +51 -0
  1487. cuda/cccl/headers/include/thrust/system/detail/adl/unique.h +51 -0
  1488. cuda/cccl/headers/include/thrust/system/detail/adl/unique_by_key.h +51 -0
  1489. cuda/cccl/headers/include/thrust/system/detail/bad_alloc.h +64 -0
  1490. cuda/cccl/headers/include/thrust/system/detail/errno.h +125 -0
  1491. cuda/cccl/headers/include/thrust/system/detail/error_category.inl +302 -0
  1492. cuda/cccl/headers/include/thrust/system/detail/error_code.inl +173 -0
  1493. cuda/cccl/headers/include/thrust/system/detail/error_condition.inl +121 -0
  1494. cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.h +59 -0
  1495. cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.inl +85 -0
  1496. cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.h +167 -0
  1497. cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.inl +391 -0
  1498. cuda/cccl/headers/include/thrust/system/detail/generic/copy.h +51 -0
  1499. cuda/cccl/headers/include/thrust/system/detail/generic/copy.inl +70 -0
  1500. cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.h +64 -0
  1501. cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.inl +152 -0
  1502. cuda/cccl/headers/include/thrust/system/detail/generic/count.h +54 -0
  1503. cuda/cccl/headers/include/thrust/system/detail/generic/count.inl +90 -0
  1504. cuda/cccl/headers/include/thrust/system/detail/generic/equal.h +55 -0
  1505. cuda/cccl/headers/include/thrust/system/detail/generic/equal.inl +66 -0
  1506. cuda/cccl/headers/include/thrust/system/detail/generic/extrema.h +72 -0
  1507. cuda/cccl/headers/include/thrust/system/detail/generic/extrema.inl +258 -0
  1508. cuda/cccl/headers/include/thrust/system/detail/generic/fill.h +60 -0
  1509. cuda/cccl/headers/include/thrust/system/detail/generic/find.h +55 -0
  1510. cuda/cccl/headers/include/thrust/system/detail/generic/find.inl +143 -0
  1511. cuda/cccl/headers/include/thrust/system/detail/generic/for_each.h +64 -0
  1512. cuda/cccl/headers/include/thrust/system/detail/generic/gather.h +79 -0
  1513. cuda/cccl/headers/include/thrust/system/detail/generic/gather.inl +102 -0
  1514. cuda/cccl/headers/include/thrust/system/detail/generic/generate.h +51 -0
  1515. cuda/cccl/headers/include/thrust/system/detail/generic/generate.inl +63 -0
  1516. cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.h +66 -0
  1517. cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.inl +78 -0
  1518. cuda/cccl/headers/include/thrust/system/detail/generic/logical.h +65 -0
  1519. cuda/cccl/headers/include/thrust/system/detail/generic/memory.h +70 -0
  1520. cuda/cccl/headers/include/thrust/system/detail/generic/memory.inl +83 -0
  1521. cuda/cccl/headers/include/thrust/system/detail/generic/merge.h +105 -0
  1522. cuda/cccl/headers/include/thrust/system/detail/generic/merge.inl +154 -0
  1523. cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.h +55 -0
  1524. cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.inl +74 -0
  1525. cuda/cccl/headers/include/thrust/system/detail/generic/partition.h +135 -0
  1526. cuda/cccl/headers/include/thrust/system/detail/generic/partition.inl +213 -0
  1527. cuda/cccl/headers/include/thrust/system/detail/generic/per_device_resource.h +49 -0
  1528. cuda/cccl/headers/include/thrust/system/detail/generic/reduce.h +77 -0
  1529. cuda/cccl/headers/include/thrust/system/detail/generic/reduce.inl +106 -0
  1530. cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.h +89 -0
  1531. cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.inl +192 -0
  1532. cuda/cccl/headers/include/thrust/system/detail/generic/remove.h +92 -0
  1533. cuda/cccl/headers/include/thrust/system/detail/generic/remove.inl +127 -0
  1534. cuda/cccl/headers/include/thrust/system/detail/generic/replace.h +101 -0
  1535. cuda/cccl/headers/include/thrust/system/detail/generic/replace.inl +181 -0
  1536. cuda/cccl/headers/include/thrust/system/detail/generic/reverse.h +54 -0
  1537. cuda/cccl/headers/include/thrust/system/detail/generic/reverse.inl +72 -0
  1538. cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.h +78 -0
  1539. cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.inl +141 -0
  1540. cuda/cccl/headers/include/thrust/system/detail/generic/scan.h +78 -0
  1541. cuda/cccl/headers/include/thrust/system/detail/generic/scan.inl +91 -0
  1542. cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.h +132 -0
  1543. cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.inl +238 -0
  1544. cuda/cccl/headers/include/thrust/system/detail/generic/scatter.h +79 -0
  1545. cuda/cccl/headers/include/thrust/system/detail/generic/scatter.inl +91 -0
  1546. cuda/cccl/headers/include/thrust/system/detail/generic/select_system.h +96 -0
  1547. cuda/cccl/headers/include/thrust/system/detail/generic/sequence.h +55 -0
  1548. cuda/cccl/headers/include/thrust/system/detail/generic/sequence.inl +95 -0
  1549. cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.h +288 -0
  1550. cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.inl +482 -0
  1551. cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.h +60 -0
  1552. cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.inl +131 -0
  1553. cuda/cccl/headers/include/thrust/system/detail/generic/sort.h +119 -0
  1554. cuda/cccl/headers/include/thrust/system/detail/generic/sort.inl +181 -0
  1555. cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.h +50 -0
  1556. cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.inl +82 -0
  1557. cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.h +47 -0
  1558. cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.inl +60 -0
  1559. cuda/cccl/headers/include/thrust/system/detail/generic/tag.h +53 -0
  1560. cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.h +60 -0
  1561. cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.inl +88 -0
  1562. cuda/cccl/headers/include/thrust/system/detail/generic/transform.h +109 -0
  1563. cuda/cccl/headers/include/thrust/system/detail/generic/transform.inl +185 -0
  1564. cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.h +56 -0
  1565. cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.inl +62 -0
  1566. cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.h +86 -0
  1567. cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.inl +119 -0
  1568. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.h +51 -0
  1569. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.inl +172 -0
  1570. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.h +51 -0
  1571. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.inl +121 -0
  1572. cuda/cccl/headers/include/thrust/system/detail/generic/unique.h +77 -0
  1573. cuda/cccl/headers/include/thrust/system/detail/generic/unique.inl +119 -0
  1574. cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.h +87 -0
  1575. cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.inl +132 -0
  1576. cuda/cccl/headers/include/thrust/system/detail/internal/decompose.h +123 -0
  1577. cuda/cccl/headers/include/thrust/system/detail/sequential/adjacent_difference.h +76 -0
  1578. cuda/cccl/headers/include/thrust/system/detail/sequential/assign_value.h +48 -0
  1579. cuda/cccl/headers/include/thrust/system/detail/sequential/binary_search.h +142 -0
  1580. cuda/cccl/headers/include/thrust/system/detail/sequential/copy.h +55 -0
  1581. cuda/cccl/headers/include/thrust/system/detail/sequential/copy.inl +125 -0
  1582. cuda/cccl/headers/include/thrust/system/detail/sequential/copy_backward.h +55 -0
  1583. cuda/cccl/headers/include/thrust/system/detail/sequential/copy_if.h +77 -0
  1584. cuda/cccl/headers/include/thrust/system/detail/sequential/count.h +29 -0
  1585. cuda/cccl/headers/include/thrust/system/detail/sequential/equal.h +29 -0
  1586. cuda/cccl/headers/include/thrust/system/detail/sequential/execution_policy.h +78 -0
  1587. cuda/cccl/headers/include/thrust/system/detail/sequential/extrema.h +116 -0
  1588. cuda/cccl/headers/include/thrust/system/detail/sequential/fill.h +29 -0
  1589. cuda/cccl/headers/include/thrust/system/detail/sequential/find.h +68 -0
  1590. cuda/cccl/headers/include/thrust/system/detail/sequential/for_each.h +80 -0
  1591. cuda/cccl/headers/include/thrust/system/detail/sequential/gather.h +29 -0
  1592. cuda/cccl/headers/include/thrust/system/detail/sequential/general_copy.h +129 -0
  1593. cuda/cccl/headers/include/thrust/system/detail/sequential/generate.h +29 -0
  1594. cuda/cccl/headers/include/thrust/system/detail/sequential/get_value.h +49 -0
  1595. cuda/cccl/headers/include/thrust/system/detail/sequential/inner_product.h +29 -0
  1596. cuda/cccl/headers/include/thrust/system/detail/sequential/insertion_sort.h +147 -0
  1597. cuda/cccl/headers/include/thrust/system/detail/sequential/iter_swap.h +51 -0
  1598. cuda/cccl/headers/include/thrust/system/detail/sequential/logical.h +29 -0
  1599. cuda/cccl/headers/include/thrust/system/detail/sequential/malloc_and_free.h +56 -0
  1600. cuda/cccl/headers/include/thrust/system/detail/sequential/merge.h +81 -0
  1601. cuda/cccl/headers/include/thrust/system/detail/sequential/merge.inl +151 -0
  1602. cuda/cccl/headers/include/thrust/system/detail/sequential/mismatch.h +29 -0
  1603. cuda/cccl/headers/include/thrust/system/detail/sequential/partition.h +309 -0
  1604. cuda/cccl/headers/include/thrust/system/detail/sequential/per_device_resource.h +29 -0
  1605. cuda/cccl/headers/include/thrust/system/detail/sequential/reduce.h +70 -0
  1606. cuda/cccl/headers/include/thrust/system/detail/sequential/reduce_by_key.h +104 -0
  1607. cuda/cccl/headers/include/thrust/system/detail/sequential/remove.h +185 -0
  1608. cuda/cccl/headers/include/thrust/system/detail/sequential/replace.h +29 -0
  1609. cuda/cccl/headers/include/thrust/system/detail/sequential/reverse.h +29 -0
  1610. cuda/cccl/headers/include/thrust/system/detail/sequential/scan.h +160 -0
  1611. cuda/cccl/headers/include/thrust/system/detail/sequential/scan_by_key.h +151 -0
  1612. cuda/cccl/headers/include/thrust/system/detail/sequential/scatter.h +29 -0
  1613. cuda/cccl/headers/include/thrust/system/detail/sequential/sequence.h +29 -0
  1614. cuda/cccl/headers/include/thrust/system/detail/sequential/set_operations.h +212 -0
  1615. cuda/cccl/headers/include/thrust/system/detail/sequential/sort.h +65 -0
  1616. cuda/cccl/headers/include/thrust/system/detail/sequential/sort.inl +187 -0
  1617. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.h +61 -0
  1618. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.inl +362 -0
  1619. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.h +54 -0
  1620. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.inl +130 -0
  1621. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.h +54 -0
  1622. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.inl +592 -0
  1623. cuda/cccl/headers/include/thrust/system/detail/sequential/swap_ranges.h +29 -0
  1624. cuda/cccl/headers/include/thrust/system/detail/sequential/tabulate.h +29 -0
  1625. cuda/cccl/headers/include/thrust/system/detail/sequential/temporary_buffer.h +29 -0
  1626. cuda/cccl/headers/include/thrust/system/detail/sequential/transform.h +29 -0
  1627. cuda/cccl/headers/include/thrust/system/detail/sequential/transform_reduce.h +29 -0
  1628. cuda/cccl/headers/include/thrust/system/detail/sequential/transform_scan.h +29 -0
  1629. cuda/cccl/headers/include/thrust/system/detail/sequential/trivial_copy.h +64 -0
  1630. cuda/cccl/headers/include/thrust/system/detail/sequential/uninitialized_copy.h +29 -0
  1631. cuda/cccl/headers/include/thrust/system/detail/sequential/uninitialized_fill.h +29 -0
  1632. cuda/cccl/headers/include/thrust/system/detail/sequential/unique.h +121 -0
  1633. cuda/cccl/headers/include/thrust/system/detail/sequential/unique_by_key.h +112 -0
  1634. cuda/cccl/headers/include/thrust/system/detail/system_error.inl +108 -0
  1635. cuda/cccl/headers/include/thrust/system/error_code.h +512 -0
  1636. cuda/cccl/headers/include/thrust/system/omp/detail/adjacent_difference.h +54 -0
  1637. cuda/cccl/headers/include/thrust/system/omp/detail/assign_value.h +30 -0
  1638. cuda/cccl/headers/include/thrust/system/omp/detail/binary_search.h +77 -0
  1639. cuda/cccl/headers/include/thrust/system/omp/detail/copy.h +50 -0
  1640. cuda/cccl/headers/include/thrust/system/omp/detail/copy.inl +74 -0
  1641. cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.h +56 -0
  1642. cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.inl +59 -0
  1643. cuda/cccl/headers/include/thrust/system/omp/detail/count.h +30 -0
  1644. cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.h +50 -0
  1645. cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.inl +65 -0
  1646. cuda/cccl/headers/include/thrust/system/omp/detail/equal.h +30 -0
  1647. cuda/cccl/headers/include/thrust/system/omp/detail/execution_policy.h +113 -0
  1648. cuda/cccl/headers/include/thrust/system/omp/detail/extrema.h +66 -0
  1649. cuda/cccl/headers/include/thrust/system/omp/detail/fill.h +30 -0
  1650. cuda/cccl/headers/include/thrust/system/omp/detail/find.h +53 -0
  1651. cuda/cccl/headers/include/thrust/system/omp/detail/for_each.h +56 -0
  1652. cuda/cccl/headers/include/thrust/system/omp/detail/for_each.inl +87 -0
  1653. cuda/cccl/headers/include/thrust/system/omp/detail/gather.h +30 -0
  1654. cuda/cccl/headers/include/thrust/system/omp/detail/generate.h +30 -0
  1655. cuda/cccl/headers/include/thrust/system/omp/detail/get_value.h +30 -0
  1656. cuda/cccl/headers/include/thrust/system/omp/detail/inner_product.h +30 -0
  1657. cuda/cccl/headers/include/thrust/system/omp/detail/iter_swap.h +30 -0
  1658. cuda/cccl/headers/include/thrust/system/omp/detail/logical.h +30 -0
  1659. cuda/cccl/headers/include/thrust/system/omp/detail/malloc_and_free.h +30 -0
  1660. cuda/cccl/headers/include/thrust/system/omp/detail/memory.inl +93 -0
  1661. cuda/cccl/headers/include/thrust/system/omp/detail/merge.h +30 -0
  1662. cuda/cccl/headers/include/thrust/system/omp/detail/mismatch.h +30 -0
  1663. cuda/cccl/headers/include/thrust/system/omp/detail/par.h +62 -0
  1664. cuda/cccl/headers/include/thrust/system/omp/detail/partition.h +88 -0
  1665. cuda/cccl/headers/include/thrust/system/omp/detail/partition.inl +102 -0
  1666. cuda/cccl/headers/include/thrust/system/omp/detail/per_device_resource.h +29 -0
  1667. cuda/cccl/headers/include/thrust/system/omp/detail/pragma_omp.h +54 -0
  1668. cuda/cccl/headers/include/thrust/system/omp/detail/reduce.h +54 -0
  1669. cuda/cccl/headers/include/thrust/system/omp/detail/reduce.inl +78 -0
  1670. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.h +64 -0
  1671. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.inl +65 -0
  1672. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.h +59 -0
  1673. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.inl +103 -0
  1674. cuda/cccl/headers/include/thrust/system/omp/detail/remove.h +72 -0
  1675. cuda/cccl/headers/include/thrust/system/omp/detail/remove.inl +87 -0
  1676. cuda/cccl/headers/include/thrust/system/omp/detail/replace.h +30 -0
  1677. cuda/cccl/headers/include/thrust/system/omp/detail/reverse.h +30 -0
  1678. cuda/cccl/headers/include/thrust/system/omp/detail/scan.h +30 -0
  1679. cuda/cccl/headers/include/thrust/system/omp/detail/scan_by_key.h +30 -0
  1680. cuda/cccl/headers/include/thrust/system/omp/detail/scatter.h +30 -0
  1681. cuda/cccl/headers/include/thrust/system/omp/detail/sequence.h +30 -0
  1682. cuda/cccl/headers/include/thrust/system/omp/detail/set_operations.h +30 -0
  1683. cuda/cccl/headers/include/thrust/system/omp/detail/sort.h +60 -0
  1684. cuda/cccl/headers/include/thrust/system/omp/detail/sort.inl +259 -0
  1685. cuda/cccl/headers/include/thrust/system/omp/detail/swap_ranges.h +30 -0
  1686. cuda/cccl/headers/include/thrust/system/omp/detail/tabulate.h +30 -0
  1687. cuda/cccl/headers/include/thrust/system/omp/detail/temporary_buffer.h +29 -0
  1688. cuda/cccl/headers/include/thrust/system/omp/detail/transform.h +30 -0
  1689. cuda/cccl/headers/include/thrust/system/omp/detail/transform_reduce.h +30 -0
  1690. cuda/cccl/headers/include/thrust/system/omp/detail/transform_scan.h +30 -0
  1691. cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_copy.h +30 -0
  1692. cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_fill.h +30 -0
  1693. cuda/cccl/headers/include/thrust/system/omp/detail/unique.h +60 -0
  1694. cuda/cccl/headers/include/thrust/system/omp/detail/unique.inl +71 -0
  1695. cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.h +67 -0
  1696. cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.inl +75 -0
  1697. cuda/cccl/headers/include/thrust/system/omp/execution_policy.h +160 -0
  1698. cuda/cccl/headers/include/thrust/system/omp/memory.h +111 -0
  1699. cuda/cccl/headers/include/thrust/system/omp/memory_resource.h +75 -0
  1700. cuda/cccl/headers/include/thrust/system/omp/pointer.h +120 -0
  1701. cuda/cccl/headers/include/thrust/system/omp/vector.h +99 -0
  1702. cuda/cccl/headers/include/thrust/system/system_error.h +184 -0
  1703. cuda/cccl/headers/include/thrust/system/tbb/detail/adjacent_difference.h +54 -0
  1704. cuda/cccl/headers/include/thrust/system/tbb/detail/assign_value.h +30 -0
  1705. cuda/cccl/headers/include/thrust/system/tbb/detail/binary_search.h +30 -0
  1706. cuda/cccl/headers/include/thrust/system/tbb/detail/copy.h +50 -0
  1707. cuda/cccl/headers/include/thrust/system/tbb/detail/copy.inl +73 -0
  1708. cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.h +47 -0
  1709. cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.inl +136 -0
  1710. cuda/cccl/headers/include/thrust/system/tbb/detail/count.h +30 -0
  1711. cuda/cccl/headers/include/thrust/system/tbb/detail/equal.h +30 -0
  1712. cuda/cccl/headers/include/thrust/system/tbb/detail/execution_policy.h +92 -0
  1713. cuda/cccl/headers/include/thrust/system/tbb/detail/extrema.h +66 -0
  1714. cuda/cccl/headers/include/thrust/system/tbb/detail/fill.h +30 -0
  1715. cuda/cccl/headers/include/thrust/system/tbb/detail/find.h +49 -0
  1716. cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.h +51 -0
  1717. cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.inl +91 -0
  1718. cuda/cccl/headers/include/thrust/system/tbb/detail/gather.h +30 -0
  1719. cuda/cccl/headers/include/thrust/system/tbb/detail/generate.h +30 -0
  1720. cuda/cccl/headers/include/thrust/system/tbb/detail/get_value.h +30 -0
  1721. cuda/cccl/headers/include/thrust/system/tbb/detail/inner_product.h +30 -0
  1722. cuda/cccl/headers/include/thrust/system/tbb/detail/iter_swap.h +30 -0
  1723. cuda/cccl/headers/include/thrust/system/tbb/detail/logical.h +30 -0
  1724. cuda/cccl/headers/include/thrust/system/tbb/detail/malloc_and_free.h +30 -0
  1725. cuda/cccl/headers/include/thrust/system/tbb/detail/memory.inl +94 -0
  1726. cuda/cccl/headers/include/thrust/system/tbb/detail/merge.h +77 -0
  1727. cuda/cccl/headers/include/thrust/system/tbb/detail/merge.inl +327 -0
  1728. cuda/cccl/headers/include/thrust/system/tbb/detail/mismatch.h +30 -0
  1729. cuda/cccl/headers/include/thrust/system/tbb/detail/par.h +62 -0
  1730. cuda/cccl/headers/include/thrust/system/tbb/detail/partition.h +84 -0
  1731. cuda/cccl/headers/include/thrust/system/tbb/detail/partition.inl +98 -0
  1732. cuda/cccl/headers/include/thrust/system/tbb/detail/per_device_resource.h +29 -0
  1733. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.h +54 -0
  1734. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.inl +137 -0
  1735. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.h +61 -0
  1736. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.inl +400 -0
  1737. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_intervals.h +140 -0
  1738. cuda/cccl/headers/include/thrust/system/tbb/detail/remove.h +76 -0
  1739. cuda/cccl/headers/include/thrust/system/tbb/detail/remove.inl +87 -0
  1740. cuda/cccl/headers/include/thrust/system/tbb/detail/replace.h +30 -0
  1741. cuda/cccl/headers/include/thrust/system/tbb/detail/reverse.h +30 -0
  1742. cuda/cccl/headers/include/thrust/system/tbb/detail/scan.h +59 -0
  1743. cuda/cccl/headers/include/thrust/system/tbb/detail/scan.inl +312 -0
  1744. cuda/cccl/headers/include/thrust/system/tbb/detail/scan_by_key.h +30 -0
  1745. cuda/cccl/headers/include/thrust/system/tbb/detail/scatter.h +30 -0
  1746. cuda/cccl/headers/include/thrust/system/tbb/detail/sequence.h +30 -0
  1747. cuda/cccl/headers/include/thrust/system/tbb/detail/set_operations.h +30 -0
  1748. cuda/cccl/headers/include/thrust/system/tbb/detail/sort.h +60 -0
  1749. cuda/cccl/headers/include/thrust/system/tbb/detail/sort.inl +295 -0
  1750. cuda/cccl/headers/include/thrust/system/tbb/detail/swap_ranges.h +30 -0
  1751. cuda/cccl/headers/include/thrust/system/tbb/detail/tabulate.h +30 -0
  1752. cuda/cccl/headers/include/thrust/system/tbb/detail/temporary_buffer.h +29 -0
  1753. cuda/cccl/headers/include/thrust/system/tbb/detail/transform.h +30 -0
  1754. cuda/cccl/headers/include/thrust/system/tbb/detail/transform_reduce.h +30 -0
  1755. cuda/cccl/headers/include/thrust/system/tbb/detail/transform_scan.h +30 -0
  1756. cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_copy.h +30 -0
  1757. cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_fill.h +30 -0
  1758. cuda/cccl/headers/include/thrust/system/tbb/detail/unique.h +60 -0
  1759. cuda/cccl/headers/include/thrust/system/tbb/detail/unique.inl +71 -0
  1760. cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.h +67 -0
  1761. cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.inl +75 -0
  1762. cuda/cccl/headers/include/thrust/system/tbb/execution_policy.h +160 -0
  1763. cuda/cccl/headers/include/thrust/system/tbb/memory.h +111 -0
  1764. cuda/cccl/headers/include/thrust/system/tbb/memory_resource.h +75 -0
  1765. cuda/cccl/headers/include/thrust/system/tbb/pointer.h +120 -0
  1766. cuda/cccl/headers/include/thrust/system/tbb/vector.h +99 -0
  1767. cuda/cccl/headers/include/thrust/system_error.h +57 -0
  1768. cuda/cccl/headers/include/thrust/tabulate.h +125 -0
  1769. cuda/cccl/headers/include/thrust/transform.h +903 -0
  1770. cuda/cccl/headers/include/thrust/transform_reduce.h +190 -0
  1771. cuda/cccl/headers/include/thrust/transform_scan.h +442 -0
  1772. cuda/cccl/headers/include/thrust/tuple.h +142 -0
  1773. cuda/cccl/headers/include/thrust/type_traits/integer_sequence.h +261 -0
  1774. cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +182 -0
  1775. cuda/cccl/headers/include/thrust/type_traits/is_execution_policy.h +65 -0
  1776. cuda/cccl/headers/include/thrust/type_traits/is_operator_less_or_greater_function_object.h +184 -0
  1777. cuda/cccl/headers/include/thrust/type_traits/is_operator_plus_function_object.h +116 -0
  1778. cuda/cccl/headers/include/thrust/type_traits/is_trivially_relocatable.h +306 -0
  1779. cuda/cccl/headers/include/thrust/type_traits/logical_metafunctions.h +42 -0
  1780. cuda/cccl/headers/include/thrust/type_traits/unwrap_contiguous_iterator.h +93 -0
  1781. cuda/cccl/headers/include/thrust/uninitialized_copy.h +300 -0
  1782. cuda/cccl/headers/include/thrust/uninitialized_fill.h +268 -0
  1783. cuda/cccl/headers/include/thrust/unique.h +1090 -0
  1784. cuda/cccl/headers/include/thrust/universal_allocator.h +90 -0
  1785. cuda/cccl/headers/include/thrust/universal_ptr.h +34 -0
  1786. cuda/cccl/headers/include/thrust/universal_vector.h +71 -0
  1787. cuda/cccl/headers/include/thrust/version.h +93 -0
  1788. cuda/cccl/headers/include/thrust/zip_function.h +176 -0
  1789. cuda/cccl/headers/include_paths.py +72 -0
  1790. cuda/cccl/parallel/__init__.py +3 -0
  1791. cuda/cccl/parallel/experimental/__init__.py +3 -0
  1792. cuda/cccl/parallel/experimental/_bindings.py +24 -0
  1793. cuda/cccl/parallel/experimental/_bindings.pyi +388 -0
  1794. cuda/cccl/parallel/experimental/_bindings_impl.cpython-313-x86_64-linux-gnu.so +0 -0
  1795. cuda/cccl/parallel/experimental/_bindings_impl.pyx +2158 -0
  1796. cuda/cccl/parallel/experimental/_caching.py +71 -0
  1797. cuda/cccl/parallel/experimental/_cccl_interop.py +371 -0
  1798. cuda/cccl/parallel/experimental/_utils/__init__.py +0 -0
  1799. cuda/cccl/parallel/experimental/_utils/protocols.py +132 -0
  1800. cuda/cccl/parallel/experimental/algorithms/__init__.py +28 -0
  1801. cuda/cccl/parallel/experimental/algorithms/_merge_sort.py +172 -0
  1802. cuda/cccl/parallel/experimental/algorithms/_radix_sort.py +244 -0
  1803. cuda/cccl/parallel/experimental/algorithms/_reduce.py +136 -0
  1804. cuda/cccl/parallel/experimental/algorithms/_scan.py +179 -0
  1805. cuda/cccl/parallel/experimental/algorithms/_segmented_reduce.py +183 -0
  1806. cuda/cccl/parallel/experimental/algorithms/_transform.py +213 -0
  1807. cuda/cccl/parallel/experimental/algorithms/_unique_by_key.py +179 -0
  1808. cuda/cccl/parallel/experimental/cccl/.gitkeep +0 -0
  1809. cuda/cccl/parallel/experimental/cccl/libcccl.c.parallel.so +0 -0
  1810. cuda/cccl/parallel/experimental/iterators/__init__.py +157 -0
  1811. cuda/cccl/parallel/experimental/iterators/_iterators.py +650 -0
  1812. cuda/cccl/parallel/experimental/numba_utils.py +6 -0
  1813. cuda/cccl/parallel/experimental/struct.py +150 -0
  1814. cuda/cccl/parallel/experimental/typing.py +27 -0
  1815. cuda/cccl/py.typed +0 -0
  1816. cuda_cccl-0.1.3.1.0.dev1486.dist-info/METADATA +29 -0
  1817. cuda_cccl-0.1.3.1.0.dev1486.dist-info/RECORD +1819 -0
  1818. cuda_cccl-0.1.3.1.0.dev1486.dist-info/WHEEL +6 -0
  1819. cuda_cccl-0.1.3.1.0.dev1486.dist-info/licenses/LICENSE +1 -0
@@ -0,0 +1,3431 @@
1
+ /******************************************************************************
2
+ * Copyright (c) 2011, Duane Merrill. All rights reserved.
3
+ * Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved.
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ * * Redistributions of source code must retain the above copyright
8
+ * notice, this list of conditions and the following disclaimer.
9
+ * * Redistributions in binary form must reproduce the above copyright
10
+ * notice, this list of conditions and the following disclaimer in the
11
+ * documentation and/or other materials provided with the distribution.
12
+ * * Neither the name of the NVIDIA CORPORATION nor the
13
+ * names of its contributors may be used to endorse or promote products
14
+ * derived from this software without specific prior written permission.
15
+ *
16
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
+ *
27
+ ******************************************************************************/
28
+
29
+ //! @file
30
+ //! cub::DeviceRadixSort provides device-wide, parallel operations for computing a radix sort across a sequence of data
31
+ //! items residing within device-accessible memory.
32
+
33
+ #pragma once
34
+
35
+ #include <cub/config.cuh>
36
+
37
+ #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
38
+ # pragma GCC system_header
39
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
40
+ # pragma clang system_header
41
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
42
+ # pragma system_header
43
+ #endif // no system header
44
+
45
+ #include <cub/detail/choose_offset.cuh>
46
+ #include <cub/device/dispatch/dispatch_radix_sort.cuh>
47
+
48
+ #include <cuda/std/type_traits>
49
+
50
+ CUB_NAMESPACE_BEGIN
51
+
52
+ //! @brief DeviceRadixSort provides device-wide, parallel operations for
53
+ //! computing a radix sort across a sequence of data items residing
54
+ //! within device-accessible memory. ![](sorting_logo.png)
55
+ //!
56
+ //! @par Overview
57
+ //! The [*radix sorting method*](http://en.wikipedia.org/wiki/Radix_sort)
58
+ //! arranges items into ascending (or descending) order. The algorithm relies
59
+ //! upon a positional representation for keys, i.e., each key is comprised of an
60
+ //! ordered sequence of symbols (e.g., digits, characters, etc.) specified from
61
+ //! least-significant to most-significant. For a given input sequence of keys
62
+ //! and a set of rules specifying a total ordering of the symbolic alphabet, the
63
+ //! radix sorting method produces a lexicographic ordering of those keys.
64
+ //!
65
+ //! @par Supported Types
66
+ //! DeviceRadixSort can sort all of the built-in C++ numeric primitive types
67
+ //! (`unsigned char`, `int`, `double`, etc.) as well as CUDA's `__half`
68
+ //! and `__nv_bfloat16` 16-bit floating-point types. User-defined types are
69
+ //! supported as long as a decomposer object is provided.
70
+ //!
71
+ //! @par Floating-Point Special Cases
72
+ //!
73
+ //! - Positive and negative zeros are considered equivalent, and will be treated
74
+ //! as such in the output.
75
+ //! - No special handling is implemented for NaN values; these are sorted
76
+ //! according to their bit representations after any transformations.
77
+ //!
78
+ //! @par Transformations
79
+ //! Although the direct radix sorting method can only be applied to unsigned
80
+ //! integral types, DeviceRadixSort is able to sort signed and floating-point
81
+ //! types via simple bit-wise transformations that ensure lexicographic key
82
+ //! ordering. Additional transformations occur for descending sorts. These
83
+ //! transformations must be considered when restricting the
84
+ //! `[begin_bit, end_bit)` range, as the bitwise transformations will occur
85
+ //! before the bit-range truncation.
86
+ //!
87
+ //! Any transformations applied to the keys prior to sorting are reversed
88
+ //! while writing to the final output buffer.
89
+ //!
90
+ //! @par Type Specific Bitwise Transformations
91
+ //! To convert the input values into a radix-sortable bitwise representation,
92
+ //! the following transformations take place prior to sorting:
93
+ //!
94
+ //! - For unsigned integral values, the keys are used directly.
95
+ //! - For signed integral values, the sign bit is inverted.
96
+ //! - For positive floating point values, the sign bit is inverted.
97
+ //! - For negative floating point values, the full key is inverted.
98
+ //!
99
+ //! For floating point types, positive and negative zero are a special case and
100
+ //! will be considered equivalent during sorting.
101
+ //!
102
+ //! @par Descending Sort Bitwise Transformations
103
+ //! If descending sort is used, the keys are inverted after performing any
104
+ //! type-specific transformations, and the resulting keys are sorted in ascending
105
+ //! order.
106
+ //!
107
+ //! @par Stability
108
+ //! DeviceRadixSort is stable. For floating-point types, `-0.0` and `+0.0` are
109
+ //! considered equal and appear in the result in the same order as they appear in
110
+ //! the input.
111
+ //!
112
+ //! @par Usage Considerations
113
+ //! @cdp_class{DeviceRadixSort}
114
+ //!
115
+ //! @par Performance
116
+ //! @linear_performance{radix sort} The following chart illustrates
117
+ //! DeviceRadixSort::SortKeys performance across different CUDA architectures
118
+ //! for uniform-random `uint32` keys.
119
+ //! @plots_below
120
+ //!
121
+ //! @image html lsb_radix_sort_int32_keys.png
122
+ struct DeviceRadixSort
123
+ {
124
+ private:
125
+ template <SortOrder Order, typename KeyT, typename ValueT, typename NumItemsT, typename DecomposerT> // declaration only: no body follows
126
+ CUB_RUNTIME_FUNCTION static cudaError_t custom_radix_sort(
127
+ ::cuda::std::false_type, // unnamed tag parameter; counterpart of the true_type overload that has a body. NOTE(review): presumably left undefined on purpose for decomposers that fail the check -- confirm
128
+ void* d_temp_storage,
129
+ size_t& temp_storage_bytes,
130
+ bool is_overwrite_okay,
131
+ DoubleBuffer<KeyT>& d_keys,
132
+ DoubleBuffer<ValueT>& d_values,
133
+ NumItemsT num_items,
134
+ DecomposerT decomposer,
135
+ int begin_bit, // this variant takes an explicit [begin_bit, end_bit) range
136
+ int end_bit,
137
+ cudaStream_t stream);
138
+
139
+ template <SortOrder Order, typename KeyT, typename ValueT, typename OffsetT, typename DecomposerT> // Order is supplied explicitly by callers; the remaining parameters are deduced
140
+ CUB_RUNTIME_FUNCTION static cudaError_t custom_radix_sort(
141
+ ::cuda::std::true_type, // unnamed tag parameter: only selects this overload, never read in the body
142
+ void* d_temp_storage,
143
+ size_t& temp_storage_bytes,
144
+ bool is_overwrite_okay,
145
+ DoubleBuffer<KeyT>& d_keys,
146
+ DoubleBuffer<ValueT>& d_values,
147
+ OffsetT num_items,
148
+ DecomposerT decomposer,
149
+ int begin_bit,
150
+ int end_bit,
151
+ cudaStream_t stream)
152
+ {
153
+ return DispatchRadixSort<Order, KeyT, ValueT, OffsetT, DecomposerT>::Dispatch( // thin forwarder: the result of Dispatch is returned unchanged
154
+ d_temp_storage,
155
+ temp_storage_bytes,
156
+ d_keys,
157
+ d_values,
158
+ static_cast<OffsetT>(num_items), // num_items is already declared as OffsetT, so this cast does not change the type
159
+ begin_bit, // note the order: Dispatch receives the bit range before the overwrite flag
160
+ end_bit,
161
+ is_overwrite_okay,
162
+ stream,
163
+ decomposer); // the decomposer is passed last
164
+ }
165
+
166
+ template <SortOrder Order, typename KeyT, typename ValueT, typename NumItemsT, typename DecomposerT> // declaration only: no body follows
167
+ CUB_RUNTIME_FUNCTION static cudaError_t custom_radix_sort(
168
+ ::cuda::std::false_type, // unnamed tag parameter; counterpart of the true_type overload that has a body. NOTE(review): presumably left undefined on purpose for decomposers that fail the check -- confirm
169
+ void* d_temp_storage,
170
+ size_t& temp_storage_bytes,
171
+ bool is_overwrite_okay,
172
+ DoubleBuffer<KeyT>& d_keys,
173
+ DoubleBuffer<ValueT>& d_values,
174
+ NumItemsT num_items,
175
+ DecomposerT decomposer, // this variant has no begin_bit/end_bit parameters
176
+ cudaStream_t stream);
177
+
178
+ template <SortOrder Order, typename KeyT, typename ValueT, typename OffsetT, typename DecomposerT> // Order is supplied explicitly by callers; the remaining parameters are deduced
179
+ CUB_RUNTIME_FUNCTION static cudaError_t custom_radix_sort(
180
+ ::cuda::std::true_type, // unnamed tag parameter: only selects this overload, never read in the body
181
+ void* d_temp_storage,
182
+ size_t& temp_storage_bytes,
183
+ bool is_overwrite_okay,
184
+ DoubleBuffer<KeyT>& d_keys,
185
+ DoubleBuffer<ValueT>& d_values,
186
+ OffsetT num_items,
187
+ DecomposerT decomposer,
188
+ cudaStream_t stream)
189
+ {
190
+ constexpr int begin_bit = 0; // no bit range was given, so start from bit 0
191
+ const int end_bit = detail::radix::traits_t<KeyT>::default_end_bit(decomposer); // end bit is obtained from the key traits for this decomposer (not constexpr, unlike begin_bit)
192
+
193
+ return DeviceRadixSort::custom_radix_sort<Order>( // delegates to the overload that takes an explicit [begin_bit, end_bit)
194
+ ::cuda::std::true_type{}, // re-creates the tag so the overload with a body is chosen
195
+ d_temp_storage,
196
+ temp_storage_bytes,
197
+ is_overwrite_okay,
198
+ d_keys,
199
+ d_values,
200
+ num_items,
201
+ decomposer,
202
+ begin_bit,
203
+ end_bit,
204
+ stream);
205
+ }
206
+
207
+ // Name reported for NVTX ranges
208
+ _CCCL_HOST_DEVICE static constexpr auto GetName() -> const char* // the public entry points pass this to _CCCL_NVTX_RANGE_SCOPE_IF
209
+ {
210
+ return "cub::DeviceRadixSort"; // string literal, so the returned pointer refers to static storage
211
+ }
212
+
213
+ public:
214
+ //! @name Key-value pairs
215
+ //@{
216
+
217
+ //! @brief Sorts key-value pairs into ascending order.
218
+ //! (`~2N` auxiliary storage required)
219
+ //!
220
+ //! @par
221
+ //! - The contents of the input data are not altered by the sorting operation.
222
+ //! - Pointers to contiguous memory must be used; iterators are not currently
223
+ //! supported.
224
+ //! - In-place operations are not supported. There must be no overlap between
225
+ //! any of the provided ranges:
226
+ //! - `[d_keys_in, d_keys_in + num_items)`
227
+ //! - `[d_keys_out, d_keys_out + num_items)`
228
+ //! - `[d_values_in, d_values_in + num_items)`
229
+ //! - `[d_values_out, d_values_out + num_items)`
230
+ //! - An optional bit subrange `[begin_bit, end_bit)` of differentiating key
231
+ //! bits can be specified. This can reduce overall sorting overhead and
232
+ //! yield a corresponding performance improvement.
233
+ //! - @devicestorageNP For sorting using only `O(P)` temporary storage, see
234
+ //! the sorting interface using DoubleBuffer wrappers below.
235
+ //! - @devicestorage
236
+ //!
237
+ //! @par Performance
238
+ //! The following charts illustrate saturated sorting performance across
239
+ //! different CUDA architectures for uniform-random `uint32, uint32` and
240
+ //! `uint64, uint64` pairs, respectively.
241
+ //!
242
+ //! @image html lsb_radix_sort_int32_pairs.png
243
+ //! @image html lsb_radix_sort_int64_pairs.png
244
+ //!
245
+ //! @par Snippet
246
+ //! The code snippet below illustrates the sorting of a device vector of `int`
247
+ //! keys with associated vector of `int` values.
248
+ //! @par
249
+ //! @code
250
+ //! #include <cub/cub.cuh>
251
+ //! // or equivalently <cub/device/device_radix_sort.cuh>
252
+ //!
253
+ //! // Declare, allocate, and initialize device-accessible pointers
254
+ //! // for sorting data
255
+ //! int num_items; // e.g., 7
256
+ //! int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
257
+ //! int *d_keys_out; // e.g., [ ... ]
258
+ //! int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6]
259
+ //! int *d_values_out; // e.g., [ ... ]
260
+ //! ...
261
+ //!
262
+ //! // Determine temporary device storage requirements
263
+ //! void *d_temp_storage = nullptr;
264
+ //! size_t temp_storage_bytes = 0;
265
+ //! cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes,
266
+ //! d_keys_in, d_keys_out, d_values_in, d_values_out, num_items);
267
+ //!
268
+ //! // Allocate temporary storage
269
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
270
+ //!
271
+ //! // Run sorting operation
272
+ //! cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes,
273
+ //! d_keys_in, d_keys_out, d_values_in, d_values_out, num_items);
274
+ //!
275
+ //! // d_keys_out <-- [0, 3, 5, 6, 7, 8, 9]
276
+ //! // d_values_out <-- [5, 4, 3, 1, 2, 0, 6]
277
+ //! @endcode
278
+ //!
279
+ //! @tparam KeyT
280
+ //! **[inferred]** KeyT type
281
+ //!
282
+ //! @tparam ValueT
283
+ //! **[inferred]** ValueT type
284
+ //!
285
+ //! @tparam NumItemsT
286
+ //! **[inferred]** Type of num_items
287
+ //!
288
+ //! @param[in] d_temp_storage
289
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
290
+ //! required allocation size is written to `temp_storage_bytes` and no work
291
+ //! is done.
292
+ //!
293
+ //! @param[in,out] temp_storage_bytes
294
+ //! Reference to size in bytes of `d_temp_storage` allocation
295
+ //!
296
+ //! @param[in] d_keys_in
297
+ //! Pointer to the input sequence of key data to sort
298
+ //!
299
+ //! @param[out] d_keys_out
300
+ //! Pointer to the sorted output sequence of key data
301
+ //!
302
+ //! @param[in] d_values_in
303
+ //! Pointer to the corresponding input sequence of associated value items
304
+ //!
305
+ //! @param[out] d_values_out
306
+ //! Pointer to the correspondingly-reordered output sequence of associated
307
+ //! value items
308
+ //!
309
+ //! @param[in] num_items
310
+ //! Number of items to sort
311
+ //!
312
+ //! @param[in] begin_bit
313
+ //! **[optional]** The least-significant bit index (inclusive) needed for
314
+ //! key comparison
315
+ //!
316
+ //! @param[in] end_bit
317
+ //! **[optional]** The most-significant bit index (exclusive) needed for key
318
+ //! comparison (e.g., sizeof(unsigned int) * 8)
319
+ //!
320
+ //! @param[in] stream
321
+ //! **[optional]** CUDA stream to launch kernels within.
322
+ //! Default is stream<sub>0</sub>.
323
+ template <typename KeyT, typename ValueT, typename NumItemsT>
324
+ CUB_RUNTIME_FUNCTION static cudaError_t SortPairs(
325
+ void* d_temp_storage,
326
+ size_t& temp_storage_bytes,
327
+ const KeyT* d_keys_in,
328
+ KeyT* d_keys_out,
329
+ const ValueT* d_values_in,
330
+ ValueT* d_values_out,
331
+ NumItemsT num_items,
332
+ int begin_bit = 0,
333
+ int end_bit = sizeof(KeyT) * 8, // default covers every bit of KeyT (size in bytes times 8)
334
+ cudaStream_t stream = 0)
335
+ {
336
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); // NOTE(review): presumably opens the NVTX range only when d_temp_storage is non-null (the real run, not the size query) -- confirm against the macro
337
+ // Unsigned integer type for global offsets.
338
+ using OffsetT = detail::choose_offset_t<NumItemsT>;
339
+
340
+ // TODO API that doesn't accept decomposer should also contain a static
341
+ // assert that the key type is fundamental.
342
+
343
+ // We cast away const-ness, but will *not* write to these arrays.
344
+ // `DispatchRadixSort::Dispatch` will allocate temporary storage and
345
+ // create a new double-buffer internally when the `is_overwrite_okay` flag
346
+ // is not set.
347
+ constexpr bool is_overwrite_okay = false;
348
+ DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out); // wraps the (input, output) key pointers, in that order
349
+ DoubleBuffer<ValueT> d_values(const_cast<ValueT*>(d_values_in), d_values_out); // wraps the (input, output) value pointers, in that order
350
+
351
+ return DispatchRadixSort<SortOrder::Ascending, KeyT, ValueT, OffsetT>::Dispatch( // no decomposer is named or passed here, so DispatchRadixSort's defaults apply
352
+ d_temp_storage,
353
+ temp_storage_bytes,
354
+ d_keys,
355
+ d_values,
356
+ static_cast<OffsetT>(num_items), // convert the caller's NumItemsT to the offset type chosen above
357
+ begin_bit,
358
+ end_bit,
359
+ is_overwrite_okay,
360
+ stream);
361
+ }
362
+
363
+ //! @rst
364
+ //! Sorts key-value pairs into ascending order using :math:`\approx 2N` auxiliary storage.
365
+ //!
366
+ //! * The contents of the input data are not altered by the sorting operation.
367
+ //! * Pointers to contiguous memory must be used; iterators are not currently
368
+ //! supported.
369
+ //! * In-place operations are not supported. There must be no overlap between
370
+ //! any of the provided ranges:
371
+ //!
372
+ //! * ``[d_keys_in, d_keys_in + num_items)``
373
+ //! * ``[d_keys_out, d_keys_out + num_items)``
374
+ //! * ``[d_values_in, d_values_in + num_items)``
375
+ //! * ``[d_values_out, d_values_out + num_items)``
376
+ //!
377
+ //! * A bit subrange ``[begin_bit, end_bit)`` is provided to specify
378
+ //! differentiating key bits. This can reduce overall sorting overhead and
379
+ //! yield a corresponding performance improvement.
380
+ //! * @devicestorageNP For sorting using only :math:`O(P)` temporary storage, see
381
+ //! the sorting interface using DoubleBuffer wrappers below.
382
+ //! * @devicestorage
383
+ //!
384
+ //! Snippet
385
+ //! ==========================================================================
386
+ //!
387
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
388
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
389
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
390
+ //! tuple of references to relevant members of the key.
391
+ //!
392
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
393
+ //! :language: c++
394
+ //! :dedent:
395
+ //! :start-after: example-begin custom-type
396
+ //! :end-before: example-end custom-type
397
+ //!
398
+ //! The following snippet shows how to sort an array of ``custom_t`` objects
399
+ //! using ``cub::DeviceRadixSort::SortPairs``:
400
+ //!
401
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
402
+ //! :language: c++
403
+ //! :dedent:
404
+ //! :start-after: example-begin pairs-bits
405
+ //! :end-before: example-end pairs-bits
406
+ //!
407
+ //! @endrst
408
+ //!
409
+ //! @tparam KeyT
410
+ //! **[inferred]** KeyT type
411
+ //!
412
+ //! @tparam ValueT
413
+ //! **[inferred]** ValueT type
414
+ //!
415
+ //! @tparam NumItemsT
416
+ //! **[inferred]** Type of num_items
417
+ //!
418
+ //! @tparam DecomposerT
419
+ //! **[inferred]** Type of a callable object responsible for decomposing a
420
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
421
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
422
+ //! The leftmost element of the tuple is considered the most significant.
423
+ //! The call operator must not modify members of the key.
424
+ //!
425
+ //! @param[in] d_temp_storage
426
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
427
+ //! required allocation size is written to `temp_storage_bytes` and no work
428
+ //! is done.
429
+ //!
430
+ //! @param[in,out] temp_storage_bytes
431
+ //! Reference to size in bytes of `d_temp_storage` allocation
432
+ //!
433
+ //! @param[in] d_keys_in
434
+ //! Pointer to the input sequence of key data to sort
435
+ //!
436
+ //! @param[out] d_keys_out
437
+ //! Pointer to the sorted output sequence of key data
438
+ //!
439
+ //! @param[in] d_values_in
440
+ //! Pointer to the corresponding input sequence of associated value items
441
+ //!
442
+ //! @param[out] d_values_out
443
+ //! Pointer to the correspondingly-reordered output sequence of associated
444
+ //! value items
445
+ //!
446
+ //! @param[in] num_items
447
+ //! Number of items to sort
448
+ //!
449
+ //! @param[in] decomposer
450
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
451
+ //! references to its constituent arithmetic types. The leftmost element of
452
+ //! the tuple is considered the most significant. The call operator must not
453
+ //! modify members of the key.
454
+ //!
455
+ //! @param[in] begin_bit
456
+ //! **[optional]** The least-significant bit index (inclusive) needed for
457
+ //! key comparison
458
+ //!
459
+ //! @param[in] end_bit
460
+ //! **[optional]** The most-significant bit index (exclusive) needed for key
461
+ //! comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`)
462
+ //!
463
+ //! @param[in] stream
464
+ //! **[optional]** CUDA stream to launch kernels within.
465
+ //! Default is stream<sub>0</sub>.
466
+ template <typename KeyT, typename ValueT, typename NumItemsT, typename DecomposerT>
467
+ CUB_RUNTIME_FUNCTION static //
468
+ ::cuda::std::enable_if_t< //
469
+ !::cuda::std::is_convertible_v<DecomposerT, int>, // overload is disabled when DecomposerT converts to int; presumably so an integer begin_bit argument is not taken for a decomposer -- confirm
470
+ cudaError_t>
471
+ SortPairs(void* d_temp_storage,
472
+ size_t& temp_storage_bytes,
473
+ const KeyT* d_keys_in,
474
+ KeyT* d_keys_out,
475
+ const ValueT* d_values_in,
476
+ ValueT* d_values_out,
477
+ NumItemsT num_items,
478
+ DecomposerT decomposer,
479
+ int begin_bit, // no defaults here: both ends of the bit range must be supplied
480
+ int end_bit,
481
+ cudaStream_t stream = 0)
482
+ {
483
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); // NOTE(review): presumably opens the NVTX range only when d_temp_storage is non-null (the real run, not the size query) -- confirm against the macro
484
+ // unsigned integer type for global offsets
485
+ using offset_t = detail::choose_offset_t<NumItemsT>;
486
+ using decomposer_check_t = detail::radix::decomposer_check_t<KeyT, DecomposerT>; // used both for the static_assert and as the dispatch tag below
487
+
488
+ static_assert(decomposer_check_t::value,
489
+ "DecomposerT must be a callable object returning a tuple of references to "
490
+ "arithmetic types");
491
+
492
+ // We cast away const-ness, but will *not* write to these arrays.
493
+ // `DispatchRadixSort::Dispatch` will allocate temporary storage and
494
+ // create a new double-buffer internally when the `is_overwrite_okay` flag
495
+ // is not set.
496
+ constexpr bool is_overwrite_okay = false;
497
+ DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out); // wraps the (input, output) key pointers, in that order
498
+ DoubleBuffer<ValueT> d_values(const_cast<ValueT*>(d_values_in), d_values_out); // wraps the (input, output) value pointers, in that order
499
+
500
+ return DeviceRadixSort::custom_radix_sort<SortOrder::Ascending>( // only the sort order is given explicitly; the other template arguments are deduced
501
+ decomposer_check_t{}, // tag object that picks which custom_radix_sort overload is called
502
+ d_temp_storage,
503
+ temp_storage_bytes,
504
+ is_overwrite_okay,
505
+ d_keys,
506
+ d_values,
507
+ static_cast<offset_t>(num_items), // converted up front, so the helper deduces its item-count type as offset_t
508
+ decomposer,
509
+ begin_bit,
510
+ end_bit,
511
+ stream);
512
+ }
513
+
514
+ //! @rst
515
+ //! Sorts key-value pairs into ascending order using :math:`\approx 2N` auxiliary storage.
516
+ //!
517
+ //! * The contents of the input data are not altered by the sorting operation.
518
+ //! * Pointers to contiguous memory must be used; iterators are not currently
519
+ //! supported.
520
+ //! * In-place operations are not supported. There must be no overlap between
521
+ //! any of the provided ranges:
522
+ //!
523
+ //! * ``[d_keys_in, d_keys_in + num_items)``
524
+ //! * ``[d_keys_out, d_keys_out + num_items)``
525
+ //! * ``[d_values_in, d_values_in + num_items)``
526
+ //! * ``[d_values_out, d_values_out + num_items)``
527
+ //!
528
+ //! * @devicestorageNP For sorting using only :math:`O(P)` temporary storage, see
529
+ //! the sorting interface using DoubleBuffer wrappers below.
530
+ //! * @devicestorage
531
+ //!
532
+ //! Snippet
533
+ //! ==========================================================================
534
+ //!
535
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
536
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
537
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
538
+ //! tuple of references to relevant members of the key.
539
+ //!
540
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
541
+ //! :language: c++
542
+ //! :dedent:
543
+ //! :start-after: example-begin custom-type
544
+ //! :end-before: example-end custom-type
545
+ //!
546
+ //! The following snippet shows how to sort an array of ``custom_t`` objects
547
+ //! using ``cub::DeviceRadixSort::SortPairs``:
548
+ //!
549
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
550
+ //! :language: c++
551
+ //! :dedent:
552
+ //! :start-after: example-begin pairs
553
+ //! :end-before: example-end pairs
554
+ //!
555
+ //! @endrst
556
+ //!
557
+ //! @tparam KeyT
558
+ //! **[inferred]** KeyT type
559
+ //!
560
+ //! @tparam ValueT
561
+ //! **[inferred]** ValueT type
562
+ //!
563
+ //! @tparam NumItemsT
564
+ //! **[inferred]** Type of num_items
565
+ //!
566
+ //! @tparam DecomposerT
567
+ //! **[inferred]** Type of a callable object responsible for decomposing a
568
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
569
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
570
+ //! The leftmost element of the tuple is considered the most significant.
571
+ //! The call operator must not modify members of the key.
572
+ //!
573
+ //! @param[in] d_temp_storage
574
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
575
+ //! required allocation size is written to `temp_storage_bytes` and no work
576
+ //! is done.
577
+ //!
578
+ //! @param[in,out] temp_storage_bytes
579
+ //! Reference to size in bytes of `d_temp_storage` allocation
580
+ //!
581
+ //! @param[in] d_keys_in
582
+ //! Pointer to the input sequence of key data to sort
583
+ //!
584
+ //! @param[out] d_keys_out
585
+ //! Pointer to the sorted output sequence of key data
586
+ //!
587
+ //! @param[in] d_values_in
588
+ //! Pointer to the corresponding input sequence of associated value items
589
+ //!
590
+ //! @param[out] d_values_out
591
+ //! Pointer to the correspondingly-reordered output sequence of associated
592
+ //! value items
593
+ //!
594
+ //! @param[in] num_items
595
+ //! Number of items to sort
596
+ //!
597
+ //! @param[in] decomposer
598
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
599
+ //! references to its constituent arithmetic types. The leftmost element of
600
+ //! the tuple is considered the most significant. The call operator must not
601
+ //! modify members of the key.
602
+ //!
603
+ //! @param[in] stream
604
+ //! **[optional]** CUDA stream to launch kernels within.
605
+ //! Default is stream<sub>0</sub>.
606
+ template <typename KeyT, typename ValueT, typename NumItemsT, typename DecomposerT> // ascending key-value sort of decomposer-described keys (no bit subrange), from const input arrays into separate output arrays
607
+ CUB_RUNTIME_FUNCTION static //
608
+ ::cuda::std::enable_if_t< // this overload only participates when the condition below holds; its return type is then cudaError_t
609
+ !::cuda::std::is_convertible_v<DecomposerT, int>, // NOTE(review): presumably keeps an integral argument in the decomposer position from selecting this overload - confirm
610
+ cudaError_t>
611
+ SortPairs(void* d_temp_storage,
612
+ size_t& temp_storage_bytes,
613
+ const KeyT* d_keys_in,
614
+ KeyT* d_keys_out,
615
+ const ValueT* d_values_in,
616
+ ValueT* d_values_out,
617
+ NumItemsT num_items,
618
+ DecomposerT decomposer,
619
+ cudaStream_t stream = 0)
620
+ {
621
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); // NOTE(review): presumably opens a profiling range only when d_temp_storage is non-null - confirm against the macro definition
622
+ // Unsigned integer type used for global offsets, chosen from the caller's NumItemsT.
623
+ using offset_t = detail::choose_offset_t<NumItemsT>;
624
+ using decomposer_check_t = detail::radix::decomposer_check_t<KeyT, DecomposerT>; // compile-time check of DecomposerT against KeyT
625
+
626
+ static_assert(decomposer_check_t::value, // reject an unsuitable decomposer at compile time with a readable message
627
+ "DecomposerT must be a callable object returning a tuple of references to "
628
+ "arithmetic types");
629
+
630
+ // We cast away const-ness, but will *not* write to these arrays.
631
+ // `DispatchRadixSort::Dispatch` will allocate temporary storage and
632
+ // create a new double-buffer internally when the `is_overwrite_okay` flag
633
+ // is not set.
634
+ constexpr bool is_overwrite_okay = false; // input arrays must stay intact
635
+ DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out); // wrap the (input, output) key pointers in a DoubleBuffer
636
+ DoubleBuffer<ValueT> d_values(const_cast<ValueT*>(d_values_in), d_values_out); // same wrapping for the value pointers
637
+
638
+ return DeviceRadixSort::custom_radix_sort<SortOrder::Ascending>( // forward to the shared custom-key implementation; no bit range is passed in this overload
639
+ decomposer_check_t{}, // check type passed as a tag object; NOTE(review): presumably used for overload selection - confirm in custom_radix_sort
640
+ d_temp_storage,
641
+ temp_storage_bytes,
642
+ is_overwrite_okay,
643
+ d_keys,
644
+ d_values,
645
+ static_cast<offset_t>(num_items), // convert the caller's count to the chosen offset type explicitly
646
+ decomposer,
647
+ stream);
648
+ }
649
+
650
+ //! @brief Sorts key-value pairs into ascending order.
651
+ //! (`~N` auxiliary storage required)
652
+ //!
653
+ //! @par
654
+ //! - The sorting operation is given a pair of key buffers and a corresponding
655
+ //! pair of associated value buffers. Each pair is managed by a DoubleBuffer
656
+ //! structure that indicates which of the two buffers is "current" (and thus
657
+ //! contains the input data to be sorted).
658
+ //! - The contents of both buffers within each pair may be altered by the
659
+ //! sorting operation.
660
+ //! - In-place operations are not supported. There must be no overlap between
661
+ //! any of the provided ranges:
662
+ //! - `[d_keys.Current(), d_keys.Current() + num_items)`
663
+ //! - `[d_keys.Alternate(), d_keys.Alternate() + num_items)`
664
+ //! - `[d_values.Current(), d_values.Current() + num_items)`
665
+ //! - `[d_values.Alternate(), d_values.Alternate() + num_items)`
666
+ //! - Upon completion, the sorting operation will update the "current"
667
+ //! indicator within each DoubleBuffer wrapper to reference which of the two
668
+ //! buffers now contains the sorted output sequence (a function of the
669
+ //! number of key bits specified and the targeted device architecture).
670
+ //! - An optional bit subrange `[begin_bit, end_bit)` of differentiating key
671
+ //! bits can be specified. This can reduce overall sorting overhead and
672
+ //! yield a corresponding performance improvement.
673
+ //! - @devicestorageP
674
+ //! - @devicestorage
675
+ //!
676
+ //! @par Performance
677
+ //! The following charts illustrate saturated sorting performance across
678
+ //! different CUDA architectures for uniform-random `uint32, uint32` and
679
+ //! `uint64, uint64` pairs, respectively.
680
+ //!
681
+ //! @image html lsb_radix_sort_int32_pairs.png
682
+ //! @image html lsb_radix_sort_int64_pairs.png
683
+ //!
684
+ //! @par Snippet
685
+ //! The code snippet below illustrates the sorting of a device vector of `int`
686
+ //! keys with associated vector of `int` values.
687
+ //! @par
688
+ //! @code
689
+ //! #include <cub/cub.cuh>
690
+ //! // or equivalently <cub/device/device_radix_sort.cuh>
691
+ //!
692
+ //! // Declare, allocate, and initialize device-accessible pointers for
693
+ //! // sorting data
694
+ //! int num_items; // e.g., 7
695
+ //! int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9]
696
+ //! int *d_key_alt_buf; // e.g., [ ... ]
697
+ //! int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6]
698
+ //! int *d_value_alt_buf; // e.g., [ ... ]
699
+ //! ...
700
+ //!
701
+ //! // Create a set of DoubleBuffers to wrap pairs of device pointers
702
+ //! cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
703
+ //! cub::DoubleBuffer<int> d_values(d_value_buf, d_value_alt_buf);
704
+ //!
705
+ //! // Determine temporary device storage requirements
706
+ //! void *d_temp_storage = nullptr;
707
+ //! size_t temp_storage_bytes = 0;
708
+ //! cub::DeviceRadixSort::SortPairs(
709
+ //! d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items);
710
+ //!
711
+ //! // Allocate temporary storage
712
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
713
+ //!
714
+ //! // Run sorting operation
715
+ //! cub::DeviceRadixSort::SortPairs(
716
+ //! d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items);
717
+ //!
718
+ //! // d_keys.Current() <-- [0, 3, 5, 6, 7, 8, 9]
719
+ //! // d_values.Current() <-- [5, 4, 3, 1, 2, 0, 6]
720
+ //!
721
+ //! @endcode
722
+ //!
723
+ //! @tparam KeyT
724
+ //! **[inferred]** KeyT type
725
+ //!
726
+ //! @tparam ValueT
727
+ //! **[inferred]** ValueT type
728
+ //!
729
+ //! @tparam NumItemsT
730
+ //! **[inferred]** Type of num_items
731
+ //!
732
+ //! @param[in] d_temp_storage
733
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
734
+ //! required allocation size is written to `temp_storage_bytes` and no work is done.
735
+ //!
736
+ //! @param[in,out] temp_storage_bytes
737
+ //! Reference to size in bytes of `d_temp_storage` allocation
738
+ //!
739
+ //! @param[in,out] d_keys
740
+ //! Reference to the double-buffer of keys whose "current" device-accessible
741
+ //! buffer contains the unsorted input keys and, upon return, is updated to
742
+ //! point to the sorted output keys
743
+ //!
744
+ //! @param[in,out] d_values
745
+ //! Double-buffer of values whose "current" device-accessible buffer
746
+ //! contains the unsorted input values and, upon return, is updated to point
747
+ //! to the sorted output values
748
+ //!
749
+ //! @param[in] num_items
750
+ //! Number of items to sort
751
+ //!
752
+ //! @param[in] begin_bit
753
+ //! **[optional]** The least-significant bit index (inclusive) needed for
754
+ //! key comparison
755
+ //!
756
+ //! @param[in] end_bit
757
+ //! **[optional]** The most-significant bit index (exclusive) needed for key
758
+ //! comparison (e.g., `sizeof(unsigned int) * 8`)
759
+ //!
760
+ //! @param[in] stream
761
+ //! **[optional]** CUDA stream to launch kernels within.
762
+ //! Default is stream<sub>0</sub>.
763
+ template <typename KeyT, typename ValueT, typename NumItemsT> // ascending key-value sort operating directly on caller-provided DoubleBuffers
764
+ CUB_RUNTIME_FUNCTION static cudaError_t SortPairs(
765
+ void* d_temp_storage,
766
+ size_t& temp_storage_bytes,
767
+ DoubleBuffer<KeyT>& d_keys,
768
+ DoubleBuffer<ValueT>& d_values,
769
+ NumItemsT num_items,
770
+ int begin_bit = 0,
771
+ int end_bit = sizeof(KeyT) * 8, // default bit range spans every bit of KeyT
772
+ cudaStream_t stream = 0)
773
+ {
774
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); // NOTE(review): presumably opens a profiling range only when d_temp_storage is non-null - confirm against the macro definition
775
+
776
+ // Unsigned integer type used for global offsets, chosen from the caller's NumItemsT.
777
+ using OffsetT = detail::choose_offset_t<NumItemsT>;
778
+
779
+ constexpr bool is_overwrite_okay = true; // the caller handed over both buffers of each pair, so either may be written
780
+
781
+ return DispatchRadixSort<SortOrder::Ascending, KeyT, ValueT, OffsetT>::Dispatch( // forward to the ascending dispatch; num_items is passed through without an explicit cast here
782
+ d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, begin_bit, end_bit, is_overwrite_okay, stream);
783
+ }
784
+
785
+ //! @rst
786
+ //! Sorts key-value pairs into ascending order using :math:`\approx N` auxiliary storage.
787
+ //!
788
+ //! * The sorting operation is given a pair of key buffers and a corresponding
789
+ //! pair of associated value buffers. Each pair is managed by a DoubleBuffer
790
+ //! structure that indicates which of the two buffers is "current" (and thus
791
+ //! contains the input data to be sorted).
792
+ //! * The contents of both buffers within each pair may be altered by the
793
+ //! sorting operation.
794
+ //! * In-place operations are not supported. There must be no overlap between
795
+ //! any of the provided ranges:
796
+ //!
797
+ //! - ``[d_keys.Current(), d_keys.Current() + num_items)``
798
+ //! - ``[d_keys.Alternate(), d_keys.Alternate() + num_items)``
799
+ //! - ``[d_values.Current(), d_values.Current() + num_items)``
800
+ //! - ``[d_values.Alternate(), d_values.Alternate() + num_items)``
801
+ //!
802
+ //! * Upon completion, the sorting operation will update the "current"
803
+ //! indicator within each DoubleBuffer wrapper to reference which of the two
804
+ //! buffers now contains the sorted output sequence (a function of the
805
+ //! number of key bits specified and the targeted device architecture).
806
+ //! * @devicestorageP
807
+ //! * @devicestorage
808
+ //!
809
+ //! Snippet
810
+ //! ==========================================================================
811
+ //!
812
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
813
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
814
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
815
+ //! tuple of references to relevant members of the key.
816
+ //!
817
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
818
+ //! :language: c++
819
+ //! :dedent:
820
+ //! :start-after: example-begin custom-type
821
+ //! :end-before: example-end custom-type
822
+ //!
823
+ //! The following snippet shows how to sort an array of ``custom_t`` objects
824
+ //! using ``cub::DeviceRadixSort::SortPairs``:
825
+ //!
826
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
827
+ //! :language: c++
828
+ //! :dedent:
829
+ //! :start-after: example-begin pairs-db
830
+ //! :end-before: example-end pairs-db
831
+ //!
832
+ //! @endrst
833
+ //!
834
+ //! @tparam KeyT
835
+ //! **[inferred]** KeyT type
836
+ //!
837
+ //! @tparam ValueT
838
+ //! **[inferred]** ValueT type
839
+ //!
840
+ //! @tparam NumItemsT
841
+ //! **[inferred]** Type of num_items
842
+ //!
843
+ //! @tparam DecomposerT
844
+ //! **[inferred]** Type of a callable object responsible for decomposing a
845
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
846
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
847
+ //! The leftmost element of the tuple is considered the most significant.
848
+ //! The call operator must not modify members of the key.
849
+ //!
850
+ //! @param[in] d_temp_storage
851
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
852
+ //! required allocation size is written to `temp_storage_bytes` and no work
853
+ //! is done.
854
+ //!
855
+ //! @param[in,out] temp_storage_bytes
856
+ //! Reference to size in bytes of `d_temp_storage` allocation
857
+ //!
858
+ //! @param[in,out] d_keys
859
+ //! Reference to the double-buffer of keys whose "current" device-accessible
860
+ //! buffer contains the unsorted input keys and, upon return, is updated to
861
+ //! point to the sorted output keys
862
+ //!
863
+ //! @param[in,out] d_values
864
+ //! Double-buffer of values whose "current" device-accessible buffer
865
+ //! contains the unsorted input values and, upon return, is updated to point
866
+ //! to the sorted output values
867
+ //!
868
+ //! @param[in] num_items
869
+ //! Number of items to sort
870
+ //!
871
+ //! @param[in] decomposer
872
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
873
+ //! references to its constituent arithmetic types. The leftmost element of
874
+ //! the tuple is considered the most significant. The call operator must not
875
+ //! modify members of the key.
876
+ //!
877
+ //! @param[in] stream
878
+ //! **[optional]** CUDA stream to launch kernels within.
879
+ //! Default is stream<sub>0</sub>.
880
+ template <typename KeyT, typename ValueT, typename NumItemsT, typename DecomposerT> // ascending key-value sort of decomposer-described keys (no bit subrange) on caller-provided DoubleBuffers
881
+ CUB_RUNTIME_FUNCTION static //
882
+ ::cuda::std::enable_if_t< // this overload only participates when the condition below holds; its return type is then cudaError_t
883
+ !::cuda::std::is_convertible_v<DecomposerT, int>, // NOTE(review): presumably keeps an integral argument here binding to `begin_bit` of the non-decomposer DoubleBuffer overload - confirm
884
+ cudaError_t>
885
+ SortPairs(void* d_temp_storage,
886
+ size_t& temp_storage_bytes,
887
+ DoubleBuffer<KeyT>& d_keys,
888
+ DoubleBuffer<ValueT>& d_values,
889
+ NumItemsT num_items,
890
+ DecomposerT decomposer,
891
+ cudaStream_t stream = 0)
892
+ {
893
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); // NOTE(review): presumably opens a profiling range only when d_temp_storage is non-null - confirm against the macro definition
894
+
895
+ // Unsigned integer type used for global offsets, chosen from the caller's NumItemsT.
896
+ using offset_t = detail::choose_offset_t<NumItemsT>;
897
+ using decomposer_check_t = detail::radix::decomposer_check_t<KeyT, DecomposerT>; // compile-time check of DecomposerT against KeyT
898
+
899
+ static_assert(decomposer_check_t::value, // reject an unsuitable decomposer at compile time with a readable message
900
+ "DecomposerT must be a callable object returning a tuple of references to "
901
+ "arithmetic types");
902
+
903
+ constexpr bool is_overwrite_okay = true; // the caller handed over both buffers of each pair, so either may be written
904
+
905
+ return DeviceRadixSort::custom_radix_sort<SortOrder::Ascending>( // forward to the shared custom-key implementation; no bit range is passed in this overload
906
+ decomposer_check_t{}, // check type passed as a tag object; NOTE(review): presumably used for overload selection - confirm in custom_radix_sort
907
+ d_temp_storage,
908
+ temp_storage_bytes,
909
+ is_overwrite_okay,
910
+ d_keys,
911
+ d_values,
912
+ static_cast<offset_t>(num_items), // convert the caller's count to the chosen offset type explicitly
913
+ decomposer,
914
+ stream);
915
+ }
916
+
917
+ //! @rst
918
+ //! Sorts key-value pairs into ascending order using :math:`\approx N` auxiliary storage.
919
+ //!
920
+ //! * The sorting operation is given a pair of key buffers and a corresponding
921
+ //! pair of associated value buffers. Each pair is managed by a DoubleBuffer
922
+ //! structure that indicates which of the two buffers is "current" (and thus
923
+ //! contains the input data to be sorted).
924
+ //! * The contents of both buffers within each pair may be altered by the
925
+ //! sorting operation.
926
+ //! * In-place operations are not supported. There must be no overlap between
927
+ //! any of the provided ranges:
928
+ //!
929
+ //! - ``[d_keys.Current(), d_keys.Current() + num_items)``
930
+ //! - ``[d_keys.Alternate(), d_keys.Alternate() + num_items)``
931
+ //! - ``[d_values.Current(), d_values.Current() + num_items)``
932
+ //! - ``[d_values.Alternate(), d_values.Alternate() + num_items)``
933
+ //!
934
+ //! * Upon completion, the sorting operation will update the "current"
935
+ //! indicator within each DoubleBuffer wrapper to reference which of the two
936
+ //! buffers now contains the sorted output sequence (a function of the
937
+ //! number of key bits specified and the targeted device architecture).
938
+ //! * An optional bit subrange ``[begin_bit, end_bit)`` of differentiating key
939
+ //! bits can be specified. This can reduce overall sorting overhead and
940
+ //! yield a corresponding performance improvement.
941
+ //! * @devicestorageP
942
+ //! * @devicestorage
943
+ //!
944
+ //! Snippet
945
+ //! ==========================================================================
946
+ //!
947
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
948
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
949
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
950
+ //! tuple of references to relevant members of the key.
951
+ //!
952
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
953
+ //! :language: c++
954
+ //! :dedent:
955
+ //! :start-after: example-begin custom-type
956
+ //! :end-before: example-end custom-type
957
+ //!
958
+ //! The following snippet shows how to sort an array of ``custom_t`` objects
959
+ //! using ``cub::DeviceRadixSort::SortPairs``:
960
+ //!
961
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
962
+ //! :language: c++
963
+ //! :dedent:
964
+ //! :start-after: example-begin pairs-bits-db
965
+ //! :end-before: example-end pairs-bits-db
966
+ //!
967
+ //! @endrst
968
+ //!
969
+ //! @tparam KeyT
970
+ //! **[inferred]** KeyT type
971
+ //!
972
+ //! @tparam ValueT
973
+ //! **[inferred]** ValueT type
974
+ //!
975
+ //! @tparam NumItemsT
976
+ //! **[inferred]** Type of num_items
977
+ //!
978
+ //! @tparam DecomposerT
979
+ //! **[inferred]** Type of a callable object responsible for decomposing a
980
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
981
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
982
+ //! The leftmost element of the tuple is considered the most significant.
983
+ //! The call operator must not modify members of the key.
984
+ //!
985
+ //! @param[in] d_temp_storage
986
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
987
+ //! required allocation size is written to `temp_storage_bytes` and no work
988
+ //! is done.
989
+ //!
990
+ //! @param[in,out] temp_storage_bytes
991
+ //! Reference to size in bytes of `d_temp_storage` allocation
992
+ //!
993
+ //! @param[in,out] d_keys
994
+ //! Reference to the double-buffer of keys whose "current" device-accessible
995
+ //! buffer contains the unsorted input keys and, upon return, is updated to
996
+ //! point to the sorted output keys
997
+ //!
998
+ //! @param[in,out] d_values
999
+ //! Double-buffer of values whose "current" device-accessible buffer
1000
+ //! contains the unsorted input values and, upon return, is updated to point
1001
+ //! to the sorted output values
1002
+ //!
1003
+ //! @param[in] num_items
1004
+ //! Number of items to sort
1005
+ //!
1006
+ //! @param[in] decomposer
1007
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
1008
+ //! references to its constituent arithmetic types. The leftmost element of
1009
+ //! the tuple is considered the most significant. The call operator must not
1010
+ //! modify members of the key.
1011
+ //!
1012
+ //! @param[in] begin_bit
1013
+ //! **[optional]** The least-significant bit index (inclusive) needed for
1014
+ //! key comparison
1015
+ //!
1016
+ //! @param[in] end_bit
1017
+ //! **[optional]** The most-significant bit index (exclusive) needed for key
1018
+ //! comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`)
1019
+ //!
1020
+ //! @param[in] stream
1021
+ //! **[optional]** CUDA stream to launch kernels within.
1022
+ //! Default is stream<sub>0</sub>.
1023
+ template <typename KeyT, typename ValueT, typename NumItemsT, typename DecomposerT> // ascending key-value sort of decomposer-described keys over [begin_bit, end_bit) on caller-provided DoubleBuffers
1024
+ CUB_RUNTIME_FUNCTION static //
1025
+ ::cuda::std::enable_if_t< // this overload only participates when the condition below holds; its return type is then cudaError_t
1026
+ !::cuda::std::is_convertible_v<DecomposerT, int>, // NOTE(review): presumably keeps an integral argument here binding to `begin_bit` of the non-decomposer DoubleBuffer overload - confirm
1027
+ cudaError_t>
1028
+ SortPairs(void* d_temp_storage,
1029
+ size_t& temp_storage_bytes,
1030
+ DoubleBuffer<KeyT>& d_keys,
1031
+ DoubleBuffer<ValueT>& d_values,
1032
+ NumItemsT num_items,
1033
+ DecomposerT decomposer,
1034
+ int begin_bit,
1035
+ int end_bit,
1036
+ cudaStream_t stream = 0)
1037
+ {
1038
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); // NOTE(review): presumably opens a profiling range only when d_temp_storage is non-null - confirm against the macro definition
1039
+
1040
+ // Unsigned integer type used for global offsets, chosen from the caller's NumItemsT.
1041
+ using offset_t = detail::choose_offset_t<NumItemsT>;
1042
+ using decomposer_check_t = detail::radix::decomposer_check_t<KeyT, DecomposerT>; // compile-time check of DecomposerT against KeyT
1043
+
1044
+ static_assert(decomposer_check_t::value, // reject an unsuitable decomposer at compile time with a readable message
1045
+ "DecomposerT must be a callable object returning a tuple of references to "
1046
+ "arithmetic types");
1047
+
1048
+ constexpr bool is_overwrite_okay = true; // the caller handed over both buffers of each pair, so either may be written
1049
+
1050
+ return DeviceRadixSort::custom_radix_sort<SortOrder::Ascending>( // forward to the shared custom-key implementation in ascending order
1051
+ decomposer_check_t{}, // check type passed as a tag object; NOTE(review): presumably used for overload selection - confirm in custom_radix_sort
1052
+ d_temp_storage,
1053
+ temp_storage_bytes,
1054
+ is_overwrite_okay,
1055
+ d_keys,
1056
+ d_values,
1057
+ static_cast<offset_t>(num_items), // convert the caller's count to the chosen offset type explicitly
1058
+ decomposer,
1059
+ begin_bit,
1060
+ end_bit,
1061
+ stream);
1062
+ }
1063
+
1064
+ //! @brief Sorts key-value pairs into descending order.
1065
+ //! (`~2N` auxiliary storage required).
1066
+ //!
1067
+ //! @par
1068
+ //! - The contents of the input data are not altered by the sorting operation.
1069
+ //! - Pointers to contiguous memory must be used; iterators are not currently
1070
+ //! supported.
1071
+ //! - In-place operations are not supported. There must be no overlap between
1072
+ //! any of the provided ranges:
1073
+ //! - `[d_keys_in, d_keys_in + num_items)`
1074
+ //! - `[d_keys_out, d_keys_out + num_items)`
1075
+ //! - `[d_values_in, d_values_in + num_items)`
1076
+ //! - `[d_values_out, d_values_out + num_items)`
1077
+ //! - An optional bit subrange `[begin_bit, end_bit)` of differentiating key
1078
+ //! bits can be specified. This can reduce overall sorting overhead and
1079
+ //! yield a corresponding performance improvement.
1080
+ //! - @devicestorageNP For sorting using only `O(P)` temporary storage, see
1081
+ //! the sorting interface using DoubleBuffer wrappers below.
1082
+ //! - @devicestorage
1083
+ //!
1084
+ //! @par Performance
1085
+ //! Performance is similar to DeviceRadixSort::SortPairs.
1086
+ //!
1087
+ //! @par Snippet
1088
+ //! The code snippet below illustrates the sorting of a device vector of `int`
1089
+ //! keys with associated vector of `int` values.
1090
+ //! @par
1091
+ //! @code
1092
+ //! #include <cub/cub.cuh>
1093
+ //! // or equivalently <cub/device/device_radix_sort.cuh>
1094
+ //!
1095
+ //! // Declare, allocate, and initialize device-accessible pointers
1096
+ //! // for sorting data
1097
+ //! int num_items; // e.g., 7
1098
+ //! int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
1099
+ //! int *d_keys_out; // e.g., [ ... ]
1100
+ //! int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6]
1101
+ //! int *d_values_out; // e.g., [ ... ]
1102
+ //! ...
1103
+ //!
1104
+ //! // Determine temporary device storage requirements
1105
+ //! void *d_temp_storage = nullptr;
1106
+ //! size_t temp_storage_bytes = 0;
1107
+ //! cub::DeviceRadixSort::SortPairsDescending(
1108
+ //! d_temp_storage, temp_storage_bytes,
1109
+ //! d_keys_in, d_keys_out, d_values_in, d_values_out, num_items);
1110
+ //!
1111
+ //! // Allocate temporary storage
1112
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
1113
+ //!
1114
+ //! // Run sorting operation
1115
+ //! cub::DeviceRadixSort::SortPairsDescending(
1116
+ //! d_temp_storage, temp_storage_bytes,
1117
+ //! d_keys_in, d_keys_out, d_values_in, d_values_out, num_items);
1118
+ //!
1119
+ //! // d_keys_out <-- [9, 8, 7, 6, 5, 3, 0]
1120
+ //! // d_values_out <-- [6, 0, 2, 1, 3, 4, 5]
1121
+ //! @endcode
1122
+ //!
1123
+ //! @tparam KeyT
1124
+ //! **[inferred]** KeyT type
1125
+ //!
1126
+ //! @tparam ValueT
1127
+ //! **[inferred]** ValueT type
1128
+ //!
1129
+ //! @tparam NumItemsT
1130
+ //! **[inferred]** Type of num_items
1131
+ //!
1132
+ //! @param[in] d_temp_storage
1133
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
1134
+ //! required allocation size is written to `temp_storage_bytes` and no work
1135
+ //! is done.
1136
+ //!
1137
+ //! @param[in,out] temp_storage_bytes
1138
+ //! Reference to size in bytes of `d_temp_storage` allocation
1139
+ //!
1140
+ //! @param[in] d_keys_in
1141
+ //! Pointer to the input sequence of key data to sort
1142
+ //!
1143
+ //! @param[out] d_keys_out
1144
+ //! Pointer to the sorted output sequence of key data
1145
+ //!
1146
+ //! @param[in] d_values_in
1147
+ //! Pointer to the corresponding input sequence of associated value items
1148
+ //!
1149
+ //! @param[out] d_values_out
1150
+ //! Pointer to the correspondingly-reordered output sequence of associated
1151
+ //! value items
1152
+ //!
1153
+ //! @param[in] num_items
1154
+ //! Number of items to sort
1155
+ //!
1156
+ //! @param[in] begin_bit
1157
+ //! **[optional]** The least-significant bit index (inclusive) needed for
1158
+ //! key comparison
1159
+ //!
1160
+ //! @param[in] end_bit
1161
+ //! **[optional]** The most-significant bit index (exclusive) needed for key
1162
+ //! comparison (e.g., `sizeof(unsigned int) * 8`)
1163
+ //!
1164
+ //! @param[in] stream
1165
+ //! **[optional]** CUDA stream to launch kernels within.
1166
+ //! Default is stream<sub>0</sub>.
1167
+ template <typename KeyT, typename ValueT, typename NumItemsT> // descending key-value sort from const input arrays into separate output arrays
1168
+ CUB_RUNTIME_FUNCTION static cudaError_t SortPairsDescending(
1169
+ void* d_temp_storage,
1170
+ size_t& temp_storage_bytes,
1171
+ const KeyT* d_keys_in,
1172
+ KeyT* d_keys_out,
1173
+ const ValueT* d_values_in,
1174
+ ValueT* d_values_out,
1175
+ NumItemsT num_items,
1176
+ int begin_bit = 0,
1177
+ int end_bit = sizeof(KeyT) * 8, // default bit range spans every bit of KeyT
1178
+ cudaStream_t stream = 0)
1179
+ {
1180
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); // NOTE(review): presumably opens a profiling range only when d_temp_storage is non-null - confirm against the macro definition
1181
+
1182
+ // Unsigned integer type used for global offsets, chosen from the caller's NumItemsT.
1183
+ using OffsetT = detail::choose_offset_t<NumItemsT>;
1184
+
1185
+ // We cast away const-ness, but will *not* write to these arrays.
1186
+ // `DispatchRadixSort::Dispatch` will allocate temporary storage and
1187
+ // create a new double-buffer internally when the `is_overwrite_okay` flag
1188
+ // is not set.
1189
+ constexpr bool is_overwrite_okay = false; // input arrays must stay intact
1190
+ DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out); // wrap the (input, output) key pointers in a DoubleBuffer
1191
+ DoubleBuffer<ValueT> d_values(const_cast<ValueT*>(d_values_in), d_values_out); // same wrapping for the value pointers
1192
+
1193
+ return DispatchRadixSort<SortOrder::Descending, KeyT, ValueT, OffsetT>::Dispatch( // forward to the descending dispatch; num_items is passed through without an explicit cast here
1194
+ d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, begin_bit, end_bit, is_overwrite_okay, stream);
1195
+ }
1196
+
1197
+ //! @rst
1198
+ //! Sorts key-value pairs into descending order using :math:`\approx 2N` auxiliary storage.
1199
+ //!
1200
+ //! * The contents of the input data are not altered by the sorting operation.
1201
+ //! * Pointers to contiguous memory must be used; iterators are not currently
1202
+ //! supported.
1203
+ //! * In-place operations are not supported. There must be no overlap between
1204
+ //! any of the provided ranges:
1205
+ //!
1206
+ //! * ``[d_keys_in, d_keys_in + num_items)``
1207
+ //! * ``[d_keys_out, d_keys_out + num_items)``
1208
+ //! * ``[d_values_in, d_values_in + num_items)``
1209
+ //! * ``[d_values_out, d_values_out + num_items)``
1210
+ //!
1211
+ //! * A bit subrange ``[begin_bit, end_bit)`` is provided to specify
1212
+ //! differentiating key bits. This can reduce overall sorting overhead and
1213
+ //! yield a corresponding performance improvement.
1214
+ //! * @devicestorageNP For sorting using only :math:`O(P)` temporary storage, see
1215
+ //! the sorting interface using DoubleBuffer wrappers below.
1216
+ //! * @devicestorage
1217
+ //!
1218
+ //! Snippet
1219
+ //! ==========================================================================
1220
+ //!
1221
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
1222
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
1223
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
1224
+ //! tuple of references to relevant members of the key.
1225
+ //!
1226
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
1227
+ //! :language: c++
1228
+ //! :dedent:
1229
+ //! :start-after: example-begin custom-type
1230
+ //! :end-before: example-end custom-type
1231
+ //!
1232
+ //! The following snippet shows how to sort an array of ``custom_t`` objects
1233
+ //! using ``cub::DeviceRadixSort::SortPairsDescending``:
1234
+ //!
1235
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
1236
+ //! :language: c++
1237
+ //! :dedent:
1238
+ //! :start-after: example-begin pairs-descending-bits
1239
+ //! :end-before: example-end pairs-descending-bits
1240
+ //!
1241
+ //! @endrst
1242
+ //!
1243
+ //! @tparam KeyT
1244
+ //! **[inferred]** KeyT type
1245
+ //!
1246
+ //! @tparam ValueT
1247
+ //! **[inferred]** ValueT type
1248
+ //!
1249
+ //! @tparam NumItemsT
1250
+ //! **[inferred]** Type of num_items
1251
+ //!
1252
+ //! @tparam DecomposerT
1253
+ //! **[inferred]** Type of a callable object responsible for decomposing a
1254
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
1255
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
1256
+ //! The leftmost element of the tuple is considered the most significant.
1257
+ //! The call operator must not modify members of the key.
1258
+ //!
1259
+ //! @param[in] d_temp_storage
1260
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
1261
+ //! required allocation size is written to `temp_storage_bytes` and no work
1262
+ //! is done.
1263
+ //!
1264
+ //! @param[in,out] temp_storage_bytes
1265
+ //! Reference to size in bytes of `d_temp_storage` allocation
1266
+ //!
1267
+ //! @param[in] d_keys_in
1268
+ //! Pointer to the input sequence of key data to sort
1269
+ //!
1270
+ //! @param[out] d_keys_out
1271
+ //! Pointer to the sorted output sequence of key data
1272
+ //!
1273
+ //! @param[in] d_values_in
1274
+ //! Pointer to the corresponding input sequence of associated value items
1275
+ //!
1276
+ //! @param[out] d_values_out
1277
+ //! Pointer to the correspondingly-reordered output sequence of associated
1278
+ //! value items
1279
+ //!
1280
+ //! @param[in] num_items
1281
+ //! Number of items to sort
1282
+ //!
1283
+ //! @param[in] decomposer
1284
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
1285
+ //! references to its constituent arithmetic types. The leftmost element of
1286
+ //! the tuple is considered the most significant. The call operator must not
1287
+ //! modify members of the key.
1288
+ //!
1289
+ //! @param[in] begin_bit
1290
+ //! **[optional]** The least-significant bit index (inclusive) needed for
1291
+ //! key comparison
1292
+ //!
1293
+ //! @param[in] end_bit
1294
+ //! **[optional]** The most-significant bit index (exclusive) needed for key
1295
+ //! comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`)
1296
+ //!
1297
+ //! @param[in] stream
1298
+ //! **[optional]** CUDA stream to launch kernels within.
1299
+ //! Default is stream<sub>0</sub>.
1300
+ template <typename KeyT, typename ValueT, typename NumItemsT, typename DecomposerT>
1301
+ CUB_RUNTIME_FUNCTION static //
1302
+ ::cuda::std::enable_if_t< //
1303
+ !::cuda::std::is_convertible_v<DecomposerT, int>, //
1304
+ cudaError_t>
1305
+ SortPairsDescending(
1306
+ void* d_temp_storage,
1307
+ size_t& temp_storage_bytes,
1308
+ const KeyT* d_keys_in,
1309
+ KeyT* d_keys_out,
1310
+ const ValueT* d_values_in,
1311
+ ValueT* d_values_out,
1312
+ NumItemsT num_items,
1313
+ DecomposerT decomposer,
1314
+ int begin_bit,
1315
+ int end_bit,
1316
+ cudaStream_t stream = 0)
1317
+ {
1318
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());
1319
+
1320
+ // unsigned integer type for global offsets
1321
+ using offset_t = detail::choose_offset_t<NumItemsT>;
1322
+ using decomposer_check_t = detail::radix::decomposer_check_t<KeyT, DecomposerT>;
1323
+
1324
+ static_assert(decomposer_check_t::value,
1325
+ "DecomposerT must be a callable object returning a tuple of references to "
1326
+ "arithmetic types");
1327
+
1328
+ // We cast away const-ness, but will *not* write to these arrays.
1329
+ // `DispatchRadixSort::Dispatch` will allocate temporary storage and
1330
+ // create a new double-buffer internally when the `is_overwrite_ok` flag
1331
+ // is not set.
1332
+ constexpr bool is_overwrite_okay = false;
1333
+ DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
1334
+ DoubleBuffer<ValueT> d_values(const_cast<ValueT*>(d_values_in), d_values_out);
1335
+
1336
+ return DeviceRadixSort::custom_radix_sort<SortOrder::Descending>(
1337
+ decomposer_check_t{},
1338
+ d_temp_storage,
1339
+ temp_storage_bytes,
1340
+ is_overwrite_okay,
1341
+ d_keys,
1342
+ d_values,
1343
+ static_cast<offset_t>(num_items),
1344
+ decomposer,
1345
+ begin_bit,
1346
+ end_bit,
1347
+ stream);
1348
+ }
1349
+
1350
+ //! @rst
1351
+ //! Sorts key-value pairs into descending order using :math:`\approx 2N` auxiliary storage.
1352
+ //!
1353
+ //! * The contents of the input data are not altered by the sorting operation.
1354
+ //! * Pointers to contiguous memory must be used; iterators are not currently
1355
+ //! supported.
1356
+ //! * In-place operations are not supported. There must be no overlap between
1357
+ //! any of the provided ranges:
1358
+ //!
1359
+ //! * ``[d_keys_in, d_keys_in + num_items)``
1360
+ //! * ``[d_keys_out, d_keys_out + num_items)``
1361
+ //! * ``[d_values_in, d_values_in + num_items)``
1362
+ //! * ``[d_values_out, d_values_out + num_items)``
1363
+ //!
1364
+ //! * @devicestorageNP For sorting using only :math:`O(P)` temporary storage, see
1365
+ //! the sorting interface using DoubleBuffer wrappers below.
1366
+ //! * @devicestorage
1367
+ //!
1368
+ //! Snippet
1369
+ //! ==========================================================================
1370
+ //!
1371
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
1372
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
1373
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
1374
+ //! tuple of references to relevant members of the key.
1375
+ //!
1376
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
1377
+ //! :language: c++
1378
+ //! :dedent:
1379
+ //! :start-after: example-begin custom-type
1380
+ //! :end-before: example-end custom-type
1381
+ //!
1382
+ //! The following snippet shows how to sort an array of ``custom_t`` objects
1383
+ //! using ``cub::DeviceRadixSort::SortPairsDescending``:
1384
+ //!
1385
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
1386
+ //! :language: c++
1387
+ //! :dedent:
1388
+ //! :start-after: example-begin pairs-descending
1389
+ //! :end-before: example-end pairs-descending
1390
+ //!
1391
+ //! @endrst
1392
+ //!
1393
+ //! @tparam KeyT
1394
+ //! **[inferred]** KeyT type
1395
+ //!
1396
+ //! @tparam ValueT
1397
+ //! **[inferred]** ValueT type
1398
+ //!
1399
+ //! @tparam NumItemsT
1400
+ //! **[inferred]** Type of num_items
1401
+ //!
1402
+ //! @tparam DecomposerT
1403
+ //! **[inferred]** Type of a callable object responsible for decomposing a
1404
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
1405
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
1406
+ //! The leftmost element of the tuple is considered the most significant.
1407
+ //! The call operator must not modify members of the key.
1408
+ //!
1409
+ //! @param[in] d_temp_storage
1410
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
1411
+ //! required allocation size is written to `temp_storage_bytes` and no work
1412
+ //! is done.
1413
+ //!
1414
+ //! @param[in,out] temp_storage_bytes
1415
+ //! Reference to size in bytes of `d_temp_storage` allocation
1416
+ //!
1417
+ //! @param[in] d_keys_in
1418
+ //! Pointer to the input sequence of key data to sort
1419
+ //!
1420
+ //! @param[out] d_keys_out
1421
+ //! Pointer to the sorted output sequence of key data
1422
+ //!
1423
+ //! @param[in] d_values_in
1424
+ //! Pointer to the corresponding input sequence of associated value items
1425
+ //!
1426
+ //! @param[out] d_values_out
1427
+ //! Pointer to the correspondingly-reordered output sequence of associated
1428
+ //! value items
1429
+ //!
1430
+ //! @param[in] num_items
1431
+ //! Number of items to sort
1432
+ //!
1433
+ //! @param[in] decomposer
1434
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
1435
+ //! references to its constituent arithmetic types. The leftmost element of
1436
+ //! the tuple is considered the most significant. The call operator must not
1437
+ //! modify members of the key.
1438
+ //!
1439
+ //! @param[in] stream
1440
+ //! **[optional]** CUDA stream to launch kernels within.
1441
+ //! Default is stream<sub>0</sub>.
1442
+ template <typename KeyT, typename ValueT, typename NumItemsT, typename DecomposerT>
1443
+ CUB_RUNTIME_FUNCTION static //
1444
+ ::cuda::std::enable_if_t< //
1445
+ !::cuda::std::is_convertible_v<DecomposerT, int>, //
1446
+ cudaError_t>
1447
+ SortPairsDescending(
1448
+ void* d_temp_storage,
1449
+ size_t& temp_storage_bytes,
1450
+ const KeyT* d_keys_in,
1451
+ KeyT* d_keys_out,
1452
+ const ValueT* d_values_in,
1453
+ ValueT* d_values_out,
1454
+ NumItemsT num_items,
1455
+ DecomposerT decomposer,
1456
+ cudaStream_t stream = 0)
1457
+ {
1458
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());
1459
+
1460
+ // unsigned integer type for global offsets
1461
+ using offset_t = detail::choose_offset_t<NumItemsT>;
1462
+ using decomposer_check_t = detail::radix::decomposer_check_t<KeyT, DecomposerT>;
1463
+
1464
+ static_assert(decomposer_check_t::value,
1465
+ "DecomposerT must be a callable object returning a tuple of references to "
1466
+ "arithmetic types");
1467
+
1468
+ // We cast away const-ness, but will *not* write to these arrays.
1469
+ // `DispatchRadixSort::Dispatch` will allocate temporary storage and
1470
+ // create a new double-buffer internally when the `is_overwrite_ok` flag
1471
+ // is not set.
1472
+ constexpr bool is_overwrite_okay = false;
1473
+ DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
1474
+ DoubleBuffer<ValueT> d_values(const_cast<ValueT*>(d_values_in), d_values_out);
1475
+
1476
+ return DeviceRadixSort::custom_radix_sort<SortOrder::Descending>(
1477
+ decomposer_check_t{},
1478
+ d_temp_storage,
1479
+ temp_storage_bytes,
1480
+ is_overwrite_okay,
1481
+ d_keys,
1482
+ d_values,
1483
+ static_cast<offset_t>(num_items),
1484
+ decomposer,
1485
+ stream);
1486
+ }
1487
+
1488
+ //! @brief Sorts key-value pairs into descending order.
1489
+ //! (`~N` auxiliary storage required).
1490
+ //!
1491
+ //! @par
1492
+ //! - The sorting operation is given a pair of key buffers and a corresponding
1493
+ //! pair of associated value buffers. Each pair is managed by a DoubleBuffer
1494
+ //! structure that indicates which of the two buffers is "current" (and thus
1495
+ //! contains the input data to be sorted).
1496
+ //! - The contents of both buffers within each pair may be altered by the
1497
+ //! sorting operation.
1498
+ //! - In-place operations are not supported. There must be no overlap between
1499
+ //! any of the provided ranges:
1500
+ //! - `[d_keys.Current(), d_keys.Current() + num_items)`
1501
+ //! - `[d_keys.Alternate(), d_keys.Alternate() + num_items)`
1502
+ //! - `[d_values.Current(), d_values.Current() + num_items)`
1503
+ //! - `[d_values.Alternate(), d_values.Alternate() + num_items)`
1504
+ //! - Upon completion, the sorting operation will update the "current"
1505
+ //! indicator within each DoubleBuffer wrapper to reference which of the two
1506
+ //! buffers now contains the sorted output sequence (a function of the number
1507
+ //! of key bits specified and the targeted device architecture).
1508
+ //! - An optional bit subrange `[begin_bit, end_bit)` of differentiating key
1509
+ //! bits can be specified. This can reduce overall sorting overhead and
1510
+ //! yield a corresponding performance improvement.
1511
+ //! - @devicestorageP
1512
+ //! - @devicestorage
1513
+ //!
1514
+ //! @par Performance
1515
+ //! Performance is similar to DeviceRadixSort::SortPairs.
1516
+ //!
1517
+ //! @par Snippet
1518
+ //! The code snippet below illustrates the sorting of a device vector of `int`
1519
+ //! keys with associated vector of `int` values.
1520
+ //! @par
1521
+ //! @code
1522
+ //! #include <cub/cub.cuh>
1523
+ //! // or equivalently <cub/device/device_radix_sort.cuh>
1524
+ //!
1525
+ //! // Declare, allocate, and initialize device-accessible pointers
1526
+ //! // for sorting data
1527
+ //! int num_items; // e.g., 7
1528
+ //! int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9]
1529
+ //! int *d_key_alt_buf; // e.g., [ ... ]
1530
+ //! int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6]
1531
+ //! int *d_value_alt_buf; // e.g., [ ... ]
1532
+ //! ...
1533
+ //!
1534
+ //! // Create a set of DoubleBuffers to wrap pairs of device pointers
1535
+ //! cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
1536
+ //! cub::DoubleBuffer<int> d_values(d_value_buf, d_value_alt_buf);
1537
+ //!
1538
+ //! // Determine temporary device storage requirements
1539
+ //! void *d_temp_storage = nullptr;
1540
+ //! size_t temp_storage_bytes = 0;
1541
+ //! cub::DeviceRadixSort::SortPairsDescending(
1542
+ //! d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items);
1543
+ //!
1544
+ //! // Allocate temporary storage
1545
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
1546
+ //!
1547
+ //! // Run sorting operation
1548
+ //! cub::DeviceRadixSort::SortPairsDescending(
1549
+ //! d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items);
1550
+ //!
1551
+ //! // d_keys.Current() <-- [9, 8, 7, 6, 5, 3, 0]
1552
+ //! // d_values.Current() <-- [6, 0, 2, 1, 3, 4, 5]
1553
+ //! @endcode
1554
+ //!
1555
+ //! @tparam KeyT
1556
+ //! **[inferred]** KeyT type
1557
+ //!
1558
+ //! @tparam ValueT
1559
+ //! **[inferred]** ValueT type
1560
+ //!
1561
+ //! @tparam NumItemsT
1562
+ //! **[inferred]** Type of num_items
1563
+ //!
1564
+ //! @param[in] d_temp_storage
1565
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
1566
+ //! required allocation size is written to `temp_storage_bytes` and no work
1567
+ //! is done.
1568
+ //!
1569
+ //! @param[in,out] temp_storage_bytes
1570
+ //! Reference to size in bytes of `d_temp_storage` allocation
1571
+ //!
1572
+ //! @param[in,out] d_keys
1573
+ //! Reference to the double-buffer of keys whose "current" device-accessible
1574
+ //! buffer contains the unsorted input keys and, upon return, is updated to
1575
+ //! point to the sorted output keys
1576
+ //!
1577
+ //! @param[in,out] d_values
1578
+ //! Double-buffer of values whose "current" device-accessible buffer
1579
+ //! contains the unsorted input values and, upon return, is updated to point
1580
+ //! to the sorted output values
1581
+ //!
1582
+ //! @param[in] num_items
1583
+ //! Number of items to sort
1584
+ //!
1585
+ //! @param[in] begin_bit
1586
+ //! **[optional]** The least-significant bit index (inclusive) needed for
1587
+ //! key comparison
1588
+ //!
1589
+ //! @param[in] end_bit
1590
+ //! **[optional]** The most-significant bit index (exclusive) needed for key
1591
+ //! comparison (e.g., `sizeof(unsigned int) * 8`)
1592
+ //!
1593
+ //! @param[in] stream
1594
+ //! **[optional]** CUDA stream to launch kernels within.
1595
+ //! Default is stream<sub>0</sub>.
1596
+ template <typename KeyT, typename ValueT, typename NumItemsT>
1597
+ CUB_RUNTIME_FUNCTION static cudaError_t SortPairsDescending(
1598
+ void* d_temp_storage,
1599
+ size_t& temp_storage_bytes,
1600
+ DoubleBuffer<KeyT>& d_keys,
1601
+ DoubleBuffer<ValueT>& d_values,
1602
+ NumItemsT num_items,
1603
+ int begin_bit = 0,
1604
+ int end_bit = sizeof(KeyT) * 8,
1605
+ cudaStream_t stream = 0)
1606
+ {
1607
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());
1608
+
1609
+ // Unsigned integer type for global offsets.
1610
+ using OffsetT = detail::choose_offset_t<NumItemsT>;
1611
+
1612
+ constexpr bool is_overwrite_okay = true;
1613
+
1614
+ return DispatchRadixSort<SortOrder::Descending, KeyT, ValueT, OffsetT>::Dispatch(
1615
+ d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, begin_bit, end_bit, is_overwrite_okay, stream);
1616
+ }
1617
+
1618
+ //! @rst
1619
+ //! Sorts key-value pairs into descending order using :math:`\approx N` auxiliary storage.
1620
+ //!
1621
+ //! * The sorting operation is given a pair of key buffers and a corresponding
1622
+ //! pair of associated value buffers. Each pair is managed by a DoubleBuffer
1623
+ //! structure that indicates which of the two buffers is "current" (and thus
1624
+ //! contains the input data to be sorted).
1625
+ //! * The contents of both buffers within each pair may be altered by the
1626
+ //! sorting operation.
1627
+ //! * In-place operations are not supported. There must be no overlap between
1628
+ //! any of the provided ranges:
1629
+ //!
1630
+ //! - ``[d_keys.Current(), d_keys.Current() + num_items)``
1631
+ //! - ``[d_keys.Alternate(), d_keys.Alternate() + num_items)``
1632
+ //! - ``[d_values.Current(), d_values.Current() + num_items)``
1633
+ //! - ``[d_values.Alternate(), d_values.Alternate() + num_items)``
1634
+ //!
1635
+ //! * Upon completion, the sorting operation will update the "current"
1636
+ //! indicator within each DoubleBuffer wrapper to reference which of the two
1637
+ //! buffers now contains the sorted output sequence (a function of the
1638
+ //! number of key bits specified and the targeted device architecture).
1639
+ //! * @devicestorageP
1640
+ //! * @devicestorage
1641
+ //!
1642
+ //! Snippet
1643
+ //! ==========================================================================
1644
+ //!
1645
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
1646
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
1647
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
1648
+ //! tuple of references to relevant members of the key.
1649
+ //!
1650
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
1651
+ //! :language: c++
1652
+ //! :dedent:
1653
+ //! :start-after: example-begin custom-type
1654
+ //! :end-before: example-end custom-type
1655
+ //!
1656
+ //! The following snippet shows how to sort an array of ``custom_t`` objects
1657
+ //! using ``cub::DeviceRadixSort::SortPairsDescending``:
1658
+ //!
1659
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
1660
+ //! :language: c++
1661
+ //! :dedent:
1662
+ //! :start-after: example-begin pairs-descending-db
1663
+ //! :end-before: example-end pairs-descending-db
1664
+ //!
1665
+ //! @endrst
1666
+ //!
1667
+ //! @tparam KeyT
1668
+ //! **[inferred]** KeyT type
1669
+ //!
1670
+ //! @tparam ValueT
1671
+ //! **[inferred]** ValueT type
1672
+ //!
1673
+ //! @tparam NumItemsT
1674
+ //! **[inferred]** Type of num_items
1675
+ //!
1676
+ //! @tparam DecomposerT
1677
+ //! **[inferred]** Type of a callable object responsible for decomposing a
1678
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
1679
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
1680
+ //! The leftmost element of the tuple is considered the most significant.
1681
+ //! The call operator must not modify members of the key.
1682
+ //!
1683
+ //! @param[in] d_temp_storage
1684
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
1685
+ //! required allocation size is written to `temp_storage_bytes` and no work
1686
+ //! is done.
1687
+ //!
1688
+ //! @param[in,out] temp_storage_bytes
1689
+ //! Reference to size in bytes of `d_temp_storage` allocation
1690
+ //!
1691
+ //! @param[in,out] d_keys
1692
+ //! Reference to the double-buffer of keys whose "current" device-accessible
1693
+ //! buffer contains the unsorted input keys and, upon return, is updated to
1694
+ //! point to the sorted output keys
1695
+ //!
1696
+ //! @param[in,out] d_values
1697
+ //! Double-buffer of values whose "current" device-accessible buffer
1698
+ //! contains the unsorted input values and, upon return, is updated to point
1699
+ //! to the sorted output values
1700
+ //!
1701
+ //! @param[in] num_items
1702
+ //! Number of items to sort
1703
+ //!
1704
+ //! @param[in] decomposer
1705
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
1706
+ //! references to its constituent arithmetic types. The leftmost element of
1707
+ //! the tuple is considered the most significant. The call operator must not
1708
+ //! modify members of the key.
1709
+ //!
1710
+ //! @param[in] stream
1711
+ //! **[optional]** CUDA stream to launch kernels within.
1712
+ //! Default is stream<sub>0</sub>.
1713
+ template <typename KeyT, typename ValueT, typename NumItemsT, typename DecomposerT>
1714
+ CUB_RUNTIME_FUNCTION static //
1715
+ ::cuda::std::enable_if_t< //
1716
+ !::cuda::std::is_convertible_v<DecomposerT, int>, //
1717
+ cudaError_t>
1718
+ SortPairsDescending(
1719
+ void* d_temp_storage,
1720
+ size_t& temp_storage_bytes,
1721
+ DoubleBuffer<KeyT>& d_keys,
1722
+ DoubleBuffer<ValueT>& d_values,
1723
+ NumItemsT num_items,
1724
+ DecomposerT decomposer,
1725
+ cudaStream_t stream = 0)
1726
+ {
1727
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());
1728
+
1729
+ // unsigned integer type for global offsets
1730
+ using offset_t = detail::choose_offset_t<NumItemsT>;
1731
+ using decomposer_check_t = detail::radix::decomposer_check_t<KeyT, DecomposerT>;
1732
+
1733
+ static_assert(decomposer_check_t::value,
1734
+ "DecomposerT must be a callable object returning a tuple of references to "
1735
+ "arithmetic types");
1736
+
1737
+ constexpr bool is_overwrite_okay = true;
1738
+
1739
+ return DeviceRadixSort::custom_radix_sort<SortOrder::Descending>(
1740
+ decomposer_check_t{},
1741
+ d_temp_storage,
1742
+ temp_storage_bytes,
1743
+ is_overwrite_okay,
1744
+ d_keys,
1745
+ d_values,
1746
+ static_cast<offset_t>(num_items),
1747
+ decomposer,
1748
+ stream);
1749
+ }
1750
+
1751
+ //! @rst
1752
+ //! Sorts key-value pairs into descending order using :math:`\approx N` auxiliary storage.
1753
+ //!
1754
+ //! * The sorting operation is given a pair of key buffers and a corresponding
1755
+ //! pair of associated value buffers. Each pair is managed by a DoubleBuffer
1756
+ //! structure that indicates which of the two buffers is "current" (and thus
1757
+ //! contains the input data to be sorted).
1758
+ //! * The contents of both buffers within each pair may be altered by the
1759
+ //! sorting operation.
1760
+ //! * In-place operations are not supported. There must be no overlap between
1761
+ //! any of the provided ranges:
1762
+ //!
1763
+ //! - ``[d_keys.Current(), d_keys.Current() + num_items)``
1764
+ //! - ``[d_keys.Alternate(), d_keys.Alternate() + num_items)``
1765
+ //! - ``[d_values.Current(), d_values.Current() + num_items)``
1766
+ //! - ``[d_values.Alternate(), d_values.Alternate() + num_items)``
1767
+ //!
1768
+ //! * Upon completion, the sorting operation will update the "current"
1769
+ //! indicator within each DoubleBuffer wrapper to reference which of the two
1770
+ //! buffers now contains the sorted output sequence (a function of the
1771
+ //! number of key bits specified and the targeted device architecture).
1772
+ //! * An optional bit subrange ``[begin_bit, end_bit)`` of differentiating key
1773
+ //! bits can be specified. This can reduce overall sorting overhead and
1774
+ //! yield a corresponding performance improvement.
1775
+ //! * @devicestorageP
1776
+ //! * @devicestorage
1777
+ //!
1778
+ //! Snippet
1779
+ //! ==========================================================================
1780
+ //!
1781
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
1782
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
1783
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
1784
+ //! tuple of references to relevant members of the key.
1785
+ //!
1786
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
1787
+ //! :language: c++
1788
+ //! :dedent:
1789
+ //! :start-after: example-begin custom-type
1790
+ //! :end-before: example-end custom-type
1791
+ //!
1792
+ //! The following snippet shows how to sort an array of ``custom_t`` objects
1793
+ //! using ``cub::DeviceRadixSort::SortPairsDescending``:
1794
+ //!
1795
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
1796
+ //! :language: c++
1797
+ //! :dedent:
1798
+ //! :start-after: example-begin pairs-descending-bits-db
1799
+ //! :end-before: example-end pairs-descending-bits-db
1800
+ //!
1801
+ //! @endrst
1802
+ //!
1803
+ //! @tparam KeyT
1804
+ //! **[inferred]** KeyT type
1805
+ //!
1806
+ //! @tparam ValueT
1807
+ //! **[inferred]** ValueT type
1808
+ //!
1809
+ //! @tparam NumItemsT
1810
+ //! **[inferred]** Type of num_items
1811
+ //!
1812
+ //! @tparam DecomposerT
1813
+ //! **[inferred]** Type of a callable object responsible for decomposing a
1814
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
1815
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
1816
+ //! The leftmost element of the tuple is considered the most significant.
1817
+ //! The call operator must not modify members of the key.
1818
+ //!
1819
+ //! @param[in] d_temp_storage
1820
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
1821
+ //! required allocation size is written to `temp_storage_bytes` and no work
1822
+ //! is done.
1823
+ //!
1824
+ //! @param[in,out] temp_storage_bytes
1825
+ //! Reference to size in bytes of `d_temp_storage` allocation
1826
+ //!
1827
+ //! @param[in,out] d_keys
1828
+ //! Reference to the double-buffer of keys whose "current" device-accessible
1829
+ //! buffer contains the unsorted input keys and, upon return, is updated to
1830
+ //! point to the sorted output keys
1831
+ //!
1832
+ //! @param[in,out] d_values
1833
+ //! Double-buffer of values whose "current" device-accessible buffer
1834
+ //! contains the unsorted input values and, upon return, is updated to point
1835
+ //! to the sorted output values
1836
+ //!
1837
+ //! @param[in] num_items
1838
+ //! Number of items to sort
1839
+ //!
1840
+ //! @param[in] decomposer
1841
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
1842
+ //! references to its constituent arithmetic types. The leftmost element of
1843
+ //! the tuple is considered the most significant. The call operator must not
1844
+ //! modify members of the key.
1845
+ //!
1846
+ //! @param[in] begin_bit
1847
+ //! **[optional]** The least-significant bit index (inclusive) needed for
1848
+ //! key comparison
1849
+ //!
1850
+ //! @param[in] end_bit
1851
+ //! **[optional]** The most-significant bit index (exclusive) needed for key
1852
+ //! comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`)
1853
+ //!
1854
+ //! @param[in] stream
1855
+ //! **[optional]** CUDA stream to launch kernels within.
1856
+ //! Default is stream<sub>0</sub>.
1857
+ template <typename KeyT, typename ValueT, typename NumItemsT, typename DecomposerT>
1858
+ CUB_RUNTIME_FUNCTION static //
1859
+ ::cuda::std::enable_if_t< //
1860
+ !::cuda::std::is_convertible_v<DecomposerT, int>, //
1861
+ cudaError_t>
1862
+ SortPairsDescending(
1863
+ void* d_temp_storage,
1864
+ size_t& temp_storage_bytes,
1865
+ DoubleBuffer<KeyT>& d_keys,
1866
+ DoubleBuffer<ValueT>& d_values,
1867
+ NumItemsT num_items,
1868
+ DecomposerT decomposer,
1869
+ int begin_bit,
1870
+ int end_bit,
1871
+ cudaStream_t stream = 0)
1872
+ {
1873
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());
1874
+
1875
+ // unsigned integer type for global offsets
1876
+ using offset_t = detail::choose_offset_t<NumItemsT>;
1877
+ using decomposer_check_t = detail::radix::decomposer_check_t<KeyT, DecomposerT>;
1878
+
1879
+ static_assert(decomposer_check_t::value,
1880
+ "DecomposerT must be a callable object returning a tuple of references to "
1881
+ "arithmetic types");
1882
+
1883
+ constexpr bool is_overwrite_okay = true;
1884
+
1885
+ return DeviceRadixSort::custom_radix_sort<SortOrder::Descending>(
1886
+ decomposer_check_t{},
1887
+ d_temp_storage,
1888
+ temp_storage_bytes,
1889
+ is_overwrite_okay,
1890
+ d_keys,
1891
+ d_values,
1892
+ static_cast<offset_t>(num_items),
1893
+ decomposer,
1894
+ begin_bit,
1895
+ end_bit,
1896
+ stream);
1897
+ }
1898
+
1899
+ //@} end member group
1900
+ /******************************************************************/ /**
1901
+ * @name Keys-only
1902
+ *********************************************************************/
1903
+ //@{
1904
+
1905
+ //! @brief Sorts keys into ascending order.
1906
+ //! (`~2N` auxiliary storage required)
1907
+ //!
1908
+ //! @par
1909
+ //! - The contents of the input data are not altered by the sorting operation.
1910
+ //! - Pointers to contiguous memory must be used; iterators are not currently
1911
+ //! supported.
1912
+ //! - In-place operations are not supported. There must be no overlap between
1913
+ //! any of the provided ranges:
1914
+ //! - `[d_keys_in, d_keys_in + num_items)`
1915
+ //! - `[d_keys_out, d_keys_out + num_items)`
1916
+ //! - An optional bit subrange `[begin_bit, end_bit)` of differentiating key
1917
+ //! bits can be specified. This can reduce overall sorting overhead and
1918
+ //! yield a corresponding performance improvement.
1919
+ //! - @devicestorageNP For sorting using only `O(P)` temporary storage, see
1920
+ //! the sorting interface using DoubleBuffer wrappers below.
1921
+ //! - @devicestorage
1922
+ //!
1923
+ //! @par Performance
1924
+ //! The following charts illustrate saturated sorting performance across
1925
+ //! different CUDA architectures for uniform-random `uint32` and `uint64`
1926
+ //! keys, respectively.
1927
+ //!
1928
+ //! @image html lsb_radix_sort_int32_keys.png
1929
+ //! @image html lsb_radix_sort_int64_keys.png
1930
+ //!
1931
+ //! @par Snippet
1932
+ //! The code snippet below illustrates the sorting of a device vector of
1933
+ //! `int` keys.
1934
+ //! @par
1935
+ //! @code
1936
+ //! #include <cub/cub.cuh>
1937
+ //! // or equivalently <cub/device/device_radix_sort.cuh>
1938
+ //!
1939
+ //! // Declare, allocate, and initialize device-accessible pointers
1940
+ //! // for sorting data
1941
+ //! int num_items; // e.g., 7
1942
+ //! int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
1943
+ //! int *d_keys_out; // e.g., [ ... ]
1944
+ //! ...
1945
+ //!
1946
+ //! // Determine temporary device storage requirements
1947
+ //! void *d_temp_storage = nullptr;
1948
+ //! size_t temp_storage_bytes = 0;
1949
+ //! cub::DeviceRadixSort::SortKeys(
1950
+ //! d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items);
1951
+ //!
1952
+ //! // Allocate temporary storage
1953
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
1954
+ //!
1955
+ //! // Run sorting operation
1956
+ //! cub::DeviceRadixSort::SortKeys(
1957
+ //! d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items);
1958
+ //!
1959
+ //! // d_keys_out <-- [0, 3, 5, 6, 7, 8, 9]
1960
+ //! @endcode
1961
+ //!
1962
+ //! @tparam KeyT
1963
+ //! **[inferred]** KeyT type
1964
+ //!
1965
+ //! @tparam NumItemsT
1966
+ //! **[inferred]** Type of num_items
1970
+ //!
1971
+ //! @param[in] d_temp_storage
1972
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
1973
+ //! required allocation size is written to `temp_storage_bytes` and no work
1974
+ //! is done.
1975
+ //!
1976
+ //! @param[in,out] temp_storage_bytes
1977
+ //! Reference to size in bytes of `d_temp_storage` allocation
1978
+ //!
1979
+ //! @param[in] d_keys_in
1980
+ //! Pointer to the input sequence of key data to sort
1981
+ //!
1982
+ //! @param[out] d_keys_out
1983
+ //! Pointer to the sorted output sequence of key data
1984
+ //!
1985
+ //! @param[in] num_items
1986
+ //! Number of items to sort
1987
+ //!
1988
+ //! @param[in] begin_bit
1989
+ //! **[optional]** The least-significant bit index (inclusive) needed for
1990
+ //! key comparison
1991
+ //!
1992
+ //! @param[in] end_bit
1993
+ //! **[optional]** The most-significant bit index (exclusive) needed for key
1994
+ //! comparison (e.g., `sizeof(unsigned int) * 8`)
1995
+ //!
1996
+ //! @param[in] stream
1997
+ //! **[optional]** CUDA stream to launch kernels within.
1998
+ //! Default is stream<sub>0</sub>.
1999
+ template <typename KeyT, typename NumItemsT>
2000
+ CUB_RUNTIME_FUNCTION static cudaError_t SortKeys(
2001
+ void* d_temp_storage,
2002
+ size_t& temp_storage_bytes,
2003
+ const KeyT* d_keys_in,
2004
+ KeyT* d_keys_out,
2005
+ NumItemsT num_items,
2006
+ int begin_bit = 0,
2007
+ int end_bit = sizeof(KeyT) * 8,
2008
+ cudaStream_t stream = 0)
2009
+ {
2010
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());
2011
+
2012
+ // Unsigned integer type for global offsets, chosen from NumItemsT.
2013
+ using OffsetT = detail::choose_offset_t<NumItemsT>;
2014
+
2015
+ // We cast away const-ness of `d_keys_in` so it fits in a DoubleBuffer, but will *not* write to it.
2016
+ // `DispatchRadixSort::Dispatch` will allocate temporary storage and
2017
+ // create a new double-buffer internally when the `is_overwrite_okay` flag
2018
+ // is not set (it is false here, so the input keys are left untouched).
2019
+ constexpr bool is_overwrite_okay = false;
2020
+ DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
2021
+ // Keys-only sort: there are no values, so a NullType double-buffer is passed as a placeholder
2022
+ DoubleBuffer<NullType> d_values;
2023
+
2024
+ return DispatchRadixSort<SortOrder::Ascending, KeyT, NullType, OffsetT>::Dispatch(
2025
+ d_temp_storage,
2026
+ temp_storage_bytes,
2027
+ d_keys,
2028
+ d_values,
2029
+ static_cast<OffsetT>(num_items),
2030
+ begin_bit,
2031
+ end_bit,
2032
+ is_overwrite_okay,
2033
+ stream);
2034
+ }
2035
+
2036
+ //! @rst
2037
+ //! Sorts keys into ascending order using :math:`\approx 2N` auxiliary storage.
2038
+ //!
2039
+ //! * The contents of the input data are not altered by the sorting operation.
2040
+ //! * Pointers to contiguous memory must be used; iterators are not currently
2041
+ //! supported.
2042
+ //! * In-place operations are not supported. There must be no overlap between
2043
+ //! any of the provided ranges:
2044
+ //!
2045
+ //! * ``[d_keys_in, d_keys_in + num_items)``
2046
+ //! * ``[d_keys_out, d_keys_out + num_items)``
2047
+ //!
2048
+ //! * A bit subrange ``[begin_bit, end_bit)`` is provided to specify
2049
+ //! differentiating key bits. This can reduce overall sorting overhead and
2050
+ //! yield a corresponding performance improvement.
2051
+ //! * @devicestorageNP For sorting using only :math:`O(P)` temporary storage, see
2052
+ //! the sorting interface using DoubleBuffer wrappers below.
2053
+ //! * @devicestorage
2054
+ //!
2055
+ //! Snippet
2056
+ //! ==========================================================================
2057
+ //!
2058
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
2059
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
2060
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
2061
+ //! tuple of references to relevant members of the key.
2062
+ //!
2063
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
2064
+ //! :language: c++
2065
+ //! :dedent:
2066
+ //! :start-after: example-begin custom-type
2067
+ //! :end-before: example-end custom-type
2068
+ //!
2069
+ //! The following snippet shows how to sort an array of ``custom_t`` objects
2070
+ //! using ``cub::DeviceRadixSort::SortKeys``:
2071
+ //!
2072
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
2073
+ //! :language: c++
2074
+ //! :dedent:
2075
+ //! :start-after: example-begin keys-bits
2076
+ //! :end-before: example-end keys-bits
2077
+ //!
2078
+ //! @endrst
2079
+ //!
2080
+ //! @tparam KeyT
2081
+ //! **[inferred]** KeyT type
2082
+ //!
2083
+ //! @tparam NumItemsT
2084
+ //! **[inferred]** Type of num_items
2085
+ //!
2086
+ //! @tparam DecomposerT
2087
+ //! **[inferred]** Type of a callable object responsible for decomposing a
2088
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
2089
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
2090
+ //! The leftmost element of the tuple is considered the most significant.
2091
+ //! The call operator must not modify members of the key.
2092
+ //!
2093
+ //! @param[in] d_temp_storage
2094
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
2095
+ //! required allocation size is written to `temp_storage_bytes` and no work
2096
+ //! is done.
2097
+ //!
2098
+ //! @param[in,out] temp_storage_bytes
2099
+ //! Reference to size in bytes of `d_temp_storage` allocation
2100
+ //!
2101
+ //! @param[in] d_keys_in
2102
+ //! Pointer to the input sequence of key data to sort
2103
+ //!
2104
+ //! @param[out] d_keys_out
2105
+ //! Pointer to the sorted output sequence of key data
2106
+ //!
2107
+ //! @param[in] num_items
2108
+ //! Number of items to sort
2109
+ //!
2110
+ //! @param[in] decomposer
2111
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
2112
+ //! references to its constituent arithmetic types. The leftmost element of
2113
+ //! the tuple is considered the most significant. The call operator must not
2114
+ //! modify members of the key.
2115
+ //!
2116
+ //! @param[in] begin_bit
2117
+ //! **[optional]** The least-significant bit index (inclusive) needed for
2118
+ //! key comparison
2119
+ //!
2120
+ //! @param[in] end_bit
2121
+ //! **[optional]** The most-significant bit index (exclusive) needed for key
2122
+ //! comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`)
2123
+ //!
2124
+ //! @param[in] stream
2125
+ //! **[optional]** CUDA stream to launch kernels within.
2126
+ //! Default is stream<sub>0</sub>.
2127
+ template <typename KeyT, typename NumItemsT, typename DecomposerT>
2128
+ CUB_RUNTIME_FUNCTION static //
2129
+ ::cuda::std::enable_if_t< //
2130
+ !::cuda::std::is_convertible_v<DecomposerT, int>, // excludes int-convertible DecomposerT (presumably so integer bit arguments never bind as a decomposer)
2131
+ cudaError_t>
2132
+ SortKeys(void* d_temp_storage,
2133
+ size_t& temp_storage_bytes,
2134
+ const KeyT* d_keys_in,
2135
+ KeyT* d_keys_out,
2136
+ NumItemsT num_items,
2137
+ DecomposerT decomposer,
2138
+ int begin_bit,
2139
+ int end_bit,
2140
+ cudaStream_t stream = 0)
2141
+ {
2142
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());
2143
+
2144
+ // Unsigned integer type for global offsets, chosen from NumItemsT
2145
+ using offset_t = detail::choose_offset_t<NumItemsT>;
2146
+ using decomposer_check_t = detail::radix::decomposer_check_t<KeyT, DecomposerT>;
2147
+
2148
+ static_assert(decomposer_check_t::value,
2149
+ "DecomposerT must be a callable object returning a tuple of references to "
2150
+ "arithmetic types");
2151
+
2152
+ // We cast away const-ness of `d_keys_in` so it fits in a DoubleBuffer, but will *not* write to it.
2153
+ // `DispatchRadixSort::Dispatch` will allocate temporary storage and
2154
+ // create a new double-buffer internally when the `is_overwrite_okay` flag
2155
+ // is not set (it is false here, so the input keys are left untouched).
2156
+ constexpr bool is_overwrite_okay = false;
2157
+ DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
2158
+ DoubleBuffer<NullType> d_values;
2159
+
2160
+ return DeviceRadixSort::custom_radix_sort<SortOrder::Ascending>(
2161
+ decomposer_check_t{},
2162
+ d_temp_storage,
2163
+ temp_storage_bytes,
2164
+ is_overwrite_okay,
2165
+ d_keys,
2166
+ d_values,
2167
+ static_cast<offset_t>(num_items),
2168
+ decomposer,
2169
+ begin_bit,
2170
+ end_bit,
2171
+ stream);
2172
+ }
2173
+
2174
+ //! @rst
2175
+ //! Sorts keys into ascending order using :math:`\approx 2N` auxiliary storage.
2176
+ //!
2177
+ //! * The contents of the input data are not altered by the sorting operation.
2178
+ //! * Pointers to contiguous memory must be used; iterators are not currently
2179
+ //! supported.
2180
+ //! * In-place operations are not supported. There must be no overlap between
2181
+ //! any of the provided ranges:
2182
+ //!
2183
+ //! * ``[d_keys_in, d_keys_in + num_items)``
2184
+ //! * ``[d_keys_out, d_keys_out + num_items)``
2185
+ //!
2186
+ //! * An optional bit subrange ``[begin_bit, end_bit)`` of differentiating key
2187
+ //! bits can be specified. This can reduce overall sorting overhead and
2188
+ //! yield a corresponding performance improvement.
2189
+ //! * @devicestorageNP For sorting using only :math:`O(P)` temporary storage, see
2190
+ //! the sorting interface using DoubleBuffer wrappers below.
2191
+ //! * @devicestorage
2192
+ //!
2193
+ //! Snippet
2194
+ //! ==========================================================================
2195
+ //!
2196
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
2197
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
2198
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
2199
+ //! tuple of references to relevant members of the key.
2200
+ //!
2201
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
2202
+ //! :language: c++
2203
+ //! :dedent:
2204
+ //! :start-after: example-begin custom-type
2205
+ //! :end-before: example-end custom-type
2206
+ //!
2207
+ //! The following snippet shows how to sort an array of ``custom_t`` objects
2208
+ //! using ``cub::DeviceRadixSort::SortKeys``:
2209
+ //!
2210
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
2211
+ //! :language: c++
2212
+ //! :dedent:
2213
+ //! :start-after: example-begin keys
2214
+ //! :end-before: example-end keys
2215
+ //!
2216
+ //! @endrst
2217
+ //!
2218
+ //! @tparam KeyT
2219
+ //! **[inferred]** KeyT type
2220
+ //!
2221
+ //! @tparam NumItemsT
2222
+ //! **[inferred]** Type of num_items
2223
+ //!
2224
+ //! @tparam DecomposerT
2225
+ //! **[inferred]** Type of a callable object responsible for decomposing a
2226
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
2227
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
2228
+ //! The leftmost element of the tuple is considered the most significant.
2229
+ //! The call operator must not modify members of the key.
2230
+ //!
2231
+ //! @param[in] d_temp_storage
2232
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
2233
+ //! required allocation size is written to `temp_storage_bytes` and no work
2234
+ //! is done.
2235
+ //!
2236
+ //! @param[in,out] temp_storage_bytes
2237
+ //! Reference to size in bytes of `d_temp_storage` allocation
2238
+ //!
2239
+ //! @param[in] d_keys_in
2240
+ //! Pointer to the input sequence of key data to sort
2241
+ //!
2242
+ //! @param[out] d_keys_out
2243
+ //! Pointer to the sorted output sequence of key data
2244
+ //!
2245
+ //! @param[in] num_items
2246
+ //! Number of items to sort
2247
+ //!
2248
+ //! @param[in] decomposer
2249
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
2250
+ //! references to its constituent arithmetic types. The leftmost element of
2251
+ //! the tuple is considered the most significant. The call operator must not
2252
+ //! modify members of the key.
2253
+ //!
2254
+ //! @param[in] stream
2255
+ //! **[optional]** CUDA stream to launch kernels within.
2256
+ //! Default is stream<sub>0</sub>.
2257
+ template <typename KeyT, typename NumItemsT, typename DecomposerT>
2258
+ CUB_RUNTIME_FUNCTION static //
2259
+ ::cuda::std::enable_if_t< //
2260
+ !::cuda::std::is_convertible_v<DecomposerT, int>, // excludes int-convertible DecomposerT (presumably so integer bit arguments never bind as a decomposer)
2261
+ cudaError_t>
2262
+ SortKeys(void* d_temp_storage,
2263
+ size_t& temp_storage_bytes,
2264
+ const KeyT* d_keys_in,
2265
+ KeyT* d_keys_out,
2266
+ NumItemsT num_items,
2267
+ DecomposerT decomposer,
2268
+ cudaStream_t stream = 0)
2269
+ {
2270
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());
2271
+
2272
+ // Unsigned integer type for global offsets, chosen from NumItemsT
2273
+ using offset_t = detail::choose_offset_t<NumItemsT>;
2274
+ using decomposer_check_t = detail::radix::decomposer_check_t<KeyT, DecomposerT>;
2275
+
2276
+ static_assert(decomposer_check_t::value,
2277
+ "DecomposerT must be a callable object returning a tuple of references to "
2278
+ "arithmetic types");
2279
+
2280
+ // We cast away const-ness of `d_keys_in` so it fits in a DoubleBuffer, but will *not* write to it.
2281
+ // `DispatchRadixSort::Dispatch` will allocate temporary storage and
2282
+ // create a new double-buffer internally when the `is_overwrite_okay` flag
2283
+ // is not set (it is false here, so the input keys are left untouched).
2284
+ constexpr bool is_overwrite_okay = false;
2285
+ DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
2286
+ DoubleBuffer<NullType> d_values;
2287
+
2288
+ return DeviceRadixSort::custom_radix_sort<SortOrder::Ascending>(
2289
+ decomposer_check_t{},
2290
+ d_temp_storage,
2291
+ temp_storage_bytes,
2292
+ is_overwrite_okay,
2293
+ d_keys,
2294
+ d_values,
2295
+ static_cast<offset_t>(num_items),
2296
+ decomposer,
2297
+ stream);
2298
+ }
2299
+
2300
+ //! @brief Sorts keys into ascending order. (`~N` auxiliary storage required).
2301
+ //!
2302
+ //! @par
2303
+ //! - The sorting operation is given a pair of key buffers managed by a
2304
+ //! DoubleBuffer structure that indicates which of the two buffers is
2305
+ //! "current" (and thus contains the input data to be sorted).
2306
+ //! - The contents of both buffers may be altered by the sorting operation.
2307
+ //! - In-place operations are not supported. There must be no overlap between
2308
+ //! any of the provided ranges:
2309
+ //! - `[d_keys.Current(), d_keys.Current() + num_items)`
2310
+ //! - `[d_keys.Alternate(), d_keys.Alternate() + num_items)`
2311
+ //! - Upon completion, the sorting operation will update the "current"
2312
+ //! indicator within the DoubleBuffer wrapper to reference which of the two
2313
+ //! buffers now contains the sorted output sequence (a function of the
2314
+ //! number of key bits specified and the targeted device architecture).
2315
+ //! - An optional bit subrange `[begin_bit, end_bit)` of differentiating key
2316
+ //! bits can be specified. This can reduce overall sorting overhead and
2317
+ //! yield a corresponding performance improvement.
2318
+ //! - @devicestorageP
2319
+ //! - @devicestorage
2320
+ //!
2321
+ //! @par Performance
2322
+ //! The following charts illustrate saturated sorting performance across
2323
+ //! different CUDA architectures for uniform-random `uint32` and `uint64`
2324
+ //! keys, respectively.
2325
+ //!
2326
+ //! @image html lsb_radix_sort_int32_keys.png
2327
+ //! @image html lsb_radix_sort_int64_keys.png
2328
+ //!
2329
+ //! @par Snippet
2330
+ //! The code snippet below illustrates the sorting of a device vector of
2331
+ //! `int` keys.
2332
+ //! @par
2333
+ //! @code
2334
+ //! #include <cub/cub.cuh>
2335
+ //! // or equivalently <cub/device/device_radix_sort.cuh>
2336
+ //!
2337
+ //! // Declare, allocate, and initialize device-accessible pointers
2338
+ //! // for sorting data
2339
+ //! int num_items; // e.g., 7
2340
+ //! int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9]
2341
+ //! int *d_key_alt_buf; // e.g., [ ... ]
2342
+ //! ...
2343
+ //!
2344
+ //! // Create a DoubleBuffer to wrap the pair of device pointers
2345
+ //! cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
2346
+ //!
2347
+ //! // Determine temporary device storage requirements
2348
+ //! void *d_temp_storage = nullptr;
2349
+ //! size_t temp_storage_bytes = 0;
2350
+ //! cub::DeviceRadixSort::SortKeys(
2351
+ //! d_temp_storage, temp_storage_bytes, d_keys, num_items);
2352
+ //!
2353
+ //! // Allocate temporary storage
2354
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
2355
+ //!
2356
+ //! // Run sorting operation
2357
+ //! cub::DeviceRadixSort::SortKeys(
2358
+ //! d_temp_storage, temp_storage_bytes, d_keys, num_items);
2359
+ //!
2360
+ //! // d_keys.Current() <-- [0, 3, 5, 6, 7, 8, 9]
2361
+ //! @endcode
2362
+ //!
2363
+ //! @tparam KeyT
2364
+ //! **[inferred]** KeyT type
2365
+ //!
2366
+ //! @tparam NumItemsT
2367
+ //! **[inferred]** Type of num_items
2368
+ //!
2369
+ //! @param[in] d_temp_storage
2370
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
2371
+ //! required allocation size is written to `temp_storage_bytes` and no work
2372
+ //! is done.
2373
+ //!
2374
+ //! @param[in,out] temp_storage_bytes
2375
+ //! Reference to size in bytes of `d_temp_storage` allocation
2376
+ //!
2377
+ //! @param[in,out] d_keys
2378
+ //! Reference to the double-buffer of keys whose "current" device-accessible
2379
+ //! buffer contains the unsorted input keys and, upon return, is updated to
2380
+ //! point to the sorted output keys
2381
+ //!
2382
+ //! @param[in] num_items
2383
+ //! Number of items to sort
2384
+ //!
2385
+ //! @param[in] begin_bit
2386
+ //! **[optional]** The least-significant bit index (inclusive) needed for
2387
+ //! key comparison
2388
+ //!
2389
+ //! @param[in] end_bit
2390
+ //! **[optional]** The most-significant bit index (exclusive) needed for key
2391
+ //! comparison (e.g., `sizeof(unsigned int) * 8`)
2392
+ //!
2393
+ //! @param[in] stream
2394
+ //! **[optional]** CUDA stream to launch kernels within.
2395
+ //! Default is stream<sub>0</sub>.
2396
+ template <typename KeyT, typename NumItemsT>
2397
+ CUB_RUNTIME_FUNCTION static cudaError_t SortKeys(
2398
+ void* d_temp_storage,
2399
+ size_t& temp_storage_bytes,
2400
+ DoubleBuffer<KeyT>& d_keys,
2401
+ NumItemsT num_items,
2402
+ int begin_bit = 0,
2403
+ int end_bit = sizeof(KeyT) * 8,
2404
+ cudaStream_t stream = 0)
2405
+ {
2406
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());
2407
+
2408
+ // Unsigned integer type for global offsets, chosen from NumItemsT.
2409
+ using OffsetT = detail::choose_offset_t<NumItemsT>;
2410
+
2411
+ constexpr bool is_overwrite_okay = true; // DoubleBuffer overload: both key buffers may be altered
2412
+
2413
+ // Keys-only sort: there are no values, so a NullType double-buffer is passed as a placeholder
2414
+ DoubleBuffer<NullType> d_values;
2415
+
2416
+ return DispatchRadixSort<SortOrder::Ascending, KeyT, NullType, OffsetT>::Dispatch(
2417
+ d_temp_storage, temp_storage_bytes, d_keys, d_values, static_cast<OffsetT>(num_items), begin_bit, end_bit, is_overwrite_okay, stream); // explicit NumItemsT -> OffsetT conversion, matching the other overloads
2418
+ }
2419
+
2420
+ //! @rst
2421
+ //! Sorts keys into ascending order using :math:`\approx N` auxiliary storage.
2422
+ //!
2423
+ //! * The sorting operation is given a pair of key buffers managed by a
2424
+ //! DoubleBuffer structure that indicates which of the two buffers is
2425
+ //! "current" (and thus contains the input data to be sorted).
2426
+ //! * The contents of both buffers may be altered by the sorting operation.
2427
+ //! * In-place operations are not supported. There must be no overlap between
2428
+ //! any of the provided ranges:
2429
+ //!
2430
+ //! * ``[d_keys.Current(), d_keys.Current() + num_items)``
2431
+ //! * ``[d_keys.Alternate(), d_keys.Alternate() + num_items)``
2432
+ //!
2433
+ //! * Upon completion, the sorting operation will update the "current"
2434
+ //! indicator within the DoubleBuffer wrapper to reference which of the two
2435
+ //! buffers now contains the sorted output sequence (a function of the
2436
+ //! number of key bits specified and the targeted device architecture).
2437
+ //! * @devicestorageP
2438
+ //! * @devicestorage
2439
+ //!
2440
+ //! Snippet
2441
+ //! ==========================================================================
2442
+ //!
2443
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
2444
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
2445
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
2446
+ //! tuple of references to relevant members of the key.
2447
+ //!
2448
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
2449
+ //! :language: c++
2450
+ //! :dedent:
2451
+ //! :start-after: example-begin custom-type
2452
+ //! :end-before: example-end custom-type
2453
+ //!
2454
+ //! The following snippet shows how to sort an array of ``custom_t`` objects
2455
+ //! using ``cub::DeviceRadixSort::SortKeys``:
2456
+ //!
2457
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
2458
+ //! :language: c++
2459
+ //! :dedent:
2460
+ //! :start-after: example-begin keys-db
2461
+ //! :end-before: example-end keys-db
2462
+ //!
2463
+ //! @endrst
2464
+ //!
2465
+ //! @tparam KeyT
2466
+ //! **[inferred]** KeyT type
2467
+ //!
2468
+ //! @tparam NumItemsT
2469
+ //! **[inferred]** Type of num_items
2470
+ //!
2471
+ //! @tparam DecomposerT
2472
+ //! **[inferred]** Type of a callable object responsible for decomposing a
2473
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
2474
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
2475
+ //! The leftmost element of the tuple is considered the most significant.
2476
+ //! The call operator must not modify members of the key.
2477
+ //!
2478
+ //! @param[in] d_temp_storage
2479
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
2480
+ //! required allocation size is written to `temp_storage_bytes` and no work
2481
+ //! is done.
2482
+ //!
2483
+ //! @param[in,out] temp_storage_bytes
2484
+ //! Reference to size in bytes of `d_temp_storage` allocation
2485
+ //!
2486
+ //! @param[in,out] d_keys
2487
+ //! Reference to the double-buffer of keys whose "current" device-accessible
2488
+ //! buffer contains the unsorted input keys and, upon return, is updated to
2489
+ //! point to the sorted output keys
2490
+ //!
2491
+ //! @param[in] num_items
2492
+ //! Number of items to sort
2493
+ //!
2494
+ //! @param[in] decomposer
2495
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
2496
+ //! references to its constituent arithmetic types. The leftmost element of
2497
+ //! the tuple is considered the most significant. The call operator must not
2498
+ //! modify members of the key.
2499
+ //!
2500
+ //! @param[in] stream
2501
+ //! **[optional]** CUDA stream to launch kernels within.
2502
+ //! Default is stream<sub>0</sub>.
2503
+ template <typename KeyT, typename NumItemsT, typename DecomposerT>
2504
+ CUB_RUNTIME_FUNCTION static //
2505
+ ::cuda::std::enable_if_t< //
2506
+ !::cuda::std::is_convertible_v<DecomposerT, int>, // excludes int-convertible DecomposerT (presumably so integer bit arguments never bind as a decomposer)
2507
+ cudaError_t>
2508
+ SortKeys(void* d_temp_storage,
2509
+ size_t& temp_storage_bytes,
2510
+ DoubleBuffer<KeyT>& d_keys,
2511
+ NumItemsT num_items,
2512
+ DecomposerT decomposer,
2513
+ cudaStream_t stream = 0)
2514
+ {
2515
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());
2516
+
2517
+ // Unsigned integer type for global offsets, chosen from NumItemsT
2518
+ using offset_t = detail::choose_offset_t<NumItemsT>;
2519
+ using decomposer_check_t = detail::radix::decomposer_check_t<KeyT, DecomposerT>;
2520
+
2521
+ static_assert(decomposer_check_t::value,
2522
+ "DecomposerT must be a callable object returning a tuple of references to "
2523
+ "arithmetic types");
2524
+
2525
+ constexpr bool is_overwrite_okay = true; // DoubleBuffer overload: both key buffers may be altered
2526
+ DoubleBuffer<NullType> d_values; // keys-only sort: NullType placeholder for the absent values
2527
+
2528
+ return DeviceRadixSort::custom_radix_sort<SortOrder::Ascending>(
2529
+ decomposer_check_t{},
2530
+ d_temp_storage,
2531
+ temp_storage_bytes,
2532
+ is_overwrite_okay,
2533
+ d_keys,
2534
+ d_values,
2535
+ static_cast<offset_t>(num_items),
2536
+ decomposer,
2537
+ stream);
2538
+ }
2539
+
2540
+ //! @rst
2541
+ //! Sorts keys into ascending order using :math:`\approx N` auxiliary storage.
2542
+ //!
2543
+ //! * The sorting operation is given a pair of key buffers managed by a
2544
+ //! DoubleBuffer structure that indicates which of the two buffers is
2545
+ //! "current" (and thus contains the input data to be sorted).
2546
+ //! * The contents of both buffers may be altered by the sorting operation.
2547
+ //! * In-place operations are not supported. There must be no overlap between
2548
+ //! any of the provided ranges:
2549
+ //!
2550
+ //! * ``[d_keys.Current(), d_keys.Current() + num_items)``
2551
+ //! * ``[d_keys.Alternate(), d_keys.Alternate() + num_items)``
2552
+ //!
2553
+ //! * A bit subrange ``[begin_bit, end_bit)`` is provided to specify
2554
+ //! differentiating key bits. This can reduce overall sorting overhead and
2555
+ //! yield a corresponding performance improvement.
2556
+ //! * Upon completion, the sorting operation will update the "current"
2557
+ //! indicator within the DoubleBuffer wrapper to reference which of the two
2558
+ //! buffers now contains the sorted output sequence (a function of the
2559
+ //! number of key bits specified and the targeted device architecture).
2560
+ //! * @devicestorageP
2561
+ //! * @devicestorage
2562
+ //!
2563
+ //! Snippet
2564
+ //! ==========================================================================
2565
+ //!
2566
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
2567
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
2568
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
2569
+ //! tuple of references to relevant members of the key.
2570
+ //!
2571
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
2572
+ //! :language: c++
2573
+ //! :dedent:
2574
+ //! :start-after: example-begin custom-type
2575
+ //! :end-before: example-end custom-type
2576
+ //!
2577
+ //! The following snippet shows how to sort an array of ``custom_t`` objects
2578
+ //! using ``cub::DeviceRadixSort::SortKeys``:
2579
+ //!
2580
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
2581
+ //! :language: c++
2582
+ //! :dedent:
2583
+ //! :start-after: example-begin keys-bits-db
2584
+ //! :end-before: example-end keys-bits-db
2585
+ //!
2586
+ //! @endrst
2587
+ //!
2588
+ //! @tparam KeyT
2589
+ //! **[inferred]** KeyT type
2590
+ //!
2591
+ //! @tparam NumItemsT
2592
+ //! **[inferred]** Type of num_items
2593
+ //!
2594
+ //! @tparam DecomposerT
2595
+ //! **[inferred]** Type of a callable object responsible for decomposing a
2596
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
2597
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
2598
+ //! The leftmost element of the tuple is considered the most significant.
2599
+ //! The call operator must not modify members of the key.
2600
+ //!
2601
+ //! @param[in] d_temp_storage
2602
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
2603
+ //! required allocation size is written to `temp_storage_bytes` and no work
2604
+ //! is done.
2605
+ //!
2606
+ //! @param[in,out] temp_storage_bytes
2607
+ //! Reference to size in bytes of `d_temp_storage` allocation
2608
+ //!
2609
+ //! @param[in,out] d_keys
2610
+ //! Reference to the double-buffer of keys whose "current" device-accessible
2611
+ //! buffer contains the unsorted input keys and, upon return, is updated to
2612
+ //! point to the sorted output keys
2613
+ //!
2614
+ //! @param[in] num_items
2615
+ //! Number of items to sort
2616
+ //!
2617
+ //! @param[in] decomposer
2618
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
2619
+ //! references to its constituent arithmetic types. The leftmost element of
2620
+ //! the tuple is considered the most significant. The call operator must not
2621
+ //! modify members of the key.
2622
+ //!
2623
+ //! @param[in] begin_bit
2624
+ //! **[optional]** The least-significant bit index (inclusive) needed for
2625
+ //! key comparison
2626
+ //!
2627
+ //! @param[in] end_bit
2628
+ //! **[optional]** The most-significant bit index (exclusive) needed for key
2629
+ //! comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`)
2630
+ //!
2631
+ //! @param[in] stream
2632
+ //! **[optional]** CUDA stream to launch kernels within.
2633
+ //! Default is stream<sub>0</sub>.
2634
+ template <typename KeyT, typename NumItemsT, typename DecomposerT>
2635
+ CUB_RUNTIME_FUNCTION static //
2636
+ ::cuda::std::enable_if_t< //
2637
+ !::cuda::std::is_convertible_v<DecomposerT, int>, // excludes int-convertible DecomposerT (presumably so integer bit arguments never bind as a decomposer)
2638
+ cudaError_t>
2639
+ SortKeys(void* d_temp_storage,
2640
+ size_t& temp_storage_bytes,
2641
+ DoubleBuffer<KeyT>& d_keys,
2642
+ NumItemsT num_items,
2643
+ DecomposerT decomposer,
2644
+ int begin_bit,
2645
+ int end_bit,
2646
+ cudaStream_t stream = 0)
2647
+ {
2648
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());
2649
+
2650
+ // Unsigned integer type for global offsets, chosen from NumItemsT
2651
+ using offset_t = detail::choose_offset_t<NumItemsT>;
2652
+ using decomposer_check_t = detail::radix::decomposer_check_t<KeyT, DecomposerT>;
2653
+
2654
+ static_assert(decomposer_check_t::value,
2655
+ "DecomposerT must be a callable object returning a tuple of references to "
2656
+ "arithmetic types");
2657
+
2658
+ constexpr bool is_overwrite_okay = true; // DoubleBuffer overload: both key buffers may be altered
2659
+ DoubleBuffer<NullType> d_values; // keys-only sort: NullType placeholder for the absent values
2660
+
2661
+ return DeviceRadixSort::custom_radix_sort<SortOrder::Ascending>(
2662
+ decomposer_check_t{},
2663
+ d_temp_storage,
2664
+ temp_storage_bytes,
2665
+ is_overwrite_okay,
2666
+ d_keys,
2667
+ d_values,
2668
+ static_cast<offset_t>(num_items),
2669
+ decomposer,
2670
+ begin_bit,
2671
+ end_bit,
2672
+ stream);
2673
+ }
2674
+
2675
+ //! @brief Sorts keys into descending order.
2676
+ //! (`~2N` auxiliary storage required).
2677
+ //!
2678
+ //! @par
2679
+ //! - The contents of the input data are not altered by the sorting operation.
2680
+ //! - Pointers to contiguous memory must be used; iterators are not currently
2681
+ //! supported.
2682
+ //! - In-place operations are not supported. There must be no overlap between
2683
+ //! any of the provided ranges:
2684
+ //! - `[d_keys_in, d_keys_in + num_items)`
2685
+ //! - `[d_keys_out, d_keys_out + num_items)`
2686
+ //! - An optional bit subrange `[begin_bit, end_bit)` of differentiating key
2687
+ //! bits can be specified. This can reduce overall sorting overhead and
2688
+ //! yield a corresponding performance improvement.
2689
+ //! - @devicestorageNP For sorting using only `O(P)` temporary storage, see
2690
+ //! the sorting interface using DoubleBuffer wrappers below.
2691
+ //! - @devicestorage
2692
+ //!
2693
+ //! @par Performance
2694
+ //! Performance is similar to DeviceRadixSort::SortKeys.
2695
+ //!
2696
+ //! @par Snippet
2697
+ //! The code snippet below illustrates the sorting of a device vector of
2698
+ //! `int` keys.
2699
+ //! @par
2700
+ //! @code
2701
+ //! #include <cub/cub.cuh>
2702
+ //! // or equivalently <cub/device/device_radix_sort.cuh>
2703
+ //!
2704
+ //! // Declare, allocate, and initialize device-accessible pointers
2705
+ //! // for sorting data
2706
+ //! int num_items; // e.g., 7
2707
+ //! int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
2708
+ //! int *d_keys_out; // e.g., [ ... ]
2709
+ //! ...
2710
+ //!
2711
+ //! // No DoubleBuffer is needed for this overload: d_keys_in and d_keys_out
2712
+ //! // are passed to SortKeysDescending directly.
2713
+ //!
2714
+ //! // Determine temporary device storage requirements
2715
+ //! void *d_temp_storage = nullptr;
2716
+ //! size_t temp_storage_bytes = 0;
2717
+ //! cub::DeviceRadixSort::SortKeysDescending(
2718
+ //! d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items);
2719
+ //!
2720
+ //! // Allocate temporary storage
2721
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
2722
+ //!
2723
+ //! // Run sorting operation
2724
+ //! cub::DeviceRadixSort::SortKeysDescending(
2725
+ //! d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items);
2726
+ //!
2727
+ //! // d_keys_out <-- [9, 8, 7, 6, 5, 3, 0]
2728
+ //!
2729
+ //! @endcode
2730
+ //!
2731
+ //! @tparam KeyT
2732
+ //! **[inferred]** KeyT type
2733
+ //!
2734
+ //! @tparam NumItemsT
2735
+ //! **[inferred]** Type of num_items
2736
+ //!
2737
+ //! @param[in] d_temp_storage
2738
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
2739
+ //! required allocation size is written to `temp_storage_bytes` and no work
2740
+ //! is done.
2741
+ //!
2742
+ //! @param[in,out] temp_storage_bytes
2743
+ //! Reference to size in bytes of `d_temp_storage` allocation
2744
+ //!
2745
+ //! @param[in] d_keys_in
2746
+ //! Pointer to the input data of key data to sort
2747
+ //!
2748
+ //! @param[out] d_keys_out
2749
+ //! Pointer to the sorted output sequence of key data
2750
+ //!
2751
+ //! @param[in] num_items
2752
+ //! Number of items to sort
2753
+ //!
2754
+ //! @param[in] begin_bit
2755
+ //! **[optional]** The least-significant bit index (inclusive) needed for
2756
+ //! key comparison
2757
+ //!
2758
+ //! @param[in] end_bit
2759
+ //! **[optional]** The most-significant bit index (exclusive) needed for key
2760
+ //! comparison (e.g., `sizeof(unsigned int) * 8`)
2761
+ //!
2762
+ //! @param[in] stream
2763
+ //! **[optional]** CUDA stream to launch kernels within.
2764
+ //! Default is stream<sub>0</sub>.
2765
+ template <typename KeyT, typename NumItemsT>
2766
+ CUB_RUNTIME_FUNCTION static cudaError_t SortKeysDescending(
2767
+ void* d_temp_storage,
2768
+ size_t& temp_storage_bytes,
2769
+ const KeyT* d_keys_in,
2770
+ KeyT* d_keys_out,
2771
+ NumItemsT num_items,
2772
+ int begin_bit = 0,
2773
+ int end_bit = sizeof(KeyT) * 8,
2774
+ cudaStream_t stream = 0)
2775
+ {
2776
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());
2777
+
2778
+ // Offset type for global indexing, selected from the type of `num_items`.
2779
+ using offset_t = detail::choose_offset_t<NumItemsT>;
2780
+
2781
+ // The const qualifier on the input is dropped only to build the
2782
+ // DoubleBuffer; the input array is *not* written. With overwriting
2783
+ // disallowed, `DispatchRadixSort::Dispatch` allocates temporary storage
2784
+ // and creates a new double-buffer internally.
2785
+ DoubleBuffer<NullType> no_values;
2786
+ DoubleBuffer<KeyT> key_buffers(const_cast<KeyT*>(d_keys_in), d_keys_out);
2787
+ constexpr bool overwrite_allowed = false;
2788
+
2789
+ return DispatchRadixSort<SortOrder::Descending, KeyT, NullType, offset_t>::Dispatch(
2790
+ d_temp_storage, temp_storage_bytes, key_buffers, no_values, num_items, begin_bit, end_bit, overwrite_allowed, stream);
2791
+ }
2792
+
2793
+ //! @rst
2794
+ //! Sorts keys into descending order using :math:`\approx 2N` auxiliary storage.
2795
+ //!
2796
+ //! * The contents of the input data are not altered by the sorting operation.
2797
+ //! * Pointers to contiguous memory must be used; iterators are not currently
2798
+ //! supported.
2799
+ //! * In-place operations are not supported. There must be no overlap between
2800
+ //! any of the provided ranges:
2801
+ //!
2802
+ //! * ``[d_keys_in, d_keys_in + num_items)``
2803
+ //! * ``[d_keys_out, d_keys_out + num_items)``
2804
+ //!
2805
+ //! * A bit subrange ``[begin_bit, end_bit)`` is provided to specify
2806
+ //! differentiating key bits. This can reduce overall sorting overhead and
2807
+ //! yield a corresponding performance improvement.
2808
+ //! * @devicestorageNP For sorting using only :math:`O(P)` temporary storage, see
2809
+ //! the sorting interface using DoubleBuffer wrappers below.
2810
+ //! * @devicestorage
2811
+ //!
2812
+ //! Snippet
2813
+ //! ==========================================================================
2814
+ //!
2815
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
2816
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
2817
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
2818
+ //! tuple of references to relevant members of the key.
2819
+ //!
2820
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
2821
+ //! :language: c++
2822
+ //! :dedent:
2823
+ //! :start-after: example-begin custom-type
2824
+ //! :end-before: example-end custom-type
2825
+ //!
2826
+ //! The following snippet shows how to sort an array of ``custom_t`` objects
2827
+ //! using ``cub::DeviceRadixSort::SortKeysDescending``:
2828
+ //!
2829
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
2830
+ //! :language: c++
2831
+ //! :dedent:
2832
+ //! :start-after: example-begin keys-descending-bits
2833
+ //! :end-before: example-end keys-descending-bits
2834
+ //!
2835
+ //! @endrst
2836
+ //!
2837
+ //! @tparam KeyT
2838
+ //! **[inferred]** KeyT type
2839
+ //!
2840
+ //! @tparam NumItemsT
2841
+ //! **[inferred]** Type of num_items
2842
+ //!
2843
+ //! @tparam DecomposerT
2844
+ //! **[inferred]** Type of a callable object responsible for decomposing a
2845
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
2846
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
2847
+ //! The leftmost element of the tuple is considered the most significant.
2848
+ //! The call operator must not modify members of the key.
2849
+ //!
2850
+ //! @param[in] d_temp_storage
2851
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
2852
+ //! required allocation size is written to `temp_storage_bytes` and no work
2853
+ //! is done.
2854
+ //!
2855
+ //! @param[in,out] temp_storage_bytes
2856
+ //! Reference to size in bytes of `d_temp_storage` allocation
2857
+ //!
2858
+ //! @param[in] d_keys_in
2859
+ //! Pointer to the input data of key data to sort
2860
+ //!
2861
+ //! @param[out] d_keys_out
2862
+ //! Pointer to the sorted output sequence of key data
2863
+ //!
2864
+ //! @param[in] num_items
2865
+ //! Number of items to sort
2866
+ //!
2867
+ //! @param decomposer
2868
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
2869
+ //! references to its constituent arithmetic types. The leftmost element of
2870
+ //! the tuple is considered the most significant. The call operator must not
2871
+ //! modify members of the key.
2872
+ //!
2873
+ //! @param[in] begin_bit
2874
+ //! The least-significant bit index (inclusive) needed for
2875
+ //! key comparison
2876
+ //!
2877
+ //! @param[in] end_bit
2878
+ //! The most-significant bit index (exclusive) needed for key
2879
+ //! comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`)
2880
+ //!
2881
+ //! @param[in] stream
2882
+ //! **[optional]** CUDA stream to launch kernels within.
2883
+ //! Default is stream<sub>0</sub>.
2884
+ template <typename KeyT, typename NumItemsT, typename DecomposerT>
2885
+ CUB_RUNTIME_FUNCTION static //
2886
+ ::cuda::std::enable_if_t< //
2887
+ !::cuda::std::is_convertible_v<DecomposerT, int>, //
2888
+ cudaError_t>
2889
+ SortKeysDescending(
2890
+ void* d_temp_storage,
2891
+ size_t& temp_storage_bytes,
2892
+ const KeyT* d_keys_in,
2893
+ KeyT* d_keys_out,
2894
+ NumItemsT num_items,
2895
+ DecomposerT decomposer,
2896
+ int begin_bit,
2897
+ int end_bit,
2898
+ cudaStream_t stream = 0)
2899
+ {
2900
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());
2901
+
2902
+ using check_t = detail::radix::decomposer_check_t<KeyT, DecomposerT>;
2903
+ static_assert(check_t::value,
2904
+ "DecomposerT must be a callable object returning a tuple of references to "
2905
+ "arithmetic types");
2906
+
2907
+ // Offset type for global indexing, selected from the type of `num_items`.
2908
+ using global_offset_t = detail::choose_offset_t<NumItemsT>;
2909
+
2910
+ // The const qualifier on the input is dropped only to build the
2911
+ // DoubleBuffer; the input array is *not* written. With overwriting
2912
+ // disallowed, `DispatchRadixSort::Dispatch` allocates temporary storage
2913
+ // and creates a new double-buffer internally.
2914
+ DoubleBuffer<NullType> no_values;
2915
+ DoubleBuffer<KeyT> key_buffers(const_cast<KeyT*>(d_keys_in), d_keys_out);
2916
+ constexpr bool overwrite_allowed = false;
2917
+
2918
+ return DeviceRadixSort::custom_radix_sort<SortOrder::Descending>(
2919
+ check_t{},
2920
+ d_temp_storage,
2921
+ temp_storage_bytes,
2922
+ overwrite_allowed,
2923
+ key_buffers,
2924
+ no_values,
2925
+ static_cast<global_offset_t>(num_items),
2926
+ decomposer,
2927
+ begin_bit,
2928
+ end_bit,
2929
+ stream);
2930
+ }
2931
+
2932
+ //! @rst
2933
+ //! Sorts keys into descending order using :math:`\approx 2N` auxiliary storage.
2934
+ //!
2935
+ //! * The contents of the input data are not altered by the sorting operation.
2936
+ //! * Pointers to contiguous memory must be used; iterators are not currently
2937
+ //! supported.
2938
+ //! * In-place operations are not supported. There must be no overlap between
2939
+ //! any of the provided ranges:
2940
+ //!
2941
+ //! * ``[d_keys_in, d_keys_in + num_items)``
2942
+ //! * ``[d_keys_out, d_keys_out + num_items)``
2943
+ //!
2944
+ //! * @devicestorageNP For sorting using only :math:`O(P)` temporary storage, see
2945
+ //! the sorting interface using DoubleBuffer wrappers below.
2946
+ //! * @devicestorage
2947
+ //!
2948
+ //! Snippet
2949
+ //! ==========================================================================
2950
+ //!
2951
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
2952
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
2953
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
2954
+ //! tuple of references to relevant members of the key.
2955
+ //!
2956
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
2957
+ //! :language: c++
2958
+ //! :dedent:
2959
+ //! :start-after: example-begin custom-type
2960
+ //! :end-before: example-end custom-type
2961
+ //!
2962
+ //! The following snippet shows how to sort an array of ``custom_t`` objects
2963
+ //! using ``cub::DeviceRadixSort::SortKeysDescending``:
2964
+ //!
2965
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
2966
+ //! :language: c++
2967
+ //! :dedent:
2968
+ //! :start-after: example-begin keys-descending
2969
+ //! :end-before: example-end keys-descending
2970
+ //!
2971
+ //! @endrst
2972
+ //!
2973
+ //! @tparam KeyT
2974
+ //! **[inferred]** KeyT type
2975
+ //!
2976
+ //! @tparam NumItemsT
2977
+ //! **[inferred]** Type of num_items
2978
+ //!
2979
+ //! @tparam DecomposerT
2980
+ //! **[inferred]** Type of a callable object responsible for decomposing a
2981
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
2982
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
2983
+ //! The leftmost element of the tuple is considered the most significant.
2984
+ //! The call operator must not modify members of the key.
2985
+ //!
2986
+ //! @param[in] d_temp_storage
2987
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
2988
+ //! required allocation size is written to `temp_storage_bytes` and no work
2989
+ //! is done.
2990
+ //!
2991
+ //! @param[in,out] temp_storage_bytes
2992
+ //! Reference to size in bytes of `d_temp_storage` allocation
2993
+ //!
2994
+ //! @param[in] d_keys_in
2995
+ //! Pointer to the input data of key data to sort
2996
+ //!
2997
+ //! @param[out] d_keys_out
2998
+ //! Pointer to the sorted output sequence of key data
2999
+ //!
3000
+ //! @param[in] num_items
3001
+ //! Number of items to sort
3002
+ //!
3003
+ //! @param decomposer
3004
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
3005
+ //! references to its constituent arithmetic types. The leftmost element of
3006
+ //! the tuple is considered the most significant. The call operator must not
3007
+ //! modify members of the key.
3008
+ //!
3009
+ //! @param[in] stream
3010
+ //! **[optional]** CUDA stream to launch kernels within.
3011
+ //! Default is stream<sub>0</sub>.
3012
+ template <typename KeyT, typename NumItemsT, typename DecomposerT>
3013
+ CUB_RUNTIME_FUNCTION static //
3014
+ ::cuda::std::enable_if_t< //
3015
+ !::cuda::std::is_convertible_v<DecomposerT, int>, //
3016
+ cudaError_t>
3017
+ SortKeysDescending(
3018
+ void* d_temp_storage,
3019
+ size_t& temp_storage_bytes,
3020
+ const KeyT* d_keys_in,
3021
+ KeyT* d_keys_out,
3022
+ NumItemsT num_items,
3023
+ DecomposerT decomposer,
3024
+ cudaStream_t stream = 0)
3025
+ {
3026
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());
3027
+
3028
+ using check_t = detail::radix::decomposer_check_t<KeyT, DecomposerT>;
3029
+ static_assert(check_t::value,
3030
+ "DecomposerT must be a callable object returning a tuple of references to "
3031
+ "arithmetic types");
3032
+
3033
+ // Offset type for global indexing, selected from the type of `num_items`.
3034
+ using global_offset_t = detail::choose_offset_t<NumItemsT>;
3035
+
3036
+ // The const qualifier on the input is dropped only to build the
3037
+ // DoubleBuffer; the input array is *not* written. With overwriting
3038
+ // disallowed, `DispatchRadixSort::Dispatch` allocates temporary storage
3039
+ // and creates a new double-buffer internally.
3040
+ DoubleBuffer<NullType> no_values;
3041
+ DoubleBuffer<KeyT> key_buffers(const_cast<KeyT*>(d_keys_in), d_keys_out);
3042
+ constexpr bool overwrite_allowed = false;
3043
+
3044
+ return DeviceRadixSort::custom_radix_sort<SortOrder::Descending>(
3045
+ check_t{},
3046
+ d_temp_storage,
3047
+ temp_storage_bytes,
3048
+ overwrite_allowed,
3049
+ key_buffers,
3050
+ no_values,
3051
+ static_cast<global_offset_t>(num_items),
3052
+ decomposer,
3053
+ stream);
3054
+ }
3055
+
3056
+ //! @brief Sorts keys into descending order.
3057
+ //! (`~N` auxiliary storage required).
3058
+ //!
3059
+ //! @par
3060
+ //! - The sorting operation is given a pair of key buffers managed by a
3061
+ //! DoubleBuffer structure that indicates which of the two buffers is
3062
+ //! "current" (and thus contains the input data to be sorted).
3063
+ //! - The contents of both buffers may be altered by the sorting operation.
3064
+ //! - In-place operations are not supported. There must be no overlap between
3065
+ //! any of the provided ranges:
3066
+ //! - `[d_keys.Current(), d_keys.Current() + num_items)`
3067
+ //! - `[d_keys.Alternate(), d_keys.Alternate() + num_items)`
3068
+ //! - Upon completion, the sorting operation will update the "current"
3069
+ //! indicator within the DoubleBuffer wrapper to reference which of the two
3070
+ //! buffers now contains the sorted output sequence (a function of the
3071
+ //! number of key bits specified and the targeted device architecture).
3072
+ //! - An optional bit subrange `[begin_bit, end_bit)` of differentiating key
3073
+ //! bits can be specified. This can reduce overall sorting overhead and
3074
+ //! yield a corresponding performance improvement.
3075
+ //! - @devicestorageP
3076
+ //! - @devicestorage
3077
+ //!
3078
+ //! @par Performance
3079
+ //! Performance is similar to DeviceRadixSort::SortKeys.
3080
+ //!
3081
+ //! @par Snippet
3082
+ //! The code snippet below illustrates the sorting of a device vector of `int` keys.
3083
+ //! @par
3084
+ //! @code
3085
+ //! #include <cub/cub.cuh>
3086
+ //! // or equivalently <cub/device/device_radix_sort.cuh>
3087
+ //!
3088
+ //! // Declare, allocate, and initialize device-accessible pointers
3089
+ //! // for sorting data
3090
+ //! int num_items; // e.g., 7
3091
+ //! int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9]
3092
+ //! int *d_key_alt_buf; // e.g., [ ... ]
3093
+ //! ...
3094
+ //!
3095
+ //! // Create a DoubleBuffer to wrap the pair of device pointers
3096
+ //! cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
3097
+ //!
3098
+ //! // Determine temporary device storage requirements
3099
+ //! void *d_temp_storage = nullptr;
3100
+ //! size_t temp_storage_bytes = 0;
3101
+ //! cub::DeviceRadixSort::SortKeysDescending(
3102
+ //! d_temp_storage, temp_storage_bytes, d_keys, num_items);
3103
+ //!
3104
+ //! // Allocate temporary storage
3105
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
3106
+ //!
3107
+ //! // Run sorting operation
3108
+ //! cub::DeviceRadixSort::SortKeysDescending(
3109
+ //! d_temp_storage, temp_storage_bytes, d_keys, num_items);
3110
+ //!
3111
+ //! // d_keys.Current() <-- [9, 8, 7, 6, 5, 3, 0]
3112
+ //! @endcode
3113
+ //!
3114
+ //! @tparam KeyT
3115
+ //! **[inferred]** KeyT type
3116
+ //!
3117
+ //! @tparam NumItemsT
3118
+ //! **[inferred]** Type of num_items
3119
+ //!
3120
+ //! @param[in] d_temp_storage
3121
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
3122
+ //! required allocation size is written to `temp_storage_bytes` and no work
3123
+ //! is done.
3124
+ //!
3125
+ //! @param[in,out] temp_storage_bytes
3126
+ //! Reference to size in bytes of `d_temp_storage` allocation
3127
+ //!
3128
+ //! @param[in,out] d_keys
3129
+ //! Reference to the double-buffer of keys whose "current" device-accessible
3130
+ //! buffer contains the unsorted input keys and, upon return, is updated to
3131
+ //! point to the sorted output keys
3132
+ //!
3133
+ //! @param[in] num_items
3134
+ //! Number of items to sort
3135
+ //!
3136
+ //! @param[in] begin_bit
3137
+ //! **[optional]** The least-significant bit index (inclusive) needed for
3138
+ //! key comparison
3139
+ //!
3140
+ //! @param[in] end_bit
3141
+ //! **[optional]** The most-significant bit index (exclusive) needed for key
3142
+ //! comparison (e.g., `sizeof(unsigned int) * 8`)
3143
+ //!
3144
+ //! @param[in] stream
3145
+ //! **[optional]** CUDA stream to launch kernels within.
3146
+ //! Default is stream<sub>0</sub>.
3147
+ template <typename KeyT, typename NumItemsT>
3148
+ CUB_RUNTIME_FUNCTION static cudaError_t SortKeysDescending(
3149
+ void* d_temp_storage,
3150
+ size_t& temp_storage_bytes,
3151
+ DoubleBuffer<KeyT>& d_keys,
3152
+ NumItemsT num_items,
3153
+ int begin_bit = 0,
3154
+ int end_bit = sizeof(KeyT) * 8,
3155
+ cudaStream_t stream = 0)
3156
+ {
3157
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());
3158
+
3159
+ // Offset type for global indexing, selected from the type of `num_items`.
3160
+ using offset_t = detail::choose_offset_t<NumItemsT>;
3161
+
3162
+ // Keys-only sort: a default-constructed NullType buffer fills the values slot.
3163
+ DoubleBuffer<NullType> no_values;
3164
+
3165
+ constexpr bool overwrite_allowed = true;
3166
+
3167
+ return DispatchRadixSort<SortOrder::Descending, KeyT, NullType, offset_t>::Dispatch(
3168
+ d_temp_storage, temp_storage_bytes, d_keys, no_values, num_items, begin_bit, end_bit, overwrite_allowed, stream);
3169
+ }
3170
+
3171
+ //! @rst
3172
+ //! Sorts keys into descending order using :math:`\approx N` auxiliary storage.
3173
+ //!
3174
+ //! * The sorting operation is given a pair of key buffers managed by a
3175
+ //! DoubleBuffer structure that indicates which of the two buffers is
3176
+ //! "current" (and thus contains the input data to be sorted).
3177
+ //! * The contents of both buffers may be altered by the sorting operation.
3178
+ //! * In-place operations are not supported. There must be no overlap between
3179
+ //! any of the provided ranges:
3180
+ //!
3181
+ //! * ``[d_keys.Current(), d_keys.Current() + num_items)``
3182
+ //! * ``[d_keys.Alternate(), d_keys.Alternate() + num_items)``
3183
+ //!
3184
+ //! * Upon completion, the sorting operation will update the "current"
3185
+ //! indicator within the DoubleBuffer wrapper to reference which of the two
3186
+ //! buffers now contains the sorted output sequence (a function of the
3187
+ //! number of key bits specified and the targeted device architecture).
3188
+ //! * @devicestorageP
3189
+ //! * @devicestorage
3190
+ //!
3191
+ //! Snippet
3192
+ //! ==========================================================================
3193
+ //!
3194
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
3195
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
3196
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
3197
+ //! tuple of references to relevant members of the key.
3198
+ //!
3199
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
3200
+ //! :language: c++
3201
+ //! :dedent:
3202
+ //! :start-after: example-begin custom-type
3203
+ //! :end-before: example-end custom-type
3204
+ //!
3205
+ //! The following snippet shows how to sort an array of ``custom_t`` objects
3206
+ //! using ``cub::DeviceRadixSort::SortKeysDescending``:
3207
+ //!
3208
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
3209
+ //! :language: c++
3210
+ //! :dedent:
3211
+ //! :start-after: example-begin keys-descending-db
3212
+ //! :end-before: example-end keys-descending-db
3213
+ //!
3214
+ //! @endrst
3215
+ //!
3216
+ //! @tparam KeyT
3217
+ //! **[inferred]** KeyT type
3218
+ //!
3219
+ //! @tparam NumItemsT
3220
+ //! **[inferred]** Type of num_items
3221
+ //!
3222
+ //! @tparam DecomposerT
3223
+ //! **[inferred]** Type of a callable object responsible for decomposing a
3224
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
3225
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
3226
+ //! The leftmost element of the tuple is considered the most significant.
3227
+ //! The call operator must not modify members of the key.
3228
+ //!
3229
+ //! @param[in] d_temp_storage
3230
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
3231
+ //! required allocation size is written to `temp_storage_bytes` and no work
3232
+ //! is done.
3233
+ //!
3234
+ //! @param[in,out] temp_storage_bytes
3235
+ //! Reference to size in bytes of `d_temp_storage` allocation
3236
+ //!
3237
+ //! @param[in,out] d_keys
3238
+ //! Reference to the double-buffer of keys whose "current" device-accessible
3239
+ //! buffer contains the unsorted input keys and, upon return, is updated to
3240
+ //! point to the sorted output keys
3241
+ //!
3242
+ //! @param[in] num_items
3243
+ //! Number of items to sort
3244
+ //!
3245
+ //! @param decomposer
3246
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
3247
+ //! references to its constituent arithmetic types. The leftmost element of
3248
+ //! the tuple is considered the most significant. The call operator must not
3249
+ //! modify members of the key.
3250
+ //!
3251
+ //! @param[in] stream
3252
+ //! **[optional]** CUDA stream to launch kernels within.
3253
+ //! Default is stream<sub>0</sub>.
3254
+ template <typename KeyT, typename NumItemsT, typename DecomposerT>
3255
+ CUB_RUNTIME_FUNCTION static //
3256
+ ::cuda::std::enable_if_t< //
3257
+ !::cuda::std::is_convertible_v<DecomposerT, int>, //
3258
+ cudaError_t>
3259
+ SortKeysDescending(
3260
+ void* d_temp_storage,
3261
+ size_t& temp_storage_bytes,
3262
+ DoubleBuffer<KeyT>& d_keys,
3263
+ NumItemsT num_items,
3264
+ DecomposerT decomposer,
3265
+ cudaStream_t stream = 0)
3266
+ {
3267
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());
3268
+
3269
+ using check_t = detail::radix::decomposer_check_t<KeyT, DecomposerT>;
3270
+ static_assert(check_t::value,
3271
+ "DecomposerT must be a callable object returning a tuple of references to "
3272
+ "arithmetic types");
3273
+
3274
+ // Offset type for global indexing, selected from the type of `num_items`.
3275
+ using global_offset_t = detail::choose_offset_t<NumItemsT>;
3276
+
3277
+ DoubleBuffer<NullType> no_values;
3278
+ constexpr bool overwrite_allowed = true;
3279
+
3280
+ return DeviceRadixSort::custom_radix_sort<SortOrder::Descending>(
3281
+ check_t{},
3282
+ d_temp_storage,
3283
+ temp_storage_bytes,
3284
+ overwrite_allowed,
3285
+ d_keys,
3286
+ no_values,
3287
+ static_cast<global_offset_t>(num_items),
3288
+ decomposer,
3289
+ stream);
3290
+ }
3291
+
3292
+ //! @rst
3293
+ //! Sorts keys into descending order using :math:`\approx N` auxiliary storage.
3294
+ //!
3295
+ //! * The sorting operation is given a pair of key buffers managed by a
3296
+ //! DoubleBuffer structure that indicates which of the two buffers is
3297
+ //! "current" (and thus contains the input data to be sorted).
3298
+ //! * The contents of both buffers may be altered by the sorting operation.
3299
+ //! * In-place operations are not supported. There must be no overlap between
3300
+ //! any of the provided ranges:
3301
+ //!
3302
+ //! * ``[d_keys.Current(), d_keys.Current() + num_items)``
3303
+ //! * ``[d_keys.Alternate(), d_keys.Alternate() + num_items)``
3304
+ //!
3305
+ //! * A bit subrange ``[begin_bit, end_bit)`` is provided to specify
3306
+ //! differentiating key bits. This can reduce overall sorting overhead and
3307
+ //! yield a corresponding performance improvement.
3308
+ //! * Upon completion, the sorting operation will update the "current"
3309
+ //! indicator within the DoubleBuffer wrapper to reference which of the two
3310
+ //! buffers now contains the sorted output sequence (a function of the
3311
+ //! number of key bits specified and the targeted device architecture).
3312
+ //! * @devicestorageP
3313
+ //! * @devicestorage
3314
+ //!
3315
+ //! Snippet
3316
+ //! ==========================================================================
3317
+ //!
3318
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
3319
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
3320
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
3321
+ //! tuple of references to relevant members of the key.
3322
+ //!
3323
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
3324
+ //! :language: c++
3325
+ //! :dedent:
3326
+ //! :start-after: example-begin custom-type
3327
+ //! :end-before: example-end custom-type
3328
+ //!
3329
+ //! The following snippet shows how to sort an array of ``custom_t`` objects
3330
+ //! using ``cub::DeviceRadixSort::SortKeysDescending``:
3331
+ //!
3332
+ //! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
3333
+ //! :language: c++
3334
+ //! :dedent:
3335
+ //! :start-after: example-begin keys-descending-bits-db
3336
+ //! :end-before: example-end keys-descending-bits-db
3337
+ //!
3338
+ //! @endrst
3339
+ //!
3340
+ //! @tparam KeyT
3341
+ //! **[inferred]** KeyT type
3342
+ //!
3343
+ //! @tparam NumItemsT
3344
+ //! **[inferred]** Type of num_items
3345
+ //!
3346
+ //! @tparam DecomposerT
3347
+ //! **[inferred]** Type of a callable object responsible for decomposing a
3348
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
3349
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
3350
+ //! The leftmost element of the tuple is considered the most significant.
3351
+ //! The call operator must not modify members of the key.
3352
+ //!
3353
+ //! @param[in] d_temp_storage
3354
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
3355
+ //! required allocation size is written to `temp_storage_bytes` and no work
3356
+ //! is done.
3357
+ //!
3358
+ //! @param[in,out] temp_storage_bytes
3359
+ //! Reference to size in bytes of `d_temp_storage` allocation
3360
+ //!
3361
+ //! @param[in,out] d_keys
3362
+ //! Reference to the double-buffer of keys whose "current" device-accessible
3363
+ //! buffer contains the unsorted input keys and, upon return, is updated to
3364
+ //! point to the sorted output keys
3365
+ //!
3366
+ //! @param[in] num_items
3367
+ //! Number of items to sort
3368
+ //!
3369
+ //! @param decomposer
3370
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
3371
+ //! references to its constituent arithmetic types. The leftmost element of
3372
+ //! the tuple is considered the most significant. The call operator must not
3373
+ //! modify members of the key.
3374
+ //!
3375
+ //! @param[in] begin_bit
3376
+ //! The least-significant bit index (inclusive) needed for
3377
+ //! key comparison
3378
+ //!
3379
+ //! @param[in] end_bit
3380
+ //! The most-significant bit index (exclusive) needed for key
3381
+ //! comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`)
3382
+ //!
3383
+ //! @param[in] stream
3384
+ //! **[optional]** CUDA stream to launch kernels within.
3385
+ //! Default is stream<sub>0</sub>.
3386
+ template <typename KeyT, typename NumItemsT, typename DecomposerT>
3387
+ CUB_RUNTIME_FUNCTION static //
3388
+ ::cuda::std::enable_if_t< //
3389
+ !::cuda::std::is_convertible_v<DecomposerT, int>, //
3390
+ cudaError_t>
3391
+ SortKeysDescending(
3392
+ void* d_temp_storage,
3393
+ size_t& temp_storage_bytes,
3394
+ DoubleBuffer<KeyT>& d_keys,
3395
+ NumItemsT num_items,
3396
+ DecomposerT decomposer,
3397
+ int begin_bit,
3398
+ int end_bit,
3399
+ cudaStream_t stream = 0)
3400
+ {
3401
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());
3402
+
3403
+ using check_t = detail::radix::decomposer_check_t<KeyT, DecomposerT>;
3404
+ static_assert(check_t::value,
3405
+ "DecomposerT must be a callable object returning a tuple of references to "
3406
+ "arithmetic types");
3407
+
3408
+ // Offset type for global indexing, selected from the type of `num_items`.
3409
+ using global_offset_t = detail::choose_offset_t<NumItemsT>;
3410
+
3411
+ DoubleBuffer<NullType> no_values;
3412
+ constexpr bool overwrite_allowed = true;
3413
+
3414
+ return DeviceRadixSort::custom_radix_sort<SortOrder::Descending>(
3415
+ check_t{},
3416
+ d_temp_storage,
3417
+ temp_storage_bytes,
3418
+ overwrite_allowed,
3419
+ d_keys,
3420
+ no_values,
3421
+ static_cast<global_offset_t>(num_items),
3422
+ decomposer,
3423
+ begin_bit,
3424
+ end_bit,
3425
+ stream);
3426
+ }
3427
+
3428
+ //! @} end member group
3429
+ };
3430
+
3431
+ CUB_NAMESPACE_END