cuda-cccl 0.4.3__cp312-cp312-manylinux_2_26_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2024):
  1. cuda/cccl/__init__.py +27 -0
  2. cuda/cccl/_cuda_version_utils.py +24 -0
  3. cuda/cccl/cooperative/__init__.py +9 -0
  4. cuda/cccl/cooperative/experimental/__init__.py +24 -0
  5. cuda/cccl/headers/__init__.py +7 -0
  6. cuda/cccl/headers/include/__init__.py +1 -0
  7. cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +233 -0
  8. cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +1158 -0
  9. cuda/cccl/headers/include/cub/agent/agent_for.cuh +55 -0
  10. cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +699 -0
  11. cuda/cccl/headers/include/cub/agent/agent_merge.cuh +365 -0
  12. cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +721 -0
  13. cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +756 -0
  14. cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +277 -0
  15. cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +715 -0
  16. cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +546 -0
  17. cuda/cccl/headers/include/cub/agent/agent_reduce.cuh +592 -0
  18. cuda/cccl/headers/include/cub/agent/agent_reduce_by_key.cuh +780 -0
  19. cuda/cccl/headers/include/cub/agent/agent_rle.cuh +1092 -0
  20. cuda/cccl/headers/include/cub/agent/agent_scan.cuh +564 -0
  21. cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +448 -0
  22. cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +263 -0
  23. cuda/cccl/headers/include/cub/agent/agent_segmented_scan.cuh +292 -0
  24. cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +1090 -0
  25. cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +346 -0
  26. cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +584 -0
  27. cuda/cccl/headers/include/cub/agent/agent_topk.cuh +762 -0
  28. cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +599 -0
  29. cuda/cccl/headers/include/cub/agent/single_pass_scan_operators.cuh +1384 -0
  30. cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +939 -0
  31. cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +1200 -0
  32. cuda/cccl/headers/include/cub/block/block_exchange.cuh +1279 -0
  33. cuda/cccl/headers/include/cub/block/block_histogram.cuh +396 -0
  34. cuda/cccl/headers/include/cub/block/block_load.cuh +1269 -0
  35. cuda/cccl/headers/include/cub/block/block_load_to_shared.cuh +437 -0
  36. cuda/cccl/headers/include/cub/block/block_merge_sort.cuh +771 -0
  37. cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +1215 -0
  38. cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +2129 -0
  39. cuda/cccl/headers/include/cub/block/block_raking_layout.cuh +124 -0
  40. cuda/cccl/headers/include/cub/block/block_reduce.cuh +661 -0
  41. cuda/cccl/headers/include/cub/block/block_run_length_decode.cuh +406 -0
  42. cuda/cccl/headers/include/cub/block/block_scan.cuh +2168 -0
  43. cuda/cccl/headers/include/cub/block/block_shuffle.cuh +319 -0
  44. cuda/cccl/headers/include/cub/block/block_store.cuh +1238 -0
  45. cuda/cccl/headers/include/cub/block/radix_rank_sort_operations.cuh +597 -0
  46. cuda/cccl/headers/include/cub/block/specializations/block_histogram_atomic.cuh +62 -0
  47. cuda/cccl/headers/include/cub/block/specializations/block_histogram_sort.cuh +209 -0
  48. cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking.cuh +230 -0
  49. cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking_commutative_only.cuh +207 -0
  50. cuda/cccl/headers/include/cub/block/specializations/block_reduce_warp_reductions.cuh +257 -0
  51. cuda/cccl/headers/include/cub/block/specializations/block_scan_raking.cuh +766 -0
  52. cuda/cccl/headers/include/cub/block/specializations/block_scan_warp_scans.cuh +514 -0
  53. cuda/cccl/headers/include/cub/config.cuh +29 -0
  54. cuda/cccl/headers/include/cub/cub.cuh +96 -0
  55. cuda/cccl/headers/include/cub/detail/array_utils.cuh +54 -0
  56. cuda/cccl/headers/include/cub/detail/choose_offset.cuh +135 -0
  57. cuda/cccl/headers/include/cub/detail/detect_cuda_runtime.cuh +50 -0
  58. cuda/cccl/headers/include/cub/detail/device_double_buffer.cuh +94 -0
  59. cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +60 -0
  60. cuda/cccl/headers/include/cub/detail/env_dispatch.cuh +87 -0
  61. cuda/cccl/headers/include/cub/detail/fast_modulo_division.cuh +227 -0
  62. cuda/cccl/headers/include/cub/detail/integer_utils.cuh +87 -0
  63. cuda/cccl/headers/include/cub/detail/launcher/cuda_driver.cuh +149 -0
  64. cuda/cccl/headers/include/cub/detail/launcher/cuda_runtime.cuh +103 -0
  65. cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +112 -0
  66. cuda/cccl/headers/include/cub/detail/ptx-json/array.cuh +41 -0
  67. cuda/cccl/headers/include/cub/detail/ptx-json/json.cuh +39 -0
  68. cuda/cccl/headers/include/cub/detail/ptx-json/object.cuh +71 -0
  69. cuda/cccl/headers/include/cub/detail/ptx-json/string.cuh +79 -0
  70. cuda/cccl/headers/include/cub/detail/ptx-json/value.cuh +95 -0
  71. cuda/cccl/headers/include/cub/detail/ptx-json-parser.cuh +39 -0
  72. cuda/cccl/headers/include/cub/detail/rfa.cuh +706 -0
  73. cuda/cccl/headers/include/cub/detail/strong_load.cuh +163 -0
  74. cuda/cccl/headers/include/cub/detail/strong_store.cuh +194 -0
  75. cuda/cccl/headers/include/cub/detail/temporary_storage.cuh +377 -0
  76. cuda/cccl/headers/include/cub/detail/type_traits.cuh +185 -0
  77. cuda/cccl/headers/include/cub/detail/uninitialized_copy.cuh +48 -0
  78. cuda/cccl/headers/include/cub/detail/unsafe_bitcast.cuh +33 -0
  79. cuda/cccl/headers/include/cub/device/device_adjacent_difference.cuh +572 -0
  80. cuda/cccl/headers/include/cub/device/device_copy.cuh +276 -0
  81. cuda/cccl/headers/include/cub/device/device_for.cuh +1061 -0
  82. cuda/cccl/headers/include/cub/device/device_histogram.cuh +1485 -0
  83. cuda/cccl/headers/include/cub/device/device_memcpy.cuh +171 -0
  84. cuda/cccl/headers/include/cub/device/device_merge.cuh +203 -0
  85. cuda/cccl/headers/include/cub/device/device_merge_sort.cuh +955 -0
  86. cuda/cccl/headers/include/cub/device/device_partition.cuh +644 -0
  87. cuda/cccl/headers/include/cub/device/device_radix_sort.cuh +3413 -0
  88. cuda/cccl/headers/include/cub/device/device_reduce.cuh +2303 -0
  89. cuda/cccl/headers/include/cub/device/device_run_length_encode.cuh +346 -0
  90. cuda/cccl/headers/include/cub/device/device_scan.cuh +2152 -0
  91. cuda/cccl/headers/include/cub/device/device_segmented_radix_sort.cuh +1472 -0
  92. cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +1635 -0
  93. cuda/cccl/headers/include/cub/device/device_segmented_scan.cuh +1398 -0
  94. cuda/cccl/headers/include/cub/device/device_segmented_sort.cuh +2787 -0
  95. cuda/cccl/headers/include/cub/device/device_select.cuh +1204 -0
  96. cuda/cccl/headers/include/cub/device/device_topk.cuh +521 -0
  97. cuda/cccl/headers/include/cub/device/device_transform.cuh +666 -0
  98. cuda/cccl/headers/include/cub/device/dispatch/dispatch_adjacent_difference.cuh +289 -0
  99. cuda/cccl/headers/include/cub/device/dispatch/dispatch_batch_memcpy.cuh +694 -0
  100. cuda/cccl/headers/include/cub/device/dispatch/dispatch_common.cuh +50 -0
  101. cuda/cccl/headers/include/cub/device/dispatch/dispatch_copy_mdspan.cuh +77 -0
  102. cuda/cccl/headers/include/cub/device/dispatch/dispatch_fixed_size_segmented_reduce.cuh +349 -0
  103. cuda/cccl/headers/include/cub/device/dispatch/dispatch_for.cuh +160 -0
  104. cuda/cccl/headers/include/cub/device/dispatch/dispatch_histogram.cuh +1849 -0
  105. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +317 -0
  106. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge_sort.cuh +429 -0
  107. cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +1066 -0
  108. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +830 -0
  109. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_by_key.cuh +629 -0
  110. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +479 -0
  111. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +256 -0
  112. cuda/cccl/headers/include/cub/device/dispatch/dispatch_rle.cuh +603 -0
  113. cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan.cuh +447 -0
  114. cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan_by_key.cuh +545 -0
  115. cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_radix_sort.cuh +638 -0
  116. cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_reduce.cuh +410 -0
  117. cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_scan.cuh +278 -0
  118. cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +899 -0
  119. cuda/cccl/headers/include/cub/device/dispatch/dispatch_select_if.cuh +831 -0
  120. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +321 -0
  121. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +454 -0
  122. cuda/cccl/headers/include/cub/device/dispatch/dispatch_three_way_partition.cuh +364 -0
  123. cuda/cccl/headers/include/cub/device/dispatch/dispatch_topk.cuh +626 -0
  124. cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +527 -0
  125. cuda/cccl/headers/include/cub/device/dispatch/dispatch_unique_by_key.cuh +472 -0
  126. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_for_each.cuh +259 -0
  127. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_histogram.cuh +669 -0
  128. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_merge_sort.cuh +332 -0
  129. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_radix_sort.cuh +553 -0
  130. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_reduce.cuh +584 -0
  131. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_scan.cuh +178 -0
  132. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_segmented_radix_sort.cuh +262 -0
  133. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_segmented_reduce.cuh +295 -0
  134. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_segmented_scan.cuh +77 -0
  135. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_segmented_sort.cuh +521 -0
  136. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_three_way_partition.cuh +200 -0
  137. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_transform.cuh +1049 -0
  138. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_unique_by_key.cuh +175 -0
  139. cuda/cccl/headers/include/cub/device/dispatch/tuning/common.cuh +97 -0
  140. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +43 -0
  141. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +94 -0
  142. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +34 -0
  143. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +268 -0
  144. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +108 -0
  145. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +100 -0
  146. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +1045 -0
  147. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce.cuh +681 -0
  148. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +918 -0
  149. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +647 -0
  150. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +571 -0
  151. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +986 -0
  152. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_scan.cuh +108 -0
  153. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +373 -0
  154. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_select_if.cuh +1563 -0
  155. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +415 -0
  156. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_topk.cuh +84 -0
  157. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +476 -0
  158. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +858 -0
  159. cuda/cccl/headers/include/cub/grid/grid_even_share.cuh +203 -0
  160. cuda/cccl/headers/include/cub/grid/grid_mapping.cuh +82 -0
  161. cuda/cccl/headers/include/cub/grid/grid_queue.cuh +175 -0
  162. cuda/cccl/headers/include/cub/iterator/arg_index_input_iterator.cuh +230 -0
  163. cuda/cccl/headers/include/cub/iterator/cache_modified_input_iterator.cuh +235 -0
  164. cuda/cccl/headers/include/cub/iterator/cache_modified_output_iterator.cuh +226 -0
  165. cuda/cccl/headers/include/cub/iterator/tex_obj_input_iterator.cuh +293 -0
  166. cuda/cccl/headers/include/cub/thread/thread_load.cuh +353 -0
  167. cuda/cccl/headers/include/cub/thread/thread_operators.cuh +664 -0
  168. cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +525 -0
  169. cuda/cccl/headers/include/cub/thread/thread_scan.cuh +472 -0
  170. cuda/cccl/headers/include/cub/thread/thread_search.cuh +214 -0
  171. cuda/cccl/headers/include/cub/thread/thread_simd.cuh +456 -0
  172. cuda/cccl/headers/include/cub/thread/thread_sort.cuh +78 -0
  173. cuda/cccl/headers/include/cub/thread/thread_store.cuh +341 -0
  174. cuda/cccl/headers/include/cub/util_allocator.cuh +897 -0
  175. cuda/cccl/headers/include/cub/util_arch.cuh +176 -0
  176. cuda/cccl/headers/include/cub/util_cpp_dialect.cuh +71 -0
  177. cuda/cccl/headers/include/cub/util_debug.cuh +183 -0
  178. cuda/cccl/headers/include/cub/util_device.cuh +838 -0
  179. cuda/cccl/headers/include/cub/util_macro.cuh +73 -0
  180. cuda/cccl/headers/include/cub/util_math.cuh +92 -0
  181. cuda/cccl/headers/include/cub/util_namespace.cuh +152 -0
  182. cuda/cccl/headers/include/cub/util_policy_wrapper_t.cuh +31 -0
  183. cuda/cccl/headers/include/cub/util_ptx.cuh +483 -0
  184. cuda/cccl/headers/include/cub/util_temporary_storage.cuh +93 -0
  185. cuda/cccl/headers/include/cub/util_type.cuh +1084 -0
  186. cuda/cccl/headers/include/cub/util_vsmem.cuh +227 -0
  187. cuda/cccl/headers/include/cub/version.cuh +65 -0
  188. cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_shfl.cuh +304 -0
  189. cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_smem.cuh +152 -0
  190. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +567 -0
  191. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +378 -0
  192. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +922 -0
  193. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_smem.cuh +691 -0
  194. cuda/cccl/headers/include/cub/warp/warp_exchange.cuh +381 -0
  195. cuda/cccl/headers/include/cub/warp/warp_load.cuh +591 -0
  196. cuda/cccl/headers/include/cub/warp/warp_merge_sort.cuh +145 -0
  197. cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +810 -0
  198. cuda/cccl/headers/include/cub/warp/warp_scan.cuh +1863 -0
  199. cuda/cccl/headers/include/cub/warp/warp_store.cuh +498 -0
  200. cuda/cccl/headers/include/cub/warp/warp_utils.cuh +59 -0
  201. cuda/cccl/headers/include/cuda/__algorithm/common.h +68 -0
  202. cuda/cccl/headers/include/cuda/__algorithm/copy.h +199 -0
  203. cuda/cccl/headers/include/cuda/__algorithm/fill.h +110 -0
  204. cuda/cccl/headers/include/cuda/__annotated_ptr/access_property.h +165 -0
  205. cuda/cccl/headers/include/cuda/__annotated_ptr/access_property_encoding.h +171 -0
  206. cuda/cccl/headers/include/cuda/__annotated_ptr/annotated_ptr.h +216 -0
  207. cuda/cccl/headers/include/cuda/__annotated_ptr/annotated_ptr_base.h +100 -0
  208. cuda/cccl/headers/include/cuda/__annotated_ptr/apply_access_property.h +83 -0
  209. cuda/cccl/headers/include/cuda/__annotated_ptr/associate_access_property.h +128 -0
  210. cuda/cccl/headers/include/cuda/__annotated_ptr/createpolicy.h +210 -0
  211. cuda/cccl/headers/include/cuda/__atomic/atomic.h +145 -0
  212. cuda/cccl/headers/include/cuda/__barrier/async_contract_fulfillment.h +39 -0
  213. cuda/cccl/headers/include/cuda/__barrier/barrier.h +65 -0
  214. cuda/cccl/headers/include/cuda/__barrier/barrier_arrive_tx.h +101 -0
  215. cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +528 -0
  216. cuda/cccl/headers/include/cuda/__barrier/barrier_expect_tx.h +73 -0
  217. cuda/cccl/headers/include/cuda/__barrier/barrier_thread_scope.h +60 -0
  218. cuda/cccl/headers/include/cuda/__bit/bit_reverse.h +198 -0
  219. cuda/cccl/headers/include/cuda/__bit/bitfield.h +121 -0
  220. cuda/cccl/headers/include/cuda/__bit/bitmask.h +89 -0
  221. cuda/cccl/headers/include/cuda/__cccl_config +38 -0
  222. cuda/cccl/headers/include/cuda/__cmath/ceil_div.h +123 -0
  223. cuda/cccl/headers/include/cuda/__cmath/fast_modulo_division.h +178 -0
  224. cuda/cccl/headers/include/cuda/__cmath/ilog.h +194 -0
  225. cuda/cccl/headers/include/cuda/__cmath/ipow.h +111 -0
  226. cuda/cccl/headers/include/cuda/__cmath/isqrt.h +80 -0
  227. cuda/cccl/headers/include/cuda/__cmath/mul_hi.h +145 -0
  228. cuda/cccl/headers/include/cuda/__cmath/neg.h +47 -0
  229. cuda/cccl/headers/include/cuda/__cmath/pow2.h +74 -0
  230. cuda/cccl/headers/include/cuda/__cmath/round_down.h +102 -0
  231. cuda/cccl/headers/include/cuda/__cmath/round_up.h +104 -0
  232. cuda/cccl/headers/include/cuda/__cmath/sincos.h +134 -0
  233. cuda/cccl/headers/include/cuda/__cmath/uabs.h +57 -0
  234. cuda/cccl/headers/include/cuda/__complex/complex.h +238 -0
  235. cuda/cccl/headers/include/cuda/__complex/get_real_imag.h +89 -0
  236. cuda/cccl/headers/include/cuda/__complex/traits.h +64 -0
  237. cuda/cccl/headers/include/cuda/__complex_ +28 -0
  238. cuda/cccl/headers/include/cuda/__container/buffer.h +891 -0
  239. cuda/cccl/headers/include/cuda/__container/heterogeneous_iterator.h +436 -0
  240. cuda/cccl/headers/include/cuda/__container/uninitialized_async_buffer.h +416 -0
  241. cuda/cccl/headers/include/cuda/__device/all_devices.h +140 -0
  242. cuda/cccl/headers/include/cuda/__device/arch_id.h +194 -0
  243. cuda/cccl/headers/include/cuda/__device/arch_traits.h +553 -0
  244. cuda/cccl/headers/include/cuda/__device/attributes.h +772 -0
  245. cuda/cccl/headers/include/cuda/__device/compute_capability.h +172 -0
  246. cuda/cccl/headers/include/cuda/__device/device_ref.h +168 -0
  247. cuda/cccl/headers/include/cuda/__device/physical_device.h +178 -0
  248. cuda/cccl/headers/include/cuda/__driver/driver_api.h +1041 -0
  249. cuda/cccl/headers/include/cuda/__event/event.h +171 -0
  250. cuda/cccl/headers/include/cuda/__event/event_ref.h +157 -0
  251. cuda/cccl/headers/include/cuda/__event/timed_event.h +120 -0
  252. cuda/cccl/headers/include/cuda/__execution/determinism.h +89 -0
  253. cuda/cccl/headers/include/cuda/__execution/output_ordering.h +87 -0
  254. cuda/cccl/headers/include/cuda/__execution/policy.h +53 -0
  255. cuda/cccl/headers/include/cuda/__execution/require.h +75 -0
  256. cuda/cccl/headers/include/cuda/__execution/tune.h +70 -0
  257. cuda/cccl/headers/include/cuda/__functional/address_stability.h +131 -0
  258. cuda/cccl/headers/include/cuda/__functional/for_each_canceled.h +321 -0
  259. cuda/cccl/headers/include/cuda/__functional/maximum.h +77 -0
  260. cuda/cccl/headers/include/cuda/__functional/minimum.h +77 -0
  261. cuda/cccl/headers/include/cuda/__functional/minimum_maximum_common.h +52 -0
  262. cuda/cccl/headers/include/cuda/__functional/proclaim_return_type.h +106 -0
  263. cuda/cccl/headers/include/cuda/__fwd/barrier.h +38 -0
  264. cuda/cccl/headers/include/cuda/__fwd/complex.h +48 -0
  265. cuda/cccl/headers/include/cuda/__fwd/devices.h +44 -0
  266. cuda/cccl/headers/include/cuda/__fwd/execution_policy.h +47 -0
  267. cuda/cccl/headers/include/cuda/__fwd/get_stream.h +38 -0
  268. cuda/cccl/headers/include/cuda/__fwd/pipeline.h +37 -0
  269. cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +58 -0
  270. cuda/cccl/headers/include/cuda/__hierarchy/dimensions.h +162 -0
  271. cuda/cccl/headers/include/cuda/__hierarchy/hierarchy_dimensions.h +986 -0
  272. cuda/cccl/headers/include/cuda/__hierarchy/hierarchy_levels.h +494 -0
  273. cuda/cccl/headers/include/cuda/__hierarchy/level_dimensions.h +225 -0
  274. cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +315 -0
  275. cuda/cccl/headers/include/cuda/__iterator/counting_iterator.h +490 -0
  276. cuda/cccl/headers/include/cuda/__iterator/discard_iterator.h +324 -0
  277. cuda/cccl/headers/include/cuda/__iterator/permutation_iterator.h +456 -0
  278. cuda/cccl/headers/include/cuda/__iterator/shuffle_iterator.h +334 -0
  279. cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +418 -0
  280. cuda/cccl/headers/include/cuda/__iterator/tabulate_output_iterator.h +367 -0
  281. cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +528 -0
  282. cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +527 -0
  283. cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +486 -0
  284. cuda/cccl/headers/include/cuda/__iterator/zip_common.h +147 -0
  285. cuda/cccl/headers/include/cuda/__iterator/zip_function.h +114 -0
  286. cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +555 -0
  287. cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +589 -0
  288. cuda/cccl/headers/include/cuda/__latch/latch.h +44 -0
  289. cuda/cccl/headers/include/cuda/__launch/configuration.h +754 -0
  290. cuda/cccl/headers/include/cuda/__launch/host_launch.h +115 -0
  291. cuda/cccl/headers/include/cuda/__launch/launch.h +334 -0
  292. cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +531 -0
  293. cuda/cccl/headers/include/cuda/__mdspan/host_device_mdspan.h +239 -0
  294. cuda/cccl/headers/include/cuda/__mdspan/restrict_accessor.h +152 -0
  295. cuda/cccl/headers/include/cuda/__mdspan/restrict_mdspan.h +118 -0
  296. cuda/cccl/headers/include/cuda/__mdspan/shared_memory_accessor.h +208 -0
  297. cuda/cccl/headers/include/cuda/__mdspan/shared_memory_mdspan.h +129 -0
  298. cuda/cccl/headers/include/cuda/__memcpy_async/check_preconditions.h +79 -0
  299. cuda/cccl/headers/include/cuda/__memcpy_async/completion_mechanism.h +47 -0
  300. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_bulk_shared_global.h +77 -0
  301. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_fallback.h +72 -0
  302. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_shared_global.h +148 -0
  303. cuda/cccl/headers/include/cuda/__memcpy_async/dispatch_memcpy_async.h +165 -0
  304. cuda/cccl/headers/include/cuda/__memcpy_async/elect_one.h +52 -0
  305. cuda/cccl/headers/include/cuda/__memcpy_async/is_local_smem_barrier.h +53 -0
  306. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async.h +179 -0
  307. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_barrier.h +99 -0
  308. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_tx.h +102 -0
  309. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_completion.h +164 -0
  310. cuda/cccl/headers/include/cuda/__memcpy_async/try_get_barrier_handle.h +57 -0
  311. cuda/cccl/headers/include/cuda/__memory/address_space.h +256 -0
  312. cuda/cccl/headers/include/cuda/__memory/align_down.h +77 -0
  313. cuda/cccl/headers/include/cuda/__memory/align_up.h +77 -0
  314. cuda/cccl/headers/include/cuda/__memory/aligned_size.h +61 -0
  315. cuda/cccl/headers/include/cuda/__memory/check_address.h +111 -0
  316. cuda/cccl/headers/include/cuda/__memory/discard_memory.h +64 -0
  317. cuda/cccl/headers/include/cuda/__memory/get_device_address.h +58 -0
  318. cuda/cccl/headers/include/cuda/__memory/is_aligned.h +60 -0
  319. cuda/cccl/headers/include/cuda/__memory/is_pointer_accessible.h +278 -0
  320. cuda/cccl/headers/include/cuda/__memory/ptr_in_range.h +92 -0
  321. cuda/cccl/headers/include/cuda/__memory/ptr_rebind.h +75 -0
  322. cuda/cccl/headers/include/cuda/__memory/ranges_overlap.h +125 -0
  323. cuda/cccl/headers/include/cuda/__memory_pool/device_memory_pool.h +166 -0
  324. cuda/cccl/headers/include/cuda/__memory_pool/managed_memory_pool.h +161 -0
  325. cuda/cccl/headers/include/cuda/__memory_pool/memory_pool_base.h +644 -0
  326. cuda/cccl/headers/include/cuda/__memory_pool/pinned_memory_pool.h +218 -0
  327. cuda/cccl/headers/include/cuda/__memory_resource/any_resource.h +882 -0
  328. cuda/cccl/headers/include/cuda/__memory_resource/get_memory_resource.h +82 -0
  329. cuda/cccl/headers/include/cuda/__memory_resource/get_property.h +153 -0
  330. cuda/cccl/headers/include/cuda/__memory_resource/legacy_managed_memory_resource.h +148 -0
  331. cuda/cccl/headers/include/cuda/__memory_resource/legacy_pinned_memory_resource.h +141 -0
  332. cuda/cccl/headers/include/cuda/__memory_resource/properties.h +130 -0
  333. cuda/cccl/headers/include/cuda/__memory_resource/resource.h +135 -0
  334. cuda/cccl/headers/include/cuda/__memory_resource/shared_resource.h +261 -0
  335. cuda/cccl/headers/include/cuda/__memory_resource/synchronous_resource_adapter.h +136 -0
  336. cuda/cccl/headers/include/cuda/__numeric/add_overflow.h +331 -0
  337. cuda/cccl/headers/include/cuda/__numeric/div_overflow.h +150 -0
  338. cuda/cccl/headers/include/cuda/__numeric/narrow.h +108 -0
  339. cuda/cccl/headers/include/cuda/__numeric/overflow_cast.h +59 -0
  340. cuda/cccl/headers/include/cuda/__numeric/overflow_result.h +43 -0
  341. cuda/cccl/headers/include/cuda/__numeric/sub_overflow.h +359 -0
  342. cuda/cccl/headers/include/cuda/__nvtx/nvtx.h +120 -0
  343. cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +2977 -0
  344. cuda/cccl/headers/include/cuda/__ptx/instructions/barrier_cluster.h +43 -0
  345. cuda/cccl/headers/include/cuda/__ptx/instructions/bfind.h +41 -0
  346. cuda/cccl/headers/include/cuda/__ptx/instructions/bmsk.h +41 -0
  347. cuda/cccl/headers/include/cuda/__ptx/instructions/clusterlaunchcontrol.h +41 -0
  348. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk.h +44 -0
  349. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_commit_group.h +43 -0
  350. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h +45 -0
  351. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_wait_group.h +43 -0
  352. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_mbarrier_arrive.h +42 -0
  353. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_reduce_async_bulk.h +60 -0
  354. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_reduce_async_bulk_tensor.h +43 -0
  355. cuda/cccl/headers/include/cuda/__ptx/instructions/elect_sync.h +41 -0
  356. cuda/cccl/headers/include/cuda/__ptx/instructions/exit.h +41 -0
  357. cuda/cccl/headers/include/cuda/__ptx/instructions/fence.h +49 -0
  358. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/barrier_cluster.h +115 -0
  359. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/bfind.h +190 -0
  360. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/bmsk.h +54 -0
  361. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/clusterlaunchcontrol.h +240 -0
  362. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk.h +245 -0
  363. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.h +25 -0
  364. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.h +52 -0
  365. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.h +977 -0
  366. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_gather_scatter.h +302 -0
  367. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.h +631 -0
  368. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.h +46 -0
  369. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive.h +26 -0
  370. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive_noinc.h +26 -0
  371. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.h +1470 -0
  372. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.h +132 -0
  373. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.h +132 -0
  374. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.h +601 -0
  375. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/elect_sync.h +36 -0
  376. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/exit.h +25 -0
  377. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence.h +208 -0
  378. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.h +31 -0
  379. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_alias.h +25 -0
  380. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_async.h +58 -0
  381. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_async_generic_sync_restrict.h +64 -0
  382. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.h +102 -0
  383. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_sync_restrict.h +64 -0
  384. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/get_sreg.h +949 -0
  385. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/getctarank.h +32 -0
  386. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/ld.h +5542 -0
  387. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive.h +399 -0
  388. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.h +184 -0
  389. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.h +34 -0
  390. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_expect_tx.h +102 -0
  391. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_init.h +27 -0
  392. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_inval.h +26 -0
  393. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.h +143 -0
  394. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.h +144 -0
  395. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.h +286 -0
  396. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.h +290 -0
  397. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_ld_reduce.h +2202 -0
  398. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_red.h +1362 -0
  399. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_st.h +236 -0
  400. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/prmt.h +230 -0
  401. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/red_async.h +460 -0
  402. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/setmaxnreg.h +58 -0
  403. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/shl.h +96 -0
  404. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/shr.h +168 -0
  405. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st.h +1490 -0
  406. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st_async.h +123 -0
  407. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st_bulk.h +31 -0
  408. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_alloc.h +120 -0
  409. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_commit.h +91 -0
  410. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_cp.h +693 -0
  411. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_fence.h +50 -0
  412. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_ld.h +11437 -0
  413. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_mma.h +6513 -0
  414. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_mma_ws.h +6726 -0
  415. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_shift.h +40 -0
  416. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_st.h +4767 -0
  417. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_wait.h +48 -0
  418. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.h +71 -0
  419. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tensormap_replace.h +886 -0
  420. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/trap.h +25 -0
  421. cuda/cccl/headers/include/cuda/__ptx/instructions/get_sreg.h +43 -0
  422. cuda/cccl/headers/include/cuda/__ptx/instructions/getctarank.h +43 -0
  423. cuda/cccl/headers/include/cuda/__ptx/instructions/ld.h +41 -0
  424. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_arrive.h +45 -0
  425. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_expect_tx.h +41 -0
  426. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_init.h +43 -0
  427. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_inval.h +41 -0
  428. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_wait.h +46 -0
  429. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_ld_reduce.h +41 -0
  430. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_red.h +41 -0
  431. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_st.h +41 -0
  432. cuda/cccl/headers/include/cuda/__ptx/instructions/prmt.h +41 -0
  433. cuda/cccl/headers/include/cuda/__ptx/instructions/red_async.h +43 -0
  434. cuda/cccl/headers/include/cuda/__ptx/instructions/setmaxnreg.h +41 -0
  435. cuda/cccl/headers/include/cuda/__ptx/instructions/shfl_sync.h +244 -0
  436. cuda/cccl/headers/include/cuda/__ptx/instructions/shl.h +41 -0
  437. cuda/cccl/headers/include/cuda/__ptx/instructions/shr.h +41 -0
  438. cuda/cccl/headers/include/cuda/__ptx/instructions/st.h +41 -0
  439. cuda/cccl/headers/include/cuda/__ptx/instructions/st_async.h +43 -0
  440. cuda/cccl/headers/include/cuda/__ptx/instructions/st_bulk.h +41 -0
  441. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_alloc.h +41 -0
  442. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_commit.h +41 -0
  443. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_cp.h +41 -0
  444. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_fence.h +41 -0
  445. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_ld.h +41 -0
  446. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_mma.h +41 -0
  447. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_mma_ws.h +41 -0
  448. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_shift.h +41 -0
  449. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_st.h +41 -0
  450. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_wait.h +41 -0
  451. cuda/cccl/headers/include/cuda/__ptx/instructions/tensormap_cp_fenceproxy.h +43 -0
  452. cuda/cccl/headers/include/cuda/__ptx/instructions/tensormap_replace.h +43 -0
  453. cuda/cccl/headers/include/cuda/__ptx/instructions/trap.h +41 -0
  454. cuda/cccl/headers/include/cuda/__ptx/pragmas/enable_smem_spilling.h +47 -0
  455. cuda/cccl/headers/include/cuda/__ptx/ptx_dot_variants.h +230 -0
  456. cuda/cccl/headers/include/cuda/__ptx/ptx_helper_functions.h +178 -0
  457. cuda/cccl/headers/include/cuda/__random/feistel_bijection.h +105 -0
  458. cuda/cccl/headers/include/cuda/__random/pcg_engine.h +398 -0
  459. cuda/cccl/headers/include/cuda/__random/random_bijection.h +88 -0
  460. cuda/cccl/headers/include/cuda/__runtime/api_wrapper.h +62 -0
  461. cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +99 -0
  462. cuda/cccl/headers/include/cuda/__runtime/types.h +41 -0
  463. cuda/cccl/headers/include/cuda/__semaphore/counting_semaphore.h +53 -0
  464. cuda/cccl/headers/include/cuda/__stream/get_stream.h +109 -0
  465. cuda/cccl/headers/include/cuda/__stream/internal_streams.h +49 -0
  466. cuda/cccl/headers/include/cuda/__stream/invalid_stream.h +47 -0
  467. cuda/cccl/headers/include/cuda/__stream/launch_transform.h +193 -0
  468. cuda/cccl/headers/include/cuda/__stream/stream.h +145 -0
  469. cuda/cccl/headers/include/cuda/__stream/stream_ref.h +356 -0
  470. cuda/cccl/headers/include/cuda/__tma/make_tma_descriptor.h +657 -0
  471. cuda/cccl/headers/include/cuda/__type_traits/is_floating_point.h +47 -0
  472. cuda/cccl/headers/include/cuda/__type_traits/is_instantiable_with.h +47 -0
  473. cuda/cccl/headers/include/cuda/__type_traits/is_specialization_of.h +37 -0
  474. cuda/cccl/headers/include/cuda/__type_traits/vector_type.h +355 -0
  475. cuda/cccl/headers/include/cuda/__utility/__basic_any/access.h +88 -0
  476. cuda/cccl/headers/include/cuda/__utility/__basic_any/any_cast.h +83 -0
  477. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_base.h +148 -0
  478. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_from.h +96 -0
  479. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_fwd.h +128 -0
  480. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_ptr.h +304 -0
  481. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_ref.h +337 -0
  482. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_value.h +611 -0
  483. cuda/cccl/headers/include/cuda/__utility/__basic_any/conversions.h +170 -0
  484. cuda/cccl/headers/include/cuda/__utility/__basic_any/dynamic_any_cast.h +147 -0
  485. cuda/cccl/headers/include/cuda/__utility/__basic_any/interfaces.h +359 -0
  486. cuda/cccl/headers/include/cuda/__utility/__basic_any/iset.h +142 -0
  487. cuda/cccl/headers/include/cuda/__utility/__basic_any/overrides.h +64 -0
  488. cuda/cccl/headers/include/cuda/__utility/__basic_any/rtti.h +256 -0
  489. cuda/cccl/headers/include/cuda/__utility/__basic_any/semiregular.h +323 -0
  490. cuda/cccl/headers/include/cuda/__utility/__basic_any/storage.h +79 -0
  491. cuda/cccl/headers/include/cuda/__utility/__basic_any/tagged_ptr.h +58 -0
  492. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtcall.h +162 -0
  493. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_functions.h +183 -0
  494. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_ptrs.h +80 -0
  495. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_tables.h +155 -0
  496. cuda/cccl/headers/include/cuda/__utility/basic_any.h +507 -0
  497. cuda/cccl/headers/include/cuda/__utility/immovable.h +50 -0
  498. cuda/cccl/headers/include/cuda/__utility/in_range.h +65 -0
  499. cuda/cccl/headers/include/cuda/__utility/inherit.h +36 -0
  500. cuda/cccl/headers/include/cuda/__utility/no_init.h +29 -0
  501. cuda/cccl/headers/include/cuda/__utility/static_for.h +79 -0
  502. cuda/cccl/headers/include/cuda/__warp/lane_mask.h +326 -0
  503. cuda/cccl/headers/include/cuda/__warp/warp_match_all.h +65 -0
  504. cuda/cccl/headers/include/cuda/__warp/warp_shuffle.h +251 -0
  505. cuda/cccl/headers/include/cuda/access_property +26 -0
  506. cuda/cccl/headers/include/cuda/algorithm +28 -0
  507. cuda/cccl/headers/include/cuda/annotated_ptr +29 -0
  508. cuda/cccl/headers/include/cuda/atomic +27 -0
  509. cuda/cccl/headers/include/cuda/barrier +293 -0
  510. cuda/cccl/headers/include/cuda/bit +29 -0
  511. cuda/cccl/headers/include/cuda/buffer +27 -0
  512. cuda/cccl/headers/include/cuda/cmath +38 -0
  513. cuda/cccl/headers/include/cuda/devices +33 -0
  514. cuda/cccl/headers/include/cuda/discard_memory +32 -0
  515. cuda/cccl/headers/include/cuda/functional +32 -0
  516. cuda/cccl/headers/include/cuda/hierarchy +28 -0
  517. cuda/cccl/headers/include/cuda/iterator +39 -0
  518. cuda/cccl/headers/include/cuda/latch +27 -0
  519. cuda/cccl/headers/include/cuda/launch +28 -0
  520. cuda/cccl/headers/include/cuda/mdspan +29 -0
  521. cuda/cccl/headers/include/cuda/memory +37 -0
  522. cuda/cccl/headers/include/cuda/memory_pool +27 -0
  523. cuda/cccl/headers/include/cuda/memory_resource +41 -0
  524. cuda/cccl/headers/include/cuda/numeric +31 -0
  525. cuda/cccl/headers/include/cuda/pipeline +580 -0
  526. cuda/cccl/headers/include/cuda/ptx +131 -0
  527. cuda/cccl/headers/include/cuda/semaphore +31 -0
  528. cuda/cccl/headers/include/cuda/std/__algorithm/adjacent_find.h +59 -0
  529. cuda/cccl/headers/include/cuda/std/__algorithm/all_of.h +45 -0
  530. cuda/cccl/headers/include/cuda/std/__algorithm/any_of.h +45 -0
  531. cuda/cccl/headers/include/cuda/std/__algorithm/binary_search.h +53 -0
  532. cuda/cccl/headers/include/cuda/std/__algorithm/clamp.h +48 -0
  533. cuda/cccl/headers/include/cuda/std/__algorithm/comp.h +58 -0
  534. cuda/cccl/headers/include/cuda/std/__algorithm/comp_ref_type.h +85 -0
  535. cuda/cccl/headers/include/cuda/std/__algorithm/copy.h +143 -0
  536. cuda/cccl/headers/include/cuda/std/__algorithm/copy_backward.h +80 -0
  537. cuda/cccl/headers/include/cuda/std/__algorithm/copy_if.h +47 -0
  538. cuda/cccl/headers/include/cuda/std/__algorithm/copy_n.h +73 -0
  539. cuda/cccl/headers/include/cuda/std/__algorithm/count.h +49 -0
  540. cuda/cccl/headers/include/cuda/std/__algorithm/count_if.h +49 -0
  541. cuda/cccl/headers/include/cuda/std/__algorithm/equal.h +128 -0
  542. cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +101 -0
  543. cuda/cccl/headers/include/cuda/std/__algorithm/fill.h +58 -0
  544. cuda/cccl/headers/include/cuda/std/__algorithm/fill_n.h +51 -0
  545. cuda/cccl/headers/include/cuda/std/__algorithm/find.h +62 -0
  546. cuda/cccl/headers/include/cuda/std/__algorithm/find_end.h +225 -0
  547. cuda/cccl/headers/include/cuda/std/__algorithm/find_first_of.h +73 -0
  548. cuda/cccl/headers/include/cuda/std/__algorithm/find_if.h +46 -0
  549. cuda/cccl/headers/include/cuda/std/__algorithm/find_if_not.h +46 -0
  550. cuda/cccl/headers/include/cuda/std/__algorithm/for_each.h +42 -0
  551. cuda/cccl/headers/include/cuda/std/__algorithm/for_each_n.h +48 -0
  552. cuda/cccl/headers/include/cuda/std/__algorithm/generate.h +41 -0
  553. cuda/cccl/headers/include/cuda/std/__algorithm/generate_n.h +46 -0
  554. cuda/cccl/headers/include/cuda/std/__algorithm/half_positive.h +49 -0
  555. cuda/cccl/headers/include/cuda/std/__algorithm/in_fun_result.h +55 -0
  556. cuda/cccl/headers/include/cuda/std/__algorithm/includes.h +90 -0
  557. cuda/cccl/headers/include/cuda/std/__algorithm/inplace_merge.h +293 -0
  558. cuda/cccl/headers/include/cuda/std/__algorithm/is_heap.h +50 -0
  559. cuda/cccl/headers/include/cuda/std/__algorithm/is_heap_until.h +83 -0
  560. cuda/cccl/headers/include/cuda/std/__algorithm/is_partitioned.h +57 -0
  561. cuda/cccl/headers/include/cuda/std/__algorithm/is_permutation.h +252 -0
  562. cuda/cccl/headers/include/cuda/std/__algorithm/is_sorted.h +49 -0
  563. cuda/cccl/headers/include/cuda/std/__algorithm/is_sorted_until.h +68 -0
  564. cuda/cccl/headers/include/cuda/std/__algorithm/iter_swap.h +82 -0
  565. cuda/cccl/headers/include/cuda/std/__algorithm/iterator_operations.h +185 -0
  566. cuda/cccl/headers/include/cuda/std/__algorithm/lexicographical_compare.h +68 -0
  567. cuda/cccl/headers/include/cuda/std/__algorithm/lower_bound.h +82 -0
  568. cuda/cccl/headers/include/cuda/std/__algorithm/make_heap.h +70 -0
  569. cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +88 -0
  570. cuda/cccl/headers/include/cuda/std/__algorithm/max.h +62 -0
  571. cuda/cccl/headers/include/cuda/std/__algorithm/max_element.h +67 -0
  572. cuda/cccl/headers/include/cuda/std/__algorithm/merge.h +89 -0
  573. cuda/cccl/headers/include/cuda/std/__algorithm/min.h +62 -0
  574. cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +87 -0
  575. cuda/cccl/headers/include/cuda/std/__algorithm/minmax.h +66 -0
  576. cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +139 -0
  577. cuda/cccl/headers/include/cuda/std/__algorithm/mismatch.h +83 -0
  578. cuda/cccl/headers/include/cuda/std/__algorithm/move.h +91 -0
  579. cuda/cccl/headers/include/cuda/std/__algorithm/move_backward.h +84 -0
  580. cuda/cccl/headers/include/cuda/std/__algorithm/next_permutation.h +88 -0
  581. cuda/cccl/headers/include/cuda/std/__algorithm/none_of.h +45 -0
  582. cuda/cccl/headers/include/cuda/std/__algorithm/nth_element.h +309 -0
  583. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort.h +102 -0
  584. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort_copy.h +122 -0
  585. cuda/cccl/headers/include/cuda/std/__algorithm/partition.h +120 -0
  586. cuda/cccl/headers/include/cuda/std/__algorithm/partition_copy.h +59 -0
  587. cuda/cccl/headers/include/cuda/std/__algorithm/partition_point.h +61 -0
  588. cuda/cccl/headers/include/cuda/std/__algorithm/pop_heap.h +93 -0
  589. cuda/cccl/headers/include/cuda/std/__algorithm/prev_permutation.h +88 -0
  590. cuda/cccl/headers/include/cuda/std/__algorithm/push_heap.h +100 -0
  591. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_find_if.h +78 -0
  592. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_find_if_not.h +85 -0
  593. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_for_each.h +84 -0
  594. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_for_each_n.h +68 -0
  595. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_iterator_concept.h +65 -0
  596. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_min.h +97 -0
  597. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_min_element.h +68 -0
  598. cuda/cccl/headers/include/cuda/std/__algorithm/remove.h +55 -0
  599. cuda/cccl/headers/include/cuda/std/__algorithm/remove_copy.h +47 -0
  600. cuda/cccl/headers/include/cuda/std/__algorithm/remove_copy_if.h +47 -0
  601. cuda/cccl/headers/include/cuda/std/__algorithm/remove_if.h +56 -0
  602. cuda/cccl/headers/include/cuda/std/__algorithm/replace.h +45 -0
  603. cuda/cccl/headers/include/cuda/std/__algorithm/replace_copy.h +54 -0
  604. cuda/cccl/headers/include/cuda/std/__algorithm/replace_copy_if.h +50 -0
  605. cuda/cccl/headers/include/cuda/std/__algorithm/replace_if.h +45 -0
  606. cuda/cccl/headers/include/cuda/std/__algorithm/reverse.h +81 -0
  607. cuda/cccl/headers/include/cuda/std/__algorithm/reverse_copy.h +43 -0
  608. cuda/cccl/headers/include/cuda/std/__algorithm/rotate.h +261 -0
  609. cuda/cccl/headers/include/cuda/std/__algorithm/rotate_copy.h +40 -0
  610. cuda/cccl/headers/include/cuda/std/__algorithm/sample.h +116 -0
  611. cuda/cccl/headers/include/cuda/std/__algorithm/search.h +185 -0
  612. cuda/cccl/headers/include/cuda/std/__algorithm/search_n.h +163 -0
  613. cuda/cccl/headers/include/cuda/std/__algorithm/set_difference.h +95 -0
  614. cuda/cccl/headers/include/cuda/std/__algorithm/set_intersection.h +122 -0
  615. cuda/cccl/headers/include/cuda/std/__algorithm/set_symmetric_difference.h +134 -0
  616. cuda/cccl/headers/include/cuda/std/__algorithm/set_union.h +128 -0
  617. cuda/cccl/headers/include/cuda/std/__algorithm/shift_left.h +84 -0
  618. cuda/cccl/headers/include/cuda/std/__algorithm/shift_right.h +144 -0
  619. cuda/cccl/headers/include/cuda/std/__algorithm/shuffle.h +71 -0
  620. cuda/cccl/headers/include/cuda/std/__algorithm/sift_down.h +139 -0
  621. cuda/cccl/headers/include/cuda/std/__algorithm/sort.h +1097 -0
  622. cuda/cccl/headers/include/cuda/std/__algorithm/sort_heap.h +70 -0
  623. cuda/cccl/headers/include/cuda/std/__algorithm/stable_partition.h +359 -0
  624. cuda/cccl/headers/include/cuda/std/__algorithm/stable_sort.h +321 -0
  625. cuda/cccl/headers/include/cuda/std/__algorithm/swap_ranges.h +78 -0
  626. cuda/cccl/headers/include/cuda/std/__algorithm/transform.h +59 -0
  627. cuda/cccl/headers/include/cuda/std/__algorithm/unique.h +76 -0
  628. cuda/cccl/headers/include/cuda/std/__algorithm/unique_copy.h +153 -0
  629. cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_iter.h +95 -0
  630. cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_range.h +126 -0
  631. cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +83 -0
  632. cuda/cccl/headers/include/cuda/std/__atomic/api/common.h +192 -0
  633. cuda/cccl/headers/include/cuda/std/__atomic/api/owned.h +136 -0
  634. cuda/cccl/headers/include/cuda/std/__atomic/api/reference.h +118 -0
  635. cuda/cccl/headers/include/cuda/std/__atomic/functions/common.h +58 -0
  636. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_local.h +208 -0
  637. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_derived.h +458 -0
  638. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated.h +4436 -0
  639. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h +184 -0
  640. cuda/cccl/headers/include/cuda/std/__atomic/functions/host.h +242 -0
  641. cuda/cccl/headers/include/cuda/std/__atomic/functions.h +33 -0
  642. cuda/cccl/headers/include/cuda/std/__atomic/order.h +158 -0
  643. cuda/cccl/headers/include/cuda/std/__atomic/platform/msvc_to_builtins.h +654 -0
  644. cuda/cccl/headers/include/cuda/std/__atomic/platform.h +93 -0
  645. cuda/cccl/headers/include/cuda/std/__atomic/scopes.h +105 -0
  646. cuda/cccl/headers/include/cuda/std/__atomic/types/base.h +242 -0
  647. cuda/cccl/headers/include/cuda/std/__atomic/types/common.h +103 -0
  648. cuda/cccl/headers/include/cuda/std/__atomic/types/locked.h +225 -0
  649. cuda/cccl/headers/include/cuda/std/__atomic/types/reference.h +72 -0
  650. cuda/cccl/headers/include/cuda/std/__atomic/types/small.h +238 -0
  651. cuda/cccl/headers/include/cuda/std/__atomic/types.h +51 -0
  652. cuda/cccl/headers/include/cuda/std/__atomic/wait/notify_wait.h +95 -0
  653. cuda/cccl/headers/include/cuda/std/__atomic/wait/polling.h +65 -0
  654. cuda/cccl/headers/include/cuda/std/__barrier/barrier.h +227 -0
  655. cuda/cccl/headers/include/cuda/std/__barrier/empty_completion.h +37 -0
  656. cuda/cccl/headers/include/cuda/std/__barrier/poll_tester.h +82 -0
  657. cuda/cccl/headers/include/cuda/std/__bit/bit_cast.h +81 -0
  658. cuda/cccl/headers/include/cuda/std/__bit/blsr.h +51 -0
  659. cuda/cccl/headers/include/cuda/std/__bit/byteswap.h +209 -0
  660. cuda/cccl/headers/include/cuda/std/__bit/countl.h +191 -0
  661. cuda/cccl/headers/include/cuda/std/__bit/countr.h +202 -0
  662. cuda/cccl/headers/include/cuda/std/__bit/endian.h +39 -0
  663. cuda/cccl/headers/include/cuda/std/__bit/has_single_bit.h +43 -0
  664. cuda/cccl/headers/include/cuda/std/__bit/integral.h +125 -0
  665. cuda/cccl/headers/include/cuda/std/__bit/popcount.h +172 -0
  666. cuda/cccl/headers/include/cuda/std/__bit/reference.h +1272 -0
  667. cuda/cccl/headers/include/cuda/std/__bit/rotate.h +185 -0
  668. cuda/cccl/headers/include/cuda/std/__cccl/algorithm_wrapper.h +36 -0
  669. cuda/cccl/headers/include/cuda/std/__cccl/architecture.h +78 -0
  670. cuda/cccl/headers/include/cuda/std/__cccl/assert.h +161 -0
  671. cuda/cccl/headers/include/cuda/std/__cccl/attributes.h +206 -0
  672. cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +494 -0
  673. cuda/cccl/headers/include/cuda/std/__cccl/compiler.h +213 -0
  674. cuda/cccl/headers/include/cuda/std/__cccl/cuda_capabilities.h +51 -0
  675. cuda/cccl/headers/include/cuda/std/__cccl/cuda_toolkit.h +56 -0
  676. cuda/cccl/headers/include/cuda/std/__cccl/deprecated.h +87 -0
  677. cuda/cccl/headers/include/cuda/std/__cccl/diagnostic.h +131 -0
  678. cuda/cccl/headers/include/cuda/std/__cccl/dialect.h +197 -0
  679. cuda/cccl/headers/include/cuda/std/__cccl/epilogue.h +355 -0
  680. cuda/cccl/headers/include/cuda/std/__cccl/exceptions.h +36 -0
  681. cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +74 -0
  682. cuda/cccl/headers/include/cuda/std/__cccl/extended_data_types.h +139 -0
  683. cuda/cccl/headers/include/cuda/std/__cccl/host_std_lib.h +59 -0
  684. cuda/cccl/headers/include/cuda/std/__cccl/is_non_narrowing_convertible.h +71 -0
  685. cuda/cccl/headers/include/cuda/std/__cccl/memory_wrapper.h +36 -0
  686. cuda/cccl/headers/include/cuda/std/__cccl/numeric_wrapper.h +36 -0
  687. cuda/cccl/headers/include/cuda/std/__cccl/os.h +54 -0
  688. cuda/cccl/headers/include/cuda/std/__cccl/preprocessor.h +1288 -0
  689. cuda/cccl/headers/include/cuda/std/__cccl/prologue.h +312 -0
  690. cuda/cccl/headers/include/cuda/std/__cccl/ptx_isa.h +363 -0
  691. cuda/cccl/headers/include/cuda/std/__cccl/rtti.h +72 -0
  692. cuda/cccl/headers/include/cuda/std/__cccl/sequence_access.h +87 -0
  693. cuda/cccl/headers/include/cuda/std/__cccl/system_header.h +38 -0
  694. cuda/cccl/headers/include/cuda/std/__cccl/unreachable.h +31 -0
  695. cuda/cccl/headers/include/cuda/std/__cccl/version.h +26 -0
  696. cuda/cccl/headers/include/cuda/std/__cccl/visibility.h +171 -0
  697. cuda/cccl/headers/include/cuda/std/__charconv/chars_format.h +81 -0
  698. cuda/cccl/headers/include/cuda/std/__charconv/from_chars.h +171 -0
  699. cuda/cccl/headers/include/cuda/std/__charconv/from_chars_result.h +56 -0
  700. cuda/cccl/headers/include/cuda/std/__charconv/to_chars.h +192 -0
  701. cuda/cccl/headers/include/cuda/std/__charconv/to_chars_result.h +56 -0
  702. cuda/cccl/headers/include/cuda/std/__chrono/calendar.h +52 -0
  703. cuda/cccl/headers/include/cuda/std/__chrono/day.h +160 -0
  704. cuda/cccl/headers/include/cuda/std/__chrono/duration.h +499 -0
  705. cuda/cccl/headers/include/cuda/std/__chrono/file_clock.h +53 -0
  706. cuda/cccl/headers/include/cuda/std/__chrono/high_resolution_clock.h +44 -0
  707. cuda/cccl/headers/include/cuda/std/__chrono/month.h +185 -0
  708. cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +58 -0
  709. cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +78 -0
  710. cuda/cccl/headers/include/cuda/std/__chrono/time_point.h +255 -0
  711. cuda/cccl/headers/include/cuda/std/__chrono/year.h +184 -0
  712. cuda/cccl/headers/include/cuda/std/__cmath/abs.h +127 -0
  713. cuda/cccl/headers/include/cuda/std/__cmath/copysign.h +88 -0
  714. cuda/cccl/headers/include/cuda/std/__cmath/error_functions.h +204 -0
  715. cuda/cccl/headers/include/cuda/std/__cmath/exponential_functions.h +783 -0
  716. cuda/cccl/headers/include/cuda/std/__cmath/fdim.h +122 -0
  717. cuda/cccl/headers/include/cuda/std/__cmath/fma.h +129 -0
  718. cuda/cccl/headers/include/cuda/std/__cmath/fpclassify.h +230 -0
  719. cuda/cccl/headers/include/cuda/std/__cmath/gamma.h +204 -0
  720. cuda/cccl/headers/include/cuda/std/__cmath/hyperbolic_functions.h +285 -0
  721. cuda/cccl/headers/include/cuda/std/__cmath/hypot.h +220 -0
  722. cuda/cccl/headers/include/cuda/std/__cmath/inverse_hyperbolic_functions.h +285 -0
  723. cuda/cccl/headers/include/cuda/std/__cmath/inverse_trigonometric_functions.h +370 -0
  724. cuda/cccl/headers/include/cuda/std/__cmath/isfinite.h +166 -0
  725. cuda/cccl/headers/include/cuda/std/__cmath/isinf.h +203 -0
  726. cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +184 -0
  727. cuda/cccl/headers/include/cuda/std/__cmath/isnormal.h +138 -0
  728. cuda/cccl/headers/include/cuda/std/__cmath/lerp.h +101 -0
  729. cuda/cccl/headers/include/cuda/std/__cmath/logarithms.h +533 -0
  730. cuda/cccl/headers/include/cuda/std/__cmath/min_max.h +287 -0
  731. cuda/cccl/headers/include/cuda/std/__cmath/modulo.h +208 -0
  732. cuda/cccl/headers/include/cuda/std/__cmath/nan.h +54 -0
  733. cuda/cccl/headers/include/cuda/std/__cmath/remainder.h +210 -0
  734. cuda/cccl/headers/include/cuda/std/__cmath/roots.h +198 -0
  735. cuda/cccl/headers/include/cuda/std/__cmath/rounding_functions.h +983 -0
  736. cuda/cccl/headers/include/cuda/std/__cmath/signbit.h +56 -0
  737. cuda/cccl/headers/include/cuda/std/__cmath/traits.h +242 -0
  738. cuda/cccl/headers/include/cuda/std/__cmath/trigonometric_functions.h +327 -0
  739. cuda/cccl/headers/include/cuda/std/__complex/arg.h +84 -0
  740. cuda/cccl/headers/include/cuda/std/__complex/complex.h +669 -0
  741. cuda/cccl/headers/include/cuda/std/__complex/exponential_functions.h +403 -0
  742. cuda/cccl/headers/include/cuda/std/__complex/hyperbolic_functions.h +119 -0
  743. cuda/cccl/headers/include/cuda/std/__complex/inverse_hyperbolic_functions.h +522 -0
  744. cuda/cccl/headers/include/cuda/std/__complex/inverse_trigonometric_functions.h +131 -0
  745. cuda/cccl/headers/include/cuda/std/__complex/literals.h +86 -0
  746. cuda/cccl/headers/include/cuda/std/__complex/logarithms.h +302 -0
  747. cuda/cccl/headers/include/cuda/std/__complex/math.h +161 -0
  748. cuda/cccl/headers/include/cuda/std/__complex/nvbf16.h +323 -0
  749. cuda/cccl/headers/include/cuda/std/__complex/nvfp16.h +322 -0
  750. cuda/cccl/headers/include/cuda/std/__complex/roots.h +214 -0
  751. cuda/cccl/headers/include/cuda/std/__complex/trigonometric_functions.h +61 -0
  752. cuda/cccl/headers/include/cuda/std/__complex/tuple.h +107 -0
  753. cuda/cccl/headers/include/cuda/std/__complex/vector_support.h +130 -0
  754. cuda/cccl/headers/include/cuda/std/__concepts/arithmetic.h +56 -0
  755. cuda/cccl/headers/include/cuda/std/__concepts/assignable.h +64 -0
  756. cuda/cccl/headers/include/cuda/std/__concepts/boolean_testable.h +63 -0
  757. cuda/cccl/headers/include/cuda/std/__concepts/class_or_enum.h +45 -0
  758. cuda/cccl/headers/include/cuda/std/__concepts/common_reference_with.h +69 -0
  759. cuda/cccl/headers/include/cuda/std/__concepts/common_with.h +82 -0
  760. cuda/cccl/headers/include/cuda/std/__concepts/concept_macros.h +385 -0
  761. cuda/cccl/headers/include/cuda/std/__concepts/constructible.h +174 -0
  762. cuda/cccl/headers/include/cuda/std/__concepts/convertible_to.h +70 -0
  763. cuda/cccl/headers/include/cuda/std/__concepts/copyable.h +60 -0
  764. cuda/cccl/headers/include/cuda/std/__concepts/derived_from.h +56 -0
  765. cuda/cccl/headers/include/cuda/std/__concepts/destructible.h +76 -0
  766. cuda/cccl/headers/include/cuda/std/__concepts/different_from.h +38 -0
  767. cuda/cccl/headers/include/cuda/std/__concepts/equality_comparable.h +98 -0
  768. cuda/cccl/headers/include/cuda/std/__concepts/invocable.h +80 -0
  769. cuda/cccl/headers/include/cuda/std/__concepts/movable.h +58 -0
  770. cuda/cccl/headers/include/cuda/std/__concepts/predicate.h +54 -0
  771. cuda/cccl/headers/include/cuda/std/__concepts/regular.h +54 -0
  772. cuda/cccl/headers/include/cuda/std/__concepts/relation.h +77 -0
  773. cuda/cccl/headers/include/cuda/std/__concepts/same_as.h +39 -0
  774. cuda/cccl/headers/include/cuda/std/__concepts/semiregular.h +54 -0
  775. cuda/cccl/headers/include/cuda/std/__concepts/swappable.h +206 -0
  776. cuda/cccl/headers/include/cuda/std/__concepts/totally_ordered.h +101 -0
  777. cuda/cccl/headers/include/cuda/std/__cstddef/byte.h +113 -0
  778. cuda/cccl/headers/include/cuda/std/__cstddef/types.h +52 -0
  779. cuda/cccl/headers/include/cuda/std/__cstdlib/abs.h +57 -0
  780. cuda/cccl/headers/include/cuda/std/__cstdlib/aligned_alloc.h +66 -0
  781. cuda/cccl/headers/include/cuda/std/__cstdlib/div.h +96 -0
  782. cuda/cccl/headers/include/cuda/std/__cstdlib/malloc.h +70 -0
  783. cuda/cccl/headers/include/cuda/std/__cstring/memcpy.h +61 -0
  784. cuda/cccl/headers/include/cuda/std/__cstring/memset.h +46 -0
  785. cuda/cccl/headers/include/cuda/std/__exception/cuda_error.h +110 -0
  786. cuda/cccl/headers/include/cuda/std/__exception/exception_macros.h +108 -0
  787. cuda/cccl/headers/include/cuda/std/__exception/format_error.h +62 -0
  788. cuda/cccl/headers/include/cuda/std/__exception/msg_storage.h +41 -0
  789. cuda/cccl/headers/include/cuda/std/__exception/terminate.h +74 -0
  790. cuda/cccl/headers/include/cuda/std/__exception/throw_error.h +120 -0
  791. cuda/cccl/headers/include/cuda/std/__execution/env.h +455 -0
  792. cuda/cccl/headers/include/cuda/std/__execution/policy.h +90 -0
  793. cuda/cccl/headers/include/cuda/std/__expected/bad_expected_access.h +127 -0
  794. cuda/cccl/headers/include/cuda/std/__expected/expected.h +1941 -0
  795. cuda/cccl/headers/include/cuda/std/__expected/expected_base.h +1051 -0
  796. cuda/cccl/headers/include/cuda/std/__expected/unexpect.h +37 -0
  797. cuda/cccl/headers/include/cuda/std/__expected/unexpected.h +164 -0
  798. cuda/cccl/headers/include/cuda/std/__floating_point/arithmetic.h +56 -0
  799. cuda/cccl/headers/include/cuda/std/__floating_point/cast.h +812 -0
  800. cuda/cccl/headers/include/cuda/std/__floating_point/cccl_fp.h +125 -0
  801. cuda/cccl/headers/include/cuda/std/__floating_point/common_type.h +48 -0
  802. cuda/cccl/headers/include/cuda/std/__floating_point/constants.h +375 -0
  803. cuda/cccl/headers/include/cuda/std/__floating_point/conversion_rank_order.h +124 -0
  804. cuda/cccl/headers/include/cuda/std/__floating_point/cuda_fp_types.h +116 -0
  805. cuda/cccl/headers/include/cuda/std/__floating_point/decompose.h +126 -0
  806. cuda/cccl/headers/include/cuda/std/__floating_point/format.h +162 -0
  807. cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +40 -0
  808. cuda/cccl/headers/include/cuda/std/__floating_point/mask.h +78 -0
  809. cuda/cccl/headers/include/cuda/std/__floating_point/native_type.h +81 -0
  810. cuda/cccl/headers/include/cuda/std/__floating_point/overflow_handler.h +139 -0
  811. cuda/cccl/headers/include/cuda/std/__floating_point/properties.h +229 -0
  812. cuda/cccl/headers/include/cuda/std/__floating_point/storage.h +248 -0
  813. cuda/cccl/headers/include/cuda/std/__floating_point/traits.h +172 -0
  814. cuda/cccl/headers/include/cuda/std/__format/buffer.h +48 -0
  815. cuda/cccl/headers/include/cuda/std/__format/concepts.h +69 -0
  816. cuda/cccl/headers/include/cuda/std/__format/format_arg.h +282 -0
  817. cuda/cccl/headers/include/cuda/std/__format/format_arg_store.h +279 -0
  818. cuda/cccl/headers/include/cuda/std/__format/format_args.h +122 -0
  819. cuda/cccl/headers/include/cuda/std/__format/format_context.h +93 -0
  820. cuda/cccl/headers/include/cuda/std/__format/format_error.h +76 -0
  821. cuda/cccl/headers/include/cuda/std/__format/format_integral.h +237 -0
  822. cuda/cccl/headers/include/cuda/std/__format/format_parse_context.h +124 -0
  823. cuda/cccl/headers/include/cuda/std/__format/format_spec_parser.h +1265 -0
  824. cuda/cccl/headers/include/cuda/std/__format/formatter.h +59 -0
  825. cuda/cccl/headers/include/cuda/std/__format/formatters/bool.h +101 -0
  826. cuda/cccl/headers/include/cuda/std/__format/formatters/char.h +124 -0
  827. cuda/cccl/headers/include/cuda/std/__format/formatters/fp.h +101 -0
  828. cuda/cccl/headers/include/cuda/std/__format/formatters/int.h +174 -0
  829. cuda/cccl/headers/include/cuda/std/__format/formatters/ptr.h +104 -0
  830. cuda/cccl/headers/include/cuda/std/__format/formatters/str.h +178 -0
  831. cuda/cccl/headers/include/cuda/std/__format/output_utils.h +272 -0
  832. cuda/cccl/headers/include/cuda/std/__format/parse_arg_id.h +138 -0
  833. cuda/cccl/headers/include/cuda/std/__format_ +45 -0
  834. cuda/cccl/headers/include/cuda/std/__functional/binary_function.h +63 -0
  835. cuda/cccl/headers/include/cuda/std/__functional/binary_negate.h +65 -0
  836. cuda/cccl/headers/include/cuda/std/__functional/bind.h +334 -0
  837. cuda/cccl/headers/include/cuda/std/__functional/bind_back.h +81 -0
  838. cuda/cccl/headers/include/cuda/std/__functional/bind_front.h +74 -0
  839. cuda/cccl/headers/include/cuda/std/__functional/binder1st.h +74 -0
  840. cuda/cccl/headers/include/cuda/std/__functional/binder2nd.h +74 -0
  841. cuda/cccl/headers/include/cuda/std/__functional/compose.h +69 -0
  842. cuda/cccl/headers/include/cuda/std/__functional/default_searcher.h +76 -0
  843. cuda/cccl/headers/include/cuda/std/__functional/function.h +1271 -0
  844. cuda/cccl/headers/include/cuda/std/__functional/hash.h +649 -0
  845. cuda/cccl/headers/include/cuda/std/__functional/identity.h +57 -0
  846. cuda/cccl/headers/include/cuda/std/__functional/invoke.h +298 -0
  847. cuda/cccl/headers/include/cuda/std/__functional/is_transparent.h +41 -0
  848. cuda/cccl/headers/include/cuda/std/__functional/mem_fn.h +66 -0
  849. cuda/cccl/headers/include/cuda/std/__functional/mem_fun_ref.h +211 -0
  850. cuda/cccl/headers/include/cuda/std/__functional/not_fn.h +120 -0
  851. cuda/cccl/headers/include/cuda/std/__functional/operations.h +535 -0
  852. cuda/cccl/headers/include/cuda/std/__functional/perfect_forward.h +128 -0
  853. cuda/cccl/headers/include/cuda/std/__functional/pointer_to_binary_function.h +64 -0
  854. cuda/cccl/headers/include/cuda/std/__functional/pointer_to_unary_function.h +63 -0
  855. cuda/cccl/headers/include/cuda/std/__functional/ranges_operations.h +113 -0
  856. cuda/cccl/headers/include/cuda/std/__functional/reference_wrapper.h +114 -0
  857. cuda/cccl/headers/include/cuda/std/__functional/unary_function.h +62 -0
  858. cuda/cccl/headers/include/cuda/std/__functional/unary_negate.h +65 -0
  859. cuda/cccl/headers/include/cuda/std/__functional/unwrap_ref.h +56 -0
  860. cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +262 -0
  861. cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +53 -0
  862. cuda/cccl/headers/include/cuda/std/__fwd/array.h +42 -0
  863. cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +74 -0
  864. cuda/cccl/headers/include/cuda/std/__fwd/complex.h +75 -0
  865. cuda/cccl/headers/include/cuda/std/__fwd/execution_policy.h +73 -0
  866. cuda/cccl/headers/include/cuda/std/__fwd/expected.h +46 -0
  867. cuda/cccl/headers/include/cuda/std/__fwd/format.h +84 -0
  868. cuda/cccl/headers/include/cuda/std/__fwd/fp.h +37 -0
  869. cuda/cccl/headers/include/cuda/std/__fwd/get.h +122 -0
  870. cuda/cccl/headers/include/cuda/std/__fwd/hash.h +34 -0
  871. cuda/cccl/headers/include/cuda/std/__fwd/ios.h +123 -0
  872. cuda/cccl/headers/include/cuda/std/__fwd/iterator.h +43 -0
  873. cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +122 -0
  874. cuda/cccl/headers/include/cuda/std/__fwd/memory_resource.h +37 -0
  875. cuda/cccl/headers/include/cuda/std/__fwd/optional.h +39 -0
  876. cuda/cccl/headers/include/cuda/std/__fwd/pair.h +47 -0
  877. cuda/cccl/headers/include/cuda/std/__fwd/reference_wrapper.h +39 -0
  878. cuda/cccl/headers/include/cuda/std/__fwd/span.h +45 -0
  879. cuda/cccl/headers/include/cuda/std/__fwd/string.h +112 -0
  880. cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +91 -0
  881. cuda/cccl/headers/include/cuda/std/__fwd/subrange.h +55 -0
  882. cuda/cccl/headers/include/cuda/std/__fwd/tuple.h +37 -0
  883. cuda/cccl/headers/include/cuda/std/__fwd/unexpected.h +40 -0
  884. cuda/cccl/headers/include/cuda/std/__fwd/variant.h +51 -0
  885. cuda/cccl/headers/include/cuda/std/__internal/atomic.h +55 -0
  886. cuda/cccl/headers/include/cuda/std/__internal/cpp_dialect.h +44 -0
  887. cuda/cccl/headers/include/cuda/std/__internal/features.h +104 -0
  888. cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +132 -0
  889. cuda/cccl/headers/include/cuda/std/__internal/pstl_config.h +32 -0
  890. cuda/cccl/headers/include/cuda/std/__internal/thread_api.h +58 -0
  891. cuda/cccl/headers/include/cuda/std/__internal/version.h +52 -0
  892. cuda/cccl/headers/include/cuda/std/__iterator/access.h +128 -0
  893. cuda/cccl/headers/include/cuda/std/__iterator/advance.h +227 -0
  894. cuda/cccl/headers/include/cuda/std/__iterator/back_insert_iterator.h +164 -0
  895. cuda/cccl/headers/include/cuda/std/__iterator/bounded_iter.h +253 -0
  896. cuda/cccl/headers/include/cuda/std/__iterator/concepts.h +645 -0
  897. cuda/cccl/headers/include/cuda/std/__iterator/counted_iterator.h +465 -0
  898. cuda/cccl/headers/include/cuda/std/__iterator/data.h +61 -0
  899. cuda/cccl/headers/include/cuda/std/__iterator/default_sentinel.h +36 -0
  900. cuda/cccl/headers/include/cuda/std/__iterator/distance.h +124 -0
  901. cuda/cccl/headers/include/cuda/std/__iterator/empty.h +53 -0
  902. cuda/cccl/headers/include/cuda/std/__iterator/erase_if_container.h +53 -0
  903. cuda/cccl/headers/include/cuda/std/__iterator/front_insert_iterator.h +100 -0
  904. cuda/cccl/headers/include/cuda/std/__iterator/incrementable_traits.h +143 -0
  905. cuda/cccl/headers/include/cuda/std/__iterator/indirectly_comparable.h +55 -0
  906. cuda/cccl/headers/include/cuda/std/__iterator/insert_iterator.h +107 -0
  907. cuda/cccl/headers/include/cuda/std/__iterator/istream_iterator.h +146 -0
  908. cuda/cccl/headers/include/cuda/std/__iterator/istreambuf_iterator.h +161 -0
  909. cuda/cccl/headers/include/cuda/std/__iterator/iter_move.h +161 -0
  910. cuda/cccl/headers/include/cuda/std/__iterator/iter_swap.h +181 -0
  911. cuda/cccl/headers/include/cuda/std/__iterator/iterator.h +44 -0
  912. cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +847 -0
  913. cuda/cccl/headers/include/cuda/std/__iterator/mergeable.h +72 -0
  914. cuda/cccl/headers/include/cuda/std/__iterator/move_iterator.h +432 -0
  915. cuda/cccl/headers/include/cuda/std/__iterator/move_sentinel.h +73 -0
  916. cuda/cccl/headers/include/cuda/std/__iterator/next.h +101 -0
  917. cuda/cccl/headers/include/cuda/std/__iterator/ostream_iterator.h +95 -0
  918. cuda/cccl/headers/include/cuda/std/__iterator/ostreambuf_iterator.h +100 -0
  919. cuda/cccl/headers/include/cuda/std/__iterator/permutable.h +54 -0
  920. cuda/cccl/headers/include/cuda/std/__iterator/prev.h +90 -0
  921. cuda/cccl/headers/include/cuda/std/__iterator/projected.h +61 -0
  922. cuda/cccl/headers/include/cuda/std/__iterator/readable_traits.h +156 -0
  923. cuda/cccl/headers/include/cuda/std/__iterator/reverse_access.h +142 -0
  924. cuda/cccl/headers/include/cuda/std/__iterator/reverse_iterator.h +366 -0
  925. cuda/cccl/headers/include/cuda/std/__iterator/size.h +69 -0
  926. cuda/cccl/headers/include/cuda/std/__iterator/sortable.h +55 -0
  927. cuda/cccl/headers/include/cuda/std/__iterator/unreachable_sentinel.h +84 -0
  928. cuda/cccl/headers/include/cuda/std/__iterator/wrap_iter.h +245 -0
  929. cuda/cccl/headers/include/cuda/std/__latch/latch.h +88 -0
  930. cuda/cccl/headers/include/cuda/std/__limits/numeric_limits.h +603 -0
  931. cuda/cccl/headers/include/cuda/std/__limits/numeric_limits_ext.h +753 -0
  932. cuda/cccl/headers/include/cuda/std/__linalg/conj_if_needed.h +76 -0
  933. cuda/cccl/headers/include/cuda/std/__linalg/conjugate_transposed.h +52 -0
  934. cuda/cccl/headers/include/cuda/std/__linalg/conjugated.h +136 -0
  935. cuda/cccl/headers/include/cuda/std/__linalg/scaled.h +128 -0
  936. cuda/cccl/headers/include/cuda/std/__linalg/transposed.h +315 -0
  937. cuda/cccl/headers/include/cuda/std/__mdspan/aligned_accessor.h +97 -0
  938. cuda/cccl/headers/include/cuda/std/__mdspan/concepts.h +137 -0
  939. cuda/cccl/headers/include/cuda/std/__mdspan/default_accessor.h +73 -0
  940. cuda/cccl/headers/include/cuda/std/__mdspan/empty_base.h +348 -0
  941. cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +749 -0
  942. cuda/cccl/headers/include/cuda/std/__mdspan/layout_left.h +314 -0
  943. cuda/cccl/headers/include/cuda/std/__mdspan/layout_right.h +307 -0
  944. cuda/cccl/headers/include/cuda/std/__mdspan/layout_stride.h +598 -0
  945. cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +515 -0
  946. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_extents.h +190 -0
  947. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_helper.h +187 -0
  948. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_mapping.h +339 -0
  949. cuda/cccl/headers/include/cuda/std/__memory/addressof.h +90 -0
  950. cuda/cccl/headers/include/cuda/std/__memory/align.h +67 -0
  951. cuda/cccl/headers/include/cuda/std/__memory/allocate_at_least.h +82 -0
  952. cuda/cccl/headers/include/cuda/std/__memory/allocation_guard.h +100 -0
  953. cuda/cccl/headers/include/cuda/std/__memory/allocator.h +327 -0
  954. cuda/cccl/headers/include/cuda/std/__memory/allocator_arg_t.h +85 -0
  955. cuda/cccl/headers/include/cuda/std/__memory/allocator_destructor.h +59 -0
  956. cuda/cccl/headers/include/cuda/std/__memory/allocator_traits.h +517 -0
  957. cuda/cccl/headers/include/cuda/std/__memory/assume_aligned.h +59 -0
  958. cuda/cccl/headers/include/cuda/std/__memory/builtin_new_allocator.h +87 -0
  959. cuda/cccl/headers/include/cuda/std/__memory/compressed_pair.h +225 -0
  960. cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +241 -0
  961. cuda/cccl/headers/include/cuda/std/__memory/destruct_n.h +91 -0
  962. cuda/cccl/headers/include/cuda/std/__memory/is_sufficiently_aligned.h +51 -0
  963. cuda/cccl/headers/include/cuda/std/__memory/pointer_traits.h +246 -0
  964. cuda/cccl/headers/include/cuda/std/__memory/runtime_assume_aligned.h +62 -0
  965. cuda/cccl/headers/include/cuda/std/__memory/temporary_buffer.h +99 -0
  966. cuda/cccl/headers/include/cuda/std/__memory/uninitialized_algorithms.h +678 -0
  967. cuda/cccl/headers/include/cuda/std/__memory/unique_ptr.h +765 -0
  968. cuda/cccl/headers/include/cuda/std/__memory/uses_allocator.h +59 -0
  969. cuda/cccl/headers/include/cuda/std/__memory/voidify.h +41 -0
  970. cuda/cccl/headers/include/cuda/std/__memory_ +34 -0
  971. cuda/cccl/headers/include/cuda/std/__new/allocate.h +131 -0
  972. cuda/cccl/headers/include/cuda/std/__new/bad_alloc.h +57 -0
  973. cuda/cccl/headers/include/cuda/std/__new/device_new.h +30 -0
  974. cuda/cccl/headers/include/cuda/std/__new/launder.h +53 -0
  975. cuda/cccl/headers/include/cuda/std/__new_ +30 -0
  976. cuda/cccl/headers/include/cuda/std/__numeric/accumulate.h +56 -0
  977. cuda/cccl/headers/include/cuda/std/__numeric/adjacent_difference.h +72 -0
  978. cuda/cccl/headers/include/cuda/std/__numeric/exclusive_scan.h +66 -0
  979. cuda/cccl/headers/include/cuda/std/__numeric/gcd_lcm.h +78 -0
  980. cuda/cccl/headers/include/cuda/std/__numeric/inclusive_scan.h +73 -0
  981. cuda/cccl/headers/include/cuda/std/__numeric/inner_product.h +62 -0
  982. cuda/cccl/headers/include/cuda/std/__numeric/iota.h +42 -0
  983. cuda/cccl/headers/include/cuda/std/__numeric/midpoint.h +97 -0
  984. cuda/cccl/headers/include/cuda/std/__numeric/partial_sum.h +69 -0
  985. cuda/cccl/headers/include/cuda/std/__numeric/reduce.h +60 -0
  986. cuda/cccl/headers/include/cuda/std/__numeric/transform_exclusive_scan.h +51 -0
  987. cuda/cccl/headers/include/cuda/std/__numeric/transform_inclusive_scan.h +65 -0
  988. cuda/cccl/headers/include/cuda/std/__numeric/transform_reduce.h +72 -0
  989. cuda/cccl/headers/include/cuda/std/__optional/bad_optional_access.h +74 -0
  990. cuda/cccl/headers/include/cuda/std/__optional/hash.h +53 -0
  991. cuda/cccl/headers/include/cuda/std/__optional/make_optional.h +61 -0
  992. cuda/cccl/headers/include/cuda/std/__optional/nullopt.h +43 -0
  993. cuda/cccl/headers/include/cuda/std/__optional/optional.h +861 -0
  994. cuda/cccl/headers/include/cuda/std/__optional/optional_base.h +439 -0
  995. cuda/cccl/headers/include/cuda/std/__optional/optional_ref.h +324 -0
  996. cuda/cccl/headers/include/cuda/std/__pstl/cuda/for_each_n.h +97 -0
  997. cuda/cccl/headers/include/cuda/std/__pstl/dispatch.h +123 -0
  998. cuda/cccl/headers/include/cuda/std/__pstl/for_each.h +71 -0
  999. cuda/cccl/headers/include/cuda/std/__pstl/for_each_n.h +68 -0
  1000. cuda/cccl/headers/include/cuda/std/__random/bernoulli_distribution.h +173 -0
  1001. cuda/cccl/headers/include/cuda/std/__random/binomial_distribution.h +254 -0
  1002. cuda/cccl/headers/include/cuda/std/__random/cauchy_distribution.h +192 -0
  1003. cuda/cccl/headers/include/cuda/std/__random/chi_squared_distribution.h +179 -0
  1004. cuda/cccl/headers/include/cuda/std/__random/exponential_distribution.h +187 -0
  1005. cuda/cccl/headers/include/cuda/std/__random/extreme_value_distribution.h +196 -0
  1006. cuda/cccl/headers/include/cuda/std/__random/fisher_f_distribution.h +196 -0
  1007. cuda/cccl/headers/include/cuda/std/__random/gamma_distribution.h +257 -0
  1008. cuda/cccl/headers/include/cuda/std/__random/generate_canonical.h +56 -0
  1009. cuda/cccl/headers/include/cuda/std/__random/geometric_distribution.h +179 -0
  1010. cuda/cccl/headers/include/cuda/std/__random/is_seed_sequence.h +39 -0
  1011. cuda/cccl/headers/include/cuda/std/__random/is_valid.h +70 -0
  1012. cuda/cccl/headers/include/cuda/std/__random/linear_congruential_engine.h +398 -0
  1013. cuda/cccl/headers/include/cuda/std/__random/lognormal_distribution.h +174 -0
  1014. cuda/cccl/headers/include/cuda/std/__random/negative_binomial_distribution.h +212 -0
  1015. cuda/cccl/headers/include/cuda/std/__random/normal_distribution.h +232 -0
  1016. cuda/cccl/headers/include/cuda/std/__random/philox_engine.h +562 -0
  1017. cuda/cccl/headers/include/cuda/std/__random/poisson_distribution.h +338 -0
  1018. cuda/cccl/headers/include/cuda/std/__random/seed_seq.h +204 -0
  1019. cuda/cccl/headers/include/cuda/std/__random/student_t_distribution.h +186 -0
  1020. cuda/cccl/headers/include/cuda/std/__random/uniform_int_distribution.h +341 -0
  1021. cuda/cccl/headers/include/cuda/std/__random/uniform_real_distribution.h +192 -0
  1022. cuda/cccl/headers/include/cuda/std/__random/weibull_distribution.h +189 -0
  1023. cuda/cccl/headers/include/cuda/std/__random_ +47 -0
  1024. cuda/cccl/headers/include/cuda/std/__ranges/access.h +303 -0
  1025. cuda/cccl/headers/include/cuda/std/__ranges/all.h +98 -0
  1026. cuda/cccl/headers/include/cuda/std/__ranges/compressed_movable_box.h +889 -0
  1027. cuda/cccl/headers/include/cuda/std/__ranges/concepts.h +302 -0
  1028. cuda/cccl/headers/include/cuda/std/__ranges/counted.h +90 -0
  1029. cuda/cccl/headers/include/cuda/std/__ranges/dangling.h +54 -0
  1030. cuda/cccl/headers/include/cuda/std/__ranges/data.h +136 -0
  1031. cuda/cccl/headers/include/cuda/std/__ranges/drop_view.h +389 -0
  1032. cuda/cccl/headers/include/cuda/std/__ranges/empty.h +109 -0
  1033. cuda/cccl/headers/include/cuda/std/__ranges/empty_view.h +77 -0
  1034. cuda/cccl/headers/include/cuda/std/__ranges/enable_borrowed_range.h +41 -0
  1035. cuda/cccl/headers/include/cuda/std/__ranges/enable_view.h +78 -0
  1036. cuda/cccl/headers/include/cuda/std/__ranges/from_range.h +36 -0
  1037. cuda/cccl/headers/include/cuda/std/__ranges/iota_view.h +264 -0
  1038. cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +409 -0
  1039. cuda/cccl/headers/include/cuda/std/__ranges/non_propagating_cache.h +210 -0
  1040. cuda/cccl/headers/include/cuda/std/__ranges/owning_view.h +163 -0
  1041. cuda/cccl/headers/include/cuda/std/__ranges/range_adaptor.h +111 -0
  1042. cuda/cccl/headers/include/cuda/std/__ranges/rbegin.h +175 -0
  1043. cuda/cccl/headers/include/cuda/std/__ranges/ref_view.h +121 -0
  1044. cuda/cccl/headers/include/cuda/std/__ranges/rend.h +182 -0
  1045. cuda/cccl/headers/include/cuda/std/__ranges/repeat_view.h +346 -0
  1046. cuda/cccl/headers/include/cuda/std/__ranges/single_view.h +155 -0
  1047. cuda/cccl/headers/include/cuda/std/__ranges/size.h +201 -0
  1048. cuda/cccl/headers/include/cuda/std/__ranges/subrange.h +510 -0
  1049. cuda/cccl/headers/include/cuda/std/__ranges/take_view.h +472 -0
  1050. cuda/cccl/headers/include/cuda/std/__ranges/take_while_view.h +259 -0
  1051. cuda/cccl/headers/include/cuda/std/__ranges/transform_view.h +522 -0
  1052. cuda/cccl/headers/include/cuda/std/__ranges/unwrap_end.h +52 -0
  1053. cuda/cccl/headers/include/cuda/std/__ranges/view_interface.h +183 -0
  1054. cuda/cccl/headers/include/cuda/std/__ranges/views.h +38 -0
  1055. cuda/cccl/headers/include/cuda/std/__semaphore/atomic_semaphore.h +234 -0
  1056. cuda/cccl/headers/include/cuda/std/__semaphore/counting_semaphore.h +51 -0
  1057. cuda/cccl/headers/include/cuda/std/__string/char_traits.h +190 -0
  1058. cuda/cccl/headers/include/cuda/std/__string/constexpr_c_functions.h +580 -0
  1059. cuda/cccl/headers/include/cuda/std/__string/helper_functions.h +296 -0
  1060. cuda/cccl/headers/include/cuda/std/__string/string_view.h +244 -0
  1061. cuda/cccl/headers/include/cuda/std/__string_ +29 -0
  1062. cuda/cccl/headers/include/cuda/std/__system_error/errc.h +51 -0
  1063. cuda/cccl/headers/include/cuda/std/__system_error_ +26 -0
  1064. cuda/cccl/headers/include/cuda/std/__thread/threading_support.h +106 -0
  1065. cuda/cccl/headers/include/cuda/std/__thread/threading_support_cuda.h +47 -0
  1066. cuda/cccl/headers/include/cuda/std/__thread/threading_support_external.h +41 -0
  1067. cuda/cccl/headers/include/cuda/std/__thread/threading_support_pthread.h +143 -0
  1068. cuda/cccl/headers/include/cuda/std/__thread/threading_support_win32.h +87 -0
  1069. cuda/cccl/headers/include/cuda/std/__tuple_dir/apply.h +82 -0
  1070. cuda/cccl/headers/include/cuda/std/__tuple_dir/get.h +122 -0
  1071. cuda/cccl/headers/include/cuda/std/__tuple_dir/ignore.h +51 -0
  1072. cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +120 -0
  1073. cuda/cccl/headers/include/cuda/std/__tuple_dir/sfinae_helpers.h +100 -0
  1074. cuda/cccl/headers/include/cuda/std/__tuple_dir/structured_bindings.h +212 -0
  1075. cuda/cccl/headers/include/cuda/std/__tuple_dir/tie.h +55 -0
  1076. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple.h +457 -0
  1077. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_cat.h +158 -0
  1078. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_constraints.h +286 -0
  1079. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_element.h +77 -0
  1080. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_indices.h +44 -0
  1081. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_leaf.h +452 -0
  1082. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +83 -0
  1083. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +68 -0
  1084. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_size.h +79 -0
  1085. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_types.h +35 -0
  1086. cuda/cccl/headers/include/cuda/std/__tuple_dir/vector_types.h +290 -0
  1087. cuda/cccl/headers/include/cuda/std/__type_traits/add_const.h +40 -0
  1088. cuda/cccl/headers/include/cuda/std/__type_traits/add_cv.h +40 -0
  1089. cuda/cccl/headers/include/cuda/std/__type_traits/add_lvalue_reference.h +62 -0
  1090. cuda/cccl/headers/include/cuda/std/__type_traits/add_pointer.h +65 -0
  1091. cuda/cccl/headers/include/cuda/std/__type_traits/add_rvalue_reference.h +62 -0
  1092. cuda/cccl/headers/include/cuda/std/__type_traits/add_volatile.h +40 -0
  1093. cuda/cccl/headers/include/cuda/std/__type_traits/aligned_storage.h +155 -0
  1094. cuda/cccl/headers/include/cuda/std/__type_traits/aligned_union.h +62 -0
  1095. cuda/cccl/headers/include/cuda/std/__type_traits/alignment_of.h +41 -0
  1096. cuda/cccl/headers/include/cuda/std/__type_traits/always_false.h +35 -0
  1097. cuda/cccl/headers/include/cuda/std/__type_traits/can_extract_key.h +68 -0
  1098. cuda/cccl/headers/include/cuda/std/__type_traits/common_reference.h +262 -0
  1099. cuda/cccl/headers/include/cuda/std/__type_traits/common_type.h +173 -0
  1100. cuda/cccl/headers/include/cuda/std/__type_traits/conditional.h +65 -0
  1101. cuda/cccl/headers/include/cuda/std/__type_traits/conjunction.h +67 -0
  1102. cuda/cccl/headers/include/cuda/std/__type_traits/copy_cv.h +50 -0
  1103. cuda/cccl/headers/include/cuda/std/__type_traits/copy_cvref.h +148 -0
  1104. cuda/cccl/headers/include/cuda/std/__type_traits/decay.h +83 -0
  1105. cuda/cccl/headers/include/cuda/std/__type_traits/dependent_type.h +35 -0
  1106. cuda/cccl/headers/include/cuda/std/__type_traits/disjunction.h +77 -0
  1107. cuda/cccl/headers/include/cuda/std/__type_traits/enable_if.h +43 -0
  1108. cuda/cccl/headers/include/cuda/std/__type_traits/extent.h +68 -0
  1109. cuda/cccl/headers/include/cuda/std/__type_traits/fold.h +47 -0
  1110. cuda/cccl/headers/include/cuda/std/__type_traits/has_unique_object_representation.h +46 -0
  1111. cuda/cccl/headers/include/cuda/std/__type_traits/has_virtual_destructor.h +42 -0
  1112. cuda/cccl/headers/include/cuda/std/__type_traits/integral_constant.h +62 -0
  1113. cuda/cccl/headers/include/cuda/std/__type_traits/is_abstract.h +42 -0
  1114. cuda/cccl/headers/include/cuda/std/__type_traits/is_aggregate.h +42 -0
  1115. cuda/cccl/headers/include/cuda/std/__type_traits/is_allocator.h +46 -0
  1116. cuda/cccl/headers/include/cuda/std/__type_traits/is_arithmetic.h +42 -0
  1117. cuda/cccl/headers/include/cuda/std/__type_traits/is_array.h +62 -0
  1118. cuda/cccl/headers/include/cuda/std/__type_traits/is_assignable.h +78 -0
  1119. cuda/cccl/headers/include/cuda/std/__type_traits/is_base_of.h +42 -0
  1120. cuda/cccl/headers/include/cuda/std/__type_traits/is_bounded_array.h +44 -0
  1121. cuda/cccl/headers/include/cuda/std/__type_traits/is_callable.h +60 -0
  1122. cuda/cccl/headers/include/cuda/std/__type_traits/is_char_like_type.h +38 -0
  1123. cuda/cccl/headers/include/cuda/std/__type_traits/is_class.h +42 -0
  1124. cuda/cccl/headers/include/cuda/std/__type_traits/is_comparable.h +78 -0
  1125. cuda/cccl/headers/include/cuda/std/__type_traits/is_compound.h +58 -0
  1126. cuda/cccl/headers/include/cuda/std/__type_traits/is_const.h +56 -0
  1127. cuda/cccl/headers/include/cuda/std/__type_traits/is_constant_evaluated.h +49 -0
  1128. cuda/cccl/headers/include/cuda/std/__type_traits/is_constructible.h +174 -0
  1129. cuda/cccl/headers/include/cuda/std/__type_traits/is_convertible.h +211 -0
  1130. cuda/cccl/headers/include/cuda/std/__type_traits/is_copy_assignable.h +43 -0
  1131. cuda/cccl/headers/include/cuda/std/__type_traits/is_copy_constructible.h +43 -0
  1132. cuda/cccl/headers/include/cuda/std/__type_traits/is_core_convertible.h +47 -0
  1133. cuda/cccl/headers/include/cuda/std/__type_traits/is_corresponding_member.h +42 -0
  1134. cuda/cccl/headers/include/cuda/std/__type_traits/is_default_constructible.h +40 -0
  1135. cuda/cccl/headers/include/cuda/std/__type_traits/is_destructible.h +115 -0
  1136. cuda/cccl/headers/include/cuda/std/__type_traits/is_empty.h +42 -0
  1137. cuda/cccl/headers/include/cuda/std/__type_traits/is_enum.h +42 -0
  1138. cuda/cccl/headers/include/cuda/std/__type_traits/is_execution_policy.h +63 -0
  1139. cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_arithmetic.h +38 -0
  1140. cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_floating_point.h +79 -0
  1141. cuda/cccl/headers/include/cuda/std/__type_traits/is_final.h +42 -0
  1142. cuda/cccl/headers/include/cuda/std/__type_traits/is_floating_point.h +53 -0
  1143. cuda/cccl/headers/include/cuda/std/__type_traits/is_fully_bounded_array.h +47 -0
  1144. cuda/cccl/headers/include/cuda/std/__type_traits/is_function.h +61 -0
  1145. cuda/cccl/headers/include/cuda/std/__type_traits/is_fundamental.h +56 -0
  1146. cuda/cccl/headers/include/cuda/std/__type_traits/is_implicitly_default_constructible.h +57 -0
  1147. cuda/cccl/headers/include/cuda/std/__type_traits/is_integer.h +45 -0
  1148. cuda/cccl/headers/include/cuda/std/__type_traits/is_integral.h +123 -0
  1149. cuda/cccl/headers/include/cuda/std/__type_traits/is_layout_compatible.h +45 -0
  1150. cuda/cccl/headers/include/cuda/std/__type_traits/is_literal_type.h +42 -0
  1151. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_function_pointer.h +79 -0
  1152. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_object_pointer.h +57 -0
  1153. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_pointer.h +57 -0
  1154. cuda/cccl/headers/include/cuda/std/__type_traits/is_move_assignable.h +43 -0
  1155. cuda/cccl/headers/include/cuda/std/__type_traits/is_move_constructible.h +42 -0
  1156. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_assignable.h +70 -0
  1157. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_constructible.h +72 -0
  1158. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_convertible.h +59 -0
  1159. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_copy_assignable.h +60 -0
  1160. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_copy_constructible.h +43 -0
  1161. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_default_constructible.h +54 -0
  1162. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_destructible.h +82 -0
  1163. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_move_assignable.h +60 -0
  1164. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_move_constructible.h +42 -0
  1165. cuda/cccl/headers/include/cuda/std/__type_traits/is_null_pointer.h +43 -0
  1166. cuda/cccl/headers/include/cuda/std/__type_traits/is_object.h +57 -0
  1167. cuda/cccl/headers/include/cuda/std/__type_traits/is_one_of.h +37 -0
  1168. cuda/cccl/headers/include/cuda/std/__type_traits/is_pod.h +42 -0
  1169. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer.h +60 -0
  1170. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer_interconvertible_base_of.h +84 -0
  1171. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer_interconvertible_with_class.h +42 -0
  1172. cuda/cccl/headers/include/cuda/std/__type_traits/is_polymorphic.h +42 -0
  1173. cuda/cccl/headers/include/cuda/std/__type_traits/is_primary_template.h +121 -0
  1174. cuda/cccl/headers/include/cuda/std/__type_traits/is_reference.h +95 -0
  1175. cuda/cccl/headers/include/cuda/std/__type_traits/is_referenceable.h +55 -0
  1176. cuda/cccl/headers/include/cuda/std/__type_traits/is_same.h +88 -0
  1177. cuda/cccl/headers/include/cuda/std/__type_traits/is_scalar.h +60 -0
  1178. cuda/cccl/headers/include/cuda/std/__type_traits/is_scoped_enum.h +49 -0
  1179. cuda/cccl/headers/include/cuda/std/__type_traits/is_signed.h +65 -0
  1180. cuda/cccl/headers/include/cuda/std/__type_traits/is_signed_integer.h +59 -0
  1181. cuda/cccl/headers/include/cuda/std/__type_traits/is_standard_layout.h +42 -0
  1182. cuda/cccl/headers/include/cuda/std/__type_traits/is_swappable.h +200 -0
  1183. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivial.h +42 -0
  1184. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_assignable.h +43 -0
  1185. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_constructible.h +43 -0
  1186. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_assignable.h +46 -0
  1187. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_constructible.h +45 -0
  1188. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copyable.h +42 -0
  1189. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_default_constructible.h +42 -0
  1190. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_destructible.h +58 -0
  1191. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_assignable.h +45 -0
  1192. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_constructible.h +44 -0
  1193. cuda/cccl/headers/include/cuda/std/__type_traits/is_unbounded_array.h +43 -0
  1194. cuda/cccl/headers/include/cuda/std/__type_traits/is_union.h +42 -0
  1195. cuda/cccl/headers/include/cuda/std/__type_traits/is_unsigned.h +66 -0
  1196. cuda/cccl/headers/include/cuda/std/__type_traits/is_unsigned_integer.h +59 -0
  1197. cuda/cccl/headers/include/cuda/std/__type_traits/is_valid_expansion.h +41 -0
  1198. cuda/cccl/headers/include/cuda/std/__type_traits/is_void.h +55 -0
  1199. cuda/cccl/headers/include/cuda/std/__type_traits/is_volatile.h +56 -0
  1200. cuda/cccl/headers/include/cuda/std/__type_traits/lazy.h +35 -0
  1201. cuda/cccl/headers/include/cuda/std/__type_traits/make_const_lvalue_ref.h +36 -0
  1202. cuda/cccl/headers/include/cuda/std/__type_traits/make_nbit_int.h +107 -0
  1203. cuda/cccl/headers/include/cuda/std/__type_traits/make_signed.h +140 -0
  1204. cuda/cccl/headers/include/cuda/std/__type_traits/make_unsigned.h +151 -0
  1205. cuda/cccl/headers/include/cuda/std/__type_traits/maybe_const.h +36 -0
  1206. cuda/cccl/headers/include/cuda/std/__type_traits/nat.h +39 -0
  1207. cuda/cccl/headers/include/cuda/std/__type_traits/negation.h +44 -0
  1208. cuda/cccl/headers/include/cuda/std/__type_traits/num_bits.h +122 -0
  1209. cuda/cccl/headers/include/cuda/std/__type_traits/promote.h +163 -0
  1210. cuda/cccl/headers/include/cuda/std/__type_traits/rank.h +60 -0
  1211. cuda/cccl/headers/include/cuda/std/__type_traits/reference_constructs_from_temporary.h +57 -0
  1212. cuda/cccl/headers/include/cuda/std/__type_traits/reference_converts_from_temporary.h +56 -0
  1213. cuda/cccl/headers/include/cuda/std/__type_traits/remove_all_extents.h +66 -0
  1214. cuda/cccl/headers/include/cuda/std/__type_traits/remove_const.h +59 -0
  1215. cuda/cccl/headers/include/cuda/std/__type_traits/remove_const_ref.h +37 -0
  1216. cuda/cccl/headers/include/cuda/std/__type_traits/remove_cv.h +57 -0
  1217. cuda/cccl/headers/include/cuda/std/__type_traits/remove_cvref.h +57 -0
  1218. cuda/cccl/headers/include/cuda/std/__type_traits/remove_extent.h +65 -0
  1219. cuda/cccl/headers/include/cuda/std/__type_traits/remove_pointer.h +73 -0
  1220. cuda/cccl/headers/include/cuda/std/__type_traits/remove_reference.h +72 -0
  1221. cuda/cccl/headers/include/cuda/std/__type_traits/remove_volatile.h +58 -0
  1222. cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +47 -0
  1223. cuda/cccl/headers/include/cuda/std/__type_traits/type_identity.h +40 -0
  1224. cuda/cccl/headers/include/cuda/std/__type_traits/type_list.h +1067 -0
  1225. cuda/cccl/headers/include/cuda/std/__type_traits/type_set.h +131 -0
  1226. cuda/cccl/headers/include/cuda/std/__type_traits/underlying_type.h +52 -0
  1227. cuda/cccl/headers/include/cuda/std/__type_traits/void_t.h +34 -0
  1228. cuda/cccl/headers/include/cuda/std/__utility/as_const.h +73 -0
  1229. cuda/cccl/headers/include/cuda/std/__utility/auto_cast.h +34 -0
  1230. cuda/cccl/headers/include/cuda/std/__utility/cmp.h +114 -0
  1231. cuda/cccl/headers/include/cuda/std/__utility/convert_to_integral.h +101 -0
  1232. cuda/cccl/headers/include/cuda/std/__utility/ctad_support.h +27 -0
  1233. cuda/cccl/headers/include/cuda/std/__utility/declval.h +76 -0
  1234. cuda/cccl/headers/include/cuda/std/__utility/delegate_constructors.h +51 -0
  1235. cuda/cccl/headers/include/cuda/std/__utility/exception_guard.h +162 -0
  1236. cuda/cccl/headers/include/cuda/std/__utility/exchange.h +46 -0
  1237. cuda/cccl/headers/include/cuda/std/__utility/forward.h +82 -0
  1238. cuda/cccl/headers/include/cuda/std/__utility/forward_like.h +82 -0
  1239. cuda/cccl/headers/include/cuda/std/__utility/in_place.h +86 -0
  1240. cuda/cccl/headers/include/cuda/std/__utility/integer_sequence.h +249 -0
  1241. cuda/cccl/headers/include/cuda/std/__utility/monostate.h +99 -0
  1242. cuda/cccl/headers/include/cuda/std/__utility/move.h +126 -0
  1243. cuda/cccl/headers/include/cuda/std/__utility/pair.h +791 -0
  1244. cuda/cccl/headers/include/cuda/std/__utility/piecewise_construct.h +37 -0
  1245. cuda/cccl/headers/include/cuda/std/__utility/pod_tuple.h +425 -0
  1246. cuda/cccl/headers/include/cuda/std/__utility/priority_tag.h +40 -0
  1247. cuda/cccl/headers/include/cuda/std/__utility/rel_ops.h +61 -0
  1248. cuda/cccl/headers/include/cuda/std/__utility/swap.h +64 -0
  1249. cuda/cccl/headers/include/cuda/std/__utility/to_underlying.h +40 -0
  1250. cuda/cccl/headers/include/cuda/std/__utility/typeid.h +421 -0
  1251. cuda/cccl/headers/include/cuda/std/__utility/undefined.h +34 -0
  1252. cuda/cccl/headers/include/cuda/std/__utility/unreachable.h +37 -0
  1253. cuda/cccl/headers/include/cuda/std/__variant/bad_variant_access.h +74 -0
  1254. cuda/cccl/headers/include/cuda/std/__variant/comparison.h +207 -0
  1255. cuda/cccl/headers/include/cuda/std/__variant/get.h +192 -0
  1256. cuda/cccl/headers/include/cuda/std/__variant/hash.h +82 -0
  1257. cuda/cccl/headers/include/cuda/std/__variant/sfinae_helpers.h +89 -0
  1258. cuda/cccl/headers/include/cuda/std/__variant/variant.h +250 -0
  1259. cuda/cccl/headers/include/cuda/std/__variant/variant_access.h +70 -0
  1260. cuda/cccl/headers/include/cuda/std/__variant/variant_base.h +683 -0
  1261. cuda/cccl/headers/include/cuda/std/__variant/variant_constraints.h +135 -0
  1262. cuda/cccl/headers/include/cuda/std/__variant/variant_match.h +126 -0
  1263. cuda/cccl/headers/include/cuda/std/__variant/variant_traits.h +184 -0
  1264. cuda/cccl/headers/include/cuda/std/__variant/variant_visit.h +225 -0
  1265. cuda/cccl/headers/include/cuda/std/__variant/visit.h +148 -0
  1266. cuda/cccl/headers/include/cuda/std/algorithm +138 -0
  1267. cuda/cccl/headers/include/cuda/std/array +519 -0
  1268. cuda/cccl/headers/include/cuda/std/atomic +810 -0
  1269. cuda/cccl/headers/include/cuda/std/barrier +42 -0
  1270. cuda/cccl/headers/include/cuda/std/bit +35 -0
  1271. cuda/cccl/headers/include/cuda/std/bitset +986 -0
  1272. cuda/cccl/headers/include/cuda/std/cassert +28 -0
  1273. cuda/cccl/headers/include/cuda/std/ccomplex +15 -0
  1274. cuda/cccl/headers/include/cuda/std/cfloat +59 -0
  1275. cuda/cccl/headers/include/cuda/std/charconv +31 -0
  1276. cuda/cccl/headers/include/cuda/std/chrono +26 -0
  1277. cuda/cccl/headers/include/cuda/std/climits +61 -0
  1278. cuda/cccl/headers/include/cuda/std/cmath +87 -0
  1279. cuda/cccl/headers/include/cuda/std/complex +50 -0
  1280. cuda/cccl/headers/include/cuda/std/concepts +48 -0
  1281. cuda/cccl/headers/include/cuda/std/cstddef +28 -0
  1282. cuda/cccl/headers/include/cuda/std/cstdint +178 -0
  1283. cuda/cccl/headers/include/cuda/std/cstdlib +31 -0
  1284. cuda/cccl/headers/include/cuda/std/cstring +110 -0
  1285. cuda/cccl/headers/include/cuda/std/ctime +155 -0
  1286. cuda/cccl/headers/include/cuda/std/detail/__config +22 -0
  1287. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/chrono +2506 -0
  1288. cuda/cccl/headers/include/cuda/std/execution +29 -0
  1289. cuda/cccl/headers/include/cuda/std/expected +30 -0
  1290. cuda/cccl/headers/include/cuda/std/functional +56 -0
  1291. cuda/cccl/headers/include/cuda/std/initializer_list +44 -0
  1292. cuda/cccl/headers/include/cuda/std/inplace_vector +2171 -0
  1293. cuda/cccl/headers/include/cuda/std/iterator +70 -0
  1294. cuda/cccl/headers/include/cuda/std/latch +34 -0
  1295. cuda/cccl/headers/include/cuda/std/limits +28 -0
  1296. cuda/cccl/headers/include/cuda/std/linalg +30 -0
  1297. cuda/cccl/headers/include/cuda/std/mdspan +38 -0
  1298. cuda/cccl/headers/include/cuda/std/memory +40 -0
  1299. cuda/cccl/headers/include/cuda/std/numbers +344 -0
  1300. cuda/cccl/headers/include/cuda/std/numeric +41 -0
  1301. cuda/cccl/headers/include/cuda/std/optional +31 -0
  1302. cuda/cccl/headers/include/cuda/std/ranges +70 -0
  1303. cuda/cccl/headers/include/cuda/std/ratio +416 -0
  1304. cuda/cccl/headers/include/cuda/std/semaphore +31 -0
  1305. cuda/cccl/headers/include/cuda/std/source_location +107 -0
  1306. cuda/cccl/headers/include/cuda/std/span +599 -0
  1307. cuda/cccl/headers/include/cuda/std/string_view +924 -0
  1308. cuda/cccl/headers/include/cuda/std/tuple +43 -0
  1309. cuda/cccl/headers/include/cuda/std/type_traits +176 -0
  1310. cuda/cccl/headers/include/cuda/std/utility +70 -0
  1311. cuda/cccl/headers/include/cuda/std/variant +32 -0
  1312. cuda/cccl/headers/include/cuda/std/version +240 -0
  1313. cuda/cccl/headers/include/cuda/stream +32 -0
  1314. cuda/cccl/headers/include/cuda/stream_ref +59 -0
  1315. cuda/cccl/headers/include/cuda/tma +25 -0
  1316. cuda/cccl/headers/include/cuda/type_traits +27 -0
  1317. cuda/cccl/headers/include/cuda/utility +28 -0
  1318. cuda/cccl/headers/include/cuda/version +16 -0
  1319. cuda/cccl/headers/include/cuda/warp +28 -0
  1320. cuda/cccl/headers/include/cuda/work_stealing +26 -0
  1321. cuda/cccl/headers/include/nv/detail/__preprocessor +169 -0
  1322. cuda/cccl/headers/include/nv/detail/__target_macros +739 -0
  1323. cuda/cccl/headers/include/nv/target +241 -0
  1324. cuda/cccl/headers/include/thrust/addressof.h +22 -0
  1325. cuda/cccl/headers/include/thrust/adjacent_difference.h +254 -0
  1326. cuda/cccl/headers/include/thrust/advance.h +60 -0
  1327. cuda/cccl/headers/include/thrust/allocate_unique.h +301 -0
  1328. cuda/cccl/headers/include/thrust/binary_search.h +1911 -0
  1329. cuda/cccl/headers/include/thrust/complex.h +859 -0
  1330. cuda/cccl/headers/include/thrust/copy.h +506 -0
  1331. cuda/cccl/headers/include/thrust/count.h +245 -0
  1332. cuda/cccl/headers/include/thrust/detail/adjacent_difference.inl +108 -0
  1333. cuda/cccl/headers/include/thrust/detail/alignment.h +81 -0
  1334. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +629 -0
  1335. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +193 -0
  1336. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +95 -0
  1337. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +79 -0
  1338. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +76 -0
  1339. cuda/cccl/headers/include/thrust/detail/allocator/no_throw_allocator.h +74 -0
  1340. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +113 -0
  1341. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +114 -0
  1342. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.h +77 -0
  1343. cuda/cccl/headers/include/thrust/detail/allocator_aware_execution_policy.h +96 -0
  1344. cuda/cccl/headers/include/thrust/detail/binary_search.inl +537 -0
  1345. cuda/cccl/headers/include/thrust/detail/caching_allocator.h +47 -0
  1346. cuda/cccl/headers/include/thrust/detail/complex/arithmetic.h +210 -0
  1347. cuda/cccl/headers/include/thrust/detail/complex/catrig.h +877 -0
  1348. cuda/cccl/headers/include/thrust/detail/complex/catrigf.h +591 -0
  1349. cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +234 -0
  1350. cuda/cccl/headers/include/thrust/detail/complex/ccoshf.h +162 -0
  1351. cuda/cccl/headers/include/thrust/detail/complex/cexp.h +194 -0
  1352. cuda/cccl/headers/include/thrust/detail/complex/cexpf.h +170 -0
  1353. cuda/cccl/headers/include/thrust/detail/complex/clog.h +222 -0
  1354. cuda/cccl/headers/include/thrust/detail/complex/clogf.h +209 -0
  1355. cuda/cccl/headers/include/thrust/detail/complex/complex.inl +263 -0
  1356. cuda/cccl/headers/include/thrust/detail/complex/cpow.h +53 -0
  1357. cuda/cccl/headers/include/thrust/detail/complex/cproj.h +75 -0
  1358. cuda/cccl/headers/include/thrust/detail/complex/csinh.h +228 -0
  1359. cuda/cccl/headers/include/thrust/detail/complex/csinhf.h +169 -0
  1360. cuda/cccl/headers/include/thrust/detail/complex/csqrt.h +178 -0
  1361. cuda/cccl/headers/include/thrust/detail/complex/csqrtf.h +174 -0
  1362. cuda/cccl/headers/include/thrust/detail/complex/ctanh.h +212 -0
  1363. cuda/cccl/headers/include/thrust/detail/complex/ctanhf.h +136 -0
  1364. cuda/cccl/headers/include/thrust/detail/complex/math_private.h +130 -0
  1365. cuda/cccl/headers/include/thrust/detail/complex/stream.h +73 -0
  1366. cuda/cccl/headers/include/thrust/detail/config/compiler.h +38 -0
  1367. cuda/cccl/headers/include/thrust/detail/config/config.h +43 -0
  1368. cuda/cccl/headers/include/thrust/detail/config/cpp_dialect.h +78 -0
  1369. cuda/cccl/headers/include/thrust/detail/config/device_system.h +57 -0
  1370. cuda/cccl/headers/include/thrust/detail/config/host_system.h +50 -0
  1371. cuda/cccl/headers/include/thrust/detail/config/memory_resource.h +41 -0
  1372. cuda/cccl/headers/include/thrust/detail/config/namespace.h +164 -0
  1373. cuda/cccl/headers/include/thrust/detail/config/simple_defines.h +48 -0
  1374. cuda/cccl/headers/include/thrust/detail/config.h +36 -0
  1375. cuda/cccl/headers/include/thrust/detail/contiguous_storage.h +227 -0
  1376. cuda/cccl/headers/include/thrust/detail/contiguous_storage.inl +272 -0
  1377. cuda/cccl/headers/include/thrust/detail/copy.h +70 -0
  1378. cuda/cccl/headers/include/thrust/detail/copy.inl +146 -0
  1379. cuda/cccl/headers/include/thrust/detail/copy_if.h +62 -0
  1380. cuda/cccl/headers/include/thrust/detail/copy_if.inl +114 -0
  1381. cuda/cccl/headers/include/thrust/detail/count.h +55 -0
  1382. cuda/cccl/headers/include/thrust/detail/count.inl +101 -0
  1383. cuda/cccl/headers/include/thrust/detail/device_ptr.inl +48 -0
  1384. cuda/cccl/headers/include/thrust/detail/equal.inl +105 -0
  1385. cuda/cccl/headers/include/thrust/detail/event_error.h +160 -0
  1386. cuda/cccl/headers/include/thrust/detail/execute_with_allocator.h +79 -0
  1387. cuda/cccl/headers/include/thrust/detail/execute_with_allocator_fwd.h +61 -0
  1388. cuda/cccl/headers/include/thrust/detail/execution_policy.h +120 -0
  1389. cuda/cccl/headers/include/thrust/detail/extrema.inl +196 -0
  1390. cuda/cccl/headers/include/thrust/detail/fill.inl +97 -0
  1391. cuda/cccl/headers/include/thrust/detail/find.inl +125 -0
  1392. cuda/cccl/headers/include/thrust/detail/for_each.inl +96 -0
  1393. cuda/cccl/headers/include/thrust/detail/function.h +49 -0
  1394. cuda/cccl/headers/include/thrust/detail/functional/actor.h +213 -0
  1395. cuda/cccl/headers/include/thrust/detail/functional/operators.h +384 -0
  1396. cuda/cccl/headers/include/thrust/detail/gather.inl +185 -0
  1397. cuda/cccl/headers/include/thrust/detail/generate.inl +97 -0
  1398. cuda/cccl/headers/include/thrust/detail/get_iterator_value.h +60 -0
  1399. cuda/cccl/headers/include/thrust/detail/inner_product.inl +130 -0
  1400. cuda/cccl/headers/include/thrust/detail/internal_functional.h +335 -0
  1401. cuda/cccl/headers/include/thrust/detail/logical.inl +125 -0
  1402. cuda/cccl/headers/include/thrust/detail/malloc_and_free.h +89 -0
  1403. cuda/cccl/headers/include/thrust/detail/malloc_and_free_fwd.h +45 -0
  1404. cuda/cccl/headers/include/thrust/detail/memory_algorithms.h +208 -0
  1405. cuda/cccl/headers/include/thrust/detail/merge.inl +288 -0
  1406. cuda/cccl/headers/include/thrust/detail/mismatch.inl +107 -0
  1407. cuda/cccl/headers/include/thrust/detail/nvtx_policy.h +41 -0
  1408. cuda/cccl/headers/include/thrust/detail/overlapped_copy.h +120 -0
  1409. cuda/cccl/headers/include/thrust/detail/partition.inl +390 -0
  1410. cuda/cccl/headers/include/thrust/detail/pointer.h +313 -0
  1411. cuda/cccl/headers/include/thrust/detail/preprocessor.h +652 -0
  1412. cuda/cccl/headers/include/thrust/detail/random_bijection.h +177 -0
  1413. cuda/cccl/headers/include/thrust/detail/range/head_flags.h +114 -0
  1414. cuda/cccl/headers/include/thrust/detail/range/tail_flags.h +128 -0
  1415. cuda/cccl/headers/include/thrust/detail/raw_pointer_cast.h +52 -0
  1416. cuda/cccl/headers/include/thrust/detail/raw_reference_cast.h +191 -0
  1417. cuda/cccl/headers/include/thrust/detail/reduce.inl +396 -0
  1418. cuda/cccl/headers/include/thrust/detail/reference.h +521 -0
  1419. cuda/cccl/headers/include/thrust/detail/reference_forward_declaration.h +35 -0
  1420. cuda/cccl/headers/include/thrust/detail/remove.inl +225 -0
  1421. cuda/cccl/headers/include/thrust/detail/replace.inl +243 -0
  1422. cuda/cccl/headers/include/thrust/detail/reverse.inl +100 -0
  1423. cuda/cccl/headers/include/thrust/detail/scan.inl +536 -0
  1424. cuda/cccl/headers/include/thrust/detail/scatter.inl +169 -0
  1425. cuda/cccl/headers/include/thrust/detail/seq.h +66 -0
  1426. cuda/cccl/headers/include/thrust/detail/sequence.inl +121 -0
  1427. cuda/cccl/headers/include/thrust/detail/set_operations.inl +993 -0
  1428. cuda/cccl/headers/include/thrust/detail/shuffle.inl +86 -0
  1429. cuda/cccl/headers/include/thrust/detail/sort.inl +385 -0
  1430. cuda/cccl/headers/include/thrust/detail/static_assert.h +56 -0
  1431. cuda/cccl/headers/include/thrust/detail/static_map.h +164 -0
  1432. cuda/cccl/headers/include/thrust/detail/swap_ranges.inl +77 -0
  1433. cuda/cccl/headers/include/thrust/detail/tabulate.inl +74 -0
  1434. cuda/cccl/headers/include/thrust/detail/temporary_array.h +150 -0
  1435. cuda/cccl/headers/include/thrust/detail/temporary_array.inl +121 -0
  1436. cuda/cccl/headers/include/thrust/detail/temporary_buffer.h +93 -0
  1437. cuda/cccl/headers/include/thrust/detail/transform_reduce.inl +81 -0
  1438. cuda/cccl/headers/include/thrust/detail/transform_scan.inl +173 -0
  1439. cuda/cccl/headers/include/thrust/detail/trivial_sequence.h +128 -0
  1440. cuda/cccl/headers/include/thrust/detail/tuple_meta_transform.h +59 -0
  1441. cuda/cccl/headers/include/thrust/detail/type_deduction.h +61 -0
  1442. cuda/cccl/headers/include/thrust/detail/type_traits/has_member_function.h +50 -0
  1443. cuda/cccl/headers/include/thrust/detail/type_traits/has_nested_type.h +43 -0
  1444. cuda/cccl/headers/include/thrust/detail/type_traits/is_call_possible.h +162 -0
  1445. cuda/cccl/headers/include/thrust/detail/type_traits/is_commutative.h +67 -0
  1446. cuda/cccl/headers/include/thrust/detail/type_traits/is_metafunction_defined.h +39 -0
  1447. cuda/cccl/headers/include/thrust/detail/type_traits/is_thrust_pointer.h +55 -0
  1448. cuda/cccl/headers/include/thrust/detail/type_traits/iterator/is_output_iterator.h +48 -0
  1449. cuda/cccl/headers/include/thrust/detail/type_traits/minimum_type.h +91 -0
  1450. cuda/cccl/headers/include/thrust/detail/type_traits/pointer_traits.h +332 -0
  1451. cuda/cccl/headers/include/thrust/detail/type_traits.h +143 -0
  1452. cuda/cccl/headers/include/thrust/detail/uninitialized_copy.inl +102 -0
  1453. cuda/cccl/headers/include/thrust/detail/uninitialized_fill.inl +97 -0
  1454. cuda/cccl/headers/include/thrust/detail/unique.inl +391 -0
  1455. cuda/cccl/headers/include/thrust/detail/use_default.h +34 -0
  1456. cuda/cccl/headers/include/thrust/detail/vector_base.h +613 -0
  1457. cuda/cccl/headers/include/thrust/detail/vector_base.inl +1216 -0
  1458. cuda/cccl/headers/include/thrust/device_allocator.h +134 -0
  1459. cuda/cccl/headers/include/thrust/device_delete.h +74 -0
  1460. cuda/cccl/headers/include/thrust/device_free.h +85 -0
  1461. cuda/cccl/headers/include/thrust/device_make_unique.h +56 -0
  1462. cuda/cccl/headers/include/thrust/device_malloc.h +84 -0
  1463. cuda/cccl/headers/include/thrust/device_malloc_allocator.h +190 -0
  1464. cuda/cccl/headers/include/thrust/device_new.h +112 -0
  1465. cuda/cccl/headers/include/thrust/device_new_allocator.h +179 -0
  1466. cuda/cccl/headers/include/thrust/device_ptr.h +196 -0
  1467. cuda/cccl/headers/include/thrust/device_reference.h +983 -0
  1468. cuda/cccl/headers/include/thrust/device_vector.h +576 -0
  1469. cuda/cccl/headers/include/thrust/distance.h +44 -0
  1470. cuda/cccl/headers/include/thrust/equal.h +247 -0
  1471. cuda/cccl/headers/include/thrust/execution_policy.h +252 -0
  1472. cuda/cccl/headers/include/thrust/extrema.h +657 -0
  1473. cuda/cccl/headers/include/thrust/fill.h +200 -0
  1474. cuda/cccl/headers/include/thrust/find.h +382 -0
  1475. cuda/cccl/headers/include/thrust/for_each.h +261 -0
  1476. cuda/cccl/headers/include/thrust/functional.h +399 -0
  1477. cuda/cccl/headers/include/thrust/gather.h +464 -0
  1478. cuda/cccl/headers/include/thrust/generate.h +193 -0
  1479. cuda/cccl/headers/include/thrust/host_vector.h +576 -0
  1480. cuda/cccl/headers/include/thrust/inner_product.h +264 -0
  1481. cuda/cccl/headers/include/thrust/iterator/constant_iterator.h +221 -0
  1482. cuda/cccl/headers/include/thrust/iterator/counting_iterator.h +338 -0
  1483. cuda/cccl/headers/include/thrust/iterator/detail/any_assign.h +48 -0
  1484. cuda/cccl/headers/include/thrust/iterator/detail/any_system_tag.h +43 -0
  1485. cuda/cccl/headers/include/thrust/iterator/detail/device_system_tag.h +43 -0
  1486. cuda/cccl/headers/include/thrust/iterator/detail/host_system_tag.h +42 -0
  1487. cuda/cccl/headers/include/thrust/iterator/detail/iterator_adaptor_base.h +83 -0
  1488. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_system.h +60 -0
  1489. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_traversal.h +65 -0
  1490. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h +56 -0
  1491. cuda/cccl/headers/include/thrust/iterator/detail/iterator_facade_category.h +184 -0
  1492. cuda/cccl/headers/include/thrust/iterator/detail/minimum_system.h +55 -0
  1493. cuda/cccl/headers/include/thrust/iterator/detail/normal_iterator.h +69 -0
  1494. cuda/cccl/headers/include/thrust/iterator/detail/retag.h +104 -0
  1495. cuda/cccl/headers/include/thrust/iterator/detail/tagged_iterator.h +80 -0
  1496. cuda/cccl/headers/include/thrust/iterator/detail/tuple_of_iterator_references.h +171 -0
  1497. cuda/cccl/headers/include/thrust/iterator/discard_iterator.h +163 -0
  1498. cuda/cccl/headers/include/thrust/iterator/iterator_adaptor.h +251 -0
  1499. cuda/cccl/headers/include/thrust/iterator/iterator_categories.h +211 -0
  1500. cuda/cccl/headers/include/thrust/iterator/iterator_facade.h +659 -0
  1501. cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +334 -0
  1502. cuda/cccl/headers/include/thrust/iterator/iterator_traversal_tags.h +64 -0
  1503. cuda/cccl/headers/include/thrust/iterator/offset_iterator.h +194 -0
  1504. cuda/cccl/headers/include/thrust/iterator/permutation_iterator.h +204 -0
  1505. cuda/cccl/headers/include/thrust/iterator/retag.h +72 -0
  1506. cuda/cccl/headers/include/thrust/iterator/reverse_iterator.h +51 -0
  1507. cuda/cccl/headers/include/thrust/iterator/shuffle_iterator.h +190 -0
  1508. cuda/cccl/headers/include/thrust/iterator/strided_iterator.h +152 -0
  1509. cuda/cccl/headers/include/thrust/iterator/tabulate_output_iterator.h +152 -0
  1510. cuda/cccl/headers/include/thrust/iterator/transform_input_output_iterator.h +225 -0
  1511. cuda/cccl/headers/include/thrust/iterator/transform_iterator.h +353 -0
  1512. cuda/cccl/headers/include/thrust/iterator/transform_output_iterator.h +190 -0
  1513. cuda/cccl/headers/include/thrust/iterator/zip_iterator.h +365 -0
  1514. cuda/cccl/headers/include/thrust/logical.h +290 -0
  1515. cuda/cccl/headers/include/thrust/memory.h +299 -0
  1516. cuda/cccl/headers/include/thrust/merge.h +726 -0
  1517. cuda/cccl/headers/include/thrust/mismatch.h +262 -0
  1518. cuda/cccl/headers/include/thrust/mr/allocator.h +227 -0
  1519. cuda/cccl/headers/include/thrust/mr/device_memory_resource.h +46 -0
  1520. cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +526 -0
  1521. cuda/cccl/headers/include/thrust/mr/disjoint_sync_pool.h +116 -0
  1522. cuda/cccl/headers/include/thrust/mr/disjoint_tls_pool.h +65 -0
  1523. cuda/cccl/headers/include/thrust/mr/fancy_pointer_resource.h +65 -0
  1524. cuda/cccl/headers/include/thrust/mr/host_memory_resource.h +42 -0
  1525. cuda/cccl/headers/include/thrust/mr/memory_resource.h +215 -0
  1526. cuda/cccl/headers/include/thrust/mr/new.h +98 -0
  1527. cuda/cccl/headers/include/thrust/mr/polymorphic_adaptor.h +61 -0
  1528. cuda/cccl/headers/include/thrust/mr/pool.h +526 -0
  1529. cuda/cccl/headers/include/thrust/mr/pool_options.h +172 -0
  1530. cuda/cccl/headers/include/thrust/mr/sync_pool.h +112 -0
  1531. cuda/cccl/headers/include/thrust/mr/tls_pool.h +62 -0
  1532. cuda/cccl/headers/include/thrust/mr/universal_memory_resource.h +29 -0
  1533. cuda/cccl/headers/include/thrust/mr/validator.h +54 -0
  1534. cuda/cccl/headers/include/thrust/pair.h +102 -0
  1535. cuda/cccl/headers/include/thrust/partition.h +1392 -0
  1536. cuda/cccl/headers/include/thrust/per_device_resource.h +110 -0
  1537. cuda/cccl/headers/include/thrust/random/detail/discard_block_engine.inl +182 -0
  1538. cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine.inl +153 -0
  1539. cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine_discard.h +97 -0
  1540. cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine.inl +149 -0
  1541. cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h +46 -0
  1542. cuda/cccl/headers/include/thrust/random/detail/mod.h +94 -0
  1543. cuda/cccl/headers/include/thrust/random/detail/normal_distribution.inl +185 -0
  1544. cuda/cccl/headers/include/thrust/random/detail/normal_distribution_base.h +157 -0
  1545. cuda/cccl/headers/include/thrust/random/detail/random_core_access.h +56 -0
  1546. cuda/cccl/headers/include/thrust/random/detail/subtract_with_carry_engine.inl +199 -0
  1547. cuda/cccl/headers/include/thrust/random/detail/uniform_int_distribution.inl +196 -0
  1548. cuda/cccl/headers/include/thrust/random/detail/uniform_real_distribution.inl +198 -0
  1549. cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine.inl +181 -0
  1550. cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine_max.h +186 -0
  1551. cuda/cccl/headers/include/thrust/random/discard_block_engine.h +238 -0
  1552. cuda/cccl/headers/include/thrust/random/linear_congruential_engine.h +287 -0
  1553. cuda/cccl/headers/include/thrust/random/linear_feedback_shift_engine.h +215 -0
  1554. cuda/cccl/headers/include/thrust/random/normal_distribution.h +256 -0
  1555. cuda/cccl/headers/include/thrust/random/subtract_with_carry_engine.h +245 -0
  1556. cuda/cccl/headers/include/thrust/random/uniform_int_distribution.h +259 -0
  1557. cuda/cccl/headers/include/thrust/random/uniform_real_distribution.h +257 -0
  1558. cuda/cccl/headers/include/thrust/random/xor_combine_engine.h +253 -0
  1559. cuda/cccl/headers/include/thrust/random.h +118 -0
  1560. cuda/cccl/headers/include/thrust/reduce.h +1114 -0
  1561. cuda/cccl/headers/include/thrust/remove.h +768 -0
  1562. cuda/cccl/headers/include/thrust/replace.h +826 -0
  1563. cuda/cccl/headers/include/thrust/reverse.h +215 -0
  1564. cuda/cccl/headers/include/thrust/scan.h +1671 -0
  1565. cuda/cccl/headers/include/thrust/scatter.h +446 -0
  1566. cuda/cccl/headers/include/thrust/sequence.h +277 -0
  1567. cuda/cccl/headers/include/thrust/set_operations.h +3027 -0
  1568. cuda/cccl/headers/include/thrust/shuffle.h +182 -0
  1569. cuda/cccl/headers/include/thrust/sort.h +1320 -0
  1570. cuda/cccl/headers/include/thrust/swap.h +147 -0
  1571. cuda/cccl/headers/include/thrust/system/cpp/detail/adjacent_difference.h +30 -0
  1572. cuda/cccl/headers/include/thrust/system/cpp/detail/assign_value.h +30 -0
  1573. cuda/cccl/headers/include/thrust/system/cpp/detail/binary_search.h +32 -0
  1574. cuda/cccl/headers/include/thrust/system/cpp/detail/copy.h +30 -0
  1575. cuda/cccl/headers/include/thrust/system/cpp/detail/copy_if.h +30 -0
  1576. cuda/cccl/headers/include/thrust/system/cpp/detail/count.h +29 -0
  1577. cuda/cccl/headers/include/thrust/system/cpp/detail/equal.h +29 -0
  1578. cuda/cccl/headers/include/thrust/system/cpp/detail/execution_policy.h +113 -0
  1579. cuda/cccl/headers/include/thrust/system/cpp/detail/extrema.h +30 -0
  1580. cuda/cccl/headers/include/thrust/system/cpp/detail/fill.h +29 -0
  1581. cuda/cccl/headers/include/thrust/system/cpp/detail/find.h +30 -0
  1582. cuda/cccl/headers/include/thrust/system/cpp/detail/for_each.h +30 -0
  1583. cuda/cccl/headers/include/thrust/system/cpp/detail/gather.h +29 -0
  1584. cuda/cccl/headers/include/thrust/system/cpp/detail/generate.h +29 -0
  1585. cuda/cccl/headers/include/thrust/system/cpp/detail/get_value.h +30 -0
  1586. cuda/cccl/headers/include/thrust/system/cpp/detail/inner_product.h +29 -0
  1587. cuda/cccl/headers/include/thrust/system/cpp/detail/iter_swap.h +30 -0
  1588. cuda/cccl/headers/include/thrust/system/cpp/detail/logical.h +29 -0
  1589. cuda/cccl/headers/include/thrust/system/cpp/detail/malloc_and_free.h +30 -0
  1590. cuda/cccl/headers/include/thrust/system/cpp/detail/memory.inl +55 -0
  1591. cuda/cccl/headers/include/thrust/system/cpp/detail/merge.h +30 -0
  1592. cuda/cccl/headers/include/thrust/system/cpp/detail/mismatch.h +29 -0
  1593. cuda/cccl/headers/include/thrust/system/cpp/detail/partition.h +30 -0
  1594. cuda/cccl/headers/include/thrust/system/cpp/detail/per_device_resource.h +29 -0
  1595. cuda/cccl/headers/include/thrust/system/cpp/detail/reduce.h +30 -0
  1596. cuda/cccl/headers/include/thrust/system/cpp/detail/reduce_by_key.h +30 -0
  1597. cuda/cccl/headers/include/thrust/system/cpp/detail/remove.h +30 -0
  1598. cuda/cccl/headers/include/thrust/system/cpp/detail/replace.h +29 -0
  1599. cuda/cccl/headers/include/thrust/system/cpp/detail/reverse.h +29 -0
  1600. cuda/cccl/headers/include/thrust/system/cpp/detail/scan.h +30 -0
  1601. cuda/cccl/headers/include/thrust/system/cpp/detail/scan_by_key.h +30 -0
  1602. cuda/cccl/headers/include/thrust/system/cpp/detail/scatter.h +29 -0
  1603. cuda/cccl/headers/include/thrust/system/cpp/detail/sequence.h +29 -0
  1604. cuda/cccl/headers/include/thrust/system/cpp/detail/set_operations.h +30 -0
  1605. cuda/cccl/headers/include/thrust/system/cpp/detail/sort.h +30 -0
  1606. cuda/cccl/headers/include/thrust/system/cpp/detail/swap_ranges.h +29 -0
  1607. cuda/cccl/headers/include/thrust/system/cpp/detail/tabulate.h +29 -0
  1608. cuda/cccl/headers/include/thrust/system/cpp/detail/temporary_buffer.h +29 -0
  1609. cuda/cccl/headers/include/thrust/system/cpp/detail/transform.h +29 -0
  1610. cuda/cccl/headers/include/thrust/system/cpp/detail/transform_reduce.h +29 -0
  1611. cuda/cccl/headers/include/thrust/system/cpp/detail/transform_scan.h +29 -0
  1612. cuda/cccl/headers/include/thrust/system/cpp/detail/uninitialized_copy.h +29 -0
  1613. cuda/cccl/headers/include/thrust/system/cpp/detail/uninitialized_fill.h +29 -0
  1614. cuda/cccl/headers/include/thrust/system/cpp/detail/unique.h +30 -0
  1615. cuda/cccl/headers/include/thrust/system/cpp/detail/unique_by_key.h +30 -0
  1616. cuda/cccl/headers/include/thrust/system/cpp/execution_policy.h +63 -0
  1617. cuda/cccl/headers/include/thrust/system/cpp/memory.h +105 -0
  1618. cuda/cccl/headers/include/thrust/system/cpp/memory_resource.h +70 -0
  1619. cuda/cccl/headers/include/thrust/system/cpp/pointer.h +118 -0
  1620. cuda/cccl/headers/include/thrust/system/cpp/vector.h +95 -0
  1621. cuda/cccl/headers/include/thrust/system/cuda/config.h +126 -0
  1622. cuda/cccl/headers/include/thrust/system/cuda/detail/adjacent_difference.h +218 -0
  1623. cuda/cccl/headers/include/thrust/system/cuda/detail/assign_value.h +124 -0
  1624. cuda/cccl/headers/include/thrust/system/cuda/detail/binary_search.h +29 -0
  1625. cuda/cccl/headers/include/thrust/system/cuda/detail/cdp_dispatch.h +72 -0
  1626. cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +272 -0
  1627. cuda/cccl/headers/include/thrust/system/cuda/detail/copy_if.h +251 -0
  1628. cuda/cccl/headers/include/thrust/system/cuda/detail/core/agent_launcher.h +280 -0
  1629. cuda/cccl/headers/include/thrust/system/cuda/detail/core/triple_chevron_launch.h +162 -0
  1630. cuda/cccl/headers/include/thrust/system/cuda/detail/core/util.h +578 -0
  1631. cuda/cccl/headers/include/thrust/system/cuda/detail/count.h +75 -0
  1632. cuda/cccl/headers/include/thrust/system/cuda/detail/cross_system.h +241 -0
  1633. cuda/cccl/headers/include/thrust/system/cuda/detail/dispatch.h +230 -0
  1634. cuda/cccl/headers/include/thrust/system/cuda/detail/equal.h +62 -0
  1635. cuda/cccl/headers/include/thrust/system/cuda/detail/error.inl +87 -0
  1636. cuda/cccl/headers/include/thrust/system/cuda/detail/execution_policy.h +266 -0
  1637. cuda/cccl/headers/include/thrust/system/cuda/detail/extrema.h +473 -0
  1638. cuda/cccl/headers/include/thrust/system/cuda/detail/fill.h +99 -0
  1639. cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +165 -0
  1640. cuda/cccl/headers/include/thrust/system/cuda/detail/for_each.h +83 -0
  1641. cuda/cccl/headers/include/thrust/system/cuda/detail/gather.h +89 -0
  1642. cuda/cccl/headers/include/thrust/system/cuda/detail/generate.h +59 -0
  1643. cuda/cccl/headers/include/thrust/system/cuda/detail/get_value.h +65 -0
  1644. cuda/cccl/headers/include/thrust/system/cuda/detail/inner_product.h +77 -0
  1645. cuda/cccl/headers/include/thrust/system/cuda/detail/iter_swap.h +79 -0
  1646. cuda/cccl/headers/include/thrust/system/cuda/detail/logical.h +29 -0
  1647. cuda/cccl/headers/include/thrust/system/cuda/detail/make_unsigned_special.h +55 -0
  1648. cuda/cccl/headers/include/thrust/system/cuda/detail/malloc_and_free.h +119 -0
  1649. cuda/cccl/headers/include/thrust/system/cuda/detail/memory.inl +55 -0
  1650. cuda/cccl/headers/include/thrust/system/cuda/detail/merge.h +205 -0
  1651. cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +88 -0
  1652. cuda/cccl/headers/include/thrust/system/cuda/detail/parallel_for.h +79 -0
  1653. cuda/cccl/headers/include/thrust/system/cuda/detail/partition.h +401 -0
  1654. cuda/cccl/headers/include/thrust/system/cuda/detail/per_device_resource.h +70 -0
  1655. cuda/cccl/headers/include/thrust/system/cuda/detail/reduce.h +774 -0
  1656. cuda/cccl/headers/include/thrust/system/cuda/detail/reduce_by_key.h +994 -0
  1657. cuda/cccl/headers/include/thrust/system/cuda/detail/remove.h +105 -0
  1658. cuda/cccl/headers/include/thrust/system/cuda/detail/replace.h +121 -0
  1659. cuda/cccl/headers/include/thrust/system/cuda/detail/reverse.h +83 -0
  1660. cuda/cccl/headers/include/thrust/system/cuda/detail/scan.h +340 -0
  1661. cuda/cccl/headers/include/thrust/system/cuda/detail/scan_by_key.h +412 -0
  1662. cuda/cccl/headers/include/thrust/system/cuda/detail/scatter.h +90 -0
  1663. cuda/cccl/headers/include/thrust/system/cuda/detail/sequence.h +29 -0
  1664. cuda/cccl/headers/include/thrust/system/cuda/detail/set_operations.h +1722 -0
  1665. cuda/cccl/headers/include/thrust/system/cuda/detail/sort.h +473 -0
  1666. cuda/cccl/headers/include/thrust/system/cuda/detail/swap_ranges.h +99 -0
  1667. cuda/cccl/headers/include/thrust/system/cuda/detail/tabulate.h +62 -0
  1668. cuda/cccl/headers/include/thrust/system/cuda/detail/temporary_buffer.h +132 -0
  1669. cuda/cccl/headers/include/thrust/system/cuda/detail/terminate.h +53 -0
  1670. cuda/cccl/headers/include/thrust/system/cuda/detail/transform.h +428 -0
  1671. cuda/cccl/headers/include/thrust/system/cuda/detail/transform_reduce.h +139 -0
  1672. cuda/cccl/headers/include/thrust/system/cuda/detail/transform_scan.h +117 -0
  1673. cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_copy.h +114 -0
  1674. cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_fill.h +102 -0
  1675. cuda/cccl/headers/include/thrust/system/cuda/detail/unique.h +288 -0
  1676. cuda/cccl/headers/include/thrust/system/cuda/detail/unique_by_key.h +307 -0
  1677. cuda/cccl/headers/include/thrust/system/cuda/detail/util.h +253 -0
  1678. cuda/cccl/headers/include/thrust/system/cuda/error.h +159 -0
  1679. cuda/cccl/headers/include/thrust/system/cuda/execution_policy.h +15 -0
  1680. cuda/cccl/headers/include/thrust/system/cuda/memory.h +118 -0
  1681. cuda/cccl/headers/include/thrust/system/cuda/memory_resource.h +115 -0
  1682. cuda/cccl/headers/include/thrust/system/cuda/pointer.h +155 -0
  1683. cuda/cccl/headers/include/thrust/system/cuda/vector.h +104 -0
  1684. cuda/cccl/headers/include/thrust/system/detail/bad_alloc.h +59 -0
  1685. cuda/cccl/headers/include/thrust/system/detail/errno.h +118 -0
  1686. cuda/cccl/headers/include/thrust/system/detail/error_category.inl +298 -0
  1687. cuda/cccl/headers/include/thrust/system/detail/error_code.inl +171 -0
  1688. cuda/cccl/headers/include/thrust/system/detail/error_condition.inl +119 -0
  1689. cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.h +51 -0
  1690. cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.inl +77 -0
  1691. cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.h +159 -0
  1692. cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.inl +370 -0
  1693. cuda/cccl/headers/include/thrust/system/detail/generic/copy.h +43 -0
  1694. cuda/cccl/headers/include/thrust/system/detail/generic/copy.inl +62 -0
  1695. cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.h +56 -0
  1696. cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.inl +145 -0
  1697. cuda/cccl/headers/include/thrust/system/detail/generic/count.h +46 -0
  1698. cuda/cccl/headers/include/thrust/system/detail/generic/count.inl +82 -0
  1699. cuda/cccl/headers/include/thrust/system/detail/generic/equal.h +47 -0
  1700. cuda/cccl/headers/include/thrust/system/detail/generic/equal.inl +58 -0
  1701. cuda/cccl/headers/include/thrust/system/detail/generic/extrema.h +65 -0
  1702. cuda/cccl/headers/include/thrust/system/detail/generic/extrema.inl +246 -0
  1703. cuda/cccl/headers/include/thrust/system/detail/generic/fill.h +52 -0
  1704. cuda/cccl/headers/include/thrust/system/detail/generic/find.h +47 -0
  1705. cuda/cccl/headers/include/thrust/system/detail/generic/find.inl +135 -0
  1706. cuda/cccl/headers/include/thrust/system/detail/generic/for_each.h +56 -0
  1707. cuda/cccl/headers/include/thrust/system/detail/generic/gather.h +71 -0
  1708. cuda/cccl/headers/include/thrust/system/detail/generic/gather.inl +94 -0
  1709. cuda/cccl/headers/include/thrust/system/detail/generic/generate.h +43 -0
  1710. cuda/cccl/headers/include/thrust/system/detail/generic/generate.inl +64 -0
  1711. cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.h +58 -0
  1712. cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.inl +70 -0
  1713. cuda/cccl/headers/include/thrust/system/detail/generic/logical.h +57 -0
  1714. cuda/cccl/headers/include/thrust/system/detail/generic/memory.h +61 -0
  1715. cuda/cccl/headers/include/thrust/system/detail/generic/memory.inl +83 -0
  1716. cuda/cccl/headers/include/thrust/system/detail/generic/merge.h +97 -0
  1717. cuda/cccl/headers/include/thrust/system/detail/generic/merge.inl +146 -0
  1718. cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.h +47 -0
  1719. cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.inl +67 -0
  1720. cuda/cccl/headers/include/thrust/system/detail/generic/partition.h +127 -0
  1721. cuda/cccl/headers/include/thrust/system/detail/generic/partition.inl +208 -0
  1722. cuda/cccl/headers/include/thrust/system/detail/generic/per_device_resource.h +41 -0
  1723. cuda/cccl/headers/include/thrust/system/detail/generic/reduce.h +69 -0
  1724. cuda/cccl/headers/include/thrust/system/detail/generic/reduce.inl +98 -0
  1725. cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.h +81 -0
  1726. cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.inl +183 -0
  1727. cuda/cccl/headers/include/thrust/system/detail/generic/remove.h +84 -0
  1728. cuda/cccl/headers/include/thrust/system/detail/generic/remove.inl +119 -0
  1729. cuda/cccl/headers/include/thrust/system/detail/generic/replace.h +93 -0
  1730. cuda/cccl/headers/include/thrust/system/detail/generic/replace.inl +172 -0
  1731. cuda/cccl/headers/include/thrust/system/detail/generic/reverse.h +46 -0
  1732. cuda/cccl/headers/include/thrust/system/detail/generic/reverse.inl +65 -0
  1733. cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.h +61 -0
  1734. cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.inl +125 -0
  1735. cuda/cccl/headers/include/thrust/system/detail/generic/scan.h +86 -0
  1736. cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.h +124 -0
  1737. cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.inl +229 -0
  1738. cuda/cccl/headers/include/thrust/system/detail/generic/scatter.h +71 -0
  1739. cuda/cccl/headers/include/thrust/system/detail/generic/scatter.inl +83 -0
  1740. cuda/cccl/headers/include/thrust/system/detail/generic/select_system.h +105 -0
  1741. cuda/cccl/headers/include/thrust/system/detail/generic/sequence.h +70 -0
  1742. cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.h +281 -0
  1743. cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.inl +474 -0
  1744. cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.h +52 -0
  1745. cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.inl +123 -0
  1746. cuda/cccl/headers/include/thrust/system/detail/generic/sort.h +111 -0
  1747. cuda/cccl/headers/include/thrust/system/detail/generic/sort.inl +176 -0
  1748. cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.h +42 -0
  1749. cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.inl +73 -0
  1750. cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.h +39 -0
  1751. cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.inl +54 -0
  1752. cuda/cccl/headers/include/thrust/system/detail/generic/tag.h +45 -0
  1753. cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.h +53 -0
  1754. cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.inl +81 -0
  1755. cuda/cccl/headers/include/thrust/system/detail/generic/transform.h +393 -0
  1756. cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.h +48 -0
  1757. cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.inl +54 -0
  1758. cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.h +78 -0
  1759. cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.inl +111 -0
  1760. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.h +43 -0
  1761. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.inl +164 -0
  1762. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.h +43 -0
  1763. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.inl +112 -0
  1764. cuda/cccl/headers/include/thrust/system/detail/generic/unique.h +69 -0
  1765. cuda/cccl/headers/include/thrust/system/detail/generic/unique.inl +112 -0
  1766. cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.h +80 -0
  1767. cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.inl +124 -0
  1768. cuda/cccl/headers/include/thrust/system/detail/internal/decompose.h +115 -0
  1769. cuda/cccl/headers/include/thrust/system/detail/sequential/adjacent_difference.h +68 -0
  1770. cuda/cccl/headers/include/thrust/system/detail/sequential/assign_value.h +40 -0
  1771. cuda/cccl/headers/include/thrust/system/detail/sequential/binary_search.h +136 -0
  1772. cuda/cccl/headers/include/thrust/system/detail/sequential/copy.h +120 -0
  1773. cuda/cccl/headers/include/thrust/system/detail/sequential/copy_backward.h +47 -0
  1774. cuda/cccl/headers/include/thrust/system/detail/sequential/copy_if.h +69 -0
  1775. cuda/cccl/headers/include/thrust/system/detail/sequential/count.h +29 -0
  1776. cuda/cccl/headers/include/thrust/system/detail/sequential/equal.h +29 -0
  1777. cuda/cccl/headers/include/thrust/system/detail/sequential/execution_policy.h +52 -0
  1778. cuda/cccl/headers/include/thrust/system/detail/sequential/extrema.h +109 -0
  1779. cuda/cccl/headers/include/thrust/system/detail/sequential/fill.h +29 -0
  1780. cuda/cccl/headers/include/thrust/system/detail/sequential/find.h +60 -0
  1781. cuda/cccl/headers/include/thrust/system/detail/sequential/for_each.h +72 -0
  1782. cuda/cccl/headers/include/thrust/system/detail/sequential/gather.h +29 -0
  1783. cuda/cccl/headers/include/thrust/system/detail/sequential/general_copy.h +120 -0
  1784. cuda/cccl/headers/include/thrust/system/detail/sequential/generate.h +29 -0
  1785. cuda/cccl/headers/include/thrust/system/detail/sequential/get_value.h +41 -0
  1786. cuda/cccl/headers/include/thrust/system/detail/sequential/inner_product.h +29 -0
  1787. cuda/cccl/headers/include/thrust/system/detail/sequential/insertion_sort.h +139 -0
  1788. cuda/cccl/headers/include/thrust/system/detail/sequential/iter_swap.h +43 -0
  1789. cuda/cccl/headers/include/thrust/system/detail/sequential/logical.h +29 -0
  1790. cuda/cccl/headers/include/thrust/system/detail/sequential/malloc_and_free.h +48 -0
  1791. cuda/cccl/headers/include/thrust/system/detail/sequential/merge.h +147 -0
  1792. cuda/cccl/headers/include/thrust/system/detail/sequential/mismatch.h +29 -0
  1793. cuda/cccl/headers/include/thrust/system/detail/sequential/partition.h +298 -0
  1794. cuda/cccl/headers/include/thrust/system/detail/sequential/per_device_resource.h +29 -0
  1795. cuda/cccl/headers/include/thrust/system/detail/sequential/reduce.h +62 -0
  1796. cuda/cccl/headers/include/thrust/system/detail/sequential/reduce_by_key.h +97 -0
  1797. cuda/cccl/headers/include/thrust/system/detail/sequential/remove.h +177 -0
  1798. cuda/cccl/headers/include/thrust/system/detail/sequential/replace.h +29 -0
  1799. cuda/cccl/headers/include/thrust/system/detail/sequential/reverse.h +29 -0
  1800. cuda/cccl/headers/include/thrust/system/detail/sequential/scan.h +152 -0
  1801. cuda/cccl/headers/include/thrust/system/detail/sequential/scan_by_key.h +143 -0
  1802. cuda/cccl/headers/include/thrust/system/detail/sequential/scatter.h +29 -0
  1803. cuda/cccl/headers/include/thrust/system/detail/sequential/sequence.h +29 -0
  1804. cuda/cccl/headers/include/thrust/system/detail/sequential/set_operations.h +204 -0
  1805. cuda/cccl/headers/include/thrust/system/detail/sequential/sort.h +120 -0
  1806. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.h +353 -0
  1807. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.h +121 -0
  1808. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.h +584 -0
  1809. cuda/cccl/headers/include/thrust/system/detail/sequential/swap_ranges.h +29 -0
  1810. cuda/cccl/headers/include/thrust/system/detail/sequential/tabulate.h +29 -0
  1811. cuda/cccl/headers/include/thrust/system/detail/sequential/temporary_buffer.h +29 -0
  1812. cuda/cccl/headers/include/thrust/system/detail/sequential/transform.h +29 -0
  1813. cuda/cccl/headers/include/thrust/system/detail/sequential/transform_reduce.h +29 -0
  1814. cuda/cccl/headers/include/thrust/system/detail/sequential/transform_scan.h +29 -0
  1815. cuda/cccl/headers/include/thrust/system/detail/sequential/trivial_copy.h +56 -0
  1816. cuda/cccl/headers/include/thrust/system/detail/sequential/uninitialized_copy.h +29 -0
  1817. cuda/cccl/headers/include/thrust/system/detail/sequential/uninitialized_fill.h +29 -0
  1818. cuda/cccl/headers/include/thrust/system/detail/sequential/unique.h +112 -0
  1819. cuda/cccl/headers/include/thrust/system/detail/sequential/unique_by_key.h +105 -0
  1820. cuda/cccl/headers/include/thrust/system/detail/system_error.inl +106 -0
  1821. cuda/cccl/headers/include/thrust/system/error_code.h +508 -0
  1822. cuda/cccl/headers/include/thrust/system/omp/detail/adjacent_difference.h +34 -0
  1823. cuda/cccl/headers/include/thrust/system/omp/detail/assign_value.h +17 -0
  1824. cuda/cccl/headers/include/thrust/system/omp/detail/binary_search.h +57 -0
  1825. cuda/cccl/headers/include/thrust/system/omp/detail/copy.h +61 -0
  1826. cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.h +39 -0
  1827. cuda/cccl/headers/include/thrust/system/omp/detail/count.h +17 -0
  1828. cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.h +48 -0
  1829. cuda/cccl/headers/include/thrust/system/omp/detail/equal.h +17 -0
  1830. cuda/cccl/headers/include/thrust/system/omp/detail/execution_policy.h +132 -0
  1831. cuda/cccl/headers/include/thrust/system/omp/detail/extrema.h +46 -0
  1832. cuda/cccl/headers/include/thrust/system/omp/detail/fill.h +17 -0
  1833. cuda/cccl/headers/include/thrust/system/omp/detail/find.h +33 -0
  1834. cuda/cccl/headers/include/thrust/system/omp/detail/for_each.h +74 -0
  1835. cuda/cccl/headers/include/thrust/system/omp/detail/gather.h +17 -0
  1836. cuda/cccl/headers/include/thrust/system/omp/detail/generate.h +17 -0
  1837. cuda/cccl/headers/include/thrust/system/omp/detail/get_value.h +17 -0
  1838. cuda/cccl/headers/include/thrust/system/omp/detail/inner_product.h +17 -0
  1839. cuda/cccl/headers/include/thrust/system/omp/detail/iter_swap.h +17 -0
  1840. cuda/cccl/headers/include/thrust/system/omp/detail/logical.h +17 -0
  1841. cuda/cccl/headers/include/thrust/system/omp/detail/malloc_and_free.h +17 -0
  1842. cuda/cccl/headers/include/thrust/system/omp/detail/merge.h +17 -0
  1843. cuda/cccl/headers/include/thrust/system/omp/detail/mismatch.h +17 -0
  1844. cuda/cccl/headers/include/thrust/system/omp/detail/partition.h +84 -0
  1845. cuda/cccl/headers/include/thrust/system/omp/detail/per_device_resource.h +16 -0
  1846. cuda/cccl/headers/include/thrust/system/omp/detail/pragma_omp.h +30 -0
  1847. cuda/cccl/headers/include/thrust/system/omp/detail/reduce.h +63 -0
  1848. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.h +48 -0
  1849. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.h +87 -0
  1850. cuda/cccl/headers/include/thrust/system/omp/detail/remove.h +67 -0
  1851. cuda/cccl/headers/include/thrust/system/omp/detail/replace.h +17 -0
  1852. cuda/cccl/headers/include/thrust/system/omp/detail/reverse.h +17 -0
  1853. cuda/cccl/headers/include/thrust/system/omp/detail/scan.h +216 -0
  1854. cuda/cccl/headers/include/thrust/system/omp/detail/scan_by_key.h +23 -0
  1855. cuda/cccl/headers/include/thrust/system/omp/detail/scatter.h +17 -0
  1856. cuda/cccl/headers/include/thrust/system/omp/detail/sequence.h +17 -0
  1857. cuda/cccl/headers/include/thrust/system/omp/detail/set_operations.h +17 -0
  1858. cuda/cccl/headers/include/thrust/system/omp/detail/sort.h +245 -0
  1859. cuda/cccl/headers/include/thrust/system/omp/detail/swap_ranges.h +17 -0
  1860. cuda/cccl/headers/include/thrust/system/omp/detail/tabulate.h +17 -0
  1861. cuda/cccl/headers/include/thrust/system/omp/detail/temporary_buffer.h +16 -0
  1862. cuda/cccl/headers/include/thrust/system/omp/detail/transform.h +17 -0
  1863. cuda/cccl/headers/include/thrust/system/omp/detail/transform_reduce.h +17 -0
  1864. cuda/cccl/headers/include/thrust/system/omp/detail/transform_scan.h +17 -0
  1865. cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_copy.h +17 -0
  1866. cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_fill.h +17 -0
  1867. cuda/cccl/headers/include/thrust/system/omp/detail/unique.h +52 -0
  1868. cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.h +56 -0
  1869. cuda/cccl/headers/include/thrust/system/omp/execution_policy.h +62 -0
  1870. cuda/cccl/headers/include/thrust/system/omp/memory.h +153 -0
  1871. cuda/cccl/headers/include/thrust/system/omp/memory_resource.h +71 -0
  1872. cuda/cccl/headers/include/thrust/system/omp/pointer.h +120 -0
  1873. cuda/cccl/headers/include/thrust/system/omp/vector.h +96 -0
  1874. cuda/cccl/headers/include/thrust/system/system_error.h +183 -0
  1875. cuda/cccl/headers/include/thrust/system/tbb/detail/adjacent_difference.h +33 -0
  1876. cuda/cccl/headers/include/thrust/system/tbb/detail/assign_value.h +17 -0
  1877. cuda/cccl/headers/include/thrust/system/tbb/detail/binary_search.h +17 -0
  1878. cuda/cccl/headers/include/thrust/system/tbb/detail/copy.h +59 -0
  1879. cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.h +117 -0
  1880. cuda/cccl/headers/include/thrust/system/tbb/detail/count.h +17 -0
  1881. cuda/cccl/headers/include/thrust/system/tbb/detail/equal.h +17 -0
  1882. cuda/cccl/headers/include/thrust/system/tbb/detail/execution_policy.h +113 -0
  1883. cuda/cccl/headers/include/thrust/system/tbb/detail/extrema.h +45 -0
  1884. cuda/cccl/headers/include/thrust/system/tbb/detail/fill.h +17 -0
  1885. cuda/cccl/headers/include/thrust/system/tbb/detail/find.h +28 -0
  1886. cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.h +72 -0
  1887. cuda/cccl/headers/include/thrust/system/tbb/detail/gather.h +17 -0
  1888. cuda/cccl/headers/include/thrust/system/tbb/detail/generate.h +17 -0
  1889. cuda/cccl/headers/include/thrust/system/tbb/detail/get_value.h +17 -0
  1890. cuda/cccl/headers/include/thrust/system/tbb/detail/inner_product.h +17 -0
  1891. cuda/cccl/headers/include/thrust/system/tbb/detail/iter_swap.h +17 -0
  1892. cuda/cccl/headers/include/thrust/system/tbb/detail/logical.h +17 -0
  1893. cuda/cccl/headers/include/thrust/system/tbb/detail/malloc_and_free.h +17 -0
  1894. cuda/cccl/headers/include/thrust/system/tbb/detail/merge.h +302 -0
  1895. cuda/cccl/headers/include/thrust/system/tbb/detail/mismatch.h +17 -0
  1896. cuda/cccl/headers/include/thrust/system/tbb/detail/partition.h +79 -0
  1897. cuda/cccl/headers/include/thrust/system/tbb/detail/per_device_resource.h +16 -0
  1898. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.h +121 -0
  1899. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.h +378 -0
  1900. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_intervals.h +118 -0
  1901. cuda/cccl/headers/include/thrust/system/tbb/detail/remove.h +66 -0
  1902. cuda/cccl/headers/include/thrust/system/tbb/detail/replace.h +17 -0
  1903. cuda/cccl/headers/include/thrust/system/tbb/detail/reverse.h +17 -0
  1904. cuda/cccl/headers/include/thrust/system/tbb/detail/scan.h +294 -0
  1905. cuda/cccl/headers/include/thrust/system/tbb/detail/scan_by_key.h +20 -0
  1906. cuda/cccl/headers/include/thrust/system/tbb/detail/scatter.h +17 -0
  1907. cuda/cccl/headers/include/thrust/system/tbb/detail/sequence.h +17 -0
  1908. cuda/cccl/headers/include/thrust/system/tbb/detail/set_operations.h +17 -0
  1909. cuda/cccl/headers/include/thrust/system/tbb/detail/sort.h +273 -0
  1910. cuda/cccl/headers/include/thrust/system/tbb/detail/swap_ranges.h +17 -0
  1911. cuda/cccl/headers/include/thrust/system/tbb/detail/tabulate.h +17 -0
  1912. cuda/cccl/headers/include/thrust/system/tbb/detail/temporary_buffer.h +16 -0
  1913. cuda/cccl/headers/include/thrust/system/tbb/detail/transform.h +17 -0
  1914. cuda/cccl/headers/include/thrust/system/tbb/detail/transform_reduce.h +17 -0
  1915. cuda/cccl/headers/include/thrust/system/tbb/detail/transform_scan.h +17 -0
  1916. cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_copy.h +17 -0
  1917. cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_fill.h +17 -0
  1918. cuda/cccl/headers/include/thrust/system/tbb/detail/unique.h +49 -0
  1919. cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.h +55 -0
  1920. cuda/cccl/headers/include/thrust/system/tbb/execution_policy.h +62 -0
  1921. cuda/cccl/headers/include/thrust/system/tbb/memory.h +139 -0
  1922. cuda/cccl/headers/include/thrust/system/tbb/memory_resource.h +57 -0
  1923. cuda/cccl/headers/include/thrust/system/tbb/pointer.h +106 -0
  1924. cuda/cccl/headers/include/thrust/system/tbb/vector.h +82 -0
  1925. cuda/cccl/headers/include/thrust/system_error.h +57 -0
  1926. cuda/cccl/headers/include/thrust/tabulate.h +125 -0
  1927. cuda/cccl/headers/include/thrust/transform.h +1056 -0
  1928. cuda/cccl/headers/include/thrust/transform_reduce.h +190 -0
  1929. cuda/cccl/headers/include/thrust/transform_scan.h +442 -0
  1930. cuda/cccl/headers/include/thrust/tuple.h +139 -0
  1931. cuda/cccl/headers/include/thrust/type_traits/integer_sequence.h +256 -0
  1932. cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +156 -0
  1933. cuda/cccl/headers/include/thrust/type_traits/is_execution_policy.h +65 -0
  1934. cuda/cccl/headers/include/thrust/type_traits/is_operator_less_or_greater_function_object.h +180 -0
  1935. cuda/cccl/headers/include/thrust/type_traits/is_operator_plus_function_object.h +112 -0
  1936. cuda/cccl/headers/include/thrust/type_traits/is_trivially_relocatable.h +333 -0
  1937. cuda/cccl/headers/include/thrust/type_traits/logical_metafunctions.h +42 -0
  1938. cuda/cccl/headers/include/thrust/type_traits/unwrap_contiguous_iterator.h +63 -0
  1939. cuda/cccl/headers/include/thrust/uninitialized_copy.h +300 -0
  1940. cuda/cccl/headers/include/thrust/uninitialized_fill.h +268 -0
  1941. cuda/cccl/headers/include/thrust/unique.h +1089 -0
  1942. cuda/cccl/headers/include/thrust/universal_allocator.h +101 -0
  1943. cuda/cccl/headers/include/thrust/universal_ptr.h +34 -0
  1944. cuda/cccl/headers/include/thrust/universal_vector.h +80 -0
  1945. cuda/cccl/headers/include/thrust/version.h +93 -0
  1946. cuda/cccl/headers/include/thrust/zip_function.h +149 -0
  1947. cuda/cccl/headers/include_paths.py +51 -0
  1948. cuda/cccl/headers/lib/cmake/cccl/cccl-config-version.cmake +25 -0
  1949. cuda/cccl/headers/lib/cmake/cccl/cccl-config.cmake +143 -0
  1950. cuda/cccl/headers/lib/cmake/cub/cub-config-version.cmake +29 -0
  1951. cuda/cccl/headers/lib/cmake/cub/cub-config.cmake +172 -0
  1952. cuda/cccl/headers/lib/cmake/cub/cub-header-search.cmake +15 -0
  1953. cuda/cccl/headers/lib/cmake/libcudacxx/libcudacxx-config-version.cmake +37 -0
  1954. cuda/cccl/headers/lib/cmake/libcudacxx/libcudacxx-config.cmake +297 -0
  1955. cuda/cccl/headers/lib/cmake/libcudacxx/libcudacxx-header-search.cmake +15 -0
  1956. cuda/cccl/headers/lib/cmake/thrust/FindTBB.cmake +498 -0
  1957. cuda/cccl/headers/lib/cmake/thrust/README.md +258 -0
  1958. cuda/cccl/headers/lib/cmake/thrust/thrust-config-version.cmake +37 -0
  1959. cuda/cccl/headers/lib/cmake/thrust/thrust-config.cmake +983 -0
  1960. cuda/cccl/headers/lib/cmake/thrust/thrust-header-search.cmake +15 -0
  1961. cuda/cccl/parallel/__init__.py +9 -0
  1962. cuda/cccl/parallel/experimental/__init__.py +24 -0
  1963. cuda/cccl/py.typed +0 -0
  1964. cuda/compute/__init__.py +91 -0
  1965. cuda/compute/_bindings.py +79 -0
  1966. cuda/compute/_bindings.pyi +516 -0
  1967. cuda/compute/_bindings_impl.pyx +2470 -0
  1968. cuda/compute/_caching.py +83 -0
  1969. cuda/compute/_cccl_interop.py +354 -0
  1970. cuda/compute/_odr_helpers.py +238 -0
  1971. cuda/compute/_utils/__init__.py +0 -0
  1972. cuda/compute/_utils/protocols.py +145 -0
  1973. cuda/compute/_utils/temp_storage_buffer.py +87 -0
  1974. cuda/compute/algorithms/__init__.py +62 -0
  1975. cuda/compute/algorithms/_histogram.py +243 -0
  1976. cuda/compute/algorithms/_reduce.py +205 -0
  1977. cuda/compute/algorithms/_scan.py +344 -0
  1978. cuda/compute/algorithms/_segmented_reduce.py +265 -0
  1979. cuda/compute/algorithms/_select.py +196 -0
  1980. cuda/compute/algorithms/_sort/__init__.py +23 -0
  1981. cuda/compute/algorithms/_sort/_merge_sort.py +235 -0
  1982. cuda/compute/algorithms/_sort/_radix_sort.py +263 -0
  1983. cuda/compute/algorithms/_sort/_segmented_sort.py +288 -0
  1984. cuda/compute/algorithms/_sort/_sort_common.py +52 -0
  1985. cuda/compute/algorithms/_three_way_partition.py +292 -0
  1986. cuda/compute/algorithms/_transform.py +317 -0
  1987. cuda/compute/algorithms/_unique_by_key.py +259 -0
  1988. cuda/compute/cccl/.gitkeep +0 -0
  1989. cuda/compute/cu12/_bindings_impl.cpython-312-x86_64-linux-gnu.so +0 -0
  1990. cuda/compute/cu12/cccl/libcccl.c.parallel.so +0 -0
  1991. cuda/compute/cu13/_bindings_impl.cpython-312-x86_64-linux-gnu.so +0 -0
  1992. cuda/compute/cu13/cccl/libcccl.c.parallel.so +0 -0
  1993. cuda/compute/determinism.py +3 -0
  1994. cuda/compute/iterators/__init__.py +23 -0
  1995. cuda/compute/iterators/_factories.py +251 -0
  1996. cuda/compute/iterators/_iterators.py +680 -0
  1997. cuda/compute/iterators/_permutation_iterator.py +266 -0
  1998. cuda/compute/iterators/_zip_iterator.py +268 -0
  1999. cuda/compute/numba_utils.py +54 -0
  2000. cuda/compute/op.py +140 -0
  2001. cuda/compute/struct.py +520 -0
  2002. cuda/compute/typing.py +36 -0
  2003. cuda/coop/__init__.py +8 -0
  2004. cuda/coop/_caching.py +48 -0
  2005. cuda/coop/_common.py +275 -0
  2006. cuda/coop/_nvrtc.py +92 -0
  2007. cuda/coop/_scan_op.py +181 -0
  2008. cuda/coop/_types.py +937 -0
  2009. cuda/coop/_typing.py +107 -0
  2010. cuda/coop/block/__init__.py +39 -0
  2011. cuda/coop/block/_block_exchange.py +251 -0
  2012. cuda/coop/block/_block_load_store.py +215 -0
  2013. cuda/coop/block/_block_merge_sort.py +125 -0
  2014. cuda/coop/block/_block_radix_sort.py +214 -0
  2015. cuda/coop/block/_block_reduce.py +294 -0
  2016. cuda/coop/block/_block_scan.py +983 -0
  2017. cuda/coop/warp/__init__.py +9 -0
  2018. cuda/coop/warp/_warp_merge_sort.py +92 -0
  2019. cuda/coop/warp/_warp_reduce.py +153 -0
  2020. cuda/coop/warp/_warp_scan.py +78 -0
  2021. cuda_cccl-0.4.3.dist-info/METADATA +84 -0
  2022. cuda_cccl-0.4.3.dist-info/RECORD +2024 -0
  2023. cuda_cccl-0.4.3.dist-info/WHEEL +5 -0
  2024. cuda_cccl-0.4.3.dist-info/licenses/LICENSE +1 -0
@@ -0,0 +1,2787 @@
1
+ // SPDX-FileCopyrightText: Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved.
2
+ // SPDX-License-Identifier: BSD-3
3
+
4
+ //! @file
5
+ //! cub::DeviceSegmentedSort provides device-wide, parallel operations for computing a batched sort across multiple,
6
+ //! non-overlapping sequences of data items residing within device-accessible memory.
7
+
8
+ #pragma once
9
+
10
+ #include <cub/config.cuh>
11
+
12
+ #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
13
+ # pragma GCC system_header
14
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
15
+ # pragma clang system_header
16
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
17
+ # pragma system_header
18
+ #endif // no system header
19
+
20
+ #include <cub/detail/choose_offset.cuh>
21
+ #include <cub/device/dispatch/dispatch_segmented_sort.cuh>
22
+ #include <cub/util_namespace.cuh>
23
+
24
+ #include <cuda/std/cstdint>
25
+
26
+ CUB_NAMESPACE_BEGIN
27
+
28
+ //! @rst
29
+ //! DeviceSegmentedSort provides device-wide, parallel operations for
30
+ //! computing a batched sort across multiple, non-overlapping sequences of
31
+ //! data items residing within device-accessible memory.
32
+ //!
33
+ //! Overview
34
+ //! +++++++++++++++++++++++++++++++++++++++++++++
35
+ //!
36
+ //! The algorithm arranges items into ascending (or descending) order.
37
+ //! The underlying sorting algorithm is undefined. Depending on the segment size,
38
+ //! it might be radix sort, merge sort or something else. Therefore, no
39
+ //! assumptions on the underlying implementation should be made.
40
+ //!
41
+ //! Differences from DeviceSegmentedRadixSort
42
+ //! +++++++++++++++++++++++++++++++++++++++++++++
43
+ //!
44
+ //! DeviceSegmentedRadixSort is optimized for significantly large segments (tens
45
+ //! of thousands of items and more). Nevertheless, some domains produce a wide
46
+ //! range of segment sizes. DeviceSegmentedSort partitions segments into size
47
+ //! groups and specializes sorting algorithms for each group. This approach leads
48
+ //! to better resource utilization in the presence of segment size imbalance or
49
+ //! moderate segment sizes (up to thousands of items).
50
+ //! This algorithm is more complex and consists of multiple kernels. This fact
51
+ //! leads to longer compilation times as well as larger binary sizes.
52
+ //!
53
+ //! Supported Types
54
+ //! +++++++++++++++++++++++++++++++++++++++++++++
55
+ //!
56
+ //! The algorithm has to satisfy the underlying algorithms' restrictions. Radix
57
+ //! sort usage restricts the list of supported types. Therefore,
58
+ //! DeviceSegmentedSort can sort all of the built-in C++ numeric primitive types
59
+ //! (``unsigned char``, ``int``, ``double``, etc.) as well as CUDA's ``__half`` and
60
+ //! ``__nv_bfloat16`` 16-bit floating-point types.
61
+ //!
62
+ //! Segments are not required to be contiguous. Any element of input(s) or
63
+ //! output(s) outside the specified segments will not be accessed nor modified.
64
+ //!
65
+ //! A simple example
66
+ //! +++++++++++++++++++++++++++++++++++++++++++++
67
+ //!
68
+ //! .. code-block:: c++
69
+ //!
70
+ //! #include <cub/cub.cuh>
71
+ //! // or equivalently <cub/device/device_segmented_sort.cuh>
72
+ //!
73
+ //! // Declare, allocate, and initialize device-accessible pointers
74
+ //! // for sorting data
75
+ //! int num_items; // e.g., 7
76
+ //! int num_segments; // e.g., 3
77
+ //! int *d_offsets; // e.g., [0, 3, 3, 7]
78
+ //! int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
79
+ //! int *d_keys_out; // e.g., [-, -, -, -, -, -, -]
80
+ //! int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6]
81
+ //! int *d_values_out; // e.g., [-, -, -, -, -, -, -]
82
+ //! ...
83
+ //!
84
+ //! // Determine temporary device storage requirements
85
+ //! void *d_temp_storage = nullptr;
86
+ //! size_t temp_storage_bytes = 0;
87
+ //! cub::DeviceSegmentedSort::SortPairs(
88
+ //! d_temp_storage, temp_storage_bytes,
89
+ //! d_keys_in, d_keys_out, d_values_in, d_values_out,
90
+ //! num_items, num_segments, d_offsets, d_offsets + 1);
91
+ //!
92
+ //! // Allocate temporary storage
93
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
94
+ //!
95
+ //! // Run sorting operation
96
+ //! cub::DeviceSegmentedSort::SortPairs(
97
+ //! d_temp_storage, temp_storage_bytes,
98
+ //! d_keys_in, d_keys_out, d_values_in, d_values_out,
99
+ //! num_items, num_segments, d_offsets, d_offsets + 1);
100
+ //!
101
+ //! // d_keys_out <-- [6, 7, 8, 0, 3, 5, 9]
102
+ //! // d_values_out <-- [1, 2, 0, 5, 4, 3, 6]
103
+ //!
104
+ //! @endrst
105
+ struct DeviceSegmentedSort
106
+ {
107
+ private:
108
+ // Returns the string literal used to label NVTX ranges for this algorithm family.
109
+ _CCCL_HOST_DEVICE static constexpr auto GetName() -> const char*
110
+ {
111
+ return "cub::DeviceSegmentedSort";
112
+ }
113
+
114
+ // Internal keys-only, ascending sort without an NVTX range: wraps the key pointers in a DoubleBuffer and forwards to DispatchSegmentedSort.
115
+ template <typename KeyT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
116
+ CUB_RUNTIME_FUNCTION static cudaError_t SortKeysNoNVTX(
117
+ void* d_temp_storage,
118
+ size_t& temp_storage_bytes,
119
+ const KeyT* d_keys_in,
120
+ KeyT* d_keys_out,
121
+ ::cuda::std::int64_t num_items,
122
+ ::cuda::std::int64_t num_segments,
123
+ BeginOffsetIteratorT d_begin_offsets,
124
+ EndOffsetIteratorT d_end_offsets,
125
+ cudaStream_t stream = 0)
126
+ {
127
+ constexpr bool is_overwrite_okay = false; // forwarded to Dispatch; presumably forbids clobbering the input buffer - confirm against DispatchSegmentedSort
128
+
129
+ using OffsetT =
130
+ detail::choose_signed_offset_t<detail::common_iterator_value_t<BeginOffsetIteratorT, EndOffsetIteratorT>>;
131
+ using DispatchT =
132
+ DispatchSegmentedSort<SortOrder::Ascending, KeyT, cub::NullType, OffsetT, BeginOffsetIteratorT, EndOffsetIteratorT>;
133
+
134
+ DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out); // const is cast away so the input pointer fits DoubleBuffer<KeyT>
135
+ DoubleBuffer<NullType> d_values; // keys-only sort: default-constructed NullType value buffers
136
+
137
+ return DispatchT::Dispatch(
138
+ d_temp_storage,
139
+ temp_storage_bytes,
140
+ d_keys,
141
+ d_values,
142
+ num_items,
143
+ num_segments,
144
+ d_begin_offsets,
145
+ d_end_offsets,
146
+ is_overwrite_okay,
147
+ stream);
148
+ }
149
+
150
+ public:
151
+ //! @name Keys-only
152
+ //! @{
153
+
154
+ //! @rst
155
+ //! Sorts segments of keys into ascending order.
156
+ //! Approximately ``num_items + 2 * num_segments`` auxiliary storage required.
157
+ //!
158
+ //! - The contents of the input data are not altered by the sorting operation.
159
+ //! - When the input is a contiguous sequence of segments, a single sequence
160
+ //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased
161
+ //! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where
162
+ //! the latter is specified as ``segment_offsets + 1``).
163
+ //! - SortKeys is not guaranteed to be stable. That is, suppose that ``i`` and
164
+ //! ``j`` are equivalent: neither one is less than the other. It is not
165
+ //! guaranteed that the relative order of these two elements will be
166
+ //! preserved by sort.
167
+ //! - The range ``[d_keys_out, d_keys_out + num_items)`` shall not overlap
168
+ //! ``[d_keys_in, d_keys_in + num_items)``,
169
+ //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor
170
+ //! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way.
171
+ //! - Segments are not required to be contiguous. For all index values ``i``
172
+ //! outside the specified segments ``d_keys_in[i]``, ``d_keys_out[i]`` will not
173
+ //! be accessed nor modified.
174
+ //!
175
+ //! Snippet
176
+ //! +++++++++++++++++++++++++++++++++++++++++++++
177
+ //!
178
+ //! The code snippet below illustrates the batched sorting of three segments
179
+ //! (with one zero-length segment) of ``int`` keys.
180
+ //!
181
+ //! .. code-block:: c++
182
+ //!
183
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_segmented_sort.cuh>
184
+ //!
185
+ //! // Declare, allocate, and initialize device-accessible
186
+ //! // pointers for sorting data
187
+ //! int num_items; // e.g., 7
188
+ //! int num_segments; // e.g., 3
189
+ //! int *d_offsets; // e.g., [0, 3, 3, 7]
190
+ //! int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
191
+ //! int *d_keys_out; // e.g., [-, -, -, -, -, -, -]
192
+ //! ...
193
+ //!
194
+ //! // Determine temporary device storage requirements
195
+ //! void *d_temp_storage = nullptr;
196
+ //! size_t temp_storage_bytes = 0;
197
+ //! cub::DeviceSegmentedSort::SortKeys(
198
+ //! d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out,
199
+ //! num_items, num_segments, d_offsets, d_offsets + 1);
200
+ //!
201
+ //! // Allocate temporary storage
202
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
203
+ //!
204
+ //! // Run sorting operation
205
+ //! cub::DeviceSegmentedSort::SortKeys(
206
+ //! d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out,
207
+ //! num_items, num_segments, d_offsets, d_offsets + 1);
208
+ //!
209
+ //! // d_keys_out <-- [6, 7, 8, 0, 3, 5, 9]
210
+ //!
211
+ //! @endrst
212
+ //!
213
+ //! @tparam KeyT
214
+ //! **[inferred]** Key type
215
+ //!
216
+ //! @tparam BeginOffsetIteratorT
217
+ //! **[inferred]** Random-access input iterator type for reading segment
218
+ //! beginning offsets @iterator
219
+ //!
220
+ //! @tparam EndOffsetIteratorT
221
+ //! **[inferred]** Random-access input iterator type for reading segment
222
+ //! ending offsets @iterator
223
+ //!
224
+ //! @param[in] d_temp_storage
225
+ //! Device-accessible allocation of temporary storage. When nullptr, the
226
+ //! required allocation size is written to `temp_storage_bytes` and no work
227
+ //! is done
228
+ //!
229
+ //! @param[in,out] temp_storage_bytes
230
+ //! Reference to size in bytes of `d_temp_storage` allocation
231
+ //!
232
+ //! @param[in] d_keys_in
233
+ //! Device-accessible pointer to the input data of key data to sort
234
+ //!
235
+ //! @param[out] d_keys_out
236
+ //! Device-accessible pointer to the sorted output sequence of key data
237
+ //!
238
+ //! @param[in] num_items
239
+ //! The total number of items to sort (across all segments)
240
+ //!
241
+ //! @param[in] num_segments
242
+ //! The number of segments that comprise the sorting data
243
+ //!
244
+ //! @param[in] d_begin_offsets
245
+ //! @rst
246
+ //! Random-access input iterator to the sequence of beginning offsets of
247
+ //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first
248
+ //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``
249
+ //! @endrst
250
+ //!
251
+ //! @param[in] d_end_offsets
252
+ //! @rst
253
+ //! Random-access input iterator to the sequence of ending offsets of length
254
+ //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of
255
+ //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``.
256
+ //! If ``d_end_offsets[i] - 1 < d_begin_offsets[i]``, the ``i``-th segment is considered empty.
257
+ //! @endrst
258
+ //!
259
+ //! @param[in] stream
260
+ //! @rst
261
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
262
+ //! @endrst
263
+ template <typename KeyT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
263
+ CUB_RUNTIME_FUNCTION static cudaError_t SortKeys(
264
+ void* d_temp_storage,
265
+ size_t& temp_storage_bytes,
266
+ const KeyT* d_keys_in,
267
+ KeyT* d_keys_out,
268
+ ::cuda::std::int64_t num_items,
269
+ ::cuda::std::int64_t num_segments,
270
+ BeginOffsetIteratorT d_begin_offsets,
271
+ EndOffsetIteratorT d_end_offsets,
272
+ cudaStream_t stream = 0)
273
+ {
274
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); // NOTE(review): macro defined outside this chunk; presumably opens an NVTX range named GetName() only when d_temp_storage is non-null - confirm
275
+ return SortKeysNoNVTX( // ascending keys-only sort; arguments are forwarded unchanged to the non-NVTX helper (pointer overload)
276
+ d_temp_storage,
277
+ temp_storage_bytes,
278
+ d_keys_in,
279
+ d_keys_out,
280
+ num_items,
281
+ num_segments,
282
+ d_begin_offsets,
283
+ d_end_offsets,
284
+ stream);
285
+ }
287
+
288
+ private:
289
+ // Internal version without NVTX range: descending keys-only sort from d_keys_in into d_keys_out. Wraps both pointers in a DoubleBuffer and forwards to DispatchSegmentedSort<SortOrder::Descending, ...> with is_overwrite_okay = false; returns the cudaError_t produced by Dispatch.
290
+ template <typename KeyT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
291
+ CUB_RUNTIME_FUNCTION static cudaError_t SortKeysDescendingNoNVTX(
292
+ void* d_temp_storage,
293
+ size_t& temp_storage_bytes,
294
+ const KeyT* d_keys_in,
295
+ KeyT* d_keys_out,
296
+ ::cuda::std::int64_t num_items,
297
+ ::cuda::std::int64_t num_segments,
298
+ BeginOffsetIteratorT d_begin_offsets,
299
+ EndOffsetIteratorT d_end_offsets,
300
+ cudaStream_t stream = 0)
301
+ {
302
+ constexpr bool is_overwrite_okay = false; // the public docs promise the input keys are not altered; NOTE(review): exact flag semantics live in DispatchSegmentedSort - confirm
303
+
304
+ using OffsetT = // offset type derived from the value types of both offset iterators
305
+ detail::choose_signed_offset_t<detail::common_iterator_value_t<BeginOffsetIteratorT, EndOffsetIteratorT>>;
306
+ using DispatchT = // keys-only dispatch: the value type is cub::NullType
307
+ DispatchSegmentedSort<SortOrder::Descending, KeyT, cub::NullType, OffsetT, BeginOffsetIteratorT, EndOffsetIteratorT>;
308
+
309
+ DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out); // const_cast only adapts the const input pointer to DoubleBuffer<KeyT>
310
+ DoubleBuffer<NullType> d_values; // default-constructed placeholder: there are no values in a keys-only sort
311
+
312
+ return DispatchT::Dispatch(
313
+ d_temp_storage,
314
+ temp_storage_bytes,
315
+ d_keys,
316
+ d_values,
317
+ num_items,
318
+ num_segments,
319
+ d_begin_offsets,
320
+ d_end_offsets,
321
+ is_overwrite_okay,
322
+ stream);
323
+ }
324
+
325
+ public:
326
+ //! @rst
327
+ //! Sorts segments of keys into descending order. Approximately
328
+ //! ``num_items + 2 * num_segments`` auxiliary storage required.
329
+ //!
330
+ //! - The contents of the input data are not altered by the sorting operation.
331
+ //! - When the input is a contiguous sequence of segments, a single sequence
332
+ //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased
333
+ //! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where
334
+ //! the latter is specified as ``segment_offsets + 1``).
335
+ //! - SortKeysDescending is not guaranteed to be stable. That is, suppose that
336
+ //! ``i`` and ``j`` are equivalent: neither one is less than the other. It is
337
+ //! not guaranteed that the relative order of these two elements will be
338
+ //! preserved by sort.
339
+ //! - The range ``[d_keys_out, d_keys_out + num_items)`` shall not overlap
340
+ //! ``[d_keys_in, d_keys_in + num_items)``,
341
+ //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor
342
+ //! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way.
343
+ //! - Segments are not required to be contiguous. For all index values ``i``
344
+ //! outside the specified segments ``d_keys_in[i]``, ``d_keys_out[i]`` will not
345
+ //! be accessed nor modified.
346
+ //!
347
+ //! Snippet
348
+ //! +++++++++++++++++++++++++++++++++++++++++++++
349
+ //!
350
+ //! The code snippet below illustrates the batched sorting of three segments
351
+ //! (with one zero-length segment) of ``int`` keys.
352
+ //!
353
+ //! .. code-block:: c++
354
+ //!
355
+ //! #include <cub/cub.cuh>
356
+ //! // or equivalently <cub/device/device_segmented_sort.cuh>
357
+ //!
358
+ //! // Declare, allocate, and initialize device-accessible pointers
359
+ //! // for sorting data
360
+ //! int num_items; // e.g., 7
361
+ //! int num_segments; // e.g., 3
362
+ //! int *d_offsets; // e.g., [0, 3, 3, 7]
363
+ //! int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
364
+ //! int *d_keys_out; // e.g., [-, -, -, -, -, -, -]
365
+ //! ...
366
+ //!
367
+ //! // Determine temporary device storage requirements
368
+ //! void *d_temp_storage = nullptr;
369
+ //! size_t temp_storage_bytes = 0;
370
+ //! cub::DeviceSegmentedSort::SortKeysDescending(
371
+ //! d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out,
372
+ //! num_items, num_segments, d_offsets, d_offsets + 1);
373
+ //!
374
+ //! // Allocate temporary storage
375
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
376
+ //!
377
+ //! // Run sorting operation
378
+ //! cub::DeviceSegmentedSort::SortKeysDescending(
379
+ //! d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out,
380
+ //! num_items, num_segments, d_offsets, d_offsets + 1);
381
+ //!
382
+ //! // d_keys_out <-- [8, 7, 6, 9, 5, 3, 0]
383
+ //!
384
+ //! @endrst
385
+ //!
386
+ //! @tparam KeyT
387
+ //! **[inferred]** Key type
388
+ //!
389
+ //! @tparam BeginOffsetIteratorT
390
+ //! **[inferred]** Random-access input iterator type for reading segment beginning offsets @iterator
391
+ //!
392
+ //! @tparam EndOffsetIteratorT
393
+ //! **[inferred]** Random-access input iterator type for reading segment ending offsets @iterator
394
+ //!
395
+ //! @param[in] d_temp_storage
396
+ //! Device-accessible allocation of temporary storage. When nullptr, the
397
+ //! required allocation size is written to `temp_storage_bytes` and no work is done
398
+ //!
399
+ //! @param[in,out] temp_storage_bytes
400
+ //! Reference to size in bytes of `d_temp_storage` allocation
401
+ //!
402
+ //! @param[in] d_keys_in
403
+ //! Device-accessible pointer to the input data of key data to sort
404
+ //!
405
+ //! @param[out] d_keys_out
406
+ //! Device-accessible pointer to the sorted output sequence of key data
407
+ //!
408
+ //! @param[in] num_items
409
+ //! The total number of items to sort (across all segments)
410
+ //!
411
+ //! @param[in] num_segments
412
+ //! The number of segments that comprise the sorting data
413
+ //!
414
+ //! @param[in] d_begin_offsets
415
+ //! @rst
416
+ //! Random-access input iterator to the sequence of beginning offsets of
417
+ //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first
418
+ //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``
419
+ //! @endrst
420
+ //!
421
+ //! @param[in] d_end_offsets
422
+ //! @rst
423
+ //! Random-access input iterator to the sequence of ending offsets of length
424
+ //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of
425
+ //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``.
426
+ //! If ``d_end_offsets[i] - 1 < d_begin_offsets[i]``, the ``i``-th segment is considered empty.
427
+ //! @endrst
428
+ //!
429
+ //! @param[in] stream
430
+ //! @rst
431
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
432
+ //! @endrst
433
+ template <typename KeyT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
434
+ CUB_RUNTIME_FUNCTION static cudaError_t SortKeysDescending(
435
+ void* d_temp_storage,
436
+ size_t& temp_storage_bytes,
437
+ const KeyT* d_keys_in,
438
+ KeyT* d_keys_out,
439
+ ::cuda::std::int64_t num_items,
440
+ ::cuda::std::int64_t num_segments,
441
+ BeginOffsetIteratorT d_begin_offsets,
442
+ EndOffsetIteratorT d_end_offsets,
443
+ cudaStream_t stream = 0)
444
+ {
445
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); // NOTE(review): macro defined outside this chunk; presumably opens an NVTX range named GetName() only when d_temp_storage is non-null - confirm
446
+ return SortKeysDescendingNoNVTX( // descending keys-only sort; arguments are forwarded unchanged to the non-NVTX helper (pointer overload)
447
+ d_temp_storage,
448
+ temp_storage_bytes,
449
+ d_keys_in,
450
+ d_keys_out,
451
+ num_items,
452
+ num_segments,
453
+ d_begin_offsets,
454
+ d_end_offsets,
455
+ stream);
456
+ }
457
+
458
+ private:
459
+ // Internal version without NVTX range: ascending keys-only sort on a caller-provided DoubleBuffer. Forwards to DispatchSegmentedSort<SortOrder::Ascending, ...> with is_overwrite_okay = true; returns the cudaError_t produced by Dispatch.
460
+ template <typename KeyT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
461
+ CUB_RUNTIME_FUNCTION static cudaError_t SortKeysNoNVTX(
462
+ void* d_temp_storage,
463
+ size_t& temp_storage_bytes,
464
+ DoubleBuffer<KeyT>& d_keys,
465
+ ::cuda::std::int64_t num_items,
466
+ ::cuda::std::int64_t num_segments,
467
+ BeginOffsetIteratorT d_begin_offsets,
468
+ EndOffsetIteratorT d_end_offsets,
469
+ cudaStream_t stream = 0)
470
+ {
471
+ constexpr bool is_overwrite_okay = true; // the public docs state both buffers of d_keys may be altered; NOTE(review): exact flag semantics live in DispatchSegmentedSort - confirm
472
+ using OffsetT = // offset type derived from the value types of both offset iterators
473
+ detail::choose_signed_offset_t<detail::common_iterator_value_t<BeginOffsetIteratorT, EndOffsetIteratorT>>;
474
+ using DispatchT = // keys-only dispatch: the value type is cub::NullType
475
+ DispatchSegmentedSort<SortOrder::Ascending, KeyT, cub::NullType, OffsetT, BeginOffsetIteratorT, EndOffsetIteratorT>;
476
+
477
+ DoubleBuffer<NullType> d_values; // default-constructed placeholder: there are no values in a keys-only sort
478
+
479
+ return DispatchT::Dispatch(
480
+ d_temp_storage,
481
+ temp_storage_bytes,
482
+ d_keys,
483
+ d_values,
484
+ num_items,
485
+ num_segments,
486
+ d_begin_offsets,
487
+ d_end_offsets,
488
+ is_overwrite_okay,
489
+ stream);
490
+ }
491
+
492
+ public:
493
+ //! @rst
494
+ //! Sorts segments of keys into ascending order. Approximately ``2 * num_segments`` auxiliary storage required.
495
+ //!
496
+ //! - The sorting operation is given a pair of key buffers managed by a
497
+ //! DoubleBuffer structure that indicates which of the two buffers is
498
+ //! "current" (and thus contains the input data to be sorted).
499
+ //! - The contents of both buffers may be altered by the sorting operation.
500
+ //! - Upon completion, the sorting operation will update the "current"
501
+ //! indicator within the DoubleBuffer wrapper to reference which of the two
502
+ //! buffers now contains the sorted output sequence (a function of the number
503
+ //! of key bits and the targeted device architecture).
504
+ //! - When the input is a contiguous sequence of segments, a single sequence
505
+ //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased
506
+ //! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where
507
+ //! the latter is specified as ``segment_offsets + 1``).
508
+ //! - SortKeys is not guaranteed to be stable. That is, suppose that
509
+ //! ``i`` and ``j`` are equivalent: neither one is less than the other. It is
510
+ //! not guaranteed that the relative order of these two elements will be
511
+ //! preserved by sort.
512
+ //! - Let ``cur = d_keys.Current()`` and ``alt = d_keys.Alternate()``.
513
+ //! The range ``[cur, cur + num_items)`` shall not overlap
514
+ //! ``[alt, alt + num_items)``. Both ranges shall not overlap
515
+ //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor
516
+ //! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way.
517
+ //! - Segments are not required to be contiguous. For all index values ``i``
518
+ //! outside the specified segments ``d_keys.Current()[i]``,
519
+ //! ``d_keys.Alternate()[i]`` will not be accessed nor modified.
520
+ //!
521
+ //! Snippet
522
+ //! +++++++++++++++++++++++++++++++++++++++++++++
523
+ //!
524
+ //! The code snippet below illustrates the batched sorting of three segments
525
+ //! (with one zero-length segment) of ``int`` keys.
526
+ //!
527
+ //! .. code-block:: c++
528
+ //!
529
+ //! #include <cub/cub.cuh>
530
+ //! // or equivalently <cub/device/device_segmented_sort.cuh>
531
+ //!
532
+ //! // Declare, allocate, and initialize device-accessible
533
+ //! // pointers for sorting data
534
+ //! int num_items; // e.g., 7
535
+ //! int num_segments; // e.g., 3
536
+ //! int *d_offsets; // e.g., [0, 3, 3, 7]
537
+ //! int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9]
538
+ //! int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -]
539
+ //! ...
540
+ //!
541
+ //! // Create a DoubleBuffer to wrap the pair of device pointers
542
+ //! cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
543
+ //!
544
+ //! // Determine temporary device storage requirements
545
+ //! void *d_temp_storage = nullptr;
546
+ //! size_t temp_storage_bytes = 0;
547
+ //! cub::DeviceSegmentedSort::SortKeys(
548
+ //! d_temp_storage, temp_storage_bytes, d_keys,
549
+ //! num_items, num_segments, d_offsets, d_offsets + 1);
550
+ //!
551
+ //! // Allocate temporary storage
552
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
553
+ //!
554
+ //! // Run sorting operation
555
+ //! cub::DeviceSegmentedSort::SortKeys(
556
+ //! d_temp_storage, temp_storage_bytes, d_keys,
557
+ //! num_items, num_segments, d_offsets, d_offsets + 1);
558
+ //!
559
+ //! // d_keys.Current() <-- [6, 7, 8, 0, 3, 5, 9]
560
+ //!
561
+ //! @endrst
562
+ //!
563
+ //! @tparam KeyT
564
+ //! **[inferred]** Key type
565
+ //!
566
+ //! @tparam BeginOffsetIteratorT
567
+ //! **[inferred]** Random-access input iterator type for reading segment
568
+ //! beginning offsets @iterator
569
+ //!
570
+ //! @tparam EndOffsetIteratorT
571
+ //! **[inferred]** Random-access input iterator type for reading segment
572
+ //! ending offsets @iterator
573
+ //!
574
+ //! @param[in] d_temp_storage
575
+ //! Device-accessible allocation of temporary storage. When nullptr, the
576
+ //! required allocation size is written to `temp_storage_bytes` and no
577
+ //! work is done
578
+ //!
579
+ //! @param[in,out] temp_storage_bytes
580
+ //! Reference to size in bytes of `d_temp_storage` allocation
581
+ //!
582
+ //! @param[in,out] d_keys
583
+ //! Reference to the double-buffer of keys whose "current" device-accessible
584
+ //! buffer contains the unsorted input keys and, upon return, is updated to
585
+ //! point to the sorted output keys
586
+ //!
587
+ //! @param[in] num_items
588
+ //! The total number of items to sort (across all segments)
589
+ //!
590
+ //! @param[in] num_segments
591
+ //! The number of segments that comprise the sorting data
592
+ //!
593
+ //! @param[in] d_begin_offsets
594
+ //! @rst
595
+ //! Random-access input iterator to the sequence of beginning offsets of
596
+ //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first
597
+ //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``
598
+ //! @endrst
599
+ //!
600
+ //! @param[in] d_end_offsets
601
+ //! @rst
602
+ //! Random-access input iterator to the sequence of ending offsets of length
603
+ //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of
604
+ //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``.
605
+ //! If ``d_end_offsets[i] - 1 < d_begin_offsets[i]``, the ``i``-th segment is considered empty.
606
+ //! @endrst
607
+ //!
608
+ //! @param[in] stream
609
+ //! @rst
610
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
611
+ //! @endrst
612
+ template <typename KeyT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
613
+ CUB_RUNTIME_FUNCTION static cudaError_t SortKeys(
614
+ void* d_temp_storage,
615
+ size_t& temp_storage_bytes,
616
+ DoubleBuffer<KeyT>& d_keys,
617
+ ::cuda::std::int64_t num_items,
618
+ ::cuda::std::int64_t num_segments,
619
+ BeginOffsetIteratorT d_begin_offsets,
620
+ EndOffsetIteratorT d_end_offsets,
621
+ cudaStream_t stream = 0)
622
+ {
623
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); // NOTE(review): macro defined outside this chunk; presumably opens an NVTX range named GetName() only when d_temp_storage is non-null - confirm
624
+ return SortKeysNoNVTX( // ascending keys-only sort; arguments are forwarded unchanged to the non-NVTX helper (DoubleBuffer overload)
625
+ d_temp_storage, temp_storage_bytes, d_keys, num_items, num_segments, d_begin_offsets, d_end_offsets, stream);
626
+ }
627
+
628
+ private:
629
+ // Internal version without NVTX range: descending keys-only sort on a caller-provided DoubleBuffer. Forwards to DispatchSegmentedSort<SortOrder::Descending, ...> with is_overwrite_okay = true; returns the cudaError_t produced by Dispatch.
630
+ template <typename KeyT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
631
+ CUB_RUNTIME_FUNCTION static cudaError_t SortKeysDescendingNoNVTX(
632
+ void* d_temp_storage,
633
+ size_t& temp_storage_bytes,
634
+ DoubleBuffer<KeyT>& d_keys,
635
+ ::cuda::std::int64_t num_items,
636
+ ::cuda::std::int64_t num_segments,
637
+ BeginOffsetIteratorT d_begin_offsets,
638
+ EndOffsetIteratorT d_end_offsets,
639
+ cudaStream_t stream = 0)
640
+ {
641
+ constexpr bool is_overwrite_okay = true; // the public docs state both buffers of d_keys may be altered; NOTE(review): exact flag semantics live in DispatchSegmentedSort - confirm
642
+ using OffsetT = // offset type derived from the value types of both offset iterators
643
+ detail::choose_signed_offset_t<detail::common_iterator_value_t<BeginOffsetIteratorT, EndOffsetIteratorT>>;
644
+ using DispatchT = // keys-only dispatch: the value type is cub::NullType
645
+ DispatchSegmentedSort<SortOrder::Descending, KeyT, cub::NullType, OffsetT, BeginOffsetIteratorT, EndOffsetIteratorT>;
646
+
647
+ DoubleBuffer<NullType> d_values; // default-constructed placeholder: there are no values in a keys-only sort
648
+
649
+ return DispatchT::Dispatch(
650
+ d_temp_storage,
651
+ temp_storage_bytes,
652
+ d_keys,
653
+ d_values,
654
+ num_items,
655
+ num_segments,
656
+ d_begin_offsets,
657
+ d_end_offsets,
658
+ is_overwrite_okay,
659
+ stream);
660
+ }
661
+
662
+ public:
663
+ //! @rst
664
+ //! Sorts segments of keys into descending order. Approximately
665
+ //! ``2 * num_segments`` auxiliary storage required.
666
+ //!
667
+ //! - The sorting operation is given a pair of key buffers managed by a
668
+ //! DoubleBuffer structure that indicates which of the two buffers is
669
+ //! "current" (and thus contains the input data to be sorted).
670
+ //! - The contents of both buffers may be altered by the sorting operation.
671
+ //! - Upon completion, the sorting operation will update the "current"
672
+ //! indicator within the DoubleBuffer wrapper to reference which of the two
673
+ //! buffers now contains the sorted output sequence (a function of the number
674
+ //! of key bits and the targeted device architecture).
675
+ //! - When the input is a contiguous sequence of segments, a single sequence
676
+ //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased
677
+ //! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where
678
+ //! the latter is specified as ``segment_offsets + 1``).
679
+ //! - SortKeysDescending is not guaranteed to be stable. That is, suppose that
680
+ //! ``i`` and ``j`` are equivalent: neither one is less than the other. It is
681
+ //! not guaranteed that the relative order of these two elements will be
682
+ //! preserved by sort.
683
+ //! - Let ``cur = d_keys.Current()`` and ``alt = d_keys.Alternate()``.
684
+ //! The range ``[cur, cur + num_items)`` shall not overlap
685
+ //! ``[alt, alt + num_items)``. Both ranges shall not overlap
686
+ //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor
687
+ //! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way.
688
+ //! - Segments are not required to be contiguous. For all index values ``i``
689
+ //! outside the specified segments ``d_keys.Current()[i]``,
690
+ //! ``d_keys.Alternate()[i]`` will not be accessed nor modified.
691
+ //!
692
+ //! Snippet
693
+ //! +++++++++++++++++++++++++++++++++++++++++++++
694
+ //!
695
+ //! The code snippet below illustrates the batched sorting of three segments
696
+ //! (with one zero-length segment) of ``int`` keys.
697
+ //!
698
+ //! .. code-block:: c++
699
+ //!
700
+ //! #include <cub/cub.cuh>
701
+ //! // or equivalently <cub/device/device_segmented_sort.cuh>
702
+ //!
703
+ //! // Declare, allocate, and initialize device-accessible pointers for
704
+ //! // sorting data
705
+ //! int num_items; // e.g., 7
706
+ //! int num_segments; // e.g., 3
707
+ //! int *d_offsets; // e.g., [0, 3, 3, 7]
708
+ //! int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9]
709
+ //! int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -]
710
+ //! ...
711
+ //!
712
+ //! // Create a DoubleBuffer to wrap the pair of device pointers
713
+ //! cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
714
+ //!
715
+ //! // Determine temporary device storage requirements
716
+ //! void *d_temp_storage = nullptr;
717
+ //! size_t temp_storage_bytes = 0;
718
+ //! cub::DeviceSegmentedSort::SortKeysDescending(
719
+ //! d_temp_storage, temp_storage_bytes, d_keys,
720
+ //! num_items, num_segments, d_offsets, d_offsets + 1);
721
+ //!
722
+ //! // Allocate temporary storage
723
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
724
+ //!
725
+ //! // Run sorting operation
726
+ //! cub::DeviceSegmentedSort::SortKeysDescending(
727
+ //! d_temp_storage, temp_storage_bytes, d_keys,
728
+ //! num_items, num_segments, d_offsets, d_offsets + 1);
729
+ //!
730
+ //! // d_keys.Current() <-- [8, 7, 6, 9, 5, 3, 0]
731
+ //!
732
+ //! @endrst
733
+ //!
734
+ //! @tparam KeyT
735
+ //! **[inferred]** Key type
736
+ //!
737
+ //! @tparam BeginOffsetIteratorT
738
+ //! **[inferred]** Random-access input iterator type for reading segment
739
+ //! beginning offsets @iterator
740
+ //!
741
+ //! @tparam EndOffsetIteratorT
742
+ //! **[inferred]** Random-access input iterator type for reading segment
743
+ //! ending offsets @iterator
744
+ //!
745
+ //! @param[in] d_temp_storage
746
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
747
+ //! required allocation size is written to `temp_storage_bytes` and no work
748
+ //! is done
749
+ //!
750
+ //! @param[in,out] temp_storage_bytes
751
+ //! Reference to size in bytes of `d_temp_storage` allocation
752
+ //!
753
+ //! @param[in,out] d_keys
754
+ //! Reference to the double-buffer of keys whose "current" device-accessible
755
+ //! buffer contains the unsorted input keys and, upon return, is updated to
756
+ //! point to the sorted output keys
757
+ //!
758
+ //! @param[in] num_items
759
+ //! The total number of items to sort (across all segments)
760
+ //!
761
+ //! @param[in] num_segments
762
+ //! The number of segments that comprise the sorting data
763
+ //!
764
+ //! @param[in] d_begin_offsets
765
+ //! @rst
766
+ //! Random-access input iterator to the sequence of beginning offsets of
767
+ //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first
768
+ //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``
769
+ //! @endrst
770
+ //!
771
+ //! @param[in] d_end_offsets
772
+ //! @rst
773
+ //! Random-access input iterator to the sequence of ending offsets of length
774
+ //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of
775
+ //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``.
776
+ //! If ``d_end_offsets[i] - 1 < d_begin_offsets[i]``, the ``i``-th segment is considered empty.
777
+ //! @endrst
778
+ //!
779
+ //! @param[in] stream
780
+ //! @rst
781
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
782
+ //! @endrst
783
+ template <typename KeyT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
784
+ CUB_RUNTIME_FUNCTION static cudaError_t SortKeysDescending(
785
+ void* d_temp_storage,
786
+ size_t& temp_storage_bytes,
787
+ DoubleBuffer<KeyT>& d_keys,
788
+ ::cuda::std::int64_t num_items,
789
+ ::cuda::std::int64_t num_segments,
790
+ BeginOffsetIteratorT d_begin_offsets,
791
+ EndOffsetIteratorT d_end_offsets,
792
+ cudaStream_t stream = 0)
793
+ {
794
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); // NOTE(review): macro defined outside this chunk; presumably opens an NVTX range named GetName() only when d_temp_storage is non-null - confirm
795
+ return SortKeysDescendingNoNVTX( // descending keys-only sort; arguments are forwarded unchanged to the non-NVTX helper (DoubleBuffer overload)
796
+ d_temp_storage, temp_storage_bytes, d_keys, num_items, num_segments, d_begin_offsets, d_end_offsets, stream);
797
+ }
798
+
799
+ //! @rst
800
+ //! Sorts segments of keys into ascending order. Approximately
801
+ //! ``num_items + 2 * num_segments`` auxiliary storage required.
802
+ //!
803
+ //! - The contents of the input data are not altered by the sorting operation.
804
+ //! - When the input is a contiguous sequence of segments, a single sequence
805
+ //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased
806
+ //! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where
807
+ //! the latter is specified as ``segment_offsets + 1``).
808
+ //! - StableSortKeys is stable: it preserves the relative ordering of
809
+ //! equivalent elements. That is, if ``x`` and ``y`` are elements such that
810
+ //! ``x`` precedes ``y``, and if the two elements are equivalent (neither
811
+ //! ``x < y`` nor ``y < x``) then a postcondition of stable sort is that
812
+ //! ``x`` still precedes ``y``.
813
+ //! - The range ``[d_keys_out, d_keys_out + num_items)`` shall not overlap
814
+ //! ``[d_keys_in, d_keys_in + num_items)``,
815
+ //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor
816
+ //! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way.
817
+ //! - Segments are not required to be contiguous. For all index values ``i``
818
+ //! outside the specified segments ``d_keys_in[i]``, ``d_keys_out[i]`` will not
819
+ //! be accessed nor modified.
820
+ //!
821
+ //! Snippet
822
+ //! +++++++++++++++++++++++++++++++++++++++++++++
823
+ //!
824
+ //! The code snippet below illustrates the batched sorting of three segments
825
+ //! (with one zero-length segment) of ``int`` keys.
826
+ //!
827
+ //! .. code-block:: c++
828
+ //!
829
+ //! #include <cub/cub.cuh>
830
+ //! // or equivalently <cub/device/device_segmented_sort.cuh>
831
+ //!
832
+ //! // Declare, allocate, and initialize device-accessible pointers
833
+ //! // for sorting data
834
+ //! int num_items; // e.g., 7
835
+ //! int num_segments; // e.g., 3
836
+ //! int *d_offsets; // e.g., [0, 3, 3, 7]
837
+ //! int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
838
+ //! int *d_keys_out; // e.g., [-, -, -, -, -, -, -]
839
+ //! ...
840
+ //!
841
+ //! // Determine temporary device storage requirements
842
+ //! void *d_temp_storage = nullptr;
843
+ //! size_t temp_storage_bytes = 0;
844
+ //! cub::DeviceSegmentedSort::StableSortKeys(
845
+ //! d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out,
846
+ //! num_items, num_segments, d_offsets, d_offsets + 1);
847
+ //!
848
+ //! // Allocate temporary storage
849
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
850
+ //!
851
+ //! // Run sorting operation
852
+ //! cub::DeviceSegmentedSort::StableSortKeys(
853
+ //! d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out,
854
+ //! num_items, num_segments, d_offsets, d_offsets + 1);
855
+ //!
856
+ //! // d_keys_out <-- [6, 7, 8, 0, 3, 5, 9]
857
+ //!
858
+ //! @endrst
859
+ //!
860
+ //! @tparam KeyT
861
+ //! **[inferred]** Key type
862
+ //!
863
+ //! @tparam BeginOffsetIteratorT
864
+ //! **[inferred]** Random-access input iterator type for reading segment
865
+ //! beginning offsets @iterator
866
+ //!
867
+ //! @tparam EndOffsetIteratorT
868
+ //! **[inferred]** Random-access input iterator type for reading segment
869
+ //! ending offsets @iterator
870
+ //!
871
+ //! @param[in] d_temp_storage
872
+ //! Device-accessible allocation of temporary storage. When nullptr, the
873
+ //! required allocation size is written to `temp_storage_bytes` and no work
874
+ //! is done
875
+ //!
876
+ //! @param[in,out] temp_storage_bytes
877
+ //! Reference to size in bytes of `d_temp_storage` allocation
878
+ //!
879
+ //! @param[in] d_keys_in
880
+ //! Device-accessible pointer to the input data of key data to sort
881
+ //!
882
+ //! @param[out] d_keys_out
883
+ //! Device-accessible pointer to the sorted output sequence of key data
884
+ //!
885
+ //! @param[in] num_items
886
+ //! The total number of items to sort (across all segments)
887
+ //!
888
+ //! @param[in] num_segments
889
+ //! The number of segments that comprise the sorting data
890
+ //!
891
+ //! @param[in] d_begin_offsets
892
+ //! @rst
893
+ //! Random-access input iterator to the sequence of beginning offsets of
894
+ //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first
895
+ //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``
896
+ //! @endrst
897
+ //!
898
+ //! @param[in] d_end_offsets
899
+ //! @rst
900
+ //! Random-access input iterator to the sequence of ending offsets of length
901
+ //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of
902
+ //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``.
903
+ //! If ``d_end_offsets[i] - 1 < d_begin_offsets[i]``, the ``i``-th segment is considered empty.
904
+ //! @endrst
905
+ //!
906
+ //! @param[in] stream
907
+ //! @rst
908
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
909
+ //! @endrst
910
+ template <typename KeyT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
911
+ CUB_RUNTIME_FUNCTION static cudaError_t StableSortKeys(
912
+ void* d_temp_storage,
913
+ size_t& temp_storage_bytes,
914
+ const KeyT* d_keys_in,
915
+ KeyT* d_keys_out,
916
+ ::cuda::std::int64_t num_items,
917
+ ::cuda::std::int64_t num_segments,
918
+ BeginOffsetIteratorT d_begin_offsets,
919
+ EndOffsetIteratorT d_end_offsets,
920
+ cudaStream_t stream = 0)
921
+ {
922
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); // NOTE(review): macro defined outside this chunk; presumably opens an NVTX range named GetName() only when d_temp_storage is non-null - confirm
923
+ return SortKeysNoNVTX<KeyT, BeginOffsetIteratorT, EndOffsetIteratorT>( // the stable entry point forwards to the same ascending helper that SortKeys calls (pointer overload, template arguments given explicitly)
924
+ d_temp_storage,
925
+ temp_storage_bytes,
926
+ d_keys_in,
927
+ d_keys_out,
928
+ num_items,
929
+ num_segments,
930
+ d_begin_offsets,
931
+ d_end_offsets,
932
+ stream);
933
+ }
934
+
935
+ //! @rst
936
+ //! Sorts segments of keys into descending order.
937
+ //! Approximately ``num_items + 2 * num_segments`` auxiliary storage required.
938
+ //!
939
+ //! - The contents of the input data are not altered by the sorting operation.
940
+ //! - When the input is a contiguous sequence of segments, a single sequence
941
+ //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased
942
+ //! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where
943
+ //! the latter is specified as ``segment_offsets + 1``).
944
+ //! - StableSortKeysDescending is stable: it preserves the relative ordering of
945
+ //! equivalent elements. That is, if ``x`` and ``y`` are elements such that
946
+ //! ``x`` precedes ``y``, and if the two elements are equivalent (neither ``x < y`` nor ``y < x``)
947
+ //! then a postcondition of stable sort is that ``x`` still precedes ``y``.
948
+ //! - The range ``[d_keys_out, d_keys_out + num_items)`` shall not overlap
949
+ //! ``[d_keys_in, d_keys_in + num_items)``,
950
+ //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor
951
+ //! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way.
952
+ //! - Segments are not required to be contiguous. For all index values ``i``
953
+ //! outside the specified segments ``d_keys_in[i]``, ``d_keys_out[i]`` will not
954
+ //! be accessed nor modified.
955
+ //!
956
+ //! Snippet
957
+ //! +++++++++++++++++++++++++++++++++++++++++++++
958
+ //!
959
+ //! The code snippet below illustrates the batched sorting of three segments
960
+ //! (with one zero-length segment) of ``int`` keys.
961
+ //!
962
+ //! .. code-block:: c++
963
+ //!
964
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_segmented_sort.cuh>
965
+ //!
966
+ //! // Declare, allocate, and initialize device-accessible pointers
967
+ //! // for sorting data
968
+ //! int num_items; // e.g., 7
969
+ //! int num_segments; // e.g., 3
970
+ //! int *d_offsets; // e.g., [0, 3, 3, 7]
971
+ //! int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
972
+ //! int *d_keys_out; // e.g., [-, -, -, -, -, -, -]
973
+ //! ...
974
+ //!
975
+ //! // Determine temporary device storage requirements
976
+ //! void *d_temp_storage = nullptr;
977
+ //! size_t temp_storage_bytes = 0;
978
+ //! cub::DeviceSegmentedSort::StableSortKeysDescending(
979
+ //! d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out,
980
+ //! num_items, num_segments, d_offsets, d_offsets + 1);
981
+ //!
982
+ //! // Allocate temporary storage
983
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
984
+ //!
985
+ //! // Run sorting operation
986
+ //! cub::DeviceSegmentedSort::StableSortKeysDescending(
987
+ //! d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out,
988
+ //! num_items, num_segments, d_offsets, d_offsets + 1);
989
+ //!
990
+ //! // d_keys_out <-- [8, 7, 6, 9, 5, 3, 0]
991
+ //!
992
+ //! @endrst
993
+ //!
994
+ //! @tparam KeyT
995
+ //! **[inferred]** Key type
996
+ //!
997
+ //! @tparam BeginOffsetIteratorT
998
+ //! **[inferred]** Random-access input iterator type for reading segment
999
+ //! beginning offsets @iterator
1000
+ //!
1001
+ //! @tparam EndOffsetIteratorT
1002
+ //! **[inferred]** Random-access input iterator type for reading segment
1003
+ //! ending offsets @iterator
1004
+ //!
1005
+ //! @param[in] d_temp_storage
1006
+ //! Device-accessible allocation of temporary storage. When nullptr, the
1007
+ //! required allocation size is written to `temp_storage_bytes` and no work
1008
+ //! is done.
1009
+ //!
1010
+ //! @param[in,out] temp_storage_bytes
1011
+ //! Reference to size in bytes of `d_temp_storage` allocation
1012
+ //!
1013
+ //! @param[in] d_keys_in
1014
+ //! Device-accessible pointer to the input data of key data to sort
1015
+ //!
1016
+ //! @param[out] d_keys_out
1017
+ //! Device-accessible pointer to the sorted output sequence of key data
1018
+ //!
1019
+ //! @param[in] num_items
1020
+ //! The total number of items to sort (across all segments)
1021
+ //!
1022
+ //! @param[in] num_segments
1023
+ //! The number of segments that comprise the sorting data
1024
+ //!
1025
+ //! @param[in] d_begin_offsets
1026
+ //! @rst
1027
+ //! Random-access input iterator to the sequence of beginning offsets of
1028
+ //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first
1029
+ //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and
1030
+ //! ``d_values_*``
1031
+ //! @endrst
1032
+ //!
1033
+ //! @param[in] d_end_offsets
1034
+ //! @rst
1035
+ //! Random-access input iterator to the sequence of ending offsets of length
1036
+ //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of
1037
+ //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``.
1038
+ //! If ``d_end_offsets[i] <= d_begin_offsets[i]``, the ``i``-th segment is
1039
+ //! considered empty.
1040
+ //! @endrst
1041
+ //!
1042
+ //! @param[in] stream
1043
+ //! @rst
1044
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
1045
+ //! @endrst
1046
+ template <typename KeyT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
1047
+ CUB_RUNTIME_FUNCTION static cudaError_t StableSortKeysDescending( // keys-only descending overload with separate const input and output buffers
1048
+ void* d_temp_storage,
1049
+ size_t& temp_storage_bytes,
1050
+ const KeyT* d_keys_in,
1051
+ KeyT* d_keys_out,
1052
+ ::cuda::std::int64_t num_items,
1053
+ ::cuda::std::int64_t num_segments,
1054
+ BeginOffsetIteratorT d_begin_offsets,
1055
+ EndOffsetIteratorT d_end_offsets,
1056
+ cudaStream_t stream = 0)
1057
+ {
1058
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); // NOTE(review): presumably opens an NVTX range only when d_temp_storage is non-null (skipping the size query) - confirm against the macro
1059
+ return SortKeysDescendingNoNVTX<KeyT, BeginOffsetIteratorT, EndOffsetIteratorT>( // pure forwarding: every argument is passed through unchanged to the helper
1060
+ d_temp_storage,
1061
+ temp_storage_bytes,
1062
+ d_keys_in,
1063
+ d_keys_out,
1064
+ num_items,
1065
+ num_segments,
1066
+ d_begin_offsets,
1067
+ d_end_offsets,
1068
+ stream);
1069
+ }
1070
+
1071
+ //! @rst
1072
+ //! Sorts segments of keys into ascending order.
1073
+ //! Approximately ``2 * num_segments`` auxiliary storage required.
1074
+ //!
1075
+ //! - The sorting operation is given a pair of key buffers managed by a
1076
+ //! DoubleBuffer structure that indicates which of the two buffers is
1077
+ //! "current" (and thus contains the input data to be sorted).
1078
+ //! - The contents of both buffers may be altered by the sorting operation.
1079
+ //! - Upon completion, the sorting operation will update the "current"
1080
+ //! indicator within the DoubleBuffer wrapper to reference which of the two
1081
+ //! buffers now contains the sorted output sequence (a function of the number
1082
+ //! of key bits and the targeted device architecture).
1083
+ //! - When the input is a contiguous sequence of segments, a single sequence
1084
+ //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased
1085
+ //! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where
1086
+ //! the latter is specified as ``segment_offsets + 1``).
1087
+ //! - StableSortKeys is stable: it preserves the relative ordering of
1088
+ //! equivalent elements. That is, if ``x`` and ``y`` are elements such that
1089
+ //! ``x`` precedes ``y``, and if the two elements are equivalent (neither
1090
+ //! ``x < y`` nor ``y < x``) then a postcondition of stable sort is that
1091
+ //! ``x`` still precedes ``y``.
1092
+ //! - Let ``cur = d_keys.Current()`` and ``alt = d_keys.Alternate()``.
1093
+ //! The range ``[cur, cur + num_items)`` shall not overlap
1094
+ //! ``[alt, alt + num_items)``. Both ranges shall not overlap
1095
+ //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor
1096
+ //! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way.
1097
+ //! - Segments are not required to be contiguous. For all index values ``i``
1098
+ //! outside the specified segments ``d_keys.Current()[i]``,
1099
+ //! ``d_keys.Alternate()[i]`` will not be accessed nor modified.
1100
+ //!
1101
+ //! Snippet
1102
+ //! +++++++++++++++++++++++++++++++++++++++++++++
1103
+ //!
1104
+ //! The code snippet below illustrates the batched sorting of three segments
1105
+ //! (with one zero-length segment) of ``int`` keys.
1106
+ //!
1107
+ //! .. code-block:: c++
1108
+ //!
1109
+ //! #include <cub/cub.cuh>
1110
+ //! // or equivalently <cub/device/device_segmented_sort.cuh>
1111
+ //!
1112
+ //! // Declare, allocate, and initialize device-accessible pointers
1113
+ //! // for sorting data
1114
+ //! int num_items; // e.g., 7
1115
+ //! int num_segments; // e.g., 3
1116
+ //! int *d_offsets; // e.g., [0, 3, 3, 7]
1117
+ //! int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9]
1118
+ //! int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -]
1119
+ //! ...
1120
+ //!
1121
+ //! // Create a DoubleBuffer to wrap the pair of device pointers
1122
+ //! cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
1123
+ //!
1124
+ //! // Determine temporary device storage requirements
1125
+ //! void *d_temp_storage = nullptr;
1126
+ //! size_t temp_storage_bytes = 0;
1127
+ //! cub::DeviceSegmentedSort::StableSortKeys(
1128
+ //! d_temp_storage, temp_storage_bytes, d_keys,
1129
+ //! num_items, num_segments, d_offsets, d_offsets + 1);
1130
+ //!
1131
+ //! // Allocate temporary storage
1132
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
1133
+ //!
1134
+ //! // Run sorting operation
1135
+ //! cub::DeviceSegmentedSort::StableSortKeys(
1136
+ //! d_temp_storage, temp_storage_bytes, d_keys,
1137
+ //! num_items, num_segments, d_offsets, d_offsets + 1);
1138
+ //!
1139
+ //! // d_keys.Current() <-- [6, 7, 8, 0, 3, 5, 9]
1140
+ //!
1141
+ //! @endrst
1142
+ //!
1143
+ //! @tparam KeyT
1144
+ //! **[inferred]** Key type
1145
+ //!
1146
+ //! @tparam BeginOffsetIteratorT
1147
+ //! **[inferred]** Random-access input iterator type for reading segment
1148
+ //! beginning offsets @iterator
1149
+ //!
1150
+ //! @tparam EndOffsetIteratorT
1151
+ //! **[inferred]** Random-access input iterator type for reading segment
1152
+ //! ending offsets @iterator
1153
+ //!
1154
+ //! @param[in] d_temp_storage
1155
+ //! Device-accessible allocation of temporary storage. When nullptr, the
1156
+ //! required allocation size is written to `temp_storage_bytes` and no work
1157
+ //! is done
1158
+ //!
1159
+ //! @param[in,out] temp_storage_bytes
1160
+ //! Reference to size in bytes of `d_temp_storage` allocation
1161
+ //!
1162
+ //! @param[in,out] d_keys
1163
+ //! Reference to the double-buffer of keys whose "current" device-accessible
1164
+ //! buffer contains the unsorted input keys and, upon return, is updated to
1165
+ //! point to the sorted output keys
1166
+ //!
1167
+ //! @param[in] num_items
1168
+ //! The total number of items to sort (across all segments)
1169
+ //!
1170
+ //! @param[in] num_segments
1171
+ //! The number of segments that comprise the sorting data
1172
+ //!
1173
+ //! @param[in] d_begin_offsets
1174
+ //! @rst
1175
+ //! Random-access input iterator to the sequence of beginning offsets of
1176
+ //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first
1177
+ //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``
1178
+ //! @endrst
1179
+ //!
1180
+ //! @param[in] d_end_offsets
1181
+ //! @rst
1182
+ //! Random-access input iterator to the sequence of ending offsets of length
1183
+ //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of
1184
+ //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``.
1185
+ //! If ``d_end_offsets[i] <= d_begin_offsets[i]``, the ``i``-th segment is
1186
+ //! considered empty.
1187
+ //! @endrst
1188
+ //!
1189
+ //! @param[in] stream
1190
+ //! @rst
1191
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
1192
+ //! @endrst
1193
+ template <typename KeyT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
1194
+ CUB_RUNTIME_FUNCTION static cudaError_t StableSortKeys( // keys-only overload taking a DoubleBuffer by non-const reference
1195
+ void* d_temp_storage,
1196
+ size_t& temp_storage_bytes,
1197
+ DoubleBuffer<KeyT>& d_keys,
1198
+ ::cuda::std::int64_t num_items,
1199
+ ::cuda::std::int64_t num_segments,
1200
+ BeginOffsetIteratorT d_begin_offsets,
1201
+ EndOffsetIteratorT d_end_offsets,
1202
+ cudaStream_t stream = 0)
1203
+ {
1204
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); // NOTE(review): presumably opens an NVTX range only when d_temp_storage is non-null (skipping the size query) - confirm against the macro
1205
+ return SortKeysNoNVTX<KeyT, BeginOffsetIteratorT, EndOffsetIteratorT>( // pure forwarding: d_keys is handed on by reference, all other arguments unchanged
1206
+ d_temp_storage, temp_storage_bytes, d_keys, num_items, num_segments, d_begin_offsets, d_end_offsets, stream);
1207
+ }
1208
+
1209
+ //! @rst
1210
+ //! Sorts segments of keys into descending order.
1211
+ //! Approximately ``2 * num_segments`` auxiliary storage required.
1212
+ //!
1213
+ //! - The sorting operation is given a pair of key buffers managed by a
1214
+ //! DoubleBuffer structure that indicates which of the two buffers is
1215
+ //! "current" (and thus contains the input data to be sorted).
1216
+ //! - The contents of both buffers may be altered by the sorting operation.
1217
+ //! - Upon completion, the sorting operation will update the "current"
1218
+ //! indicator within the DoubleBuffer wrapper to reference which of the two
1219
+ //! buffers now contains the sorted output sequence (a function of the number
1220
+ //! of key bits and the targeted device architecture).
1221
+ //! - When the input is a contiguous sequence of segments, a single sequence
1222
+ //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased
1223
+ //! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where
1224
+ //! the latter is specified as ``segment_offsets + 1``).
1225
+ //! - StableSortKeysDescending is stable: it preserves the relative ordering of
1226
+ //! equivalent elements. That is, if ``x`` and ``y`` are elements such that
1227
+ //! ``x`` precedes ``y``, and if the two elements are equivalent (neither
1228
+ //! ``x < y`` nor ``y < x``) then a postcondition of stable sort is that
1229
+ //! ``x`` still precedes ``y``.
1230
+ //! - Let ``cur = d_keys.Current()`` and ``alt = d_keys.Alternate()``.
1231
+ //! The range ``[cur, cur + num_items)`` shall not overlap
1232
+ //! ``[alt, alt + num_items)``. Both ranges shall not overlap
1233
+ //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor
1234
+ //! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way.
1235
+ //! - Segments are not required to be contiguous. For all index values ``i``
1236
+ //! outside the specified segments ``d_keys.Current()[i]``,
1237
+ //! ``d_keys.Alternate()[i]`` will not be accessed nor modified.
1238
+ //!
1239
+ //! Snippet
1240
+ //! +++++++++++++++++++++++++++++++++++++++++++++
1241
+ //!
1242
+ //! The code snippet below illustrates the batched sorting of three segments
1243
+ //! (with one zero-length segment) of ``int`` keys.
1244
+ //!
1245
+ //! .. code-block:: c++
1246
+ //!
1247
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_segmented_sort.cuh>
1248
+ //!
1249
+ //! // Declare, allocate, and initialize device-accessible pointers
1250
+ //! // for sorting data
1251
+ //! int num_items; // e.g., 7
1252
+ //! int num_segments; // e.g., 3
1253
+ //! int *d_offsets; // e.g., [0, 3, 3, 7]
1254
+ //! int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9]
1255
+ //! int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -]
1256
+ //! ...
1257
+ //!
1258
+ //! // Create a DoubleBuffer to wrap the pair of device pointers
1259
+ //! cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
1260
+ //!
1261
+ //! // Determine temporary device storage requirements
1262
+ //! void *d_temp_storage = nullptr;
1263
+ //! size_t temp_storage_bytes = 0;
1264
+ //! cub::DeviceSegmentedSort::StableSortKeysDescending(
1265
+ //! d_temp_storage, temp_storage_bytes, d_keys,
1266
+ //! num_items, num_segments, d_offsets, d_offsets + 1);
1267
+ //!
1268
+ //! // Allocate temporary storage
1269
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
1270
+ //!
1271
+ //! // Run sorting operation
1272
+ //! cub::DeviceSegmentedSort::StableSortKeysDescending(
1273
+ //! d_temp_storage, temp_storage_bytes, d_keys,
1274
+ //! num_items, num_segments, d_offsets, d_offsets + 1);
1275
+ //!
1276
+ //! // d_keys.Current() <-- [8, 7, 6, 9, 5, 3, 0]
1277
+ //!
1278
+ //! @endrst
1279
+ //!
1280
+ //! @tparam KeyT
1281
+ //! **[inferred]** Key type
1282
+ //!
1283
+ //! @tparam BeginOffsetIteratorT
1284
+ //! **[inferred]** Random-access input iterator type for reading segment
1285
+ //! beginning offsets @iterator
1286
+ //!
1287
+ //! @tparam EndOffsetIteratorT
1288
+ //! **[inferred]** Random-access input iterator type for reading segment
1289
+ //! ending offsets @iterator
1290
+ //!
1291
+ //! @param[in] d_temp_storage
1292
+ //! Device-accessible allocation of temporary storage. When nullptr, the
1293
+ //! required allocation size is written to `temp_storage_bytes` and no work
1294
+ //! is done.
1295
+ //!
1296
+ //! @param[in,out] temp_storage_bytes
1297
+ //! Reference to size in bytes of `d_temp_storage` allocation
1298
+ //!
1299
+ //! @param[in,out] d_keys
1300
+ //! Reference to the double-buffer of keys whose "current" device-accessible
1301
+ //! buffer contains the unsorted input keys and, upon return, is updated to
1302
+ //! point to the sorted output keys
1303
+ //!
1304
+ //! @param[in] num_items
1305
+ //! The total number of items to sort (across all segments)
1306
+ //!
1307
+ //! @param[in] num_segments
1308
+ //! The number of segments that comprise the sorting data
1309
+ //!
1310
+ //! @param[in] d_begin_offsets
1311
+ //! @rst
1312
+ //! Random-access input iterator to the sequence of beginning offsets of
1313
+ //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first
1314
+ //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``
1315
+ //! @endrst
1316
+ //!
1317
+ //! @param[in] d_end_offsets
1318
+ //! @rst
1319
+ //! Random-access input iterator to the sequence of ending offsets of length
1320
+ //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last
1321
+ //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and
1322
+ //! ``d_values_*``. If ``d_end_offsets[i] <= d_begin_offsets[i]``, the
1323
+ //! ``i``-th segment is considered empty.
1324
+ //! @endrst
1325
+ //!
1326
+ //! @param[in] stream
1327
+ //! @rst
1328
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
1329
+ //! @endrst
1330
+ template <typename KeyT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
1331
+ CUB_RUNTIME_FUNCTION static cudaError_t StableSortKeysDescending( // keys-only descending overload taking a DoubleBuffer by non-const reference
1332
+ void* d_temp_storage,
1333
+ size_t& temp_storage_bytes,
1334
+ DoubleBuffer<KeyT>& d_keys,
1335
+ ::cuda::std::int64_t num_items,
1336
+ ::cuda::std::int64_t num_segments,
1337
+ BeginOffsetIteratorT d_begin_offsets,
1338
+ EndOffsetIteratorT d_end_offsets,
1339
+ cudaStream_t stream = 0)
1340
+ {
1341
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); // NOTE(review): presumably opens an NVTX range only when d_temp_storage is non-null (skipping the size query) - confirm against the macro
1342
+ return SortKeysDescendingNoNVTX<KeyT, BeginOffsetIteratorT, EndOffsetIteratorT>( // pure forwarding: d_keys is handed on by reference, all other arguments unchanged
1343
+ d_temp_storage, temp_storage_bytes, d_keys, num_items, num_segments, d_begin_offsets, d_end_offsets, stream);
1344
+ }
1345
+
1346
+ private:
1347
+ // Internal ascending key-value sort without an NVTX range: wraps the in/out pointers in DoubleBuffers and returns the result of DispatchSegmentedSort::Dispatch
1348
+ template <typename KeyT, typename ValueT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
1349
+ CUB_RUNTIME_FUNCTION static cudaError_t SortPairsNoNVTX(
1350
+ void* d_temp_storage,
1351
+ size_t& temp_storage_bytes,
1352
+ const KeyT* d_keys_in,
1353
+ KeyT* d_keys_out,
1354
+ const ValueT* d_values_in,
1355
+ ValueT* d_values_out,
1356
+ ::cuda::std::int64_t num_items,
1357
+ ::cuda::std::int64_t num_segments,
1358
+ BeginOffsetIteratorT d_begin_offsets,
1359
+ EndOffsetIteratorT d_end_offsets,
1360
+ cudaStream_t stream = 0)
1361
+ {
1362
+ constexpr bool is_overwrite_okay = false; // forwarded to Dispatch below; NOTE(review): presumably forbids the dispatch layer from overwriting the const input buffers - confirm
1363
+
1364
+ using OffsetT = // NOTE(review): presumably a signed offset type derived from the value types of both offset iterators - confirm against detail::
1365
+ detail::choose_signed_offset_t<detail::common_iterator_value_t<BeginOffsetIteratorT, EndOffsetIteratorT>>;
1366
+ using DispatchT = // dispatch type fixed to SortOrder::Ascending for this helper
1367
+ DispatchSegmentedSort<SortOrder::Ascending, KeyT, ValueT, OffsetT, BeginOffsetIteratorT, EndOffsetIteratorT>;
1368
+
1369
+ DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out); // const is cast away only to build the DoubleBuffer from the input pointer
1370
+ DoubleBuffer<ValueT> d_values(const_cast<ValueT*>(d_values_in), d_values_out); // same wrapping for the value buffers
1371
+
1372
+ return DispatchT::Dispatch( // the cudaError_t returned by Dispatch is passed straight back to the caller
1373
+ d_temp_storage,
1374
+ temp_storage_bytes,
1375
+ d_keys,
1376
+ d_values,
1377
+ num_items,
1378
+ num_segments,
1379
+ d_begin_offsets,
1380
+ d_end_offsets,
1381
+ is_overwrite_okay,
1382
+ stream);
1383
+ }
1384
+
1385
+ public:
1386
+ //! @} end member group
1387
+ //! @name Key-value pairs
1388
+ //! @{
1389
+
1390
+ //! @rst
1391
+ //! Sorts segments of key-value pairs into ascending order.
1392
+ //! Approximately ``2 * num_items + 2 * num_segments`` auxiliary storage required.
1393
+ //!
1394
+ //! - The contents of the input data are not altered by the sorting operation.
1395
+ //! - When the input is a contiguous sequence of segments, a single sequence
1396
+ //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased
1397
+ //! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where
1398
+ //! the latter is specified as ``segment_offsets + 1``).
1399
+ //! - SortPairs is not guaranteed to be stable. That is, suppose that ``i`` and
1400
+ //! ``j`` are equivalent: neither one is less than the other. It is not
1401
+ //! guaranteed that the relative order of these two elements will be
1402
+ //! preserved by sort.
1403
+ //! - Let ``in`` be one of ``{d_keys_in, d_values_in}`` and ``out`` be any of
1404
+ //! ``{d_keys_out, d_values_out}``. The range ``[out, out + num_items)`` shall
1405
+ //! not overlap ``[in, in + num_items)``,
1406
+ //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor
1407
+ //! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way.
1408
+ //! - Segments are not required to be contiguous. For all index values ``i``
1409
+ //! outside the specified segments ``d_keys_in[i]``, ``d_values_in[i]``,
1410
+ //! ``d_keys_out[i]``, ``d_values_out[i]`` will not be accessed nor modified.
1411
+ //!
1412
+ //! Snippet
1413
+ //! +++++++++++++++++++++++++++++++++++++++++++++
1414
+ //!
1415
+ //! The code snippet below illustrates the batched sorting of three segments
1416
+ //! (with one zero-length segment) of ``int`` keys with associated vector of
1416
+ //! ``int`` values.
1418
+ //!
1419
+ //! .. code-block:: c++
1420
+ //!
1421
+ //! #include <cub/cub.cuh>
1422
+ //! // or equivalently <cub/device/device_segmented_sort.cuh>
1423
+ //!
1424
+ //! // Declare, allocate, and initialize device-accessible pointers
1425
+ //! // for sorting data
1426
+ //! int num_items; // e.g., 7
1427
+ //! int num_segments; // e.g., 3
1428
+ //! int *d_offsets; // e.g., [0, 3, 3, 7]
1429
+ //! int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
1430
+ //! int *d_keys_out; // e.g., [-, -, -, -, -, -, -]
1431
+ //! int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6]
1432
+ //! int *d_values_out; // e.g., [-, -, -, -, -, -, -]
1433
+ //! ...
1434
+ //!
1435
+ //! // Determine temporary device storage requirements
1436
+ //! void *d_temp_storage = nullptr;
1437
+ //! size_t temp_storage_bytes = 0;
1438
+ //! cub::DeviceSegmentedSort::SortPairs(
1439
+ //! d_temp_storage, temp_storage_bytes,
1440
+ //! d_keys_in, d_keys_out, d_values_in, d_values_out,
1441
+ //! num_items, num_segments, d_offsets, d_offsets + 1);
1442
+ //!
1443
+ //! // Allocate temporary storage
1444
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
1445
+ //!
1446
+ //! // Run sorting operation
1447
+ //! cub::DeviceSegmentedSort::SortPairs(
1448
+ //! d_temp_storage, temp_storage_bytes,
1449
+ //! d_keys_in, d_keys_out, d_values_in, d_values_out,
1450
+ //! num_items, num_segments, d_offsets, d_offsets + 1);
1451
+ //!
1452
+ //! // d_keys_out <-- [6, 7, 8, 0, 3, 5, 9]
1453
+ //! // d_values_out <-- [1, 2, 0, 5, 4, 3, 6]
1454
+ //!
1455
+ //! @endrst
1456
+ //!
1457
+ //! @tparam KeyT
1458
+ //! **[inferred]** Key type
1459
+ //!
1460
+ //! @tparam ValueT
1461
+ //! **[inferred]** Value type
1462
+ //!
1463
+ //! @tparam BeginOffsetIteratorT
1464
+ //! **[inferred]** Random-access input iterator type for reading segment
1465
+ //! beginning offsets @iterator
1466
+ //!
1467
+ //! @tparam EndOffsetIteratorT
1468
+ //! **[inferred]** Random-access input iterator type for reading segment
1469
+ //! ending offsets @iterator
1470
+ //!
1471
+ //! @param[in] d_temp_storage
1472
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
1473
+ //! required allocation size is written to `temp_storage_bytes` and no work
1474
+ //! is done
1475
+ //!
1476
+ //! @param[in,out] temp_storage_bytes
1477
+ //! Reference to size in bytes of `d_temp_storage` allocation
1478
+ //!
1479
+ //! @param[in] d_keys_in
1480
+ //! Device-accessible pointer to the input data of key data to sort
1481
+ //!
1482
+ //! @param[out] d_keys_out
1483
+ //! Device-accessible pointer to the sorted output sequence of key data
1484
+ //!
1485
+ //! @param[in] d_values_in
1486
+ //! Device-accessible pointer to the corresponding input sequence of
1487
+ //! associated value items
1488
+ //!
1489
+ //! @param[out] d_values_out
1490
+ //! Device-accessible pointer to the correspondingly-reordered output
1491
+ //! sequence of associated value items
1492
+ //!
1493
+ //! @param[in] num_items
1494
+ //! The total number of items to sort (across all segments)
1495
+ //!
1496
+ //! @param[in] num_segments
1497
+ //! The number of segments that comprise the sorting data
1498
+ //!
1499
+ //! @param[in] d_begin_offsets
1500
+ //! @rst
1501
+ //! Random-access input iterator to the sequence of beginning offsets of
1502
+ //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first
1503
+ //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``
1504
+ //! @endrst
1505
+ //!
1506
+ //! @param[in] d_end_offsets
1507
+ //! @rst
1508
+ //! Random-access input iterator to the sequence of ending offsets of length
1509
+ //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of
1510
+ //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``.
1511
+ //! If ``d_end_offsets[i] <= d_begin_offsets[i]``, the ``i``-th segment is
1512
+ //! considered empty.
1513
+ //! @endrst
1514
+ //!
1515
+ //! @param[in] stream
1516
+ //! @rst
1517
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
1518
+ //! @endrst
1519
+ template <typename KeyT, typename ValueT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
1520
+ CUB_RUNTIME_FUNCTION static cudaError_t SortPairs( // key-value overload with separate const input and output buffers
1521
+ void* d_temp_storage,
1522
+ size_t& temp_storage_bytes,
1523
+ const KeyT* d_keys_in,
1524
+ KeyT* d_keys_out,
1525
+ const ValueT* d_values_in,
1526
+ ValueT* d_values_out,
1527
+ ::cuda::std::int64_t num_items,
1528
+ ::cuda::std::int64_t num_segments,
1529
+ BeginOffsetIteratorT d_begin_offsets,
1530
+ EndOffsetIteratorT d_end_offsets,
1531
+ cudaStream_t stream = 0)
1532
+ {
1533
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); // NOTE(review): presumably opens an NVTX range only when d_temp_storage is non-null (skipping the size query) - confirm against the macro
1534
+ return SortPairsNoNVTX( // template arguments are deduced from the call; every argument is forwarded unchanged
1535
+ d_temp_storage,
1536
+ temp_storage_bytes,
1537
+ d_keys_in,
1538
+ d_keys_out,
1539
+ d_values_in,
1540
+ d_values_out,
1541
+ num_items,
1542
+ num_segments,
1543
+ d_begin_offsets,
1544
+ d_end_offsets,
1545
+ stream);
1546
+ }
1547
+
1548
+ private:
1549
+ // Internal descending key-value sort without an NVTX range: wraps the in/out pointers in DoubleBuffers and returns the result of DispatchSegmentedSort::Dispatch
1550
+ template <typename KeyT, typename ValueT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
1551
+ CUB_RUNTIME_FUNCTION static cudaError_t SortPairsDescendingNoNVTX(
1552
+ void* d_temp_storage,
1553
+ size_t& temp_storage_bytes,
1554
+ const KeyT* d_keys_in,
1555
+ KeyT* d_keys_out,
1556
+ const ValueT* d_values_in,
1557
+ ValueT* d_values_out,
1558
+ ::cuda::std::int64_t num_items,
1559
+ ::cuda::std::int64_t num_segments,
1560
+ BeginOffsetIteratorT d_begin_offsets,
1561
+ EndOffsetIteratorT d_end_offsets,
1562
+ cudaStream_t stream = 0)
1563
+ {
1564
+ constexpr bool is_overwrite_okay = false; // forwarded to Dispatch below; NOTE(review): presumably forbids the dispatch layer from overwriting the const input buffers - confirm
1565
+
1566
+ using OffsetT = // NOTE(review): presumably a signed offset type derived from the value types of both offset iterators - confirm against detail::
1567
+ detail::choose_signed_offset_t<detail::common_iterator_value_t<BeginOffsetIteratorT, EndOffsetIteratorT>>;
1568
+ using DispatchT = // dispatch type fixed to SortOrder::Descending for this helper
1569
+ DispatchSegmentedSort<SortOrder::Descending, KeyT, ValueT, OffsetT, BeginOffsetIteratorT, EndOffsetIteratorT>;
1570
+
1571
+ DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out); // const is cast away only to build the DoubleBuffer from the input pointer
1572
+ DoubleBuffer<ValueT> d_values(const_cast<ValueT*>(d_values_in), d_values_out); // same wrapping for the value buffers
1573
+
1574
+ return DispatchT::Dispatch( // the cudaError_t returned by Dispatch is passed straight back to the caller
1575
+ d_temp_storage,
1576
+ temp_storage_bytes,
1577
+ d_keys,
1578
+ d_values,
1579
+ num_items,
1580
+ num_segments,
1581
+ d_begin_offsets,
1582
+ d_end_offsets,
1583
+ is_overwrite_okay,
1584
+ stream);
1585
+ }
1586
+
1587
+ public:
1588
+ //! @rst
1589
+ //! Sorts segments of key-value pairs into descending order.
1590
+ //! Approximately ``2 * num_items + 2 * num_segments`` auxiliary storage required.
1591
+ //!
1592
+ //! - The contents of the input data are not altered by the sorting operation.
1593
+ //! - When the input is a contiguous sequence of segments, a single sequence
1594
+ //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased
1595
+ //! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where
1596
+ //! the latter is specified as ``segment_offsets + 1``).
1597
+ //! - SortPairsDescending is not guaranteed to be stable. That is, suppose that ``i`` and
1598
+ //! ``j`` are equivalent: neither one is less than the other. It is not
1599
+ //! guaranteed that the relative order of these two elements will be
1600
+ //! preserved by sort.
1601
+ //! - Let ``in`` be one of ``{d_keys_in, d_values_in}`` and ``out`` be any of
1602
+ //! ``{d_keys_out, d_values_out}``. The range ``[out, out + num_items)`` shall
1603
+ //! not overlap ``[in, in + num_items)``,
1604
+ //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor
1605
+ //! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way.
1606
+ //! - Segments are not required to be contiguous. For all index values ``i``
1607
+ //! outside the specified segments ``d_keys_in[i]``, ``d_values_in[i]``,
1608
+ //! ``d_keys_out[i]``, ``d_values_out[i]`` will not be accessed nor modified.
1609
+ //!
1610
+ //! Snippet
1611
+ //! +++++++++++++++++++++++++++++++++++++++++++++
1612
+ //!
1613
+ //! The code snippet below illustrates the batched sorting of three segments
1614
+ //! (with one zero-length segment) of ``int`` keys with associated vector of
1614
+ //! ``int`` values.
1616
+ //!
1617
+ //! .. code-block:: c++
1618
+ //!
1619
+ //! #include <cub/cub.cuh>
1620
+ //! // or equivalently <cub/device/device_segmented_sort.cuh>
1621
+ //!
1622
+ //! // Declare, allocate, and initialize device-accessible pointers for
1623
+ //! // sorting data
1624
+ //! int num_items; // e.g., 7
1625
+ //! int num_segments; // e.g., 3
1626
+ //! int *d_offsets; // e.g., [0, 3, 3, 7]
1627
+ //! int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
1628
+ //! int *d_keys_out; // e.g., [-, -, -, -, -, -, -]
1629
+ //! int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6]
1630
+ //! int *d_values_out; // e.g., [-, -, -, -, -, -, -]
1631
+ //! ...
1632
+ //!
1633
+ //! // Determine temporary device storage requirements
1634
+ //! void *d_temp_storage = nullptr;
1635
+ //! size_t temp_storage_bytes = 0;
1636
+ //! cub::DeviceSegmentedSort::SortPairsDescending(
1637
+ //! d_temp_storage, temp_storage_bytes,
1638
+ //! d_keys_in, d_keys_out, d_values_in, d_values_out,
1639
+ //! num_items, num_segments, d_offsets, d_offsets + 1);
1640
+ //!
1641
+ //! // Allocate temporary storage
1642
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
1643
+ //!
1644
+ //! // Run sorting operation
1645
+ //! cub::DeviceSegmentedSort::SortPairsDescending(
1646
+ //! d_temp_storage, temp_storage_bytes,
1647
+ //! d_keys_in, d_keys_out, d_values_in, d_values_out,
1648
+ //! num_items, num_segments, d_offsets, d_offsets + 1);
1649
+ //!
1650
+ //! // d_keys_out <-- [8, 7, 6, 9, 5, 3, 0]
1651
+ //! // d_values_out <-- [0, 2, 1, 6, 3, 4, 5]
1652
+ //!
1653
+ //! @endrst
1654
+ //!
1655
+ //! @tparam KeyT
1656
+ //! **[inferred]** Key type
1657
+ //!
1658
+ //! @tparam ValueT
1659
+ //! **[inferred]** Value type
1660
+ //!
1661
+ //! @tparam BeginOffsetIteratorT
1662
+ //! **[inferred]** Random-access input iterator type for reading segment
1663
+ //! beginning offsets @iterator
1664
+ //!
1665
+ //! @tparam EndOffsetIteratorT
1666
+ //! **[inferred]** Random-access input iterator type for reading segment
1667
+ //! ending offsets @iterator
1668
+ //!
1669
+ //! @param[in] d_temp_storage
1670
+ //! Device-accessible allocation of temporary storage. When nullptr, the
1671
+ //! required allocation size is written to `temp_storage_bytes` and no work
1672
+ //! is done.
1673
+ //!
1674
+ //! @param[in,out] temp_storage_bytes
1675
+ //! Reference to size in bytes of `d_temp_storage` allocation
1676
+ //!
1677
+ //! @param[in] d_keys_in
1678
+ //! Device-accessible pointer to the input sequence of key data to sort
1679
+ //!
1680
+ //! @param[out] d_keys_out
1681
+ //! Device-accessible pointer to the sorted output sequence of key data
1682
+ //!
1683
+ //! @param[in] d_values_in
1684
+ //! Device-accessible pointer to the corresponding input sequence of
1685
+ //! associated value items
1686
+ //!
1687
+ //! @param[out] d_values_out
1688
+ //! Device-accessible pointer to the correspondingly-reordered output
1689
+ //! sequence of associated value items
1690
+ //!
1691
+ //! @param[in] num_items
1692
+ //! The total number of items to sort (across all segments)
1693
+ //!
1694
+ //! @param[in] num_segments
1695
+ //! The number of segments that comprise the sorting data
1696
+ //!
1697
+ //! @param[in] d_begin_offsets
1698
+ //! @rst
1699
+ //! Random-access input iterator to the sequence of beginning offsets of
1700
+ //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first
1701
+ //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``
1702
+ //! @endrst
1703
+ //!
1704
+ //! @param[in] d_end_offsets
1705
+ //! @rst
1706
+ //! Random-access input iterator to the sequence of ending offsets of length
1707
+ //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of
1708
+ //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``.
1709
+ //! If ``d_end_offsets[i] - 1 < d_begin_offsets[i]``, the i-th segment is
1710
+ //! considered empty.
1711
+ //! @endrst
1712
+ //!
1713
+ //! @param[in] stream
1714
+ //! @rst
1715
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
1716
+ //! @endrst
1717
+ template <typename KeyT, typename ValueT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
1718
+ CUB_RUNTIME_FUNCTION static cudaError_t SortPairsDescending(
1719
+ void* d_temp_storage,
1720
+ size_t& temp_storage_bytes,
1721
+ const KeyT* d_keys_in,
1722
+ KeyT* d_keys_out,
1723
+ const ValueT* d_values_in,
1724
+ ValueT* d_values_out,
1725
+ ::cuda::std::int64_t num_items,
1726
+ ::cuda::std::int64_t num_segments,
1727
+ BeginOffsetIteratorT d_begin_offsets,
1728
+ EndOffsetIteratorT d_end_offsets,
1729
+ cudaStream_t stream = 0)
1730
+ {
1731
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());
1732
+ return SortPairsDescendingNoNVTX(
1733
+ d_temp_storage,
1734
+ temp_storage_bytes,
1735
+ d_keys_in,
1736
+ d_keys_out,
1737
+ d_values_in,
1738
+ d_values_out,
1739
+ num_items,
1740
+ num_segments,
1741
+ d_begin_offsets,
1742
+ d_end_offsets,
1743
+ stream);
1744
+ }
1745
+
1746
+ private:
1747
+ // Internal version without NVTX range
1748
+ template <typename KeyT, typename ValueT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
1749
+ CUB_RUNTIME_FUNCTION static cudaError_t SortPairsNoNVTX(
1750
+ void* d_temp_storage,
1751
+ size_t& temp_storage_bytes,
1752
+ DoubleBuffer<KeyT>& d_keys,
1753
+ DoubleBuffer<ValueT>& d_values,
1754
+ ::cuda::std::int64_t num_items,
1755
+ ::cuda::std::int64_t num_segments,
1756
+ BeginOffsetIteratorT d_begin_offsets,
1757
+ EndOffsetIteratorT d_end_offsets,
1758
+ cudaStream_t stream = 0)
1759
+ {
1760
+ constexpr bool is_overwrite_okay = true;
1761
+
1762
+ using OffsetT =
1763
+ detail::choose_signed_offset_t<detail::common_iterator_value_t<BeginOffsetIteratorT, EndOffsetIteratorT>>;
1764
+ using DispatchT =
1765
+ DispatchSegmentedSort<SortOrder::Ascending, KeyT, ValueT, OffsetT, BeginOffsetIteratorT, EndOffsetIteratorT>;
1766
+
1767
+ return DispatchT::Dispatch(
1768
+ d_temp_storage,
1769
+ temp_storage_bytes,
1770
+ d_keys,
1771
+ d_values,
1772
+ num_items,
1773
+ num_segments,
1774
+ d_begin_offsets,
1775
+ d_end_offsets,
1776
+ is_overwrite_okay,
1777
+ stream);
1778
+ }
1779
+
1780
+ public:
1781
+ //! @rst
1782
+ //! Sorts segments of key-value pairs into ascending order.
1783
+ //! Approximately ``2 * num_segments`` auxiliary storage required.
1784
+ //!
1785
+ //! - The sorting operation is given a pair of key buffers and a corresponding
1786
+ //! pair of associated value buffers. Each pair is managed by a DoubleBuffer
1787
+ //! structure that indicates which of the two buffers is "current" (and thus
1788
+ //! contains the input data to be sorted).
1789
+ //! - The contents of both buffers within each pair may be altered by the sorting
1790
+ //! operation.
1791
+ //! - Upon completion, the sorting operation will update the "current" indicator
1792
+ //! within each DoubleBuffer wrapper to reference which of the two buffers
1793
+ //! now contains the sorted output sequence (a function of the number of key bits
1794
+ //! specified and the targeted device architecture).
1795
+ //! - When the input is a contiguous sequence of segments, a single sequence
1796
+ //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased
1797
+ //! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where
1798
+ //! the latter is specified as ``segment_offsets + 1``).
1799
+ //! - SortPairs is not guaranteed to be stable. That is, suppose that ``i`` and
1800
+ //! ``j`` are equivalent: neither one is less than the other. It is not
1801
+ //! guaranteed that the relative order of these two elements will be
1802
+ //! preserved by sort.
1803
+ //! - Let ``cur`` be one of ``{d_keys.Current(), d_values.Current()}`` and ``alt``
1804
+ //! be any of ``{d_keys.Alternate(), d_values.Alternate()}``. The range
1805
+ //! ``[cur, cur + num_items)`` shall not overlap
1806
+ //! ``[alt, alt + num_items)``. Both ranges shall not overlap
1807
+ //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor
1808
+ //! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way.
1809
+ //! - Segments are not required to be contiguous. For all index values ``i``
1810
+ //! outside the specified segments ``d_keys.Current()[i]``,
1811
+ //! ``d_values.Current()[i]``, ``d_keys.Alternate()[i]``,
1812
+ //! ``d_values.Alternate()[i]`` will not be accessed nor modified.
1813
+ //!
1814
+ //! Snippet
1815
+ //! +++++++++++++++++++++++++++++++++++++++++++++
1816
+ //!
1817
+ //! The code snippet below illustrates the batched sorting of three segments
1818
+ //! (with one zero-length segment) of ``int`` keys with associated vector of
1819
+ //! ``int`` values.
1820
+ //!
1821
+ //! .. code-block:: c++
1822
+ //!
1823
+ //! #include <cub/cub.cuh>
1824
+ //! // or equivalently <cub/device/device_segmented_sort.cuh>
1825
+ //!
1826
+ //! // Declare, allocate, and initialize device-accessible pointers
1827
+ //! // for sorting data
1828
+ //! int num_items; // e.g., 7
1829
+ //! int num_segments; // e.g., 3
1830
+ //! int *d_offsets; // e.g., [0, 3, 3, 7]
1831
+ //! int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9]
1832
+ //! int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -]
1833
+ //! int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6]
1834
+ //! int *d_value_alt_buf; // e.g., [-, -, -, -, -, -, -]
1835
+ //! ...
1836
+ //!
1837
+ //! // Create a set of DoubleBuffers to wrap pairs of device pointers
1838
+ //! cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
1839
+ //! cub::DoubleBuffer<int> d_values(d_value_buf, d_value_alt_buf);
1840
+ //!
1841
+ //! // Determine temporary device storage requirements
1842
+ //! void *d_temp_storage = nullptr;
1843
+ //! size_t temp_storage_bytes = 0;
1844
+ //! cub::DeviceSegmentedSort::SortPairs(
1845
+ //! d_temp_storage, temp_storage_bytes, d_keys, d_values,
1846
+ //! num_items, num_segments, d_offsets, d_offsets + 1);
1847
+ //!
1848
+ //! // Allocate temporary storage
1849
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
1850
+ //!
1851
+ //! // Run sorting operation
1852
+ //! cub::DeviceSegmentedSort::SortPairs(
1853
+ //! d_temp_storage, temp_storage_bytes, d_keys, d_values,
1854
+ //! num_items, num_segments, d_offsets, d_offsets + 1);
1855
+ //!
1856
+ //! // d_keys.Current() <-- [6, 7, 8, 0, 3, 5, 9]
1857
+ //! // d_values.Current() <-- [1, 2, 0, 5, 4, 3, 6]
1858
+ //!
1859
+ //! @endrst
1860
+ //!
1861
+ //! @tparam KeyT
1862
+ //! **[inferred]** Key type
1863
+ //!
1864
+ //! @tparam ValueT
1865
+ //! **[inferred]** Value type
1866
+ //!
1867
+ //! @tparam BeginOffsetIteratorT
1868
+ //! **[inferred]** Random-access input iterator type for reading segment
1869
+ //! beginning offsets @iterator
1870
+ //!
1871
+ //! @tparam EndOffsetIteratorT
1872
+ //! **[inferred]** Random-access input iterator type for reading segment
1873
+ //! ending offsets @iterator
1874
+ //!
1875
+ //! @param[in] d_temp_storage
1876
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
1877
+ //! required allocation size is written to `temp_storage_bytes` and no work
1878
+ //! is done.
1879
+ //!
1880
+ //! @param[in,out] temp_storage_bytes
1881
+ //! Reference to size in bytes of `d_temp_storage` allocation
1882
+ //!
1883
+ //! @param[in,out] d_keys
1884
+ //! Reference to the double-buffer of keys whose "current" device-accessible
1885
+ //! buffer contains the unsorted input keys and, upon return, is updated to
1886
+ //! point to the sorted output keys
1887
+ //!
1888
+ //! @param[in,out] d_values
1889
+ //! Double-buffer of values whose "current" device-accessible buffer contains
1890
+ //! the unsorted input values and, upon return, is updated to point to the
1891
+ //! sorted output values
1892
+ //!
1893
+ //! @param[in] num_items
1894
+ //! The total number of items to sort (across all segments)
1895
+ //!
1896
+ //! @param[in] num_segments
1897
+ //! The number of segments that comprise the sorting data
1898
+ //!
1899
+ //! @param[in] d_begin_offsets
1900
+ //! @rst
1901
+ //! Random-access input iterator to the sequence of beginning offsets of
1902
+ //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first
1903
+ //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``
1904
+ //! @endrst
1905
+ //!
1906
+ //! @param[in] d_end_offsets
1907
+ //! @rst
1908
+ //! Random-access input iterator to the sequence of ending offsets of length
1909
+ //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of
1910
+ //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``.
1911
+ //! If ``d_end_offsets[i] - 1 < d_begin_offsets[i]``, the i-th segment is
1912
+ //! considered empty.
1913
+ //! @endrst
1914
+ //!
1915
+ //! @param[in] stream
1916
+ //! @rst
1917
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
1918
+ //! @endrst
1919
+ template <typename KeyT, typename ValueT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
1920
+ CUB_RUNTIME_FUNCTION static cudaError_t SortPairs(
1921
+ void* d_temp_storage,
1922
+ size_t& temp_storage_bytes,
1923
+ DoubleBuffer<KeyT>& d_keys,
1924
+ DoubleBuffer<ValueT>& d_values,
1925
+ ::cuda::std::int64_t num_items,
1926
+ ::cuda::std::int64_t num_segments,
1927
+ BeginOffsetIteratorT d_begin_offsets,
1928
+ EndOffsetIteratorT d_end_offsets,
1929
+ cudaStream_t stream = 0)
1930
+ {
1931
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());
1932
+ return SortPairsNoNVTX(
1933
+ d_temp_storage,
1934
+ temp_storage_bytes,
1935
+ d_keys,
1936
+ d_values,
1937
+ num_items,
1938
+ num_segments,
1939
+ d_begin_offsets,
1940
+ d_end_offsets,
1941
+ stream);
1942
+ }
1943
+
1944
+ private:
1945
+ // Internal version without NVTX range
1946
+ template <typename KeyT, typename ValueT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
1947
+ CUB_RUNTIME_FUNCTION static cudaError_t SortPairsDescendingNoNVTX(
1948
+ void* d_temp_storage,
1949
+ size_t& temp_storage_bytes,
1950
+ DoubleBuffer<KeyT>& d_keys,
1951
+ DoubleBuffer<ValueT>& d_values,
1952
+ ::cuda::std::int64_t num_items,
1953
+ ::cuda::std::int64_t num_segments,
1954
+ BeginOffsetIteratorT d_begin_offsets,
1955
+ EndOffsetIteratorT d_end_offsets,
1956
+ cudaStream_t stream = 0)
1957
+ {
1958
+ constexpr bool is_overwrite_okay = true;
1959
+
1960
+ using OffsetT =
1961
+ detail::choose_signed_offset_t<detail::common_iterator_value_t<BeginOffsetIteratorT, EndOffsetIteratorT>>;
1962
+ using DispatchT =
1963
+ DispatchSegmentedSort<SortOrder::Descending, KeyT, ValueT, OffsetT, BeginOffsetIteratorT, EndOffsetIteratorT>;
1964
+
1965
+ return DispatchT::Dispatch(
1966
+ d_temp_storage,
1967
+ temp_storage_bytes,
1968
+ d_keys,
1969
+ d_values,
1970
+ num_items,
1971
+ num_segments,
1972
+ d_begin_offsets,
1973
+ d_end_offsets,
1974
+ is_overwrite_okay,
1975
+ stream);
1976
+ }
1977
+
1978
+ public:
1979
+ //! @rst
1980
+ //! Sorts segments of key-value pairs into descending order.
1981
+ //! Approximately ``2 * num_segments`` auxiliary storage required.
1982
+ //!
1983
+ //! - The sorting operation is given a pair of key buffers and a corresponding
1984
+ //! pair of associated value buffers. Each pair is managed by a DoubleBuffer
1985
+ //! structure that indicates which of the two buffers is "current" (and thus
1986
+ //! contains the input data to be sorted).
1987
+ //! - The contents of both buffers within each pair may be altered by the
1988
+ //! sorting operation.
1989
+ //! - Upon completion, the sorting operation will update the "current"
1990
+ //! indicator within each DoubleBuffer wrapper to reference which of the two
1991
+ //! buffers now contains the sorted output sequence (a function of the number
1992
+ //! of key bits specified and the targeted device architecture).
1993
+ //! - When the input is a contiguous sequence of segments, a single sequence
1994
+ //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased
1995
+ //! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where
1996
+ //! the latter is specified as ``segment_offsets + 1``).
1997
+ //! - SortPairsDescending is not guaranteed to be stable. That is, suppose that
1998
+ //! ``i`` and ``j`` are equivalent: neither one is less than the other. It is
1999
+ //! not guaranteed that the relative order of these two elements will be
2000
+ //! preserved by sort.
2001
+ //! - Let ``cur`` be one of ``{d_keys.Current(), d_values.Current()}`` and ``alt``
2002
+ //! be any of ``{d_keys.Alternate(), d_values.Alternate()}``. The range
2003
+ //! ``[cur, cur + num_items)`` shall not overlap
2004
+ //! ``[alt, alt + num_items)``. Both ranges shall not overlap
2005
+ //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor
2006
+ //! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way.
2007
+ //! - Segments are not required to be contiguous. For all index values ``i``
2008
+ //! outside the specified segments ``d_keys.Current()[i]``,
2009
+ //! ``d_values.Current()[i]``, ``d_keys.Alternate()[i]``,
2010
+ //! ``d_values.Alternate()[i]`` will not be accessed nor modified.
2011
+ //!
2012
+ //! Snippet
2013
+ //! +++++++++++++++++++++++++++++++++++++++++++++
2014
+ //!
2015
+ //! The code snippet below illustrates the batched sorting of three segments
2016
+ //! (with one zero-length segment) of ``int`` keys with associated vector of
2017
+ //! ``int`` values.
2018
+ //!
2019
+ //! .. code-block:: c++
2020
+ //!
2021
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_segmented_sort.cuh>
2022
+ //!
2023
+ //! // Declare, allocate, and initialize device-accessible pointers for
2024
+ //! // sorting data
2025
+ //! int num_items; // e.g., 7
2026
+ //! int num_segments; // e.g., 3
2027
+ //! int *d_offsets; // e.g., [0, 3, 3, 7]
2028
+ //! int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9]
2029
+ //! int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -]
2030
+ //! int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6]
2031
+ //! int *d_value_alt_buf; // e.g., [-, -, -, -, -, -, -]
2032
+ //! ...
2033
+ //!
2034
+ //! // Create a set of DoubleBuffers to wrap pairs of device pointers
2035
+ //! cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
2036
+ //! cub::DoubleBuffer<int> d_values(d_value_buf, d_value_alt_buf);
2037
+ //!
2038
+ //! // Determine temporary device storage requirements
2039
+ //! void *d_temp_storage = nullptr;
2040
+ //! size_t temp_storage_bytes = 0;
2041
+ //! cub::DeviceSegmentedSort::SortPairsDescending(
2042
+ //! d_temp_storage, temp_storage_bytes, d_keys, d_values,
2043
+ //! num_items, num_segments, d_offsets, d_offsets + 1);
2044
+ //!
2045
+ //! // Allocate temporary storage
2046
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
2047
+ //!
2048
+ //! // Run sorting operation
2049
+ //! cub::DeviceSegmentedSort::SortPairsDescending(
2050
+ //! d_temp_storage, temp_storage_bytes, d_keys, d_values,
2051
+ //! num_items, num_segments, d_offsets, d_offsets + 1);
2052
+ //!
2053
+ //! // d_keys.Current() <-- [8, 7, 6, 9, 5, 3, 0]
2054
+ //! // d_values.Current() <-- [0, 2, 1, 6, 3, 4, 5]
2055
+ //!
2056
+ //! @endrst
2057
+ //!
2058
+ //! @tparam KeyT
2059
+ //! **[inferred]** Key type
2060
+ //!
2061
+ //! @tparam ValueT
2062
+ //! **[inferred]** Value type
2063
+ //!
2064
+ //! @tparam BeginOffsetIteratorT
2065
+ //! **[inferred]** Random-access input iterator type for reading segment
2066
+ //! beginning offsets @iterator
2067
+ //!
2068
+ //! @tparam EndOffsetIteratorT
2069
+ //! **[inferred]** Random-access input iterator type for reading segment
2070
+ //! ending offsets @iterator
2071
+ //!
2072
+ //! @param[in] d_temp_storage
2073
+ //! Device-accessible allocation of temporary storage. When nullptr, the
2074
+ //! required allocation size is written to `temp_storage_bytes` and no work
2075
+ //! is done
2076
+ //!
2077
+ //! @param[in,out] temp_storage_bytes
2078
+ //! Reference to size in bytes of `d_temp_storage` allocation
2079
+ //!
2080
+ //! @param[in,out] d_keys
2081
+ //! Reference to the double-buffer of keys whose "current" device-accessible
2082
+ //! buffer contains the unsorted input keys and, upon return, is updated to
2083
+ //! point to the sorted output keys
2084
+ //!
2085
+ //! @param[in,out] d_values
2086
+ //! Double-buffer of values whose "current" device-accessible buffer contains
2087
+ //! the unsorted input values and, upon return, is updated to point to the
2088
+ //! sorted output values
2089
+ //!
2090
+ //! @param[in] num_items
2091
+ //! The total number of items to sort (across all segments)
2092
+ //!
2093
+ //! @param[in] num_segments
2094
+ //! The number of segments that comprise the sorting data
2095
+ //!
2096
+ //! @param[in] d_begin_offsets
2097
+ //! @rst
2098
+ //! Random-access input iterator to the sequence of beginning offsets of
2099
+ //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first
2100
+ //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``
2101
+ //! @endrst
2102
+ //!
2103
+ //! @param[in] d_end_offsets
2104
+ //! @rst
2105
+ //! Random-access input iterator to the sequence of ending offsets of length
2106
+ //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of
2107
+ //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``.
2108
+ //! If ``d_end_offsets[i] - 1 < d_begin_offsets[i]``, the ``i``-th segment is
2109
+ //! considered empty.
2110
+ //! @endrst
2111
+ //!
2112
+ //! @param[in] stream
2113
+ //! @rst
2114
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
2115
+ //! @endrst
2116
+ template <typename KeyT, typename ValueT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
2117
+ CUB_RUNTIME_FUNCTION static cudaError_t SortPairsDescending(
2118
+ void* d_temp_storage,
2119
+ size_t& temp_storage_bytes,
2120
+ DoubleBuffer<KeyT>& d_keys,
2121
+ DoubleBuffer<ValueT>& d_values,
2122
+ ::cuda::std::int64_t num_items,
2123
+ ::cuda::std::int64_t num_segments,
2124
+ BeginOffsetIteratorT d_begin_offsets,
2125
+ EndOffsetIteratorT d_end_offsets,
2126
+ cudaStream_t stream = 0)
2127
+ {
2128
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());
2129
+ return SortPairsDescendingNoNVTX(
2130
+ d_temp_storage,
2131
+ temp_storage_bytes,
2132
+ d_keys,
2133
+ d_values,
2134
+ num_items,
2135
+ num_segments,
2136
+ d_begin_offsets,
2137
+ d_end_offsets,
2138
+ stream);
2139
+ }
2140
+
2141
+ //! @rst
2142
+ //! Sorts segments of key-value pairs into ascending order.
2143
+ //! Approximately ``2 * num_items + 2 * num_segments`` auxiliary storage required.
2144
+ //!
2145
+ //! - The contents of the input data are not altered by the sorting operation.
2146
+ //! - When the input is a contiguous sequence of segments, a single sequence
2147
+ //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased
2148
+ //! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where
2149
+ //! the latter is specified as ``segment_offsets + 1``).
2150
+ //! - StableSortPairs is stable: it preserves the relative ordering of
2151
+ //! equivalent elements. That is, if ``x`` and ``y`` are elements such that
2152
+ //! ``x`` precedes ``y``, and if the two elements are equivalent (neither
2153
+ //! ``x < y`` nor ``y < x``) then a postcondition of stable sort is that
2154
+ //! ``x`` still precedes ``y``.
2155
+ //! - Let ``in`` be one of ``{d_keys_in, d_values_in}`` and ``out`` be any of
2156
+ //! ``{d_keys_out, d_values_out}``. The range ``[out, out + num_items)`` shall
2157
+ //! not overlap ``[in, in + num_items)``,
2158
+ //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor
2159
+ //! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way.
2160
+ //! - Segments are not required to be contiguous. For all index values ``i``
2161
+ //! outside the specified segments ``d_keys_in[i]``, ``d_values_in[i]``,
2162
+ //! ``d_keys_out[i]``, ``d_values_out[i]`` will not be accessed nor modified.
2163
+ //!
2164
+ //! Snippet
2165
+ //! +++++++++++++++++++++++++++++++++++++++++++++
2166
+ //!
2167
+ //! The code snippet below illustrates the batched sorting of three segments
2168
+ //! (with one zero-length segment) of ``int`` keys with associated vector of
2169
+ //! ``int`` values.
2170
+ //!
2171
+ //! .. code-block:: c++
2172
+ //!
2173
+ //! #include <cub/cub.cuh>
2174
+ //! // or equivalently <cub/device/device_segmented_sort.cuh>
2175
+ //!
2176
+ //! // Declare, allocate, and initialize device-accessible pointers
2177
+ //! // for sorting data
2178
+ //! int num_items; // e.g., 7
2179
+ //! int num_segments; // e.g., 3
2180
+ //! int *d_offsets; // e.g., [0, 3, 3, 7]
2181
+ //! int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
2182
+ //! int *d_keys_out; // e.g., [-, -, -, -, -, -, -]
2183
+ //! int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6]
2184
+ //! int *d_values_out; // e.g., [-, -, -, -, -, -, -]
2185
+ //! ...
2186
+ //!
2187
+ //! // Determine temporary device storage requirements
2188
+ //! void *d_temp_storage = nullptr;
2189
+ //! size_t temp_storage_bytes = 0;
2190
+ //! cub::DeviceSegmentedSort::StableSortPairs(
2191
+ //! d_temp_storage, temp_storage_bytes,
2192
+ //! d_keys_in, d_keys_out, d_values_in, d_values_out,
2193
+ //! num_items, num_segments, d_offsets, d_offsets + 1);
2194
+ //!
2195
+ //! // Allocate temporary storage
2196
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
2197
+ //!
2198
+ //! // Run sorting operation
2199
+ //! cub::DeviceSegmentedSort::StableSortPairs(
2200
+ //! d_temp_storage, temp_storage_bytes,
2201
+ //! d_keys_in, d_keys_out, d_values_in, d_values_out,
2202
+ //! num_items, num_segments, d_offsets, d_offsets + 1);
2203
+ //!
2204
+ //! // d_keys_out <-- [6, 7, 8, 0, 3, 5, 9]
2205
+ //! // d_values_out <-- [1, 2, 0, 5, 4, 3, 6]
2206
+ //!
2207
+ //! @endrst
2208
+ //!
2209
+ //! @tparam KeyT
2210
+ //! **[inferred]** Key type
2211
+ //!
2212
+ //! @tparam ValueT
2213
+ //! **[inferred]** Value type
2214
+ //!
2215
+ //! @tparam BeginOffsetIteratorT
2216
+ //! **[inferred]** Random-access input iterator type for reading segment
2217
+ //! beginning offsets @iterator
2218
+ //!
2219
+ //! @tparam EndOffsetIteratorT
2220
+ //! **[inferred]** Random-access input iterator type for reading segment
2221
+ //! ending offsets @iterator
2222
+ //!
2223
+ //! @param[in] d_temp_storage
2224
+ //! Device-accessible allocation of temporary storage. When nullptr, the
2225
+ //! required allocation size is written to `temp_storage_bytes` and no work is done.
2226
+ //!
2227
+ //! @param[in,out] temp_storage_bytes
2228
+ //! Reference to size in bytes of `d_temp_storage` allocation
2229
+ //!
2230
+ //! @param[in] d_keys_in
2231
+ //! Device-accessible pointer to the input sequence of key data to sort
2232
+ //!
2233
+ //! @param[out] d_keys_out
2234
+ //! Device-accessible pointer to the sorted output sequence of key data
2235
+ //!
2236
+ //! @param[in] d_values_in
2237
+ //! Device-accessible pointer to the corresponding input sequence of
2238
+ //! associated value items
2239
+ //!
2240
+ //! @param[out] d_values_out
2241
+ //! Device-accessible pointer to the correspondingly-reordered output
2242
+ //! sequence of associated value items
2243
+ //!
2244
+ //! @param[in] num_items
2245
+ //! The total number of items to sort (across all segments)
2246
+ //!
2247
+ //! @param[in] num_segments
2248
+ //! The number of segments that comprise the sorting data
2249
+ //!
2250
+ //! @param[in] d_begin_offsets
2251
+ //! @rst
2252
+ //! Random-access input iterator to the sequence of beginning offsets of
2253
+ //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first
2254
+ //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``
2255
+ //! @endrst
2256
+ //!
2257
+ //! @param[in] d_end_offsets
2258
+ //! @rst
2259
+ //! Random-access input iterator to the sequence of ending offsets of length
2260
+ //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of
2261
+ //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``.
2262
+ //! If ``d_end_offsets[i] - 1 < d_begin_offsets[i]``, the ``i``-th segment is
2263
+ //! considered empty.
2264
+ //! @endrst
2265
+ //!
2266
+ //! @param[in] stream
2267
+ //! @rst
2268
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
2269
+ //! @endrst
2270
+ template <typename KeyT, typename ValueT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
2271
+ CUB_RUNTIME_FUNCTION static cudaError_t StableSortPairs(
2272
+ void* d_temp_storage,
2273
+ size_t& temp_storage_bytes,
2274
+ const KeyT* d_keys_in,
2275
+ KeyT* d_keys_out,
2276
+ const ValueT* d_values_in,
2277
+ ValueT* d_values_out,
2278
+ ::cuda::std::int64_t num_items,
2279
+ ::cuda::std::int64_t num_segments,
2280
+ BeginOffsetIteratorT d_begin_offsets,
2281
+ EndOffsetIteratorT d_end_offsets,
2282
+ cudaStream_t stream = 0)
2283
+ {
2284
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());
2285
+ return SortPairsNoNVTX<KeyT, ValueT, BeginOffsetIteratorT, EndOffsetIteratorT>(
2286
+ d_temp_storage,
2287
+ temp_storage_bytes,
2288
+ d_keys_in,
2289
+ d_keys_out,
2290
+ d_values_in,
2291
+ d_values_out,
2292
+ num_items,
2293
+ num_segments,
2294
+ d_begin_offsets,
2295
+ d_end_offsets,
2296
+ stream);
2297
+ }
2298
+
2299
+ //! @rst
2300
+ //! Sorts segments of key-value pairs into descending order.
2301
+ //! Approximately ``2 * num_items + 2 * num_segments`` auxiliary storage required.
2302
+ //!
2303
+ //! - The contents of the input data are not altered by the sorting operation.
2304
+ //! - When the input is a contiguous sequence of segments, a single sequence
2305
+ //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased
2306
+ //! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where
2307
+ //! the latter is specified as ``segment_offsets + 1``).
2308
+ //! - StableSortPairsDescending is stable: it preserves the relative ordering
2309
+ //! of equivalent elements. That is, if ``x`` and ``y`` are elements such that
2310
+ //! ``x`` precedes ``y``, and if the two elements are equivalent (neither
2311
+ //! ``x < y`` nor ``y < x``) then a postcondition of stable sort is that
2312
+ //! ``x`` still precedes ``y``.
2313
+ //! - Let ``in`` be one of ``{d_keys_in, d_values_in}`` and ``out`` be any of
2314
+ //! ``{d_keys_out, d_values_out}``. The range ``[out, out + num_items)`` shall
2315
+ //! not overlap ``[in, in + num_items)``,
2316
+ //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor
2317
+ //! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way.
2318
+ //! - Segments are not required to be contiguous. For all index values ``i``
2319
+ //! outside the specified segments ``d_keys_in[i]``, ``d_values_in[i]``,
2320
+ //! ``d_keys_out[i]``, ``d_values_out[i]`` will not be accessed nor modified.
2321
+ //!
2322
+ //! Snippet
2323
+ //! +++++++++++++++++++++++++++++++++++++++++++++
2324
+ //!
2325
+ //! The code snippet below illustrates the batched sorting of three segments
2326
+ //! (with one zero-length segment) of ``int`` keys with associated vector of
2327
+ //! ``int`` values.
2328
+ //!
2329
+ //! .. code-block:: c++
2330
+ //!
2331
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_segmented_sort.cuh>
2332
+ //!
2333
+ //! // Declare, allocate, and initialize device-accessible pointers
2334
+ //! // for sorting data
2335
+ //! int num_items; // e.g., 7
2336
+ //! int num_segments; // e.g., 3
2337
+ //! int *d_offsets; // e.g., [0, 3, 3, 7]
2338
+ //! int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
2339
+ //! int *d_keys_out; // e.g., [-, -, -, -, -, -, -]
2340
+ //! int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6]
2341
+ //! int *d_values_out; // e.g., [-, -, -, -, -, -, -]
2342
+ //! ...
2343
+ //!
2344
+ //! // Determine temporary device storage requirements
2345
+ //! void *d_temp_storage = nullptr;
2346
+ //! size_t temp_storage_bytes = 0;
2347
+ //! cub::DeviceSegmentedSort::StableSortPairsDescending(
2348
+ //! d_temp_storage, temp_storage_bytes,
2349
+ //! d_keys_in, d_keys_out, d_values_in, d_values_out,
2350
+ //! num_items, num_segments, d_offsets, d_offsets + 1);
2351
+ //!
2352
+ //! // Allocate temporary storage
2353
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
2354
+ //!
2355
+ //! // Run sorting operation
2356
+ //! cub::DeviceSegmentedSort::StableSortPairsDescending(
2357
+ //! d_temp_storage, temp_storage_bytes,
2358
+ //! d_keys_in, d_keys_out, d_values_in, d_values_out,
2359
+ //! num_items, num_segments, d_offsets, d_offsets + 1);
2360
+ //!
2361
+ //! // d_keys_out <-- [8, 7, 6, 9, 5, 3, 0]
2362
+ //! // d_values_out <-- [0, 2, 1, 6, 3, 4, 5]
2363
+ //!
2364
+ //! @endrst
2365
+ //!
2366
+ //! @tparam KeyT
2367
+ //! **[inferred]** Key type
2368
+ //!
2369
+ //! @tparam ValueT
2370
+ //! **[inferred]** Value type
2371
+ //!
2372
+ //! @tparam BeginOffsetIteratorT
2373
+ //! **[inferred]** Random-access input iterator type for reading segment
2374
+ //! beginning offsets @iterator
2375
+ //!
2376
+ //! @tparam EndOffsetIteratorT
2377
+ //! **[inferred]** Random-access input iterator type for reading segment
2378
+ //! ending offsets @iterator
2379
+ //!
2380
+ //! @param[in] d_temp_storage
2381
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
2382
+ //! required allocation size is written to `temp_storage_bytes` and no work
2383
+ //! is done
2384
+ //!
2385
+ //! @param[in,out] temp_storage_bytes
2386
+ //! Reference to size in bytes of `d_temp_storage` allocation
2387
+ //!
2388
+ //! @param[in] d_keys_in
2389
+ //! Device-accessible pointer to the input data of key data to sort
2390
+ //!
2391
+ //! @param[out] d_keys_out
2392
+ //! Device-accessible pointer to the sorted output sequence of key data
2393
+ //!
2394
+ //! @param[in] d_values_in
2395
+ //! Device-accessible pointer to the corresponding input sequence of
2396
+ //! associated value items
2397
+ //!
2398
+ //! @param[out] d_values_out
2399
+ //! Device-accessible pointer to the correspondingly-reordered output
2400
+ //! sequence of associated value items
2401
+ //!
2402
+ //! @param[in] num_items
2403
+ //! The total number of items to sort (across all segments)
2404
+ //!
2405
+ //! @param[in] num_segments
2406
+ //! The number of segments that comprise the sorting data
2407
+ //!
2408
+ //! @param[in] d_begin_offsets
2409
+ //! @rst
2410
+ //! Random-access input iterator to the sequence of beginning offsets of
2411
+ //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first
2412
+ //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``
2413
+ //! @endrst
2414
+ //!
2415
+ //! @param[in] d_end_offsets
2416
+ //! @rst
2417
+ //! Random-access input iterator to the sequence of ending offsets of length
2418
+ //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of
2419
+ //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``.
2420
+ //! If ``d_end_offsets[i] <= d_begin_offsets[i]``, the ``i``-th segment is
2421
+ //! considered empty.
2422
+ //! @endrst
2423
+ //!
2424
+ //! @param[in] stream
2425
+ //! @rst
2426
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
2427
+ //! @endrst
2428
+ template <typename KeyT, typename ValueT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT> // overload taking separate input and output key/value arrays
2429
+ CUB_RUNTIME_FUNCTION static cudaError_t StableSortPairsDescending(
2430
+ void* d_temp_storage,
2431
+ size_t& temp_storage_bytes,
2432
+ const KeyT* d_keys_in,
2433
+ KeyT* d_keys_out,
2434
+ const ValueT* d_values_in,
2435
+ ValueT* d_values_out,
2436
+ ::cuda::std::int64_t num_items,
2437
+ ::cuda::std::int64_t num_segments,
2438
+ BeginOffsetIteratorT d_begin_offsets,
2439
+ EndOffsetIteratorT d_end_offsets,
2440
+ cudaStream_t stream = 0)
2441
+ {
2442
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); // NOTE(review): presumably opens a profiling range only when d_temp_storage is non-null (i.e. not for the size query) - confirm against the macro definition
2443
+ return SortPairsDescendingNoNVTX<KeyT, ValueT, BeginOffsetIteratorT, EndOffsetIteratorT>( // pure forwarding: every argument is passed through unchanged and the helper's cudaError_t is returned as-is
2444
+ d_temp_storage,
2445
+ temp_storage_bytes,
2446
+ d_keys_in,
2447
+ d_keys_out,
2448
+ d_values_in,
2449
+ d_values_out,
2450
+ num_items,
2451
+ num_segments,
2452
+ d_begin_offsets,
2453
+ d_end_offsets,
2454
+ stream);
2455
+ }
2456
+
2457
+ //! @rst
2458
+ //! Sorts segments of key-value pairs into ascending order.
2459
+ //! Approximately ``2 * num_segments`` auxiliary storage required.
2460
+ //!
2461
+ //! - The sorting operation is given a pair of key buffers and a corresponding
2462
+ //! pair of associated value buffers. Each pair is managed by a DoubleBuffer
2463
+ //! structure that indicates which of the two buffers is "current" (and thus
2464
+ //! contains the input data to be sorted).
2465
+ //! - The contents of both buffers within each pair may be altered by the
2466
+ //! sorting operation.
2467
+ //! - Upon completion, the sorting operation will update the "current"
2468
+ //! indicator within each DoubleBuffer wrapper to reference which of the two
2469
+ //! buffers now contains the sorted output sequence (a function of the number
2470
+ //! of key bits specified and the targeted device architecture).
2471
+ //! - When the input is a contiguous sequence of segments, a single sequence
2472
+ //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased
2473
+ //! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where
2474
+ //! the latter is specified as ``segment_offsets + 1``).
2475
+ //! - StableSortPairs is stable: it preserves the relative ordering
2476
+ //! of equivalent elements. That is, if ``x`` and ``y`` are elements such that
2477
+ //! ``x`` precedes ``y``, and if the two elements are equivalent (neither
2478
+ //! ``x < y`` nor ``y < x``) then a postcondition of stable sort is that
2479
+ //! ``x`` still precedes ``y``.
2480
+ //! - Let ``cur`` be one of ``{d_keys.Current(), d_values.Current()}`` and ``alt``
2481
+ //! be any of ``{d_keys.Alternate(), d_values.Alternate()}``. The range
2482
+ //! ``[cur, cur + num_items)`` shall not overlap
2483
+ //! ``[alt, alt + num_items)``. Both ranges shall not overlap
2484
+ //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor
2485
+ //! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way.
2486
+ //! - Segments are not required to be contiguous. For all index values ``i``
2487
+ //! outside the specified segments ``d_keys.Current()[i]``,
2488
+ //! ``d_values.Current()[i]``, ``d_keys.Alternate()[i]``,
2489
+ //! ``d_values.Alternate()[i]`` will not be accessed nor modified.
2490
+ //!
2491
+ //! Snippet
2492
+ //! +++++++++++++++++++++++++++++++++++++++++++++
2493
+ //!
2494
+ //! The code snippet below illustrates the batched sorting of three segments
2495
+ //! (with one zero-length segment) of ``int`` keys with associated vector of
2496
+ //! ``int`` values.
2497
+ //!
2498
+ //! .. code-block:: c++
2499
+ //!
2500
+ //! #include <cub/cub.cuh>
2501
+ //! // or equivalently <cub/device/device_segmented_sort.cuh>
2502
+ //!
2503
+ //! // Declare, allocate, and initialize device-accessible pointers
2504
+ //! // for sorting data
2505
+ //! int num_items; // e.g., 7
2506
+ //! int num_segments; // e.g., 3
2507
+ //! int *d_offsets; // e.g., [0, 3, 3, 7]
2508
+ //! int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9]
2509
+ //! int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -]
2510
+ //! int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6]
2511
+ //! int *d_value_alt_buf; // e.g., [-, -, -, -, -, -, -]
2512
+ //! ...
2513
+ //!
2514
+ //! // Create a set of DoubleBuffers to wrap pairs of device pointers
2515
+ //! cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
2516
+ //! cub::DoubleBuffer<int> d_values(d_value_buf, d_value_alt_buf);
2517
+ //!
2518
+ //! // Determine temporary device storage requirements
2519
+ //! void *d_temp_storage = nullptr;
2520
+ //! size_t temp_storage_bytes = 0;
2521
+ //! cub::DeviceSegmentedSort::StableSortPairs(
2522
+ //! d_temp_storage, temp_storage_bytes, d_keys, d_values,
2523
+ //! num_items, num_segments, d_offsets, d_offsets + 1);
2524
+ //!
2525
+ //! // Allocate temporary storage
2526
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
2527
+ //!
2528
+ //! // Run sorting operation
2529
+ //! cub::DeviceSegmentedSort::StableSortPairs(
2530
+ //! d_temp_storage, temp_storage_bytes, d_keys, d_values,
2531
+ //! num_items, num_segments, d_offsets, d_offsets + 1);
2532
+ //!
2533
+ //! // d_keys.Current() <-- [6, 7, 8, 0, 3, 5, 9]
2534
+ //! // d_values.Current() <-- [1, 2, 0, 5, 4, 3, 6]
2535
+ //!
2536
+ //! @endrst
2537
+ //!
2538
+ //! @tparam KeyT
2539
+ //! **[inferred]** Key type
2540
+ //!
2541
+ //! @tparam ValueT
2542
+ //! **[inferred]** Value type
2543
+ //!
2544
+ //! @tparam BeginOffsetIteratorT
2545
+ //! **[inferred]** Random-access input iterator type for reading segment
2546
+ //! beginning offsets @iterator
2547
+ //!
2548
+ //! @tparam EndOffsetIteratorT
2549
+ //! **[inferred]** Random-access input iterator type for reading segment
2550
+ //! ending offsets @iterator
2551
+ //!
2552
+ //! @param[in] d_temp_storage
2553
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
2554
+ //! required allocation size is written to `temp_storage_bytes` and no work
2555
+ //! is done
2556
+ //!
2557
+ //! @param[in,out] temp_storage_bytes
2558
+ //! Reference to size in bytes of `d_temp_storage` allocation
2559
+ //!
2560
+ //! @param[in,out] d_keys
2561
+ //! Reference to the double-buffer of keys whose "current" device-accessible
2562
+ //! buffer contains the unsorted input keys and, upon return, is updated to
2563
+ //! point to the sorted output keys
2564
+ //!
2565
+ //! @param[in,out] d_values
2566
+ //! Double-buffer of values whose "current" device-accessible buffer contains
2567
+ //! the unsorted input values and, upon return, is updated to point to the
2568
+ //! sorted output values
2569
+ //!
2570
+ //! @param[in] num_items
2571
+ //! The total number of items to sort (across all segments)
2572
+ //!
2573
+ //! @param[in] num_segments
2574
+ //! The number of segments that comprise the sorting data
2575
+ //!
2576
+ //! @param[in] d_begin_offsets
2577
+ //! @rst
2578
+ //! Random-access input iterator to the sequence of beginning offsets of
2579
+ //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first
2580
+ //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``
2581
+ //! @endrst
2582
+ //!
2583
+ //! @param[in] d_end_offsets
2584
+ //! @rst
2585
+ //! Random-access input iterator to the sequence of ending offsets of length
2586
+ //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of
2587
+ //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``.
2588
+ //! If ``d_end_offsets[i] <= d_begin_offsets[i]``, the ``i``-th segment is
2589
+ //! considered empty.
2590
+ //! @endrst
2591
+ //!
2592
+ //! @param[in] stream
2593
+ //! @rst
2594
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
2595
+ //! @endrst
2596
+ template <typename KeyT, typename ValueT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT> // overload taking DoubleBuffer key/value wrappers
2597
+ CUB_RUNTIME_FUNCTION static cudaError_t StableSortPairs(
2598
+ void* d_temp_storage,
2599
+ size_t& temp_storage_bytes,
2600
+ DoubleBuffer<KeyT>& d_keys,
2601
+ DoubleBuffer<ValueT>& d_values,
2602
+ ::cuda::std::int64_t num_items,
2603
+ ::cuda::std::int64_t num_segments,
2604
+ BeginOffsetIteratorT d_begin_offsets,
2605
+ EndOffsetIteratorT d_end_offsets,
2606
+ cudaStream_t stream = 0)
2607
+ {
2608
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); // NOTE(review): presumably opens a profiling range only when d_temp_storage is non-null (i.e. not for the size query) - confirm against the macro definition
2609
+ return SortPairsNoNVTX<KeyT, ValueT, BeginOffsetIteratorT, EndOffsetIteratorT>( // pure forwarding: both DoubleBuffers are passed by reference and the helper's cudaError_t is returned as-is
2610
+ d_temp_storage,
2611
+ temp_storage_bytes,
2612
+ d_keys,
2613
+ d_values,
2614
+ num_items,
2615
+ num_segments,
2616
+ d_begin_offsets,
2617
+ d_end_offsets,
2618
+ stream);
2619
+ }
2620
+
2621
+ //! @rst
2622
+ //! Sorts segments of key-value pairs into descending order.
2623
+ //! Approximately ``2 * num_segments`` auxiliary storage required.
2624
+ //!
2625
+ //! - The sorting operation is given a pair of key buffers and a corresponding
2626
+ //! pair of associated value buffers. Each pair is managed by a DoubleBuffer
2627
+ //! structure that indicates which of the two buffers is "current" (and thus
2628
+ //! contains the input data to be sorted).
2629
+ //! - The contents of both buffers within each pair may be altered by the sorting
2630
+ //! operation.
2631
+ //! - Upon completion, the sorting operation will update the "current" indicator
2632
+ //! within each DoubleBuffer wrapper to reference which of the two buffers
2633
+ //! now contains the sorted output sequence (a function of the number of key bits
2634
+ //! specified and the targeted device architecture).
2635
+ //! - When the input is a contiguous sequence of segments, a single sequence
2636
+ //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased
2637
+ //! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where
2638
+ //! the latter is specified as ``segment_offsets + 1``).
2639
+ //! - StableSortPairsDescending is stable: it preserves the relative ordering
2640
+ //! of equivalent elements. That is, if ``x`` and ``y`` are elements such that
2641
+ //! ``x`` precedes ``y``, and if the two elements are equivalent (neither
2642
+ //! ``x < y`` nor ``y < x``) then a postcondition of stable sort is that
2643
+ //! ``x`` still precedes ``y``.
2644
+ //! - Let ``cur`` be one of ``{d_keys.Current(), d_values.Current()}`` and ``alt``
2645
+ //! be any of ``{d_keys.Alternate(), d_values.Alternate()}``. The range
2646
+ //! ``[cur, cur + num_items)`` shall not overlap
2647
+ //! ``[alt, alt + num_items)``. Both ranges shall not overlap
2648
+ //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor
2649
+ //! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way.
2650
+ //! - Segments are not required to be contiguous. For all index values ``i``
2651
+ //! outside the specified segments ``d_keys.Current()[i]``,
2652
+ //! ``d_values.Current()[i]``, ``d_keys.Alternate()[i]``,
2653
+ //! ``d_values.Alternate()[i]`` will not be accessed nor modified.
2654
+ //!
2655
+ //! Snippet
2656
+ //! +++++++++++++++++++++++++++++++++++++++++++++
2657
+ //!
2658
+ //! The code snippet below illustrates the batched sorting of three segments
2659
+ //! (with one zero-length segment) of ``int`` keys with associated vector of
2660
+ //! ``int`` values.
2661
+ //!
2662
+ //! .. code-block:: c++
2663
+ //!
2664
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_segmented_sort.cuh>
2665
+ //!
2666
+ //! // Declare, allocate, and initialize device-accessible pointers
2667
+ //! // for sorting data
2668
+ //! int num_items; // e.g., 7
2669
+ //! int num_segments; // e.g., 3
2670
+ //! int *d_offsets; // e.g., [0, 3, 3, 7]
2671
+ //! int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9]
2672
+ //! int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -]
2673
+ //! int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6]
2674
+ //! int *d_value_alt_buf; // e.g., [-, -, -, -, -, -, -]
2675
+ //! ...
2676
+ //!
2677
+ //! // Create a set of DoubleBuffers to wrap pairs of device pointers
2678
+ //! cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
2679
+ //! cub::DoubleBuffer<int> d_values(d_value_buf, d_value_alt_buf);
2680
+ //!
2681
+ //! // Determine temporary device storage requirements
2682
+ //! void *d_temp_storage = nullptr;
2683
+ //! size_t temp_storage_bytes = 0;
2684
+ //! cub::DeviceSegmentedSort::StableSortPairsDescending(
2685
+ //! d_temp_storage, temp_storage_bytes, d_keys, d_values,
2686
+ //! num_items, num_segments, d_offsets, d_offsets + 1);
2687
+ //!
2688
+ //! // Allocate temporary storage
2689
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
2690
+ //!
2691
+ //! // Run sorting operation
2692
+ //! cub::DeviceSegmentedSort::StableSortPairsDescending(
2693
+ //! d_temp_storage, temp_storage_bytes, d_keys, d_values,
2694
+ //! num_items, num_segments, d_offsets, d_offsets + 1);
2695
+ //!
2696
+ //! // d_keys.Current() <-- [8, 7, 6, 9, 5, 3, 0]
2697
+ //! // d_values.Current() <-- [0, 2, 1, 6, 3, 4, 5]
2698
+ //!
2699
+ //! @endrst
2700
+ //!
2701
+ //! @tparam KeyT
2702
+ //! **[inferred]** Key type
2703
+ //!
2704
+ //! @tparam ValueT
2705
+ //! **[inferred]** Value type
2706
+ //!
2707
+ //! @tparam BeginOffsetIteratorT
2708
+ //! **[inferred]** Random-access input iterator type for reading segment
2709
+ //! beginning offsets @iterator
2710
+ //!
2711
+ //! @tparam EndOffsetIteratorT
2712
+ //! **[inferred]** Random-access input iterator type for reading segment
2713
+ //! ending offsets @iterator
2714
+ //!
2715
+ //! @param[in] d_temp_storage
2716
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
2717
+ //! required allocation size is written to `temp_storage_bytes` and no work
2718
+ //! is done
2719
+ //!
2720
+ //! @param[in,out] temp_storage_bytes
2721
+ //! Reference to size in bytes of `d_temp_storage` allocation
2722
+ //!
2723
+ //! @param[in,out] d_keys
2724
+ //! Reference to the double-buffer of keys whose "current" device-accessible
2725
+ //! buffer contains the unsorted input keys and, upon return, is updated to
2726
+ //! point to the sorted output keys
2727
+ //!
2728
+ //! @param[in,out] d_values
2729
+ //! Double-buffer of values whose "current" device-accessible buffer contains
2730
+ //! the unsorted input values and, upon return, is updated to point to the
2731
+ //! sorted output values
2732
+ //!
2733
+ //! @param[in] num_items
2734
+ //! The total number of items to sort (across all segments)
2735
+ //!
2736
+ //! @param[in] num_segments
2737
+ //! The number of segments that comprise the sorting data
2738
+ //!
2739
+ //! @param[in] d_begin_offsets
2740
+ //! @rst
2741
+ //! Random-access input iterator to the sequence of beginning offsets of
2742
+ //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first
2743
+ //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``
2744
+ //! @endrst
2745
+ //!
2746
+ //! @param[in] d_end_offsets
2747
+ //! @rst
2748
+ //! Random-access input iterator to the sequence of ending offsets of length
2749
+ //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of
2750
+ //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``.
2751
+ //! If ``d_end_offsets[i] <= d_begin_offsets[i]``, the ``i``-th segment is
2752
+ //! considered empty.
2753
+ //! @endrst
2754
+ //!
2755
+ //! @param[in] stream
2756
+ //! @rst
2757
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
2758
+ //! @endrst
2759
+ template <typename KeyT, typename ValueT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT> // overload taking DoubleBuffer key/value wrappers
2760
+ CUB_RUNTIME_FUNCTION static cudaError_t StableSortPairsDescending(
2761
+ void* d_temp_storage,
2762
+ size_t& temp_storage_bytes,
2763
+ DoubleBuffer<KeyT>& d_keys,
2764
+ DoubleBuffer<ValueT>& d_values,
2765
+ ::cuda::std::int64_t num_items,
2766
+ ::cuda::std::int64_t num_segments,
2767
+ BeginOffsetIteratorT d_begin_offsets,
2768
+ EndOffsetIteratorT d_end_offsets,
2769
+ cudaStream_t stream = 0)
2770
+ {
2771
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); // NOTE(review): presumably opens a profiling range only when d_temp_storage is non-null (i.e. not for the size query) - confirm against the macro definition
2772
+ return SortPairsDescendingNoNVTX<KeyT, ValueT, BeginOffsetIteratorT, EndOffsetIteratorT>( // pure forwarding: both DoubleBuffers are passed by reference and the helper's cudaError_t is returned as-is
2773
+ d_temp_storage,
2774
+ temp_storage_bytes,
2775
+ d_keys,
2776
+ d_values,
2777
+ num_items,
2778
+ num_segments,
2779
+ d_begin_offsets,
2780
+ d_end_offsets,
2781
+ stream);
2782
+ }
2783
+
2784
+ //! @} end member group
2785
+ };
2786
+
2787
+ CUB_NAMESPACE_END