cuda-cccl 0.3.3__cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cuda-cccl might be problematic. Click here for more details.

Files changed (1968):
  1. cuda/cccl/__init__.py +27 -0
  2. cuda/cccl/_cuda_version_utils.py +24 -0
  3. cuda/cccl/cooperative/__init__.py +9 -0
  4. cuda/cccl/cooperative/experimental/__init__.py +24 -0
  5. cuda/cccl/headers/__init__.py +7 -0
  6. cuda/cccl/headers/include/__init__.py +1 -0
  7. cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +259 -0
  8. cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +1182 -0
  9. cuda/cccl/headers/include/cub/agent/agent_for.cuh +81 -0
  10. cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +709 -0
  11. cuda/cccl/headers/include/cub/agent/agent_merge.cuh +234 -0
  12. cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +748 -0
  13. cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +786 -0
  14. cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +286 -0
  15. cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +703 -0
  16. cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +555 -0
  17. cuda/cccl/headers/include/cub/agent/agent_reduce.cuh +619 -0
  18. cuda/cccl/headers/include/cub/agent/agent_reduce_by_key.cuh +806 -0
  19. cuda/cccl/headers/include/cub/agent/agent_rle.cuh +1124 -0
  20. cuda/cccl/headers/include/cub/agent/agent_scan.cuh +589 -0
  21. cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +474 -0
  22. cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +289 -0
  23. cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +1117 -0
  24. cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +346 -0
  25. cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +606 -0
  26. cuda/cccl/headers/include/cub/agent/agent_topk.cuh +764 -0
  27. cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +631 -0
  28. cuda/cccl/headers/include/cub/agent/single_pass_scan_operators.cuh +1424 -0
  29. cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +963 -0
  30. cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +1227 -0
  31. cuda/cccl/headers/include/cub/block/block_exchange.cuh +1313 -0
  32. cuda/cccl/headers/include/cub/block/block_histogram.cuh +424 -0
  33. cuda/cccl/headers/include/cub/block/block_load.cuh +1264 -0
  34. cuda/cccl/headers/include/cub/block/block_load_to_shared.cuh +432 -0
  35. cuda/cccl/headers/include/cub/block/block_merge_sort.cuh +800 -0
  36. cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +1225 -0
  37. cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +2196 -0
  38. cuda/cccl/headers/include/cub/block/block_raking_layout.cuh +150 -0
  39. cuda/cccl/headers/include/cub/block/block_reduce.cuh +667 -0
  40. cuda/cccl/headers/include/cub/block/block_run_length_decode.cuh +434 -0
  41. cuda/cccl/headers/include/cub/block/block_scan.cuh +2315 -0
  42. cuda/cccl/headers/include/cub/block/block_shuffle.cuh +346 -0
  43. cuda/cccl/headers/include/cub/block/block_store.cuh +1247 -0
  44. cuda/cccl/headers/include/cub/block/radix_rank_sort_operations.cuh +624 -0
  45. cuda/cccl/headers/include/cub/block/specializations/block_histogram_atomic.cuh +86 -0
  46. cuda/cccl/headers/include/cub/block/specializations/block_histogram_sort.cuh +240 -0
  47. cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking.cuh +252 -0
  48. cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking_commutative_only.cuh +238 -0
  49. cuda/cccl/headers/include/cub/block/specializations/block_reduce_warp_reductions.cuh +281 -0
  50. cuda/cccl/headers/include/cub/block/specializations/block_scan_raking.cuh +790 -0
  51. cuda/cccl/headers/include/cub/block/specializations/block_scan_warp_scans.cuh +538 -0
  52. cuda/cccl/headers/include/cub/config.cuh +53 -0
  53. cuda/cccl/headers/include/cub/cub.cuh +120 -0
  54. cuda/cccl/headers/include/cub/detail/array_utils.cuh +78 -0
  55. cuda/cccl/headers/include/cub/detail/choose_offset.cuh +161 -0
  56. cuda/cccl/headers/include/cub/detail/detect_cuda_runtime.cuh +74 -0
  57. cuda/cccl/headers/include/cub/detail/device_double_buffer.cuh +96 -0
  58. cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +62 -0
  59. cuda/cccl/headers/include/cub/detail/fast_modulo_division.cuh +253 -0
  60. cuda/cccl/headers/include/cub/detail/integer_utils.cuh +88 -0
  61. cuda/cccl/headers/include/cub/detail/launcher/cuda_driver.cuh +142 -0
  62. cuda/cccl/headers/include/cub/detail/launcher/cuda_runtime.cuh +100 -0
  63. cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +114 -0
  64. cuda/cccl/headers/include/cub/detail/ptx-json/README.md +71 -0
  65. cuda/cccl/headers/include/cub/detail/ptx-json/array.h +68 -0
  66. cuda/cccl/headers/include/cub/detail/ptx-json/json.h +62 -0
  67. cuda/cccl/headers/include/cub/detail/ptx-json/object.h +100 -0
  68. cuda/cccl/headers/include/cub/detail/ptx-json/string.h +53 -0
  69. cuda/cccl/headers/include/cub/detail/ptx-json/value.h +95 -0
  70. cuda/cccl/headers/include/cub/detail/ptx-json-parser.h +63 -0
  71. cuda/cccl/headers/include/cub/detail/rfa.cuh +731 -0
  72. cuda/cccl/headers/include/cub/detail/strong_load.cuh +189 -0
  73. cuda/cccl/headers/include/cub/detail/strong_store.cuh +220 -0
  74. cuda/cccl/headers/include/cub/detail/temporary_storage.cuh +384 -0
  75. cuda/cccl/headers/include/cub/detail/type_traits.cuh +187 -0
  76. cuda/cccl/headers/include/cub/detail/uninitialized_copy.cuh +73 -0
  77. cuda/cccl/headers/include/cub/detail/unsafe_bitcast.cuh +56 -0
  78. cuda/cccl/headers/include/cub/device/device_adjacent_difference.cuh +596 -0
  79. cuda/cccl/headers/include/cub/device/device_copy.cuh +276 -0
  80. cuda/cccl/headers/include/cub/device/device_for.cuh +1063 -0
  81. cuda/cccl/headers/include/cub/device/device_histogram.cuh +1509 -0
  82. cuda/cccl/headers/include/cub/device/device_memcpy.cuh +195 -0
  83. cuda/cccl/headers/include/cub/device/device_merge.cuh +203 -0
  84. cuda/cccl/headers/include/cub/device/device_merge_sort.cuh +979 -0
  85. cuda/cccl/headers/include/cub/device/device_partition.cuh +668 -0
  86. cuda/cccl/headers/include/cub/device/device_radix_sort.cuh +3437 -0
  87. cuda/cccl/headers/include/cub/device/device_reduce.cuh +2518 -0
  88. cuda/cccl/headers/include/cub/device/device_run_length_encode.cuh +370 -0
  89. cuda/cccl/headers/include/cub/device/device_scan.cuh +2212 -0
  90. cuda/cccl/headers/include/cub/device/device_segmented_radix_sort.cuh +1496 -0
  91. cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +1430 -0
  92. cuda/cccl/headers/include/cub/device/device_segmented_sort.cuh +2811 -0
  93. cuda/cccl/headers/include/cub/device/device_select.cuh +1228 -0
  94. cuda/cccl/headers/include/cub/device/device_topk.cuh +511 -0
  95. cuda/cccl/headers/include/cub/device/device_transform.cuh +668 -0
  96. cuda/cccl/headers/include/cub/device/dispatch/dispatch_adjacent_difference.cuh +315 -0
  97. cuda/cccl/headers/include/cub/device/dispatch/dispatch_batch_memcpy.cuh +719 -0
  98. cuda/cccl/headers/include/cub/device/dispatch/dispatch_common.cuh +43 -0
  99. cuda/cccl/headers/include/cub/device/dispatch/dispatch_copy_mdspan.cuh +79 -0
  100. cuda/cccl/headers/include/cub/device/dispatch/dispatch_for.cuh +198 -0
  101. cuda/cccl/headers/include/cub/device/dispatch/dispatch_histogram.cuh +1046 -0
  102. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +303 -0
  103. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge_sort.cuh +473 -0
  104. cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +1744 -0
  105. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +1310 -0
  106. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_by_key.cuh +655 -0
  107. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +531 -0
  108. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +313 -0
  109. cuda/cccl/headers/include/cub/device/dispatch/dispatch_rle.cuh +615 -0
  110. cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan.cuh +517 -0
  111. cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan_by_key.cuh +602 -0
  112. cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +975 -0
  113. cuda/cccl/headers/include/cub/device/dispatch/dispatch_select_if.cuh +842 -0
  114. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +341 -0
  115. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +440 -0
  116. cuda/cccl/headers/include/cub/device/dispatch/dispatch_three_way_partition.cuh +389 -0
  117. cuda/cccl/headers/include/cub/device/dispatch/dispatch_topk.cuh +627 -0
  118. cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +569 -0
  119. cuda/cccl/headers/include/cub/device/dispatch/dispatch_unique_by_key.cuh +545 -0
  120. cuda/cccl/headers/include/cub/device/dispatch/kernels/for_each.cuh +261 -0
  121. cuda/cccl/headers/include/cub/device/dispatch/kernels/histogram.cuh +505 -0
  122. cuda/cccl/headers/include/cub/device/dispatch/kernels/merge_sort.cuh +334 -0
  123. cuda/cccl/headers/include/cub/device/dispatch/kernels/radix_sort.cuh +803 -0
  124. cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +583 -0
  125. cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +189 -0
  126. cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +321 -0
  127. cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_sort.cuh +522 -0
  128. cuda/cccl/headers/include/cub/device/dispatch/kernels/three_way_partition.cuh +201 -0
  129. cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +1028 -0
  130. cuda/cccl/headers/include/cub/device/dispatch/kernels/unique_by_key.cuh +176 -0
  131. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +67 -0
  132. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +118 -0
  133. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +60 -0
  134. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +275 -0
  135. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +76 -0
  136. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +126 -0
  137. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +1065 -0
  138. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce.cuh +493 -0
  139. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +942 -0
  140. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +673 -0
  141. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +618 -0
  142. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +1010 -0
  143. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +398 -0
  144. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_select_if.cuh +1588 -0
  145. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +440 -0
  146. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_topk.cuh +85 -0
  147. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +481 -0
  148. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +884 -0
  149. cuda/cccl/headers/include/cub/grid/grid_even_share.cuh +227 -0
  150. cuda/cccl/headers/include/cub/grid/grid_mapping.cuh +106 -0
  151. cuda/cccl/headers/include/cub/grid/grid_queue.cuh +202 -0
  152. cuda/cccl/headers/include/cub/iterator/arg_index_input_iterator.cuh +254 -0
  153. cuda/cccl/headers/include/cub/iterator/cache_modified_input_iterator.cuh +259 -0
  154. cuda/cccl/headers/include/cub/iterator/cache_modified_output_iterator.cuh +250 -0
  155. cuda/cccl/headers/include/cub/iterator/tex_obj_input_iterator.cuh +320 -0
  156. cuda/cccl/headers/include/cub/thread/thread_load.cuh +349 -0
  157. cuda/cccl/headers/include/cub/thread/thread_operators.cuh +688 -0
  158. cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +548 -0
  159. cuda/cccl/headers/include/cub/thread/thread_scan.cuh +498 -0
  160. cuda/cccl/headers/include/cub/thread/thread_search.cuh +199 -0
  161. cuda/cccl/headers/include/cub/thread/thread_simd.cuh +458 -0
  162. cuda/cccl/headers/include/cub/thread/thread_sort.cuh +102 -0
  163. cuda/cccl/headers/include/cub/thread/thread_store.cuh +365 -0
  164. cuda/cccl/headers/include/cub/util_allocator.cuh +921 -0
  165. cuda/cccl/headers/include/cub/util_arch.cuh +167 -0
  166. cuda/cccl/headers/include/cub/util_cpp_dialect.cuh +95 -0
  167. cuda/cccl/headers/include/cub/util_debug.cuh +207 -0
  168. cuda/cccl/headers/include/cub/util_device.cuh +800 -0
  169. cuda/cccl/headers/include/cub/util_macro.cuh +97 -0
  170. cuda/cccl/headers/include/cub/util_math.cuh +118 -0
  171. cuda/cccl/headers/include/cub/util_namespace.cuh +176 -0
  172. cuda/cccl/headers/include/cub/util_policy_wrapper_t.cuh +55 -0
  173. cuda/cccl/headers/include/cub/util_ptx.cuh +513 -0
  174. cuda/cccl/headers/include/cub/util_temporary_storage.cuh +122 -0
  175. cuda/cccl/headers/include/cub/util_type.cuh +1120 -0
  176. cuda/cccl/headers/include/cub/util_vsmem.cuh +253 -0
  177. cuda/cccl/headers/include/cub/version.cuh +89 -0
  178. cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_shfl.cuh +329 -0
  179. cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_smem.cuh +177 -0
  180. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +737 -0
  181. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +408 -0
  182. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +952 -0
  183. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_smem.cuh +715 -0
  184. cuda/cccl/headers/include/cub/warp/warp_exchange.cuh +405 -0
  185. cuda/cccl/headers/include/cub/warp/warp_load.cuh +614 -0
  186. cuda/cccl/headers/include/cub/warp/warp_merge_sort.cuh +169 -0
  187. cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +829 -0
  188. cuda/cccl/headers/include/cub/warp/warp_scan.cuh +1890 -0
  189. cuda/cccl/headers/include/cub/warp/warp_store.cuh +521 -0
  190. cuda/cccl/headers/include/cub/warp/warp_utils.cuh +61 -0
  191. cuda/cccl/headers/include/cuda/__algorithm/common.h +68 -0
  192. cuda/cccl/headers/include/cuda/__algorithm/copy.h +196 -0
  193. cuda/cccl/headers/include/cuda/__algorithm/fill.h +107 -0
  194. cuda/cccl/headers/include/cuda/__annotated_ptr/access_property.h +165 -0
  195. cuda/cccl/headers/include/cuda/__annotated_ptr/access_property_encoding.h +172 -0
  196. cuda/cccl/headers/include/cuda/__annotated_ptr/annotated_ptr.h +217 -0
  197. cuda/cccl/headers/include/cuda/__annotated_ptr/annotated_ptr_base.h +100 -0
  198. cuda/cccl/headers/include/cuda/__annotated_ptr/apply_access_property.h +83 -0
  199. cuda/cccl/headers/include/cuda/__annotated_ptr/associate_access_property.h +128 -0
  200. cuda/cccl/headers/include/cuda/__annotated_ptr/createpolicy.h +210 -0
  201. cuda/cccl/headers/include/cuda/__atomic/atomic.h +145 -0
  202. cuda/cccl/headers/include/cuda/__barrier/async_contract_fulfillment.h +39 -0
  203. cuda/cccl/headers/include/cuda/__barrier/barrier.h +65 -0
  204. cuda/cccl/headers/include/cuda/__barrier/barrier_arrive_tx.h +102 -0
  205. cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +487 -0
  206. cuda/cccl/headers/include/cuda/__barrier/barrier_expect_tx.h +74 -0
  207. cuda/cccl/headers/include/cuda/__barrier/barrier_native_handle.h +45 -0
  208. cuda/cccl/headers/include/cuda/__barrier/barrier_thread_scope.h +60 -0
  209. cuda/cccl/headers/include/cuda/__bit/bit_reverse.h +171 -0
  210. cuda/cccl/headers/include/cuda/__bit/bitfield.h +122 -0
  211. cuda/cccl/headers/include/cuda/__bit/bitmask.h +90 -0
  212. cuda/cccl/headers/include/cuda/__cccl_config +37 -0
  213. cuda/cccl/headers/include/cuda/__cmath/ceil_div.h +124 -0
  214. cuda/cccl/headers/include/cuda/__cmath/fast_modulo_division.h +178 -0
  215. cuda/cccl/headers/include/cuda/__cmath/ilog.h +195 -0
  216. cuda/cccl/headers/include/cuda/__cmath/ipow.h +107 -0
  217. cuda/cccl/headers/include/cuda/__cmath/isqrt.h +80 -0
  218. cuda/cccl/headers/include/cuda/__cmath/mul_hi.h +146 -0
  219. cuda/cccl/headers/include/cuda/__cmath/neg.h +47 -0
  220. cuda/cccl/headers/include/cuda/__cmath/pow2.h +74 -0
  221. cuda/cccl/headers/include/cuda/__cmath/round_down.h +102 -0
  222. cuda/cccl/headers/include/cuda/__cmath/round_up.h +104 -0
  223. cuda/cccl/headers/include/cuda/__cmath/uabs.h +57 -0
  224. cuda/cccl/headers/include/cuda/__complex/complex.h +238 -0
  225. cuda/cccl/headers/include/cuda/__complex/get_real_imag.h +89 -0
  226. cuda/cccl/headers/include/cuda/__complex/traits.h +64 -0
  227. cuda/cccl/headers/include/cuda/__complex_ +28 -0
  228. cuda/cccl/headers/include/cuda/__device/all_devices.h +140 -0
  229. cuda/cccl/headers/include/cuda/__device/arch_id.h +176 -0
  230. cuda/cccl/headers/include/cuda/__device/arch_traits.h +537 -0
  231. cuda/cccl/headers/include/cuda/__device/attributes.h +772 -0
  232. cuda/cccl/headers/include/cuda/__device/compute_capability.h +171 -0
  233. cuda/cccl/headers/include/cuda/__device/device_ref.h +156 -0
  234. cuda/cccl/headers/include/cuda/__device/physical_device.h +172 -0
  235. cuda/cccl/headers/include/cuda/__driver/driver_api.h +835 -0
  236. cuda/cccl/headers/include/cuda/__event/event.h +171 -0
  237. cuda/cccl/headers/include/cuda/__event/event_ref.h +157 -0
  238. cuda/cccl/headers/include/cuda/__event/timed_event.h +120 -0
  239. cuda/cccl/headers/include/cuda/__execution/determinism.h +91 -0
  240. cuda/cccl/headers/include/cuda/__execution/output_ordering.h +89 -0
  241. cuda/cccl/headers/include/cuda/__execution/require.h +75 -0
  242. cuda/cccl/headers/include/cuda/__execution/tune.h +70 -0
  243. cuda/cccl/headers/include/cuda/__functional/address_stability.h +131 -0
  244. cuda/cccl/headers/include/cuda/__functional/for_each_canceled.h +321 -0
  245. cuda/cccl/headers/include/cuda/__functional/maximum.h +58 -0
  246. cuda/cccl/headers/include/cuda/__functional/minimum.h +58 -0
  247. cuda/cccl/headers/include/cuda/__functional/proclaim_return_type.h +108 -0
  248. cuda/cccl/headers/include/cuda/__fwd/barrier.h +38 -0
  249. cuda/cccl/headers/include/cuda/__fwd/barrier_native_handle.h +42 -0
  250. cuda/cccl/headers/include/cuda/__fwd/complex.h +48 -0
  251. cuda/cccl/headers/include/cuda/__fwd/devices.h +44 -0
  252. cuda/cccl/headers/include/cuda/__fwd/get_stream.h +38 -0
  253. cuda/cccl/headers/include/cuda/__fwd/pipeline.h +37 -0
  254. cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +58 -0
  255. cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +315 -0
  256. cuda/cccl/headers/include/cuda/__iterator/counting_iterator.h +483 -0
  257. cuda/cccl/headers/include/cuda/__iterator/discard_iterator.h +324 -0
  258. cuda/cccl/headers/include/cuda/__iterator/permutation_iterator.h +456 -0
  259. cuda/cccl/headers/include/cuda/__iterator/shuffle_iterator.h +334 -0
  260. cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +418 -0
  261. cuda/cccl/headers/include/cuda/__iterator/tabulate_output_iterator.h +367 -0
  262. cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +528 -0
  263. cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +527 -0
  264. cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +486 -0
  265. cuda/cccl/headers/include/cuda/__iterator/zip_common.h +148 -0
  266. cuda/cccl/headers/include/cuda/__iterator/zip_function.h +112 -0
  267. cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +557 -0
  268. cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +592 -0
  269. cuda/cccl/headers/include/cuda/__latch/latch.h +44 -0
  270. cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +533 -0
  271. cuda/cccl/headers/include/cuda/__mdspan/host_device_mdspan.h +238 -0
  272. cuda/cccl/headers/include/cuda/__mdspan/restrict_accessor.h +152 -0
  273. cuda/cccl/headers/include/cuda/__mdspan/restrict_mdspan.h +117 -0
  274. cuda/cccl/headers/include/cuda/__memcpy_async/check_preconditions.h +79 -0
  275. cuda/cccl/headers/include/cuda/__memcpy_async/completion_mechanism.h +47 -0
  276. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_bulk_shared_global.h +60 -0
  277. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_fallback.h +72 -0
  278. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_shared_global.h +148 -0
  279. cuda/cccl/headers/include/cuda/__memcpy_async/dispatch_memcpy_async.h +165 -0
  280. cuda/cccl/headers/include/cuda/__memcpy_async/is_local_smem_barrier.h +53 -0
  281. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async.h +179 -0
  282. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_barrier.h +99 -0
  283. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_tx.h +104 -0
  284. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_completion.h +170 -0
  285. cuda/cccl/headers/include/cuda/__memcpy_async/try_get_barrier_handle.h +59 -0
  286. cuda/cccl/headers/include/cuda/__memory/address_space.h +227 -0
  287. cuda/cccl/headers/include/cuda/__memory/align_down.h +56 -0
  288. cuda/cccl/headers/include/cuda/__memory/align_up.h +56 -0
  289. cuda/cccl/headers/include/cuda/__memory/aligned_size.h +61 -0
  290. cuda/cccl/headers/include/cuda/__memory/check_address.h +111 -0
  291. cuda/cccl/headers/include/cuda/__memory/discard_memory.h +64 -0
  292. cuda/cccl/headers/include/cuda/__memory/get_device_address.h +58 -0
  293. cuda/cccl/headers/include/cuda/__memory/is_aligned.h +47 -0
  294. cuda/cccl/headers/include/cuda/__memory/ptr_in_range.h +93 -0
  295. cuda/cccl/headers/include/cuda/__memory/ptr_rebind.h +75 -0
  296. cuda/cccl/headers/include/cuda/__memory_resource/get_memory_resource.h +82 -0
  297. cuda/cccl/headers/include/cuda/__memory_resource/get_property.h +153 -0
  298. cuda/cccl/headers/include/cuda/__memory_resource/properties.h +113 -0
  299. cuda/cccl/headers/include/cuda/__memory_resource/resource.h +125 -0
  300. cuda/cccl/headers/include/cuda/__memory_resource/resource_ref.h +652 -0
  301. cuda/cccl/headers/include/cuda/__numeric/add_overflow.h +306 -0
  302. cuda/cccl/headers/include/cuda/__numeric/narrow.h +108 -0
  303. cuda/cccl/headers/include/cuda/__numeric/overflow_cast.h +59 -0
  304. cuda/cccl/headers/include/cuda/__numeric/overflow_result.h +43 -0
  305. cuda/cccl/headers/include/cuda/__nvtx/nvtx.h +120 -0
  306. cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +2983 -0
  307. cuda/cccl/headers/include/cuda/__ptx/instructions/barrier_cluster.h +43 -0
  308. cuda/cccl/headers/include/cuda/__ptx/instructions/bfind.h +41 -0
  309. cuda/cccl/headers/include/cuda/__ptx/instructions/bmsk.h +41 -0
  310. cuda/cccl/headers/include/cuda/__ptx/instructions/clusterlaunchcontrol.h +41 -0
  311. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk.h +44 -0
  312. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_commit_group.h +43 -0
  313. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h +45 -0
  314. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_wait_group.h +43 -0
  315. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_mbarrier_arrive.h +42 -0
  316. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_reduce_async_bulk.h +60 -0
  317. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_reduce_async_bulk_tensor.h +43 -0
  318. cuda/cccl/headers/include/cuda/__ptx/instructions/elect_sync.h +41 -0
  319. cuda/cccl/headers/include/cuda/__ptx/instructions/exit.h +41 -0
  320. cuda/cccl/headers/include/cuda/__ptx/instructions/fence.h +49 -0
  321. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/barrier_cluster.h +115 -0
  322. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/bfind.h +190 -0
  323. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/bmsk.h +54 -0
  324. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/clusterlaunchcontrol.h +242 -0
  325. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk.h +197 -0
  326. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.h +25 -0
  327. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.h +54 -0
  328. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.h +997 -0
  329. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_gather_scatter.h +318 -0
  330. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.h +671 -0
  331. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.h +46 -0
  332. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive.h +26 -0
  333. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive_noinc.h +26 -0
  334. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.h +1470 -0
  335. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.h +132 -0
  336. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.h +132 -0
  337. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.h +601 -0
  338. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/elect_sync.h +36 -0
  339. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/exit.h +25 -0
  340. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence.h +208 -0
  341. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.h +31 -0
  342. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_alias.h +25 -0
  343. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_async.h +58 -0
  344. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_async_generic_sync_restrict.h +64 -0
  345. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.h +102 -0
  346. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_sync_restrict.h +64 -0
  347. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/get_sreg.h +949 -0
  348. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/getctarank.h +32 -0
  349. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/ld.h +5542 -0
  350. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive.h +399 -0
  351. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.h +184 -0
  352. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.h +34 -0
  353. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_expect_tx.h +102 -0
  354. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_init.h +27 -0
  355. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.h +143 -0
  356. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.h +144 -0
  357. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.h +286 -0
  358. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.h +290 -0
  359. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_ld_reduce.h +2202 -0
  360. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_red.h +1362 -0
  361. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_st.h +236 -0
  362. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/prmt.h +230 -0
  363. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/red_async.h +460 -0
  364. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/shl.h +96 -0
  365. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/shr.h +168 -0
  366. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st.h +1490 -0
  367. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st_async.h +123 -0
  368. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st_bulk.h +31 -0
  369. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_alloc.h +132 -0
  370. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_commit.h +99 -0
  371. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_cp.h +765 -0
  372. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_fence.h +58 -0
  373. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_ld.h +4927 -0
  374. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_mma.h +4291 -0
  375. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_mma_ws.h +7110 -0
  376. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_shift.h +42 -0
  377. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_st.h +5063 -0
  378. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_wait.h +56 -0
  379. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.h +71 -0
  380. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tensormap_replace.h +1030 -0
  381. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/trap.h +25 -0
  382. cuda/cccl/headers/include/cuda/__ptx/instructions/get_sreg.h +43 -0
  383. cuda/cccl/headers/include/cuda/__ptx/instructions/getctarank.h +43 -0
  384. cuda/cccl/headers/include/cuda/__ptx/instructions/ld.h +41 -0
  385. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_arrive.h +45 -0
  386. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_expect_tx.h +41 -0
  387. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_init.h +43 -0
  388. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_wait.h +46 -0
  389. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_ld_reduce.h +41 -0
  390. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_red.h +41 -0
  391. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_st.h +41 -0
  392. cuda/cccl/headers/include/cuda/__ptx/instructions/prmt.h +41 -0
  393. cuda/cccl/headers/include/cuda/__ptx/instructions/red_async.h +43 -0
  394. cuda/cccl/headers/include/cuda/__ptx/instructions/shfl_sync.h +244 -0
  395. cuda/cccl/headers/include/cuda/__ptx/instructions/shl.h +41 -0
  396. cuda/cccl/headers/include/cuda/__ptx/instructions/shr.h +41 -0
  397. cuda/cccl/headers/include/cuda/__ptx/instructions/st.h +41 -0
  398. cuda/cccl/headers/include/cuda/__ptx/instructions/st_async.h +43 -0
  399. cuda/cccl/headers/include/cuda/__ptx/instructions/st_bulk.h +41 -0
  400. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_alloc.h +41 -0
  401. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_commit.h +41 -0
  402. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_cp.h +41 -0
  403. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_fence.h +41 -0
  404. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_ld.h +41 -0
  405. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_mma.h +41 -0
  406. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_mma_ws.h +41 -0
  407. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_shift.h +41 -0
  408. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_st.h +41 -0
  409. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_wait.h +41 -0
  410. cuda/cccl/headers/include/cuda/__ptx/instructions/tensormap_cp_fenceproxy.h +43 -0
  411. cuda/cccl/headers/include/cuda/__ptx/instructions/tensormap_replace.h +43 -0
  412. cuda/cccl/headers/include/cuda/__ptx/instructions/trap.h +41 -0
  413. cuda/cccl/headers/include/cuda/__ptx/pragmas/enable_smem_spilling.h +47 -0
  414. cuda/cccl/headers/include/cuda/__ptx/ptx_dot_variants.h +230 -0
  415. cuda/cccl/headers/include/cuda/__ptx/ptx_helper_functions.h +176 -0
  416. cuda/cccl/headers/include/cuda/__random/feistel_bijection.h +105 -0
  417. cuda/cccl/headers/include/cuda/__random/random_bijection.h +88 -0
  418. cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +99 -0
  419. cuda/cccl/headers/include/cuda/__runtime/types.h +41 -0
  420. cuda/cccl/headers/include/cuda/__semaphore/counting_semaphore.h +53 -0
  421. cuda/cccl/headers/include/cuda/__stream/get_stream.h +110 -0
  422. cuda/cccl/headers/include/cuda/__stream/stream.h +141 -0
  423. cuda/cccl/headers/include/cuda/__stream/stream_ref.h +303 -0
  424. cuda/cccl/headers/include/cuda/__type_traits/is_floating_point.h +47 -0
  425. cuda/cccl/headers/include/cuda/__type_traits/is_specialization_of.h +37 -0
  426. cuda/cccl/headers/include/cuda/__utility/__basic_any/access.h +88 -0
  427. cuda/cccl/headers/include/cuda/__utility/__basic_any/any_cast.h +83 -0
  428. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_base.h +148 -0
  429. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_from.h +96 -0
  430. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_fwd.h +128 -0
  431. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_ptr.h +304 -0
  432. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_ref.h +337 -0
  433. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_value.h +590 -0
  434. cuda/cccl/headers/include/cuda/__utility/__basic_any/conversions.h +169 -0
  435. cuda/cccl/headers/include/cuda/__utility/__basic_any/dynamic_any_cast.h +107 -0
  436. cuda/cccl/headers/include/cuda/__utility/__basic_any/interfaces.h +359 -0
  437. cuda/cccl/headers/include/cuda/__utility/__basic_any/iset.h +142 -0
  438. cuda/cccl/headers/include/cuda/__utility/__basic_any/overrides.h +64 -0
  439. cuda/cccl/headers/include/cuda/__utility/__basic_any/rtti.h +257 -0
  440. cuda/cccl/headers/include/cuda/__utility/__basic_any/semiregular.h +322 -0
  441. cuda/cccl/headers/include/cuda/__utility/__basic_any/storage.h +79 -0
  442. cuda/cccl/headers/include/cuda/__utility/__basic_any/tagged_ptr.h +58 -0
  443. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtcall.h +162 -0
  444. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_functions.h +184 -0
  445. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_ptrs.h +80 -0
  446. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_tables.h +155 -0
  447. cuda/cccl/headers/include/cuda/__utility/basic_any.h +507 -0
  448. cuda/cccl/headers/include/cuda/__utility/immovable.h +50 -0
  449. cuda/cccl/headers/include/cuda/__utility/in_range.h +65 -0
  450. cuda/cccl/headers/include/cuda/__utility/inherit.h +36 -0
  451. cuda/cccl/headers/include/cuda/__utility/no_init.h +29 -0
  452. cuda/cccl/headers/include/cuda/__utility/static_for.h +79 -0
  453. cuda/cccl/headers/include/cuda/__warp/lane_mask.h +326 -0
  454. cuda/cccl/headers/include/cuda/__warp/warp_match_all.h +65 -0
  455. cuda/cccl/headers/include/cuda/__warp/warp_shuffle.h +251 -0
  456. cuda/cccl/headers/include/cuda/access_property +26 -0
  457. cuda/cccl/headers/include/cuda/algorithm +27 -0
  458. cuda/cccl/headers/include/cuda/annotated_ptr +29 -0
  459. cuda/cccl/headers/include/cuda/atomic +27 -0
  460. cuda/cccl/headers/include/cuda/barrier +267 -0
  461. cuda/cccl/headers/include/cuda/bit +29 -0
  462. cuda/cccl/headers/include/cuda/cmath +37 -0
  463. cuda/cccl/headers/include/cuda/devices +33 -0
  464. cuda/cccl/headers/include/cuda/discard_memory +32 -0
  465. cuda/cccl/headers/include/cuda/functional +32 -0
  466. cuda/cccl/headers/include/cuda/iterator +39 -0
  467. cuda/cccl/headers/include/cuda/latch +27 -0
  468. cuda/cccl/headers/include/cuda/mdspan +28 -0
  469. cuda/cccl/headers/include/cuda/memory +35 -0
  470. cuda/cccl/headers/include/cuda/memory_resource +35 -0
  471. cuda/cccl/headers/include/cuda/numeric +29 -0
  472. cuda/cccl/headers/include/cuda/pipeline +579 -0
  473. cuda/cccl/headers/include/cuda/ptx +129 -0
  474. cuda/cccl/headers/include/cuda/semaphore +31 -0
  475. cuda/cccl/headers/include/cuda/std/__algorithm/adjacent_find.h +59 -0
  476. cuda/cccl/headers/include/cuda/std/__algorithm/all_of.h +45 -0
  477. cuda/cccl/headers/include/cuda/std/__algorithm/any_of.h +45 -0
  478. cuda/cccl/headers/include/cuda/std/__algorithm/binary_search.h +53 -0
  479. cuda/cccl/headers/include/cuda/std/__algorithm/clamp.h +48 -0
  480. cuda/cccl/headers/include/cuda/std/__algorithm/comp.h +58 -0
  481. cuda/cccl/headers/include/cuda/std/__algorithm/comp_ref_type.h +85 -0
  482. cuda/cccl/headers/include/cuda/std/__algorithm/copy.h +142 -0
  483. cuda/cccl/headers/include/cuda/std/__algorithm/copy_backward.h +80 -0
  484. cuda/cccl/headers/include/cuda/std/__algorithm/copy_if.h +47 -0
  485. cuda/cccl/headers/include/cuda/std/__algorithm/copy_n.h +73 -0
  486. cuda/cccl/headers/include/cuda/std/__algorithm/count.h +49 -0
  487. cuda/cccl/headers/include/cuda/std/__algorithm/count_if.h +49 -0
  488. cuda/cccl/headers/include/cuda/std/__algorithm/equal.h +128 -0
  489. cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +101 -0
  490. cuda/cccl/headers/include/cuda/std/__algorithm/fill.h +58 -0
  491. cuda/cccl/headers/include/cuda/std/__algorithm/fill_n.h +51 -0
  492. cuda/cccl/headers/include/cuda/std/__algorithm/find.h +62 -0
  493. cuda/cccl/headers/include/cuda/std/__algorithm/find_end.h +225 -0
  494. cuda/cccl/headers/include/cuda/std/__algorithm/find_first_of.h +73 -0
  495. cuda/cccl/headers/include/cuda/std/__algorithm/find_if.h +46 -0
  496. cuda/cccl/headers/include/cuda/std/__algorithm/find_if_not.h +46 -0
  497. cuda/cccl/headers/include/cuda/std/__algorithm/for_each.h +42 -0
  498. cuda/cccl/headers/include/cuda/std/__algorithm/for_each_n.h +48 -0
  499. cuda/cccl/headers/include/cuda/std/__algorithm/generate.h +41 -0
  500. cuda/cccl/headers/include/cuda/std/__algorithm/generate_n.h +46 -0
  501. cuda/cccl/headers/include/cuda/std/__algorithm/half_positive.h +49 -0
  502. cuda/cccl/headers/include/cuda/std/__algorithm/in_fun_result.h +55 -0
  503. cuda/cccl/headers/include/cuda/std/__algorithm/includes.h +90 -0
  504. cuda/cccl/headers/include/cuda/std/__algorithm/is_heap.h +50 -0
  505. cuda/cccl/headers/include/cuda/std/__algorithm/is_heap_until.h +83 -0
  506. cuda/cccl/headers/include/cuda/std/__algorithm/is_partitioned.h +57 -0
  507. cuda/cccl/headers/include/cuda/std/__algorithm/is_permutation.h +252 -0
  508. cuda/cccl/headers/include/cuda/std/__algorithm/is_sorted.h +49 -0
  509. cuda/cccl/headers/include/cuda/std/__algorithm/is_sorted_until.h +68 -0
  510. cuda/cccl/headers/include/cuda/std/__algorithm/iter_swap.h +82 -0
  511. cuda/cccl/headers/include/cuda/std/__algorithm/iterator_operations.h +185 -0
  512. cuda/cccl/headers/include/cuda/std/__algorithm/lexicographical_compare.h +68 -0
  513. cuda/cccl/headers/include/cuda/std/__algorithm/lower_bound.h +82 -0
  514. cuda/cccl/headers/include/cuda/std/__algorithm/make_heap.h +70 -0
  515. cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +88 -0
  516. cuda/cccl/headers/include/cuda/std/__algorithm/max.h +62 -0
  517. cuda/cccl/headers/include/cuda/std/__algorithm/max_element.h +67 -0
  518. cuda/cccl/headers/include/cuda/std/__algorithm/merge.h +89 -0
  519. cuda/cccl/headers/include/cuda/std/__algorithm/min.h +62 -0
  520. cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +87 -0
  521. cuda/cccl/headers/include/cuda/std/__algorithm/minmax.h +66 -0
  522. cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +139 -0
  523. cuda/cccl/headers/include/cuda/std/__algorithm/mismatch.h +83 -0
  524. cuda/cccl/headers/include/cuda/std/__algorithm/move.h +86 -0
  525. cuda/cccl/headers/include/cuda/std/__algorithm/move_backward.h +84 -0
  526. cuda/cccl/headers/include/cuda/std/__algorithm/next_permutation.h +88 -0
  527. cuda/cccl/headers/include/cuda/std/__algorithm/none_of.h +45 -0
  528. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort.h +102 -0
  529. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort_copy.h +122 -0
  530. cuda/cccl/headers/include/cuda/std/__algorithm/partition.h +120 -0
  531. cuda/cccl/headers/include/cuda/std/__algorithm/partition_copy.h +59 -0
  532. cuda/cccl/headers/include/cuda/std/__algorithm/partition_point.h +61 -0
  533. cuda/cccl/headers/include/cuda/std/__algorithm/pop_heap.h +93 -0
  534. cuda/cccl/headers/include/cuda/std/__algorithm/prev_permutation.h +88 -0
  535. cuda/cccl/headers/include/cuda/std/__algorithm/push_heap.h +100 -0
  536. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_for_each.h +84 -0
  537. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_for_each_n.h +68 -0
  538. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_iterator_concept.h +65 -0
  539. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_min.h +98 -0
  540. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_min_element.h +68 -0
  541. cuda/cccl/headers/include/cuda/std/__algorithm/remove.h +55 -0
  542. cuda/cccl/headers/include/cuda/std/__algorithm/remove_copy.h +47 -0
  543. cuda/cccl/headers/include/cuda/std/__algorithm/remove_copy_if.h +47 -0
  544. cuda/cccl/headers/include/cuda/std/__algorithm/remove_if.h +56 -0
  545. cuda/cccl/headers/include/cuda/std/__algorithm/replace.h +45 -0
  546. cuda/cccl/headers/include/cuda/std/__algorithm/replace_copy.h +54 -0
  547. cuda/cccl/headers/include/cuda/std/__algorithm/replace_copy_if.h +50 -0
  548. cuda/cccl/headers/include/cuda/std/__algorithm/replace_if.h +45 -0
  549. cuda/cccl/headers/include/cuda/std/__algorithm/reverse.h +81 -0
  550. cuda/cccl/headers/include/cuda/std/__algorithm/reverse_copy.h +43 -0
  551. cuda/cccl/headers/include/cuda/std/__algorithm/rotate.h +261 -0
  552. cuda/cccl/headers/include/cuda/std/__algorithm/rotate_copy.h +40 -0
  553. cuda/cccl/headers/include/cuda/std/__algorithm/search.h +185 -0
  554. cuda/cccl/headers/include/cuda/std/__algorithm/search_n.h +163 -0
  555. cuda/cccl/headers/include/cuda/std/__algorithm/set_difference.h +95 -0
  556. cuda/cccl/headers/include/cuda/std/__algorithm/set_intersection.h +122 -0
  557. cuda/cccl/headers/include/cuda/std/__algorithm/set_symmetric_difference.h +134 -0
  558. cuda/cccl/headers/include/cuda/std/__algorithm/set_union.h +128 -0
  559. cuda/cccl/headers/include/cuda/std/__algorithm/shift_left.h +84 -0
  560. cuda/cccl/headers/include/cuda/std/__algorithm/shift_right.h +144 -0
  561. cuda/cccl/headers/include/cuda/std/__algorithm/sift_down.h +139 -0
  562. cuda/cccl/headers/include/cuda/std/__algorithm/sort_heap.h +70 -0
  563. cuda/cccl/headers/include/cuda/std/__algorithm/swap_ranges.h +78 -0
  564. cuda/cccl/headers/include/cuda/std/__algorithm/transform.h +59 -0
  565. cuda/cccl/headers/include/cuda/std/__algorithm/unique.h +76 -0
  566. cuda/cccl/headers/include/cuda/std/__algorithm/unique_copy.h +155 -0
  567. cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_iter.h +95 -0
  568. cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_range.h +126 -0
  569. cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +83 -0
  570. cuda/cccl/headers/include/cuda/std/__algorithm_ +26 -0
  571. cuda/cccl/headers/include/cuda/std/__atomic/api/common.h +192 -0
  572. cuda/cccl/headers/include/cuda/std/__atomic/api/owned.h +136 -0
  573. cuda/cccl/headers/include/cuda/std/__atomic/api/reference.h +118 -0
  574. cuda/cccl/headers/include/cuda/std/__atomic/functions/common.h +58 -0
  575. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_local.h +208 -0
  576. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_derived.h +401 -0
  577. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated.h +3971 -0
  578. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h +177 -0
  579. cuda/cccl/headers/include/cuda/std/__atomic/functions/host.h +211 -0
  580. cuda/cccl/headers/include/cuda/std/__atomic/functions.h +33 -0
  581. cuda/cccl/headers/include/cuda/std/__atomic/order.h +159 -0
  582. cuda/cccl/headers/include/cuda/std/__atomic/platform/msvc_to_builtins.h +654 -0
  583. cuda/cccl/headers/include/cuda/std/__atomic/platform.h +93 -0
  584. cuda/cccl/headers/include/cuda/std/__atomic/scopes.h +105 -0
  585. cuda/cccl/headers/include/cuda/std/__atomic/types/base.h +249 -0
  586. cuda/cccl/headers/include/cuda/std/__atomic/types/common.h +104 -0
  587. cuda/cccl/headers/include/cuda/std/__atomic/types/locked.h +225 -0
  588. cuda/cccl/headers/include/cuda/std/__atomic/types/reference.h +72 -0
  589. cuda/cccl/headers/include/cuda/std/__atomic/types/small.h +228 -0
  590. cuda/cccl/headers/include/cuda/std/__atomic/types.h +52 -0
  591. cuda/cccl/headers/include/cuda/std/__atomic/wait/notify_wait.h +95 -0
  592. cuda/cccl/headers/include/cuda/std/__atomic/wait/polling.h +65 -0
  593. cuda/cccl/headers/include/cuda/std/__barrier/barrier.h +227 -0
  594. cuda/cccl/headers/include/cuda/std/__barrier/empty_completion.h +37 -0
  595. cuda/cccl/headers/include/cuda/std/__barrier/poll_tester.h +82 -0
  596. cuda/cccl/headers/include/cuda/std/__bit/bit_cast.h +76 -0
  597. cuda/cccl/headers/include/cuda/std/__bit/byteswap.h +185 -0
  598. cuda/cccl/headers/include/cuda/std/__bit/countl.h +174 -0
  599. cuda/cccl/headers/include/cuda/std/__bit/countr.h +185 -0
  600. cuda/cccl/headers/include/cuda/std/__bit/endian.h +39 -0
  601. cuda/cccl/headers/include/cuda/std/__bit/has_single_bit.h +43 -0
  602. cuda/cccl/headers/include/cuda/std/__bit/integral.h +126 -0
  603. cuda/cccl/headers/include/cuda/std/__bit/popcount.h +154 -0
  604. cuda/cccl/headers/include/cuda/std/__bit/reference.h +1272 -0
  605. cuda/cccl/headers/include/cuda/std/__bit/rotate.h +94 -0
  606. cuda/cccl/headers/include/cuda/std/__cccl/algorithm_wrapper.h +36 -0
  607. cuda/cccl/headers/include/cuda/std/__cccl/architecture.h +78 -0
  608. cuda/cccl/headers/include/cuda/std/__cccl/assert.h +161 -0
  609. cuda/cccl/headers/include/cuda/std/__cccl/attributes.h +206 -0
  610. cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +673 -0
  611. cuda/cccl/headers/include/cuda/std/__cccl/compiler.h +217 -0
  612. cuda/cccl/headers/include/cuda/std/__cccl/cuda_capabilities.h +51 -0
  613. cuda/cccl/headers/include/cuda/std/__cccl/cuda_toolkit.h +56 -0
  614. cuda/cccl/headers/include/cuda/std/__cccl/deprecated.h +88 -0
  615. cuda/cccl/headers/include/cuda/std/__cccl/diagnostic.h +131 -0
  616. cuda/cccl/headers/include/cuda/std/__cccl/dialect.h +123 -0
  617. cuda/cccl/headers/include/cuda/std/__cccl/epilogue.h +344 -0
  618. cuda/cccl/headers/include/cuda/std/__cccl/exceptions.h +91 -0
  619. cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +74 -0
  620. cuda/cccl/headers/include/cuda/std/__cccl/extended_data_types.h +160 -0
  621. cuda/cccl/headers/include/cuda/std/__cccl/host_std_lib.h +52 -0
  622. cuda/cccl/headers/include/cuda/std/__cccl/is_non_narrowing_convertible.h +73 -0
  623. cuda/cccl/headers/include/cuda/std/__cccl/memory_wrapper.h +36 -0
  624. cuda/cccl/headers/include/cuda/std/__cccl/numeric_wrapper.h +36 -0
  625. cuda/cccl/headers/include/cuda/std/__cccl/os.h +54 -0
  626. cuda/cccl/headers/include/cuda/std/__cccl/preprocessor.h +1286 -0
  627. cuda/cccl/headers/include/cuda/std/__cccl/prologue.h +281 -0
  628. cuda/cccl/headers/include/cuda/std/__cccl/ptx_isa.h +253 -0
  629. cuda/cccl/headers/include/cuda/std/__cccl/rtti.h +72 -0
  630. cuda/cccl/headers/include/cuda/std/__cccl/sequence_access.h +87 -0
  631. cuda/cccl/headers/include/cuda/std/__cccl/system_header.h +38 -0
  632. cuda/cccl/headers/include/cuda/std/__cccl/unreachable.h +31 -0
  633. cuda/cccl/headers/include/cuda/std/__cccl/version.h +26 -0
  634. cuda/cccl/headers/include/cuda/std/__cccl/visibility.h +171 -0
  635. cuda/cccl/headers/include/cuda/std/__charconv/chars_format.h +81 -0
  636. cuda/cccl/headers/include/cuda/std/__charconv/from_chars.h +154 -0
  637. cuda/cccl/headers/include/cuda/std/__charconv/from_chars_result.h +56 -0
  638. cuda/cccl/headers/include/cuda/std/__charconv/to_chars.h +148 -0
  639. cuda/cccl/headers/include/cuda/std/__charconv/to_chars_result.h +56 -0
  640. cuda/cccl/headers/include/cuda/std/__charconv_ +31 -0
  641. cuda/cccl/headers/include/cuda/std/__chrono/calendar.h +54 -0
  642. cuda/cccl/headers/include/cuda/std/__chrono/day.h +162 -0
  643. cuda/cccl/headers/include/cuda/std/__chrono/duration.h +503 -0
  644. cuda/cccl/headers/include/cuda/std/__chrono/file_clock.h +55 -0
  645. cuda/cccl/headers/include/cuda/std/__chrono/high_resolution_clock.h +46 -0
  646. cuda/cccl/headers/include/cuda/std/__chrono/month.h +187 -0
  647. cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +60 -0
  648. cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +80 -0
  649. cuda/cccl/headers/include/cuda/std/__chrono/time_point.h +259 -0
  650. cuda/cccl/headers/include/cuda/std/__chrono/year.h +186 -0
  651. cuda/cccl/headers/include/cuda/std/__cmath/abs.h +127 -0
  652. cuda/cccl/headers/include/cuda/std/__cmath/copysign.h +88 -0
  653. cuda/cccl/headers/include/cuda/std/__cmath/error_functions.h +200 -0
  654. cuda/cccl/headers/include/cuda/std/__cmath/exponential_functions.h +784 -0
  655. cuda/cccl/headers/include/cuda/std/__cmath/fdim.h +118 -0
  656. cuda/cccl/headers/include/cuda/std/__cmath/fma.h +125 -0
  657. cuda/cccl/headers/include/cuda/std/__cmath/fpclassify.h +231 -0
  658. cuda/cccl/headers/include/cuda/std/__cmath/gamma.h +205 -0
  659. cuda/cccl/headers/include/cuda/std/__cmath/hyperbolic_functions.h +286 -0
  660. cuda/cccl/headers/include/cuda/std/__cmath/hypot.h +221 -0
  661. cuda/cccl/headers/include/cuda/std/__cmath/inverse_hyperbolic_functions.h +286 -0
  662. cuda/cccl/headers/include/cuda/std/__cmath/inverse_trigonometric_functions.h +371 -0
  663. cuda/cccl/headers/include/cuda/std/__cmath/isfinite.h +167 -0
  664. cuda/cccl/headers/include/cuda/std/__cmath/isinf.h +205 -0
  665. cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +186 -0
  666. cuda/cccl/headers/include/cuda/std/__cmath/isnormal.h +138 -0
  667. cuda/cccl/headers/include/cuda/std/__cmath/lerp.h +101 -0
  668. cuda/cccl/headers/include/cuda/std/__cmath/logarithms.h +534 -0
  669. cuda/cccl/headers/include/cuda/std/__cmath/min_max.h +287 -0
  670. cuda/cccl/headers/include/cuda/std/__cmath/modulo.h +208 -0
  671. cuda/cccl/headers/include/cuda/std/__cmath/nan.h +54 -0
  672. cuda/cccl/headers/include/cuda/std/__cmath/remainder.h +206 -0
  673. cuda/cccl/headers/include/cuda/std/__cmath/roots.h +199 -0
  674. cuda/cccl/headers/include/cuda/std/__cmath/rounding_functions.h +984 -0
  675. cuda/cccl/headers/include/cuda/std/__cmath/signbit.h +56 -0
  676. cuda/cccl/headers/include/cuda/std/__cmath/traits.h +238 -0
  677. cuda/cccl/headers/include/cuda/std/__cmath/trigonometric_functions.h +328 -0
  678. cuda/cccl/headers/include/cuda/std/__complex/arg.h +84 -0
  679. cuda/cccl/headers/include/cuda/std/__complex/complex.h +669 -0
  680. cuda/cccl/headers/include/cuda/std/__complex/exponential_functions.h +411 -0
  681. cuda/cccl/headers/include/cuda/std/__complex/hyperbolic_functions.h +117 -0
  682. cuda/cccl/headers/include/cuda/std/__complex/inverse_hyperbolic_functions.h +216 -0
  683. cuda/cccl/headers/include/cuda/std/__complex/inverse_trigonometric_functions.h +131 -0
  684. cuda/cccl/headers/include/cuda/std/__complex/literals.h +86 -0
  685. cuda/cccl/headers/include/cuda/std/__complex/logarithms.h +303 -0
  686. cuda/cccl/headers/include/cuda/std/__complex/math.h +159 -0
  687. cuda/cccl/headers/include/cuda/std/__complex/nvbf16.h +323 -0
  688. cuda/cccl/headers/include/cuda/std/__complex/nvfp16.h +322 -0
  689. cuda/cccl/headers/include/cuda/std/__complex/roots.h +214 -0
  690. cuda/cccl/headers/include/cuda/std/__complex/trigonometric_functions.h +61 -0
  691. cuda/cccl/headers/include/cuda/std/__complex/tuple.h +107 -0
  692. cuda/cccl/headers/include/cuda/std/__complex/vector_support.h +130 -0
  693. cuda/cccl/headers/include/cuda/std/__concepts/arithmetic.h +56 -0
  694. cuda/cccl/headers/include/cuda/std/__concepts/assignable.h +64 -0
  695. cuda/cccl/headers/include/cuda/std/__concepts/boolean_testable.h +63 -0
  696. cuda/cccl/headers/include/cuda/std/__concepts/class_or_enum.h +45 -0
  697. cuda/cccl/headers/include/cuda/std/__concepts/common_reference_with.h +69 -0
  698. cuda/cccl/headers/include/cuda/std/__concepts/common_with.h +82 -0
  699. cuda/cccl/headers/include/cuda/std/__concepts/concept_macros.h +341 -0
  700. cuda/cccl/headers/include/cuda/std/__concepts/constructible.h +174 -0
  701. cuda/cccl/headers/include/cuda/std/__concepts/convertible_to.h +70 -0
  702. cuda/cccl/headers/include/cuda/std/__concepts/copyable.h +60 -0
  703. cuda/cccl/headers/include/cuda/std/__concepts/derived_from.h +56 -0
  704. cuda/cccl/headers/include/cuda/std/__concepts/destructible.h +76 -0
  705. cuda/cccl/headers/include/cuda/std/__concepts/different_from.h +38 -0
  706. cuda/cccl/headers/include/cuda/std/__concepts/equality_comparable.h +100 -0
  707. cuda/cccl/headers/include/cuda/std/__concepts/invocable.h +80 -0
  708. cuda/cccl/headers/include/cuda/std/__concepts/movable.h +58 -0
  709. cuda/cccl/headers/include/cuda/std/__concepts/predicate.h +54 -0
  710. cuda/cccl/headers/include/cuda/std/__concepts/regular.h +54 -0
  711. cuda/cccl/headers/include/cuda/std/__concepts/relation.h +77 -0
  712. cuda/cccl/headers/include/cuda/std/__concepts/same_as.h +39 -0
  713. cuda/cccl/headers/include/cuda/std/__concepts/semiregular.h +54 -0
  714. cuda/cccl/headers/include/cuda/std/__concepts/swappable.h +206 -0
  715. cuda/cccl/headers/include/cuda/std/__concepts/totally_ordered.h +101 -0
  716. cuda/cccl/headers/include/cuda/std/__cstddef/byte.h +113 -0
  717. cuda/cccl/headers/include/cuda/std/__cstddef/types.h +52 -0
  718. cuda/cccl/headers/include/cuda/std/__cstdlib/abs.h +57 -0
  719. cuda/cccl/headers/include/cuda/std/__cstdlib/aligned_alloc.h +66 -0
  720. cuda/cccl/headers/include/cuda/std/__cstdlib/div.h +96 -0
  721. cuda/cccl/headers/include/cuda/std/__cstdlib/malloc.h +70 -0
  722. cuda/cccl/headers/include/cuda/std/__cstring/memcpy.h +61 -0
  723. cuda/cccl/headers/include/cuda/std/__cstring/memset.h +46 -0
  724. cuda/cccl/headers/include/cuda/std/__cuda/api_wrapper.h +62 -0
  725. cuda/cccl/headers/include/cuda/std/__exception/cuda_error.h +139 -0
  726. cuda/cccl/headers/include/cuda/std/__exception/terminate.h +73 -0
  727. cuda/cccl/headers/include/cuda/std/__execution/env.h +455 -0
  728. cuda/cccl/headers/include/cuda/std/__execution/policy.h +88 -0
  729. cuda/cccl/headers/include/cuda/std/__expected/bad_expected_access.h +127 -0
  730. cuda/cccl/headers/include/cuda/std/__expected/expected.h +1941 -0
  731. cuda/cccl/headers/include/cuda/std/__expected/expected_base.h +1050 -0
  732. cuda/cccl/headers/include/cuda/std/__expected/unexpect.h +37 -0
  733. cuda/cccl/headers/include/cuda/std/__expected/unexpected.h +165 -0
  734. cuda/cccl/headers/include/cuda/std/__floating_point/arithmetic.h +56 -0
  735. cuda/cccl/headers/include/cuda/std/__floating_point/cast.h +812 -0
  736. cuda/cccl/headers/include/cuda/std/__floating_point/cccl_fp.h +125 -0
  737. cuda/cccl/headers/include/cuda/std/__floating_point/common_type.h +48 -0
  738. cuda/cccl/headers/include/cuda/std/__floating_point/constants.h +376 -0
  739. cuda/cccl/headers/include/cuda/std/__floating_point/conversion_rank_order.h +124 -0
  740. cuda/cccl/headers/include/cuda/std/__floating_point/cuda_fp_types.h +116 -0
  741. cuda/cccl/headers/include/cuda/std/__floating_point/decompose.h +69 -0
  742. cuda/cccl/headers/include/cuda/std/__floating_point/format.h +162 -0
  743. cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +40 -0
  744. cuda/cccl/headers/include/cuda/std/__floating_point/mask.h +78 -0
  745. cuda/cccl/headers/include/cuda/std/__floating_point/native_type.h +81 -0
  746. cuda/cccl/headers/include/cuda/std/__floating_point/overflow_handler.h +139 -0
  747. cuda/cccl/headers/include/cuda/std/__floating_point/properties.h +229 -0
  748. cuda/cccl/headers/include/cuda/std/__floating_point/storage.h +248 -0
  749. cuda/cccl/headers/include/cuda/std/__floating_point/traits.h +172 -0
  750. cuda/cccl/headers/include/cuda/std/__format/buffer.h +48 -0
  751. cuda/cccl/headers/include/cuda/std/__format/concepts.h +69 -0
  752. cuda/cccl/headers/include/cuda/std/__format/format_arg.h +282 -0
  753. cuda/cccl/headers/include/cuda/std/__format/format_arg_store.h +279 -0
  754. cuda/cccl/headers/include/cuda/std/__format/format_args.h +122 -0
  755. cuda/cccl/headers/include/cuda/std/__format/format_context.h +92 -0
  756. cuda/cccl/headers/include/cuda/std/__format/format_error.h +76 -0
  757. cuda/cccl/headers/include/cuda/std/__format/format_integral.h +237 -0
  758. cuda/cccl/headers/include/cuda/std/__format/format_parse_context.h +124 -0
  759. cuda/cccl/headers/include/cuda/std/__format/format_spec_parser.h +1230 -0
  760. cuda/cccl/headers/include/cuda/std/__format/formatter.h +59 -0
  761. cuda/cccl/headers/include/cuda/std/__format/formatters/bool.h +101 -0
  762. cuda/cccl/headers/include/cuda/std/__format/formatters/char.h +124 -0
  763. cuda/cccl/headers/include/cuda/std/__format/formatters/fp.h +101 -0
  764. cuda/cccl/headers/include/cuda/std/__format/formatters/int.h +174 -0
  765. cuda/cccl/headers/include/cuda/std/__format/formatters/ptr.h +104 -0
  766. cuda/cccl/headers/include/cuda/std/__format/formatters/str.h +178 -0
  767. cuda/cccl/headers/include/cuda/std/__format/output_utils.h +272 -0
  768. cuda/cccl/headers/include/cuda/std/__format/parse_arg_id.h +138 -0
  769. cuda/cccl/headers/include/cuda/std/__format_ +45 -0
  770. cuda/cccl/headers/include/cuda/std/__functional/binary_function.h +63 -0
  771. cuda/cccl/headers/include/cuda/std/__functional/binary_negate.h +65 -0
  772. cuda/cccl/headers/include/cuda/std/__functional/bind.h +334 -0
  773. cuda/cccl/headers/include/cuda/std/__functional/bind_back.h +80 -0
  774. cuda/cccl/headers/include/cuda/std/__functional/bind_front.h +73 -0
  775. cuda/cccl/headers/include/cuda/std/__functional/binder1st.h +74 -0
  776. cuda/cccl/headers/include/cuda/std/__functional/binder2nd.h +74 -0
  777. cuda/cccl/headers/include/cuda/std/__functional/compose.h +68 -0
  778. cuda/cccl/headers/include/cuda/std/__functional/default_searcher.h +75 -0
  779. cuda/cccl/headers/include/cuda/std/__functional/function.h +1275 -0
  780. cuda/cccl/headers/include/cuda/std/__functional/hash.h +649 -0
  781. cuda/cccl/headers/include/cuda/std/__functional/identity.h +57 -0
  782. cuda/cccl/headers/include/cuda/std/__functional/invoke.h +296 -0
  783. cuda/cccl/headers/include/cuda/std/__functional/is_transparent.h +41 -0
  784. cuda/cccl/headers/include/cuda/std/__functional/mem_fn.h +66 -0
  785. cuda/cccl/headers/include/cuda/std/__functional/mem_fun_ref.h +211 -0
  786. cuda/cccl/headers/include/cuda/std/__functional/not_fn.h +120 -0
  787. cuda/cccl/headers/include/cuda/std/__functional/operations.h +534 -0
  788. cuda/cccl/headers/include/cuda/std/__functional/perfect_forward.h +128 -0
  789. cuda/cccl/headers/include/cuda/std/__functional/pointer_to_binary_function.h +64 -0
  790. cuda/cccl/headers/include/cuda/std/__functional/pointer_to_unary_function.h +63 -0
  791. cuda/cccl/headers/include/cuda/std/__functional/ranges_operations.h +113 -0
  792. cuda/cccl/headers/include/cuda/std/__functional/reference_wrapper.h +113 -0
  793. cuda/cccl/headers/include/cuda/std/__functional/unary_function.h +62 -0
  794. cuda/cccl/headers/include/cuda/std/__functional/unary_negate.h +65 -0
  795. cuda/cccl/headers/include/cuda/std/__functional/unwrap_ref.h +56 -0
  796. cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +262 -0
  797. cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +53 -0
  798. cuda/cccl/headers/include/cuda/std/__fwd/array.h +42 -0
  799. cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +74 -0
  800. cuda/cccl/headers/include/cuda/std/__fwd/complex.h +75 -0
  801. cuda/cccl/headers/include/cuda/std/__fwd/expected.h +46 -0
  802. cuda/cccl/headers/include/cuda/std/__fwd/format.h +84 -0
  803. cuda/cccl/headers/include/cuda/std/__fwd/fp.h +37 -0
  804. cuda/cccl/headers/include/cuda/std/__fwd/get.h +123 -0
  805. cuda/cccl/headers/include/cuda/std/__fwd/hash.h +34 -0
  806. cuda/cccl/headers/include/cuda/std/__fwd/iterator.h +43 -0
  807. cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +122 -0
  808. cuda/cccl/headers/include/cuda/std/__fwd/memory_resource.h +37 -0
  809. cuda/cccl/headers/include/cuda/std/__fwd/optional.h +39 -0
  810. cuda/cccl/headers/include/cuda/std/__fwd/pair.h +47 -0
  811. cuda/cccl/headers/include/cuda/std/__fwd/reference_wrapper.h +34 -0
  812. cuda/cccl/headers/include/cuda/std/__fwd/span.h +45 -0
  813. cuda/cccl/headers/include/cuda/std/__fwd/string.h +112 -0
  814. cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +91 -0
  815. cuda/cccl/headers/include/cuda/std/__fwd/subrange.h +55 -0
  816. cuda/cccl/headers/include/cuda/std/__fwd/tuple.h +34 -0
  817. cuda/cccl/headers/include/cuda/std/__fwd/unexpected.h +40 -0
  818. cuda/cccl/headers/include/cuda/std/__internal/cpp_dialect.h +44 -0
  819. cuda/cccl/headers/include/cuda/std/__internal/features.h +72 -0
  820. cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +143 -0
  821. cuda/cccl/headers/include/cuda/std/__iterator/access.h +128 -0
  822. cuda/cccl/headers/include/cuda/std/__iterator/advance.h +228 -0
  823. cuda/cccl/headers/include/cuda/std/__iterator/back_insert_iterator.h +163 -0
  824. cuda/cccl/headers/include/cuda/std/__iterator/bounded_iter.h +253 -0
  825. cuda/cccl/headers/include/cuda/std/__iterator/concepts.h +645 -0
  826. cuda/cccl/headers/include/cuda/std/__iterator/counted_iterator.h +464 -0
  827. cuda/cccl/headers/include/cuda/std/__iterator/data.h +61 -0
  828. cuda/cccl/headers/include/cuda/std/__iterator/default_sentinel.h +36 -0
  829. cuda/cccl/headers/include/cuda/std/__iterator/distance.h +126 -0
  830. cuda/cccl/headers/include/cuda/std/__iterator/empty.h +53 -0
  831. cuda/cccl/headers/include/cuda/std/__iterator/erase_if_container.h +53 -0
  832. cuda/cccl/headers/include/cuda/std/__iterator/front_insert_iterator.h +99 -0
  833. cuda/cccl/headers/include/cuda/std/__iterator/incrementable_traits.h +143 -0
  834. cuda/cccl/headers/include/cuda/std/__iterator/indirectly_comparable.h +55 -0
  835. cuda/cccl/headers/include/cuda/std/__iterator/insert_iterator.h +107 -0
  836. cuda/cccl/headers/include/cuda/std/__iterator/istream_iterator.h +146 -0
  837. cuda/cccl/headers/include/cuda/std/__iterator/istreambuf_iterator.h +161 -0
  838. cuda/cccl/headers/include/cuda/std/__iterator/iter_move.h +161 -0
  839. cuda/cccl/headers/include/cuda/std/__iterator/iter_swap.h +163 -0
  840. cuda/cccl/headers/include/cuda/std/__iterator/iterator.h +44 -0
  841. cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +847 -0
  842. cuda/cccl/headers/include/cuda/std/__iterator/mergeable.h +72 -0
  843. cuda/cccl/headers/include/cuda/std/__iterator/move_iterator.h +432 -0
  844. cuda/cccl/headers/include/cuda/std/__iterator/move_sentinel.h +73 -0
  845. cuda/cccl/headers/include/cuda/std/__iterator/next.h +101 -0
  846. cuda/cccl/headers/include/cuda/std/__iterator/ostream_iterator.h +95 -0
  847. cuda/cccl/headers/include/cuda/std/__iterator/ostreambuf_iterator.h +100 -0
  848. cuda/cccl/headers/include/cuda/std/__iterator/permutable.h +54 -0
  849. cuda/cccl/headers/include/cuda/std/__iterator/prev.h +90 -0
  850. cuda/cccl/headers/include/cuda/std/__iterator/projected.h +61 -0
  851. cuda/cccl/headers/include/cuda/std/__iterator/readable_traits.h +156 -0
  852. cuda/cccl/headers/include/cuda/std/__iterator/reverse_access.h +142 -0
  853. cuda/cccl/headers/include/cuda/std/__iterator/reverse_iterator.h +371 -0
  854. cuda/cccl/headers/include/cuda/std/__iterator/size.h +69 -0
  855. cuda/cccl/headers/include/cuda/std/__iterator/sortable.h +55 -0
  856. cuda/cccl/headers/include/cuda/std/__iterator/unreachable_sentinel.h +84 -0
  857. cuda/cccl/headers/include/cuda/std/__iterator/wrap_iter.h +245 -0
  858. cuda/cccl/headers/include/cuda/std/__latch/latch.h +88 -0
  859. cuda/cccl/headers/include/cuda/std/__limits/numeric_limits.h +617 -0
  860. cuda/cccl/headers/include/cuda/std/__limits/numeric_limits_ext.h +753 -0
  861. cuda/cccl/headers/include/cuda/std/__linalg/conj_if_needed.h +78 -0
  862. cuda/cccl/headers/include/cuda/std/__linalg/conjugate_transposed.h +54 -0
  863. cuda/cccl/headers/include/cuda/std/__linalg/conjugated.h +139 -0
  864. cuda/cccl/headers/include/cuda/std/__linalg/scaled.h +132 -0
  865. cuda/cccl/headers/include/cuda/std/__linalg/transposed.h +321 -0
  866. cuda/cccl/headers/include/cuda/std/__mdspan/aligned_accessor.h +97 -0
  867. cuda/cccl/headers/include/cuda/std/__mdspan/concepts.h +139 -0
  868. cuda/cccl/headers/include/cuda/std/__mdspan/default_accessor.h +73 -0
  869. cuda/cccl/headers/include/cuda/std/__mdspan/empty_base.h +352 -0
  870. cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +759 -0
  871. cuda/cccl/headers/include/cuda/std/__mdspan/layout_left.h +314 -0
  872. cuda/cccl/headers/include/cuda/std/__mdspan/layout_right.h +307 -0
  873. cuda/cccl/headers/include/cuda/std/__mdspan/layout_stride.h +605 -0
  874. cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +512 -0
  875. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_extents.h +193 -0
  876. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_helper.h +189 -0
  877. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_mapping.h +344 -0
  878. cuda/cccl/headers/include/cuda/std/__memory/addressof.h +67 -0
  879. cuda/cccl/headers/include/cuda/std/__memory/align.h +67 -0
  880. cuda/cccl/headers/include/cuda/std/__memory/allocate_at_least.h +81 -0
  881. cuda/cccl/headers/include/cuda/std/__memory/allocation_guard.h +100 -0
  882. cuda/cccl/headers/include/cuda/std/__memory/allocator.h +320 -0
  883. cuda/cccl/headers/include/cuda/std/__memory/allocator_arg_t.h +84 -0
  884. cuda/cccl/headers/include/cuda/std/__memory/allocator_destructor.h +59 -0
  885. cuda/cccl/headers/include/cuda/std/__memory/allocator_traits.h +525 -0
  886. cuda/cccl/headers/include/cuda/std/__memory/assume_aligned.h +60 -0
  887. cuda/cccl/headers/include/cuda/std/__memory/builtin_new_allocator.h +87 -0
  888. cuda/cccl/headers/include/cuda/std/__memory/compressed_pair.h +225 -0
  889. cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +246 -0
  890. cuda/cccl/headers/include/cuda/std/__memory/destruct_n.h +91 -0
  891. cuda/cccl/headers/include/cuda/std/__memory/is_sufficiently_aligned.h +46 -0
  892. cuda/cccl/headers/include/cuda/std/__memory/pointer_traits.h +246 -0
  893. cuda/cccl/headers/include/cuda/std/__memory/runtime_assume_aligned.h +62 -0
  894. cuda/cccl/headers/include/cuda/std/__memory/temporary_buffer.h +92 -0
  895. cuda/cccl/headers/include/cuda/std/__memory/uninitialized_algorithms.h +678 -0
  896. cuda/cccl/headers/include/cuda/std/__memory/unique_ptr.h +765 -0
  897. cuda/cccl/headers/include/cuda/std/__memory/uses_allocator.h +54 -0
  898. cuda/cccl/headers/include/cuda/std/__memory/voidify.h +41 -0
  899. cuda/cccl/headers/include/cuda/std/__memory_ +34 -0
  900. cuda/cccl/headers/include/cuda/std/__new/allocate.h +126 -0
  901. cuda/cccl/headers/include/cuda/std/__new/bad_alloc.h +57 -0
  902. cuda/cccl/headers/include/cuda/std/__new/launder.h +53 -0
  903. cuda/cccl/headers/include/cuda/std/__new_ +29 -0
  904. cuda/cccl/headers/include/cuda/std/__numeric/accumulate.h +56 -0
  905. cuda/cccl/headers/include/cuda/std/__numeric/adjacent_difference.h +72 -0
  906. cuda/cccl/headers/include/cuda/std/__numeric/exclusive_scan.h +66 -0
  907. cuda/cccl/headers/include/cuda/std/__numeric/gcd_lcm.h +78 -0
  908. cuda/cccl/headers/include/cuda/std/__numeric/inclusive_scan.h +73 -0
  909. cuda/cccl/headers/include/cuda/std/__numeric/inner_product.h +62 -0
  910. cuda/cccl/headers/include/cuda/std/__numeric/iota.h +42 -0
  911. cuda/cccl/headers/include/cuda/std/__numeric/midpoint.h +97 -0
  912. cuda/cccl/headers/include/cuda/std/__numeric/partial_sum.h +69 -0
  913. cuda/cccl/headers/include/cuda/std/__numeric/reduce.h +60 -0
  914. cuda/cccl/headers/include/cuda/std/__numeric/transform_exclusive_scan.h +51 -0
  915. cuda/cccl/headers/include/cuda/std/__numeric/transform_inclusive_scan.h +65 -0
  916. cuda/cccl/headers/include/cuda/std/__numeric/transform_reduce.h +72 -0
  917. cuda/cccl/headers/include/cuda/std/__optional/bad_optional_access.h +74 -0
  918. cuda/cccl/headers/include/cuda/std/__optional/hash.h +53 -0
  919. cuda/cccl/headers/include/cuda/std/__optional/make_optional.h +61 -0
  920. cuda/cccl/headers/include/cuda/std/__optional/nullopt.h +43 -0
  921. cuda/cccl/headers/include/cuda/std/__optional/optional.h +859 -0
  922. cuda/cccl/headers/include/cuda/std/__optional/optional_base.h +433 -0
  923. cuda/cccl/headers/include/cuda/std/__optional/optional_ref.h +324 -0
  924. cuda/cccl/headers/include/cuda/std/__random/generate_canonical.h +56 -0
  925. cuda/cccl/headers/include/cuda/std/__random/is_seed_sequence.h +39 -0
  926. cuda/cccl/headers/include/cuda/std/__random/is_valid.h +106 -0
  927. cuda/cccl/headers/include/cuda/std/__random/linear_congruential_engine.h +398 -0
  928. cuda/cccl/headers/include/cuda/std/__random/uniform_int_distribution.h +335 -0
  929. cuda/cccl/headers/include/cuda/std/__random/uniform_real_distribution.h +183 -0
  930. cuda/cccl/headers/include/cuda/std/__random_ +29 -0
  931. cuda/cccl/headers/include/cuda/std/__ranges/access.h +303 -0
  932. cuda/cccl/headers/include/cuda/std/__ranges/all.h +98 -0
  933. cuda/cccl/headers/include/cuda/std/__ranges/compressed_movable_box.h +892 -0
  934. cuda/cccl/headers/include/cuda/std/__ranges/concepts.h +302 -0
  935. cuda/cccl/headers/include/cuda/std/__ranges/counted.h +90 -0
  936. cuda/cccl/headers/include/cuda/std/__ranges/dangling.h +54 -0
  937. cuda/cccl/headers/include/cuda/std/__ranges/data.h +136 -0
  938. cuda/cccl/headers/include/cuda/std/__ranges/empty.h +109 -0
  939. cuda/cccl/headers/include/cuda/std/__ranges/empty_view.h +77 -0
  940. cuda/cccl/headers/include/cuda/std/__ranges/enable_borrowed_range.h +41 -0
  941. cuda/cccl/headers/include/cuda/std/__ranges/enable_view.h +78 -0
  942. cuda/cccl/headers/include/cuda/std/__ranges/from_range.h +36 -0
  943. cuda/cccl/headers/include/cuda/std/__ranges/iota_view.h +266 -0
  944. cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +410 -0
  945. cuda/cccl/headers/include/cuda/std/__ranges/owning_view.h +162 -0
  946. cuda/cccl/headers/include/cuda/std/__ranges/range_adaptor.h +110 -0
  947. cuda/cccl/headers/include/cuda/std/__ranges/rbegin.h +175 -0
  948. cuda/cccl/headers/include/cuda/std/__ranges/ref_view.h +121 -0
  949. cuda/cccl/headers/include/cuda/std/__ranges/rend.h +182 -0
  950. cuda/cccl/headers/include/cuda/std/__ranges/repeat_view.h +345 -0
  951. cuda/cccl/headers/include/cuda/std/__ranges/single_view.h +155 -0
  952. cuda/cccl/headers/include/cuda/std/__ranges/size.h +201 -0
  953. cuda/cccl/headers/include/cuda/std/__ranges/subrange.h +513 -0
  954. cuda/cccl/headers/include/cuda/std/__ranges/take_view.h +476 -0
  955. cuda/cccl/headers/include/cuda/std/__ranges/take_while_view.h +259 -0
  956. cuda/cccl/headers/include/cuda/std/__ranges/transform_view.h +522 -0
  957. cuda/cccl/headers/include/cuda/std/__ranges/unwrap_end.h +53 -0
  958. cuda/cccl/headers/include/cuda/std/__ranges/view_interface.h +183 -0
  959. cuda/cccl/headers/include/cuda/std/__ranges/views.h +38 -0
  960. cuda/cccl/headers/include/cuda/std/__semaphore/atomic_semaphore.h +234 -0
  961. cuda/cccl/headers/include/cuda/std/__semaphore/counting_semaphore.h +51 -0
  962. cuda/cccl/headers/include/cuda/std/__string/char_traits.h +191 -0
  963. cuda/cccl/headers/include/cuda/std/__string/constexpr_c_functions.h +581 -0
  964. cuda/cccl/headers/include/cuda/std/__string/helper_functions.h +296 -0
  965. cuda/cccl/headers/include/cuda/std/__string/string_view.h +244 -0
  966. cuda/cccl/headers/include/cuda/std/__string_ +29 -0
  967. cuda/cccl/headers/include/cuda/std/__system_error/errc.h +51 -0
  968. cuda/cccl/headers/include/cuda/std/__system_error_ +26 -0
  969. cuda/cccl/headers/include/cuda/std/__thread/threading_support.h +106 -0
  970. cuda/cccl/headers/include/cuda/std/__thread/threading_support_cuda.h +47 -0
  971. cuda/cccl/headers/include/cuda/std/__thread/threading_support_external.h +41 -0
  972. cuda/cccl/headers/include/cuda/std/__thread/threading_support_pthread.h +143 -0
  973. cuda/cccl/headers/include/cuda/std/__thread/threading_support_win32.h +87 -0
  974. cuda/cccl/headers/include/cuda/std/__tuple_dir/ignore.h +51 -0
  975. cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +120 -0
  976. cuda/cccl/headers/include/cuda/std/__tuple_dir/sfinae_helpers.h +260 -0
  977. cuda/cccl/headers/include/cuda/std/__tuple_dir/structured_bindings.h +212 -0
  978. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_element.h +70 -0
  979. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_indices.h +44 -0
  980. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +84 -0
  981. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +68 -0
  982. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_size.h +79 -0
  983. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_types.h +35 -0
  984. cuda/cccl/headers/include/cuda/std/__tuple_dir/vector_types.h +290 -0
  985. cuda/cccl/headers/include/cuda/std/__type_traits/add_const.h +40 -0
  986. cuda/cccl/headers/include/cuda/std/__type_traits/add_cv.h +40 -0
  987. cuda/cccl/headers/include/cuda/std/__type_traits/add_lvalue_reference.h +62 -0
  988. cuda/cccl/headers/include/cuda/std/__type_traits/add_pointer.h +65 -0
  989. cuda/cccl/headers/include/cuda/std/__type_traits/add_rvalue_reference.h +62 -0
  990. cuda/cccl/headers/include/cuda/std/__type_traits/add_volatile.h +40 -0
  991. cuda/cccl/headers/include/cuda/std/__type_traits/aligned_storage.h +149 -0
  992. cuda/cccl/headers/include/cuda/std/__type_traits/aligned_union.h +62 -0
  993. cuda/cccl/headers/include/cuda/std/__type_traits/alignment_of.h +41 -0
  994. cuda/cccl/headers/include/cuda/std/__type_traits/always_false.h +35 -0
  995. cuda/cccl/headers/include/cuda/std/__type_traits/can_extract_key.h +68 -0
  996. cuda/cccl/headers/include/cuda/std/__type_traits/common_reference.h +262 -0
  997. cuda/cccl/headers/include/cuda/std/__type_traits/common_type.h +173 -0
  998. cuda/cccl/headers/include/cuda/std/__type_traits/conditional.h +65 -0
  999. cuda/cccl/headers/include/cuda/std/__type_traits/conjunction.h +67 -0
  1000. cuda/cccl/headers/include/cuda/std/__type_traits/copy_cv.h +50 -0
  1001. cuda/cccl/headers/include/cuda/std/__type_traits/copy_cvref.h +148 -0
  1002. cuda/cccl/headers/include/cuda/std/__type_traits/decay.h +83 -0
  1003. cuda/cccl/headers/include/cuda/std/__type_traits/dependent_type.h +35 -0
  1004. cuda/cccl/headers/include/cuda/std/__type_traits/disjunction.h +77 -0
  1005. cuda/cccl/headers/include/cuda/std/__type_traits/enable_if.h +43 -0
  1006. cuda/cccl/headers/include/cuda/std/__type_traits/extent.h +68 -0
  1007. cuda/cccl/headers/include/cuda/std/__type_traits/fold.h +47 -0
  1008. cuda/cccl/headers/include/cuda/std/__type_traits/has_unique_object_representation.h +46 -0
  1009. cuda/cccl/headers/include/cuda/std/__type_traits/has_virtual_destructor.h +42 -0
  1010. cuda/cccl/headers/include/cuda/std/__type_traits/integral_constant.h +62 -0
  1011. cuda/cccl/headers/include/cuda/std/__type_traits/is_abstract.h +42 -0
  1012. cuda/cccl/headers/include/cuda/std/__type_traits/is_aggregate.h +42 -0
  1013. cuda/cccl/headers/include/cuda/std/__type_traits/is_allocator.h +46 -0
  1014. cuda/cccl/headers/include/cuda/std/__type_traits/is_arithmetic.h +42 -0
  1015. cuda/cccl/headers/include/cuda/std/__type_traits/is_array.h +62 -0
  1016. cuda/cccl/headers/include/cuda/std/__type_traits/is_assignable.h +78 -0
  1017. cuda/cccl/headers/include/cuda/std/__type_traits/is_base_of.h +42 -0
  1018. cuda/cccl/headers/include/cuda/std/__type_traits/is_bounded_array.h +44 -0
  1019. cuda/cccl/headers/include/cuda/std/__type_traits/is_callable.h +60 -0
  1020. cuda/cccl/headers/include/cuda/std/__type_traits/is_char_like_type.h +38 -0
  1021. cuda/cccl/headers/include/cuda/std/__type_traits/is_class.h +42 -0
  1022. cuda/cccl/headers/include/cuda/std/__type_traits/is_compound.h +58 -0
  1023. cuda/cccl/headers/include/cuda/std/__type_traits/is_const.h +56 -0
  1024. cuda/cccl/headers/include/cuda/std/__type_traits/is_constant_evaluated.h +51 -0
  1025. cuda/cccl/headers/include/cuda/std/__type_traits/is_constructible.h +174 -0
  1026. cuda/cccl/headers/include/cuda/std/__type_traits/is_convertible.h +211 -0
  1027. cuda/cccl/headers/include/cuda/std/__type_traits/is_copy_assignable.h +43 -0
  1028. cuda/cccl/headers/include/cuda/std/__type_traits/is_copy_constructible.h +43 -0
  1029. cuda/cccl/headers/include/cuda/std/__type_traits/is_core_convertible.h +47 -0
  1030. cuda/cccl/headers/include/cuda/std/__type_traits/is_corresponding_member.h +42 -0
  1031. cuda/cccl/headers/include/cuda/std/__type_traits/is_default_constructible.h +40 -0
  1032. cuda/cccl/headers/include/cuda/std/__type_traits/is_destructible.h +115 -0
  1033. cuda/cccl/headers/include/cuda/std/__type_traits/is_empty.h +42 -0
  1034. cuda/cccl/headers/include/cuda/std/__type_traits/is_enum.h +42 -0
  1035. cuda/cccl/headers/include/cuda/std/__type_traits/is_execution_policy.h +81 -0
  1036. cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_arithmetic.h +38 -0
  1037. cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_floating_point.h +79 -0
  1038. cuda/cccl/headers/include/cuda/std/__type_traits/is_final.h +42 -0
  1039. cuda/cccl/headers/include/cuda/std/__type_traits/is_floating_point.h +53 -0
  1040. cuda/cccl/headers/include/cuda/std/__type_traits/is_function.h +61 -0
  1041. cuda/cccl/headers/include/cuda/std/__type_traits/is_fundamental.h +56 -0
  1042. cuda/cccl/headers/include/cuda/std/__type_traits/is_implicitly_default_constructible.h +57 -0
  1043. cuda/cccl/headers/include/cuda/std/__type_traits/is_integer.h +45 -0
  1044. cuda/cccl/headers/include/cuda/std/__type_traits/is_integral.h +123 -0
  1045. cuda/cccl/headers/include/cuda/std/__type_traits/is_layout_compatible.h +45 -0
  1046. cuda/cccl/headers/include/cuda/std/__type_traits/is_literal_type.h +42 -0
  1047. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_function_pointer.h +79 -0
  1048. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_object_pointer.h +57 -0
  1049. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_pointer.h +57 -0
  1050. cuda/cccl/headers/include/cuda/std/__type_traits/is_move_assignable.h +43 -0
  1051. cuda/cccl/headers/include/cuda/std/__type_traits/is_move_constructible.h +42 -0
  1052. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_assignable.h +70 -0
  1053. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_constructible.h +84 -0
  1054. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_convertible.h +59 -0
  1055. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_copy_assignable.h +60 -0
  1056. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_copy_constructible.h +43 -0
  1057. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_default_constructible.h +54 -0
  1058. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_destructible.h +82 -0
  1059. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_move_assignable.h +60 -0
  1060. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_move_constructible.h +42 -0
  1061. cuda/cccl/headers/include/cuda/std/__type_traits/is_null_pointer.h +43 -0
  1062. cuda/cccl/headers/include/cuda/std/__type_traits/is_object.h +57 -0
  1063. cuda/cccl/headers/include/cuda/std/__type_traits/is_one_of.h +37 -0
  1064. cuda/cccl/headers/include/cuda/std/__type_traits/is_pod.h +42 -0
  1065. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer.h +60 -0
  1066. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer_interconvertible_base_of.h +84 -0
  1067. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer_interconvertible_with_class.h +42 -0
  1068. cuda/cccl/headers/include/cuda/std/__type_traits/is_polymorphic.h +42 -0
  1069. cuda/cccl/headers/include/cuda/std/__type_traits/is_primary_template.h +121 -0
  1070. cuda/cccl/headers/include/cuda/std/__type_traits/is_reference.h +95 -0
  1071. cuda/cccl/headers/include/cuda/std/__type_traits/is_reference_wrapper.h +50 -0
  1072. cuda/cccl/headers/include/cuda/std/__type_traits/is_referenceable.h +55 -0
  1073. cuda/cccl/headers/include/cuda/std/__type_traits/is_same.h +88 -0
  1074. cuda/cccl/headers/include/cuda/std/__type_traits/is_scalar.h +60 -0
  1075. cuda/cccl/headers/include/cuda/std/__type_traits/is_scoped_enum.h +49 -0
  1076. cuda/cccl/headers/include/cuda/std/__type_traits/is_signed.h +65 -0
  1077. cuda/cccl/headers/include/cuda/std/__type_traits/is_signed_integer.h +59 -0
  1078. cuda/cccl/headers/include/cuda/std/__type_traits/is_standard_layout.h +42 -0
  1079. cuda/cccl/headers/include/cuda/std/__type_traits/is_swappable.h +202 -0
  1080. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivial.h +42 -0
  1081. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_assignable.h +43 -0
  1082. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_constructible.h +43 -0
  1083. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_assignable.h +46 -0
  1084. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_constructible.h +45 -0
  1085. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copyable.h +42 -0
  1086. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_default_constructible.h +42 -0
  1087. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_destructible.h +58 -0
  1088. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_assignable.h +45 -0
  1089. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_constructible.h +44 -0
  1090. cuda/cccl/headers/include/cuda/std/__type_traits/is_unbounded_array.h +43 -0
  1091. cuda/cccl/headers/include/cuda/std/__type_traits/is_union.h +42 -0
  1092. cuda/cccl/headers/include/cuda/std/__type_traits/is_unsigned.h +66 -0
  1093. cuda/cccl/headers/include/cuda/std/__type_traits/is_unsigned_integer.h +59 -0
  1094. cuda/cccl/headers/include/cuda/std/__type_traits/is_valid_expansion.h +41 -0
  1095. cuda/cccl/headers/include/cuda/std/__type_traits/is_void.h +55 -0
  1096. cuda/cccl/headers/include/cuda/std/__type_traits/is_volatile.h +56 -0
  1097. cuda/cccl/headers/include/cuda/std/__type_traits/lazy.h +35 -0
  1098. cuda/cccl/headers/include/cuda/std/__type_traits/make_const_lvalue_ref.h +36 -0
  1099. cuda/cccl/headers/include/cuda/std/__type_traits/make_nbit_int.h +107 -0
  1100. cuda/cccl/headers/include/cuda/std/__type_traits/make_signed.h +140 -0
  1101. cuda/cccl/headers/include/cuda/std/__type_traits/make_unsigned.h +151 -0
  1102. cuda/cccl/headers/include/cuda/std/__type_traits/maybe_const.h +36 -0
  1103. cuda/cccl/headers/include/cuda/std/__type_traits/nat.h +39 -0
  1104. cuda/cccl/headers/include/cuda/std/__type_traits/negation.h +44 -0
  1105. cuda/cccl/headers/include/cuda/std/__type_traits/num_bits.h +122 -0
  1106. cuda/cccl/headers/include/cuda/std/__type_traits/promote.h +163 -0
  1107. cuda/cccl/headers/include/cuda/std/__type_traits/rank.h +60 -0
  1108. cuda/cccl/headers/include/cuda/std/__type_traits/reference_constructs_from_temporary.h +57 -0
  1109. cuda/cccl/headers/include/cuda/std/__type_traits/reference_converts_from_temporary.h +56 -0
  1110. cuda/cccl/headers/include/cuda/std/__type_traits/remove_all_extents.h +66 -0
  1111. cuda/cccl/headers/include/cuda/std/__type_traits/remove_const.h +59 -0
  1112. cuda/cccl/headers/include/cuda/std/__type_traits/remove_const_ref.h +37 -0
  1113. cuda/cccl/headers/include/cuda/std/__type_traits/remove_cv.h +57 -0
  1114. cuda/cccl/headers/include/cuda/std/__type_traits/remove_cvref.h +57 -0
  1115. cuda/cccl/headers/include/cuda/std/__type_traits/remove_extent.h +65 -0
  1116. cuda/cccl/headers/include/cuda/std/__type_traits/remove_pointer.h +73 -0
  1117. cuda/cccl/headers/include/cuda/std/__type_traits/remove_reference.h +72 -0
  1118. cuda/cccl/headers/include/cuda/std/__type_traits/remove_volatile.h +58 -0
  1119. cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +47 -0
  1120. cuda/cccl/headers/include/cuda/std/__type_traits/type_identity.h +40 -0
  1121. cuda/cccl/headers/include/cuda/std/__type_traits/type_list.h +1067 -0
  1122. cuda/cccl/headers/include/cuda/std/__type_traits/type_set.h +131 -0
  1123. cuda/cccl/headers/include/cuda/std/__type_traits/underlying_type.h +52 -0
  1124. cuda/cccl/headers/include/cuda/std/__type_traits/void_t.h +34 -0
  1125. cuda/cccl/headers/include/cuda/std/__utility/as_const.h +52 -0
  1126. cuda/cccl/headers/include/cuda/std/__utility/auto_cast.h +34 -0
  1127. cuda/cccl/headers/include/cuda/std/__utility/cmp.h +116 -0
  1128. cuda/cccl/headers/include/cuda/std/__utility/convert_to_integral.h +101 -0
  1129. cuda/cccl/headers/include/cuda/std/__utility/declval.h +76 -0
  1130. cuda/cccl/headers/include/cuda/std/__utility/exception_guard.h +161 -0
  1131. cuda/cccl/headers/include/cuda/std/__utility/exchange.h +46 -0
  1132. cuda/cccl/headers/include/cuda/std/__utility/forward.h +59 -0
  1133. cuda/cccl/headers/include/cuda/std/__utility/forward_like.h +55 -0
  1134. cuda/cccl/headers/include/cuda/std/__utility/in_place.h +86 -0
  1135. cuda/cccl/headers/include/cuda/std/__utility/integer_sequence.h +251 -0
  1136. cuda/cccl/headers/include/cuda/std/__utility/monostate.h +99 -0
  1137. cuda/cccl/headers/include/cuda/std/__utility/move.h +74 -0
  1138. cuda/cccl/headers/include/cuda/std/__utility/pair.h +791 -0
  1139. cuda/cccl/headers/include/cuda/std/__utility/piecewise_construct.h +37 -0
  1140. cuda/cccl/headers/include/cuda/std/__utility/pod_tuple.h +527 -0
  1141. cuda/cccl/headers/include/cuda/std/__utility/priority_tag.h +40 -0
  1142. cuda/cccl/headers/include/cuda/std/__utility/rel_ops.h +63 -0
  1143. cuda/cccl/headers/include/cuda/std/__utility/swap.h +64 -0
  1144. cuda/cccl/headers/include/cuda/std/__utility/to_underlying.h +40 -0
  1145. cuda/cccl/headers/include/cuda/std/__utility/typeid.h +421 -0
  1146. cuda/cccl/headers/include/cuda/std/__utility/undefined.h +34 -0
  1147. cuda/cccl/headers/include/cuda/std/__utility/unreachable.h +37 -0
  1148. cuda/cccl/headers/include/cuda/std/array +518 -0
  1149. cuda/cccl/headers/include/cuda/std/atomic +810 -0
  1150. cuda/cccl/headers/include/cuda/std/barrier +42 -0
  1151. cuda/cccl/headers/include/cuda/std/bit +35 -0
  1152. cuda/cccl/headers/include/cuda/std/bitset +994 -0
  1153. cuda/cccl/headers/include/cuda/std/cassert +28 -0
  1154. cuda/cccl/headers/include/cuda/std/ccomplex +15 -0
  1155. cuda/cccl/headers/include/cuda/std/cfloat +59 -0
  1156. cuda/cccl/headers/include/cuda/std/chrono +26 -0
  1157. cuda/cccl/headers/include/cuda/std/climits +61 -0
  1158. cuda/cccl/headers/include/cuda/std/cmath +87 -0
  1159. cuda/cccl/headers/include/cuda/std/complex +50 -0
  1160. cuda/cccl/headers/include/cuda/std/concepts +48 -0
  1161. cuda/cccl/headers/include/cuda/std/cstddef +28 -0
  1162. cuda/cccl/headers/include/cuda/std/cstdint +178 -0
  1163. cuda/cccl/headers/include/cuda/std/cstdlib +30 -0
  1164. cuda/cccl/headers/include/cuda/std/cstring +110 -0
  1165. cuda/cccl/headers/include/cuda/std/ctime +154 -0
  1166. cuda/cccl/headers/include/cuda/std/detail/__config +45 -0
  1167. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +207 -0
  1168. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/algorithm +1721 -0
  1169. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/chrono +2509 -0
  1170. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/iosfwd +128 -0
  1171. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/stdexcept +120 -0
  1172. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/tuple +1365 -0
  1173. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/variant +2144 -0
  1174. cuda/cccl/headers/include/cuda/std/execution +29 -0
  1175. cuda/cccl/headers/include/cuda/std/expected +30 -0
  1176. cuda/cccl/headers/include/cuda/std/functional +56 -0
  1177. cuda/cccl/headers/include/cuda/std/initializer_list +44 -0
  1178. cuda/cccl/headers/include/cuda/std/inplace_vector +2170 -0
  1179. cuda/cccl/headers/include/cuda/std/iterator +70 -0
  1180. cuda/cccl/headers/include/cuda/std/latch +34 -0
  1181. cuda/cccl/headers/include/cuda/std/limits +28 -0
  1182. cuda/cccl/headers/include/cuda/std/linalg +30 -0
  1183. cuda/cccl/headers/include/cuda/std/mdspan +38 -0
  1184. cuda/cccl/headers/include/cuda/std/memory +39 -0
  1185. cuda/cccl/headers/include/cuda/std/numbers +346 -0
  1186. cuda/cccl/headers/include/cuda/std/numeric +41 -0
  1187. cuda/cccl/headers/include/cuda/std/optional +31 -0
  1188. cuda/cccl/headers/include/cuda/std/ranges +69 -0
  1189. cuda/cccl/headers/include/cuda/std/ratio +416 -0
  1190. cuda/cccl/headers/include/cuda/std/semaphore +31 -0
  1191. cuda/cccl/headers/include/cuda/std/source_location +83 -0
  1192. cuda/cccl/headers/include/cuda/std/span +628 -0
  1193. cuda/cccl/headers/include/cuda/std/string_view +925 -0
  1194. cuda/cccl/headers/include/cuda/std/tuple +26 -0
  1195. cuda/cccl/headers/include/cuda/std/type_traits +177 -0
  1196. cuda/cccl/headers/include/cuda/std/utility +70 -0
  1197. cuda/cccl/headers/include/cuda/std/variant +25 -0
  1198. cuda/cccl/headers/include/cuda/std/version +240 -0
  1199. cuda/cccl/headers/include/cuda/stream +31 -0
  1200. cuda/cccl/headers/include/cuda/stream_ref +59 -0
  1201. cuda/cccl/headers/include/cuda/type_traits +27 -0
  1202. cuda/cccl/headers/include/cuda/utility +28 -0
  1203. cuda/cccl/headers/include/cuda/version +16 -0
  1204. cuda/cccl/headers/include/cuda/warp +28 -0
  1205. cuda/cccl/headers/include/cuda/work_stealing +26 -0
  1206. cuda/cccl/headers/include/nv/detail/__preprocessor +169 -0
  1207. cuda/cccl/headers/include/nv/detail/__target_macros +718 -0
  1208. cuda/cccl/headers/include/nv/target +240 -0
  1209. cuda/cccl/headers/include/thrust/addressof.h +22 -0
  1210. cuda/cccl/headers/include/thrust/adjacent_difference.h +254 -0
  1211. cuda/cccl/headers/include/thrust/advance.h +57 -0
  1212. cuda/cccl/headers/include/thrust/allocate_unique.h +299 -0
  1213. cuda/cccl/headers/include/thrust/binary_search.h +1910 -0
  1214. cuda/cccl/headers/include/thrust/complex.h +858 -0
  1215. cuda/cccl/headers/include/thrust/copy.h +506 -0
  1216. cuda/cccl/headers/include/thrust/count.h +245 -0
  1217. cuda/cccl/headers/include/thrust/detail/adjacent_difference.inl +95 -0
  1218. cuda/cccl/headers/include/thrust/detail/alignment.h +81 -0
  1219. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +626 -0
  1220. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +192 -0
  1221. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +96 -0
  1222. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +81 -0
  1223. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +78 -0
  1224. cuda/cccl/headers/include/thrust/detail/allocator/no_throw_allocator.h +76 -0
  1225. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +115 -0
  1226. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +116 -0
  1227. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.h +77 -0
  1228. cuda/cccl/headers/include/thrust/detail/allocator_aware_execution_policy.h +99 -0
  1229. cuda/cccl/headers/include/thrust/detail/binary_search.inl +525 -0
  1230. cuda/cccl/headers/include/thrust/detail/caching_allocator.h +47 -0
  1231. cuda/cccl/headers/include/thrust/detail/complex/arithmetic.h +255 -0
  1232. cuda/cccl/headers/include/thrust/detail/complex/c99math.h +64 -0
  1233. cuda/cccl/headers/include/thrust/detail/complex/catrig.h +875 -0
  1234. cuda/cccl/headers/include/thrust/detail/complex/catrigf.h +589 -0
  1235. cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +233 -0
  1236. cuda/cccl/headers/include/thrust/detail/complex/ccoshf.h +161 -0
  1237. cuda/cccl/headers/include/thrust/detail/complex/cexp.h +195 -0
  1238. cuda/cccl/headers/include/thrust/detail/complex/cexpf.h +173 -0
  1239. cuda/cccl/headers/include/thrust/detail/complex/clog.h +223 -0
  1240. cuda/cccl/headers/include/thrust/detail/complex/clogf.h +210 -0
  1241. cuda/cccl/headers/include/thrust/detail/complex/complex.inl +263 -0
  1242. cuda/cccl/headers/include/thrust/detail/complex/cpow.h +50 -0
  1243. cuda/cccl/headers/include/thrust/detail/complex/cproj.h +81 -0
  1244. cuda/cccl/headers/include/thrust/detail/complex/csinh.h +228 -0
  1245. cuda/cccl/headers/include/thrust/detail/complex/csinhf.h +168 -0
  1246. cuda/cccl/headers/include/thrust/detail/complex/csqrt.h +178 -0
  1247. cuda/cccl/headers/include/thrust/detail/complex/csqrtf.h +174 -0
  1248. cuda/cccl/headers/include/thrust/detail/complex/ctanh.h +208 -0
  1249. cuda/cccl/headers/include/thrust/detail/complex/ctanhf.h +133 -0
  1250. cuda/cccl/headers/include/thrust/detail/complex/math_private.h +138 -0
  1251. cuda/cccl/headers/include/thrust/detail/complex/stream.h +73 -0
  1252. cuda/cccl/headers/include/thrust/detail/config/compiler.h +38 -0
  1253. cuda/cccl/headers/include/thrust/detail/config/config.h +43 -0
  1254. cuda/cccl/headers/include/thrust/detail/config/cpp_dialect.h +78 -0
  1255. cuda/cccl/headers/include/thrust/detail/config/device_system.h +55 -0
  1256. cuda/cccl/headers/include/thrust/detail/config/host_system.h +48 -0
  1257. cuda/cccl/headers/include/thrust/detail/config/memory_resource.h +41 -0
  1258. cuda/cccl/headers/include/thrust/detail/config/namespace.h +162 -0
  1259. cuda/cccl/headers/include/thrust/detail/config/simple_defines.h +48 -0
  1260. cuda/cccl/headers/include/thrust/detail/config.h +36 -0
  1261. cuda/cccl/headers/include/thrust/detail/contiguous_storage.h +228 -0
  1262. cuda/cccl/headers/include/thrust/detail/contiguous_storage.inl +273 -0
  1263. cuda/cccl/headers/include/thrust/detail/copy.h +72 -0
  1264. cuda/cccl/headers/include/thrust/detail/copy.inl +129 -0
  1265. cuda/cccl/headers/include/thrust/detail/copy_if.h +62 -0
  1266. cuda/cccl/headers/include/thrust/detail/copy_if.inl +102 -0
  1267. cuda/cccl/headers/include/thrust/detail/count.h +55 -0
  1268. cuda/cccl/headers/include/thrust/detail/count.inl +89 -0
  1269. cuda/cccl/headers/include/thrust/detail/device_ptr.inl +48 -0
  1270. cuda/cccl/headers/include/thrust/detail/equal.inl +93 -0
  1271. cuda/cccl/headers/include/thrust/detail/event_error.h +160 -0
  1272. cuda/cccl/headers/include/thrust/detail/execute_with_allocator.h +81 -0
  1273. cuda/cccl/headers/include/thrust/detail/execute_with_allocator_fwd.h +61 -0
  1274. cuda/cccl/headers/include/thrust/detail/execution_policy.h +120 -0
  1275. cuda/cccl/headers/include/thrust/detail/extrema.inl +184 -0
  1276. cuda/cccl/headers/include/thrust/detail/fill.inl +86 -0
  1277. cuda/cccl/headers/include/thrust/detail/find.inl +113 -0
  1278. cuda/cccl/headers/include/thrust/detail/for_each.inl +84 -0
  1279. cuda/cccl/headers/include/thrust/detail/function.h +49 -0
  1280. cuda/cccl/headers/include/thrust/detail/functional/actor.h +214 -0
  1281. cuda/cccl/headers/include/thrust/detail/functional/operators.h +386 -0
  1282. cuda/cccl/headers/include/thrust/detail/gather.inl +173 -0
  1283. cuda/cccl/headers/include/thrust/detail/generate.inl +86 -0
  1284. cuda/cccl/headers/include/thrust/detail/get_iterator_value.h +62 -0
  1285. cuda/cccl/headers/include/thrust/detail/inner_product.inl +118 -0
  1286. cuda/cccl/headers/include/thrust/detail/internal_functional.h +328 -0
  1287. cuda/cccl/headers/include/thrust/detail/logical.inl +113 -0
  1288. cuda/cccl/headers/include/thrust/detail/malloc_and_free.h +77 -0
  1289. cuda/cccl/headers/include/thrust/detail/malloc_and_free_fwd.h +45 -0
  1290. cuda/cccl/headers/include/thrust/detail/memory_algorithms.h +209 -0
  1291. cuda/cccl/headers/include/thrust/detail/merge.inl +276 -0
  1292. cuda/cccl/headers/include/thrust/detail/mismatch.inl +94 -0
  1293. cuda/cccl/headers/include/thrust/detail/overlapped_copy.h +124 -0
  1294. cuda/cccl/headers/include/thrust/detail/partition.inl +378 -0
  1295. cuda/cccl/headers/include/thrust/detail/pointer.h +309 -0
  1296. cuda/cccl/headers/include/thrust/detail/preprocessor.h +652 -0
  1297. cuda/cccl/headers/include/thrust/detail/random_bijection.h +177 -0
  1298. cuda/cccl/headers/include/thrust/detail/range/head_flags.h +116 -0
  1299. cuda/cccl/headers/include/thrust/detail/range/tail_flags.h +130 -0
  1300. cuda/cccl/headers/include/thrust/detail/raw_pointer_cast.h +52 -0
  1301. cuda/cccl/headers/include/thrust/detail/raw_reference_cast.h +192 -0
  1302. cuda/cccl/headers/include/thrust/detail/reduce.inl +377 -0
  1303. cuda/cccl/headers/include/thrust/detail/reference.h +494 -0
  1304. cuda/cccl/headers/include/thrust/detail/reference_forward_declaration.h +35 -0
  1305. cuda/cccl/headers/include/thrust/detail/remove.inl +213 -0
  1306. cuda/cccl/headers/include/thrust/detail/replace.inl +231 -0
  1307. cuda/cccl/headers/include/thrust/detail/reverse.inl +88 -0
  1308. cuda/cccl/headers/include/thrust/detail/scan.inl +518 -0
  1309. cuda/cccl/headers/include/thrust/detail/scatter.inl +157 -0
  1310. cuda/cccl/headers/include/thrust/detail/seq.h +66 -0
  1311. cuda/cccl/headers/include/thrust/detail/sequence.inl +109 -0
  1312. cuda/cccl/headers/include/thrust/detail/set_operations.inl +981 -0
  1313. cuda/cccl/headers/include/thrust/detail/shuffle.inl +86 -0
  1314. cuda/cccl/headers/include/thrust/detail/sort.inl +373 -0
  1315. cuda/cccl/headers/include/thrust/detail/static_assert.h +58 -0
  1316. cuda/cccl/headers/include/thrust/detail/static_map.h +167 -0
  1317. cuda/cccl/headers/include/thrust/detail/swap_ranges.inl +65 -0
  1318. cuda/cccl/headers/include/thrust/detail/tabulate.inl +62 -0
  1319. cuda/cccl/headers/include/thrust/detail/temporary_array.h +153 -0
  1320. cuda/cccl/headers/include/thrust/detail/temporary_array.inl +120 -0
  1321. cuda/cccl/headers/include/thrust/detail/temporary_buffer.h +81 -0
  1322. cuda/cccl/headers/include/thrust/detail/transform_reduce.inl +69 -0
  1323. cuda/cccl/headers/include/thrust/detail/transform_scan.inl +161 -0
  1324. cuda/cccl/headers/include/thrust/detail/trivial_sequence.h +130 -0
  1325. cuda/cccl/headers/include/thrust/detail/tuple_meta_transform.h +61 -0
  1326. cuda/cccl/headers/include/thrust/detail/type_deduction.h +62 -0
  1327. cuda/cccl/headers/include/thrust/detail/type_traits/has_member_function.h +47 -0
  1328. cuda/cccl/headers/include/thrust/detail/type_traits/has_nested_type.h +43 -0
  1329. cuda/cccl/headers/include/thrust/detail/type_traits/is_call_possible.h +167 -0
  1330. cuda/cccl/headers/include/thrust/detail/type_traits/is_commutative.h +69 -0
  1331. cuda/cccl/headers/include/thrust/detail/type_traits/is_metafunction_defined.h +39 -0
  1332. cuda/cccl/headers/include/thrust/detail/type_traits/is_thrust_pointer.h +59 -0
  1333. cuda/cccl/headers/include/thrust/detail/type_traits/iterator/is_output_iterator.h +46 -0
  1334. cuda/cccl/headers/include/thrust/detail/type_traits/minimum_type.h +89 -0
  1335. cuda/cccl/headers/include/thrust/detail/type_traits/pointer_traits.h +332 -0
  1336. cuda/cccl/headers/include/thrust/detail/type_traits.h +136 -0
  1337. cuda/cccl/headers/include/thrust/detail/uninitialized_copy.inl +90 -0
  1338. cuda/cccl/headers/include/thrust/detail/uninitialized_fill.inl +86 -0
  1339. cuda/cccl/headers/include/thrust/detail/unique.inl +373 -0
  1340. cuda/cccl/headers/include/thrust/detail/use_default.h +34 -0
  1341. cuda/cccl/headers/include/thrust/detail/vector_base.h +613 -0
  1342. cuda/cccl/headers/include/thrust/detail/vector_base.inl +1210 -0
  1343. cuda/cccl/headers/include/thrust/device_allocator.h +134 -0
  1344. cuda/cccl/headers/include/thrust/device_delete.h +74 -0
  1345. cuda/cccl/headers/include/thrust/device_free.h +85 -0
  1346. cuda/cccl/headers/include/thrust/device_make_unique.h +56 -0
  1347. cuda/cccl/headers/include/thrust/device_malloc.h +84 -0
  1348. cuda/cccl/headers/include/thrust/device_malloc_allocator.h +190 -0
  1349. cuda/cccl/headers/include/thrust/device_new.h +112 -0
  1350. cuda/cccl/headers/include/thrust/device_new_allocator.h +179 -0
  1351. cuda/cccl/headers/include/thrust/device_ptr.h +196 -0
  1352. cuda/cccl/headers/include/thrust/device_reference.h +983 -0
  1353. cuda/cccl/headers/include/thrust/device_vector.h +576 -0
  1354. cuda/cccl/headers/include/thrust/distance.h +43 -0
  1355. cuda/cccl/headers/include/thrust/equal.h +247 -0
  1356. cuda/cccl/headers/include/thrust/execution_policy.h +251 -0
  1357. cuda/cccl/headers/include/thrust/extrema.h +657 -0
  1358. cuda/cccl/headers/include/thrust/fill.h +200 -0
  1359. cuda/cccl/headers/include/thrust/find.h +382 -0
  1360. cuda/cccl/headers/include/thrust/for_each.h +261 -0
  1361. cuda/cccl/headers/include/thrust/functional.h +395 -0
  1362. cuda/cccl/headers/include/thrust/gather.h +464 -0
  1363. cuda/cccl/headers/include/thrust/generate.h +193 -0
  1364. cuda/cccl/headers/include/thrust/host_vector.h +576 -0
  1365. cuda/cccl/headers/include/thrust/inner_product.h +264 -0
  1366. cuda/cccl/headers/include/thrust/iterator/constant_iterator.h +221 -0
  1367. cuda/cccl/headers/include/thrust/iterator/counting_iterator.h +335 -0
  1368. cuda/cccl/headers/include/thrust/iterator/detail/any_assign.h +48 -0
  1369. cuda/cccl/headers/include/thrust/iterator/detail/any_system_tag.h +43 -0
  1370. cuda/cccl/headers/include/thrust/iterator/detail/device_system_tag.h +38 -0
  1371. cuda/cccl/headers/include/thrust/iterator/detail/host_system_tag.h +38 -0
  1372. cuda/cccl/headers/include/thrust/iterator/detail/iterator_adaptor_base.h +81 -0
  1373. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_system.h +60 -0
  1374. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_traversal.h +65 -0
  1375. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h +57 -0
  1376. cuda/cccl/headers/include/thrust/iterator/detail/iterator_facade_category.h +182 -0
  1377. cuda/cccl/headers/include/thrust/iterator/detail/minimum_system.h +58 -0
  1378. cuda/cccl/headers/include/thrust/iterator/detail/normal_iterator.h +69 -0
  1379. cuda/cccl/headers/include/thrust/iterator/detail/retag.h +104 -0
  1380. cuda/cccl/headers/include/thrust/iterator/detail/tagged_iterator.h +81 -0
  1381. cuda/cccl/headers/include/thrust/iterator/detail/tuple_of_iterator_references.h +174 -0
  1382. cuda/cccl/headers/include/thrust/iterator/discard_iterator.h +163 -0
  1383. cuda/cccl/headers/include/thrust/iterator/iterator_adaptor.h +251 -0
  1384. cuda/cccl/headers/include/thrust/iterator/iterator_categories.h +211 -0
  1385. cuda/cccl/headers/include/thrust/iterator/iterator_facade.h +659 -0
  1386. cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +334 -0
  1387. cuda/cccl/headers/include/thrust/iterator/iterator_traversal_tags.h +64 -0
  1388. cuda/cccl/headers/include/thrust/iterator/offset_iterator.h +194 -0
  1389. cuda/cccl/headers/include/thrust/iterator/permutation_iterator.h +204 -0
  1390. cuda/cccl/headers/include/thrust/iterator/retag.h +72 -0
  1391. cuda/cccl/headers/include/thrust/iterator/reverse_iterator.h +51 -0
  1392. cuda/cccl/headers/include/thrust/iterator/shuffle_iterator.h +185 -0
  1393. cuda/cccl/headers/include/thrust/iterator/strided_iterator.h +152 -0
  1394. cuda/cccl/headers/include/thrust/iterator/tabulate_output_iterator.h +152 -0
  1395. cuda/cccl/headers/include/thrust/iterator/transform_input_output_iterator.h +226 -0
  1396. cuda/cccl/headers/include/thrust/iterator/transform_iterator.h +351 -0
  1397. cuda/cccl/headers/include/thrust/iterator/transform_output_iterator.h +190 -0
  1398. cuda/cccl/headers/include/thrust/iterator/zip_iterator.h +359 -0
  1399. cuda/cccl/headers/include/thrust/logical.h +290 -0
  1400. cuda/cccl/headers/include/thrust/memory.h +299 -0
  1401. cuda/cccl/headers/include/thrust/merge.h +725 -0
  1402. cuda/cccl/headers/include/thrust/mismatch.h +261 -0
  1403. cuda/cccl/headers/include/thrust/mr/allocator.h +229 -0
  1404. cuda/cccl/headers/include/thrust/mr/device_memory_resource.h +41 -0
  1405. cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +528 -0
  1406. cuda/cccl/headers/include/thrust/mr/disjoint_sync_pool.h +118 -0
  1407. cuda/cccl/headers/include/thrust/mr/disjoint_tls_pool.h +67 -0
  1408. cuda/cccl/headers/include/thrust/mr/fancy_pointer_resource.h +67 -0
  1409. cuda/cccl/headers/include/thrust/mr/host_memory_resource.h +38 -0
  1410. cuda/cccl/headers/include/thrust/mr/memory_resource.h +217 -0
  1411. cuda/cccl/headers/include/thrust/mr/new.h +100 -0
  1412. cuda/cccl/headers/include/thrust/mr/polymorphic_adaptor.h +63 -0
  1413. cuda/cccl/headers/include/thrust/mr/pool.h +528 -0
  1414. cuda/cccl/headers/include/thrust/mr/pool_options.h +174 -0
  1415. cuda/cccl/headers/include/thrust/mr/sync_pool.h +114 -0
  1416. cuda/cccl/headers/include/thrust/mr/tls_pool.h +64 -0
  1417. cuda/cccl/headers/include/thrust/mr/universal_memory_resource.h +29 -0
  1418. cuda/cccl/headers/include/thrust/mr/validator.h +56 -0
  1419. cuda/cccl/headers/include/thrust/pair.h +99 -0
  1420. cuda/cccl/headers/include/thrust/partition.h +1391 -0
  1421. cuda/cccl/headers/include/thrust/per_device_resource.h +98 -0
  1422. cuda/cccl/headers/include/thrust/random/detail/discard_block_engine.inl +184 -0
  1423. cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine.inl +155 -0
  1424. cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine_discard.h +104 -0
  1425. cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine.inl +151 -0
  1426. cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h +53 -0
  1427. cuda/cccl/headers/include/thrust/random/detail/mod.h +101 -0
  1428. cuda/cccl/headers/include/thrust/random/detail/normal_distribution.inl +187 -0
  1429. cuda/cccl/headers/include/thrust/random/detail/normal_distribution_base.h +160 -0
  1430. cuda/cccl/headers/include/thrust/random/detail/random_core_access.h +63 -0
  1431. cuda/cccl/headers/include/thrust/random/detail/subtract_with_carry_engine.inl +201 -0
  1432. cuda/cccl/headers/include/thrust/random/detail/uniform_int_distribution.inl +198 -0
  1433. cuda/cccl/headers/include/thrust/random/detail/uniform_real_distribution.inl +200 -0
  1434. cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine.inl +183 -0
  1435. cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine_max.h +187 -0
  1436. cuda/cccl/headers/include/thrust/random/discard_block_engine.h +240 -0
  1437. cuda/cccl/headers/include/thrust/random/linear_congruential_engine.h +289 -0
  1438. cuda/cccl/headers/include/thrust/random/linear_feedback_shift_engine.h +217 -0
  1439. cuda/cccl/headers/include/thrust/random/normal_distribution.h +257 -0
  1440. cuda/cccl/headers/include/thrust/random/subtract_with_carry_engine.h +247 -0
  1441. cuda/cccl/headers/include/thrust/random/uniform_int_distribution.h +261 -0
  1442. cuda/cccl/headers/include/thrust/random/uniform_real_distribution.h +258 -0
  1443. cuda/cccl/headers/include/thrust/random/xor_combine_engine.h +255 -0
  1444. cuda/cccl/headers/include/thrust/random.h +120 -0
  1445. cuda/cccl/headers/include/thrust/reduce.h +1113 -0
  1446. cuda/cccl/headers/include/thrust/remove.h +768 -0
  1447. cuda/cccl/headers/include/thrust/replace.h +826 -0
  1448. cuda/cccl/headers/include/thrust/reverse.h +215 -0
  1449. cuda/cccl/headers/include/thrust/scan.h +1671 -0
  1450. cuda/cccl/headers/include/thrust/scatter.h +446 -0
  1451. cuda/cccl/headers/include/thrust/sequence.h +277 -0
  1452. cuda/cccl/headers/include/thrust/set_operations.h +3026 -0
  1453. cuda/cccl/headers/include/thrust/shuffle.h +182 -0
  1454. cuda/cccl/headers/include/thrust/sort.h +1320 -0
  1455. cuda/cccl/headers/include/thrust/swap.h +147 -0
  1456. cuda/cccl/headers/include/thrust/system/cpp/detail/adjacent_difference.h +30 -0
  1457. cuda/cccl/headers/include/thrust/system/cpp/detail/assign_value.h +30 -0
  1458. cuda/cccl/headers/include/thrust/system/cpp/detail/binary_search.h +32 -0
  1459. cuda/cccl/headers/include/thrust/system/cpp/detail/copy.h +30 -0
  1460. cuda/cccl/headers/include/thrust/system/cpp/detail/copy_if.h +30 -0
  1461. cuda/cccl/headers/include/thrust/system/cpp/detail/count.h +29 -0
  1462. cuda/cccl/headers/include/thrust/system/cpp/detail/equal.h +29 -0
  1463. cuda/cccl/headers/include/thrust/system/cpp/detail/execution_policy.h +109 -0
  1464. cuda/cccl/headers/include/thrust/system/cpp/detail/extrema.h +30 -0
  1465. cuda/cccl/headers/include/thrust/system/cpp/detail/fill.h +29 -0
  1466. cuda/cccl/headers/include/thrust/system/cpp/detail/find.h +30 -0
  1467. cuda/cccl/headers/include/thrust/system/cpp/detail/for_each.h +30 -0
  1468. cuda/cccl/headers/include/thrust/system/cpp/detail/gather.h +29 -0
  1469. cuda/cccl/headers/include/thrust/system/cpp/detail/generate.h +29 -0
  1470. cuda/cccl/headers/include/thrust/system/cpp/detail/get_value.h +30 -0
  1471. cuda/cccl/headers/include/thrust/system/cpp/detail/inner_product.h +29 -0
  1472. cuda/cccl/headers/include/thrust/system/cpp/detail/iter_swap.h +30 -0
  1473. cuda/cccl/headers/include/thrust/system/cpp/detail/logical.h +29 -0
  1474. cuda/cccl/headers/include/thrust/system/cpp/detail/malloc_and_free.h +30 -0
  1475. cuda/cccl/headers/include/thrust/system/cpp/detail/memory.inl +60 -0
  1476. cuda/cccl/headers/include/thrust/system/cpp/detail/merge.h +30 -0
  1477. cuda/cccl/headers/include/thrust/system/cpp/detail/mismatch.h +29 -0
  1478. cuda/cccl/headers/include/thrust/system/cpp/detail/partition.h +30 -0
  1479. cuda/cccl/headers/include/thrust/system/cpp/detail/per_device_resource.h +29 -0
  1480. cuda/cccl/headers/include/thrust/system/cpp/detail/reduce.h +30 -0
  1481. cuda/cccl/headers/include/thrust/system/cpp/detail/reduce_by_key.h +30 -0
  1482. cuda/cccl/headers/include/thrust/system/cpp/detail/remove.h +30 -0
  1483. cuda/cccl/headers/include/thrust/system/cpp/detail/replace.h +29 -0
  1484. cuda/cccl/headers/include/thrust/system/cpp/detail/reverse.h +29 -0
  1485. cuda/cccl/headers/include/thrust/system/cpp/detail/scan.h +30 -0
  1486. cuda/cccl/headers/include/thrust/system/cpp/detail/scan_by_key.h +30 -0
  1487. cuda/cccl/headers/include/thrust/system/cpp/detail/scatter.h +29 -0
  1488. cuda/cccl/headers/include/thrust/system/cpp/detail/sequence.h +29 -0
  1489. cuda/cccl/headers/include/thrust/system/cpp/detail/set_operations.h +30 -0
  1490. cuda/cccl/headers/include/thrust/system/cpp/detail/sort.h +30 -0
  1491. cuda/cccl/headers/include/thrust/system/cpp/detail/swap_ranges.h +29 -0
  1492. cuda/cccl/headers/include/thrust/system/cpp/detail/tabulate.h +29 -0
  1493. cuda/cccl/headers/include/thrust/system/cpp/detail/temporary_buffer.h +29 -0
  1494. cuda/cccl/headers/include/thrust/system/cpp/detail/transform.h +29 -0
  1495. cuda/cccl/headers/include/thrust/system/cpp/detail/transform_reduce.h +29 -0
  1496. cuda/cccl/headers/include/thrust/system/cpp/detail/transform_scan.h +29 -0
  1497. cuda/cccl/headers/include/thrust/system/cpp/detail/uninitialized_copy.h +29 -0
  1498. cuda/cccl/headers/include/thrust/system/cpp/detail/uninitialized_fill.h +29 -0
  1499. cuda/cccl/headers/include/thrust/system/cpp/detail/unique.h +30 -0
  1500. cuda/cccl/headers/include/thrust/system/cpp/detail/unique_by_key.h +30 -0
  1501. cuda/cccl/headers/include/thrust/system/cpp/execution_policy.h +63 -0
  1502. cuda/cccl/headers/include/thrust/system/cpp/memory.h +106 -0
  1503. cuda/cccl/headers/include/thrust/system/cpp/memory_resource.h +72 -0
  1504. cuda/cccl/headers/include/thrust/system/cpp/pointer.h +120 -0
  1505. cuda/cccl/headers/include/thrust/system/cpp/vector.h +96 -0
  1506. cuda/cccl/headers/include/thrust/system/cuda/config.h +126 -0
  1507. cuda/cccl/headers/include/thrust/system/cuda/detail/adjacent_difference.h +219 -0
  1508. cuda/cccl/headers/include/thrust/system/cuda/detail/assign_value.h +124 -0
  1509. cuda/cccl/headers/include/thrust/system/cuda/detail/binary_search.h +29 -0
  1510. cuda/cccl/headers/include/thrust/system/cuda/detail/cdp_dispatch.h +72 -0
  1511. cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +273 -0
  1512. cuda/cccl/headers/include/thrust/system/cuda/detail/copy_if.h +255 -0
  1513. cuda/cccl/headers/include/thrust/system/cuda/detail/core/agent_launcher.h +289 -0
  1514. cuda/cccl/headers/include/thrust/system/cuda/detail/core/triple_chevron_launch.h +191 -0
  1515. cuda/cccl/headers/include/thrust/system/cuda/detail/core/util.h +593 -0
  1516. cuda/cccl/headers/include/thrust/system/cuda/detail/count.h +75 -0
  1517. cuda/cccl/headers/include/thrust/system/cuda/detail/cross_system.h +243 -0
  1518. cuda/cccl/headers/include/thrust/system/cuda/detail/dispatch.h +233 -0
  1519. cuda/cccl/headers/include/thrust/system/cuda/detail/equal.h +64 -0
  1520. cuda/cccl/headers/include/thrust/system/cuda/detail/error.inl +96 -0
  1521. cuda/cccl/headers/include/thrust/system/cuda/detail/execution_policy.h +264 -0
  1522. cuda/cccl/headers/include/thrust/system/cuda/detail/extrema.h +476 -0
  1523. cuda/cccl/headers/include/thrust/system/cuda/detail/fill.h +100 -0
  1524. cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +170 -0
  1525. cuda/cccl/headers/include/thrust/system/cuda/detail/for_each.h +83 -0
  1526. cuda/cccl/headers/include/thrust/system/cuda/detail/gather.h +91 -0
  1527. cuda/cccl/headers/include/thrust/system/cuda/detail/generate.h +60 -0
  1528. cuda/cccl/headers/include/thrust/system/cuda/detail/get_value.h +65 -0
  1529. cuda/cccl/headers/include/thrust/system/cuda/detail/inner_product.h +75 -0
  1530. cuda/cccl/headers/include/thrust/system/cuda/detail/iter_swap.h +80 -0
  1531. cuda/cccl/headers/include/thrust/system/cuda/detail/logical.h +29 -0
  1532. cuda/cccl/headers/include/thrust/system/cuda/detail/make_unsigned_special.h +61 -0
  1533. cuda/cccl/headers/include/thrust/system/cuda/detail/malloc_and_free.h +121 -0
  1534. cuda/cccl/headers/include/thrust/system/cuda/detail/memory.inl +57 -0
  1535. cuda/cccl/headers/include/thrust/system/cuda/detail/merge.h +228 -0
  1536. cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +223 -0
  1537. cuda/cccl/headers/include/thrust/system/cuda/detail/parallel_for.h +81 -0
  1538. cuda/cccl/headers/include/thrust/system/cuda/detail/partition.h +405 -0
  1539. cuda/cccl/headers/include/thrust/system/cuda/detail/per_device_resource.h +72 -0
  1540. cuda/cccl/headers/include/thrust/system/cuda/detail/reduce.h +785 -0
  1541. cuda/cccl/headers/include/thrust/system/cuda/detail/reduce_by_key.h +1001 -0
  1542. cuda/cccl/headers/include/thrust/system/cuda/detail/remove.h +107 -0
  1543. cuda/cccl/headers/include/thrust/system/cuda/detail/replace.h +122 -0
  1544. cuda/cccl/headers/include/thrust/system/cuda/detail/reverse.h +87 -0
  1545. cuda/cccl/headers/include/thrust/system/cuda/detail/scan.h +341 -0
  1546. cuda/cccl/headers/include/thrust/system/cuda/detail/scan_by_key.h +414 -0
  1547. cuda/cccl/headers/include/thrust/system/cuda/detail/scatter.h +91 -0
  1548. cuda/cccl/headers/include/thrust/system/cuda/detail/sequence.h +29 -0
  1549. cuda/cccl/headers/include/thrust/system/cuda/detail/set_operations.h +1734 -0
  1550. cuda/cccl/headers/include/thrust/system/cuda/detail/sort.h +469 -0
  1551. cuda/cccl/headers/include/thrust/system/cuda/detail/swap_ranges.h +98 -0
  1552. cuda/cccl/headers/include/thrust/system/cuda/detail/tabulate.h +61 -0
  1553. cuda/cccl/headers/include/thrust/system/cuda/detail/temporary_buffer.h +132 -0
  1554. cuda/cccl/headers/include/thrust/system/cuda/detail/terminate.h +53 -0
  1555. cuda/cccl/headers/include/thrust/system/cuda/detail/transform.h +429 -0
  1556. cuda/cccl/headers/include/thrust/system/cuda/detail/transform_reduce.h +143 -0
  1557. cuda/cccl/headers/include/thrust/system/cuda/detail/transform_scan.h +119 -0
  1558. cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_copy.h +117 -0
  1559. cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_fill.h +105 -0
  1560. cuda/cccl/headers/include/thrust/system/cuda/detail/unique.h +289 -0
  1561. cuda/cccl/headers/include/thrust/system/cuda/detail/unique_by_key.h +310 -0
  1562. cuda/cccl/headers/include/thrust/system/cuda/detail/util.h +253 -0
  1563. cuda/cccl/headers/include/thrust/system/cuda/error.h +168 -0
  1564. cuda/cccl/headers/include/thrust/system/cuda/execution_policy.h +15 -0
  1565. cuda/cccl/headers/include/thrust/system/cuda/memory.h +122 -0
  1566. cuda/cccl/headers/include/thrust/system/cuda/memory_resource.h +122 -0
  1567. cuda/cccl/headers/include/thrust/system/cuda/pointer.h +160 -0
  1568. cuda/cccl/headers/include/thrust/system/cuda/vector.h +108 -0
  1569. cuda/cccl/headers/include/thrust/system/detail/adl/adjacent_difference.h +51 -0
  1570. cuda/cccl/headers/include/thrust/system/detail/adl/assign_value.h +51 -0
  1571. cuda/cccl/headers/include/thrust/system/detail/adl/binary_search.h +51 -0
  1572. cuda/cccl/headers/include/thrust/system/detail/adl/copy.h +51 -0
  1573. cuda/cccl/headers/include/thrust/system/detail/adl/copy_if.h +52 -0
  1574. cuda/cccl/headers/include/thrust/system/detail/adl/count.h +51 -0
  1575. cuda/cccl/headers/include/thrust/system/detail/adl/equal.h +51 -0
  1576. cuda/cccl/headers/include/thrust/system/detail/adl/extrema.h +51 -0
  1577. cuda/cccl/headers/include/thrust/system/detail/adl/fill.h +51 -0
  1578. cuda/cccl/headers/include/thrust/system/detail/adl/find.h +51 -0
  1579. cuda/cccl/headers/include/thrust/system/detail/adl/for_each.h +51 -0
  1580. cuda/cccl/headers/include/thrust/system/detail/adl/gather.h +51 -0
  1581. cuda/cccl/headers/include/thrust/system/detail/adl/generate.h +51 -0
  1582. cuda/cccl/headers/include/thrust/system/detail/adl/get_value.h +51 -0
  1583. cuda/cccl/headers/include/thrust/system/detail/adl/inner_product.h +51 -0
  1584. cuda/cccl/headers/include/thrust/system/detail/adl/iter_swap.h +51 -0
  1585. cuda/cccl/headers/include/thrust/system/detail/adl/logical.h +51 -0
  1586. cuda/cccl/headers/include/thrust/system/detail/adl/malloc_and_free.h +51 -0
  1587. cuda/cccl/headers/include/thrust/system/detail/adl/merge.h +51 -0
  1588. cuda/cccl/headers/include/thrust/system/detail/adl/mismatch.h +51 -0
  1589. cuda/cccl/headers/include/thrust/system/detail/adl/partition.h +51 -0
  1590. cuda/cccl/headers/include/thrust/system/detail/adl/per_device_resource.h +51 -0
  1591. cuda/cccl/headers/include/thrust/system/detail/adl/reduce.h +51 -0
  1592. cuda/cccl/headers/include/thrust/system/detail/adl/reduce_by_key.h +51 -0
  1593. cuda/cccl/headers/include/thrust/system/detail/adl/remove.h +51 -0
  1594. cuda/cccl/headers/include/thrust/system/detail/adl/replace.h +51 -0
  1595. cuda/cccl/headers/include/thrust/system/detail/adl/reverse.h +51 -0
  1596. cuda/cccl/headers/include/thrust/system/detail/adl/scan.h +51 -0
  1597. cuda/cccl/headers/include/thrust/system/detail/adl/scan_by_key.h +51 -0
  1598. cuda/cccl/headers/include/thrust/system/detail/adl/scatter.h +51 -0
  1599. cuda/cccl/headers/include/thrust/system/detail/adl/sequence.h +51 -0
  1600. cuda/cccl/headers/include/thrust/system/detail/adl/set_operations.h +51 -0
  1601. cuda/cccl/headers/include/thrust/system/detail/adl/sort.h +51 -0
  1602. cuda/cccl/headers/include/thrust/system/detail/adl/swap_ranges.h +51 -0
  1603. cuda/cccl/headers/include/thrust/system/detail/adl/tabulate.h +51 -0
  1604. cuda/cccl/headers/include/thrust/system/detail/adl/temporary_buffer.h +51 -0
  1605. cuda/cccl/headers/include/thrust/system/detail/adl/transform.h +51 -0
  1606. cuda/cccl/headers/include/thrust/system/detail/adl/transform_reduce.h +51 -0
  1607. cuda/cccl/headers/include/thrust/system/detail/adl/transform_scan.h +51 -0
  1608. cuda/cccl/headers/include/thrust/system/detail/adl/uninitialized_copy.h +51 -0
  1609. cuda/cccl/headers/include/thrust/system/detail/adl/uninitialized_fill.h +51 -0
  1610. cuda/cccl/headers/include/thrust/system/detail/adl/unique.h +51 -0
  1611. cuda/cccl/headers/include/thrust/system/detail/adl/unique_by_key.h +51 -0
  1612. cuda/cccl/headers/include/thrust/system/detail/bad_alloc.h +61 -0
  1613. cuda/cccl/headers/include/thrust/system/detail/errno.h +120 -0
  1614. cuda/cccl/headers/include/thrust/system/detail/error_category.inl +302 -0
  1615. cuda/cccl/headers/include/thrust/system/detail/error_code.inl +173 -0
  1616. cuda/cccl/headers/include/thrust/system/detail/error_condition.inl +121 -0
  1617. cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.h +53 -0
  1618. cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.inl +79 -0
  1619. cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.h +161 -0
  1620. cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.inl +384 -0
  1621. cuda/cccl/headers/include/thrust/system/detail/generic/copy.h +45 -0
  1622. cuda/cccl/headers/include/thrust/system/detail/generic/copy.inl +64 -0
  1623. cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.h +58 -0
  1624. cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.inl +146 -0
  1625. cuda/cccl/headers/include/thrust/system/detail/generic/count.h +48 -0
  1626. cuda/cccl/headers/include/thrust/system/detail/generic/count.inl +84 -0
  1627. cuda/cccl/headers/include/thrust/system/detail/generic/equal.h +49 -0
  1628. cuda/cccl/headers/include/thrust/system/detail/generic/equal.inl +60 -0
  1629. cuda/cccl/headers/include/thrust/system/detail/generic/extrema.h +66 -0
  1630. cuda/cccl/headers/include/thrust/system/detail/generic/extrema.inl +252 -0
  1631. cuda/cccl/headers/include/thrust/system/detail/generic/fill.h +54 -0
  1632. cuda/cccl/headers/include/thrust/system/detail/generic/find.h +49 -0
  1633. cuda/cccl/headers/include/thrust/system/detail/generic/find.inl +137 -0
  1634. cuda/cccl/headers/include/thrust/system/detail/generic/for_each.h +58 -0
  1635. cuda/cccl/headers/include/thrust/system/detail/generic/gather.h +73 -0
  1636. cuda/cccl/headers/include/thrust/system/detail/generic/gather.inl +96 -0
  1637. cuda/cccl/headers/include/thrust/system/detail/generic/generate.h +45 -0
  1638. cuda/cccl/headers/include/thrust/system/detail/generic/generate.inl +63 -0
  1639. cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.h +60 -0
  1640. cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.inl +72 -0
  1641. cuda/cccl/headers/include/thrust/system/detail/generic/logical.h +59 -0
  1642. cuda/cccl/headers/include/thrust/system/detail/generic/memory.h +64 -0
  1643. cuda/cccl/headers/include/thrust/system/detail/generic/memory.inl +86 -0
  1644. cuda/cccl/headers/include/thrust/system/detail/generic/merge.h +99 -0
  1645. cuda/cccl/headers/include/thrust/system/detail/generic/merge.inl +148 -0
  1646. cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.h +49 -0
  1647. cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.inl +68 -0
  1648. cuda/cccl/headers/include/thrust/system/detail/generic/partition.h +129 -0
  1649. cuda/cccl/headers/include/thrust/system/detail/generic/partition.inl +207 -0
  1650. cuda/cccl/headers/include/thrust/system/detail/generic/per_device_resource.h +43 -0
  1651. cuda/cccl/headers/include/thrust/system/detail/generic/reduce.h +71 -0
  1652. cuda/cccl/headers/include/thrust/system/detail/generic/reduce.inl +100 -0
  1653. cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.h +83 -0
  1654. cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.inl +186 -0
  1655. cuda/cccl/headers/include/thrust/system/detail/generic/remove.h +86 -0
  1656. cuda/cccl/headers/include/thrust/system/detail/generic/remove.inl +121 -0
  1657. cuda/cccl/headers/include/thrust/system/detail/generic/replace.h +95 -0
  1658. cuda/cccl/headers/include/thrust/system/detail/generic/replace.inl +175 -0
  1659. cuda/cccl/headers/include/thrust/system/detail/generic/reverse.h +48 -0
  1660. cuda/cccl/headers/include/thrust/system/detail/generic/reverse.inl +67 -0
  1661. cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.h +63 -0
  1662. cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.inl +126 -0
  1663. cuda/cccl/headers/include/thrust/system/detail/generic/scan.h +72 -0
  1664. cuda/cccl/headers/include/thrust/system/detail/generic/scan.inl +85 -0
  1665. cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.h +126 -0
  1666. cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.inl +232 -0
  1667. cuda/cccl/headers/include/thrust/system/detail/generic/scatter.h +73 -0
  1668. cuda/cccl/headers/include/thrust/system/detail/generic/scatter.inl +85 -0
  1669. cuda/cccl/headers/include/thrust/system/detail/generic/select_system.h +104 -0
  1670. cuda/cccl/headers/include/thrust/system/detail/generic/sequence.h +70 -0
  1671. cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.h +282 -0
  1672. cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.inl +476 -0
  1673. cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.h +54 -0
  1674. cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.inl +125 -0
  1675. cuda/cccl/headers/include/thrust/system/detail/generic/sort.h +113 -0
  1676. cuda/cccl/headers/include/thrust/system/detail/generic/sort.inl +175 -0
  1677. cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.h +44 -0
  1678. cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.inl +76 -0
  1679. cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.h +41 -0
  1680. cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.inl +54 -0
  1681. cuda/cccl/headers/include/thrust/system/detail/generic/tag.h +47 -0
  1682. cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.h +54 -0
  1683. cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.inl +82 -0
  1684. cuda/cccl/headers/include/thrust/system/detail/generic/transform.h +395 -0
  1685. cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.h +50 -0
  1686. cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.inl +56 -0
  1687. cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.h +80 -0
  1688. cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.inl +113 -0
  1689. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.h +45 -0
  1690. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.inl +166 -0
  1691. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.h +45 -0
  1692. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.inl +115 -0
  1693. cuda/cccl/headers/include/thrust/system/detail/generic/unique.h +71 -0
  1694. cuda/cccl/headers/include/thrust/system/detail/generic/unique.inl +113 -0
  1695. cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.h +81 -0
  1696. cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.inl +126 -0
  1697. cuda/cccl/headers/include/thrust/system/detail/internal/decompose.h +117 -0
  1698. cuda/cccl/headers/include/thrust/system/detail/sequential/adjacent_difference.h +70 -0
  1699. cuda/cccl/headers/include/thrust/system/detail/sequential/assign_value.h +42 -0
  1700. cuda/cccl/headers/include/thrust/system/detail/sequential/binary_search.h +136 -0
  1701. cuda/cccl/headers/include/thrust/system/detail/sequential/copy.h +49 -0
  1702. cuda/cccl/headers/include/thrust/system/detail/sequential/copy.inl +119 -0
  1703. cuda/cccl/headers/include/thrust/system/detail/sequential/copy_backward.h +49 -0
  1704. cuda/cccl/headers/include/thrust/system/detail/sequential/copy_if.h +71 -0
  1705. cuda/cccl/headers/include/thrust/system/detail/sequential/count.h +29 -0
  1706. cuda/cccl/headers/include/thrust/system/detail/sequential/equal.h +29 -0
  1707. cuda/cccl/headers/include/thrust/system/detail/sequential/execution_policy.h +52 -0
  1708. cuda/cccl/headers/include/thrust/system/detail/sequential/extrema.h +110 -0
  1709. cuda/cccl/headers/include/thrust/system/detail/sequential/fill.h +29 -0
  1710. cuda/cccl/headers/include/thrust/system/detail/sequential/find.h +62 -0
  1711. cuda/cccl/headers/include/thrust/system/detail/sequential/for_each.h +74 -0
  1712. cuda/cccl/headers/include/thrust/system/detail/sequential/gather.h +29 -0
  1713. cuda/cccl/headers/include/thrust/system/detail/sequential/general_copy.h +123 -0
  1714. cuda/cccl/headers/include/thrust/system/detail/sequential/generate.h +29 -0
  1715. cuda/cccl/headers/include/thrust/system/detail/sequential/get_value.h +43 -0
  1716. cuda/cccl/headers/include/thrust/system/detail/sequential/inner_product.h +29 -0
  1717. cuda/cccl/headers/include/thrust/system/detail/sequential/insertion_sort.h +141 -0
  1718. cuda/cccl/headers/include/thrust/system/detail/sequential/iter_swap.h +45 -0
  1719. cuda/cccl/headers/include/thrust/system/detail/sequential/logical.h +29 -0
  1720. cuda/cccl/headers/include/thrust/system/detail/sequential/malloc_and_free.h +50 -0
  1721. cuda/cccl/headers/include/thrust/system/detail/sequential/merge.h +75 -0
  1722. cuda/cccl/headers/include/thrust/system/detail/sequential/merge.inl +145 -0
  1723. cuda/cccl/headers/include/thrust/system/detail/sequential/mismatch.h +29 -0
  1724. cuda/cccl/headers/include/thrust/system/detail/sequential/partition.h +301 -0
  1725. cuda/cccl/headers/include/thrust/system/detail/sequential/per_device_resource.h +29 -0
  1726. cuda/cccl/headers/include/thrust/system/detail/sequential/reduce.h +64 -0
  1727. cuda/cccl/headers/include/thrust/system/detail/sequential/reduce_by_key.h +98 -0
  1728. cuda/cccl/headers/include/thrust/system/detail/sequential/remove.h +179 -0
  1729. cuda/cccl/headers/include/thrust/system/detail/sequential/replace.h +29 -0
  1730. cuda/cccl/headers/include/thrust/system/detail/sequential/reverse.h +29 -0
  1731. cuda/cccl/headers/include/thrust/system/detail/sequential/scan.h +154 -0
  1732. cuda/cccl/headers/include/thrust/system/detail/sequential/scan_by_key.h +145 -0
  1733. cuda/cccl/headers/include/thrust/system/detail/sequential/scatter.h +29 -0
  1734. cuda/cccl/headers/include/thrust/system/detail/sequential/sequence.h +29 -0
  1735. cuda/cccl/headers/include/thrust/system/detail/sequential/set_operations.h +206 -0
  1736. cuda/cccl/headers/include/thrust/system/detail/sequential/sort.h +59 -0
  1737. cuda/cccl/headers/include/thrust/system/detail/sequential/sort.inl +116 -0
  1738. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.h +55 -0
  1739. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.inl +356 -0
  1740. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.h +48 -0
  1741. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.inl +124 -0
  1742. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.h +48 -0
  1743. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.inl +586 -0
  1744. cuda/cccl/headers/include/thrust/system/detail/sequential/swap_ranges.h +29 -0
  1745. cuda/cccl/headers/include/thrust/system/detail/sequential/tabulate.h +29 -0
  1746. cuda/cccl/headers/include/thrust/system/detail/sequential/temporary_buffer.h +29 -0
  1747. cuda/cccl/headers/include/thrust/system/detail/sequential/transform.h +29 -0
  1748. cuda/cccl/headers/include/thrust/system/detail/sequential/transform_reduce.h +29 -0
  1749. cuda/cccl/headers/include/thrust/system/detail/sequential/transform_scan.h +29 -0
  1750. cuda/cccl/headers/include/thrust/system/detail/sequential/trivial_copy.h +58 -0
  1751. cuda/cccl/headers/include/thrust/system/detail/sequential/uninitialized_copy.h +29 -0
  1752. cuda/cccl/headers/include/thrust/system/detail/sequential/uninitialized_fill.h +29 -0
  1753. cuda/cccl/headers/include/thrust/system/detail/sequential/unique.h +115 -0
  1754. cuda/cccl/headers/include/thrust/system/detail/sequential/unique_by_key.h +106 -0
  1755. cuda/cccl/headers/include/thrust/system/detail/system_error.inl +108 -0
  1756. cuda/cccl/headers/include/thrust/system/error_code.h +512 -0
  1757. cuda/cccl/headers/include/thrust/system/omp/detail/adjacent_difference.h +54 -0
  1758. cuda/cccl/headers/include/thrust/system/omp/detail/assign_value.h +30 -0
  1759. cuda/cccl/headers/include/thrust/system/omp/detail/binary_search.h +77 -0
  1760. cuda/cccl/headers/include/thrust/system/omp/detail/copy.h +50 -0
  1761. cuda/cccl/headers/include/thrust/system/omp/detail/copy.inl +74 -0
  1762. cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.h +56 -0
  1763. cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.inl +59 -0
  1764. cuda/cccl/headers/include/thrust/system/omp/detail/count.h +30 -0
  1765. cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.h +50 -0
  1766. cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.inl +65 -0
  1767. cuda/cccl/headers/include/thrust/system/omp/detail/equal.h +30 -0
  1768. cuda/cccl/headers/include/thrust/system/omp/detail/execution_policy.h +127 -0
  1769. cuda/cccl/headers/include/thrust/system/omp/detail/extrema.h +66 -0
  1770. cuda/cccl/headers/include/thrust/system/omp/detail/fill.h +30 -0
  1771. cuda/cccl/headers/include/thrust/system/omp/detail/find.h +53 -0
  1772. cuda/cccl/headers/include/thrust/system/omp/detail/for_each.h +56 -0
  1773. cuda/cccl/headers/include/thrust/system/omp/detail/for_each.inl +87 -0
  1774. cuda/cccl/headers/include/thrust/system/omp/detail/gather.h +30 -0
  1775. cuda/cccl/headers/include/thrust/system/omp/detail/generate.h +30 -0
  1776. cuda/cccl/headers/include/thrust/system/omp/detail/get_value.h +30 -0
  1777. cuda/cccl/headers/include/thrust/system/omp/detail/inner_product.h +30 -0
  1778. cuda/cccl/headers/include/thrust/system/omp/detail/iter_swap.h +30 -0
  1779. cuda/cccl/headers/include/thrust/system/omp/detail/logical.h +30 -0
  1780. cuda/cccl/headers/include/thrust/system/omp/detail/malloc_and_free.h +30 -0
  1781. cuda/cccl/headers/include/thrust/system/omp/detail/memory.inl +93 -0
  1782. cuda/cccl/headers/include/thrust/system/omp/detail/merge.h +30 -0
  1783. cuda/cccl/headers/include/thrust/system/omp/detail/mismatch.h +30 -0
  1784. cuda/cccl/headers/include/thrust/system/omp/detail/partition.h +88 -0
  1785. cuda/cccl/headers/include/thrust/system/omp/detail/partition.inl +102 -0
  1786. cuda/cccl/headers/include/thrust/system/omp/detail/per_device_resource.h +29 -0
  1787. cuda/cccl/headers/include/thrust/system/omp/detail/pragma_omp.h +54 -0
  1788. cuda/cccl/headers/include/thrust/system/omp/detail/reduce.h +54 -0
  1789. cuda/cccl/headers/include/thrust/system/omp/detail/reduce.inl +78 -0
  1790. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.h +64 -0
  1791. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.inl +65 -0
  1792. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.h +59 -0
  1793. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.inl +103 -0
  1794. cuda/cccl/headers/include/thrust/system/omp/detail/remove.h +72 -0
  1795. cuda/cccl/headers/include/thrust/system/omp/detail/remove.inl +87 -0
  1796. cuda/cccl/headers/include/thrust/system/omp/detail/replace.h +30 -0
  1797. cuda/cccl/headers/include/thrust/system/omp/detail/reverse.h +30 -0
  1798. cuda/cccl/headers/include/thrust/system/omp/detail/scan.h +73 -0
  1799. cuda/cccl/headers/include/thrust/system/omp/detail/scan.inl +172 -0
  1800. cuda/cccl/headers/include/thrust/system/omp/detail/scan_by_key.h +36 -0
  1801. cuda/cccl/headers/include/thrust/system/omp/detail/scatter.h +30 -0
  1802. cuda/cccl/headers/include/thrust/system/omp/detail/sequence.h +30 -0
  1803. cuda/cccl/headers/include/thrust/system/omp/detail/set_operations.h +30 -0
  1804. cuda/cccl/headers/include/thrust/system/omp/detail/sort.h +60 -0
  1805. cuda/cccl/headers/include/thrust/system/omp/detail/sort.inl +265 -0
  1806. cuda/cccl/headers/include/thrust/system/omp/detail/swap_ranges.h +30 -0
  1807. cuda/cccl/headers/include/thrust/system/omp/detail/tabulate.h +30 -0
  1808. cuda/cccl/headers/include/thrust/system/omp/detail/temporary_buffer.h +29 -0
  1809. cuda/cccl/headers/include/thrust/system/omp/detail/transform.h +30 -0
  1810. cuda/cccl/headers/include/thrust/system/omp/detail/transform_reduce.h +30 -0
  1811. cuda/cccl/headers/include/thrust/system/omp/detail/transform_scan.h +30 -0
  1812. cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_copy.h +30 -0
  1813. cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_fill.h +30 -0
  1814. cuda/cccl/headers/include/thrust/system/omp/detail/unique.h +60 -0
  1815. cuda/cccl/headers/include/thrust/system/omp/detail/unique.inl +71 -0
  1816. cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.h +67 -0
  1817. cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.inl +75 -0
  1818. cuda/cccl/headers/include/thrust/system/omp/execution_policy.h +62 -0
  1819. cuda/cccl/headers/include/thrust/system/omp/memory.h +111 -0
  1820. cuda/cccl/headers/include/thrust/system/omp/memory_resource.h +75 -0
  1821. cuda/cccl/headers/include/thrust/system/omp/pointer.h +124 -0
  1822. cuda/cccl/headers/include/thrust/system/omp/vector.h +99 -0
  1823. cuda/cccl/headers/include/thrust/system/system_error.h +185 -0
  1824. cuda/cccl/headers/include/thrust/system/tbb/detail/adjacent_difference.h +54 -0
  1825. cuda/cccl/headers/include/thrust/system/tbb/detail/assign_value.h +30 -0
  1826. cuda/cccl/headers/include/thrust/system/tbb/detail/binary_search.h +30 -0
  1827. cuda/cccl/headers/include/thrust/system/tbb/detail/copy.h +50 -0
  1828. cuda/cccl/headers/include/thrust/system/tbb/detail/copy.inl +73 -0
  1829. cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.h +47 -0
  1830. cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.inl +136 -0
  1831. cuda/cccl/headers/include/thrust/system/tbb/detail/count.h +30 -0
  1832. cuda/cccl/headers/include/thrust/system/tbb/detail/equal.h +30 -0
  1833. cuda/cccl/headers/include/thrust/system/tbb/detail/execution_policy.h +109 -0
  1834. cuda/cccl/headers/include/thrust/system/tbb/detail/extrema.h +66 -0
  1835. cuda/cccl/headers/include/thrust/system/tbb/detail/fill.h +30 -0
  1836. cuda/cccl/headers/include/thrust/system/tbb/detail/find.h +49 -0
  1837. cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.h +51 -0
  1838. cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.inl +91 -0
  1839. cuda/cccl/headers/include/thrust/system/tbb/detail/gather.h +30 -0
  1840. cuda/cccl/headers/include/thrust/system/tbb/detail/generate.h +30 -0
  1841. cuda/cccl/headers/include/thrust/system/tbb/detail/get_value.h +30 -0
  1842. cuda/cccl/headers/include/thrust/system/tbb/detail/inner_product.h +30 -0
  1843. cuda/cccl/headers/include/thrust/system/tbb/detail/iter_swap.h +30 -0
  1844. cuda/cccl/headers/include/thrust/system/tbb/detail/logical.h +30 -0
  1845. cuda/cccl/headers/include/thrust/system/tbb/detail/malloc_and_free.h +30 -0
  1846. cuda/cccl/headers/include/thrust/system/tbb/detail/memory.inl +94 -0
  1847. cuda/cccl/headers/include/thrust/system/tbb/detail/merge.h +77 -0
  1848. cuda/cccl/headers/include/thrust/system/tbb/detail/merge.inl +327 -0
  1849. cuda/cccl/headers/include/thrust/system/tbb/detail/mismatch.h +30 -0
  1850. cuda/cccl/headers/include/thrust/system/tbb/detail/partition.h +84 -0
  1851. cuda/cccl/headers/include/thrust/system/tbb/detail/partition.inl +98 -0
  1852. cuda/cccl/headers/include/thrust/system/tbb/detail/per_device_resource.h +29 -0
  1853. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.h +54 -0
  1854. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.inl +137 -0
  1855. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.h +61 -0
  1856. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.inl +400 -0
  1857. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_intervals.h +140 -0
  1858. cuda/cccl/headers/include/thrust/system/tbb/detail/remove.h +76 -0
  1859. cuda/cccl/headers/include/thrust/system/tbb/detail/remove.inl +87 -0
  1860. cuda/cccl/headers/include/thrust/system/tbb/detail/replace.h +30 -0
  1861. cuda/cccl/headers/include/thrust/system/tbb/detail/reverse.h +30 -0
  1862. cuda/cccl/headers/include/thrust/system/tbb/detail/scan.h +59 -0
  1863. cuda/cccl/headers/include/thrust/system/tbb/detail/scan.inl +312 -0
  1864. cuda/cccl/headers/include/thrust/system/tbb/detail/scan_by_key.h +33 -0
  1865. cuda/cccl/headers/include/thrust/system/tbb/detail/scatter.h +30 -0
  1866. cuda/cccl/headers/include/thrust/system/tbb/detail/sequence.h +30 -0
  1867. cuda/cccl/headers/include/thrust/system/tbb/detail/set_operations.h +30 -0
  1868. cuda/cccl/headers/include/thrust/system/tbb/detail/sort.h +60 -0
  1869. cuda/cccl/headers/include/thrust/system/tbb/detail/sort.inl +295 -0
  1870. cuda/cccl/headers/include/thrust/system/tbb/detail/swap_ranges.h +30 -0
  1871. cuda/cccl/headers/include/thrust/system/tbb/detail/tabulate.h +30 -0
  1872. cuda/cccl/headers/include/thrust/system/tbb/detail/temporary_buffer.h +29 -0
  1873. cuda/cccl/headers/include/thrust/system/tbb/detail/transform.h +30 -0
  1874. cuda/cccl/headers/include/thrust/system/tbb/detail/transform_reduce.h +30 -0
  1875. cuda/cccl/headers/include/thrust/system/tbb/detail/transform_scan.h +30 -0
  1876. cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_copy.h +30 -0
  1877. cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_fill.h +30 -0
  1878. cuda/cccl/headers/include/thrust/system/tbb/detail/unique.h +60 -0
  1879. cuda/cccl/headers/include/thrust/system/tbb/detail/unique.inl +71 -0
  1880. cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.h +67 -0
  1881. cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.inl +75 -0
  1882. cuda/cccl/headers/include/thrust/system/tbb/execution_policy.h +62 -0
  1883. cuda/cccl/headers/include/thrust/system/tbb/memory.h +111 -0
  1884. cuda/cccl/headers/include/thrust/system/tbb/memory_resource.h +75 -0
  1885. cuda/cccl/headers/include/thrust/system/tbb/pointer.h +124 -0
  1886. cuda/cccl/headers/include/thrust/system/tbb/vector.h +99 -0
  1887. cuda/cccl/headers/include/thrust/system_error.h +57 -0
  1888. cuda/cccl/headers/include/thrust/tabulate.h +125 -0
  1889. cuda/cccl/headers/include/thrust/transform.h +1045 -0
  1890. cuda/cccl/headers/include/thrust/transform_reduce.h +190 -0
  1891. cuda/cccl/headers/include/thrust/transform_scan.h +442 -0
  1892. cuda/cccl/headers/include/thrust/tuple.h +139 -0
  1893. cuda/cccl/headers/include/thrust/type_traits/integer_sequence.h +261 -0
  1894. cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +154 -0
  1895. cuda/cccl/headers/include/thrust/type_traits/is_execution_policy.h +65 -0
  1896. cuda/cccl/headers/include/thrust/type_traits/is_operator_less_or_greater_function_object.h +184 -0
  1897. cuda/cccl/headers/include/thrust/type_traits/is_operator_plus_function_object.h +116 -0
  1898. cuda/cccl/headers/include/thrust/type_traits/is_trivially_relocatable.h +336 -0
  1899. cuda/cccl/headers/include/thrust/type_traits/logical_metafunctions.h +42 -0
  1900. cuda/cccl/headers/include/thrust/type_traits/unwrap_contiguous_iterator.h +63 -0
  1901. cuda/cccl/headers/include/thrust/uninitialized_copy.h +300 -0
  1902. cuda/cccl/headers/include/thrust/uninitialized_fill.h +268 -0
  1903. cuda/cccl/headers/include/thrust/unique.h +1088 -0
  1904. cuda/cccl/headers/include/thrust/universal_allocator.h +93 -0
  1905. cuda/cccl/headers/include/thrust/universal_ptr.h +34 -0
  1906. cuda/cccl/headers/include/thrust/universal_vector.h +71 -0
  1907. cuda/cccl/headers/include/thrust/version.h +93 -0
  1908. cuda/cccl/headers/include/thrust/zip_function.h +176 -0
  1909. cuda/cccl/headers/include_paths.py +51 -0
  1910. cuda/cccl/parallel/__init__.py +9 -0
  1911. cuda/cccl/parallel/experimental/__init__.py +24 -0
  1912. cuda/cccl/py.typed +0 -0
  1913. cuda/compute/__init__.py +79 -0
  1914. cuda/compute/_bindings.py +79 -0
  1915. cuda/compute/_bindings.pyi +475 -0
  1916. cuda/compute/_bindings_impl.pyx +2273 -0
  1917. cuda/compute/_caching.py +71 -0
  1918. cuda/compute/_cccl_interop.py +422 -0
  1919. cuda/compute/_utils/__init__.py +0 -0
  1920. cuda/compute/_utils/protocols.py +132 -0
  1921. cuda/compute/_utils/temp_storage_buffer.py +86 -0
  1922. cuda/compute/algorithms/__init__.py +54 -0
  1923. cuda/compute/algorithms/_histogram.py +243 -0
  1924. cuda/compute/algorithms/_merge_sort.py +225 -0
  1925. cuda/compute/algorithms/_radix_sort.py +312 -0
  1926. cuda/compute/algorithms/_reduce.py +182 -0
  1927. cuda/compute/algorithms/_scan.py +331 -0
  1928. cuda/compute/algorithms/_segmented_reduce.py +257 -0
  1929. cuda/compute/algorithms/_three_way_partition.py +261 -0
  1930. cuda/compute/algorithms/_transform.py +329 -0
  1931. cuda/compute/algorithms/_unique_by_key.py +252 -0
  1932. cuda/compute/cccl/.gitkeep +0 -0
  1933. cuda/compute/cu12/_bindings_impl.cp313-win_amd64.pyd +0 -0
  1934. cuda/compute/cu12/cccl/cccl.c.parallel.dll +0 -0
  1935. cuda/compute/cu12/cccl/cccl.c.parallel.lib +0 -0
  1936. cuda/compute/cu13/_bindings_impl.cp313-win_amd64.pyd +0 -0
  1937. cuda/compute/cu13/cccl/cccl.c.parallel.dll +0 -0
  1938. cuda/compute/cu13/cccl/cccl.c.parallel.lib +0 -0
  1939. cuda/compute/iterators/__init__.py +21 -0
  1940. cuda/compute/iterators/_factories.py +219 -0
  1941. cuda/compute/iterators/_iterators.py +817 -0
  1942. cuda/compute/iterators/_zip_iterator.py +199 -0
  1943. cuda/compute/numba_utils.py +53 -0
  1944. cuda/compute/op.py +3 -0
  1945. cuda/compute/struct.py +272 -0
  1946. cuda/compute/typing.py +37 -0
  1947. cuda/coop/__init__.py +8 -0
  1948. cuda/coop/_caching.py +48 -0
  1949. cuda/coop/_common.py +275 -0
  1950. cuda/coop/_nvrtc.py +92 -0
  1951. cuda/coop/_scan_op.py +181 -0
  1952. cuda/coop/_types.py +937 -0
  1953. cuda/coop/_typing.py +107 -0
  1954. cuda/coop/block/__init__.py +39 -0
  1955. cuda/coop/block/_block_exchange.py +251 -0
  1956. cuda/coop/block/_block_load_store.py +215 -0
  1957. cuda/coop/block/_block_merge_sort.py +125 -0
  1958. cuda/coop/block/_block_radix_sort.py +214 -0
  1959. cuda/coop/block/_block_reduce.py +294 -0
  1960. cuda/coop/block/_block_scan.py +983 -0
  1961. cuda/coop/warp/__init__.py +9 -0
  1962. cuda/coop/warp/_warp_merge_sort.py +92 -0
  1963. cuda/coop/warp/_warp_reduce.py +153 -0
  1964. cuda/coop/warp/_warp_scan.py +78 -0
  1965. cuda_cccl-0.3.3.dist-info/METADATA +41 -0
  1966. cuda_cccl-0.3.3.dist-info/RECORD +1968 -0
  1967. cuda_cccl-0.3.3.dist-info/WHEEL +5 -0
  1968. cuda_cccl-0.3.3.dist-info/licenses/LICENSE +1 -0
@@ -0,0 +1,2811 @@
1
+ /******************************************************************************
2
+ * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved.
3
+ *
4
+ * Redistribution and use in source and binary forms, with or without
5
+ * modification, are permitted provided that the following conditions are met:
6
+ * * Redistributions of source code must retain the above copyright
7
+ * notice, this list of conditions and the following disclaimer.
8
+ * * Redistributions in binary form must reproduce the above copyright
9
+ * notice, this list of conditions and the following disclaimer in the
10
+ * documentation and/or other materials provided with the distribution.
11
+ * * Neither the name of the NVIDIA CORPORATION nor the
12
+ * names of its contributors may be used to endorse or promote products
13
+ * derived from this software without specific prior written permission.
14
+ *
15
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
19
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25
+ *
26
+ ******************************************************************************/
27
+
28
+ //! @file
29
+ //! cub::DeviceSegmentedSort provides device-wide, parallel operations for computing a batched sort across multiple,
30
+ //! non-overlapping sequences of data items residing within device-accessible memory.
31
+
32
+ #pragma once
33
+
34
+ #include <cub/config.cuh>
35
+
36
+ #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
37
+ # pragma GCC system_header
38
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
39
+ # pragma clang system_header
40
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
41
+ # pragma system_header
42
+ #endif // no system header
43
+
44
+ #include <cub/detail/choose_offset.cuh>
45
+ #include <cub/device/dispatch/dispatch_segmented_sort.cuh>
46
+ #include <cub/util_namespace.cuh>
47
+
48
+ #include <cuda/std/cstdint>
49
+
50
+ CUB_NAMESPACE_BEGIN
51
+
52
+ //! @rst
53
+ //! DeviceSegmentedSort provides device-wide, parallel operations for
54
+ //! computing a batched sort across multiple, non-overlapping sequences of
55
+ //! data items residing within device-accessible memory.
56
+ //!
57
+ //! Overview
58
+ //! +++++++++++++++++++++++++++++++++++++++++++++
59
+ //!
60
+ //! The algorithm arranges items into ascending (or descending) order.
61
+ //! The underlying sorting algorithm is undefined. Depending on the segment size,
62
+ //! it might be radix sort, merge sort or something else. Therefore, no
63
+ //! assumptions on the underlying implementation should be made.
64
+ //!
65
+ //! Differences from DeviceSegmentedRadixSort
66
+ //! +++++++++++++++++++++++++++++++++++++++++++++
67
+ //!
68
+ //! DeviceSegmentedRadixSort is optimized for significantly large segments (tens
69
+ //! of thousands of items and more). Nevertheless, some domains produce a wide
70
+ //! range of segment sizes. DeviceSegmentedSort partitions segments into size
71
+ //! groups and specializes sorting algorithms for each group. This approach leads
72
+ //! to better resource utilization in the presence of segment size imbalance or
73
+ //! moderate segment sizes (up to thousands of items).
74
+ //! This algorithm is more complex and consists of multiple kernels. This fact
75
+ //! leads to longer compilation times as well as larger binary sizes.
76
+ //!
77
+ //! Supported Types
78
+ //! +++++++++++++++++++++++++++++++++++++++++++++
79
+ //!
80
+ //! The algorithm has to satisfy the underlying algorithms' restrictions. Radix
81
+ //! sort usage restricts the list of supported types. Therefore,
82
+ //! DeviceSegmentedSort can sort all of the built-in C++ numeric primitive types
83
+ //! (``unsigned char``, ``int``, ``double``, etc.) as well as CUDA's ``__half`` and
84
+ //! ``__nv_bfloat16`` 16-bit floating-point types.
85
+ //!
86
+ //! Segments are not required to be contiguous. Any element of input(s) or
87
+ //! output(s) outside the specified segments will not be accessed nor modified.
88
+ //!
89
+ //! A simple example
90
+ //! +++++++++++++++++++++++++++++++++++++++++++++
91
+ //!
92
+ //! .. code-block:: c++
93
+ //!
94
+ //! #include <cub/cub.cuh>
95
+ //! // or equivalently <cub/device/device_segmented_sort.cuh>
96
+ //!
97
+ //! // Declare, allocate, and initialize device-accessible pointers
98
+ //! // for sorting data
99
+ //! int num_items; // e.g., 7
100
+ //! int num_segments; // e.g., 3
101
+ //! int *d_offsets; // e.g., [0, 3, 3, 7]
102
+ //! int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
103
+ //! int *d_keys_out; // e.g., [-, -, -, -, -, -, -]
104
+ //! int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6]
105
+ //! int *d_values_out; // e.g., [-, -, -, -, -, -, -]
106
+ //! ...
107
+ //!
108
+ //! // Determine temporary device storage requirements
109
+ //! void *d_temp_storage = nullptr;
110
+ //! size_t temp_storage_bytes = 0;
111
+ //! cub::DeviceSegmentedSort::SortPairs(
112
+ //! d_temp_storage, temp_storage_bytes,
113
+ //! d_keys_in, d_keys_out, d_values_in, d_values_out,
114
+ //! num_items, num_segments, d_offsets, d_offsets + 1);
115
+ //!
116
+ //! // Allocate temporary storage
117
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
118
+ //!
119
+ //! // Run sorting operation
120
+ //! cub::DeviceSegmentedSort::SortPairs(
121
+ //! d_temp_storage, temp_storage_bytes,
122
+ //! d_keys_in, d_keys_out, d_values_in, d_values_out,
123
+ //! num_items, num_segments, d_offsets, d_offsets + 1);
124
+ //!
125
+ //! // d_keys_out <-- [6, 7, 8, 0, 3, 5, 9]
126
+ //! // d_values_out <-- [1, 2, 0, 5, 4, 3, 6]
127
+ //!
128
+ //! @endrst
129
+ struct DeviceSegmentedSort
130
+ {
131
+ private:
132
+ // Name reported for NVTX ranges
133
+ _CCCL_HOST_DEVICE static constexpr auto GetName() -> const char*
134
+ {
135
+ return "cub::DeviceSegmentedSort"; // label handed to _CCCL_NVTX_RANGE_SCOPE_IF by SortKeys
136
+ }
137
+
138
+ // Internal version without NVTX range
139
+ template <typename KeyT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
140
+ CUB_RUNTIME_FUNCTION static cudaError_t SortKeysNoNVTX( // forwards to DispatchSegmentedSort::Dispatch; opens no NVTX range itself
141
+ void* d_temp_storage,
142
+ size_t& temp_storage_bytes,
143
+ const KeyT* d_keys_in,
144
+ KeyT* d_keys_out,
145
+ ::cuda::std::int64_t num_items,
146
+ ::cuda::std::int64_t num_segments,
147
+ BeginOffsetIteratorT d_begin_offsets,
148
+ EndOffsetIteratorT d_end_offsets,
149
+ cudaStream_t stream = 0)
150
+ {
151
+ constexpr bool is_overwrite_okay = false; // passed to Dispatch below; presumably forbids clobbering the input buffer - confirm in DispatchSegmentedSort
152
+
153
+ using OffsetT =
154
+ detail::choose_signed_offset_t<detail::common_iterator_value_t<BeginOffsetIteratorT, EndOffsetIteratorT>>; // presumably a signed offset type chosen from the iterators' common value type - confirm in choose_offset.cuh
155
+ using DispatchT =
156
+ DispatchSegmentedSort<SortOrder::Ascending, KeyT, cub::NullType, OffsetT, BeginOffsetIteratorT, EndOffsetIteratorT>; // ascending order; cub::NullType where a value type would presumably go (keys only)
157
+
158
+ DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out); // const_cast needed to fit DoubleBuffer<KeyT>; NOTE(review): relies on is_overwrite_okay == false to leave d_keys_in unmodified - confirm
159
+ DoubleBuffer<NullType> d_values; // default-constructed: no value buffers are supplied
160
+
161
+ return DispatchT::Dispatch(
162
+ d_temp_storage,
163
+ temp_storage_bytes,
164
+ d_keys,
165
+ d_values,
166
+ num_items,
167
+ num_segments,
168
+ d_begin_offsets,
169
+ d_end_offsets,
170
+ is_overwrite_okay,
171
+ stream);
172
+ }
173
+
174
+ public:
175
+ //! @name Keys-only
176
+ //! @{
177
+
178
+ //! @rst
179
+ //! Sorts segments of keys into ascending order.
180
+ //! Approximately ``num_items + 2 * num_segments`` auxiliary storage required.
181
+ //!
182
+ //! - The contents of the input data are not altered by the sorting operation.
183
+ //! - When the input is a contiguous sequence of segments, a single sequence
184
+ //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased
185
+ //! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where
186
+ //! the latter is specified as ``segment_offsets + 1``).
187
+ //! - SortKeys is not guaranteed to be stable. That is, suppose that ``i`` and
188
+ //! ``j`` are equivalent: neither one is less than the other. It is not
189
+ //! guaranteed that the relative order of these two elements will be
190
+ //! preserved by sort.
191
+ //! - The range ``[d_keys_out, d_keys_out + num_items)`` shall not overlap
192
+ //! ``[d_keys_in, d_keys_in + num_items)``,
193
+ //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor
194
+ //! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way.
195
+ //! - Segments are not required to be contiguous. For all index values ``i``
196
+ //! outside the specified segments ``d_keys_in[i]``, ``d_keys_out[i]`` will not
197
+ //! be accessed nor modified.
198
+ //!
199
+ //! Snippet
200
+ //! +++++++++++++++++++++++++++++++++++++++++++++
201
+ //!
202
+ //! The code snippet below illustrates the batched sorting of three segments
203
+ //! (with one zero-length segment) of ``int`` keys.
204
+ //!
205
+ //! .. code-block:: c++
206
+ //!
207
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_segmented_sort.cuh>
208
+ //!
209
+ //! // Declare, allocate, and initialize device-accessible
210
+ //! // pointers for sorting data
211
+ //! int num_items; // e.g., 7
212
+ //! int num_segments; // e.g., 3
213
+ //! int *d_offsets; // e.g., [0, 3, 3, 7]
214
+ //! int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
215
+ //! int *d_keys_out; // e.g., [-, -, -, -, -, -, -]
216
+ //! ...
217
+ //!
218
+ //! // Determine temporary device storage requirements
219
+ //! void *d_temp_storage = nullptr;
220
+ //! size_t temp_storage_bytes = 0;
221
+ //! cub::DeviceSegmentedSort::SortKeys(
222
+ //! d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out,
223
+ //! num_items, num_segments, d_offsets, d_offsets + 1);
224
+ //!
225
+ //! // Allocate temporary storage
226
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
227
+ //!
228
+ //! // Run sorting operation
229
+ //! cub::DeviceSegmentedSort::SortKeys(
230
+ //! d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out,
231
+ //! num_items, num_segments, d_offsets, d_offsets + 1);
232
+ //!
233
+ //! // d_keys_out <-- [6, 7, 8, 0, 3, 5, 9]
234
+ //!
235
+ //! @endrst
236
+ //!
237
+ //! @tparam KeyT
238
+ //! **[inferred]** Key type
239
+ //!
240
+ //! @tparam BeginOffsetIteratorT
241
+ //! **[inferred]** Random-access input iterator type for reading segment
242
+ //! beginning offsets @iterator
243
+ //!
244
+ //! @tparam EndOffsetIteratorT
245
+ //! **[inferred]** Random-access input iterator type for reading segment
246
+ //! ending offsets @iterator
247
+ //!
248
+ //! @param[in] d_temp_storage
249
+ //! Device-accessible allocation of temporary storage. When nullptr, the
250
+ //! required allocation size is written to `temp_storage_bytes` and no work
251
+ //! is done
252
+ //!
253
+ //! @param[in,out] temp_storage_bytes
254
+ //! Reference to size in bytes of `d_temp_storage` allocation
255
+ //!
256
+ //! @param[in] d_keys_in
257
+ //! Device-accessible pointer to the input sequence of key data to sort
258
+ //!
259
+ //! @param[out] d_keys_out
260
+ //! Device-accessible pointer to the sorted output sequence of key data
261
+ //!
262
+ //! @param[in] num_items
263
+ //! The total number of items to sort (across all segments)
264
+ //!
265
+ //! @param[in] num_segments
266
+ //! The number of segments that comprise the sorting data
267
+ //!
268
+ //! @param[in] d_begin_offsets
269
+ //! @rst
270
+ //! Random-access input iterator to the sequence of beginning offsets of
271
+ //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first
272
+ //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``
273
+ //! @endrst
274
+ //!
275
+ //! @param[in] d_end_offsets
276
+ //! @rst
277
+ //! Random-access input iterator to the sequence of ending offsets of length
278
+ //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of
279
+ //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``.
280
+ //! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the ``i``-th segment is considered empty.
281
+ //! @endrst
282
+ //!
283
+ //! @param[in] stream
284
+ //! @rst
285
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
286
+ //! @endrst
287
+ template <typename KeyT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
288
+ CUB_RUNTIME_FUNCTION static cudaError_t SortKeys(
289
+ void* d_temp_storage,
290
+ size_t& temp_storage_bytes,
291
+ const KeyT* d_keys_in,
292
+ KeyT* d_keys_out,
293
+ ::cuda::std::int64_t num_items,
294
+ ::cuda::std::int64_t num_segments,
295
+ BeginOffsetIteratorT d_begin_offsets,
296
+ EndOffsetIteratorT d_end_offsets,
297
+ cudaStream_t stream = 0)
298
+ { // Public entry point: thin wrapper that adds profiling annotation around the internal implementation
299
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); // NVTX range keyed on d_temp_storage and GetName(); presumably skipped when d_temp_storage is nullptr (size query) - confirm against the macro definition
300
+ return SortKeysNoNVTX( // forward every argument unchanged to the pointer-based implementation that opens no NVTX range
301
+ d_temp_storage,
302
+ temp_storage_bytes,
303
+ d_keys_in,
304
+ d_keys_out,
305
+ num_items,
306
+ num_segments,
307
+ d_begin_offsets,
308
+ d_end_offsets,
309
+ stream);
310
+ }
311
+
312
+ private:
313
+ // Internal version without NVTX range: keys-only descending segmented sort from d_keys_in into d_keys_out, dispatched through DispatchSegmentedSort
314
+ template <typename KeyT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
315
+ CUB_RUNTIME_FUNCTION static cudaError_t SortKeysDescendingNoNVTX(
316
+ void* d_temp_storage,
317
+ size_t& temp_storage_bytes,
318
+ const KeyT* d_keys_in,
319
+ KeyT* d_keys_out,
320
+ ::cuda::std::int64_t num_items,
321
+ ::cuda::std::int64_t num_segments,
322
+ BeginOffsetIteratorT d_begin_offsets,
323
+ EndOffsetIteratorT d_end_offsets,
324
+ cudaStream_t stream = 0)
325
+ {
326
+ constexpr bool is_overwrite_okay = false; // handed to Dispatch below; the public documentation promises the input data are not altered
327
+
328
+ using OffsetT = // offset type derived from the value types of both offset iterators (presumably a signed type - see detail::choose_signed_offset_t)
329
+ detail::choose_signed_offset_t<detail::common_iterator_value_t<BeginOffsetIteratorT, EndOffsetIteratorT>>;
330
+ using DispatchT = // keys-only dispatch: the value type is cub::NullType
331
+ DispatchSegmentedSort<SortOrder::Descending, KeyT, cub::NullType, OffsetT, BeginOffsetIteratorT, EndOffsetIteratorT>;
332
+
333
+ DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out); // wrap the in/out pointers; const is cast away only to fit DoubleBuffer<KeyT>
334
+ DoubleBuffer<NullType> d_values; // placeholder value buffer required by the Dispatch signature
335
+
336
+ return DispatchT::Dispatch( // the cudaError_t returned by Dispatch is passed straight back to the caller
337
+ d_temp_storage,
338
+ temp_storage_bytes,
339
+ d_keys,
340
+ d_values,
341
+ num_items,
342
+ num_segments,
343
+ d_begin_offsets,
344
+ d_end_offsets,
345
+ is_overwrite_okay,
346
+ stream);
347
+ }
348
+
349
+ public:
350
+ //! @rst
351
+ //! Sorts segments of keys into descending order. Approximately
352
+ //! ``num_items + 2 * num_segments`` auxiliary storage required.
353
+ //!
354
+ //! - The contents of the input data are not altered by the sorting operation.
355
+ //! - When the input is a contiguous sequence of segments, a single sequence
356
+ //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased
357
+ //! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where
358
+ //! the latter is specified as ``segment_offsets + 1``).
359
+ //! - SortKeysDescending is not guaranteed to be stable. That is, suppose that
360
+ //! ``i`` and ``j`` are equivalent: neither one is less than the other. It is
361
+ //! not guaranteed that the relative order of these two elements will be
362
+ //! preserved by sort.
363
+ //! - The range ``[d_keys_out, d_keys_out + num_items)`` shall not overlap
364
+ //! ``[d_keys_in, d_keys_in + num_items)``,
365
+ //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor
366
+ //! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way.
367
+ //! - Segments are not required to be contiguous. For all index values ``i``
368
+ //! outside the specified segments ``d_keys_in[i]``, ``d_keys_out[i]`` will not
369
+ //! be accessed nor modified.
370
+ //!
371
+ //! Snippet
372
+ //! +++++++++++++++++++++++++++++++++++++++++++++
373
+ //!
374
+ //! The code snippet below illustrates the batched sorting of three segments
375
+ //! (with one zero-length segment) of ``int`` keys.
376
+ //!
377
+ //! .. code-block:: c++
378
+ //!
379
+ //! #include <cub/cub.cuh>
380
+ //! // or equivalently <cub/device/device_segmented_sort.cuh>
381
+ //!
382
+ //! // Declare, allocate, and initialize device-accessible pointers
383
+ //! // for sorting data
384
+ //! int num_items; // e.g., 7
385
+ //! int num_segments; // e.g., 3
386
+ //! int *d_offsets; // e.g., [0, 3, 3, 7]
387
+ //! int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
388
+ //! int *d_keys_out; // e.g., [-, -, -, -, -, -, -]
389
+ //! ...
390
+ //!
391
+ //! // Determine temporary device storage requirements
392
+ //! void *d_temp_storage = nullptr;
393
+ //! size_t temp_storage_bytes = 0;
394
+ //! cub::DeviceSegmentedSort::SortKeysDescending(
395
+ //! d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out,
396
+ //! num_items, num_segments, d_offsets, d_offsets + 1);
397
+ //!
398
+ //! // Allocate temporary storage
399
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
400
+ //!
401
+ //! // Run sorting operation
402
+ //! cub::DeviceSegmentedSort::SortKeysDescending(
403
+ //! d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out,
404
+ //! num_items, num_segments, d_offsets, d_offsets + 1);
405
+ //!
406
+ //! // d_keys_out <-- [8, 7, 6, 9, 5, 3, 0]
407
+ //!
408
+ //! @endrst
409
+ //!
410
+ //! @tparam KeyT
411
+ //! **[inferred]** Key type
412
+ //!
413
+ //! @tparam BeginOffsetIteratorT
414
+ //! **[inferred]** Random-access input iterator type for reading segment beginning offsets @iterator
415
+ //!
416
+ //! @tparam EndOffsetIteratorT
417
+ //! **[inferred]** Random-access input iterator type for reading segment ending offsets @iterator
418
+ //!
419
+ //! @param[in] d_temp_storage
420
+ //! Device-accessible allocation of temporary storage. When nullptr, the
421
+ //! required allocation size is written to `temp_storage_bytes` and no work is done
422
+ //!
423
+ //! @param[in,out] temp_storage_bytes
424
+ //! Reference to size in bytes of `d_temp_storage` allocation
425
+ //!
426
+ //! @param[in] d_keys_in
427
+ //! Device-accessible pointer to the input sequence of key data to sort
428
+ //!
429
+ //! @param[out] d_keys_out
430
+ //! Device-accessible pointer to the sorted output sequence of key data
431
+ //!
432
+ //! @param[in] num_items
433
+ //! The total number of items to sort (across all segments)
434
+ //!
435
+ //! @param[in] num_segments
436
+ //! The number of segments that comprise the sorting data
437
+ //!
438
+ //! @param[in] d_begin_offsets
439
+ //! @rst
440
+ //! Random-access input iterator to the sequence of beginning offsets of
441
+ //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first
442
+ //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``
443
+ //! @endrst
444
+ //!
445
+ //! @param[in] d_end_offsets
446
+ //! @rst
447
+ //! Random-access input iterator to the sequence of ending offsets of length
448
+ //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of
449
+ //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``.
450
+ //! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the ``i``-th segment is considered empty.
451
+ //! @endrst
452
+ //!
453
+ //! @param[in] stream
454
+ //! @rst
455
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
456
+ //! @endrst
457
+ template <typename KeyT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
458
+ CUB_RUNTIME_FUNCTION static cudaError_t SortKeysDescending(
459
+ void* d_temp_storage,
460
+ size_t& temp_storage_bytes,
461
+ const KeyT* d_keys_in,
462
+ KeyT* d_keys_out,
463
+ ::cuda::std::int64_t num_items,
464
+ ::cuda::std::int64_t num_segments,
465
+ BeginOffsetIteratorT d_begin_offsets,
466
+ EndOffsetIteratorT d_end_offsets,
467
+ cudaStream_t stream = 0)
468
+ { // Public entry point: thin wrapper that adds profiling annotation around the internal implementation
469
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); // NVTX range keyed on d_temp_storage and GetName(); presumably skipped when d_temp_storage is nullptr (size query) - confirm against the macro definition
470
+ return SortKeysDescendingNoNVTX( // forward every argument unchanged to the pointer-based descending implementation that opens no NVTX range
471
+ d_temp_storage,
472
+ temp_storage_bytes,
473
+ d_keys_in,
474
+ d_keys_out,
475
+ num_items,
476
+ num_segments,
477
+ d_begin_offsets,
478
+ d_end_offsets,
479
+ stream);
480
+ }
481
+
482
+ private:
483
+ // Internal version without NVTX range: keys-only ascending segmented sort operating on a caller-supplied DoubleBuffer, dispatched through DispatchSegmentedSort
484
+ template <typename KeyT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
485
+ CUB_RUNTIME_FUNCTION static cudaError_t SortKeysNoNVTX(
486
+ void* d_temp_storage,
487
+ size_t& temp_storage_bytes,
488
+ DoubleBuffer<KeyT>& d_keys,
489
+ ::cuda::std::int64_t num_items,
490
+ ::cuda::std::int64_t num_segments,
491
+ BeginOffsetIteratorT d_begin_offsets,
492
+ EndOffsetIteratorT d_end_offsets,
493
+ cudaStream_t stream = 0)
494
+ {
495
+ constexpr bool is_overwrite_okay = true; // handed to Dispatch below; the public documentation states both buffers of d_keys may be altered
496
+ using OffsetT = // offset type derived from the value types of both offset iterators (presumably a signed type - see detail::choose_signed_offset_t)
497
+ detail::choose_signed_offset_t<detail::common_iterator_value_t<BeginOffsetIteratorT, EndOffsetIteratorT>>;
498
+ using DispatchT = // keys-only dispatch: the value type is cub::NullType
499
+ DispatchSegmentedSort<SortOrder::Ascending, KeyT, cub::NullType, OffsetT, BeginOffsetIteratorT, EndOffsetIteratorT>;
500
+
501
+ DoubleBuffer<NullType> d_values; // placeholder value buffer required by the Dispatch signature
502
+
503
+ return DispatchT::Dispatch( // d_keys is passed by reference, so the caller's DoubleBuffer is the one Dispatch operates on
504
+ d_temp_storage,
505
+ temp_storage_bytes,
506
+ d_keys,
507
+ d_values,
508
+ num_items,
509
+ num_segments,
510
+ d_begin_offsets,
511
+ d_end_offsets,
512
+ is_overwrite_okay,
513
+ stream);
514
+ }
515
+
516
+ public:
517
+ //! @rst
518
+ //! Sorts segments of keys into ascending order. Approximately ``2 * num_segments`` auxiliary storage required.
519
+ //!
520
+ //! - The sorting operation is given a pair of key buffers managed by a
521
+ //! DoubleBuffer structure that indicates which of the two buffers is
522
+ //! "current" (and thus contains the input data to be sorted).
523
+ //! - The contents of both buffers may be altered by the sorting operation.
524
+ //! - Upon completion, the sorting operation will update the "current"
525
+ //! indicator within the DoubleBuffer wrapper to reference which of the two
526
+ //! buffers now contains the sorted output sequence (a function of the number
527
+ //! of key bits and the targeted device architecture).
528
+ //! - When the input is a contiguous sequence of segments, a single sequence
529
+ //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased
530
+ //! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where
531
+ //! the latter is specified as ``segment_offsets + 1``).
532
+ //! - SortKeys is not guaranteed to be stable. That is, suppose that
533
+ //! ``i`` and ``j`` are equivalent: neither one is less than the other. It is
534
+ //! not guaranteed that the relative order of these two elements will be
535
+ //! preserved by sort.
536
+ //! - Let ``cur = d_keys.Current()`` and ``alt = d_keys.Alternate()``.
537
+ //! The range ``[cur, cur + num_items)`` shall not overlap
538
+ //! ``[alt, alt + num_items)``. Both ranges shall not overlap
539
+ //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor
540
+ //! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way.
541
+ //! - Segments are not required to be contiguous. For all index values ``i``
542
+ //! outside the specified segments ``d_keys.Current()[i]``,
543
+ //! ``d_keys.Alternate()[i]`` will not be accessed nor modified.
544
+ //!
545
+ //! Snippet
546
+ //! +++++++++++++++++++++++++++++++++++++++++++++
547
+ //!
548
+ //! The code snippet below illustrates the batched sorting of three segments
549
+ //! (with one zero-length segment) of ``int`` keys.
550
+ //!
551
+ //! .. code-block:: c++
552
+ //!
553
+ //! #include <cub/cub.cuh>
554
+ //! // or equivalently <cub/device/device_segmented_sort.cuh>
555
+ //!
556
+ //! // Declare, allocate, and initialize device-accessible
557
+ //! // pointers for sorting data
558
+ //! int num_items; // e.g., 7
559
+ //! int num_segments; // e.g., 3
560
+ //! int *d_offsets; // e.g., [0, 3, 3, 7]
561
+ //! int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9]
562
+ //! int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -]
563
+ //! ...
564
+ //!
565
+ //! // Create a DoubleBuffer to wrap the pair of device pointers
566
+ //! cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
567
+ //!
568
+ //! // Determine temporary device storage requirements
569
+ //! void *d_temp_storage = nullptr;
570
+ //! size_t temp_storage_bytes = 0;
571
+ //! cub::DeviceSegmentedSort::SortKeys(
572
+ //! d_temp_storage, temp_storage_bytes, d_keys,
573
+ //! num_items, num_segments, d_offsets, d_offsets + 1);
574
+ //!
575
+ //! // Allocate temporary storage
576
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
577
+ //!
578
+ //! // Run sorting operation
579
+ //! cub::DeviceSegmentedSort::SortKeys(
580
+ //! d_temp_storage, temp_storage_bytes, d_keys,
581
+ //! num_items, num_segments, d_offsets, d_offsets + 1);
582
+ //!
583
+ //! // d_keys.Current() <-- [6, 7, 8, 0, 3, 5, 9]
584
+ //!
585
+ //! @endrst
586
+ //!
587
+ //! @tparam KeyT
588
+ //! **[inferred]** Key type
589
+ //!
590
+ //! @tparam BeginOffsetIteratorT
591
+ //! **[inferred]** Random-access input iterator type for reading segment
592
+ //! beginning offsets @iterator
593
+ //!
594
+ //! @tparam EndOffsetIteratorT
595
+ //! **[inferred]** Random-access input iterator type for reading segment
596
+ //! ending offsets @iterator
597
+ //!
598
+ //! @param[in] d_temp_storage
599
+ //! Device-accessible allocation of temporary storage. When nullptr, the
600
+ //! required allocation size is written to `temp_storage_bytes` and no
601
+ //! work is done
602
+ //!
603
+ //! @param[in,out] temp_storage_bytes
604
+ //! Reference to size in bytes of `d_temp_storage` allocation
605
+ //!
606
+ //! @param[in,out] d_keys
607
+ //! Reference to the double-buffer of keys whose "current" device-accessible
608
+ //! buffer contains the unsorted input keys and, upon return, is updated to
609
+ //! point to the sorted output keys
610
+ //!
611
+ //! @param[in] num_items
612
+ //! The total number of items to sort (across all segments)
613
+ //!
614
+ //! @param[in] num_segments
615
+ //! The number of segments that comprise the sorting data
616
+ //!
617
+ //! @param[in] d_begin_offsets
618
+ //! @rst
619
+ //! Random-access input iterator to the sequence of beginning offsets of
620
+ //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first
621
+ //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``
622
+ //! @endrst
623
+ //!
624
+ //! @param[in] d_end_offsets
625
+ //! @rst
626
+ //! Random-access input iterator to the sequence of ending offsets of length
627
+ //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of
628
+ //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``.
629
+ //! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the ``i``-th segment is considered empty.
630
+ //! @endrst
631
+ //!
632
+ //! @param[in] stream
633
+ //! @rst
634
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
635
+ //! @endrst
636
+ template <typename KeyT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
637
+ CUB_RUNTIME_FUNCTION static cudaError_t SortKeys(
638
+ void* d_temp_storage,
639
+ size_t& temp_storage_bytes,
640
+ DoubleBuffer<KeyT>& d_keys,
641
+ ::cuda::std::int64_t num_items,
642
+ ::cuda::std::int64_t num_segments,
643
+ BeginOffsetIteratorT d_begin_offsets,
644
+ EndOffsetIteratorT d_end_offsets,
645
+ cudaStream_t stream = 0)
646
+ { // Public entry point: thin wrapper that adds profiling annotation around the internal implementation
647
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); // NVTX range keyed on d_temp_storage and GetName(); presumably skipped when d_temp_storage is nullptr (size query) - confirm against the macro definition
648
+ return SortKeysNoNVTX( // forward every argument unchanged to the DoubleBuffer-based implementation that opens no NVTX range
649
+ d_temp_storage, temp_storage_bytes, d_keys, num_items, num_segments, d_begin_offsets, d_end_offsets, stream);
650
+ }
651
+
652
+ private:
653
+ // Internal version without NVTX range: keys-only descending segmented sort operating on a caller-supplied DoubleBuffer, dispatched through DispatchSegmentedSort
654
+ template <typename KeyT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
655
+ CUB_RUNTIME_FUNCTION static cudaError_t SortKeysDescendingNoNVTX(
656
+ void* d_temp_storage,
657
+ size_t& temp_storage_bytes,
658
+ DoubleBuffer<KeyT>& d_keys,
659
+ ::cuda::std::int64_t num_items,
660
+ ::cuda::std::int64_t num_segments,
661
+ BeginOffsetIteratorT d_begin_offsets,
662
+ EndOffsetIteratorT d_end_offsets,
663
+ cudaStream_t stream = 0)
664
+ {
665
+ constexpr bool is_overwrite_okay = true; // handed to Dispatch below; the public documentation states both buffers of d_keys may be altered
666
+ using OffsetT = // offset type derived from the value types of both offset iterators (presumably a signed type - see detail::choose_signed_offset_t)
667
+ detail::choose_signed_offset_t<detail::common_iterator_value_t<BeginOffsetIteratorT, EndOffsetIteratorT>>;
668
+ using DispatchT = // keys-only dispatch: the value type is cub::NullType
669
+ DispatchSegmentedSort<SortOrder::Descending, KeyT, cub::NullType, OffsetT, BeginOffsetIteratorT, EndOffsetIteratorT>;
670
+
671
+ DoubleBuffer<NullType> d_values; // placeholder value buffer required by the Dispatch signature
672
+
673
+ return DispatchT::Dispatch( // d_keys is passed by reference, so the caller's DoubleBuffer is the one Dispatch operates on
674
+ d_temp_storage,
675
+ temp_storage_bytes,
676
+ d_keys,
677
+ d_values,
678
+ num_items,
679
+ num_segments,
680
+ d_begin_offsets,
681
+ d_end_offsets,
682
+ is_overwrite_okay,
683
+ stream);
684
+ }
685
+
686
+ public:
687
+ //! @rst
688
+ //! Sorts segments of keys into descending order. Approximately
689
+ //! ``2 * num_segments`` auxiliary storage required.
690
+ //!
691
+ //! - The sorting operation is given a pair of key buffers managed by a
692
+ //! DoubleBuffer structure that indicates which of the two buffers is
693
+ //! "current" (and thus contains the input data to be sorted).
694
+ //! - The contents of both buffers may be altered by the sorting operation.
695
+ //! - Upon completion, the sorting operation will update the "current"
696
+ //! indicator within the DoubleBuffer wrapper to reference which of the two
697
+ //! buffers now contains the sorted output sequence (a function of the number
698
+ //! of key bits and the targeted device architecture).
699
+ //! - When the input is a contiguous sequence of segments, a single sequence
700
+ //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased
701
+ //! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where
702
+ //! the latter is specified as ``segment_offsets + 1``).
703
+ //! - SortKeysDescending is not guaranteed to be stable. That is, suppose that
704
+ //! ``i`` and ``j`` are equivalent: neither one is less than the other. It is
705
+ //! not guaranteed that the relative order of these two elements will be
706
+ //! preserved by sort.
707
+ //! - Let ``cur = d_keys.Current()`` and ``alt = d_keys.Alternate()``.
708
+ //! The range ``[cur, cur + num_items)`` shall not overlap
709
+ //! ``[alt, alt + num_items)``. Both ranges shall not overlap
710
+ //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor
711
+ //! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way.
712
+ //! - Segments are not required to be contiguous. For all index values ``i``
713
+ //! outside the specified segments ``d_keys.Current()[i]``,
714
+ //! ``d_keys.Alternate()[i]`` will not be accessed nor modified.
715
+ //!
716
+ //! Snippet
717
+ //! +++++++++++++++++++++++++++++++++++++++++++++
718
+ //!
719
+ //! The code snippet below illustrates the batched sorting of three segments
720
+ //! (with one zero-length segment) of ``int`` keys.
721
+ //!
722
+ //! .. code-block:: c++
723
+ //!
724
+ //! #include <cub/cub.cuh>
725
+ //! // or equivalently <cub/device/device_segmented_sort.cuh>
726
+ //!
727
+ //! // Declare, allocate, and initialize device-accessible pointers for
728
+ //! // sorting data
729
+ //! int num_items; // e.g., 7
730
+ //! int num_segments; // e.g., 3
731
+ //! int *d_offsets; // e.g., [0, 3, 3, 7]
732
+ //! int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9]
733
+ //! int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -]
734
+ //! ...
735
+ //!
736
+ //! // Create a DoubleBuffer to wrap the pair of device pointers
737
+ //! cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
738
+ //!
739
+ //! // Determine temporary device storage requirements
740
+ //! void *d_temp_storage = nullptr;
741
+ //! size_t temp_storage_bytes = 0;
742
+ //! cub::DeviceSegmentedSort::SortKeysDescending(
743
+ //! d_temp_storage, temp_storage_bytes, d_keys,
744
+ //! num_items, num_segments, d_offsets, d_offsets + 1);
745
+ //!
746
+ //! // Allocate temporary storage
747
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
748
+ //!
749
+ //! // Run sorting operation
750
+ //! cub::DeviceSegmentedSort::SortKeysDescending(
751
+ //! d_temp_storage, temp_storage_bytes, d_keys,
752
+ //! num_items, num_segments, d_offsets, d_offsets + 1);
753
+ //!
754
+ //! // d_keys.Current() <-- [8, 7, 6, 9, 5, 3, 0]
755
+ //!
756
+ //! @endrst
757
+ //!
758
+ //! @tparam KeyT
759
+ //! **[inferred]** Key type
760
+ //!
761
+ //! @tparam BeginOffsetIteratorT
762
+ //! **[inferred]** Random-access input iterator type for reading segment
763
+ //! beginning offsets @iterator
764
+ //!
765
+ //! @tparam EndOffsetIteratorT
766
+ //! **[inferred]** Random-access input iterator type for reading segment
767
+ //! ending offsets @iterator
768
+ //!
769
+ //! @param[in] d_temp_storage
770
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
771
+ //! required allocation size is written to `temp_storage_bytes` and no work
772
+ //! is done
773
+ //!
774
+ //! @param[in,out] temp_storage_bytes
775
+ //! Reference to size in bytes of `d_temp_storage` allocation
776
+ //!
777
+ //! @param[in,out] d_keys
778
+ //! Reference to the double-buffer of keys whose "current" device-accessible
779
+ //! buffer contains the unsorted input keys and, upon return, is updated to
780
+ //! point to the sorted output keys
781
+ //!
782
+ //! @param[in] num_items
783
+ //! The total number of items to sort (across all segments)
784
+ //!
785
+ //! @param[in] num_segments
786
+ //! The number of segments that comprise the sorting data
787
+ //!
788
+ //! @param[in] d_begin_offsets
789
+ //! @rst
790
+ //! Random-access input iterator to the sequence of beginning offsets of
791
+ //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first
792
+ //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``
793
+ //! @endrst
794
+ //!
795
+ //! @param[in] d_end_offsets
796
+ //! @rst
797
+ //! Random-access input iterator to the sequence of ending offsets of length
798
+ //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of
799
+ //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``.
800
+ //! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the ``i``-th segment is considered empty.
801
+ //! @endrst
802
+ //!
803
+ //! @param[in] stream
804
+ //! @rst
805
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
806
+ //! @endrst
807
+ template <typename KeyT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
808
+ CUB_RUNTIME_FUNCTION static cudaError_t SortKeysDescending(
809
+ void* d_temp_storage,
810
+ size_t& temp_storage_bytes,
811
+ DoubleBuffer<KeyT>& d_keys,
812
+ ::cuda::std::int64_t num_items,
813
+ ::cuda::std::int64_t num_segments,
814
+ BeginOffsetIteratorT d_begin_offsets,
815
+ EndOffsetIteratorT d_end_offsets,
816
+ cudaStream_t stream = 0)
817
+ { // Public entry point: thin wrapper that adds profiling annotation around the internal implementation
818
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); // NVTX range keyed on d_temp_storage and GetName(); presumably skipped when d_temp_storage is nullptr (size query) - confirm against the macro definition
819
+ return SortKeysDescendingNoNVTX( // forward every argument unchanged to the DoubleBuffer-based descending implementation that opens no NVTX range
820
+ d_temp_storage, temp_storage_bytes, d_keys, num_items, num_segments, d_begin_offsets, d_end_offsets, stream);
821
+ }
822
+
823
+ //! @rst
824
+ //! Sorts segments of keys into ascending order. Approximately
825
+ //! ``num_items + 2 * num_segments`` auxiliary storage required.
826
+ //!
827
+ //! - The contents of the input data are not altered by the sorting operation.
828
+ //! - When the input is a contiguous sequence of segments, a single sequence
829
+ //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased
830
+ //! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where
831
+ //! the latter is specified as ``segment_offsets + 1``).
832
+ //! - StableSortKeys is stable: it preserves the relative ordering of
833
+ //! equivalent elements. That is, if ``x`` and ``y`` are elements such that
834
+ //! ``x`` precedes ``y``, and if the two elements are equivalent (neither
835
+ //! ``x < y`` nor ``y < x``) then a postcondition of stable sort is that
836
+ //! ``x`` still precedes ``y``.
837
+ //! - The range ``[d_keys_out, d_keys_out + num_items)`` shall not overlap
838
+ //! ``[d_keys_in, d_keys_in + num_items)``,
839
+ //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor
840
+ //! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way.
841
+ //! - Segments are not required to be contiguous. For all index values ``i``
842
+ //! outside the specified segments ``d_keys_in[i]``, ``d_keys_out[i]`` will not
843
+ //! be accessed nor modified.
844
+ //!
845
+ //! Snippet
846
+ //! +++++++++++++++++++++++++++++++++++++++++++++
847
+ //!
848
+ //! The code snippet below illustrates the batched sorting of three segments
849
+ //! (with one zero-length segment) of ``int`` keys.
850
+ //!
851
+ //! .. code-block:: c++
852
+ //!
853
+ //! #include <cub/cub.cuh>
854
+ //! // or equivalently <cub/device/device_segmented_sort.cuh>
855
+ //!
856
+ //! // Declare, allocate, and initialize device-accessible pointers
857
+ //! // for sorting data
858
+ //! int num_items; // e.g., 7
859
+ //! int num_segments; // e.g., 3
860
+ //! int *d_offsets; // e.g., [0, 3, 3, 7]
861
+ //! int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
862
+ //! int *d_keys_out; // e.g., [-, -, -, -, -, -, -]
863
+ //! ...
864
+ //!
865
+ //! // Determine temporary device storage requirements
866
+ //! void *d_temp_storage = nullptr;
867
+ //! size_t temp_storage_bytes = 0;
868
+ //! cub::DeviceSegmentedSort::StableSortKeys(
869
+ //! d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out,
870
+ //! num_items, num_segments, d_offsets, d_offsets + 1);
871
+ //!
872
+ //! // Allocate temporary storage
873
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
874
+ //!
875
+ //! // Run sorting operation
876
+ //! cub::DeviceSegmentedSort::StableSortKeys(
877
+ //! d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out,
878
+ //! num_items, num_segments, d_offsets, d_offsets + 1);
879
+ //!
880
+ //! // d_keys_out <-- [6, 7, 8, 0, 3, 5, 9]
881
+ //!
882
+ //! @endrst
883
+ //!
884
+ //! @tparam KeyT
885
+ //! **[inferred]** Key type
886
+ //!
887
+ //! @tparam BeginOffsetIteratorT
888
+ //! **[inferred]** Random-access input iterator type for reading segment
889
+ //! beginning offsets @iterator
890
+ //!
891
+ //! @tparam EndOffsetIteratorT
892
+ //! **[inferred]** Random-access input iterator type for reading segment
893
+ //! ending offsets @iterator
894
+ //!
895
+ //! @param[in] d_temp_storage
896
+ //! Device-accessible allocation of temporary storage. When nullptr, the
897
+ //! required allocation size is written to `temp_storage_bytes` and no work
898
+ //! is done
899
+ //!
900
+ //! @param[in,out] temp_storage_bytes
901
+ //! Reference to size in bytes of `d_temp_storage` allocation
902
+ //!
903
+ //! @param[in] d_keys_in
904
+ //! Device-accessible pointer to the input sequence of key data to sort
905
+ //!
906
+ //! @param[out] d_keys_out
907
+ //! Device-accessible pointer to the sorted output sequence of key data
908
+ //!
909
+ //! @param[in] num_items
910
+ //! The total number of items to sort (across all segments)
911
+ //!
912
+ //! @param[in] num_segments
913
+ //! The number of segments that comprise the sorting data
914
+ //!
915
+ //! @param[in] d_begin_offsets
916
+ //! @rst
917
+ //! Random-access input iterator to the sequence of beginning offsets of
918
+ //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first
919
+ //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``
920
+ //! @endrst
921
+ //!
922
+ //! @param[in] d_end_offsets
923
+ //! @rst
924
+ //! Random-access input iterator to the sequence of ending offsets of length
925
+ //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of
926
+ //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``.
927
+ //! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the ``i``-th segment is considered empty.
928
+ //! @endrst
929
+ //!
930
+ //! @param[in] stream
931
+ //! @rst
932
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
933
+ //! @endrst
934
+ template <typename KeyT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
935
+ CUB_RUNTIME_FUNCTION static cudaError_t StableSortKeys(
936
+ void* d_temp_storage,
937
+ size_t& temp_storage_bytes,
938
+ const KeyT* d_keys_in,
939
+ KeyT* d_keys_out,
940
+ ::cuda::std::int64_t num_items,
941
+ ::cuda::std::int64_t num_segments,
942
+ BeginOffsetIteratorT d_begin_offsets,
943
+ EndOffsetIteratorT d_end_offsets,
944
+ cudaStream_t stream = 0)
945
+ { // Public entry point: thin wrapper that adds profiling annotation around the internal implementation
946
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); // NVTX range keyed on d_temp_storage and GetName(); presumably skipped when d_temp_storage is nullptr (size query) - confirm against the macro definition
947
+ return SortKeysNoNVTX<KeyT, BeginOffsetIteratorT, EndOffsetIteratorT>( // same internal routine and argument list as the pointer-based SortKeys, here with explicit template arguments
948
+ d_temp_storage,
949
+ temp_storage_bytes,
950
+ d_keys_in,
951
+ d_keys_out,
952
+ num_items,
953
+ num_segments,
954
+ d_begin_offsets,
955
+ d_end_offsets,
956
+ stream);
957
+ }
958
+
959
+ //! @rst
960
+ //! Sorts segments of keys into descending order.
961
+ //! Approximately ``num_items + 2 * num_segments`` auxiliary storage required.
962
+ //!
963
+ //! - The contents of the input data are not altered by the sorting operation.
964
+ //! - When the input is a contiguous sequence of segments, a single sequence
965
+ //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased
966
+ //! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where
967
+ //! the latter is specified as ``segment_offsets + 1``).
968
+ //! - StableSortKeysDescending is stable: it preserves the relative ordering of
969
+ //! equivalent elements. That is, if ``x`` and ``y`` are elements such that
970
+ //! ``x`` precedes ``y``, and if the two elements are equivalent (neither ``x < y`` nor ``y < x``)
971
+ //! then a postcondition of stable sort is that ``x`` still precedes ``y``.
972
+ //! - The range ``[d_keys_out, d_keys_out + num_items)`` shall not overlap
973
+ //! ``[d_keys_in, d_keys_in + num_items)``,
974
+ //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor
975
+ //! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way.
976
+ //! - Segments are not required to be contiguous. For all index values ``i``
977
+ //! outside the specified segments ``d_keys_in[i]``, ``d_keys_out[i]`` will not
978
+ //! be accessed nor modified.
979
+ //!
980
+ //! Snippet
981
+ //! +++++++++++++++++++++++++++++++++++++++++++++
982
+ //!
983
+ //! The code snippet below illustrates the batched sorting of three segments
984
+ //! (with one zero-length segment) of ``int`` keys.
985
+ //!
986
+ //! .. code-block:: c++
987
+ //!
988
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_segmented_sort.cuh>
989
+ //!
990
+ //! // Declare, allocate, and initialize device-accessible pointers
991
+ //! // for sorting data
992
+ //! int num_items; // e.g., 7
993
+ //! int num_segments; // e.g., 3
994
+ //! int *d_offsets; // e.g., [0, 3, 3, 7]
995
+ //! int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
996
+ //! int *d_keys_out; // e.g., [-, -, -, -, -, -, -]
997
+ //! ...
998
+ //!
999
+ //! // Determine temporary device storage requirements
1000
+ //! void *d_temp_storage = nullptr;
1001
+ //! size_t temp_storage_bytes = 0;
1002
+ //! cub::DeviceSegmentedSort::StableSortKeysDescending(
1003
+ //! d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out,
1004
+ //! num_items, num_segments, d_offsets, d_offsets + 1);
1005
+ //!
1006
+ //! // Allocate temporary storage
1007
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
1008
+ //!
1009
+ //! // Run sorting operation
1010
+ //! cub::DeviceSegmentedSort::StableSortKeysDescending(
1011
+ //! d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out,
1012
+ //! num_items, num_segments, d_offsets, d_offsets + 1);
1013
+ //!
1014
+ //! // d_keys_out <-- [8, 7, 6, 9, 5, 3, 0]
1015
+ //!
1016
+ //! @endrst
1017
+ //!
1018
+ //! @tparam KeyT
1019
+ //! **[inferred]** Key type
1020
+ //!
1021
+ //! @tparam BeginOffsetIteratorT
1022
+ //! **[inferred]** Random-access input iterator type for reading segment
1023
+ //! beginning offsets @iterator
1024
+ //!
1025
+ //! @tparam EndOffsetIteratorT
1026
+ //! **[inferred]** Random-access input iterator type for reading segment
1027
+ //! ending offsets @iterator
1028
+ //!
1029
+ //! @param[in] d_temp_storage
1030
+ //! Device-accessible allocation of temporary storage. When nullptr, the
1031
+ //! required allocation size is written to `temp_storage_bytes` and no work
1032
+ //! is done.
1033
+ //!
1034
+ //! @param[in,out] temp_storage_bytes
1035
+ //! Reference to size in bytes of `d_temp_storage` allocation
1036
+ //!
1037
+ //! @param[in] d_keys_in
1038
+ //! Device-accessible pointer to the input data of key data to sort
1039
+ //!
1040
+ //! @param[out] d_keys_out
1041
+ //! Device-accessible pointer to the sorted output sequence of key data
1042
+ //!
1043
+ //! @param[in] num_items
1044
+ //! The total number of items to sort (across all segments)
1045
+ //!
1046
+ //! @param[in] num_segments
1047
+ //! The number of segments that comprise the sorting data
1048
+ //!
1049
+ //! @param[in] d_begin_offsets
1050
+ //! @rst
1051
+ //! Random-access input iterator to the sequence of beginning offsets of
1052
+ //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first
1053
+ //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and
1054
+ //! ``d_values_*``
1055
+ //! @endrst
1056
+ //!
1057
+ //! @param[in] d_end_offsets
1058
+ //! @rst
1059
+ //! Random-access input iterator to the sequence of ending offsets of length
1060
+ //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of
1061
+ //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``.
1062
+ //! If ``d_end_offsets[i] - 1 < d_begin_offsets[i]``, the ``i``-th segment is
1063
+ //! considered empty.
1064
+ //! @endrst
1065
+ //!
1066
+ //! @param[in] stream
1067
+ //! @rst
1068
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
1069
+ //! @endrst
1070
+ template <typename KeyT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
1071
+ CUB_RUNTIME_FUNCTION static cudaError_t StableSortKeysDescending( // public entry point: adds the NVTX scope, then defers to the NoNVTX helper
1072
+ void* d_temp_storage,
1073
+ size_t& temp_storage_bytes,
1074
+ const KeyT* d_keys_in,
1075
+ KeyT* d_keys_out,
1076
+ ::cuda::std::int64_t num_items,
1077
+ ::cuda::std::int64_t num_segments,
1078
+ BeginOffsetIteratorT d_begin_offsets,
1079
+ EndOffsetIteratorT d_end_offsets,
1080
+ cudaStream_t stream = 0)
1081
+ {
1082
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); // NOTE(review): scope is conditional on d_temp_storage - presumably skipped for the size-query call; confirm against the macro definition
1083
+ return SortKeysDescendingNoNVTX<KeyT, BeginOffsetIteratorT, EndOffsetIteratorT>( // no stable-specific code path is selected here; every argument is forwarded unchanged
1084
+ d_temp_storage,
1085
+ temp_storage_bytes,
1086
+ d_keys_in,
1087
+ d_keys_out,
1088
+ num_items,
1089
+ num_segments,
1090
+ d_begin_offsets,
1091
+ d_end_offsets,
1092
+ stream);
1093
+ }
1094
+
1095
+ //! @rst
1096
+ //! Sorts segments of keys into ascending order.
1097
+ //! Approximately ``2 * num_segments`` auxiliary storage required.
1098
+ //!
1099
+ //! - The sorting operation is given a pair of key buffers managed by a
1100
+ //! DoubleBuffer structure that indicates which of the two buffers is
1101
+ //! "current" (and thus contains the input data to be sorted).
1102
+ //! - The contents of both buffers may be altered by the sorting operation.
1103
+ //! - Upon completion, the sorting operation will update the "current"
1104
+ //! indicator within the DoubleBuffer wrapper to reference which of the two
1105
+ //! buffers now contains the sorted output sequence (a function of the number
1106
+ //! of key bits and the targeted device architecture).
1107
+ //! - When the input is a contiguous sequence of segments, a single sequence
1108
+ //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased
1109
+ //! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where
1110
+ //! the latter is specified as ``segment_offsets + 1``).
1111
+ //! - StableSortKeys is stable: it preserves the relative ordering of
1112
+ //! equivalent elements. That is, if ``x`` and ``y`` are elements such that
1113
+ //! ``x`` precedes ``y``, and if the two elements are equivalent (neither
1114
+ //! ``x < y`` nor ``y < x``) then a postcondition of stable sort is that
1115
+ //! ``x`` still precedes ``y``.
1116
+ //! - Let ``cur = d_keys.Current()`` and ``alt = d_keys.Alternate()``.
1117
+ //! The range ``[cur, cur + num_items)`` shall not overlap
1118
+ //! ``[alt, alt + num_items)``. Both ranges shall not overlap
1119
+ //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor
1120
+ //! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way.
1121
+ //! - Segments are not required to be contiguous. For all index values ``i``
1122
+ //! outside the specified segments ``d_keys.Current()[i]``,
1123
+ //! ``d_keys.Alternate()[i]`` will not be accessed nor modified.
1124
+ //!
1125
+ //! Snippet
1126
+ //! +++++++++++++++++++++++++++++++++++++++++++++
1127
+ //!
1128
+ //! The code snippet below illustrates the batched sorting of three segments
1129
+ //! (with one zero-length segment) of ``int`` keys.
1130
+ //!
1131
+ //! .. code-block:: c++
1132
+ //!
1133
+ //! #include <cub/cub.cuh>
1134
+ //! // or equivalently <cub/device/device_segmented_sort.cuh>
1135
+ //!
1136
+ //! // Declare, allocate, and initialize device-accessible pointers
1137
+ //! // for sorting data
1138
+ //! int num_items; // e.g., 7
1139
+ //! int num_segments; // e.g., 3
1140
+ //! int *d_offsets; // e.g., [0, 3, 3, 7]
1141
+ //! int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9]
1142
+ //! int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -]
1143
+ //! ...
1144
+ //!
1145
+ //! // Create a DoubleBuffer to wrap the pair of device pointers
1146
+ //! cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
1147
+ //!
1148
+ //! // Determine temporary device storage requirements
1149
+ //! void *d_temp_storage = nullptr;
1150
+ //! size_t temp_storage_bytes = 0;
1151
+ //! cub::DeviceSegmentedSort::StableSortKeys(
1152
+ //! d_temp_storage, temp_storage_bytes, d_keys,
1153
+ //! num_items, num_segments, d_offsets, d_offsets + 1);
1154
+ //!
1155
+ //! // Allocate temporary storage
1156
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
1157
+ //!
1158
+ //! // Run sorting operation
1159
+ //! cub::DeviceSegmentedSort::StableSortKeys(
1160
+ //! d_temp_storage, temp_storage_bytes, d_keys,
1161
+ //! num_items, num_segments, d_offsets, d_offsets + 1);
1162
+ //!
1163
+ //! // d_keys.Current() <-- [6, 7, 8, 0, 3, 5, 9]
1164
+ //!
1165
+ //! @endrst
1166
+ //!
1167
+ //! @tparam KeyT
1168
+ //! **[inferred]** Key type
1169
+ //!
1170
+ //! @tparam BeginOffsetIteratorT
1171
+ //! **[inferred]** Random-access input iterator type for reading segment
1172
+ //! beginning offsets @iterator
1173
+ //!
1174
+ //! @tparam EndOffsetIteratorT
1175
+ //! **[inferred]** Random-access input iterator type for reading segment
1176
+ //! ending offsets @iterator
1177
+ //!
1178
+ //! @param[in] d_temp_storage
1179
+ //! Device-accessible allocation of temporary storage. When nullptr, the
1180
+ //! required allocation size is written to `temp_storage_bytes` and no work
1181
+ //! is done
1182
+ //!
1183
+ //! @param[in,out] temp_storage_bytes
1184
+ //! Reference to size in bytes of `d_temp_storage` allocation
1185
+ //!
1186
+ //! @param[in,out] d_keys
1187
+ //! Reference to the double-buffer of keys whose "current" device-accessible
1188
+ //! buffer contains the unsorted input keys and, upon return, is updated to
1189
+ //! point to the sorted output keys
1190
+ //!
1191
+ //! @param[in] num_items
1192
+ //! The total number of items to sort (across all segments)
1193
+ //!
1194
+ //! @param[in] num_segments
1195
+ //! The number of segments that comprise the sorting data
1196
+ //!
1197
+ //! @param[in] d_begin_offsets
1198
+ //! @rst
1199
+ //! Random-access input iterator to the sequence of beginning offsets of
1200
+ //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first
1201
+ //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``
1202
+ //! @endrst
1203
+ //!
1204
+ //! @param[in] d_end_offsets
1205
+ //! @rst
1206
+ //! Random-access input iterator to the sequence of ending offsets of length
1207
+ //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of
1208
+ //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``.
1209
+ //! If ``d_end_offsets[i] - 1 < d_begin_offsets[i]``, the ``i``-th segment is
1210
+ //! considered empty.
1211
+ //! @endrst
1212
+ //!
1213
+ //! @param[in] stream
1214
+ //! @rst
1215
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
1216
+ //! @endrst
1217
+ template <typename KeyT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
1218
+ CUB_RUNTIME_FUNCTION static cudaError_t StableSortKeys( // public entry point (DoubleBuffer overload): adds the NVTX scope, then defers to the NoNVTX helper
1219
+ void* d_temp_storage,
1220
+ size_t& temp_storage_bytes,
1221
+ DoubleBuffer<KeyT>& d_keys,
1222
+ ::cuda::std::int64_t num_items,
1223
+ ::cuda::std::int64_t num_segments,
1224
+ BeginOffsetIteratorT d_begin_offsets,
1225
+ EndOffsetIteratorT d_end_offsets,
1226
+ cudaStream_t stream = 0)
1227
+ {
1228
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); // NOTE(review): scope is conditional on d_temp_storage - presumably skipped for the size-query call; confirm against the macro definition
1229
+ return SortKeysNoNVTX<KeyT, BeginOffsetIteratorT, EndOffsetIteratorT>( // no stable-specific code path is selected here; d_keys is passed through by reference
1230
+ d_temp_storage, temp_storage_bytes, d_keys, num_items, num_segments, d_begin_offsets, d_end_offsets, stream);
1231
+ }
1232
+
1233
+ //! @rst
1234
+ //! Sorts segments of keys into descending order.
1235
+ //! Approximately ``2 * num_segments`` auxiliary storage required.
1236
+ //!
1237
+ //! - The sorting operation is given a pair of key buffers managed by a
1238
+ //! DoubleBuffer structure that indicates which of the two buffers is
1239
+ //! "current" (and thus contains the input data to be sorted).
1240
+ //! - The contents of both buffers may be altered by the sorting operation.
1241
+ //! - Upon completion, the sorting operation will update the "current"
1242
+ //! indicator within the DoubleBuffer wrapper to reference which of the two
1243
+ //! buffers now contains the sorted output sequence (a function of the number
1244
+ //! of key bits and the targeted device architecture).
1245
+ //! - When the input is a contiguous sequence of segments, a single sequence
1246
+ //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased
1247
+ //! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where
1248
+ //! the latter is specified as ``segment_offsets + 1``).
1249
+ //! - StableSortKeysDescending is stable: it preserves the relative ordering of
1250
+ //! equivalent elements. That is, if ``x`` and ``y`` are elements such that
1251
+ //! ``x`` precedes ``y``, and if the two elements are equivalent (neither
1252
+ //! ``x < y`` nor ``y < x``) then a postcondition of stable sort is that
1253
+ //! ``x`` still precedes ``y``.
1254
+ //! - Let ``cur = d_keys.Current()`` and ``alt = d_keys.Alternate()``.
1255
+ //! The range ``[cur, cur + num_items)`` shall not overlap
1256
+ //! ``[alt, alt + num_items)``. Both ranges shall not overlap
1257
+ //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor
1258
+ //! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way.
1259
+ //! - Segments are not required to be contiguous. For all index values ``i``
1260
+ //! outside the specified segments ``d_keys.Current()[i]``,
1261
+ //! ``d_keys.Alternate()[i]`` will not be accessed nor modified.
1262
+ //!
1263
+ //! Snippet
1264
+ //! +++++++++++++++++++++++++++++++++++++++++++++
1265
+ //!
1266
+ //! The code snippet below illustrates the batched sorting of three segments
1267
+ //! (with one zero-length segment) of ``int`` keys.
1268
+ //!
1269
+ //! .. code-block:: c++
1270
+ //!
1271
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_segmented_sort.cuh>
1272
+ //!
1273
+ //! // Declare, allocate, and initialize device-accessible pointers
1274
+ //! // for sorting data
1275
+ //! int num_items; // e.g., 7
1276
+ //! int num_segments; // e.g., 3
1277
+ //! int *d_offsets; // e.g., [0, 3, 3, 7]
1278
+ //! int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9]
1279
+ //! int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -]
1280
+ //! ...
1281
+ //!
1282
+ //! // Create a DoubleBuffer to wrap the pair of device pointers
1283
+ //! cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
1284
+ //!
1285
+ //! // Determine temporary device storage requirements
1286
+ //! void *d_temp_storage = nullptr;
1287
+ //! size_t temp_storage_bytes = 0;
1288
+ //! cub::DeviceSegmentedSort::StableSortKeysDescending(
1289
+ //! d_temp_storage, temp_storage_bytes, d_keys,
1290
+ //! num_items, num_segments, d_offsets, d_offsets + 1);
1291
+ //!
1292
+ //! // Allocate temporary storage
1293
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
1294
+ //!
1295
+ //! // Run sorting operation
1296
+ //! cub::DeviceSegmentedSort::StableSortKeysDescending(
1297
+ //! d_temp_storage, temp_storage_bytes, d_keys,
1298
+ //! num_items, num_segments, d_offsets, d_offsets + 1);
1299
+ //!
1300
+ //! // d_keys.Current() <-- [8, 7, 6, 9, 5, 3, 0]
1301
+ //!
1302
+ //! @endrst
1303
+ //!
1304
+ //! @tparam KeyT
1305
+ //! **[inferred]** Key type
1306
+ //!
1307
+ //! @tparam BeginOffsetIteratorT
1308
+ //! **[inferred]** Random-access input iterator type for reading segment
1309
+ //! beginning offsets @iterator
1310
+ //!
1311
+ //! @tparam EndOffsetIteratorT
1312
+ //! **[inferred]** Random-access input iterator type for reading segment
1313
+ //! ending offsets @iterator
1314
+ //!
1315
+ //! @param[in] d_temp_storage
1316
+ //! Device-accessible allocation of temporary storage. When nullptr, the
1317
+ //! required allocation size is written to `temp_storage_bytes` and no work
1318
+ //! is done.
1319
+ //!
1320
+ //! @param[in,out] temp_storage_bytes
1321
+ //! Reference to size in bytes of `d_temp_storage` allocation
1322
+ //!
1323
+ //! @param[in,out] d_keys
1324
+ //! Reference to the double-buffer of keys whose "current" device-accessible
1325
+ //! buffer contains the unsorted input keys and, upon return, is updated to
1326
+ //! point to the sorted output keys
1327
+ //!
1328
+ //! @param[in] num_items
1329
+ //! The total number of items to sort (across all segments)
1330
+ //!
1331
+ //! @param[in] num_segments
1332
+ //! The number of segments that comprise the sorting data
1333
+ //!
1334
+ //! @param[in] d_begin_offsets
1335
+ //! @rst
1336
+ //! Random-access input iterator to the sequence of beginning offsets of
1337
+ //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first
1338
+ //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``
1339
+ //! @endrst
1340
+ //!
1341
+ //! @param[in] d_end_offsets
1342
+ //! @rst
1343
+ //! Random-access input iterator to the sequence of ending offsets of length
1344
+ //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last
1345
+ //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and
1346
+ //! ``d_values_*``. If ``d_end_offsets[i] - 1 < d_begin_offsets[i]``, the
1347
+ //! ``i``-th segment is considered empty.
1348
+ //! @endrst
1349
+ //!
1350
+ //! @param[in] stream
1351
+ //! @rst
1352
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
1353
+ //! @endrst
1354
+ template <typename KeyT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
1355
+ CUB_RUNTIME_FUNCTION static cudaError_t StableSortKeysDescending( // public entry point (DoubleBuffer overload): adds the NVTX scope, then defers to the NoNVTX helper
1356
+ void* d_temp_storage,
1357
+ size_t& temp_storage_bytes,
1358
+ DoubleBuffer<KeyT>& d_keys,
1359
+ ::cuda::std::int64_t num_items,
1360
+ ::cuda::std::int64_t num_segments,
1361
+ BeginOffsetIteratorT d_begin_offsets,
1362
+ EndOffsetIteratorT d_end_offsets,
1363
+ cudaStream_t stream = 0)
1364
+ {
1365
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); // NOTE(review): scope is conditional on d_temp_storage - presumably skipped for the size-query call; confirm against the macro definition
1366
+ return SortKeysDescendingNoNVTX<KeyT, BeginOffsetIteratorT, EndOffsetIteratorT>( // no stable-specific code path is selected here; d_keys is passed through by reference
1367
+ d_temp_storage, temp_storage_bytes, d_keys, num_items, num_segments, d_begin_offsets, d_end_offsets, stream);
1368
+ }
1369
+
1370
+ private:
1371
+ // Internal ascending key-value sort without an NVTX range: wraps the in/out pointers in DoubleBuffers and calls DispatchSegmentedSort
1372
+ template <typename KeyT, typename ValueT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
1373
+ CUB_RUNTIME_FUNCTION static cudaError_t SortPairsNoNVTX(
1374
+ void* d_temp_storage,
1375
+ size_t& temp_storage_bytes,
1376
+ const KeyT* d_keys_in,
1377
+ KeyT* d_keys_out,
1378
+ const ValueT* d_values_in,
1379
+ ValueT* d_values_out,
1380
+ ::cuda::std::int64_t num_items,
1381
+ ::cuda::std::int64_t num_segments,
1382
+ BeginOffsetIteratorT d_begin_offsets,
1383
+ EndOffsetIteratorT d_end_offsets,
1384
+ cudaStream_t stream = 0)
1385
+ {
1386
+ constexpr bool is_overwrite_okay = false; // forwarded to Dispatch below; the public SortPairs docs state that the input data is not altered
1387
+
1388
+ using OffsetT = // offset type chosen from the common value type of the two offset iterators
1389
+ detail::choose_signed_offset_t<detail::common_iterator_value_t<BeginOffsetIteratorT, EndOffsetIteratorT>>;
1390
+ using DispatchT = // dispatcher specialised for SortOrder::Ascending
1391
+ DispatchSegmentedSort<SortOrder::Ascending, KeyT, ValueT, OffsetT, BeginOffsetIteratorT, EndOffsetIteratorT>;
1392
+
1393
+ DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out); // const is cast away only to build the DoubleBuffer (input pointer first, output pointer second)
1394
+ DoubleBuffer<ValueT> d_values(const_cast<ValueT*>(d_values_in), d_values_out); // same wrapping for the associated values
1395
+
1396
+ return DispatchT::Dispatch( // the dispatcher's cudaError_t is returned to the caller as-is
1397
+ d_temp_storage,
1398
+ temp_storage_bytes,
1399
+ d_keys,
1400
+ d_values,
1401
+ num_items,
1402
+ num_segments,
1403
+ d_begin_offsets,
1404
+ d_end_offsets,
1405
+ is_overwrite_okay,
1406
+ stream);
1407
+ }
1408
+
1409
+ public:
1410
+ //! @} end member group
1411
+ //! @name Key-value pairs
1412
+ //! @{
1413
+
1414
+ //! @rst
1415
+ //! Sorts segments of key-value pairs into ascending order.
1416
+ //! Approximately ``2 * num_items + 2 * num_segments`` auxiliary storage required.
1417
+ //!
1418
+ //! - The contents of the input data are not altered by the sorting operation.
1419
+ //! - When the input is a contiguous sequence of segments, a single sequence
1420
+ //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased
1421
+ //! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where
1422
+ //! the latter is specified as ``segment_offsets + 1``).
1423
+ //! - SortPairs is not guaranteed to be stable. That is, suppose that ``i`` and
1424
+ //! ``j`` are equivalent: neither one is less than the other. It is not
1425
+ //! guaranteed that the relative order of these two elements will be
1426
+ //! preserved by sort.
1427
+ //! - Let ``in`` be one of ``{d_keys_in, d_values_in}`` and ``out`` be any of
1428
+ //! ``{d_keys_out, d_values_out}``. The range ``[out, out + num_items)`` shall
1429
+ //! not overlap ``[in, in + num_items)``,
1430
+ //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor
1431
+ //! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way.
1432
+ //! - Segments are not required to be contiguous. For all index values ``i``
1433
+ //! outside the specified segments ``d_keys_in[i]``, ``d_values_in[i]``,
1434
+ //! ``d_keys_out[i]``, ``d_values_out[i]`` will not be accessed nor modified.
1435
+ //!
1436
+ //! Snippet
1437
+ //! +++++++++++++++++++++++++++++++++++++++++++++
1438
+ //!
1439
+ //! The code snippet below illustrates the batched sorting of three segments
1440
+ //! (with one zero-length segment) of ``int`` keys with associated vector of
1441
+ //! ``int`` values.
1442
+ //!
1443
+ //! .. code-block:: c++
1444
+ //!
1445
+ //! #include <cub/cub.cuh>
1446
+ //! // or equivalently <cub/device/device_segmented_sort.cuh>
1447
+ //!
1448
+ //! // Declare, allocate, and initialize device-accessible pointers
1449
+ //! // for sorting data
1450
+ //! int num_items; // e.g., 7
1451
+ //! int num_segments; // e.g., 3
1452
+ //! int *d_offsets; // e.g., [0, 3, 3, 7]
1453
+ //! int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
1454
+ //! int *d_keys_out; // e.g., [-, -, -, -, -, -, -]
1455
+ //! int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6]
1456
+ //! int *d_values_out; // e.g., [-, -, -, -, -, -, -]
1457
+ //! ...
1458
+ //!
1459
+ //! // Determine temporary device storage requirements
1460
+ //! void *d_temp_storage = nullptr;
1461
+ //! size_t temp_storage_bytes = 0;
1462
+ //! cub::DeviceSegmentedSort::SortPairs(
1463
+ //! d_temp_storage, temp_storage_bytes,
1464
+ //! d_keys_in, d_keys_out, d_values_in, d_values_out,
1465
+ //! num_items, num_segments, d_offsets, d_offsets + 1);
1466
+ //!
1467
+ //! // Allocate temporary storage
1468
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
1469
+ //!
1470
+ //! // Run sorting operation
1471
+ //! cub::DeviceSegmentedSort::SortPairs(
1472
+ //! d_temp_storage, temp_storage_bytes,
1473
+ //! d_keys_in, d_keys_out, d_values_in, d_values_out,
1474
+ //! num_items, num_segments, d_offsets, d_offsets + 1);
1475
+ //!
1476
+ //! // d_keys_out <-- [6, 7, 8, 0, 3, 5, 9]
1477
+ //! // d_values_out <-- [1, 2, 0, 5, 4, 3, 6]
1478
+ //!
1479
+ //! @endrst
1480
+ //!
1481
+ //! @tparam KeyT
1482
+ //! **[inferred]** Key type
1483
+ //!
1484
+ //! @tparam ValueT
1485
+ //! **[inferred]** Value type
1486
+ //!
1487
+ //! @tparam BeginOffsetIteratorT
1488
+ //! **[inferred]** Random-access input iterator type for reading segment
1489
+ //! beginning offsets @iterator
1490
+ //!
1491
+ //! @tparam EndOffsetIteratorT
1492
+ //! **[inferred]** Random-access input iterator type for reading segment
1493
+ //! ending offsets @iterator
1494
+ //!
1495
+ //! @param[in] d_temp_storage
1496
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
1497
+ //! required allocation size is written to `temp_storage_bytes` and no work
1498
+ //! is done
1499
+ //!
1500
+ //! @param[in,out] temp_storage_bytes
1501
+ //! Reference to size in bytes of `d_temp_storage` allocation
1502
+ //!
1503
+ //! @param[in] d_keys_in
1504
+ //! Device-accessible pointer to the input data of key data to sort
1505
+ //!
1506
+ //! @param[out] d_keys_out
1507
+ //! Device-accessible pointer to the sorted output sequence of key data
1508
+ //!
1509
+ //! @param[in] d_values_in
1510
+ //! Device-accessible pointer to the corresponding input sequence of
1511
+ //! associated value items
1512
+ //!
1513
+ //! @param[out] d_values_out
1514
+ //! Device-accessible pointer to the correspondingly-reordered output
1515
+ //! sequence of associated value items
1516
+ //!
1517
+ //! @param[in] num_items
1518
+ //! The total number of items to sort (across all segments)
1519
+ //!
1520
+ //! @param[in] num_segments
1521
+ //! The number of segments that comprise the sorting data
1522
+ //!
1523
+ //! @param[in] d_begin_offsets
1524
+ //! @rst
1525
+ //! Random-access input iterator to the sequence of beginning offsets of
1526
+ //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first
1527
+ //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``
1528
+ //! @endrst
1529
+ //!
1530
+ //! @param[in] d_end_offsets
1531
+ //! @rst
1532
+ //! Random-access input iterator to the sequence of ending offsets of length
1533
+ //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of
1534
+ //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``.
1535
+ //! If ``d_end_offsets[i] - 1 < d_begin_offsets[i]``, the ``i``-th segment is
1536
+ //! considered empty.
1537
+ //! @endrst
1538
+ //!
1539
+ //! @param[in] stream
1540
+ //! @rst
1541
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
1542
+ //! @endrst
1543
+ template <typename KeyT, typename ValueT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
1544
+ CUB_RUNTIME_FUNCTION static cudaError_t SortPairs( // public entry point: adds the NVTX scope, then defers to SortPairsNoNVTX
1545
+ void* d_temp_storage,
1546
+ size_t& temp_storage_bytes,
1547
+ const KeyT* d_keys_in,
1548
+ KeyT* d_keys_out,
1549
+ const ValueT* d_values_in,
1550
+ ValueT* d_values_out,
1551
+ ::cuda::std::int64_t num_items,
1552
+ ::cuda::std::int64_t num_segments,
1553
+ BeginOffsetIteratorT d_begin_offsets,
1554
+ EndOffsetIteratorT d_end_offsets,
1555
+ cudaStream_t stream = 0)
1556
+ {
1557
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); // NOTE(review): scope is conditional on d_temp_storage - presumably skipped for the size-query call; confirm against the macro definition
1558
+ return SortPairsNoNVTX( // template arguments are deduced from the call; every argument is forwarded unchanged
1559
+ d_temp_storage,
1560
+ temp_storage_bytes,
1561
+ d_keys_in,
1562
+ d_keys_out,
1563
+ d_values_in,
1564
+ d_values_out,
1565
+ num_items,
1566
+ num_segments,
1567
+ d_begin_offsets,
1568
+ d_end_offsets,
1569
+ stream);
1570
+ }
1571
+
1572
+ private:
1573
+ // Internal descending key-value sort without an NVTX range: wraps the in/out pointers in DoubleBuffers and calls DispatchSegmentedSort
1574
+ template <typename KeyT, typename ValueT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
1575
+ CUB_RUNTIME_FUNCTION static cudaError_t SortPairsDescendingNoNVTX(
1576
+ void* d_temp_storage,
1577
+ size_t& temp_storage_bytes,
1578
+ const KeyT* d_keys_in,
1579
+ KeyT* d_keys_out,
1580
+ const ValueT* d_values_in,
1581
+ ValueT* d_values_out,
1582
+ ::cuda::std::int64_t num_items,
1583
+ ::cuda::std::int64_t num_segments,
1584
+ BeginOffsetIteratorT d_begin_offsets,
1585
+ EndOffsetIteratorT d_end_offsets,
1586
+ cudaStream_t stream = 0)
1587
+ {
1588
+ constexpr bool is_overwrite_okay = false; // forwarded to Dispatch below; the public SortPairsDescending docs state that the input data is not altered
1589
+
1590
+ using OffsetT = // offset type chosen from the common value type of the two offset iterators
1591
+ detail::choose_signed_offset_t<detail::common_iterator_value_t<BeginOffsetIteratorT, EndOffsetIteratorT>>;
1592
+ using DispatchT = // dispatcher specialised for SortOrder::Descending
1593
+ DispatchSegmentedSort<SortOrder::Descending, KeyT, ValueT, OffsetT, BeginOffsetIteratorT, EndOffsetIteratorT>;
1594
+
1595
+ DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out); // const is cast away only to build the DoubleBuffer (input pointer first, output pointer second)
1596
+ DoubleBuffer<ValueT> d_values(const_cast<ValueT*>(d_values_in), d_values_out); // same wrapping for the associated values
1597
+
1598
+ return DispatchT::Dispatch( // the dispatcher's cudaError_t is returned to the caller as-is
1599
+ d_temp_storage,
1600
+ temp_storage_bytes,
1601
+ d_keys,
1602
+ d_values,
1603
+ num_items,
1604
+ num_segments,
1605
+ d_begin_offsets,
1606
+ d_end_offsets,
1607
+ is_overwrite_okay,
1608
+ stream);
1609
+ }
1610
+
1611
+ public:
1612
+ //! @rst
1613
+ //! Sorts segments of key-value pairs into descending order.
1614
+ //! Approximately ``2 * num_items + 2 * num_segments`` auxiliary storage required.
1615
+ //!
1616
+ //! - The contents of the input data are not altered by the sorting operation.
1617
+ //! - When the input is a contiguous sequence of segments, a single sequence
1618
+ //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased
1619
+ //! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where
1620
+ //! the latter is specified as ``segment_offsets + 1``).
1621
+ //! - SortPairsDescending is not guaranteed to be stable. That is, suppose that ``i`` and
1622
+ //! ``j`` are equivalent: neither one is less than the other. It is not
1623
+ //! guaranteed that the relative order of these two elements will be
1624
+ //! preserved by sort.
1625
+ //! - Let ``in`` be one of ``{d_keys_in, d_values_in}`` and ``out`` be any of
1626
+ //! ``{d_keys_out, d_values_out}``. The range ``[out, out + num_items)`` shall
1627
+ //! not overlap ``[in, in + num_items)``,
1628
+ //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor
1629
+ //! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way.
1630
+ //! - Segments are not required to be contiguous. For all index values ``i``
1631
+ //! outside the specified segments ``d_keys_in[i]``, ``d_values_in[i]``,
1632
+ //! ``d_keys_out[i]``, ``d_values_out[i]`` will not be accessed nor modified.
1633
+ //!
1634
+ //! Snippet
1635
+ //! +++++++++++++++++++++++++++++++++++++++++++++
1636
+ //!
1637
+ //! The code snippet below illustrates the batched sorting of three segments
1638
+ //! (with one zero-length segment) of ``int`` keys with associated vector of
1639
+ //! ``int`` values.
1640
+ //!
1641
+ //! .. code-block:: c++
1642
+ //!
1643
+ //! #include <cub/cub.cuh>
1644
+ //! // or equivalently <cub/device/device_segmented_sort.cuh>
1645
+ //!
1646
+ //! // Declare, allocate, and initialize device-accessible pointers for
1647
+ //! // sorting data
1648
+ //! int num_items; // e.g., 7
1649
+ //! int num_segments; // e.g., 3
1650
+ //! int *d_offsets; // e.g., [0, 3, 3, 7]
1651
+ //! int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
1652
+ //! int *d_keys_out; // e.g., [-, -, -, -, -, -, -]
1653
+ //! int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6]
1654
+ //! int *d_values_out; // e.g., [-, -, -, -, -, -, -]
1655
+ //! ...
1656
+ //!
1657
+ //! // Determine temporary device storage requirements
1658
+ //! void *d_temp_storage = nullptr;
1659
+ //! size_t temp_storage_bytes = 0;
1660
+ //! cub::DeviceSegmentedSort::SortPairsDescending(
1661
+ //! d_temp_storage, temp_storage_bytes,
1662
+ //! d_keys_in, d_keys_out, d_values_in, d_values_out,
1663
+ //! num_items, num_segments, d_offsets, d_offsets + 1);
1664
+ //!
1665
+ //! // Allocate temporary storage
1666
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
1667
+ //!
1668
+ //! // Run sorting operation
1669
+ //! cub::DeviceSegmentedSort::SortPairsDescending(
1670
+ //! d_temp_storage, temp_storage_bytes,
1671
+ //! d_keys_in, d_keys_out, d_values_in, d_values_out,
1672
+ //! num_items, num_segments, d_offsets, d_offsets + 1);
1673
+ //!
1674
+ //! // d_keys_out <-- [8, 7, 6, 9, 5, 3, 0]
1675
+ //! // d_values_out <-- [0, 2, 1, 6, 3, 4, 5]
1676
+ //!
1677
+ //! @endrst
1678
+ //!
1679
+ //! @tparam KeyT
1680
+ //! **[inferred]** Key type
1681
+ //!
1682
+ //! @tparam ValueT
1683
+ //! **[inferred]** Value type
1684
+ //!
1685
+ //! @tparam BeginOffsetIteratorT
1686
+ //! **[inferred]** Random-access input iterator type for reading segment
1687
+ //! beginning offsets @iterator
1688
+ //!
1689
+ //! @tparam EndOffsetIteratorT
1690
+ //! **[inferred]** Random-access input iterator type for reading segment
1691
+ //! ending offsets @iterator
1692
+ //!
1693
+ //! @param[in] d_temp_storage
1694
+ //! Device-accessible allocation of temporary storage. When nullptr, the
1695
+ //! required allocation size is written to `temp_storage_bytes` and no work
1696
+ //! is done.
1697
+ //!
1698
+ //! @param[in,out] temp_storage_bytes
1699
+ //! Reference to size in bytes of `d_temp_storage` allocation
1700
+ //!
1701
+ //! @param[in] d_keys_in
1702
+ //! Device-accessible pointer to the input data of key data to sort
1703
+ //!
1704
+ //! @param[out] d_keys_out
1705
+ //! Device-accessible pointer to the sorted output sequence of key data
1706
+ //!
1707
+ //! @param[in] d_values_in
1708
+ //! Device-accessible pointer to the corresponding input sequence of
1709
+ //! associated value items
1710
+ //!
1711
+ //! @param[out] d_values_out
1712
+ //! Device-accessible pointer to the correspondingly-reordered output
1713
+ //! sequence of associated value items
1714
+ //!
1715
+ //! @param[in] num_items
1716
+ //! The total number of items to sort (across all segments)
1717
+ //!
1718
+ //! @param[in] num_segments
1719
+ //! The number of segments that comprise the sorting data
1720
+ //!
1721
+ //! @param[in] d_begin_offsets
1722
+ //! @rst
1723
+ //! Random-access input iterator to the sequence of beginning offsets of
1724
+ //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first
1725
+ //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``
1726
+ //! @endrst
1727
+ //!
1728
+ //! @param[in] d_end_offsets
1729
+ //! @rst
1730
+ //! Random-access input iterator to the sequence of ending offsets of length
1731
+ //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of
1732
+ //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``.
1733
+ //! If ``d_end_offsets[i] <= d_begin_offsets[i]``, the i-th segment is
1734
+ //! considered empty.
1735
+ //! @endrst
1736
+ //!
1737
+ //! @param[in] stream
1738
+ //! @rst
1739
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
1740
+ //! @endrst
1741
+ template <typename KeyT, typename ValueT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
1742
+ CUB_RUNTIME_FUNCTION static cudaError_t SortPairsDescending( // public pointer-based overload: thin wrapper, the sort itself is delegated below
1743
+ void* d_temp_storage,
1744
+ size_t& temp_storage_bytes,
1745
+ const KeyT* d_keys_in,
1746
+ KeyT* d_keys_out,
1747
+ const ValueT* d_values_in,
1748
+ ValueT* d_values_out,
1749
+ ::cuda::std::int64_t num_items,
1750
+ ::cuda::std::int64_t num_segments,
1751
+ BeginOffsetIteratorT d_begin_offsets,
1752
+ EndOffsetIteratorT d_end_offsets,
1753
+ cudaStream_t stream = 0)
1754
+ {
1755
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); // NVTX range conditioned on d_temp_storage -- presumably inactive for the nullptr size query; confirm in the macro definition
1756
+ return SortPairsDescendingNoNVTX( // forwards every argument unchanged to the NVTX-free implementation and returns its cudaError_t
1757
+ d_temp_storage,
1758
+ temp_storage_bytes,
1759
+ d_keys_in,
1760
+ d_keys_out,
1761
+ d_values_in,
1762
+ d_values_out,
1763
+ num_items,
1764
+ num_segments,
1765
+ d_begin_offsets,
1766
+ d_end_offsets,
1767
+ stream);
1768
+ }
1769
+
1770
+ private:
1771
+ // Internal version without NVTX range: dispatches an ascending segmented sort over the key/value DoubleBuffers
1772
+ template <typename KeyT, typename ValueT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
1773
+ CUB_RUNTIME_FUNCTION static cudaError_t SortPairsNoNVTX(
1774
+ void* d_temp_storage,
1775
+ size_t& temp_storage_bytes,
1776
+ DoubleBuffer<KeyT>& d_keys,
1777
+ DoubleBuffer<ValueT>& d_values,
1778
+ ::cuda::std::int64_t num_items,
1779
+ ::cuda::std::int64_t num_segments,
1780
+ BeginOffsetIteratorT d_begin_offsets,
1781
+ EndOffsetIteratorT d_end_offsets,
1782
+ cudaStream_t stream = 0)
1783
+ {
1784
+ constexpr bool is_overwrite_okay = true; // forwarded to Dispatch; the public DoubleBuffer SortPairs documents that both buffers of each pair may be altered
1785
+
1786
+ using OffsetT = // offset type derived from both offset iterator types (judging by the detail helper names -- confirm)
1787
+ detail::choose_signed_offset_t<detail::common_iterator_value_t<BeginOffsetIteratorT, EndOffsetIteratorT>>;
1788
+ using DispatchT = // SortOrder::Ascending fixes the sort direction for this helper
1789
+ DispatchSegmentedSort<SortOrder::Ascending, KeyT, ValueT, OffsetT, BeginOffsetIteratorT, EndOffsetIteratorT>;
1790
+
1791
+ return DispatchT::Dispatch( // the cudaError_t from Dispatch is returned to the caller as-is
1792
+ d_temp_storage,
1793
+ temp_storage_bytes,
1794
+ d_keys,
1795
+ d_values,
1796
+ num_items,
1797
+ num_segments,
1798
+ d_begin_offsets,
1799
+ d_end_offsets,
1800
+ is_overwrite_okay,
1801
+ stream);
1802
+ }
1803
+
1804
+ public:
1805
+ //! @rst
1806
+ //! Sorts segments of key-value pairs into ascending order.
1807
+ //! Approximately ``2 * num_segments`` auxiliary storage required.
1808
+ //!
1809
+ //! - The sorting operation is given a pair of key buffers and a corresponding
1810
+ //! pair of associated value buffers. Each pair is managed by a DoubleBuffer
1811
+ //! structure that indicates which of the two buffers is "current" (and thus
1812
+ //! contains the input data to be sorted).
1813
+ //! - The contents of both buffers within each pair may be altered by the sorting
1814
+ //! operation.
1815
+ //! - Upon completion, the sorting operation will update the "current" indicator
1816
+ //! within each DoubleBuffer wrapper to reference which of the two buffers
1817
+ //! now contains the sorted output sequence (a function of the number of key bits
1818
+ //! specified and the targeted device architecture).
1819
+ //! - When the input is a contiguous sequence of segments, a single sequence
1820
+ //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased
1821
+ //! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where
1822
+ //! the latter is specified as ``segment_offsets + 1``).
1823
+ //! - SortPairs is not guaranteed to be stable. That is, suppose that ``i`` and
1824
+ //! ``j`` are equivalent: neither one is less than the other. It is not
1825
+ //! guaranteed that the relative order of these two elements will be
1826
+ //! preserved by sort.
1827
+ //! - Let ``cur`` be one of ``{d_keys.Current(), d_values.Current()}`` and ``alt``
1828
+ //! be any of ``{d_keys.Alternate(), d_values.Alternate()}``. The range
1829
+ //! ``[cur, cur + num_items)`` shall not overlap
1830
+ //! ``[alt, alt + num_items)``. Both ranges shall not overlap
1831
+ //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor
1832
+ //! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way.
1833
+ //! - Segments are not required to be contiguous. For all index values ``i``
1834
+ //! outside the specified segments ``d_keys.Current()[i]``,
1835
+ //! ``d_values.Current()[i]``, ``d_keys.Alternate()[i]``,
1836
+ //! ``d_values.Alternate()[i]`` will not be accessed nor modified.
1837
+ //!
1838
+ //! Snippet
1839
+ //! +++++++++++++++++++++++++++++++++++++++++++++
1840
+ //!
1841
+ //! The code snippet below illustrates the batched sorting of three segments
1842
+ //! (with one zero-length segment) of ``int`` keys with associated vector of
1842
+ //! ``int`` values.
1844
+ //!
1845
+ //! .. code-block:: c++
1846
+ //!
1847
+ //! #include <cub/cub.cuh>
1848
+ //! // or equivalently <cub/device/device_segmented_sort.cuh>
1849
+ //!
1850
+ //! // Declare, allocate, and initialize device-accessible pointers
1851
+ //! // for sorting data
1852
+ //! int num_items; // e.g., 7
1853
+ //! int num_segments; // e.g., 3
1854
+ //! int *d_offsets; // e.g., [0, 3, 3, 7]
1855
+ //! int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9]
1856
+ //! int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -]
1857
+ //! int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6]
1858
+ //! int *d_value_alt_buf; // e.g., [-, -, -, -, -, -, -]
1859
+ //! ...
1860
+ //!
1861
+ //! // Create a set of DoubleBuffers to wrap pairs of device pointers
1862
+ //! cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
1863
+ //! cub::DoubleBuffer<int> d_values(d_value_buf, d_value_alt_buf);
1864
+ //!
1865
+ //! // Determine temporary device storage requirements
1866
+ //! void *d_temp_storage = nullptr;
1867
+ //! size_t temp_storage_bytes = 0;
1868
+ //! cub::DeviceSegmentedSort::SortPairs(
1869
+ //! d_temp_storage, temp_storage_bytes, d_keys, d_values,
1870
+ //! num_items, num_segments, d_offsets, d_offsets + 1);
1871
+ //!
1872
+ //! // Allocate temporary storage
1873
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
1874
+ //!
1875
+ //! // Run sorting operation
1876
+ //! cub::DeviceSegmentedSort::SortPairs(
1877
+ //! d_temp_storage, temp_storage_bytes, d_keys, d_values,
1878
+ //! num_items, num_segments, d_offsets, d_offsets + 1);
1879
+ //!
1880
+ //! // d_keys.Current() <-- [6, 7, 8, 0, 3, 5, 9]
1881
+ //! // d_values.Current() <-- [1, 2, 0, 5, 4, 3, 6]
1882
+ //!
1883
+ //! @endrst
1884
+ //!
1885
+ //! @tparam KeyT
1886
+ //! **[inferred]** Key type
1887
+ //!
1888
+ //! @tparam ValueT
1889
+ //! **[inferred]** Value type
1890
+ //!
1891
+ //! @tparam BeginOffsetIteratorT
1892
+ //! **[inferred]** Random-access input iterator type for reading segment
1893
+ //! beginning offsets @iterator
1894
+ //!
1895
+ //! @tparam EndOffsetIteratorT
1896
+ //! **[inferred]** Random-access input iterator type for reading segment
1897
+ //! ending offsets @iterator
1898
+ //!
1899
+ //! @param[in] d_temp_storage
1900
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
1901
+ //! required allocation size is written to `temp_storage_bytes` and no work
1902
+ //! is done.
1903
+ //!
1904
+ //! @param[in,out] temp_storage_bytes
1905
+ //! Reference to size in bytes of `d_temp_storage` allocation
1906
+ //!
1907
+ //! @param[in,out] d_keys
1908
+ //! Reference to the double-buffer of keys whose "current" device-accessible
1909
+ //! buffer contains the unsorted input keys and, upon return, is updated to
1910
+ //! point to the sorted output keys
1911
+ //!
1912
+ //! @param[in,out] d_values
1913
+ //! Double-buffer of values whose "current" device-accessible buffer contains
1914
+ //! the unsorted input values and, upon return, is updated to point to the
1915
+ //! sorted output values
1916
+ //!
1917
+ //! @param[in] num_items
1918
+ //! The total number of items to sort (across all segments)
1919
+ //!
1920
+ //! @param[in] num_segments
1921
+ //! The number of segments that comprise the sorting data
1922
+ //!
1923
+ //! @param[in] d_begin_offsets
1924
+ //! @rst
1925
+ //! Random-access input iterator to the sequence of beginning offsets of
1926
+ //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first
1927
+ //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``
1928
+ //! @endrst
1929
+ //!
1930
+ //! @param[in] d_end_offsets
1931
+ //! @rst
1932
+ //! Random-access input iterator to the sequence of ending offsets of length
1933
+ //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of
1934
+ //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``.
1935
+ //! If ``d_end_offsets[i] <= d_begin_offsets[i]``, the i-th segment is
1936
+ //! considered empty.
1937
+ //! @endrst
1938
+ //!
1939
+ //! @param[in] stream
1940
+ //! @rst
1941
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
1942
+ //! @endrst
1943
+ template <typename KeyT, typename ValueT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
1944
+ CUB_RUNTIME_FUNCTION static cudaError_t SortPairs( // public DoubleBuffer overload: thin wrapper, the sort itself is delegated below
1945
+ void* d_temp_storage,
1946
+ size_t& temp_storage_bytes,
1947
+ DoubleBuffer<KeyT>& d_keys,
1948
+ DoubleBuffer<ValueT>& d_values,
1949
+ ::cuda::std::int64_t num_items,
1950
+ ::cuda::std::int64_t num_segments,
1951
+ BeginOffsetIteratorT d_begin_offsets,
1952
+ EndOffsetIteratorT d_end_offsets,
1953
+ cudaStream_t stream = 0)
1954
+ {
1955
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); // NVTX range conditioned on d_temp_storage -- presumably inactive for the nullptr size query; confirm in the macro definition
1956
+ return SortPairsNoNVTX( // forwards every argument unchanged to the NVTX-free DoubleBuffer implementation
1957
+ d_temp_storage,
1958
+ temp_storage_bytes,
1959
+ d_keys,
1960
+ d_values,
1961
+ num_items,
1962
+ num_segments,
1963
+ d_begin_offsets,
1964
+ d_end_offsets,
1965
+ stream);
1966
+ }
1967
+
1968
+ private:
1969
+ // Internal version without NVTX range: dispatches a descending segmented sort over the key/value DoubleBuffers
1970
+ template <typename KeyT, typename ValueT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
1971
+ CUB_RUNTIME_FUNCTION static cudaError_t SortPairsDescendingNoNVTX(
1972
+ void* d_temp_storage,
1973
+ size_t& temp_storage_bytes,
1974
+ DoubleBuffer<KeyT>& d_keys,
1975
+ DoubleBuffer<ValueT>& d_values,
1976
+ ::cuda::std::int64_t num_items,
1977
+ ::cuda::std::int64_t num_segments,
1978
+ BeginOffsetIteratorT d_begin_offsets,
1979
+ EndOffsetIteratorT d_end_offsets,
1980
+ cudaStream_t stream = 0)
1981
+ {
1982
+ constexpr bool is_overwrite_okay = true; // forwarded to Dispatch; the public DoubleBuffer SortPairsDescending documents that both buffers of each pair may be altered
1983
+
1984
+ using OffsetT = // offset type derived from both offset iterator types (judging by the detail helper names -- confirm)
1985
+ detail::choose_signed_offset_t<detail::common_iterator_value_t<BeginOffsetIteratorT, EndOffsetIteratorT>>;
1986
+ using DispatchT = // SortOrder::Descending fixes the sort direction for this helper
1987
+ DispatchSegmentedSort<SortOrder::Descending, KeyT, ValueT, OffsetT, BeginOffsetIteratorT, EndOffsetIteratorT>;
1988
+
1989
+ return DispatchT::Dispatch( // the cudaError_t from Dispatch is returned to the caller as-is
1990
+ d_temp_storage,
1991
+ temp_storage_bytes,
1992
+ d_keys,
1993
+ d_values,
1994
+ num_items,
1995
+ num_segments,
1996
+ d_begin_offsets,
1997
+ d_end_offsets,
1998
+ is_overwrite_okay,
1999
+ stream);
2000
+ }
2001
+
2002
+ public:
2003
+ //! @rst
2004
+ //! Sorts segments of key-value pairs into descending order.
2005
+ //! Approximately ``2 * num_segments`` auxiliary storage required.
2006
+ //!
2007
+ //! - The sorting operation is given a pair of key buffers and a corresponding
2008
+ //! pair of associated value buffers. Each pair is managed by a DoubleBuffer
2009
+ //! structure that indicates which of the two buffers is "current" (and thus
2010
+ //! contains the input data to be sorted).
2011
+ //! - The contents of both buffers within each pair may be altered by the
2012
+ //! sorting operation.
2013
+ //! - Upon completion, the sorting operation will update the "current"
2014
+ //! indicator within each DoubleBuffer wrapper to reference which of the two
2015
+ //! buffers now contains the sorted output sequence (a function of the number
2016
+ //! of key bits specified and the targeted device architecture).
2017
+ //! - When the input is a contiguous sequence of segments, a single sequence
2018
+ //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased
2019
+ //! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where
2020
+ //! the latter is specified as ``segment_offsets + 1``).
2021
+ //! - SortPairsDescending is not guaranteed to be stable. That is, suppose that
2022
+ //! ``i`` and ``j`` are equivalent: neither one is less than the other. It is
2023
+ //! not guaranteed that the relative order of these two elements will be
2024
+ //! preserved by sort.
2025
+ //! - Let ``cur`` be one of ``{d_keys.Current(), d_values.Current()}`` and ``alt``
2026
+ //! be any of ``{d_keys.Alternate(), d_values.Alternate()}``. The range
2027
+ //! ``[cur, cur + num_items)`` shall not overlap
2028
+ //! ``[alt, alt + num_items)``. Both ranges shall not overlap
2029
+ //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor
2030
+ //! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way.
2031
+ //! - Segments are not required to be contiguous. For all index values ``i``
2032
+ //! outside the specified segments ``d_keys.Current()[i]``,
2033
+ //! ``d_values.Current()[i]``, ``d_keys.Alternate()[i]``,
2034
+ //! ``d_values.Alternate()[i]`` will not be accessed nor modified.
2035
+ //!
2036
+ //! Snippet
2037
+ //! +++++++++++++++++++++++++++++++++++++++++++++
2038
+ //!
2039
+ //! The code snippet below illustrates the batched sorting of three segments
2040
+ //! (with one zero-length segment) of ``int`` keys with associated vector of
2040
+ //! ``int`` values.
2042
+ //!
2043
+ //! .. code-block:: c++
2044
+ //!
2045
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_segmented_sort.cuh>
2046
+ //!
2047
+ //! // Declare, allocate, and initialize device-accessible pointers for
2048
+ //! // sorting data
2049
+ //! int num_items; // e.g., 7
2050
+ //! int num_segments; // e.g., 3
2051
+ //! int *d_offsets; // e.g., [0, 3, 3, 7]
2052
+ //! int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9]
2053
+ //! int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -]
2054
+ //! int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6]
2055
+ //! int *d_value_alt_buf; // e.g., [-, -, -, -, -, -, -]
2056
+ //! ...
2057
+ //!
2058
+ //! // Create a set of DoubleBuffers to wrap pairs of device pointers
2059
+ //! cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
2060
+ //! cub::DoubleBuffer<int> d_values(d_value_buf, d_value_alt_buf);
2061
+ //!
2062
+ //! // Determine temporary device storage requirements
2063
+ //! void *d_temp_storage = nullptr;
2064
+ //! size_t temp_storage_bytes = 0;
2065
+ //! cub::DeviceSegmentedSort::SortPairsDescending(
2066
+ //! d_temp_storage, temp_storage_bytes, d_keys, d_values,
2067
+ //! num_items, num_segments, d_offsets, d_offsets + 1);
2068
+ //!
2069
+ //! // Allocate temporary storage
2070
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
2071
+ //!
2072
+ //! // Run sorting operation
2073
+ //! cub::DeviceSegmentedSort::SortPairsDescending(
2074
+ //! d_temp_storage, temp_storage_bytes, d_keys, d_values,
2075
+ //! num_items, num_segments, d_offsets, d_offsets + 1);
2076
+ //!
2077
+ //! // d_keys.Current() <-- [8, 7, 6, 9, 5, 3, 0]
2078
+ //! // d_values.Current() <-- [0, 2, 1, 6, 3, 4, 5]
2079
+ //!
2080
+ //! @endrst
2081
+ //!
2082
+ //! @tparam KeyT
2083
+ //! **[inferred]** Key type
2084
+ //!
2085
+ //! @tparam ValueT
2086
+ //! **[inferred]** Value type
2087
+ //!
2088
+ //! @tparam BeginOffsetIteratorT
2089
+ //! **[inferred]** Random-access input iterator type for reading segment
2090
+ //! beginning offsets @iterator
2091
+ //!
2092
+ //! @tparam EndOffsetIteratorT
2093
+ //! **[inferred]** Random-access input iterator type for reading segment
2094
+ //! ending offsets @iterator
2095
+ //!
2096
+ //! @param[in] d_temp_storage
2097
+ //! Device-accessible allocation of temporary storage. When nullptr, the
2098
+ //! required allocation size is written to `temp_storage_bytes` and no work
2099
+ //! is done
2100
+ //!
2101
+ //! @param[in,out] temp_storage_bytes
2102
+ //! Reference to size in bytes of `d_temp_storage` allocation
2103
+ //!
2104
+ //! @param[in,out] d_keys
2105
+ //! Reference to the double-buffer of keys whose "current" device-accessible
2106
+ //! buffer contains the unsorted input keys and, upon return, is updated to
2107
+ //! point to the sorted output keys
2108
+ //!
2109
+ //! @param[in,out] d_values
2110
+ //! Double-buffer of values whose "current" device-accessible buffer contains
2111
+ //! the unsorted input values and, upon return, is updated to point to the
2112
+ //! sorted output values
2113
+ //!
2114
+ //! @param[in] num_items
2115
+ //! The total number of items to sort (across all segments)
2116
+ //!
2117
+ //! @param[in] num_segments
2118
+ //! The number of segments that comprise the sorting data
2119
+ //!
2120
+ //! @param[in] d_begin_offsets
2121
+ //! @rst
2122
+ //! Random-access input iterator to the sequence of beginning offsets of
2123
+ //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first
2124
+ //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``
2125
+ //! @endrst
2126
+ //!
2127
+ //! @param[in] d_end_offsets
2128
+ //! @rst
2129
+ //! Random-access input iterator to the sequence of ending offsets of length
2130
+ //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of
2131
+ //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``.
2132
+ //! If ``d_end_offsets[i] <= d_begin_offsets[i]``, the ``i``-th segment is
2133
+ //! considered empty.
2134
+ //! @endrst
2135
+ //!
2136
+ //! @param[in] stream
2137
+ //! @rst
2138
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
2139
+ //! @endrst
2140
+ template <typename KeyT, typename ValueT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
2141
+ CUB_RUNTIME_FUNCTION static cudaError_t SortPairsDescending( // public DoubleBuffer overload: thin wrapper, the sort itself is delegated below
2142
+ void* d_temp_storage,
2143
+ size_t& temp_storage_bytes,
2144
+ DoubleBuffer<KeyT>& d_keys,
2145
+ DoubleBuffer<ValueT>& d_values,
2146
+ ::cuda::std::int64_t num_items,
2147
+ ::cuda::std::int64_t num_segments,
2148
+ BeginOffsetIteratorT d_begin_offsets,
2149
+ EndOffsetIteratorT d_end_offsets,
2150
+ cudaStream_t stream = 0)
2151
+ {
2152
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); // NVTX range conditioned on d_temp_storage -- presumably inactive for the nullptr size query; confirm in the macro definition
2153
+ return SortPairsDescendingNoNVTX( // forwards every argument unchanged to the NVTX-free DoubleBuffer implementation
2154
+ d_temp_storage,
2155
+ temp_storage_bytes,
2156
+ d_keys,
2157
+ d_values,
2158
+ num_items,
2159
+ num_segments,
2160
+ d_begin_offsets,
2161
+ d_end_offsets,
2162
+ stream);
2163
+ }
2164
+
2165
+ //! @rst
2166
+ //! Sorts segments of key-value pairs into ascending order.
2167
+ //! Approximately ``2 * num_items + 2 * num_segments`` auxiliary storage required.
2168
+ //!
2169
+ //! - The contents of the input data are not altered by the sorting operation.
2170
+ //! - When the input is a contiguous sequence of segments, a single sequence
2171
+ //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased
2172
+ //! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where
2173
+ //! the latter is specified as ``segment_offsets + 1``).
2174
+ //! - StableSortPairs is stable: it preserves the relative ordering of
2175
+ //! equivalent elements. That is, if ``x`` and ``y`` are elements such that
2176
+ //! ``x`` precedes ``y``, and if the two elements are equivalent (neither
2177
+ //! ``x < y`` nor ``y < x``) then a postcondition of stable sort is that
2178
+ //! ``x`` still precedes ``y``.
2179
+ //! - Let ``in`` be one of ``{d_keys_in, d_values_in}`` and ``out`` be any of
2180
+ //! ``{d_keys_out, d_values_out}``. The range ``[out, out + num_items)`` shall
2181
+ //! not overlap ``[in, in + num_items)``,
2182
+ //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor
2183
+ //! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way.
2184
+ //! - Segments are not required to be contiguous. For all index values ``i``
2185
+ //! outside the specified segments ``d_keys_in[i]``, ``d_values_in[i]``,
2186
+ //! ``d_keys_out[i]``, ``d_values_out[i]`` will not be accessed nor modified.
2187
+ //!
2188
+ //! Snippet
2189
+ //! +++++++++++++++++++++++++++++++++++++++++++++
2190
+ //!
2191
+ //! The code snippet below illustrates the batched sorting of three segments
2192
+ //! (with one zero-length segment) of ``int`` keys with associated vector of
2192
+ //! ``int`` values.
2194
+ //!
2195
+ //! .. code-block:: c++
2196
+ //!
2197
+ //! #include <cub/cub.cuh>
2198
+ //! // or equivalently <cub/device/device_segmented_sort.cuh>
2199
+ //!
2200
+ //! // Declare, allocate, and initialize device-accessible pointers
2201
+ //! // for sorting data
2202
+ //! int num_items; // e.g., 7
2203
+ //! int num_segments; // e.g., 3
2204
+ //! int *d_offsets; // e.g., [0, 3, 3, 7]
2205
+ //! int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
2206
+ //! int *d_keys_out; // e.g., [-, -, -, -, -, -, -]
2207
+ //! int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6]
2208
+ //! int *d_values_out; // e.g., [-, -, -, -, -, -, -]
2209
+ //! ...
2210
+ //!
2211
+ //! // Determine temporary device storage requirements
2212
+ //! void *d_temp_storage = nullptr;
2213
+ //! size_t temp_storage_bytes = 0;
2214
+ //! cub::DeviceSegmentedSort::StableSortPairs(
2215
+ //! d_temp_storage, temp_storage_bytes,
2216
+ //! d_keys_in, d_keys_out, d_values_in, d_values_out,
2217
+ //! num_items, num_segments, d_offsets, d_offsets + 1);
2218
+ //!
2219
+ //! // Allocate temporary storage
2220
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
2221
+ //!
2222
+ //! // Run sorting operation
2223
+ //! cub::DeviceSegmentedSort::StableSortPairs(
2224
+ //! d_temp_storage, temp_storage_bytes,
2225
+ //! d_keys_in, d_keys_out, d_values_in, d_values_out,
2226
+ //! num_items, num_segments, d_offsets, d_offsets + 1);
2227
+ //!
2228
+ //! // d_keys_out <-- [6, 7, 8, 0, 3, 5, 9]
2229
+ //! // d_values_out <-- [1, 2, 0, 5, 4, 3, 6]
2230
+ //!
2231
+ //! @endrst
2232
+ //!
2233
+ //! @tparam KeyT
2234
+ //! **[inferred]** Key type
2235
+ //!
2236
+ //! @tparam ValueT
2237
+ //! **[inferred]** Value type
2238
+ //!
2239
+ //! @tparam BeginOffsetIteratorT
2240
+ //! **[inferred]** Random-access input iterator type for reading segment
2241
+ //! beginning offsets @iterator
2242
+ //!
2243
+ //! @tparam EndOffsetIteratorT
2244
+ //! **[inferred]** Random-access input iterator type for reading segment
2245
+ //! ending offsets @iterator
2246
+ //!
2247
+ //! @param[in] d_temp_storage
2248
+ //! Device-accessible allocation of temporary storage. When nullptr, the
2249
+ //! required allocation size is written to `temp_storage_bytes` and no work is done.
2250
+ //!
2251
+ //! @param[in,out] temp_storage_bytes
2252
+ //! Reference to size in bytes of `d_temp_storage` allocation
2253
+ //!
2254
+ //! @param[in] d_keys_in
2255
+ //! Device-accessible pointer to the input data of key data to sort
2256
+ //!
2257
+ //! @param[out] d_keys_out
2258
+ //! Device-accessible pointer to the sorted output sequence of key data
2259
+ //!
2260
+ //! @param[in] d_values_in
2261
+ //! Device-accessible pointer to the corresponding input sequence of
2262
+ //! associated value items
2263
+ //!
2264
+ //! @param[out] d_values_out
2265
+ //! Device-accessible pointer to the correspondingly-reordered output
2266
+ //! sequence of associated value items
2267
+ //!
2268
+ //! @param[in] num_items
2269
+ //! The total number of items to sort (across all segments)
2270
+ //!
2271
+ //! @param[in] num_segments
2272
+ //! The number of segments that comprise the sorting data
2273
+ //!
2274
+ //! @param[in] d_begin_offsets
2275
+ //! @rst
2276
+ //! Random-access input iterator to the sequence of beginning offsets of
2277
+ //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first
2278
+ //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``
2279
+ //! @endrst
2280
+ //!
2281
+ //! @param[in] d_end_offsets
2282
+ //! @rst
2283
+ //! Random-access input iterator to the sequence of ending offsets of length
2284
+ //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of
2285
+ //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``.
2286
+ //! If ``d_end_offsets[i] <= d_begin_offsets[i]``, the ``i``-th segment is
2287
+ //! considered empty.
2288
+ //! @endrst
2289
+ //!
2290
+ //! @param[in] stream
2291
+ //! @rst
2292
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
2293
+ //! @endrst
2294
+ template <typename KeyT, typename ValueT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
2295
+ CUB_RUNTIME_FUNCTION static cudaError_t StableSortPairs( // public pointer-based stable overload: thin wrapper, the sort itself is delegated below
2296
+ void* d_temp_storage,
2297
+ size_t& temp_storage_bytes,
2298
+ const KeyT* d_keys_in,
2299
+ KeyT* d_keys_out,
2300
+ const ValueT* d_values_in,
2301
+ ValueT* d_values_out,
2302
+ ::cuda::std::int64_t num_items,
2303
+ ::cuda::std::int64_t num_segments,
2304
+ BeginOffsetIteratorT d_begin_offsets,
2305
+ EndOffsetIteratorT d_end_offsets,
2306
+ cudaStream_t stream = 0)
2307
+ {
2308
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); // NVTX range conditioned on d_temp_storage -- presumably inactive for the nullptr size query; confirm in the macro definition
2309
+ return SortPairsNoNVTX<KeyT, ValueT, BeginOffsetIteratorT, EndOffsetIteratorT>( // delegates to SortPairsNoNVTX with explicit template arguments and pointer arguments; no separate stable code path is used here
2310
+ d_temp_storage,
2311
+ temp_storage_bytes,
2312
+ d_keys_in,
2313
+ d_keys_out,
2314
+ d_values_in,
2315
+ d_values_out,
2316
+ num_items,
2317
+ num_segments,
2318
+ d_begin_offsets,
2319
+ d_end_offsets,
2320
+ stream);
2321
+ }
2322
+
2323
+ //! @rst
2324
+ //! Sorts segments of key-value pairs into descending order.
2325
+ //! Approximately ``2 * num_items + 2 * num_segments`` auxiliary storage required.
2326
+ //!
2327
+ //! - The contents of the input data are not altered by the sorting operation.
2328
+ //! - When the input is a contiguous sequence of segments, a single sequence
2329
+ //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased
2330
+ //! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where
2331
+ //! the latter is specified as ``segment_offsets + 1``).
2332
+ //! - StableSortPairsDescending is stable: it preserves the relative ordering
2333
+ //! of equivalent elements. That is, if ``x`` and ``y`` are elements such that
2334
+ //! ``x`` precedes ``y``, and if the two elements are equivalent (neither
2335
+ //! ``x < y`` nor ``y < x``) then a postcondition of stable sort is that
2336
+ //! ``x`` still precedes ``y``.
2337
+ //! - Let ``in`` be one of ``{d_keys_in, d_values_in}`` and ``out`` be any of
2338
+ //! ``{d_keys_out, d_values_out}``. The range ``[out, out + num_items)`` shall
2339
+ //! not overlap ``[in, in + num_items)``,
2340
+ //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor
2341
+ //! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way.
2342
+ //! - Segments are not required to be contiguous. For all index values ``i``
2343
+ //! outside the specified segments ``d_keys_in[i]``, ``d_values_in[i]``,
2344
+ //! ``d_keys_out[i]``, ``d_values_out[i]`` will not be accessed nor modified.
2345
+ //!
2346
+ //! Snippet
2347
+ //! +++++++++++++++++++++++++++++++++++++++++++++
2348
+ //!
2349
+ //! The code snippet below illustrates the batched sorting of three segments
2350
+ //! (with one zero-length segment) of ``int`` keys with associated vector of
2350
+ //! ``int`` values.
2352
+ //!
2353
+ //! .. code-block:: c++
2354
+ //!
2355
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_segmented_sort.cuh>
2356
+ //!
2357
+ //! // Declare, allocate, and initialize device-accessible pointers
2358
+ //! // for sorting data
2359
+ //! int num_items; // e.g., 7
2360
+ //! int num_segments; // e.g., 3
2361
+ //! int *d_offsets; // e.g., [0, 3, 3, 7]
2362
+ //! int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
2363
+ //! int *d_keys_out; // e.g., [-, -, -, -, -, -, -]
2364
+ //! int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6]
2365
+ //! int *d_values_out; // e.g., [-, -, -, -, -, -, -]
2366
+ //! ...
2367
+ //!
2368
+ //! // Determine temporary device storage requirements
2369
+ //! void *d_temp_storage = nullptr;
2370
+ //! size_t temp_storage_bytes = 0;
2371
+ //! cub::DeviceSegmentedSort::StableSortPairsDescending(
2372
+ //! d_temp_storage, temp_storage_bytes,
2373
+ //! d_keys_in, d_keys_out, d_values_in, d_values_out,
2374
+ //! num_items, num_segments, d_offsets, d_offsets + 1);
2375
+ //!
2376
+ //! // Allocate temporary storage
2377
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
2378
+ //!
2379
+ //! // Run sorting operation
2380
+ //! cub::DeviceSegmentedSort::StableSortPairsDescending(
2381
+ //! d_temp_storage, temp_storage_bytes,
2382
+ //! d_keys_in, d_keys_out, d_values_in, d_values_out,
2383
+ //! num_items, num_segments, d_offsets, d_offsets + 1);
2384
+ //!
2385
+ //! // d_keys_out <-- [8, 7, 6, 9, 5, 3, 0]
2386
+ //! // d_values_out <-- [0, 2, 1, 6, 3, 4, 5]
2387
+ //!
2388
+ //! @endrst
2389
+ //!
2390
+ //! @tparam KeyT
2391
+ //! **[inferred]** Key type
2392
+ //!
2393
+ //! @tparam ValueT
2394
+ //! **[inferred]** Value type
2395
+ //!
2396
+ //! @tparam BeginOffsetIteratorT
2397
+ //! **[inferred]** Random-access input iterator type for reading segment
2398
+ //! beginning offsets @iterator
2399
+ //!
2400
+ //! @tparam EndOffsetIteratorT
2401
+ //! **[inferred]** Random-access input iterator type for reading segment
2402
+ //! ending offsets @iterator
2403
+ //!
2404
+ //! @param[in] d_temp_storage
2405
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
2406
+ //! required allocation size is written to `temp_storage_bytes` and no work
2407
+ //! is done
2408
+ //!
2409
+ //! @param[in,out] temp_storage_bytes
2410
+ //! Reference to size in bytes of `d_temp_storage` allocation
2411
+ //!
2412
+ //! @param[in] d_keys_in
2413
+ //! Device-accessible pointer to the input data of key data to sort
2414
+ //!
2415
+ //! @param[out] d_keys_out
2416
+ //! Device-accessible pointer to the sorted output sequence of key data
2417
+ //!
2418
+ //! @param[in] d_values_in
2419
+ //! Device-accessible pointer to the corresponding input sequence of
2420
+ //! associated value items
2421
+ //!
2422
+ //! @param[out] d_values_out
2423
+ //! Device-accessible pointer to the correspondingly-reordered output
2424
+ //! sequence of associated value items
2425
+ //!
2426
+ //! @param[in] num_items
2427
+ //! The total number of items to sort (across all segments)
2428
+ //!
2429
+ //! @param[in] num_segments
2430
+ //! The number of segments that comprise the sorting data
2431
+ //!
2432
+ //! @param[in] d_begin_offsets
2433
+ //! @rst
2434
+ //! Random-access input iterator to the sequence of beginning offsets of
2435
+ //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first
2436
+ //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``
2437
+ //! @endrst
2438
+ //!
2439
+ //! @param[in] d_end_offsets
2440
+ //! @rst
2441
+ //! Random-access input iterator to the sequence of ending offsets of length
2442
+ //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of
2443
+ //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``.
2444
+ //! If ``d_end_offsets[i] <= d_begin_offsets[i]``, the ``i``-th segment is
2445
+ //! considered empty.
2446
+ //! @endrst
2447
+ //!
2448
+ //! @param[in] stream
2449
+ //! @rst
2450
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
2451
+ //! @endrst
2452
+ template <typename KeyT, typename ValueT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
2453
+ CUB_RUNTIME_FUNCTION static cudaError_t StableSortPairsDescending(
2454
+ void* d_temp_storage,
2455
+ size_t& temp_storage_bytes,
2456
+ const KeyT* d_keys_in,
2457
+ KeyT* d_keys_out,
2458
+ const ValueT* d_values_in,
2459
+ ValueT* d_values_out,
2460
+ ::cuda::std::int64_t num_items,
2461
+ ::cuda::std::int64_t num_segments,
2462
+ BeginOffsetIteratorT d_begin_offsets,
2463
+ EndOffsetIteratorT d_end_offsets,
2464
+ cudaStream_t stream = 0)
2465
+ { // Thin wrapper: this body contains no sorting logic of its own, only the delegation below.
2466
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); // NOTE(review): presumably opens a profiling range only when d_temp_storage is non-null -- confirm against the macro definition.
2467
+ return SortPairsDescendingNoNVTX<KeyT, ValueT, BeginOffsetIteratorT, EndOffsetIteratorT>( // All eleven arguments are forwarded unchanged, in declaration order.
2468
+ d_temp_storage,
2469
+ temp_storage_bytes,
2470
+ d_keys_in,
2471
+ d_keys_out,
2472
+ d_values_in,
2473
+ d_values_out,
2474
+ num_items,
2475
+ num_segments,
2476
+ d_begin_offsets,
2477
+ d_end_offsets,
2478
+ stream);
2479
+ }
2480
+
2481
+ //! @rst
2482
+ //! Sorts segments of key-value pairs into ascending order.
2483
+ //! Approximately ``2 * num_segments`` auxiliary storage required.
2484
+ //!
2485
+ //! - The sorting operation is given a pair of key buffers and a corresponding
2486
+ //! pair of associated value buffers. Each pair is managed by a DoubleBuffer
2487
+ //! structure that indicates which of the two buffers is "current" (and thus
2488
+ //! contains the input data to be sorted).
2489
+ //! - The contents of both buffers within each pair may be altered by the
2490
+ //! sorting operation.
2491
+ //! - Upon completion, the sorting operation will update the "current"
2492
+ //! indicator within each DoubleBuffer wrapper to reference which of the two
2493
+ //! buffers now contains the sorted output sequence (a function of the number
2494
+ //! of key bits specified and the targeted device architecture).
2495
+ //! - When the input is a contiguous sequence of segments, a single sequence
2496
+ //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased
2497
+ //! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where
2498
+ //! the latter is specified as ``segment_offsets + 1``).
2499
+ //! - StableSortPairs is stable: it preserves the relative ordering
2500
+ //! of equivalent elements. That is, if ``x`` and ``y`` are elements such that
2501
+ //! ``x`` precedes ``y``, and if the two elements are equivalent (neither
2502
+ //! ``x < y`` nor ``y < x``) then a postcondition of stable sort is that
2503
+ //! ``x`` still precedes ``y``.
2504
+ //! - Let ``cur`` be one of ``{d_keys.Current(), d_values.Current()}`` and ``alt``
2505
+ //! be any of ``{d_keys.Alternate(), d_values.Alternate()}``. The range
2506
+ //! ``[cur, cur + num_items)`` shall not overlap
2507
+ //! ``[alt, alt + num_items)``. Both ranges shall not overlap
2508
+ //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor
2509
+ //! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way.
2510
+ //! - Segments are not required to be contiguous. For all index values ``i``
2511
+ //! outside the specified segments ``d_keys.Current()[i]``,
2512
+ //! ``d_values.Current()[i]``, ``d_keys.Alternate()[i]``,
2513
+ //! ``d_values.Alternate()[i]`` will not be accessed nor modified.
2514
+ //!
2515
+ //! Snippet
2516
+ //! +++++++++++++++++++++++++++++++++++++++++++++
2517
+ //!
2518
+ //! The code snippet below illustrates the batched sorting of three segments
2519
+ //! (with one zero-length segment) of ``int`` keys with associated vector of
2519
+ //! ``int`` values.
2521
+ //!
2522
+ //! .. code-block:: c++
2523
+ //!
2524
+ //! #include <cub/cub.cuh>
2525
+ //! // or equivalently <cub/device/device_segmented_sort.cuh>
2526
+ //!
2527
+ //! // Declare, allocate, and initialize device-accessible pointers
2528
+ //! // for sorting data
2529
+ //! int num_items; // e.g., 7
2530
+ //! int num_segments; // e.g., 3
2531
+ //! int *d_offsets; // e.g., [0, 3, 3, 7]
2532
+ //! int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9]
2533
+ //! int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -]
2534
+ //! int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6]
2535
+ //! int *d_value_alt_buf; // e.g., [-, -, -, -, -, -, -]
2536
+ //! ...
2537
+ //!
2538
+ //! // Create a set of DoubleBuffers to wrap pairs of device pointers
2539
+ //! cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
2540
+ //! cub::DoubleBuffer<int> d_values(d_value_buf, d_value_alt_buf);
2541
+ //!
2542
+ //! // Determine temporary device storage requirements
2543
+ //! void *d_temp_storage = nullptr;
2544
+ //! size_t temp_storage_bytes = 0;
2545
+ //! cub::DeviceSegmentedSort::StableSortPairs(
2546
+ //! d_temp_storage, temp_storage_bytes, d_keys, d_values,
2547
+ //! num_items, num_segments, d_offsets, d_offsets + 1);
2548
+ //!
2549
+ //! // Allocate temporary storage
2550
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
2551
+ //!
2552
+ //! // Run sorting operation
2553
+ //! cub::DeviceSegmentedSort::StableSortPairs(
2554
+ //! d_temp_storage, temp_storage_bytes, d_keys, d_values,
2555
+ //! num_items, num_segments, d_offsets, d_offsets + 1);
2556
+ //!
2557
+ //! // d_keys.Current() <-- [6, 7, 8, 0, 3, 5, 9]
2558
+ //! // d_values.Current() <-- [1, 2, 0, 5, 4, 3, 6]
2559
+ //!
2560
+ //! @endrst
2561
+ //!
2562
+ //! @tparam KeyT
2563
+ //! **[inferred]** Key type
2564
+ //!
2565
+ //! @tparam ValueT
2566
+ //! **[inferred]** Value type
2567
+ //!
2568
+ //! @tparam BeginOffsetIteratorT
2569
+ //! **[inferred]** Random-access input iterator type for reading segment
2570
+ //! beginning offsets @iterator
2571
+ //!
2572
+ //! @tparam EndOffsetIteratorT
2573
+ //! **[inferred]** Random-access input iterator type for reading segment
2574
+ //! ending offsets @iterator
2575
+ //!
2576
+ //! @param[in] d_temp_storage
2577
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
2578
+ //! required allocation size is written to `temp_storage_bytes` and no work
2579
+ //! is done
2580
+ //!
2581
+ //! @param[in,out] temp_storage_bytes
2582
+ //! Reference to size in bytes of `d_temp_storage` allocation
2583
+ //!
2584
+ //! @param[in,out] d_keys
2585
+ //! Reference to the double-buffer of keys whose "current" device-accessible
2586
+ //! buffer contains the unsorted input keys and, upon return, is updated to
2587
+ //! point to the sorted output keys
2588
+ //!
2589
+ //! @param[in,out] d_values
2590
+ //! Double-buffer of values whose "current" device-accessible buffer contains
2591
+ //! the unsorted input values and, upon return, is updated to point to the
2592
+ //! sorted output values
2593
+ //!
2594
+ //! @param[in] num_items
2595
+ //! The total number of items to sort (across all segments)
2596
+ //!
2597
+ //! @param[in] num_segments
2598
+ //! The number of segments that comprise the sorting data
2599
+ //!
2600
+ //! @param[in] d_begin_offsets
2601
+ //! @rst
2602
+ //! Random-access input iterator to the sequence of beginning offsets of
2603
+ //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first
2604
+ //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``
2605
+ //! @endrst
2606
+ //!
2607
+ //! @param[in] d_end_offsets
2608
+ //! @rst
2609
+ //! Random-access input iterator to the sequence of ending offsets of length
2610
+ //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of
2611
+ //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``.
2612
+ //! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the ``i``-th segment is
2613
+ //! considered empty.
2614
+ //! @endrst
2615
+ //!
2616
+ //! @param[in] stream
2617
+ //! @rst
2618
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
2619
+ //! @endrst
2620
+ template <typename KeyT, typename ValueT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
2621
+ CUB_RUNTIME_FUNCTION static cudaError_t StableSortPairs(
2622
+ void* d_temp_storage,
2623
+ size_t& temp_storage_bytes,
2624
+ DoubleBuffer<KeyT>& d_keys,
2625
+ DoubleBuffer<ValueT>& d_values,
2626
+ ::cuda::std::int64_t num_items,
2627
+ ::cuda::std::int64_t num_segments,
2628
+ BeginOffsetIteratorT d_begin_offsets,
2629
+ EndOffsetIteratorT d_end_offsets,
2630
+ cudaStream_t stream = 0)
2631
+ { // Thin wrapper: this body contains no sorting logic of its own, only the delegation below.
2632
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); // NOTE(review): presumably opens a profiling range only when d_temp_storage is non-null -- confirm against the macro definition.
2633
+ return SortPairsNoNVTX<KeyT, ValueT, BeginOffsetIteratorT, EndOffsetIteratorT>( // All nine arguments are forwarded unchanged; the DoubleBuffer wrappers are passed by reference.
2634
+ d_temp_storage,
2635
+ temp_storage_bytes,
2636
+ d_keys,
2637
+ d_values,
2638
+ num_items,
2639
+ num_segments,
2640
+ d_begin_offsets,
2641
+ d_end_offsets,
2642
+ stream);
2643
+ }
2644
+
2645
+ //! @rst
2646
+ //! Sorts segments of key-value pairs into descending order.
2647
+ //! Approximately ``2 * num_segments`` auxiliary storage required.
2648
+ //!
2649
+ //! - The sorting operation is given a pair of key buffers and a corresponding
2650
+ //! pair of associated value buffers. Each pair is managed by a DoubleBuffer
2651
+ //! structure that indicates which of the two buffers is "current" (and thus
2652
+ //! contains the input data to be sorted).
2653
+ //! - The contents of both buffers within each pair may be altered by the sorting
2654
+ //! operation.
2655
+ //! - Upon completion, the sorting operation will update the "current" indicator
2656
+ //! within each DoubleBuffer wrapper to reference which of the two buffers
2657
+ //! now contains the sorted output sequence (a function of the number of key bits
2658
+ //! specified and the targeted device architecture).
2659
+ //! - When the input is a contiguous sequence of segments, a single sequence
2660
+ //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased
2661
+ //! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where
2662
+ //! the latter is specified as ``segment_offsets + 1``).
2663
+ //! - StableSortPairsDescending is stable: it preserves the relative ordering
2664
+ //! of equivalent elements. That is, if ``x`` and ``y`` are elements such that
2665
+ //! ``x`` precedes ``y``, and if the two elements are equivalent (neither
2666
+ //! ``x < y`` nor ``y < x``) then a postcondition of stable sort is that
2667
+ //! ``x`` still precedes ``y``.
2668
+ //! - Let ``cur`` be one of ``{d_keys.Current(), d_values.Current()}`` and ``alt``
2669
+ //! be any of ``{d_keys.Alternate(), d_values.Alternate()}``. The range
2670
+ //! ``[cur, cur + num_items)`` shall not overlap
2671
+ //! ``[alt, alt + num_items)``. Both ranges shall not overlap
2672
+ //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor
2673
+ //! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way.
2674
+ //! - Segments are not required to be contiguous. For all index values ``i``
2675
+ //! outside the specified segments ``d_keys.Current()[i]``,
2676
+ //! ``d_values.Current()[i]``, ``d_keys.Alternate()[i]``,
2677
+ //! ``d_values.Alternate()[i]`` will not be accessed nor modified.
2678
+ //!
2679
+ //! Snippet
2680
+ //! +++++++++++++++++++++++++++++++++++++++++++++
2681
+ //!
2682
+ //! The code snippet below illustrates the batched sorting of three segments
2683
+ //! (with one zero-length segment) of ``int`` keys with associated vector of
2683
+ //! ``int`` values.
2685
+ //!
2686
+ //! .. code-block:: c++
2687
+ //!
2688
+ //! #include <cub/cub.cuh> // or equivalently <cub/device/device_segmented_sort.cuh>
2689
+ //!
2690
+ //! // Declare, allocate, and initialize device-accessible pointers
2691
+ //! // for sorting data
2692
+ //! int num_items; // e.g., 7
2693
+ //! int num_segments; // e.g., 3
2694
+ //! int *d_offsets; // e.g., [0, 3, 3, 7]
2695
+ //! int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9]
2696
+ //! int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -]
2697
+ //! int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6]
2698
+ //! int *d_value_alt_buf; // e.g., [-, -, -, -, -, -, -]
2699
+ //! ...
2700
+ //!
2701
+ //! // Create a set of DoubleBuffers to wrap pairs of device pointers
2702
+ //! cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
2703
+ //! cub::DoubleBuffer<int> d_values(d_value_buf, d_value_alt_buf);
2704
+ //!
2705
+ //! // Determine temporary device storage requirements
2706
+ //! void *d_temp_storage = nullptr;
2707
+ //! size_t temp_storage_bytes = 0;
2708
+ //! cub::DeviceSegmentedSort::StableSortPairsDescending(
2709
+ //! d_temp_storage, temp_storage_bytes, d_keys, d_values,
2710
+ //! num_items, num_segments, d_offsets, d_offsets + 1);
2711
+ //!
2712
+ //! // Allocate temporary storage
2713
+ //! cudaMalloc(&d_temp_storage, temp_storage_bytes);
2714
+ //!
2715
+ //! // Run sorting operation
2716
+ //! cub::DeviceSegmentedSort::StableSortPairsDescending(
2717
+ //! d_temp_storage, temp_storage_bytes, d_keys, d_values,
2718
+ //! num_items, num_segments, d_offsets, d_offsets + 1);
2719
+ //!
2720
+ //! // d_keys.Current() <-- [8, 7, 6, 9, 5, 3, 0]
2721
+ //! // d_values.Current() <-- [0, 2, 1, 6, 3, 4, 5]
2722
+ //!
2723
+ //! @endrst
2724
+ //!
2725
+ //! @tparam KeyT
2726
+ //! **[inferred]** Key type
2727
+ //!
2728
+ //! @tparam ValueT
2729
+ //! **[inferred]** Value type
2730
+ //!
2731
+ //! @tparam BeginOffsetIteratorT
2732
+ //! **[inferred]** Random-access input iterator type for reading segment
2733
+ //! beginning offsets @iterator
2734
+ //!
2735
+ //! @tparam EndOffsetIteratorT
2736
+ //! **[inferred]** Random-access input iterator type for reading segment
2737
+ //! ending offsets @iterator
2738
+ //!
2739
+ //! @param[in] d_temp_storage
2740
+ //! Device-accessible allocation of temporary storage. When `nullptr`, the
2741
+ //! required allocation size is written to `temp_storage_bytes` and no work
2742
+ //! is done
2743
+ //!
2744
+ //! @param[in,out] temp_storage_bytes
2745
+ //! Reference to size in bytes of `d_temp_storage` allocation
2746
+ //!
2747
+ //! @param[in,out] d_keys
2748
+ //! Reference to the double-buffer of keys whose "current" device-accessible
2749
+ //! buffer contains the unsorted input keys and, upon return, is updated to
2750
+ //! point to the sorted output keys
2751
+ //!
2752
+ //! @param[in,out] d_values
2753
+ //! Double-buffer of values whose "current" device-accessible buffer contains
2754
+ //! the unsorted input values and, upon return, is updated to point to the
2755
+ //! sorted output values
2756
+ //!
2757
+ //! @param[in] num_items
2758
+ //! The total number of items to sort (across all segments)
2759
+ //!
2760
+ //! @param[in] num_segments
2761
+ //! The number of segments that comprise the sorting data
2762
+ //!
2763
+ //! @param[in] d_begin_offsets
2764
+ //! @rst
2765
+ //! Random-access input iterator to the sequence of beginning offsets of
2766
+ //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first
2767
+ //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``
2768
+ //! @endrst
2769
+ //!
2770
+ //! @param[in] d_end_offsets
2771
+ //! @rst
2772
+ //! Random-access input iterator to the sequence of ending offsets of length
2773
+ //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of
2774
+ //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``.
2775
+ //! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the ``i``-th segment is
2776
+ //! considered empty.
2777
+ //! @endrst
2778
+ //!
2779
+ //! @param[in] stream
2780
+ //! @rst
2781
+ //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
2782
+ //! @endrst
2783
+ template <typename KeyT, typename ValueT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
2784
+ CUB_RUNTIME_FUNCTION static cudaError_t StableSortPairsDescending(
2785
+ void* d_temp_storage,
2786
+ size_t& temp_storage_bytes,
2787
+ DoubleBuffer<KeyT>& d_keys,
2788
+ DoubleBuffer<ValueT>& d_values,
2789
+ ::cuda::std::int64_t num_items,
2790
+ ::cuda::std::int64_t num_segments,
2791
+ BeginOffsetIteratorT d_begin_offsets,
2792
+ EndOffsetIteratorT d_end_offsets,
2793
+ cudaStream_t stream = 0)
2794
+ { // Thin wrapper: this body contains no sorting logic of its own, only the delegation below.
2795
+ _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); // NOTE(review): presumably opens a profiling range only when d_temp_storage is non-null -- confirm against the macro definition.
2796
+ return SortPairsDescendingNoNVTX<KeyT, ValueT, BeginOffsetIteratorT, EndOffsetIteratorT>( // All nine arguments are forwarded unchanged; the DoubleBuffer wrappers are passed by reference.
2797
+ d_temp_storage,
2798
+ temp_storage_bytes,
2799
+ d_keys,
2800
+ d_values,
2801
+ num_items,
2802
+ num_segments,
2803
+ d_begin_offsets,
2804
+ d_end_offsets,
2805
+ stream);
2806
+ }
2807
+
2808
+ //! @} end member group
2809
+ };
2810
+
2811
+ CUB_NAMESPACE_END