cuda-cccl 0.4.3__cp312-cp312-manylinux_2_26_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2024):
  1. cuda/cccl/__init__.py +27 -0
  2. cuda/cccl/_cuda_version_utils.py +24 -0
  3. cuda/cccl/cooperative/__init__.py +9 -0
  4. cuda/cccl/cooperative/experimental/__init__.py +24 -0
  5. cuda/cccl/headers/__init__.py +7 -0
  6. cuda/cccl/headers/include/__init__.py +1 -0
  7. cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +233 -0
  8. cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +1158 -0
  9. cuda/cccl/headers/include/cub/agent/agent_for.cuh +55 -0
  10. cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +699 -0
  11. cuda/cccl/headers/include/cub/agent/agent_merge.cuh +365 -0
  12. cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +721 -0
  13. cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +756 -0
  14. cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +277 -0
  15. cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +715 -0
  16. cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +546 -0
  17. cuda/cccl/headers/include/cub/agent/agent_reduce.cuh +592 -0
  18. cuda/cccl/headers/include/cub/agent/agent_reduce_by_key.cuh +780 -0
  19. cuda/cccl/headers/include/cub/agent/agent_rle.cuh +1092 -0
  20. cuda/cccl/headers/include/cub/agent/agent_scan.cuh +564 -0
  21. cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +448 -0
  22. cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +263 -0
  23. cuda/cccl/headers/include/cub/agent/agent_segmented_scan.cuh +292 -0
  24. cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +1090 -0
  25. cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +346 -0
  26. cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +584 -0
  27. cuda/cccl/headers/include/cub/agent/agent_topk.cuh +762 -0
  28. cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +599 -0
  29. cuda/cccl/headers/include/cub/agent/single_pass_scan_operators.cuh +1384 -0
  30. cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +939 -0
  31. cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +1200 -0
  32. cuda/cccl/headers/include/cub/block/block_exchange.cuh +1279 -0
  33. cuda/cccl/headers/include/cub/block/block_histogram.cuh +396 -0
  34. cuda/cccl/headers/include/cub/block/block_load.cuh +1269 -0
  35. cuda/cccl/headers/include/cub/block/block_load_to_shared.cuh +437 -0
  36. cuda/cccl/headers/include/cub/block/block_merge_sort.cuh +771 -0
  37. cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +1215 -0
  38. cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +2129 -0
  39. cuda/cccl/headers/include/cub/block/block_raking_layout.cuh +124 -0
  40. cuda/cccl/headers/include/cub/block/block_reduce.cuh +661 -0
  41. cuda/cccl/headers/include/cub/block/block_run_length_decode.cuh +406 -0
  42. cuda/cccl/headers/include/cub/block/block_scan.cuh +2168 -0
  43. cuda/cccl/headers/include/cub/block/block_shuffle.cuh +319 -0
  44. cuda/cccl/headers/include/cub/block/block_store.cuh +1238 -0
  45. cuda/cccl/headers/include/cub/block/radix_rank_sort_operations.cuh +597 -0
  46. cuda/cccl/headers/include/cub/block/specializations/block_histogram_atomic.cuh +62 -0
  47. cuda/cccl/headers/include/cub/block/specializations/block_histogram_sort.cuh +209 -0
  48. cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking.cuh +230 -0
  49. cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking_commutative_only.cuh +207 -0
  50. cuda/cccl/headers/include/cub/block/specializations/block_reduce_warp_reductions.cuh +257 -0
  51. cuda/cccl/headers/include/cub/block/specializations/block_scan_raking.cuh +766 -0
  52. cuda/cccl/headers/include/cub/block/specializations/block_scan_warp_scans.cuh +514 -0
  53. cuda/cccl/headers/include/cub/config.cuh +29 -0
  54. cuda/cccl/headers/include/cub/cub.cuh +96 -0
  55. cuda/cccl/headers/include/cub/detail/array_utils.cuh +54 -0
  56. cuda/cccl/headers/include/cub/detail/choose_offset.cuh +135 -0
  57. cuda/cccl/headers/include/cub/detail/detect_cuda_runtime.cuh +50 -0
  58. cuda/cccl/headers/include/cub/detail/device_double_buffer.cuh +94 -0
  59. cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +60 -0
  60. cuda/cccl/headers/include/cub/detail/env_dispatch.cuh +87 -0
  61. cuda/cccl/headers/include/cub/detail/fast_modulo_division.cuh +227 -0
  62. cuda/cccl/headers/include/cub/detail/integer_utils.cuh +87 -0
  63. cuda/cccl/headers/include/cub/detail/launcher/cuda_driver.cuh +149 -0
  64. cuda/cccl/headers/include/cub/detail/launcher/cuda_runtime.cuh +103 -0
  65. cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +112 -0
  66. cuda/cccl/headers/include/cub/detail/ptx-json/array.cuh +41 -0
  67. cuda/cccl/headers/include/cub/detail/ptx-json/json.cuh +39 -0
  68. cuda/cccl/headers/include/cub/detail/ptx-json/object.cuh +71 -0
  69. cuda/cccl/headers/include/cub/detail/ptx-json/string.cuh +79 -0
  70. cuda/cccl/headers/include/cub/detail/ptx-json/value.cuh +95 -0
  71. cuda/cccl/headers/include/cub/detail/ptx-json-parser.cuh +39 -0
  72. cuda/cccl/headers/include/cub/detail/rfa.cuh +706 -0
  73. cuda/cccl/headers/include/cub/detail/strong_load.cuh +163 -0
  74. cuda/cccl/headers/include/cub/detail/strong_store.cuh +194 -0
  75. cuda/cccl/headers/include/cub/detail/temporary_storage.cuh +377 -0
  76. cuda/cccl/headers/include/cub/detail/type_traits.cuh +185 -0
  77. cuda/cccl/headers/include/cub/detail/uninitialized_copy.cuh +48 -0
  78. cuda/cccl/headers/include/cub/detail/unsafe_bitcast.cuh +33 -0
  79. cuda/cccl/headers/include/cub/device/device_adjacent_difference.cuh +572 -0
  80. cuda/cccl/headers/include/cub/device/device_copy.cuh +276 -0
  81. cuda/cccl/headers/include/cub/device/device_for.cuh +1061 -0
  82. cuda/cccl/headers/include/cub/device/device_histogram.cuh +1485 -0
  83. cuda/cccl/headers/include/cub/device/device_memcpy.cuh +171 -0
  84. cuda/cccl/headers/include/cub/device/device_merge.cuh +203 -0
  85. cuda/cccl/headers/include/cub/device/device_merge_sort.cuh +955 -0
  86. cuda/cccl/headers/include/cub/device/device_partition.cuh +644 -0
  87. cuda/cccl/headers/include/cub/device/device_radix_sort.cuh +3413 -0
  88. cuda/cccl/headers/include/cub/device/device_reduce.cuh +2303 -0
  89. cuda/cccl/headers/include/cub/device/device_run_length_encode.cuh +346 -0
  90. cuda/cccl/headers/include/cub/device/device_scan.cuh +2152 -0
  91. cuda/cccl/headers/include/cub/device/device_segmented_radix_sort.cuh +1472 -0
  92. cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +1635 -0
  93. cuda/cccl/headers/include/cub/device/device_segmented_scan.cuh +1398 -0
  94. cuda/cccl/headers/include/cub/device/device_segmented_sort.cuh +2787 -0
  95. cuda/cccl/headers/include/cub/device/device_select.cuh +1204 -0
  96. cuda/cccl/headers/include/cub/device/device_topk.cuh +521 -0
  97. cuda/cccl/headers/include/cub/device/device_transform.cuh +666 -0
  98. cuda/cccl/headers/include/cub/device/dispatch/dispatch_adjacent_difference.cuh +289 -0
  99. cuda/cccl/headers/include/cub/device/dispatch/dispatch_batch_memcpy.cuh +694 -0
  100. cuda/cccl/headers/include/cub/device/dispatch/dispatch_common.cuh +50 -0
  101. cuda/cccl/headers/include/cub/device/dispatch/dispatch_copy_mdspan.cuh +77 -0
  102. cuda/cccl/headers/include/cub/device/dispatch/dispatch_fixed_size_segmented_reduce.cuh +349 -0
  103. cuda/cccl/headers/include/cub/device/dispatch/dispatch_for.cuh +160 -0
  104. cuda/cccl/headers/include/cub/device/dispatch/dispatch_histogram.cuh +1849 -0
  105. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +317 -0
  106. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge_sort.cuh +429 -0
  107. cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +1066 -0
  108. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +830 -0
  109. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_by_key.cuh +629 -0
  110. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +479 -0
  111. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +256 -0
  112. cuda/cccl/headers/include/cub/device/dispatch/dispatch_rle.cuh +603 -0
  113. cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan.cuh +447 -0
  114. cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan_by_key.cuh +545 -0
  115. cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_radix_sort.cuh +638 -0
  116. cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_reduce.cuh +410 -0
  117. cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_scan.cuh +278 -0
  118. cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +899 -0
  119. cuda/cccl/headers/include/cub/device/dispatch/dispatch_select_if.cuh +831 -0
  120. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +321 -0
  121. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +454 -0
  122. cuda/cccl/headers/include/cub/device/dispatch/dispatch_three_way_partition.cuh +364 -0
  123. cuda/cccl/headers/include/cub/device/dispatch/dispatch_topk.cuh +626 -0
  124. cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +527 -0
  125. cuda/cccl/headers/include/cub/device/dispatch/dispatch_unique_by_key.cuh +472 -0
  126. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_for_each.cuh +259 -0
  127. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_histogram.cuh +669 -0
  128. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_merge_sort.cuh +332 -0
  129. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_radix_sort.cuh +553 -0
  130. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_reduce.cuh +584 -0
  131. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_scan.cuh +178 -0
  132. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_segmented_radix_sort.cuh +262 -0
  133. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_segmented_reduce.cuh +295 -0
  134. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_segmented_scan.cuh +77 -0
  135. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_segmented_sort.cuh +521 -0
  136. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_three_way_partition.cuh +200 -0
  137. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_transform.cuh +1049 -0
  138. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_unique_by_key.cuh +175 -0
  139. cuda/cccl/headers/include/cub/device/dispatch/tuning/common.cuh +97 -0
  140. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +43 -0
  141. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +94 -0
  142. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +34 -0
  143. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +268 -0
  144. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +108 -0
  145. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +100 -0
  146. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +1045 -0
  147. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce.cuh +681 -0
  148. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +918 -0
  149. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +647 -0
  150. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +571 -0
  151. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +986 -0
  152. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_scan.cuh +108 -0
  153. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +373 -0
  154. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_select_if.cuh +1563 -0
  155. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +415 -0
  156. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_topk.cuh +84 -0
  157. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +476 -0
  158. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +858 -0
  159. cuda/cccl/headers/include/cub/grid/grid_even_share.cuh +203 -0
  160. cuda/cccl/headers/include/cub/grid/grid_mapping.cuh +82 -0
  161. cuda/cccl/headers/include/cub/grid/grid_queue.cuh +175 -0
  162. cuda/cccl/headers/include/cub/iterator/arg_index_input_iterator.cuh +230 -0
  163. cuda/cccl/headers/include/cub/iterator/cache_modified_input_iterator.cuh +235 -0
  164. cuda/cccl/headers/include/cub/iterator/cache_modified_output_iterator.cuh +226 -0
  165. cuda/cccl/headers/include/cub/iterator/tex_obj_input_iterator.cuh +293 -0
  166. cuda/cccl/headers/include/cub/thread/thread_load.cuh +353 -0
  167. cuda/cccl/headers/include/cub/thread/thread_operators.cuh +664 -0
  168. cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +525 -0
  169. cuda/cccl/headers/include/cub/thread/thread_scan.cuh +472 -0
  170. cuda/cccl/headers/include/cub/thread/thread_search.cuh +214 -0
  171. cuda/cccl/headers/include/cub/thread/thread_simd.cuh +456 -0
  172. cuda/cccl/headers/include/cub/thread/thread_sort.cuh +78 -0
  173. cuda/cccl/headers/include/cub/thread/thread_store.cuh +341 -0
  174. cuda/cccl/headers/include/cub/util_allocator.cuh +897 -0
  175. cuda/cccl/headers/include/cub/util_arch.cuh +176 -0
  176. cuda/cccl/headers/include/cub/util_cpp_dialect.cuh +71 -0
  177. cuda/cccl/headers/include/cub/util_debug.cuh +183 -0
  178. cuda/cccl/headers/include/cub/util_device.cuh +838 -0
  179. cuda/cccl/headers/include/cub/util_macro.cuh +73 -0
  180. cuda/cccl/headers/include/cub/util_math.cuh +92 -0
  181. cuda/cccl/headers/include/cub/util_namespace.cuh +152 -0
  182. cuda/cccl/headers/include/cub/util_policy_wrapper_t.cuh +31 -0
  183. cuda/cccl/headers/include/cub/util_ptx.cuh +483 -0
  184. cuda/cccl/headers/include/cub/util_temporary_storage.cuh +93 -0
  185. cuda/cccl/headers/include/cub/util_type.cuh +1084 -0
  186. cuda/cccl/headers/include/cub/util_vsmem.cuh +227 -0
  187. cuda/cccl/headers/include/cub/version.cuh +65 -0
  188. cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_shfl.cuh +304 -0
  189. cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_smem.cuh +152 -0
  190. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +567 -0
  191. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +378 -0
  192. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +922 -0
  193. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_smem.cuh +691 -0
  194. cuda/cccl/headers/include/cub/warp/warp_exchange.cuh +381 -0
  195. cuda/cccl/headers/include/cub/warp/warp_load.cuh +591 -0
  196. cuda/cccl/headers/include/cub/warp/warp_merge_sort.cuh +145 -0
  197. cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +810 -0
  198. cuda/cccl/headers/include/cub/warp/warp_scan.cuh +1863 -0
  199. cuda/cccl/headers/include/cub/warp/warp_store.cuh +498 -0
  200. cuda/cccl/headers/include/cub/warp/warp_utils.cuh +59 -0
  201. cuda/cccl/headers/include/cuda/__algorithm/common.h +68 -0
  202. cuda/cccl/headers/include/cuda/__algorithm/copy.h +199 -0
  203. cuda/cccl/headers/include/cuda/__algorithm/fill.h +110 -0
  204. cuda/cccl/headers/include/cuda/__annotated_ptr/access_property.h +165 -0
  205. cuda/cccl/headers/include/cuda/__annotated_ptr/access_property_encoding.h +171 -0
  206. cuda/cccl/headers/include/cuda/__annotated_ptr/annotated_ptr.h +216 -0
  207. cuda/cccl/headers/include/cuda/__annotated_ptr/annotated_ptr_base.h +100 -0
  208. cuda/cccl/headers/include/cuda/__annotated_ptr/apply_access_property.h +83 -0
  209. cuda/cccl/headers/include/cuda/__annotated_ptr/associate_access_property.h +128 -0
  210. cuda/cccl/headers/include/cuda/__annotated_ptr/createpolicy.h +210 -0
  211. cuda/cccl/headers/include/cuda/__atomic/atomic.h +145 -0
  212. cuda/cccl/headers/include/cuda/__barrier/async_contract_fulfillment.h +39 -0
  213. cuda/cccl/headers/include/cuda/__barrier/barrier.h +65 -0
  214. cuda/cccl/headers/include/cuda/__barrier/barrier_arrive_tx.h +101 -0
  215. cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +528 -0
  216. cuda/cccl/headers/include/cuda/__barrier/barrier_expect_tx.h +73 -0
  217. cuda/cccl/headers/include/cuda/__barrier/barrier_thread_scope.h +60 -0
  218. cuda/cccl/headers/include/cuda/__bit/bit_reverse.h +198 -0
  219. cuda/cccl/headers/include/cuda/__bit/bitfield.h +121 -0
  220. cuda/cccl/headers/include/cuda/__bit/bitmask.h +89 -0
  221. cuda/cccl/headers/include/cuda/__cccl_config +38 -0
  222. cuda/cccl/headers/include/cuda/__cmath/ceil_div.h +123 -0
  223. cuda/cccl/headers/include/cuda/__cmath/fast_modulo_division.h +178 -0
  224. cuda/cccl/headers/include/cuda/__cmath/ilog.h +194 -0
  225. cuda/cccl/headers/include/cuda/__cmath/ipow.h +111 -0
  226. cuda/cccl/headers/include/cuda/__cmath/isqrt.h +80 -0
  227. cuda/cccl/headers/include/cuda/__cmath/mul_hi.h +145 -0
  228. cuda/cccl/headers/include/cuda/__cmath/neg.h +47 -0
  229. cuda/cccl/headers/include/cuda/__cmath/pow2.h +74 -0
  230. cuda/cccl/headers/include/cuda/__cmath/round_down.h +102 -0
  231. cuda/cccl/headers/include/cuda/__cmath/round_up.h +104 -0
  232. cuda/cccl/headers/include/cuda/__cmath/sincos.h +134 -0
  233. cuda/cccl/headers/include/cuda/__cmath/uabs.h +57 -0
  234. cuda/cccl/headers/include/cuda/__complex/complex.h +238 -0
  235. cuda/cccl/headers/include/cuda/__complex/get_real_imag.h +89 -0
  236. cuda/cccl/headers/include/cuda/__complex/traits.h +64 -0
  237. cuda/cccl/headers/include/cuda/__complex_ +28 -0
  238. cuda/cccl/headers/include/cuda/__container/buffer.h +891 -0
  239. cuda/cccl/headers/include/cuda/__container/heterogeneous_iterator.h +436 -0
  240. cuda/cccl/headers/include/cuda/__container/uninitialized_async_buffer.h +416 -0
  241. cuda/cccl/headers/include/cuda/__device/all_devices.h +140 -0
  242. cuda/cccl/headers/include/cuda/__device/arch_id.h +194 -0
  243. cuda/cccl/headers/include/cuda/__device/arch_traits.h +553 -0
  244. cuda/cccl/headers/include/cuda/__device/attributes.h +772 -0
  245. cuda/cccl/headers/include/cuda/__device/compute_capability.h +172 -0
  246. cuda/cccl/headers/include/cuda/__device/device_ref.h +168 -0
  247. cuda/cccl/headers/include/cuda/__device/physical_device.h +178 -0
  248. cuda/cccl/headers/include/cuda/__driver/driver_api.h +1041 -0
  249. cuda/cccl/headers/include/cuda/__event/event.h +171 -0
  250. cuda/cccl/headers/include/cuda/__event/event_ref.h +157 -0
  251. cuda/cccl/headers/include/cuda/__event/timed_event.h +120 -0
  252. cuda/cccl/headers/include/cuda/__execution/determinism.h +89 -0
  253. cuda/cccl/headers/include/cuda/__execution/output_ordering.h +87 -0
  254. cuda/cccl/headers/include/cuda/__execution/policy.h +53 -0
  255. cuda/cccl/headers/include/cuda/__execution/require.h +75 -0
  256. cuda/cccl/headers/include/cuda/__execution/tune.h +70 -0
  257. cuda/cccl/headers/include/cuda/__functional/address_stability.h +131 -0
  258. cuda/cccl/headers/include/cuda/__functional/for_each_canceled.h +321 -0
  259. cuda/cccl/headers/include/cuda/__functional/maximum.h +77 -0
  260. cuda/cccl/headers/include/cuda/__functional/minimum.h +77 -0
  261. cuda/cccl/headers/include/cuda/__functional/minimum_maximum_common.h +52 -0
  262. cuda/cccl/headers/include/cuda/__functional/proclaim_return_type.h +106 -0
  263. cuda/cccl/headers/include/cuda/__fwd/barrier.h +38 -0
  264. cuda/cccl/headers/include/cuda/__fwd/complex.h +48 -0
  265. cuda/cccl/headers/include/cuda/__fwd/devices.h +44 -0
  266. cuda/cccl/headers/include/cuda/__fwd/execution_policy.h +47 -0
  267. cuda/cccl/headers/include/cuda/__fwd/get_stream.h +38 -0
  268. cuda/cccl/headers/include/cuda/__fwd/pipeline.h +37 -0
  269. cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +58 -0
  270. cuda/cccl/headers/include/cuda/__hierarchy/dimensions.h +162 -0
  271. cuda/cccl/headers/include/cuda/__hierarchy/hierarchy_dimensions.h +986 -0
  272. cuda/cccl/headers/include/cuda/__hierarchy/hierarchy_levels.h +494 -0
  273. cuda/cccl/headers/include/cuda/__hierarchy/level_dimensions.h +225 -0
  274. cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +315 -0
  275. cuda/cccl/headers/include/cuda/__iterator/counting_iterator.h +490 -0
  276. cuda/cccl/headers/include/cuda/__iterator/discard_iterator.h +324 -0
  277. cuda/cccl/headers/include/cuda/__iterator/permutation_iterator.h +456 -0
  278. cuda/cccl/headers/include/cuda/__iterator/shuffle_iterator.h +334 -0
  279. cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +418 -0
  280. cuda/cccl/headers/include/cuda/__iterator/tabulate_output_iterator.h +367 -0
  281. cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +528 -0
  282. cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +527 -0
  283. cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +486 -0
  284. cuda/cccl/headers/include/cuda/__iterator/zip_common.h +147 -0
  285. cuda/cccl/headers/include/cuda/__iterator/zip_function.h +114 -0
  286. cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +555 -0
  287. cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +589 -0
  288. cuda/cccl/headers/include/cuda/__latch/latch.h +44 -0
  289. cuda/cccl/headers/include/cuda/__launch/configuration.h +754 -0
  290. cuda/cccl/headers/include/cuda/__launch/host_launch.h +115 -0
  291. cuda/cccl/headers/include/cuda/__launch/launch.h +334 -0
  292. cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +531 -0
  293. cuda/cccl/headers/include/cuda/__mdspan/host_device_mdspan.h +239 -0
  294. cuda/cccl/headers/include/cuda/__mdspan/restrict_accessor.h +152 -0
  295. cuda/cccl/headers/include/cuda/__mdspan/restrict_mdspan.h +118 -0
  296. cuda/cccl/headers/include/cuda/__mdspan/shared_memory_accessor.h +208 -0
  297. cuda/cccl/headers/include/cuda/__mdspan/shared_memory_mdspan.h +129 -0
  298. cuda/cccl/headers/include/cuda/__memcpy_async/check_preconditions.h +79 -0
  299. cuda/cccl/headers/include/cuda/__memcpy_async/completion_mechanism.h +47 -0
  300. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_bulk_shared_global.h +77 -0
  301. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_fallback.h +72 -0
  302. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_shared_global.h +148 -0
  303. cuda/cccl/headers/include/cuda/__memcpy_async/dispatch_memcpy_async.h +165 -0
  304. cuda/cccl/headers/include/cuda/__memcpy_async/elect_one.h +52 -0
  305. cuda/cccl/headers/include/cuda/__memcpy_async/is_local_smem_barrier.h +53 -0
  306. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async.h +179 -0
  307. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_barrier.h +99 -0
  308. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_tx.h +102 -0
  309. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_completion.h +164 -0
  310. cuda/cccl/headers/include/cuda/__memcpy_async/try_get_barrier_handle.h +57 -0
  311. cuda/cccl/headers/include/cuda/__memory/address_space.h +256 -0
  312. cuda/cccl/headers/include/cuda/__memory/align_down.h +77 -0
  313. cuda/cccl/headers/include/cuda/__memory/align_up.h +77 -0
  314. cuda/cccl/headers/include/cuda/__memory/aligned_size.h +61 -0
  315. cuda/cccl/headers/include/cuda/__memory/check_address.h +111 -0
  316. cuda/cccl/headers/include/cuda/__memory/discard_memory.h +64 -0
  317. cuda/cccl/headers/include/cuda/__memory/get_device_address.h +58 -0
  318. cuda/cccl/headers/include/cuda/__memory/is_aligned.h +60 -0
  319. cuda/cccl/headers/include/cuda/__memory/is_pointer_accessible.h +278 -0
  320. cuda/cccl/headers/include/cuda/__memory/ptr_in_range.h +92 -0
  321. cuda/cccl/headers/include/cuda/__memory/ptr_rebind.h +75 -0
  322. cuda/cccl/headers/include/cuda/__memory/ranges_overlap.h +125 -0
  323. cuda/cccl/headers/include/cuda/__memory_pool/device_memory_pool.h +166 -0
  324. cuda/cccl/headers/include/cuda/__memory_pool/managed_memory_pool.h +161 -0
  325. cuda/cccl/headers/include/cuda/__memory_pool/memory_pool_base.h +644 -0
  326. cuda/cccl/headers/include/cuda/__memory_pool/pinned_memory_pool.h +218 -0
  327. cuda/cccl/headers/include/cuda/__memory_resource/any_resource.h +882 -0
  328. cuda/cccl/headers/include/cuda/__memory_resource/get_memory_resource.h +82 -0
  329. cuda/cccl/headers/include/cuda/__memory_resource/get_property.h +153 -0
  330. cuda/cccl/headers/include/cuda/__memory_resource/legacy_managed_memory_resource.h +148 -0
  331. cuda/cccl/headers/include/cuda/__memory_resource/legacy_pinned_memory_resource.h +141 -0
  332. cuda/cccl/headers/include/cuda/__memory_resource/properties.h +130 -0
  333. cuda/cccl/headers/include/cuda/__memory_resource/resource.h +135 -0
  334. cuda/cccl/headers/include/cuda/__memory_resource/shared_resource.h +261 -0
  335. cuda/cccl/headers/include/cuda/__memory_resource/synchronous_resource_adapter.h +136 -0
  336. cuda/cccl/headers/include/cuda/__numeric/add_overflow.h +331 -0
  337. cuda/cccl/headers/include/cuda/__numeric/div_overflow.h +150 -0
  338. cuda/cccl/headers/include/cuda/__numeric/narrow.h +108 -0
  339. cuda/cccl/headers/include/cuda/__numeric/overflow_cast.h +59 -0
  340. cuda/cccl/headers/include/cuda/__numeric/overflow_result.h +43 -0
  341. cuda/cccl/headers/include/cuda/__numeric/sub_overflow.h +359 -0
  342. cuda/cccl/headers/include/cuda/__nvtx/nvtx.h +120 -0
  343. cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +2977 -0
  344. cuda/cccl/headers/include/cuda/__ptx/instructions/barrier_cluster.h +43 -0
  345. cuda/cccl/headers/include/cuda/__ptx/instructions/bfind.h +41 -0
  346. cuda/cccl/headers/include/cuda/__ptx/instructions/bmsk.h +41 -0
  347. cuda/cccl/headers/include/cuda/__ptx/instructions/clusterlaunchcontrol.h +41 -0
  348. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk.h +44 -0
  349. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_commit_group.h +43 -0
  350. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h +45 -0
  351. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_wait_group.h +43 -0
  352. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_mbarrier_arrive.h +42 -0
  353. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_reduce_async_bulk.h +60 -0
  354. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_reduce_async_bulk_tensor.h +43 -0
  355. cuda/cccl/headers/include/cuda/__ptx/instructions/elect_sync.h +41 -0
  356. cuda/cccl/headers/include/cuda/__ptx/instructions/exit.h +41 -0
  357. cuda/cccl/headers/include/cuda/__ptx/instructions/fence.h +49 -0
  358. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/barrier_cluster.h +115 -0
  359. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/bfind.h +190 -0
  360. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/bmsk.h +54 -0
  361. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/clusterlaunchcontrol.h +240 -0
  362. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk.h +245 -0
  363. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.h +25 -0
  364. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.h +52 -0
  365. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.h +977 -0
  366. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_gather_scatter.h +302 -0
  367. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.h +631 -0
  368. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.h +46 -0
  369. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive.h +26 -0
  370. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive_noinc.h +26 -0
  371. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.h +1470 -0
  372. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.h +132 -0
  373. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.h +132 -0
  374. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.h +601 -0
  375. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/elect_sync.h +36 -0
  376. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/exit.h +25 -0
  377. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence.h +208 -0
  378. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.h +31 -0
  379. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_alias.h +25 -0
  380. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_async.h +58 -0
  381. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_async_generic_sync_restrict.h +64 -0
  382. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.h +102 -0
  383. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_sync_restrict.h +64 -0
  384. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/get_sreg.h +949 -0
  385. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/getctarank.h +32 -0
  386. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/ld.h +5542 -0
  387. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive.h +399 -0
  388. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.h +184 -0
  389. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.h +34 -0
  390. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_expect_tx.h +102 -0
  391. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_init.h +27 -0
  392. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_inval.h +26 -0
  393. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.h +143 -0
  394. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.h +144 -0
  395. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.h +286 -0
  396. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.h +290 -0
  397. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_ld_reduce.h +2202 -0
  398. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_red.h +1362 -0
  399. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_st.h +236 -0
  400. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/prmt.h +230 -0
  401. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/red_async.h +460 -0
  402. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/setmaxnreg.h +58 -0
  403. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/shl.h +96 -0
  404. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/shr.h +168 -0
  405. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st.h +1490 -0
  406. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st_async.h +123 -0
  407. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st_bulk.h +31 -0
  408. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_alloc.h +120 -0
  409. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_commit.h +91 -0
  410. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_cp.h +693 -0
  411. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_fence.h +50 -0
  412. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_ld.h +11437 -0
  413. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_mma.h +6513 -0
  414. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_mma_ws.h +6726 -0
  415. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_shift.h +40 -0
  416. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_st.h +4767 -0
  417. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_wait.h +48 -0
  418. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.h +71 -0
  419. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tensormap_replace.h +886 -0
  420. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/trap.h +25 -0
  421. cuda/cccl/headers/include/cuda/__ptx/instructions/get_sreg.h +43 -0
  422. cuda/cccl/headers/include/cuda/__ptx/instructions/getctarank.h +43 -0
  423. cuda/cccl/headers/include/cuda/__ptx/instructions/ld.h +41 -0
  424. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_arrive.h +45 -0
  425. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_expect_tx.h +41 -0
  426. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_init.h +43 -0
  427. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_inval.h +41 -0
  428. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_wait.h +46 -0
  429. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_ld_reduce.h +41 -0
  430. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_red.h +41 -0
  431. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_st.h +41 -0
  432. cuda/cccl/headers/include/cuda/__ptx/instructions/prmt.h +41 -0
  433. cuda/cccl/headers/include/cuda/__ptx/instructions/red_async.h +43 -0
  434. cuda/cccl/headers/include/cuda/__ptx/instructions/setmaxnreg.h +41 -0
  435. cuda/cccl/headers/include/cuda/__ptx/instructions/shfl_sync.h +244 -0
  436. cuda/cccl/headers/include/cuda/__ptx/instructions/shl.h +41 -0
  437. cuda/cccl/headers/include/cuda/__ptx/instructions/shr.h +41 -0
  438. cuda/cccl/headers/include/cuda/__ptx/instructions/st.h +41 -0
  439. cuda/cccl/headers/include/cuda/__ptx/instructions/st_async.h +43 -0
  440. cuda/cccl/headers/include/cuda/__ptx/instructions/st_bulk.h +41 -0
  441. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_alloc.h +41 -0
  442. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_commit.h +41 -0
  443. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_cp.h +41 -0
  444. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_fence.h +41 -0
  445. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_ld.h +41 -0
  446. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_mma.h +41 -0
  447. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_mma_ws.h +41 -0
  448. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_shift.h +41 -0
  449. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_st.h +41 -0
  450. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_wait.h +41 -0
  451. cuda/cccl/headers/include/cuda/__ptx/instructions/tensormap_cp_fenceproxy.h +43 -0
  452. cuda/cccl/headers/include/cuda/__ptx/instructions/tensormap_replace.h +43 -0
  453. cuda/cccl/headers/include/cuda/__ptx/instructions/trap.h +41 -0
  454. cuda/cccl/headers/include/cuda/__ptx/pragmas/enable_smem_spilling.h +47 -0
  455. cuda/cccl/headers/include/cuda/__ptx/ptx_dot_variants.h +230 -0
  456. cuda/cccl/headers/include/cuda/__ptx/ptx_helper_functions.h +178 -0
  457. cuda/cccl/headers/include/cuda/__random/feistel_bijection.h +105 -0
  458. cuda/cccl/headers/include/cuda/__random/pcg_engine.h +398 -0
  459. cuda/cccl/headers/include/cuda/__random/random_bijection.h +88 -0
  460. cuda/cccl/headers/include/cuda/__runtime/api_wrapper.h +62 -0
  461. cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +99 -0
  462. cuda/cccl/headers/include/cuda/__runtime/types.h +41 -0
  463. cuda/cccl/headers/include/cuda/__semaphore/counting_semaphore.h +53 -0
  464. cuda/cccl/headers/include/cuda/__stream/get_stream.h +109 -0
  465. cuda/cccl/headers/include/cuda/__stream/internal_streams.h +49 -0
  466. cuda/cccl/headers/include/cuda/__stream/invalid_stream.h +47 -0
  467. cuda/cccl/headers/include/cuda/__stream/launch_transform.h +193 -0
  468. cuda/cccl/headers/include/cuda/__stream/stream.h +145 -0
  469. cuda/cccl/headers/include/cuda/__stream/stream_ref.h +356 -0
  470. cuda/cccl/headers/include/cuda/__tma/make_tma_descriptor.h +657 -0
  471. cuda/cccl/headers/include/cuda/__type_traits/is_floating_point.h +47 -0
  472. cuda/cccl/headers/include/cuda/__type_traits/is_instantiable_with.h +47 -0
  473. cuda/cccl/headers/include/cuda/__type_traits/is_specialization_of.h +37 -0
  474. cuda/cccl/headers/include/cuda/__type_traits/vector_type.h +355 -0
  475. cuda/cccl/headers/include/cuda/__utility/__basic_any/access.h +88 -0
  476. cuda/cccl/headers/include/cuda/__utility/__basic_any/any_cast.h +83 -0
  477. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_base.h +148 -0
  478. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_from.h +96 -0
  479. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_fwd.h +128 -0
  480. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_ptr.h +304 -0
  481. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_ref.h +337 -0
  482. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_value.h +611 -0
  483. cuda/cccl/headers/include/cuda/__utility/__basic_any/conversions.h +170 -0
  484. cuda/cccl/headers/include/cuda/__utility/__basic_any/dynamic_any_cast.h +147 -0
  485. cuda/cccl/headers/include/cuda/__utility/__basic_any/interfaces.h +359 -0
  486. cuda/cccl/headers/include/cuda/__utility/__basic_any/iset.h +142 -0
  487. cuda/cccl/headers/include/cuda/__utility/__basic_any/overrides.h +64 -0
  488. cuda/cccl/headers/include/cuda/__utility/__basic_any/rtti.h +256 -0
  489. cuda/cccl/headers/include/cuda/__utility/__basic_any/semiregular.h +323 -0
  490. cuda/cccl/headers/include/cuda/__utility/__basic_any/storage.h +79 -0
  491. cuda/cccl/headers/include/cuda/__utility/__basic_any/tagged_ptr.h +58 -0
  492. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtcall.h +162 -0
  493. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_functions.h +183 -0
  494. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_ptrs.h +80 -0
  495. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_tables.h +155 -0
  496. cuda/cccl/headers/include/cuda/__utility/basic_any.h +507 -0
  497. cuda/cccl/headers/include/cuda/__utility/immovable.h +50 -0
  498. cuda/cccl/headers/include/cuda/__utility/in_range.h +65 -0
  499. cuda/cccl/headers/include/cuda/__utility/inherit.h +36 -0
  500. cuda/cccl/headers/include/cuda/__utility/no_init.h +29 -0
  501. cuda/cccl/headers/include/cuda/__utility/static_for.h +79 -0
  502. cuda/cccl/headers/include/cuda/__warp/lane_mask.h +326 -0
  503. cuda/cccl/headers/include/cuda/__warp/warp_match_all.h +65 -0
  504. cuda/cccl/headers/include/cuda/__warp/warp_shuffle.h +251 -0
  505. cuda/cccl/headers/include/cuda/access_property +26 -0
  506. cuda/cccl/headers/include/cuda/algorithm +28 -0
  507. cuda/cccl/headers/include/cuda/annotated_ptr +29 -0
  508. cuda/cccl/headers/include/cuda/atomic +27 -0
  509. cuda/cccl/headers/include/cuda/barrier +293 -0
  510. cuda/cccl/headers/include/cuda/bit +29 -0
  511. cuda/cccl/headers/include/cuda/buffer +27 -0
  512. cuda/cccl/headers/include/cuda/cmath +38 -0
  513. cuda/cccl/headers/include/cuda/devices +33 -0
  514. cuda/cccl/headers/include/cuda/discard_memory +32 -0
  515. cuda/cccl/headers/include/cuda/functional +32 -0
  516. cuda/cccl/headers/include/cuda/hierarchy +28 -0
  517. cuda/cccl/headers/include/cuda/iterator +39 -0
  518. cuda/cccl/headers/include/cuda/latch +27 -0
  519. cuda/cccl/headers/include/cuda/launch +28 -0
  520. cuda/cccl/headers/include/cuda/mdspan +29 -0
  521. cuda/cccl/headers/include/cuda/memory +37 -0
  522. cuda/cccl/headers/include/cuda/memory_pool +27 -0
  523. cuda/cccl/headers/include/cuda/memory_resource +41 -0
  524. cuda/cccl/headers/include/cuda/numeric +31 -0
  525. cuda/cccl/headers/include/cuda/pipeline +580 -0
  526. cuda/cccl/headers/include/cuda/ptx +131 -0
  527. cuda/cccl/headers/include/cuda/semaphore +31 -0
  528. cuda/cccl/headers/include/cuda/std/__algorithm/adjacent_find.h +59 -0
  529. cuda/cccl/headers/include/cuda/std/__algorithm/all_of.h +45 -0
  530. cuda/cccl/headers/include/cuda/std/__algorithm/any_of.h +45 -0
  531. cuda/cccl/headers/include/cuda/std/__algorithm/binary_search.h +53 -0
  532. cuda/cccl/headers/include/cuda/std/__algorithm/clamp.h +48 -0
  533. cuda/cccl/headers/include/cuda/std/__algorithm/comp.h +58 -0
  534. cuda/cccl/headers/include/cuda/std/__algorithm/comp_ref_type.h +85 -0
  535. cuda/cccl/headers/include/cuda/std/__algorithm/copy.h +143 -0
  536. cuda/cccl/headers/include/cuda/std/__algorithm/copy_backward.h +80 -0
  537. cuda/cccl/headers/include/cuda/std/__algorithm/copy_if.h +47 -0
  538. cuda/cccl/headers/include/cuda/std/__algorithm/copy_n.h +73 -0
  539. cuda/cccl/headers/include/cuda/std/__algorithm/count.h +49 -0
  540. cuda/cccl/headers/include/cuda/std/__algorithm/count_if.h +49 -0
  541. cuda/cccl/headers/include/cuda/std/__algorithm/equal.h +128 -0
  542. cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +101 -0
  543. cuda/cccl/headers/include/cuda/std/__algorithm/fill.h +58 -0
  544. cuda/cccl/headers/include/cuda/std/__algorithm/fill_n.h +51 -0
  545. cuda/cccl/headers/include/cuda/std/__algorithm/find.h +62 -0
  546. cuda/cccl/headers/include/cuda/std/__algorithm/find_end.h +225 -0
  547. cuda/cccl/headers/include/cuda/std/__algorithm/find_first_of.h +73 -0
  548. cuda/cccl/headers/include/cuda/std/__algorithm/find_if.h +46 -0
  549. cuda/cccl/headers/include/cuda/std/__algorithm/find_if_not.h +46 -0
  550. cuda/cccl/headers/include/cuda/std/__algorithm/for_each.h +42 -0
  551. cuda/cccl/headers/include/cuda/std/__algorithm/for_each_n.h +48 -0
  552. cuda/cccl/headers/include/cuda/std/__algorithm/generate.h +41 -0
  553. cuda/cccl/headers/include/cuda/std/__algorithm/generate_n.h +46 -0
  554. cuda/cccl/headers/include/cuda/std/__algorithm/half_positive.h +49 -0
  555. cuda/cccl/headers/include/cuda/std/__algorithm/in_fun_result.h +55 -0
  556. cuda/cccl/headers/include/cuda/std/__algorithm/includes.h +90 -0
  557. cuda/cccl/headers/include/cuda/std/__algorithm/inplace_merge.h +293 -0
  558. cuda/cccl/headers/include/cuda/std/__algorithm/is_heap.h +50 -0
  559. cuda/cccl/headers/include/cuda/std/__algorithm/is_heap_until.h +83 -0
  560. cuda/cccl/headers/include/cuda/std/__algorithm/is_partitioned.h +57 -0
  561. cuda/cccl/headers/include/cuda/std/__algorithm/is_permutation.h +252 -0
  562. cuda/cccl/headers/include/cuda/std/__algorithm/is_sorted.h +49 -0
  563. cuda/cccl/headers/include/cuda/std/__algorithm/is_sorted_until.h +68 -0
  564. cuda/cccl/headers/include/cuda/std/__algorithm/iter_swap.h +82 -0
  565. cuda/cccl/headers/include/cuda/std/__algorithm/iterator_operations.h +185 -0
  566. cuda/cccl/headers/include/cuda/std/__algorithm/lexicographical_compare.h +68 -0
  567. cuda/cccl/headers/include/cuda/std/__algorithm/lower_bound.h +82 -0
  568. cuda/cccl/headers/include/cuda/std/__algorithm/make_heap.h +70 -0
  569. cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +88 -0
  570. cuda/cccl/headers/include/cuda/std/__algorithm/max.h +62 -0
  571. cuda/cccl/headers/include/cuda/std/__algorithm/max_element.h +67 -0
  572. cuda/cccl/headers/include/cuda/std/__algorithm/merge.h +89 -0
  573. cuda/cccl/headers/include/cuda/std/__algorithm/min.h +62 -0
  574. cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +87 -0
  575. cuda/cccl/headers/include/cuda/std/__algorithm/minmax.h +66 -0
  576. cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +139 -0
  577. cuda/cccl/headers/include/cuda/std/__algorithm/mismatch.h +83 -0
  578. cuda/cccl/headers/include/cuda/std/__algorithm/move.h +91 -0
  579. cuda/cccl/headers/include/cuda/std/__algorithm/move_backward.h +84 -0
  580. cuda/cccl/headers/include/cuda/std/__algorithm/next_permutation.h +88 -0
  581. cuda/cccl/headers/include/cuda/std/__algorithm/none_of.h +45 -0
  582. cuda/cccl/headers/include/cuda/std/__algorithm/nth_element.h +309 -0
  583. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort.h +102 -0
  584. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort_copy.h +122 -0
  585. cuda/cccl/headers/include/cuda/std/__algorithm/partition.h +120 -0
  586. cuda/cccl/headers/include/cuda/std/__algorithm/partition_copy.h +59 -0
  587. cuda/cccl/headers/include/cuda/std/__algorithm/partition_point.h +61 -0
  588. cuda/cccl/headers/include/cuda/std/__algorithm/pop_heap.h +93 -0
  589. cuda/cccl/headers/include/cuda/std/__algorithm/prev_permutation.h +88 -0
  590. cuda/cccl/headers/include/cuda/std/__algorithm/push_heap.h +100 -0
  591. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_find_if.h +78 -0
  592. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_find_if_not.h +85 -0
  593. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_for_each.h +84 -0
  594. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_for_each_n.h +68 -0
  595. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_iterator_concept.h +65 -0
  596. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_min.h +97 -0
  597. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_min_element.h +68 -0
  598. cuda/cccl/headers/include/cuda/std/__algorithm/remove.h +55 -0
  599. cuda/cccl/headers/include/cuda/std/__algorithm/remove_copy.h +47 -0
  600. cuda/cccl/headers/include/cuda/std/__algorithm/remove_copy_if.h +47 -0
  601. cuda/cccl/headers/include/cuda/std/__algorithm/remove_if.h +56 -0
  602. cuda/cccl/headers/include/cuda/std/__algorithm/replace.h +45 -0
  603. cuda/cccl/headers/include/cuda/std/__algorithm/replace_copy.h +54 -0
  604. cuda/cccl/headers/include/cuda/std/__algorithm/replace_copy_if.h +50 -0
  605. cuda/cccl/headers/include/cuda/std/__algorithm/replace_if.h +45 -0
  606. cuda/cccl/headers/include/cuda/std/__algorithm/reverse.h +81 -0
  607. cuda/cccl/headers/include/cuda/std/__algorithm/reverse_copy.h +43 -0
  608. cuda/cccl/headers/include/cuda/std/__algorithm/rotate.h +261 -0
  609. cuda/cccl/headers/include/cuda/std/__algorithm/rotate_copy.h +40 -0
  610. cuda/cccl/headers/include/cuda/std/__algorithm/sample.h +116 -0
  611. cuda/cccl/headers/include/cuda/std/__algorithm/search.h +185 -0
  612. cuda/cccl/headers/include/cuda/std/__algorithm/search_n.h +163 -0
  613. cuda/cccl/headers/include/cuda/std/__algorithm/set_difference.h +95 -0
  614. cuda/cccl/headers/include/cuda/std/__algorithm/set_intersection.h +122 -0
  615. cuda/cccl/headers/include/cuda/std/__algorithm/set_symmetric_difference.h +134 -0
  616. cuda/cccl/headers/include/cuda/std/__algorithm/set_union.h +128 -0
  617. cuda/cccl/headers/include/cuda/std/__algorithm/shift_left.h +84 -0
  618. cuda/cccl/headers/include/cuda/std/__algorithm/shift_right.h +144 -0
  619. cuda/cccl/headers/include/cuda/std/__algorithm/shuffle.h +71 -0
  620. cuda/cccl/headers/include/cuda/std/__algorithm/sift_down.h +139 -0
  621. cuda/cccl/headers/include/cuda/std/__algorithm/sort.h +1097 -0
  622. cuda/cccl/headers/include/cuda/std/__algorithm/sort_heap.h +70 -0
  623. cuda/cccl/headers/include/cuda/std/__algorithm/stable_partition.h +359 -0
  624. cuda/cccl/headers/include/cuda/std/__algorithm/stable_sort.h +321 -0
  625. cuda/cccl/headers/include/cuda/std/__algorithm/swap_ranges.h +78 -0
  626. cuda/cccl/headers/include/cuda/std/__algorithm/transform.h +59 -0
  627. cuda/cccl/headers/include/cuda/std/__algorithm/unique.h +76 -0
  628. cuda/cccl/headers/include/cuda/std/__algorithm/unique_copy.h +153 -0
  629. cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_iter.h +95 -0
  630. cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_range.h +126 -0
  631. cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +83 -0
  632. cuda/cccl/headers/include/cuda/std/__atomic/api/common.h +192 -0
  633. cuda/cccl/headers/include/cuda/std/__atomic/api/owned.h +136 -0
  634. cuda/cccl/headers/include/cuda/std/__atomic/api/reference.h +118 -0
  635. cuda/cccl/headers/include/cuda/std/__atomic/functions/common.h +58 -0
  636. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_local.h +208 -0
  637. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_derived.h +458 -0
  638. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated.h +4436 -0
  639. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h +184 -0
  640. cuda/cccl/headers/include/cuda/std/__atomic/functions/host.h +242 -0
  641. cuda/cccl/headers/include/cuda/std/__atomic/functions.h +33 -0
  642. cuda/cccl/headers/include/cuda/std/__atomic/order.h +158 -0
  643. cuda/cccl/headers/include/cuda/std/__atomic/platform/msvc_to_builtins.h +654 -0
  644. cuda/cccl/headers/include/cuda/std/__atomic/platform.h +93 -0
  645. cuda/cccl/headers/include/cuda/std/__atomic/scopes.h +105 -0
  646. cuda/cccl/headers/include/cuda/std/__atomic/types/base.h +242 -0
  647. cuda/cccl/headers/include/cuda/std/__atomic/types/common.h +103 -0
  648. cuda/cccl/headers/include/cuda/std/__atomic/types/locked.h +225 -0
  649. cuda/cccl/headers/include/cuda/std/__atomic/types/reference.h +72 -0
  650. cuda/cccl/headers/include/cuda/std/__atomic/types/small.h +238 -0
  651. cuda/cccl/headers/include/cuda/std/__atomic/types.h +51 -0
  652. cuda/cccl/headers/include/cuda/std/__atomic/wait/notify_wait.h +95 -0
  653. cuda/cccl/headers/include/cuda/std/__atomic/wait/polling.h +65 -0
  654. cuda/cccl/headers/include/cuda/std/__barrier/barrier.h +227 -0
  655. cuda/cccl/headers/include/cuda/std/__barrier/empty_completion.h +37 -0
  656. cuda/cccl/headers/include/cuda/std/__barrier/poll_tester.h +82 -0
  657. cuda/cccl/headers/include/cuda/std/__bit/bit_cast.h +81 -0
  658. cuda/cccl/headers/include/cuda/std/__bit/blsr.h +51 -0
  659. cuda/cccl/headers/include/cuda/std/__bit/byteswap.h +209 -0
  660. cuda/cccl/headers/include/cuda/std/__bit/countl.h +191 -0
  661. cuda/cccl/headers/include/cuda/std/__bit/countr.h +202 -0
  662. cuda/cccl/headers/include/cuda/std/__bit/endian.h +39 -0
  663. cuda/cccl/headers/include/cuda/std/__bit/has_single_bit.h +43 -0
  664. cuda/cccl/headers/include/cuda/std/__bit/integral.h +125 -0
  665. cuda/cccl/headers/include/cuda/std/__bit/popcount.h +172 -0
  666. cuda/cccl/headers/include/cuda/std/__bit/reference.h +1272 -0
  667. cuda/cccl/headers/include/cuda/std/__bit/rotate.h +185 -0
  668. cuda/cccl/headers/include/cuda/std/__cccl/algorithm_wrapper.h +36 -0
  669. cuda/cccl/headers/include/cuda/std/__cccl/architecture.h +78 -0
  670. cuda/cccl/headers/include/cuda/std/__cccl/assert.h +161 -0
  671. cuda/cccl/headers/include/cuda/std/__cccl/attributes.h +206 -0
  672. cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +494 -0
  673. cuda/cccl/headers/include/cuda/std/__cccl/compiler.h +213 -0
  674. cuda/cccl/headers/include/cuda/std/__cccl/cuda_capabilities.h +51 -0
  675. cuda/cccl/headers/include/cuda/std/__cccl/cuda_toolkit.h +56 -0
  676. cuda/cccl/headers/include/cuda/std/__cccl/deprecated.h +87 -0
  677. cuda/cccl/headers/include/cuda/std/__cccl/diagnostic.h +131 -0
  678. cuda/cccl/headers/include/cuda/std/__cccl/dialect.h +197 -0
  679. cuda/cccl/headers/include/cuda/std/__cccl/epilogue.h +355 -0
  680. cuda/cccl/headers/include/cuda/std/__cccl/exceptions.h +36 -0
  681. cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +74 -0
  682. cuda/cccl/headers/include/cuda/std/__cccl/extended_data_types.h +139 -0
  683. cuda/cccl/headers/include/cuda/std/__cccl/host_std_lib.h +59 -0
  684. cuda/cccl/headers/include/cuda/std/__cccl/is_non_narrowing_convertible.h +71 -0
  685. cuda/cccl/headers/include/cuda/std/__cccl/memory_wrapper.h +36 -0
  686. cuda/cccl/headers/include/cuda/std/__cccl/numeric_wrapper.h +36 -0
  687. cuda/cccl/headers/include/cuda/std/__cccl/os.h +54 -0
  688. cuda/cccl/headers/include/cuda/std/__cccl/preprocessor.h +1288 -0
  689. cuda/cccl/headers/include/cuda/std/__cccl/prologue.h +312 -0
  690. cuda/cccl/headers/include/cuda/std/__cccl/ptx_isa.h +363 -0
  691. cuda/cccl/headers/include/cuda/std/__cccl/rtti.h +72 -0
  692. cuda/cccl/headers/include/cuda/std/__cccl/sequence_access.h +87 -0
  693. cuda/cccl/headers/include/cuda/std/__cccl/system_header.h +38 -0
  694. cuda/cccl/headers/include/cuda/std/__cccl/unreachable.h +31 -0
  695. cuda/cccl/headers/include/cuda/std/__cccl/version.h +26 -0
  696. cuda/cccl/headers/include/cuda/std/__cccl/visibility.h +171 -0
  697. cuda/cccl/headers/include/cuda/std/__charconv/chars_format.h +81 -0
  698. cuda/cccl/headers/include/cuda/std/__charconv/from_chars.h +171 -0
  699. cuda/cccl/headers/include/cuda/std/__charconv/from_chars_result.h +56 -0
  700. cuda/cccl/headers/include/cuda/std/__charconv/to_chars.h +192 -0
  701. cuda/cccl/headers/include/cuda/std/__charconv/to_chars_result.h +56 -0
  702. cuda/cccl/headers/include/cuda/std/__chrono/calendar.h +52 -0
  703. cuda/cccl/headers/include/cuda/std/__chrono/day.h +160 -0
  704. cuda/cccl/headers/include/cuda/std/__chrono/duration.h +499 -0
  705. cuda/cccl/headers/include/cuda/std/__chrono/file_clock.h +53 -0
  706. cuda/cccl/headers/include/cuda/std/__chrono/high_resolution_clock.h +44 -0
  707. cuda/cccl/headers/include/cuda/std/__chrono/month.h +185 -0
  708. cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +58 -0
  709. cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +78 -0
  710. cuda/cccl/headers/include/cuda/std/__chrono/time_point.h +255 -0
  711. cuda/cccl/headers/include/cuda/std/__chrono/year.h +184 -0
  712. cuda/cccl/headers/include/cuda/std/__cmath/abs.h +127 -0
  713. cuda/cccl/headers/include/cuda/std/__cmath/copysign.h +88 -0
  714. cuda/cccl/headers/include/cuda/std/__cmath/error_functions.h +204 -0
  715. cuda/cccl/headers/include/cuda/std/__cmath/exponential_functions.h +783 -0
  716. cuda/cccl/headers/include/cuda/std/__cmath/fdim.h +122 -0
  717. cuda/cccl/headers/include/cuda/std/__cmath/fma.h +129 -0
  718. cuda/cccl/headers/include/cuda/std/__cmath/fpclassify.h +230 -0
  719. cuda/cccl/headers/include/cuda/std/__cmath/gamma.h +204 -0
  720. cuda/cccl/headers/include/cuda/std/__cmath/hyperbolic_functions.h +285 -0
  721. cuda/cccl/headers/include/cuda/std/__cmath/hypot.h +220 -0
  722. cuda/cccl/headers/include/cuda/std/__cmath/inverse_hyperbolic_functions.h +285 -0
  723. cuda/cccl/headers/include/cuda/std/__cmath/inverse_trigonometric_functions.h +370 -0
  724. cuda/cccl/headers/include/cuda/std/__cmath/isfinite.h +166 -0
  725. cuda/cccl/headers/include/cuda/std/__cmath/isinf.h +203 -0
  726. cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +184 -0
  727. cuda/cccl/headers/include/cuda/std/__cmath/isnormal.h +138 -0
  728. cuda/cccl/headers/include/cuda/std/__cmath/lerp.h +101 -0
  729. cuda/cccl/headers/include/cuda/std/__cmath/logarithms.h +533 -0
  730. cuda/cccl/headers/include/cuda/std/__cmath/min_max.h +287 -0
  731. cuda/cccl/headers/include/cuda/std/__cmath/modulo.h +208 -0
  732. cuda/cccl/headers/include/cuda/std/__cmath/nan.h +54 -0
  733. cuda/cccl/headers/include/cuda/std/__cmath/remainder.h +210 -0
  734. cuda/cccl/headers/include/cuda/std/__cmath/roots.h +198 -0
  735. cuda/cccl/headers/include/cuda/std/__cmath/rounding_functions.h +983 -0
  736. cuda/cccl/headers/include/cuda/std/__cmath/signbit.h +56 -0
  737. cuda/cccl/headers/include/cuda/std/__cmath/traits.h +242 -0
  738. cuda/cccl/headers/include/cuda/std/__cmath/trigonometric_functions.h +327 -0
  739. cuda/cccl/headers/include/cuda/std/__complex/arg.h +84 -0
  740. cuda/cccl/headers/include/cuda/std/__complex/complex.h +669 -0
  741. cuda/cccl/headers/include/cuda/std/__complex/exponential_functions.h +403 -0
  742. cuda/cccl/headers/include/cuda/std/__complex/hyperbolic_functions.h +119 -0
  743. cuda/cccl/headers/include/cuda/std/__complex/inverse_hyperbolic_functions.h +522 -0
  744. cuda/cccl/headers/include/cuda/std/__complex/inverse_trigonometric_functions.h +131 -0
  745. cuda/cccl/headers/include/cuda/std/__complex/literals.h +86 -0
  746. cuda/cccl/headers/include/cuda/std/__complex/logarithms.h +302 -0
  747. cuda/cccl/headers/include/cuda/std/__complex/math.h +161 -0
  748. cuda/cccl/headers/include/cuda/std/__complex/nvbf16.h +323 -0
  749. cuda/cccl/headers/include/cuda/std/__complex/nvfp16.h +322 -0
  750. cuda/cccl/headers/include/cuda/std/__complex/roots.h +214 -0
  751. cuda/cccl/headers/include/cuda/std/__complex/trigonometric_functions.h +61 -0
  752. cuda/cccl/headers/include/cuda/std/__complex/tuple.h +107 -0
  753. cuda/cccl/headers/include/cuda/std/__complex/vector_support.h +130 -0
  754. cuda/cccl/headers/include/cuda/std/__concepts/arithmetic.h +56 -0
  755. cuda/cccl/headers/include/cuda/std/__concepts/assignable.h +64 -0
  756. cuda/cccl/headers/include/cuda/std/__concepts/boolean_testable.h +63 -0
  757. cuda/cccl/headers/include/cuda/std/__concepts/class_or_enum.h +45 -0
  758. cuda/cccl/headers/include/cuda/std/__concepts/common_reference_with.h +69 -0
  759. cuda/cccl/headers/include/cuda/std/__concepts/common_with.h +82 -0
  760. cuda/cccl/headers/include/cuda/std/__concepts/concept_macros.h +385 -0
  761. cuda/cccl/headers/include/cuda/std/__concepts/constructible.h +174 -0
  762. cuda/cccl/headers/include/cuda/std/__concepts/convertible_to.h +70 -0
  763. cuda/cccl/headers/include/cuda/std/__concepts/copyable.h +60 -0
  764. cuda/cccl/headers/include/cuda/std/__concepts/derived_from.h +56 -0
  765. cuda/cccl/headers/include/cuda/std/__concepts/destructible.h +76 -0
  766. cuda/cccl/headers/include/cuda/std/__concepts/different_from.h +38 -0
  767. cuda/cccl/headers/include/cuda/std/__concepts/equality_comparable.h +98 -0
  768. cuda/cccl/headers/include/cuda/std/__concepts/invocable.h +80 -0
  769. cuda/cccl/headers/include/cuda/std/__concepts/movable.h +58 -0
  770. cuda/cccl/headers/include/cuda/std/__concepts/predicate.h +54 -0
  771. cuda/cccl/headers/include/cuda/std/__concepts/regular.h +54 -0
  772. cuda/cccl/headers/include/cuda/std/__concepts/relation.h +77 -0
  773. cuda/cccl/headers/include/cuda/std/__concepts/same_as.h +39 -0
  774. cuda/cccl/headers/include/cuda/std/__concepts/semiregular.h +54 -0
  775. cuda/cccl/headers/include/cuda/std/__concepts/swappable.h +206 -0
  776. cuda/cccl/headers/include/cuda/std/__concepts/totally_ordered.h +101 -0
  777. cuda/cccl/headers/include/cuda/std/__cstddef/byte.h +113 -0
  778. cuda/cccl/headers/include/cuda/std/__cstddef/types.h +52 -0
  779. cuda/cccl/headers/include/cuda/std/__cstdlib/abs.h +57 -0
  780. cuda/cccl/headers/include/cuda/std/__cstdlib/aligned_alloc.h +66 -0
  781. cuda/cccl/headers/include/cuda/std/__cstdlib/div.h +96 -0
  782. cuda/cccl/headers/include/cuda/std/__cstdlib/malloc.h +70 -0
  783. cuda/cccl/headers/include/cuda/std/__cstring/memcpy.h +61 -0
  784. cuda/cccl/headers/include/cuda/std/__cstring/memset.h +46 -0
  785. cuda/cccl/headers/include/cuda/std/__exception/cuda_error.h +110 -0
  786. cuda/cccl/headers/include/cuda/std/__exception/exception_macros.h +108 -0
  787. cuda/cccl/headers/include/cuda/std/__exception/format_error.h +62 -0
  788. cuda/cccl/headers/include/cuda/std/__exception/msg_storage.h +41 -0
  789. cuda/cccl/headers/include/cuda/std/__exception/terminate.h +74 -0
  790. cuda/cccl/headers/include/cuda/std/__exception/throw_error.h +120 -0
  791. cuda/cccl/headers/include/cuda/std/__execution/env.h +455 -0
  792. cuda/cccl/headers/include/cuda/std/__execution/policy.h +90 -0
  793. cuda/cccl/headers/include/cuda/std/__expected/bad_expected_access.h +127 -0
  794. cuda/cccl/headers/include/cuda/std/__expected/expected.h +1941 -0
  795. cuda/cccl/headers/include/cuda/std/__expected/expected_base.h +1051 -0
  796. cuda/cccl/headers/include/cuda/std/__expected/unexpect.h +37 -0
  797. cuda/cccl/headers/include/cuda/std/__expected/unexpected.h +164 -0
  798. cuda/cccl/headers/include/cuda/std/__floating_point/arithmetic.h +56 -0
  799. cuda/cccl/headers/include/cuda/std/__floating_point/cast.h +812 -0
  800. cuda/cccl/headers/include/cuda/std/__floating_point/cccl_fp.h +125 -0
  801. cuda/cccl/headers/include/cuda/std/__floating_point/common_type.h +48 -0
  802. cuda/cccl/headers/include/cuda/std/__floating_point/constants.h +375 -0
  803. cuda/cccl/headers/include/cuda/std/__floating_point/conversion_rank_order.h +124 -0
  804. cuda/cccl/headers/include/cuda/std/__floating_point/cuda_fp_types.h +116 -0
  805. cuda/cccl/headers/include/cuda/std/__floating_point/decompose.h +126 -0
  806. cuda/cccl/headers/include/cuda/std/__floating_point/format.h +162 -0
  807. cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +40 -0
  808. cuda/cccl/headers/include/cuda/std/__floating_point/mask.h +78 -0
  809. cuda/cccl/headers/include/cuda/std/__floating_point/native_type.h +81 -0
  810. cuda/cccl/headers/include/cuda/std/__floating_point/overflow_handler.h +139 -0
  811. cuda/cccl/headers/include/cuda/std/__floating_point/properties.h +229 -0
  812. cuda/cccl/headers/include/cuda/std/__floating_point/storage.h +248 -0
  813. cuda/cccl/headers/include/cuda/std/__floating_point/traits.h +172 -0
  814. cuda/cccl/headers/include/cuda/std/__format/buffer.h +48 -0
  815. cuda/cccl/headers/include/cuda/std/__format/concepts.h +69 -0
  816. cuda/cccl/headers/include/cuda/std/__format/format_arg.h +282 -0
  817. cuda/cccl/headers/include/cuda/std/__format/format_arg_store.h +279 -0
  818. cuda/cccl/headers/include/cuda/std/__format/format_args.h +122 -0
  819. cuda/cccl/headers/include/cuda/std/__format/format_context.h +93 -0
  820. cuda/cccl/headers/include/cuda/std/__format/format_error.h +76 -0
  821. cuda/cccl/headers/include/cuda/std/__format/format_integral.h +237 -0
  822. cuda/cccl/headers/include/cuda/std/__format/format_parse_context.h +124 -0
  823. cuda/cccl/headers/include/cuda/std/__format/format_spec_parser.h +1265 -0
  824. cuda/cccl/headers/include/cuda/std/__format/formatter.h +59 -0
  825. cuda/cccl/headers/include/cuda/std/__format/formatters/bool.h +101 -0
  826. cuda/cccl/headers/include/cuda/std/__format/formatters/char.h +124 -0
  827. cuda/cccl/headers/include/cuda/std/__format/formatters/fp.h +101 -0
  828. cuda/cccl/headers/include/cuda/std/__format/formatters/int.h +174 -0
  829. cuda/cccl/headers/include/cuda/std/__format/formatters/ptr.h +104 -0
  830. cuda/cccl/headers/include/cuda/std/__format/formatters/str.h +178 -0
  831. cuda/cccl/headers/include/cuda/std/__format/output_utils.h +272 -0
  832. cuda/cccl/headers/include/cuda/std/__format/parse_arg_id.h +138 -0
  833. cuda/cccl/headers/include/cuda/std/__format_ +45 -0
  834. cuda/cccl/headers/include/cuda/std/__functional/binary_function.h +63 -0
  835. cuda/cccl/headers/include/cuda/std/__functional/binary_negate.h +65 -0
  836. cuda/cccl/headers/include/cuda/std/__functional/bind.h +334 -0
  837. cuda/cccl/headers/include/cuda/std/__functional/bind_back.h +81 -0
  838. cuda/cccl/headers/include/cuda/std/__functional/bind_front.h +74 -0
  839. cuda/cccl/headers/include/cuda/std/__functional/binder1st.h +74 -0
  840. cuda/cccl/headers/include/cuda/std/__functional/binder2nd.h +74 -0
  841. cuda/cccl/headers/include/cuda/std/__functional/compose.h +69 -0
  842. cuda/cccl/headers/include/cuda/std/__functional/default_searcher.h +76 -0
  843. cuda/cccl/headers/include/cuda/std/__functional/function.h +1271 -0
  844. cuda/cccl/headers/include/cuda/std/__functional/hash.h +649 -0
  845. cuda/cccl/headers/include/cuda/std/__functional/identity.h +57 -0
  846. cuda/cccl/headers/include/cuda/std/__functional/invoke.h +298 -0
  847. cuda/cccl/headers/include/cuda/std/__functional/is_transparent.h +41 -0
  848. cuda/cccl/headers/include/cuda/std/__functional/mem_fn.h +66 -0
  849. cuda/cccl/headers/include/cuda/std/__functional/mem_fun_ref.h +211 -0
  850. cuda/cccl/headers/include/cuda/std/__functional/not_fn.h +120 -0
  851. cuda/cccl/headers/include/cuda/std/__functional/operations.h +535 -0
  852. cuda/cccl/headers/include/cuda/std/__functional/perfect_forward.h +128 -0
  853. cuda/cccl/headers/include/cuda/std/__functional/pointer_to_binary_function.h +64 -0
  854. cuda/cccl/headers/include/cuda/std/__functional/pointer_to_unary_function.h +63 -0
  855. cuda/cccl/headers/include/cuda/std/__functional/ranges_operations.h +113 -0
  856. cuda/cccl/headers/include/cuda/std/__functional/reference_wrapper.h +114 -0
  857. cuda/cccl/headers/include/cuda/std/__functional/unary_function.h +62 -0
  858. cuda/cccl/headers/include/cuda/std/__functional/unary_negate.h +65 -0
  859. cuda/cccl/headers/include/cuda/std/__functional/unwrap_ref.h +56 -0
  860. cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +262 -0
  861. cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +53 -0
  862. cuda/cccl/headers/include/cuda/std/__fwd/array.h +42 -0
  863. cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +74 -0
  864. cuda/cccl/headers/include/cuda/std/__fwd/complex.h +75 -0
  865. cuda/cccl/headers/include/cuda/std/__fwd/execution_policy.h +73 -0
  866. cuda/cccl/headers/include/cuda/std/__fwd/expected.h +46 -0
  867. cuda/cccl/headers/include/cuda/std/__fwd/format.h +84 -0
  868. cuda/cccl/headers/include/cuda/std/__fwd/fp.h +37 -0
  869. cuda/cccl/headers/include/cuda/std/__fwd/get.h +122 -0
  870. cuda/cccl/headers/include/cuda/std/__fwd/hash.h +34 -0
  871. cuda/cccl/headers/include/cuda/std/__fwd/ios.h +123 -0
  872. cuda/cccl/headers/include/cuda/std/__fwd/iterator.h +43 -0
  873. cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +122 -0
  874. cuda/cccl/headers/include/cuda/std/__fwd/memory_resource.h +37 -0
  875. cuda/cccl/headers/include/cuda/std/__fwd/optional.h +39 -0
  876. cuda/cccl/headers/include/cuda/std/__fwd/pair.h +47 -0
  877. cuda/cccl/headers/include/cuda/std/__fwd/reference_wrapper.h +39 -0
  878. cuda/cccl/headers/include/cuda/std/__fwd/span.h +45 -0
  879. cuda/cccl/headers/include/cuda/std/__fwd/string.h +112 -0
  880. cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +91 -0
  881. cuda/cccl/headers/include/cuda/std/__fwd/subrange.h +55 -0
  882. cuda/cccl/headers/include/cuda/std/__fwd/tuple.h +37 -0
  883. cuda/cccl/headers/include/cuda/std/__fwd/unexpected.h +40 -0
  884. cuda/cccl/headers/include/cuda/std/__fwd/variant.h +51 -0
  885. cuda/cccl/headers/include/cuda/std/__internal/atomic.h +55 -0
  886. cuda/cccl/headers/include/cuda/std/__internal/cpp_dialect.h +44 -0
  887. cuda/cccl/headers/include/cuda/std/__internal/features.h +104 -0
  888. cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +132 -0
  889. cuda/cccl/headers/include/cuda/std/__internal/pstl_config.h +32 -0
  890. cuda/cccl/headers/include/cuda/std/__internal/thread_api.h +58 -0
  891. cuda/cccl/headers/include/cuda/std/__internal/version.h +52 -0
  892. cuda/cccl/headers/include/cuda/std/__iterator/access.h +128 -0
  893. cuda/cccl/headers/include/cuda/std/__iterator/advance.h +227 -0
  894. cuda/cccl/headers/include/cuda/std/__iterator/back_insert_iterator.h +164 -0
  895. cuda/cccl/headers/include/cuda/std/__iterator/bounded_iter.h +253 -0
  896. cuda/cccl/headers/include/cuda/std/__iterator/concepts.h +645 -0
  897. cuda/cccl/headers/include/cuda/std/__iterator/counted_iterator.h +465 -0
  898. cuda/cccl/headers/include/cuda/std/__iterator/data.h +61 -0
  899. cuda/cccl/headers/include/cuda/std/__iterator/default_sentinel.h +36 -0
  900. cuda/cccl/headers/include/cuda/std/__iterator/distance.h +124 -0
  901. cuda/cccl/headers/include/cuda/std/__iterator/empty.h +53 -0
  902. cuda/cccl/headers/include/cuda/std/__iterator/erase_if_container.h +53 -0
  903. cuda/cccl/headers/include/cuda/std/__iterator/front_insert_iterator.h +100 -0
  904. cuda/cccl/headers/include/cuda/std/__iterator/incrementable_traits.h +143 -0
  905. cuda/cccl/headers/include/cuda/std/__iterator/indirectly_comparable.h +55 -0
  906. cuda/cccl/headers/include/cuda/std/__iterator/insert_iterator.h +107 -0
  907. cuda/cccl/headers/include/cuda/std/__iterator/istream_iterator.h +146 -0
  908. cuda/cccl/headers/include/cuda/std/__iterator/istreambuf_iterator.h +161 -0
  909. cuda/cccl/headers/include/cuda/std/__iterator/iter_move.h +161 -0
  910. cuda/cccl/headers/include/cuda/std/__iterator/iter_swap.h +181 -0
  911. cuda/cccl/headers/include/cuda/std/__iterator/iterator.h +44 -0
  912. cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +847 -0
  913. cuda/cccl/headers/include/cuda/std/__iterator/mergeable.h +72 -0
  914. cuda/cccl/headers/include/cuda/std/__iterator/move_iterator.h +432 -0
  915. cuda/cccl/headers/include/cuda/std/__iterator/move_sentinel.h +73 -0
  916. cuda/cccl/headers/include/cuda/std/__iterator/next.h +101 -0
  917. cuda/cccl/headers/include/cuda/std/__iterator/ostream_iterator.h +95 -0
  918. cuda/cccl/headers/include/cuda/std/__iterator/ostreambuf_iterator.h +100 -0
  919. cuda/cccl/headers/include/cuda/std/__iterator/permutable.h +54 -0
  920. cuda/cccl/headers/include/cuda/std/__iterator/prev.h +90 -0
  921. cuda/cccl/headers/include/cuda/std/__iterator/projected.h +61 -0
  922. cuda/cccl/headers/include/cuda/std/__iterator/readable_traits.h +156 -0
  923. cuda/cccl/headers/include/cuda/std/__iterator/reverse_access.h +142 -0
  924. cuda/cccl/headers/include/cuda/std/__iterator/reverse_iterator.h +366 -0
  925. cuda/cccl/headers/include/cuda/std/__iterator/size.h +69 -0
  926. cuda/cccl/headers/include/cuda/std/__iterator/sortable.h +55 -0
  927. cuda/cccl/headers/include/cuda/std/__iterator/unreachable_sentinel.h +84 -0
  928. cuda/cccl/headers/include/cuda/std/__iterator/wrap_iter.h +245 -0
  929. cuda/cccl/headers/include/cuda/std/__latch/latch.h +88 -0
  930. cuda/cccl/headers/include/cuda/std/__limits/numeric_limits.h +603 -0
  931. cuda/cccl/headers/include/cuda/std/__limits/numeric_limits_ext.h +753 -0
  932. cuda/cccl/headers/include/cuda/std/__linalg/conj_if_needed.h +76 -0
  933. cuda/cccl/headers/include/cuda/std/__linalg/conjugate_transposed.h +52 -0
  934. cuda/cccl/headers/include/cuda/std/__linalg/conjugated.h +136 -0
  935. cuda/cccl/headers/include/cuda/std/__linalg/scaled.h +128 -0
  936. cuda/cccl/headers/include/cuda/std/__linalg/transposed.h +315 -0
  937. cuda/cccl/headers/include/cuda/std/__mdspan/aligned_accessor.h +97 -0
  938. cuda/cccl/headers/include/cuda/std/__mdspan/concepts.h +137 -0
  939. cuda/cccl/headers/include/cuda/std/__mdspan/default_accessor.h +73 -0
  940. cuda/cccl/headers/include/cuda/std/__mdspan/empty_base.h +348 -0
  941. cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +749 -0
  942. cuda/cccl/headers/include/cuda/std/__mdspan/layout_left.h +314 -0
  943. cuda/cccl/headers/include/cuda/std/__mdspan/layout_right.h +307 -0
  944. cuda/cccl/headers/include/cuda/std/__mdspan/layout_stride.h +598 -0
  945. cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +515 -0
  946. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_extents.h +190 -0
  947. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_helper.h +187 -0
  948. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_mapping.h +339 -0
  949. cuda/cccl/headers/include/cuda/std/__memory/addressof.h +90 -0
  950. cuda/cccl/headers/include/cuda/std/__memory/align.h +67 -0
  951. cuda/cccl/headers/include/cuda/std/__memory/allocate_at_least.h +82 -0
  952. cuda/cccl/headers/include/cuda/std/__memory/allocation_guard.h +100 -0
  953. cuda/cccl/headers/include/cuda/std/__memory/allocator.h +327 -0
  954. cuda/cccl/headers/include/cuda/std/__memory/allocator_arg_t.h +85 -0
  955. cuda/cccl/headers/include/cuda/std/__memory/allocator_destructor.h +59 -0
  956. cuda/cccl/headers/include/cuda/std/__memory/allocator_traits.h +517 -0
  957. cuda/cccl/headers/include/cuda/std/__memory/assume_aligned.h +59 -0
  958. cuda/cccl/headers/include/cuda/std/__memory/builtin_new_allocator.h +87 -0
  959. cuda/cccl/headers/include/cuda/std/__memory/compressed_pair.h +225 -0
  960. cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +241 -0
  961. cuda/cccl/headers/include/cuda/std/__memory/destruct_n.h +91 -0
  962. cuda/cccl/headers/include/cuda/std/__memory/is_sufficiently_aligned.h +51 -0
  963. cuda/cccl/headers/include/cuda/std/__memory/pointer_traits.h +246 -0
  964. cuda/cccl/headers/include/cuda/std/__memory/runtime_assume_aligned.h +62 -0
  965. cuda/cccl/headers/include/cuda/std/__memory/temporary_buffer.h +99 -0
  966. cuda/cccl/headers/include/cuda/std/__memory/uninitialized_algorithms.h +678 -0
  967. cuda/cccl/headers/include/cuda/std/__memory/unique_ptr.h +765 -0
  968. cuda/cccl/headers/include/cuda/std/__memory/uses_allocator.h +59 -0
  969. cuda/cccl/headers/include/cuda/std/__memory/voidify.h +41 -0
  970. cuda/cccl/headers/include/cuda/std/__memory_ +34 -0
  971. cuda/cccl/headers/include/cuda/std/__new/allocate.h +131 -0
  972. cuda/cccl/headers/include/cuda/std/__new/bad_alloc.h +57 -0
  973. cuda/cccl/headers/include/cuda/std/__new/device_new.h +30 -0
  974. cuda/cccl/headers/include/cuda/std/__new/launder.h +53 -0
  975. cuda/cccl/headers/include/cuda/std/__new_ +30 -0
  976. cuda/cccl/headers/include/cuda/std/__numeric/accumulate.h +56 -0
  977. cuda/cccl/headers/include/cuda/std/__numeric/adjacent_difference.h +72 -0
  978. cuda/cccl/headers/include/cuda/std/__numeric/exclusive_scan.h +66 -0
  979. cuda/cccl/headers/include/cuda/std/__numeric/gcd_lcm.h +78 -0
  980. cuda/cccl/headers/include/cuda/std/__numeric/inclusive_scan.h +73 -0
  981. cuda/cccl/headers/include/cuda/std/__numeric/inner_product.h +62 -0
  982. cuda/cccl/headers/include/cuda/std/__numeric/iota.h +42 -0
  983. cuda/cccl/headers/include/cuda/std/__numeric/midpoint.h +97 -0
  984. cuda/cccl/headers/include/cuda/std/__numeric/partial_sum.h +69 -0
  985. cuda/cccl/headers/include/cuda/std/__numeric/reduce.h +60 -0
  986. cuda/cccl/headers/include/cuda/std/__numeric/transform_exclusive_scan.h +51 -0
  987. cuda/cccl/headers/include/cuda/std/__numeric/transform_inclusive_scan.h +65 -0
  988. cuda/cccl/headers/include/cuda/std/__numeric/transform_reduce.h +72 -0
  989. cuda/cccl/headers/include/cuda/std/__optional/bad_optional_access.h +74 -0
  990. cuda/cccl/headers/include/cuda/std/__optional/hash.h +53 -0
  991. cuda/cccl/headers/include/cuda/std/__optional/make_optional.h +61 -0
  992. cuda/cccl/headers/include/cuda/std/__optional/nullopt.h +43 -0
  993. cuda/cccl/headers/include/cuda/std/__optional/optional.h +861 -0
  994. cuda/cccl/headers/include/cuda/std/__optional/optional_base.h +439 -0
  995. cuda/cccl/headers/include/cuda/std/__optional/optional_ref.h +324 -0
  996. cuda/cccl/headers/include/cuda/std/__pstl/cuda/for_each_n.h +97 -0
  997. cuda/cccl/headers/include/cuda/std/__pstl/dispatch.h +123 -0
  998. cuda/cccl/headers/include/cuda/std/__pstl/for_each.h +71 -0
  999. cuda/cccl/headers/include/cuda/std/__pstl/for_each_n.h +68 -0
  1000. cuda/cccl/headers/include/cuda/std/__random/bernoulli_distribution.h +173 -0
  1001. cuda/cccl/headers/include/cuda/std/__random/binomial_distribution.h +254 -0
  1002. cuda/cccl/headers/include/cuda/std/__random/cauchy_distribution.h +192 -0
  1003. cuda/cccl/headers/include/cuda/std/__random/chi_squared_distribution.h +179 -0
  1004. cuda/cccl/headers/include/cuda/std/__random/exponential_distribution.h +187 -0
  1005. cuda/cccl/headers/include/cuda/std/__random/extreme_value_distribution.h +196 -0
  1006. cuda/cccl/headers/include/cuda/std/__random/fisher_f_distribution.h +196 -0
  1007. cuda/cccl/headers/include/cuda/std/__random/gamma_distribution.h +257 -0
  1008. cuda/cccl/headers/include/cuda/std/__random/generate_canonical.h +56 -0
  1009. cuda/cccl/headers/include/cuda/std/__random/geometric_distribution.h +179 -0
  1010. cuda/cccl/headers/include/cuda/std/__random/is_seed_sequence.h +39 -0
  1011. cuda/cccl/headers/include/cuda/std/__random/is_valid.h +70 -0
  1012. cuda/cccl/headers/include/cuda/std/__random/linear_congruential_engine.h +398 -0
  1013. cuda/cccl/headers/include/cuda/std/__random/lognormal_distribution.h +174 -0
  1014. cuda/cccl/headers/include/cuda/std/__random/negative_binomial_distribution.h +212 -0
  1015. cuda/cccl/headers/include/cuda/std/__random/normal_distribution.h +232 -0
  1016. cuda/cccl/headers/include/cuda/std/__random/philox_engine.h +562 -0
  1017. cuda/cccl/headers/include/cuda/std/__random/poisson_distribution.h +338 -0
  1018. cuda/cccl/headers/include/cuda/std/__random/seed_seq.h +204 -0
  1019. cuda/cccl/headers/include/cuda/std/__random/student_t_distribution.h +186 -0
  1020. cuda/cccl/headers/include/cuda/std/__random/uniform_int_distribution.h +341 -0
  1021. cuda/cccl/headers/include/cuda/std/__random/uniform_real_distribution.h +192 -0
  1022. cuda/cccl/headers/include/cuda/std/__random/weibull_distribution.h +189 -0
  1023. cuda/cccl/headers/include/cuda/std/__random_ +47 -0
  1024. cuda/cccl/headers/include/cuda/std/__ranges/access.h +303 -0
  1025. cuda/cccl/headers/include/cuda/std/__ranges/all.h +98 -0
  1026. cuda/cccl/headers/include/cuda/std/__ranges/compressed_movable_box.h +889 -0
  1027. cuda/cccl/headers/include/cuda/std/__ranges/concepts.h +302 -0
  1028. cuda/cccl/headers/include/cuda/std/__ranges/counted.h +90 -0
  1029. cuda/cccl/headers/include/cuda/std/__ranges/dangling.h +54 -0
  1030. cuda/cccl/headers/include/cuda/std/__ranges/data.h +136 -0
  1031. cuda/cccl/headers/include/cuda/std/__ranges/drop_view.h +389 -0
  1032. cuda/cccl/headers/include/cuda/std/__ranges/empty.h +109 -0
  1033. cuda/cccl/headers/include/cuda/std/__ranges/empty_view.h +77 -0
  1034. cuda/cccl/headers/include/cuda/std/__ranges/enable_borrowed_range.h +41 -0
  1035. cuda/cccl/headers/include/cuda/std/__ranges/enable_view.h +78 -0
  1036. cuda/cccl/headers/include/cuda/std/__ranges/from_range.h +36 -0
  1037. cuda/cccl/headers/include/cuda/std/__ranges/iota_view.h +264 -0
  1038. cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +409 -0
  1039. cuda/cccl/headers/include/cuda/std/__ranges/non_propagating_cache.h +210 -0
  1040. cuda/cccl/headers/include/cuda/std/__ranges/owning_view.h +163 -0
  1041. cuda/cccl/headers/include/cuda/std/__ranges/range_adaptor.h +111 -0
  1042. cuda/cccl/headers/include/cuda/std/__ranges/rbegin.h +175 -0
  1043. cuda/cccl/headers/include/cuda/std/__ranges/ref_view.h +121 -0
  1044. cuda/cccl/headers/include/cuda/std/__ranges/rend.h +182 -0
  1045. cuda/cccl/headers/include/cuda/std/__ranges/repeat_view.h +346 -0
  1046. cuda/cccl/headers/include/cuda/std/__ranges/single_view.h +155 -0
  1047. cuda/cccl/headers/include/cuda/std/__ranges/size.h +201 -0
  1048. cuda/cccl/headers/include/cuda/std/__ranges/subrange.h +510 -0
  1049. cuda/cccl/headers/include/cuda/std/__ranges/take_view.h +472 -0
  1050. cuda/cccl/headers/include/cuda/std/__ranges/take_while_view.h +259 -0
  1051. cuda/cccl/headers/include/cuda/std/__ranges/transform_view.h +522 -0
  1052. cuda/cccl/headers/include/cuda/std/__ranges/unwrap_end.h +52 -0
  1053. cuda/cccl/headers/include/cuda/std/__ranges/view_interface.h +183 -0
  1054. cuda/cccl/headers/include/cuda/std/__ranges/views.h +38 -0
  1055. cuda/cccl/headers/include/cuda/std/__semaphore/atomic_semaphore.h +234 -0
  1056. cuda/cccl/headers/include/cuda/std/__semaphore/counting_semaphore.h +51 -0
  1057. cuda/cccl/headers/include/cuda/std/__string/char_traits.h +190 -0
  1058. cuda/cccl/headers/include/cuda/std/__string/constexpr_c_functions.h +580 -0
  1059. cuda/cccl/headers/include/cuda/std/__string/helper_functions.h +296 -0
  1060. cuda/cccl/headers/include/cuda/std/__string/string_view.h +244 -0
  1061. cuda/cccl/headers/include/cuda/std/__string_ +29 -0
  1062. cuda/cccl/headers/include/cuda/std/__system_error/errc.h +51 -0
  1063. cuda/cccl/headers/include/cuda/std/__system_error_ +26 -0
  1064. cuda/cccl/headers/include/cuda/std/__thread/threading_support.h +106 -0
  1065. cuda/cccl/headers/include/cuda/std/__thread/threading_support_cuda.h +47 -0
  1066. cuda/cccl/headers/include/cuda/std/__thread/threading_support_external.h +41 -0
  1067. cuda/cccl/headers/include/cuda/std/__thread/threading_support_pthread.h +143 -0
  1068. cuda/cccl/headers/include/cuda/std/__thread/threading_support_win32.h +87 -0
  1069. cuda/cccl/headers/include/cuda/std/__tuple_dir/apply.h +82 -0
  1070. cuda/cccl/headers/include/cuda/std/__tuple_dir/get.h +122 -0
  1071. cuda/cccl/headers/include/cuda/std/__tuple_dir/ignore.h +51 -0
  1072. cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +120 -0
  1073. cuda/cccl/headers/include/cuda/std/__tuple_dir/sfinae_helpers.h +100 -0
  1074. cuda/cccl/headers/include/cuda/std/__tuple_dir/structured_bindings.h +212 -0
  1075. cuda/cccl/headers/include/cuda/std/__tuple_dir/tie.h +55 -0
  1076. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple.h +457 -0
  1077. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_cat.h +158 -0
  1078. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_constraints.h +286 -0
  1079. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_element.h +77 -0
  1080. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_indices.h +44 -0
  1081. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_leaf.h +452 -0
  1082. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +83 -0
  1083. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +68 -0
  1084. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_size.h +79 -0
  1085. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_types.h +35 -0
  1086. cuda/cccl/headers/include/cuda/std/__tuple_dir/vector_types.h +290 -0
  1087. cuda/cccl/headers/include/cuda/std/__type_traits/add_const.h +40 -0
  1088. cuda/cccl/headers/include/cuda/std/__type_traits/add_cv.h +40 -0
  1089. cuda/cccl/headers/include/cuda/std/__type_traits/add_lvalue_reference.h +62 -0
  1090. cuda/cccl/headers/include/cuda/std/__type_traits/add_pointer.h +65 -0
  1091. cuda/cccl/headers/include/cuda/std/__type_traits/add_rvalue_reference.h +62 -0
  1092. cuda/cccl/headers/include/cuda/std/__type_traits/add_volatile.h +40 -0
  1093. cuda/cccl/headers/include/cuda/std/__type_traits/aligned_storage.h +155 -0
  1094. cuda/cccl/headers/include/cuda/std/__type_traits/aligned_union.h +62 -0
  1095. cuda/cccl/headers/include/cuda/std/__type_traits/alignment_of.h +41 -0
  1096. cuda/cccl/headers/include/cuda/std/__type_traits/always_false.h +35 -0
  1097. cuda/cccl/headers/include/cuda/std/__type_traits/can_extract_key.h +68 -0
  1098. cuda/cccl/headers/include/cuda/std/__type_traits/common_reference.h +262 -0
  1099. cuda/cccl/headers/include/cuda/std/__type_traits/common_type.h +173 -0
  1100. cuda/cccl/headers/include/cuda/std/__type_traits/conditional.h +65 -0
  1101. cuda/cccl/headers/include/cuda/std/__type_traits/conjunction.h +67 -0
  1102. cuda/cccl/headers/include/cuda/std/__type_traits/copy_cv.h +50 -0
  1103. cuda/cccl/headers/include/cuda/std/__type_traits/copy_cvref.h +148 -0
  1104. cuda/cccl/headers/include/cuda/std/__type_traits/decay.h +83 -0
  1105. cuda/cccl/headers/include/cuda/std/__type_traits/dependent_type.h +35 -0
  1106. cuda/cccl/headers/include/cuda/std/__type_traits/disjunction.h +77 -0
  1107. cuda/cccl/headers/include/cuda/std/__type_traits/enable_if.h +43 -0
  1108. cuda/cccl/headers/include/cuda/std/__type_traits/extent.h +68 -0
  1109. cuda/cccl/headers/include/cuda/std/__type_traits/fold.h +47 -0
  1110. cuda/cccl/headers/include/cuda/std/__type_traits/has_unique_object_representation.h +46 -0
  1111. cuda/cccl/headers/include/cuda/std/__type_traits/has_virtual_destructor.h +42 -0
  1112. cuda/cccl/headers/include/cuda/std/__type_traits/integral_constant.h +62 -0
  1113. cuda/cccl/headers/include/cuda/std/__type_traits/is_abstract.h +42 -0
  1114. cuda/cccl/headers/include/cuda/std/__type_traits/is_aggregate.h +42 -0
  1115. cuda/cccl/headers/include/cuda/std/__type_traits/is_allocator.h +46 -0
  1116. cuda/cccl/headers/include/cuda/std/__type_traits/is_arithmetic.h +42 -0
  1117. cuda/cccl/headers/include/cuda/std/__type_traits/is_array.h +62 -0
  1118. cuda/cccl/headers/include/cuda/std/__type_traits/is_assignable.h +78 -0
  1119. cuda/cccl/headers/include/cuda/std/__type_traits/is_base_of.h +42 -0
  1120. cuda/cccl/headers/include/cuda/std/__type_traits/is_bounded_array.h +44 -0
  1121. cuda/cccl/headers/include/cuda/std/__type_traits/is_callable.h +60 -0
  1122. cuda/cccl/headers/include/cuda/std/__type_traits/is_char_like_type.h +38 -0
  1123. cuda/cccl/headers/include/cuda/std/__type_traits/is_class.h +42 -0
  1124. cuda/cccl/headers/include/cuda/std/__type_traits/is_comparable.h +78 -0
  1125. cuda/cccl/headers/include/cuda/std/__type_traits/is_compound.h +58 -0
  1126. cuda/cccl/headers/include/cuda/std/__type_traits/is_const.h +56 -0
  1127. cuda/cccl/headers/include/cuda/std/__type_traits/is_constant_evaluated.h +49 -0
  1128. cuda/cccl/headers/include/cuda/std/__type_traits/is_constructible.h +174 -0
  1129. cuda/cccl/headers/include/cuda/std/__type_traits/is_convertible.h +211 -0
  1130. cuda/cccl/headers/include/cuda/std/__type_traits/is_copy_assignable.h +43 -0
  1131. cuda/cccl/headers/include/cuda/std/__type_traits/is_copy_constructible.h +43 -0
  1132. cuda/cccl/headers/include/cuda/std/__type_traits/is_core_convertible.h +47 -0
  1133. cuda/cccl/headers/include/cuda/std/__type_traits/is_corresponding_member.h +42 -0
  1134. cuda/cccl/headers/include/cuda/std/__type_traits/is_default_constructible.h +40 -0
  1135. cuda/cccl/headers/include/cuda/std/__type_traits/is_destructible.h +115 -0
  1136. cuda/cccl/headers/include/cuda/std/__type_traits/is_empty.h +42 -0
  1137. cuda/cccl/headers/include/cuda/std/__type_traits/is_enum.h +42 -0
  1138. cuda/cccl/headers/include/cuda/std/__type_traits/is_execution_policy.h +63 -0
  1139. cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_arithmetic.h +38 -0
  1140. cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_floating_point.h +79 -0
  1141. cuda/cccl/headers/include/cuda/std/__type_traits/is_final.h +42 -0
  1142. cuda/cccl/headers/include/cuda/std/__type_traits/is_floating_point.h +53 -0
  1143. cuda/cccl/headers/include/cuda/std/__type_traits/is_fully_bounded_array.h +47 -0
  1144. cuda/cccl/headers/include/cuda/std/__type_traits/is_function.h +61 -0
  1145. cuda/cccl/headers/include/cuda/std/__type_traits/is_fundamental.h +56 -0
  1146. cuda/cccl/headers/include/cuda/std/__type_traits/is_implicitly_default_constructible.h +57 -0
  1147. cuda/cccl/headers/include/cuda/std/__type_traits/is_integer.h +45 -0
  1148. cuda/cccl/headers/include/cuda/std/__type_traits/is_integral.h +123 -0
  1149. cuda/cccl/headers/include/cuda/std/__type_traits/is_layout_compatible.h +45 -0
  1150. cuda/cccl/headers/include/cuda/std/__type_traits/is_literal_type.h +42 -0
  1151. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_function_pointer.h +79 -0
  1152. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_object_pointer.h +57 -0
  1153. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_pointer.h +57 -0
  1154. cuda/cccl/headers/include/cuda/std/__type_traits/is_move_assignable.h +43 -0
  1155. cuda/cccl/headers/include/cuda/std/__type_traits/is_move_constructible.h +42 -0
  1156. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_assignable.h +70 -0
  1157. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_constructible.h +72 -0
  1158. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_convertible.h +59 -0
  1159. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_copy_assignable.h +60 -0
  1160. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_copy_constructible.h +43 -0
  1161. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_default_constructible.h +54 -0
  1162. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_destructible.h +82 -0
  1163. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_move_assignable.h +60 -0
  1164. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_move_constructible.h +42 -0
  1165. cuda/cccl/headers/include/cuda/std/__type_traits/is_null_pointer.h +43 -0
  1166. cuda/cccl/headers/include/cuda/std/__type_traits/is_object.h +57 -0
  1167. cuda/cccl/headers/include/cuda/std/__type_traits/is_one_of.h +37 -0
  1168. cuda/cccl/headers/include/cuda/std/__type_traits/is_pod.h +42 -0
  1169. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer.h +60 -0
  1170. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer_interconvertible_base_of.h +84 -0
  1171. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer_interconvertible_with_class.h +42 -0
  1172. cuda/cccl/headers/include/cuda/std/__type_traits/is_polymorphic.h +42 -0
  1173. cuda/cccl/headers/include/cuda/std/__type_traits/is_primary_template.h +121 -0
  1174. cuda/cccl/headers/include/cuda/std/__type_traits/is_reference.h +95 -0
  1175. cuda/cccl/headers/include/cuda/std/__type_traits/is_referenceable.h +55 -0
  1176. cuda/cccl/headers/include/cuda/std/__type_traits/is_same.h +88 -0
  1177. cuda/cccl/headers/include/cuda/std/__type_traits/is_scalar.h +60 -0
  1178. cuda/cccl/headers/include/cuda/std/__type_traits/is_scoped_enum.h +49 -0
  1179. cuda/cccl/headers/include/cuda/std/__type_traits/is_signed.h +65 -0
  1180. cuda/cccl/headers/include/cuda/std/__type_traits/is_signed_integer.h +59 -0
  1181. cuda/cccl/headers/include/cuda/std/__type_traits/is_standard_layout.h +42 -0
  1182. cuda/cccl/headers/include/cuda/std/__type_traits/is_swappable.h +200 -0
  1183. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivial.h +42 -0
  1184. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_assignable.h +43 -0
  1185. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_constructible.h +43 -0
  1186. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_assignable.h +46 -0
  1187. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_constructible.h +45 -0
  1188. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copyable.h +42 -0
  1189. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_default_constructible.h +42 -0
  1190. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_destructible.h +58 -0
  1191. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_assignable.h +45 -0
  1192. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_constructible.h +44 -0
  1193. cuda/cccl/headers/include/cuda/std/__type_traits/is_unbounded_array.h +43 -0
  1194. cuda/cccl/headers/include/cuda/std/__type_traits/is_union.h +42 -0
  1195. cuda/cccl/headers/include/cuda/std/__type_traits/is_unsigned.h +66 -0
  1196. cuda/cccl/headers/include/cuda/std/__type_traits/is_unsigned_integer.h +59 -0
  1197. cuda/cccl/headers/include/cuda/std/__type_traits/is_valid_expansion.h +41 -0
  1198. cuda/cccl/headers/include/cuda/std/__type_traits/is_void.h +55 -0
  1199. cuda/cccl/headers/include/cuda/std/__type_traits/is_volatile.h +56 -0
  1200. cuda/cccl/headers/include/cuda/std/__type_traits/lazy.h +35 -0
  1201. cuda/cccl/headers/include/cuda/std/__type_traits/make_const_lvalue_ref.h +36 -0
  1202. cuda/cccl/headers/include/cuda/std/__type_traits/make_nbit_int.h +107 -0
  1203. cuda/cccl/headers/include/cuda/std/__type_traits/make_signed.h +140 -0
  1204. cuda/cccl/headers/include/cuda/std/__type_traits/make_unsigned.h +151 -0
  1205. cuda/cccl/headers/include/cuda/std/__type_traits/maybe_const.h +36 -0
  1206. cuda/cccl/headers/include/cuda/std/__type_traits/nat.h +39 -0
  1207. cuda/cccl/headers/include/cuda/std/__type_traits/negation.h +44 -0
  1208. cuda/cccl/headers/include/cuda/std/__type_traits/num_bits.h +122 -0
  1209. cuda/cccl/headers/include/cuda/std/__type_traits/promote.h +163 -0
  1210. cuda/cccl/headers/include/cuda/std/__type_traits/rank.h +60 -0
  1211. cuda/cccl/headers/include/cuda/std/__type_traits/reference_constructs_from_temporary.h +57 -0
  1212. cuda/cccl/headers/include/cuda/std/__type_traits/reference_converts_from_temporary.h +56 -0
  1213. cuda/cccl/headers/include/cuda/std/__type_traits/remove_all_extents.h +66 -0
  1214. cuda/cccl/headers/include/cuda/std/__type_traits/remove_const.h +59 -0
  1215. cuda/cccl/headers/include/cuda/std/__type_traits/remove_const_ref.h +37 -0
  1216. cuda/cccl/headers/include/cuda/std/__type_traits/remove_cv.h +57 -0
  1217. cuda/cccl/headers/include/cuda/std/__type_traits/remove_cvref.h +57 -0
  1218. cuda/cccl/headers/include/cuda/std/__type_traits/remove_extent.h +65 -0
  1219. cuda/cccl/headers/include/cuda/std/__type_traits/remove_pointer.h +73 -0
  1220. cuda/cccl/headers/include/cuda/std/__type_traits/remove_reference.h +72 -0
  1221. cuda/cccl/headers/include/cuda/std/__type_traits/remove_volatile.h +58 -0
  1222. cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +47 -0
  1223. cuda/cccl/headers/include/cuda/std/__type_traits/type_identity.h +40 -0
  1224. cuda/cccl/headers/include/cuda/std/__type_traits/type_list.h +1067 -0
  1225. cuda/cccl/headers/include/cuda/std/__type_traits/type_set.h +131 -0
  1226. cuda/cccl/headers/include/cuda/std/__type_traits/underlying_type.h +52 -0
  1227. cuda/cccl/headers/include/cuda/std/__type_traits/void_t.h +34 -0
  1228. cuda/cccl/headers/include/cuda/std/__utility/as_const.h +73 -0
  1229. cuda/cccl/headers/include/cuda/std/__utility/auto_cast.h +34 -0
  1230. cuda/cccl/headers/include/cuda/std/__utility/cmp.h +114 -0
  1231. cuda/cccl/headers/include/cuda/std/__utility/convert_to_integral.h +101 -0
  1232. cuda/cccl/headers/include/cuda/std/__utility/ctad_support.h +27 -0
  1233. cuda/cccl/headers/include/cuda/std/__utility/declval.h +76 -0
  1234. cuda/cccl/headers/include/cuda/std/__utility/delegate_constructors.h +51 -0
  1235. cuda/cccl/headers/include/cuda/std/__utility/exception_guard.h +162 -0
  1236. cuda/cccl/headers/include/cuda/std/__utility/exchange.h +46 -0
  1237. cuda/cccl/headers/include/cuda/std/__utility/forward.h +82 -0
  1238. cuda/cccl/headers/include/cuda/std/__utility/forward_like.h +82 -0
  1239. cuda/cccl/headers/include/cuda/std/__utility/in_place.h +86 -0
  1240. cuda/cccl/headers/include/cuda/std/__utility/integer_sequence.h +249 -0
  1241. cuda/cccl/headers/include/cuda/std/__utility/monostate.h +99 -0
  1242. cuda/cccl/headers/include/cuda/std/__utility/move.h +126 -0
  1243. cuda/cccl/headers/include/cuda/std/__utility/pair.h +791 -0
  1244. cuda/cccl/headers/include/cuda/std/__utility/piecewise_construct.h +37 -0
  1245. cuda/cccl/headers/include/cuda/std/__utility/pod_tuple.h +425 -0
  1246. cuda/cccl/headers/include/cuda/std/__utility/priority_tag.h +40 -0
  1247. cuda/cccl/headers/include/cuda/std/__utility/rel_ops.h +61 -0
  1248. cuda/cccl/headers/include/cuda/std/__utility/swap.h +64 -0
  1249. cuda/cccl/headers/include/cuda/std/__utility/to_underlying.h +40 -0
  1250. cuda/cccl/headers/include/cuda/std/__utility/typeid.h +421 -0
  1251. cuda/cccl/headers/include/cuda/std/__utility/undefined.h +34 -0
  1252. cuda/cccl/headers/include/cuda/std/__utility/unreachable.h +37 -0
  1253. cuda/cccl/headers/include/cuda/std/__variant/bad_variant_access.h +74 -0
  1254. cuda/cccl/headers/include/cuda/std/__variant/comparison.h +207 -0
  1255. cuda/cccl/headers/include/cuda/std/__variant/get.h +192 -0
  1256. cuda/cccl/headers/include/cuda/std/__variant/hash.h +82 -0
  1257. cuda/cccl/headers/include/cuda/std/__variant/sfinae_helpers.h +89 -0
  1258. cuda/cccl/headers/include/cuda/std/__variant/variant.h +250 -0
  1259. cuda/cccl/headers/include/cuda/std/__variant/variant_access.h +70 -0
  1260. cuda/cccl/headers/include/cuda/std/__variant/variant_base.h +683 -0
  1261. cuda/cccl/headers/include/cuda/std/__variant/variant_constraints.h +135 -0
  1262. cuda/cccl/headers/include/cuda/std/__variant/variant_match.h +126 -0
  1263. cuda/cccl/headers/include/cuda/std/__variant/variant_traits.h +184 -0
  1264. cuda/cccl/headers/include/cuda/std/__variant/variant_visit.h +225 -0
  1265. cuda/cccl/headers/include/cuda/std/__variant/visit.h +148 -0
  1266. cuda/cccl/headers/include/cuda/std/algorithm +138 -0
  1267. cuda/cccl/headers/include/cuda/std/array +519 -0
  1268. cuda/cccl/headers/include/cuda/std/atomic +810 -0
  1269. cuda/cccl/headers/include/cuda/std/barrier +42 -0
  1270. cuda/cccl/headers/include/cuda/std/bit +35 -0
  1271. cuda/cccl/headers/include/cuda/std/bitset +986 -0
  1272. cuda/cccl/headers/include/cuda/std/cassert +28 -0
  1273. cuda/cccl/headers/include/cuda/std/ccomplex +15 -0
  1274. cuda/cccl/headers/include/cuda/std/cfloat +59 -0
  1275. cuda/cccl/headers/include/cuda/std/charconv +31 -0
  1276. cuda/cccl/headers/include/cuda/std/chrono +26 -0
  1277. cuda/cccl/headers/include/cuda/std/climits +61 -0
  1278. cuda/cccl/headers/include/cuda/std/cmath +87 -0
  1279. cuda/cccl/headers/include/cuda/std/complex +50 -0
  1280. cuda/cccl/headers/include/cuda/std/concepts +48 -0
  1281. cuda/cccl/headers/include/cuda/std/cstddef +28 -0
  1282. cuda/cccl/headers/include/cuda/std/cstdint +178 -0
  1283. cuda/cccl/headers/include/cuda/std/cstdlib +31 -0
  1284. cuda/cccl/headers/include/cuda/std/cstring +110 -0
  1285. cuda/cccl/headers/include/cuda/std/ctime +155 -0
  1286. cuda/cccl/headers/include/cuda/std/detail/__config +22 -0
  1287. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/chrono +2506 -0
  1288. cuda/cccl/headers/include/cuda/std/execution +29 -0
  1289. cuda/cccl/headers/include/cuda/std/expected +30 -0
  1290. cuda/cccl/headers/include/cuda/std/functional +56 -0
  1291. cuda/cccl/headers/include/cuda/std/initializer_list +44 -0
  1292. cuda/cccl/headers/include/cuda/std/inplace_vector +2171 -0
  1293. cuda/cccl/headers/include/cuda/std/iterator +70 -0
  1294. cuda/cccl/headers/include/cuda/std/latch +34 -0
  1295. cuda/cccl/headers/include/cuda/std/limits +28 -0
  1296. cuda/cccl/headers/include/cuda/std/linalg +30 -0
  1297. cuda/cccl/headers/include/cuda/std/mdspan +38 -0
  1298. cuda/cccl/headers/include/cuda/std/memory +40 -0
  1299. cuda/cccl/headers/include/cuda/std/numbers +344 -0
  1300. cuda/cccl/headers/include/cuda/std/numeric +41 -0
  1301. cuda/cccl/headers/include/cuda/std/optional +31 -0
  1302. cuda/cccl/headers/include/cuda/std/ranges +70 -0
  1303. cuda/cccl/headers/include/cuda/std/ratio +416 -0
  1304. cuda/cccl/headers/include/cuda/std/semaphore +31 -0
  1305. cuda/cccl/headers/include/cuda/std/source_location +107 -0
  1306. cuda/cccl/headers/include/cuda/std/span +599 -0
  1307. cuda/cccl/headers/include/cuda/std/string_view +924 -0
  1308. cuda/cccl/headers/include/cuda/std/tuple +43 -0
  1309. cuda/cccl/headers/include/cuda/std/type_traits +176 -0
  1310. cuda/cccl/headers/include/cuda/std/utility +70 -0
  1311. cuda/cccl/headers/include/cuda/std/variant +32 -0
  1312. cuda/cccl/headers/include/cuda/std/version +240 -0
  1313. cuda/cccl/headers/include/cuda/stream +32 -0
  1314. cuda/cccl/headers/include/cuda/stream_ref +59 -0
  1315. cuda/cccl/headers/include/cuda/tma +25 -0
  1316. cuda/cccl/headers/include/cuda/type_traits +27 -0
  1317. cuda/cccl/headers/include/cuda/utility +28 -0
  1318. cuda/cccl/headers/include/cuda/version +16 -0
  1319. cuda/cccl/headers/include/cuda/warp +28 -0
  1320. cuda/cccl/headers/include/cuda/work_stealing +26 -0
  1321. cuda/cccl/headers/include/nv/detail/__preprocessor +169 -0
  1322. cuda/cccl/headers/include/nv/detail/__target_macros +739 -0
  1323. cuda/cccl/headers/include/nv/target +241 -0
  1324. cuda/cccl/headers/include/thrust/addressof.h +22 -0
  1325. cuda/cccl/headers/include/thrust/adjacent_difference.h +254 -0
  1326. cuda/cccl/headers/include/thrust/advance.h +60 -0
  1327. cuda/cccl/headers/include/thrust/allocate_unique.h +301 -0
  1328. cuda/cccl/headers/include/thrust/binary_search.h +1911 -0
  1329. cuda/cccl/headers/include/thrust/complex.h +859 -0
  1330. cuda/cccl/headers/include/thrust/copy.h +506 -0
  1331. cuda/cccl/headers/include/thrust/count.h +245 -0
  1332. cuda/cccl/headers/include/thrust/detail/adjacent_difference.inl +108 -0
  1333. cuda/cccl/headers/include/thrust/detail/alignment.h +81 -0
  1334. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +629 -0
  1335. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +193 -0
  1336. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +95 -0
  1337. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +79 -0
  1338. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +76 -0
  1339. cuda/cccl/headers/include/thrust/detail/allocator/no_throw_allocator.h +74 -0
  1340. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +113 -0
  1341. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +114 -0
  1342. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.h +77 -0
  1343. cuda/cccl/headers/include/thrust/detail/allocator_aware_execution_policy.h +96 -0
  1344. cuda/cccl/headers/include/thrust/detail/binary_search.inl +537 -0
  1345. cuda/cccl/headers/include/thrust/detail/caching_allocator.h +47 -0
  1346. cuda/cccl/headers/include/thrust/detail/complex/arithmetic.h +210 -0
  1347. cuda/cccl/headers/include/thrust/detail/complex/catrig.h +877 -0
  1348. cuda/cccl/headers/include/thrust/detail/complex/catrigf.h +591 -0
  1349. cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +234 -0
  1350. cuda/cccl/headers/include/thrust/detail/complex/ccoshf.h +162 -0
  1351. cuda/cccl/headers/include/thrust/detail/complex/cexp.h +194 -0
  1352. cuda/cccl/headers/include/thrust/detail/complex/cexpf.h +170 -0
  1353. cuda/cccl/headers/include/thrust/detail/complex/clog.h +222 -0
  1354. cuda/cccl/headers/include/thrust/detail/complex/clogf.h +209 -0
  1355. cuda/cccl/headers/include/thrust/detail/complex/complex.inl +263 -0
  1356. cuda/cccl/headers/include/thrust/detail/complex/cpow.h +53 -0
  1357. cuda/cccl/headers/include/thrust/detail/complex/cproj.h +75 -0
  1358. cuda/cccl/headers/include/thrust/detail/complex/csinh.h +228 -0
  1359. cuda/cccl/headers/include/thrust/detail/complex/csinhf.h +169 -0
  1360. cuda/cccl/headers/include/thrust/detail/complex/csqrt.h +178 -0
  1361. cuda/cccl/headers/include/thrust/detail/complex/csqrtf.h +174 -0
  1362. cuda/cccl/headers/include/thrust/detail/complex/ctanh.h +212 -0
  1363. cuda/cccl/headers/include/thrust/detail/complex/ctanhf.h +136 -0
  1364. cuda/cccl/headers/include/thrust/detail/complex/math_private.h +130 -0
  1365. cuda/cccl/headers/include/thrust/detail/complex/stream.h +73 -0
  1366. cuda/cccl/headers/include/thrust/detail/config/compiler.h +38 -0
  1367. cuda/cccl/headers/include/thrust/detail/config/config.h +43 -0
  1368. cuda/cccl/headers/include/thrust/detail/config/cpp_dialect.h +78 -0
  1369. cuda/cccl/headers/include/thrust/detail/config/device_system.h +57 -0
  1370. cuda/cccl/headers/include/thrust/detail/config/host_system.h +50 -0
  1371. cuda/cccl/headers/include/thrust/detail/config/memory_resource.h +41 -0
  1372. cuda/cccl/headers/include/thrust/detail/config/namespace.h +164 -0
  1373. cuda/cccl/headers/include/thrust/detail/config/simple_defines.h +48 -0
  1374. cuda/cccl/headers/include/thrust/detail/config.h +36 -0
  1375. cuda/cccl/headers/include/thrust/detail/contiguous_storage.h +227 -0
  1376. cuda/cccl/headers/include/thrust/detail/contiguous_storage.inl +272 -0
  1377. cuda/cccl/headers/include/thrust/detail/copy.h +70 -0
  1378. cuda/cccl/headers/include/thrust/detail/copy.inl +146 -0
  1379. cuda/cccl/headers/include/thrust/detail/copy_if.h +62 -0
  1380. cuda/cccl/headers/include/thrust/detail/copy_if.inl +114 -0
  1381. cuda/cccl/headers/include/thrust/detail/count.h +55 -0
  1382. cuda/cccl/headers/include/thrust/detail/count.inl +101 -0
  1383. cuda/cccl/headers/include/thrust/detail/device_ptr.inl +48 -0
  1384. cuda/cccl/headers/include/thrust/detail/equal.inl +105 -0
  1385. cuda/cccl/headers/include/thrust/detail/event_error.h +160 -0
  1386. cuda/cccl/headers/include/thrust/detail/execute_with_allocator.h +79 -0
  1387. cuda/cccl/headers/include/thrust/detail/execute_with_allocator_fwd.h +61 -0
  1388. cuda/cccl/headers/include/thrust/detail/execution_policy.h +120 -0
  1389. cuda/cccl/headers/include/thrust/detail/extrema.inl +196 -0
  1390. cuda/cccl/headers/include/thrust/detail/fill.inl +97 -0
  1391. cuda/cccl/headers/include/thrust/detail/find.inl +125 -0
  1392. cuda/cccl/headers/include/thrust/detail/for_each.inl +96 -0
  1393. cuda/cccl/headers/include/thrust/detail/function.h +49 -0
  1394. cuda/cccl/headers/include/thrust/detail/functional/actor.h +213 -0
  1395. cuda/cccl/headers/include/thrust/detail/functional/operators.h +384 -0
  1396. cuda/cccl/headers/include/thrust/detail/gather.inl +185 -0
  1397. cuda/cccl/headers/include/thrust/detail/generate.inl +97 -0
  1398. cuda/cccl/headers/include/thrust/detail/get_iterator_value.h +60 -0
  1399. cuda/cccl/headers/include/thrust/detail/inner_product.inl +130 -0
  1400. cuda/cccl/headers/include/thrust/detail/internal_functional.h +335 -0
  1401. cuda/cccl/headers/include/thrust/detail/logical.inl +125 -0
  1402. cuda/cccl/headers/include/thrust/detail/malloc_and_free.h +89 -0
  1403. cuda/cccl/headers/include/thrust/detail/malloc_and_free_fwd.h +45 -0
  1404. cuda/cccl/headers/include/thrust/detail/memory_algorithms.h +208 -0
  1405. cuda/cccl/headers/include/thrust/detail/merge.inl +288 -0
  1406. cuda/cccl/headers/include/thrust/detail/mismatch.inl +107 -0
  1407. cuda/cccl/headers/include/thrust/detail/nvtx_policy.h +41 -0
  1408. cuda/cccl/headers/include/thrust/detail/overlapped_copy.h +120 -0
  1409. cuda/cccl/headers/include/thrust/detail/partition.inl +390 -0
  1410. cuda/cccl/headers/include/thrust/detail/pointer.h +313 -0
  1411. cuda/cccl/headers/include/thrust/detail/preprocessor.h +652 -0
  1412. cuda/cccl/headers/include/thrust/detail/random_bijection.h +177 -0
  1413. cuda/cccl/headers/include/thrust/detail/range/head_flags.h +114 -0
  1414. cuda/cccl/headers/include/thrust/detail/range/tail_flags.h +128 -0
  1415. cuda/cccl/headers/include/thrust/detail/raw_pointer_cast.h +52 -0
  1416. cuda/cccl/headers/include/thrust/detail/raw_reference_cast.h +191 -0
  1417. cuda/cccl/headers/include/thrust/detail/reduce.inl +396 -0
  1418. cuda/cccl/headers/include/thrust/detail/reference.h +521 -0
  1419. cuda/cccl/headers/include/thrust/detail/reference_forward_declaration.h +35 -0
  1420. cuda/cccl/headers/include/thrust/detail/remove.inl +225 -0
  1421. cuda/cccl/headers/include/thrust/detail/replace.inl +243 -0
  1422. cuda/cccl/headers/include/thrust/detail/reverse.inl +100 -0
  1423. cuda/cccl/headers/include/thrust/detail/scan.inl +536 -0
  1424. cuda/cccl/headers/include/thrust/detail/scatter.inl +169 -0
  1425. cuda/cccl/headers/include/thrust/detail/seq.h +66 -0
  1426. cuda/cccl/headers/include/thrust/detail/sequence.inl +121 -0
  1427. cuda/cccl/headers/include/thrust/detail/set_operations.inl +993 -0
  1428. cuda/cccl/headers/include/thrust/detail/shuffle.inl +86 -0
  1429. cuda/cccl/headers/include/thrust/detail/sort.inl +385 -0
  1430. cuda/cccl/headers/include/thrust/detail/static_assert.h +56 -0
  1431. cuda/cccl/headers/include/thrust/detail/static_map.h +164 -0
  1432. cuda/cccl/headers/include/thrust/detail/swap_ranges.inl +77 -0
  1433. cuda/cccl/headers/include/thrust/detail/tabulate.inl +74 -0
  1434. cuda/cccl/headers/include/thrust/detail/temporary_array.h +150 -0
  1435. cuda/cccl/headers/include/thrust/detail/temporary_array.inl +121 -0
  1436. cuda/cccl/headers/include/thrust/detail/temporary_buffer.h +93 -0
  1437. cuda/cccl/headers/include/thrust/detail/transform_reduce.inl +81 -0
  1438. cuda/cccl/headers/include/thrust/detail/transform_scan.inl +173 -0
  1439. cuda/cccl/headers/include/thrust/detail/trivial_sequence.h +128 -0
  1440. cuda/cccl/headers/include/thrust/detail/tuple_meta_transform.h +59 -0
  1441. cuda/cccl/headers/include/thrust/detail/type_deduction.h +61 -0
  1442. cuda/cccl/headers/include/thrust/detail/type_traits/has_member_function.h +50 -0
  1443. cuda/cccl/headers/include/thrust/detail/type_traits/has_nested_type.h +43 -0
  1444. cuda/cccl/headers/include/thrust/detail/type_traits/is_call_possible.h +162 -0
  1445. cuda/cccl/headers/include/thrust/detail/type_traits/is_commutative.h +67 -0
  1446. cuda/cccl/headers/include/thrust/detail/type_traits/is_metafunction_defined.h +39 -0
  1447. cuda/cccl/headers/include/thrust/detail/type_traits/is_thrust_pointer.h +55 -0
  1448. cuda/cccl/headers/include/thrust/detail/type_traits/iterator/is_output_iterator.h +48 -0
  1449. cuda/cccl/headers/include/thrust/detail/type_traits/minimum_type.h +91 -0
  1450. cuda/cccl/headers/include/thrust/detail/type_traits/pointer_traits.h +332 -0
  1451. cuda/cccl/headers/include/thrust/detail/type_traits.h +143 -0
  1452. cuda/cccl/headers/include/thrust/detail/uninitialized_copy.inl +102 -0
  1453. cuda/cccl/headers/include/thrust/detail/uninitialized_fill.inl +97 -0
  1454. cuda/cccl/headers/include/thrust/detail/unique.inl +391 -0
  1455. cuda/cccl/headers/include/thrust/detail/use_default.h +34 -0
  1456. cuda/cccl/headers/include/thrust/detail/vector_base.h +613 -0
  1457. cuda/cccl/headers/include/thrust/detail/vector_base.inl +1216 -0
  1458. cuda/cccl/headers/include/thrust/device_allocator.h +134 -0
  1459. cuda/cccl/headers/include/thrust/device_delete.h +74 -0
  1460. cuda/cccl/headers/include/thrust/device_free.h +85 -0
  1461. cuda/cccl/headers/include/thrust/device_make_unique.h +56 -0
  1462. cuda/cccl/headers/include/thrust/device_malloc.h +84 -0
  1463. cuda/cccl/headers/include/thrust/device_malloc_allocator.h +190 -0
  1464. cuda/cccl/headers/include/thrust/device_new.h +112 -0
  1465. cuda/cccl/headers/include/thrust/device_new_allocator.h +179 -0
  1466. cuda/cccl/headers/include/thrust/device_ptr.h +196 -0
  1467. cuda/cccl/headers/include/thrust/device_reference.h +983 -0
  1468. cuda/cccl/headers/include/thrust/device_vector.h +576 -0
  1469. cuda/cccl/headers/include/thrust/distance.h +44 -0
  1470. cuda/cccl/headers/include/thrust/equal.h +247 -0
  1471. cuda/cccl/headers/include/thrust/execution_policy.h +252 -0
  1472. cuda/cccl/headers/include/thrust/extrema.h +657 -0
  1473. cuda/cccl/headers/include/thrust/fill.h +200 -0
  1474. cuda/cccl/headers/include/thrust/find.h +382 -0
  1475. cuda/cccl/headers/include/thrust/for_each.h +261 -0
  1476. cuda/cccl/headers/include/thrust/functional.h +399 -0
  1477. cuda/cccl/headers/include/thrust/gather.h +464 -0
  1478. cuda/cccl/headers/include/thrust/generate.h +193 -0
  1479. cuda/cccl/headers/include/thrust/host_vector.h +576 -0
  1480. cuda/cccl/headers/include/thrust/inner_product.h +264 -0
  1481. cuda/cccl/headers/include/thrust/iterator/constant_iterator.h +221 -0
  1482. cuda/cccl/headers/include/thrust/iterator/counting_iterator.h +338 -0
  1483. cuda/cccl/headers/include/thrust/iterator/detail/any_assign.h +48 -0
  1484. cuda/cccl/headers/include/thrust/iterator/detail/any_system_tag.h +43 -0
  1485. cuda/cccl/headers/include/thrust/iterator/detail/device_system_tag.h +43 -0
  1486. cuda/cccl/headers/include/thrust/iterator/detail/host_system_tag.h +42 -0
  1487. cuda/cccl/headers/include/thrust/iterator/detail/iterator_adaptor_base.h +83 -0
  1488. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_system.h +60 -0
  1489. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_traversal.h +65 -0
  1490. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h +56 -0
  1491. cuda/cccl/headers/include/thrust/iterator/detail/iterator_facade_category.h +184 -0
  1492. cuda/cccl/headers/include/thrust/iterator/detail/minimum_system.h +55 -0
  1493. cuda/cccl/headers/include/thrust/iterator/detail/normal_iterator.h +69 -0
  1494. cuda/cccl/headers/include/thrust/iterator/detail/retag.h +104 -0
  1495. cuda/cccl/headers/include/thrust/iterator/detail/tagged_iterator.h +80 -0
  1496. cuda/cccl/headers/include/thrust/iterator/detail/tuple_of_iterator_references.h +171 -0
  1497. cuda/cccl/headers/include/thrust/iterator/discard_iterator.h +163 -0
  1498. cuda/cccl/headers/include/thrust/iterator/iterator_adaptor.h +251 -0
  1499. cuda/cccl/headers/include/thrust/iterator/iterator_categories.h +211 -0
  1500. cuda/cccl/headers/include/thrust/iterator/iterator_facade.h +659 -0
  1501. cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +334 -0
  1502. cuda/cccl/headers/include/thrust/iterator/iterator_traversal_tags.h +64 -0
  1503. cuda/cccl/headers/include/thrust/iterator/offset_iterator.h +194 -0
  1504. cuda/cccl/headers/include/thrust/iterator/permutation_iterator.h +204 -0
  1505. cuda/cccl/headers/include/thrust/iterator/retag.h +72 -0
  1506. cuda/cccl/headers/include/thrust/iterator/reverse_iterator.h +51 -0
  1507. cuda/cccl/headers/include/thrust/iterator/shuffle_iterator.h +190 -0
  1508. cuda/cccl/headers/include/thrust/iterator/strided_iterator.h +152 -0
  1509. cuda/cccl/headers/include/thrust/iterator/tabulate_output_iterator.h +152 -0
  1510. cuda/cccl/headers/include/thrust/iterator/transform_input_output_iterator.h +225 -0
  1511. cuda/cccl/headers/include/thrust/iterator/transform_iterator.h +353 -0
  1512. cuda/cccl/headers/include/thrust/iterator/transform_output_iterator.h +190 -0
  1513. cuda/cccl/headers/include/thrust/iterator/zip_iterator.h +365 -0
  1514. cuda/cccl/headers/include/thrust/logical.h +290 -0
  1515. cuda/cccl/headers/include/thrust/memory.h +299 -0
  1516. cuda/cccl/headers/include/thrust/merge.h +726 -0
  1517. cuda/cccl/headers/include/thrust/mismatch.h +262 -0
  1518. cuda/cccl/headers/include/thrust/mr/allocator.h +227 -0
  1519. cuda/cccl/headers/include/thrust/mr/device_memory_resource.h +46 -0
  1520. cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +526 -0
  1521. cuda/cccl/headers/include/thrust/mr/disjoint_sync_pool.h +116 -0
  1522. cuda/cccl/headers/include/thrust/mr/disjoint_tls_pool.h +65 -0
  1523. cuda/cccl/headers/include/thrust/mr/fancy_pointer_resource.h +65 -0
  1524. cuda/cccl/headers/include/thrust/mr/host_memory_resource.h +42 -0
  1525. cuda/cccl/headers/include/thrust/mr/memory_resource.h +215 -0
  1526. cuda/cccl/headers/include/thrust/mr/new.h +98 -0
  1527. cuda/cccl/headers/include/thrust/mr/polymorphic_adaptor.h +61 -0
  1528. cuda/cccl/headers/include/thrust/mr/pool.h +526 -0
  1529. cuda/cccl/headers/include/thrust/mr/pool_options.h +172 -0
  1530. cuda/cccl/headers/include/thrust/mr/sync_pool.h +112 -0
  1531. cuda/cccl/headers/include/thrust/mr/tls_pool.h +62 -0
  1532. cuda/cccl/headers/include/thrust/mr/universal_memory_resource.h +29 -0
  1533. cuda/cccl/headers/include/thrust/mr/validator.h +54 -0
  1534. cuda/cccl/headers/include/thrust/pair.h +102 -0
  1535. cuda/cccl/headers/include/thrust/partition.h +1392 -0
  1536. cuda/cccl/headers/include/thrust/per_device_resource.h +110 -0
  1537. cuda/cccl/headers/include/thrust/random/detail/discard_block_engine.inl +182 -0
  1538. cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine.inl +153 -0
  1539. cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine_discard.h +97 -0
  1540. cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine.inl +149 -0
  1541. cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h +46 -0
  1542. cuda/cccl/headers/include/thrust/random/detail/mod.h +94 -0
  1543. cuda/cccl/headers/include/thrust/random/detail/normal_distribution.inl +185 -0
  1544. cuda/cccl/headers/include/thrust/random/detail/normal_distribution_base.h +157 -0
  1545. cuda/cccl/headers/include/thrust/random/detail/random_core_access.h +56 -0
  1546. cuda/cccl/headers/include/thrust/random/detail/subtract_with_carry_engine.inl +199 -0
  1547. cuda/cccl/headers/include/thrust/random/detail/uniform_int_distribution.inl +196 -0
  1548. cuda/cccl/headers/include/thrust/random/detail/uniform_real_distribution.inl +198 -0
  1549. cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine.inl +181 -0
  1550. cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine_max.h +186 -0
  1551. cuda/cccl/headers/include/thrust/random/discard_block_engine.h +238 -0
  1552. cuda/cccl/headers/include/thrust/random/linear_congruential_engine.h +287 -0
  1553. cuda/cccl/headers/include/thrust/random/linear_feedback_shift_engine.h +215 -0
  1554. cuda/cccl/headers/include/thrust/random/normal_distribution.h +256 -0
  1555. cuda/cccl/headers/include/thrust/random/subtract_with_carry_engine.h +245 -0
  1556. cuda/cccl/headers/include/thrust/random/uniform_int_distribution.h +259 -0
  1557. cuda/cccl/headers/include/thrust/random/uniform_real_distribution.h +257 -0
  1558. cuda/cccl/headers/include/thrust/random/xor_combine_engine.h +253 -0
  1559. cuda/cccl/headers/include/thrust/random.h +118 -0
  1560. cuda/cccl/headers/include/thrust/reduce.h +1114 -0
  1561. cuda/cccl/headers/include/thrust/remove.h +768 -0
  1562. cuda/cccl/headers/include/thrust/replace.h +826 -0
  1563. cuda/cccl/headers/include/thrust/reverse.h +215 -0
  1564. cuda/cccl/headers/include/thrust/scan.h +1671 -0
  1565. cuda/cccl/headers/include/thrust/scatter.h +446 -0
  1566. cuda/cccl/headers/include/thrust/sequence.h +277 -0
  1567. cuda/cccl/headers/include/thrust/set_operations.h +3027 -0
  1568. cuda/cccl/headers/include/thrust/shuffle.h +182 -0
  1569. cuda/cccl/headers/include/thrust/sort.h +1320 -0
  1570. cuda/cccl/headers/include/thrust/swap.h +147 -0
  1571. cuda/cccl/headers/include/thrust/system/cpp/detail/adjacent_difference.h +30 -0
  1572. cuda/cccl/headers/include/thrust/system/cpp/detail/assign_value.h +30 -0
  1573. cuda/cccl/headers/include/thrust/system/cpp/detail/binary_search.h +32 -0
  1574. cuda/cccl/headers/include/thrust/system/cpp/detail/copy.h +30 -0
  1575. cuda/cccl/headers/include/thrust/system/cpp/detail/copy_if.h +30 -0
  1576. cuda/cccl/headers/include/thrust/system/cpp/detail/count.h +29 -0
  1577. cuda/cccl/headers/include/thrust/system/cpp/detail/equal.h +29 -0
  1578. cuda/cccl/headers/include/thrust/system/cpp/detail/execution_policy.h +113 -0
  1579. cuda/cccl/headers/include/thrust/system/cpp/detail/extrema.h +30 -0
  1580. cuda/cccl/headers/include/thrust/system/cpp/detail/fill.h +29 -0
  1581. cuda/cccl/headers/include/thrust/system/cpp/detail/find.h +30 -0
  1582. cuda/cccl/headers/include/thrust/system/cpp/detail/for_each.h +30 -0
  1583. cuda/cccl/headers/include/thrust/system/cpp/detail/gather.h +29 -0
  1584. cuda/cccl/headers/include/thrust/system/cpp/detail/generate.h +29 -0
  1585. cuda/cccl/headers/include/thrust/system/cpp/detail/get_value.h +30 -0
  1586. cuda/cccl/headers/include/thrust/system/cpp/detail/inner_product.h +29 -0
  1587. cuda/cccl/headers/include/thrust/system/cpp/detail/iter_swap.h +30 -0
  1588. cuda/cccl/headers/include/thrust/system/cpp/detail/logical.h +29 -0
  1589. cuda/cccl/headers/include/thrust/system/cpp/detail/malloc_and_free.h +30 -0
  1590. cuda/cccl/headers/include/thrust/system/cpp/detail/memory.inl +55 -0
  1591. cuda/cccl/headers/include/thrust/system/cpp/detail/merge.h +30 -0
  1592. cuda/cccl/headers/include/thrust/system/cpp/detail/mismatch.h +29 -0
  1593. cuda/cccl/headers/include/thrust/system/cpp/detail/partition.h +30 -0
  1594. cuda/cccl/headers/include/thrust/system/cpp/detail/per_device_resource.h +29 -0
  1595. cuda/cccl/headers/include/thrust/system/cpp/detail/reduce.h +30 -0
  1596. cuda/cccl/headers/include/thrust/system/cpp/detail/reduce_by_key.h +30 -0
  1597. cuda/cccl/headers/include/thrust/system/cpp/detail/remove.h +30 -0
  1598. cuda/cccl/headers/include/thrust/system/cpp/detail/replace.h +29 -0
  1599. cuda/cccl/headers/include/thrust/system/cpp/detail/reverse.h +29 -0
  1600. cuda/cccl/headers/include/thrust/system/cpp/detail/scan.h +30 -0
  1601. cuda/cccl/headers/include/thrust/system/cpp/detail/scan_by_key.h +30 -0
  1602. cuda/cccl/headers/include/thrust/system/cpp/detail/scatter.h +29 -0
  1603. cuda/cccl/headers/include/thrust/system/cpp/detail/sequence.h +29 -0
  1604. cuda/cccl/headers/include/thrust/system/cpp/detail/set_operations.h +30 -0
  1605. cuda/cccl/headers/include/thrust/system/cpp/detail/sort.h +30 -0
  1606. cuda/cccl/headers/include/thrust/system/cpp/detail/swap_ranges.h +29 -0
  1607. cuda/cccl/headers/include/thrust/system/cpp/detail/tabulate.h +29 -0
  1608. cuda/cccl/headers/include/thrust/system/cpp/detail/temporary_buffer.h +29 -0
  1609. cuda/cccl/headers/include/thrust/system/cpp/detail/transform.h +29 -0
  1610. cuda/cccl/headers/include/thrust/system/cpp/detail/transform_reduce.h +29 -0
  1611. cuda/cccl/headers/include/thrust/system/cpp/detail/transform_scan.h +29 -0
  1612. cuda/cccl/headers/include/thrust/system/cpp/detail/uninitialized_copy.h +29 -0
  1613. cuda/cccl/headers/include/thrust/system/cpp/detail/uninitialized_fill.h +29 -0
  1614. cuda/cccl/headers/include/thrust/system/cpp/detail/unique.h +30 -0
  1615. cuda/cccl/headers/include/thrust/system/cpp/detail/unique_by_key.h +30 -0
  1616. cuda/cccl/headers/include/thrust/system/cpp/execution_policy.h +63 -0
  1617. cuda/cccl/headers/include/thrust/system/cpp/memory.h +105 -0
  1618. cuda/cccl/headers/include/thrust/system/cpp/memory_resource.h +70 -0
  1619. cuda/cccl/headers/include/thrust/system/cpp/pointer.h +118 -0
  1620. cuda/cccl/headers/include/thrust/system/cpp/vector.h +95 -0
  1621. cuda/cccl/headers/include/thrust/system/cuda/config.h +126 -0
  1622. cuda/cccl/headers/include/thrust/system/cuda/detail/adjacent_difference.h +218 -0
  1623. cuda/cccl/headers/include/thrust/system/cuda/detail/assign_value.h +124 -0
  1624. cuda/cccl/headers/include/thrust/system/cuda/detail/binary_search.h +29 -0
  1625. cuda/cccl/headers/include/thrust/system/cuda/detail/cdp_dispatch.h +72 -0
  1626. cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +272 -0
  1627. cuda/cccl/headers/include/thrust/system/cuda/detail/copy_if.h +251 -0
  1628. cuda/cccl/headers/include/thrust/system/cuda/detail/core/agent_launcher.h +280 -0
  1629. cuda/cccl/headers/include/thrust/system/cuda/detail/core/triple_chevron_launch.h +162 -0
  1630. cuda/cccl/headers/include/thrust/system/cuda/detail/core/util.h +578 -0
  1631. cuda/cccl/headers/include/thrust/system/cuda/detail/count.h +75 -0
  1632. cuda/cccl/headers/include/thrust/system/cuda/detail/cross_system.h +241 -0
  1633. cuda/cccl/headers/include/thrust/system/cuda/detail/dispatch.h +230 -0
  1634. cuda/cccl/headers/include/thrust/system/cuda/detail/equal.h +62 -0
  1635. cuda/cccl/headers/include/thrust/system/cuda/detail/error.inl +87 -0
  1636. cuda/cccl/headers/include/thrust/system/cuda/detail/execution_policy.h +266 -0
  1637. cuda/cccl/headers/include/thrust/system/cuda/detail/extrema.h +473 -0
  1638. cuda/cccl/headers/include/thrust/system/cuda/detail/fill.h +99 -0
  1639. cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +165 -0
  1640. cuda/cccl/headers/include/thrust/system/cuda/detail/for_each.h +83 -0
  1641. cuda/cccl/headers/include/thrust/system/cuda/detail/gather.h +89 -0
  1642. cuda/cccl/headers/include/thrust/system/cuda/detail/generate.h +59 -0
  1643. cuda/cccl/headers/include/thrust/system/cuda/detail/get_value.h +65 -0
  1644. cuda/cccl/headers/include/thrust/system/cuda/detail/inner_product.h +77 -0
  1645. cuda/cccl/headers/include/thrust/system/cuda/detail/iter_swap.h +79 -0
  1646. cuda/cccl/headers/include/thrust/system/cuda/detail/logical.h +29 -0
  1647. cuda/cccl/headers/include/thrust/system/cuda/detail/make_unsigned_special.h +55 -0
  1648. cuda/cccl/headers/include/thrust/system/cuda/detail/malloc_and_free.h +119 -0
  1649. cuda/cccl/headers/include/thrust/system/cuda/detail/memory.inl +55 -0
  1650. cuda/cccl/headers/include/thrust/system/cuda/detail/merge.h +205 -0
  1651. cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +88 -0
  1652. cuda/cccl/headers/include/thrust/system/cuda/detail/parallel_for.h +79 -0
  1653. cuda/cccl/headers/include/thrust/system/cuda/detail/partition.h +401 -0
  1654. cuda/cccl/headers/include/thrust/system/cuda/detail/per_device_resource.h +70 -0
  1655. cuda/cccl/headers/include/thrust/system/cuda/detail/reduce.h +774 -0
  1656. cuda/cccl/headers/include/thrust/system/cuda/detail/reduce_by_key.h +994 -0
  1657. cuda/cccl/headers/include/thrust/system/cuda/detail/remove.h +105 -0
  1658. cuda/cccl/headers/include/thrust/system/cuda/detail/replace.h +121 -0
  1659. cuda/cccl/headers/include/thrust/system/cuda/detail/reverse.h +83 -0
  1660. cuda/cccl/headers/include/thrust/system/cuda/detail/scan.h +340 -0
  1661. cuda/cccl/headers/include/thrust/system/cuda/detail/scan_by_key.h +412 -0
  1662. cuda/cccl/headers/include/thrust/system/cuda/detail/scatter.h +90 -0
  1663. cuda/cccl/headers/include/thrust/system/cuda/detail/sequence.h +29 -0
  1664. cuda/cccl/headers/include/thrust/system/cuda/detail/set_operations.h +1722 -0
  1665. cuda/cccl/headers/include/thrust/system/cuda/detail/sort.h +473 -0
  1666. cuda/cccl/headers/include/thrust/system/cuda/detail/swap_ranges.h +99 -0
  1667. cuda/cccl/headers/include/thrust/system/cuda/detail/tabulate.h +62 -0
  1668. cuda/cccl/headers/include/thrust/system/cuda/detail/temporary_buffer.h +132 -0
  1669. cuda/cccl/headers/include/thrust/system/cuda/detail/terminate.h +53 -0
  1670. cuda/cccl/headers/include/thrust/system/cuda/detail/transform.h +428 -0
  1671. cuda/cccl/headers/include/thrust/system/cuda/detail/transform_reduce.h +139 -0
  1672. cuda/cccl/headers/include/thrust/system/cuda/detail/transform_scan.h +117 -0
  1673. cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_copy.h +114 -0
  1674. cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_fill.h +102 -0
  1675. cuda/cccl/headers/include/thrust/system/cuda/detail/unique.h +288 -0
  1676. cuda/cccl/headers/include/thrust/system/cuda/detail/unique_by_key.h +307 -0
  1677. cuda/cccl/headers/include/thrust/system/cuda/detail/util.h +253 -0
  1678. cuda/cccl/headers/include/thrust/system/cuda/error.h +159 -0
  1679. cuda/cccl/headers/include/thrust/system/cuda/execution_policy.h +15 -0
  1680. cuda/cccl/headers/include/thrust/system/cuda/memory.h +118 -0
  1681. cuda/cccl/headers/include/thrust/system/cuda/memory_resource.h +115 -0
  1682. cuda/cccl/headers/include/thrust/system/cuda/pointer.h +155 -0
  1683. cuda/cccl/headers/include/thrust/system/cuda/vector.h +104 -0
  1684. cuda/cccl/headers/include/thrust/system/detail/bad_alloc.h +59 -0
  1685. cuda/cccl/headers/include/thrust/system/detail/errno.h +118 -0
  1686. cuda/cccl/headers/include/thrust/system/detail/error_category.inl +298 -0
  1687. cuda/cccl/headers/include/thrust/system/detail/error_code.inl +171 -0
  1688. cuda/cccl/headers/include/thrust/system/detail/error_condition.inl +119 -0
  1689. cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.h +51 -0
  1690. cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.inl +77 -0
  1691. cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.h +159 -0
  1692. cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.inl +370 -0
  1693. cuda/cccl/headers/include/thrust/system/detail/generic/copy.h +43 -0
  1694. cuda/cccl/headers/include/thrust/system/detail/generic/copy.inl +62 -0
  1695. cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.h +56 -0
  1696. cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.inl +145 -0
  1697. cuda/cccl/headers/include/thrust/system/detail/generic/count.h +46 -0
  1698. cuda/cccl/headers/include/thrust/system/detail/generic/count.inl +82 -0
  1699. cuda/cccl/headers/include/thrust/system/detail/generic/equal.h +47 -0
  1700. cuda/cccl/headers/include/thrust/system/detail/generic/equal.inl +58 -0
  1701. cuda/cccl/headers/include/thrust/system/detail/generic/extrema.h +65 -0
  1702. cuda/cccl/headers/include/thrust/system/detail/generic/extrema.inl +246 -0
  1703. cuda/cccl/headers/include/thrust/system/detail/generic/fill.h +52 -0
  1704. cuda/cccl/headers/include/thrust/system/detail/generic/find.h +47 -0
  1705. cuda/cccl/headers/include/thrust/system/detail/generic/find.inl +135 -0
  1706. cuda/cccl/headers/include/thrust/system/detail/generic/for_each.h +56 -0
  1707. cuda/cccl/headers/include/thrust/system/detail/generic/gather.h +71 -0
  1708. cuda/cccl/headers/include/thrust/system/detail/generic/gather.inl +94 -0
  1709. cuda/cccl/headers/include/thrust/system/detail/generic/generate.h +43 -0
  1710. cuda/cccl/headers/include/thrust/system/detail/generic/generate.inl +64 -0
  1711. cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.h +58 -0
  1712. cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.inl +70 -0
  1713. cuda/cccl/headers/include/thrust/system/detail/generic/logical.h +57 -0
  1714. cuda/cccl/headers/include/thrust/system/detail/generic/memory.h +61 -0
  1715. cuda/cccl/headers/include/thrust/system/detail/generic/memory.inl +83 -0
  1716. cuda/cccl/headers/include/thrust/system/detail/generic/merge.h +97 -0
  1717. cuda/cccl/headers/include/thrust/system/detail/generic/merge.inl +146 -0
  1718. cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.h +47 -0
  1719. cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.inl +67 -0
  1720. cuda/cccl/headers/include/thrust/system/detail/generic/partition.h +127 -0
  1721. cuda/cccl/headers/include/thrust/system/detail/generic/partition.inl +208 -0
  1722. cuda/cccl/headers/include/thrust/system/detail/generic/per_device_resource.h +41 -0
  1723. cuda/cccl/headers/include/thrust/system/detail/generic/reduce.h +69 -0
  1724. cuda/cccl/headers/include/thrust/system/detail/generic/reduce.inl +98 -0
  1725. cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.h +81 -0
  1726. cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.inl +183 -0
  1727. cuda/cccl/headers/include/thrust/system/detail/generic/remove.h +84 -0
  1728. cuda/cccl/headers/include/thrust/system/detail/generic/remove.inl +119 -0
  1729. cuda/cccl/headers/include/thrust/system/detail/generic/replace.h +93 -0
  1730. cuda/cccl/headers/include/thrust/system/detail/generic/replace.inl +172 -0
  1731. cuda/cccl/headers/include/thrust/system/detail/generic/reverse.h +46 -0
  1732. cuda/cccl/headers/include/thrust/system/detail/generic/reverse.inl +65 -0
  1733. cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.h +61 -0
  1734. cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.inl +125 -0
  1735. cuda/cccl/headers/include/thrust/system/detail/generic/scan.h +86 -0
  1736. cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.h +124 -0
  1737. cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.inl +229 -0
  1738. cuda/cccl/headers/include/thrust/system/detail/generic/scatter.h +71 -0
  1739. cuda/cccl/headers/include/thrust/system/detail/generic/scatter.inl +83 -0
  1740. cuda/cccl/headers/include/thrust/system/detail/generic/select_system.h +105 -0
  1741. cuda/cccl/headers/include/thrust/system/detail/generic/sequence.h +70 -0
  1742. cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.h +281 -0
  1743. cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.inl +474 -0
  1744. cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.h +52 -0
  1745. cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.inl +123 -0
  1746. cuda/cccl/headers/include/thrust/system/detail/generic/sort.h +111 -0
  1747. cuda/cccl/headers/include/thrust/system/detail/generic/sort.inl +176 -0
  1748. cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.h +42 -0
  1749. cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.inl +73 -0
  1750. cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.h +39 -0
  1751. cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.inl +54 -0
  1752. cuda/cccl/headers/include/thrust/system/detail/generic/tag.h +45 -0
  1753. cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.h +53 -0
  1754. cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.inl +81 -0
  1755. cuda/cccl/headers/include/thrust/system/detail/generic/transform.h +393 -0
  1756. cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.h +48 -0
  1757. cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.inl +54 -0
  1758. cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.h +78 -0
  1759. cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.inl +111 -0
  1760. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.h +43 -0
  1761. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.inl +164 -0
  1762. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.h +43 -0
  1763. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.inl +112 -0
  1764. cuda/cccl/headers/include/thrust/system/detail/generic/unique.h +69 -0
  1765. cuda/cccl/headers/include/thrust/system/detail/generic/unique.inl +112 -0
  1766. cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.h +80 -0
  1767. cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.inl +124 -0
  1768. cuda/cccl/headers/include/thrust/system/detail/internal/decompose.h +115 -0
  1769. cuda/cccl/headers/include/thrust/system/detail/sequential/adjacent_difference.h +68 -0
  1770. cuda/cccl/headers/include/thrust/system/detail/sequential/assign_value.h +40 -0
  1771. cuda/cccl/headers/include/thrust/system/detail/sequential/binary_search.h +136 -0
  1772. cuda/cccl/headers/include/thrust/system/detail/sequential/copy.h +120 -0
  1773. cuda/cccl/headers/include/thrust/system/detail/sequential/copy_backward.h +47 -0
  1774. cuda/cccl/headers/include/thrust/system/detail/sequential/copy_if.h +69 -0
  1775. cuda/cccl/headers/include/thrust/system/detail/sequential/count.h +29 -0
  1776. cuda/cccl/headers/include/thrust/system/detail/sequential/equal.h +29 -0
  1777. cuda/cccl/headers/include/thrust/system/detail/sequential/execution_policy.h +52 -0
  1778. cuda/cccl/headers/include/thrust/system/detail/sequential/extrema.h +109 -0
  1779. cuda/cccl/headers/include/thrust/system/detail/sequential/fill.h +29 -0
  1780. cuda/cccl/headers/include/thrust/system/detail/sequential/find.h +60 -0
  1781. cuda/cccl/headers/include/thrust/system/detail/sequential/for_each.h +72 -0
  1782. cuda/cccl/headers/include/thrust/system/detail/sequential/gather.h +29 -0
  1783. cuda/cccl/headers/include/thrust/system/detail/sequential/general_copy.h +120 -0
  1784. cuda/cccl/headers/include/thrust/system/detail/sequential/generate.h +29 -0
  1785. cuda/cccl/headers/include/thrust/system/detail/sequential/get_value.h +41 -0
  1786. cuda/cccl/headers/include/thrust/system/detail/sequential/inner_product.h +29 -0
  1787. cuda/cccl/headers/include/thrust/system/detail/sequential/insertion_sort.h +139 -0
  1788. cuda/cccl/headers/include/thrust/system/detail/sequential/iter_swap.h +43 -0
  1789. cuda/cccl/headers/include/thrust/system/detail/sequential/logical.h +29 -0
  1790. cuda/cccl/headers/include/thrust/system/detail/sequential/malloc_and_free.h +48 -0
  1791. cuda/cccl/headers/include/thrust/system/detail/sequential/merge.h +147 -0
  1792. cuda/cccl/headers/include/thrust/system/detail/sequential/mismatch.h +29 -0
  1793. cuda/cccl/headers/include/thrust/system/detail/sequential/partition.h +298 -0
  1794. cuda/cccl/headers/include/thrust/system/detail/sequential/per_device_resource.h +29 -0
  1795. cuda/cccl/headers/include/thrust/system/detail/sequential/reduce.h +62 -0
  1796. cuda/cccl/headers/include/thrust/system/detail/sequential/reduce_by_key.h +97 -0
  1797. cuda/cccl/headers/include/thrust/system/detail/sequential/remove.h +177 -0
  1798. cuda/cccl/headers/include/thrust/system/detail/sequential/replace.h +29 -0
  1799. cuda/cccl/headers/include/thrust/system/detail/sequential/reverse.h +29 -0
  1800. cuda/cccl/headers/include/thrust/system/detail/sequential/scan.h +152 -0
  1801. cuda/cccl/headers/include/thrust/system/detail/sequential/scan_by_key.h +143 -0
  1802. cuda/cccl/headers/include/thrust/system/detail/sequential/scatter.h +29 -0
  1803. cuda/cccl/headers/include/thrust/system/detail/sequential/sequence.h +29 -0
  1804. cuda/cccl/headers/include/thrust/system/detail/sequential/set_operations.h +204 -0
  1805. cuda/cccl/headers/include/thrust/system/detail/sequential/sort.h +120 -0
  1806. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.h +353 -0
  1807. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.h +121 -0
  1808. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.h +584 -0
  1809. cuda/cccl/headers/include/thrust/system/detail/sequential/swap_ranges.h +29 -0
  1810. cuda/cccl/headers/include/thrust/system/detail/sequential/tabulate.h +29 -0
  1811. cuda/cccl/headers/include/thrust/system/detail/sequential/temporary_buffer.h +29 -0
  1812. cuda/cccl/headers/include/thrust/system/detail/sequential/transform.h +29 -0
  1813. cuda/cccl/headers/include/thrust/system/detail/sequential/transform_reduce.h +29 -0
  1814. cuda/cccl/headers/include/thrust/system/detail/sequential/transform_scan.h +29 -0
  1815. cuda/cccl/headers/include/thrust/system/detail/sequential/trivial_copy.h +56 -0
  1816. cuda/cccl/headers/include/thrust/system/detail/sequential/uninitialized_copy.h +29 -0
  1817. cuda/cccl/headers/include/thrust/system/detail/sequential/uninitialized_fill.h +29 -0
  1818. cuda/cccl/headers/include/thrust/system/detail/sequential/unique.h +112 -0
  1819. cuda/cccl/headers/include/thrust/system/detail/sequential/unique_by_key.h +105 -0
  1820. cuda/cccl/headers/include/thrust/system/detail/system_error.inl +106 -0
  1821. cuda/cccl/headers/include/thrust/system/error_code.h +508 -0
  1822. cuda/cccl/headers/include/thrust/system/omp/detail/adjacent_difference.h +34 -0
  1823. cuda/cccl/headers/include/thrust/system/omp/detail/assign_value.h +17 -0
  1824. cuda/cccl/headers/include/thrust/system/omp/detail/binary_search.h +57 -0
  1825. cuda/cccl/headers/include/thrust/system/omp/detail/copy.h +61 -0
  1826. cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.h +39 -0
  1827. cuda/cccl/headers/include/thrust/system/omp/detail/count.h +17 -0
  1828. cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.h +48 -0
  1829. cuda/cccl/headers/include/thrust/system/omp/detail/equal.h +17 -0
  1830. cuda/cccl/headers/include/thrust/system/omp/detail/execution_policy.h +132 -0
  1831. cuda/cccl/headers/include/thrust/system/omp/detail/extrema.h +46 -0
  1832. cuda/cccl/headers/include/thrust/system/omp/detail/fill.h +17 -0
  1833. cuda/cccl/headers/include/thrust/system/omp/detail/find.h +33 -0
  1834. cuda/cccl/headers/include/thrust/system/omp/detail/for_each.h +74 -0
  1835. cuda/cccl/headers/include/thrust/system/omp/detail/gather.h +17 -0
  1836. cuda/cccl/headers/include/thrust/system/omp/detail/generate.h +17 -0
  1837. cuda/cccl/headers/include/thrust/system/omp/detail/get_value.h +17 -0
  1838. cuda/cccl/headers/include/thrust/system/omp/detail/inner_product.h +17 -0
  1839. cuda/cccl/headers/include/thrust/system/omp/detail/iter_swap.h +17 -0
  1840. cuda/cccl/headers/include/thrust/system/omp/detail/logical.h +17 -0
  1841. cuda/cccl/headers/include/thrust/system/omp/detail/malloc_and_free.h +17 -0
  1842. cuda/cccl/headers/include/thrust/system/omp/detail/merge.h +17 -0
  1843. cuda/cccl/headers/include/thrust/system/omp/detail/mismatch.h +17 -0
  1844. cuda/cccl/headers/include/thrust/system/omp/detail/partition.h +84 -0
  1845. cuda/cccl/headers/include/thrust/system/omp/detail/per_device_resource.h +16 -0
  1846. cuda/cccl/headers/include/thrust/system/omp/detail/pragma_omp.h +30 -0
  1847. cuda/cccl/headers/include/thrust/system/omp/detail/reduce.h +63 -0
  1848. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.h +48 -0
  1849. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.h +87 -0
  1850. cuda/cccl/headers/include/thrust/system/omp/detail/remove.h +67 -0
  1851. cuda/cccl/headers/include/thrust/system/omp/detail/replace.h +17 -0
  1852. cuda/cccl/headers/include/thrust/system/omp/detail/reverse.h +17 -0
  1853. cuda/cccl/headers/include/thrust/system/omp/detail/scan.h +216 -0
  1854. cuda/cccl/headers/include/thrust/system/omp/detail/scan_by_key.h +23 -0
  1855. cuda/cccl/headers/include/thrust/system/omp/detail/scatter.h +17 -0
  1856. cuda/cccl/headers/include/thrust/system/omp/detail/sequence.h +17 -0
  1857. cuda/cccl/headers/include/thrust/system/omp/detail/set_operations.h +17 -0
  1858. cuda/cccl/headers/include/thrust/system/omp/detail/sort.h +245 -0
  1859. cuda/cccl/headers/include/thrust/system/omp/detail/swap_ranges.h +17 -0
  1860. cuda/cccl/headers/include/thrust/system/omp/detail/tabulate.h +17 -0
  1861. cuda/cccl/headers/include/thrust/system/omp/detail/temporary_buffer.h +16 -0
  1862. cuda/cccl/headers/include/thrust/system/omp/detail/transform.h +17 -0
  1863. cuda/cccl/headers/include/thrust/system/omp/detail/transform_reduce.h +17 -0
  1864. cuda/cccl/headers/include/thrust/system/omp/detail/transform_scan.h +17 -0
  1865. cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_copy.h +17 -0
  1866. cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_fill.h +17 -0
  1867. cuda/cccl/headers/include/thrust/system/omp/detail/unique.h +52 -0
  1868. cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.h +56 -0
  1869. cuda/cccl/headers/include/thrust/system/omp/execution_policy.h +62 -0
  1870. cuda/cccl/headers/include/thrust/system/omp/memory.h +153 -0
  1871. cuda/cccl/headers/include/thrust/system/omp/memory_resource.h +71 -0
  1872. cuda/cccl/headers/include/thrust/system/omp/pointer.h +120 -0
  1873. cuda/cccl/headers/include/thrust/system/omp/vector.h +96 -0
  1874. cuda/cccl/headers/include/thrust/system/system_error.h +183 -0
  1875. cuda/cccl/headers/include/thrust/system/tbb/detail/adjacent_difference.h +33 -0
  1876. cuda/cccl/headers/include/thrust/system/tbb/detail/assign_value.h +17 -0
  1877. cuda/cccl/headers/include/thrust/system/tbb/detail/binary_search.h +17 -0
  1878. cuda/cccl/headers/include/thrust/system/tbb/detail/copy.h +59 -0
  1879. cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.h +117 -0
  1880. cuda/cccl/headers/include/thrust/system/tbb/detail/count.h +17 -0
  1881. cuda/cccl/headers/include/thrust/system/tbb/detail/equal.h +17 -0
  1882. cuda/cccl/headers/include/thrust/system/tbb/detail/execution_policy.h +113 -0
  1883. cuda/cccl/headers/include/thrust/system/tbb/detail/extrema.h +45 -0
  1884. cuda/cccl/headers/include/thrust/system/tbb/detail/fill.h +17 -0
  1885. cuda/cccl/headers/include/thrust/system/tbb/detail/find.h +28 -0
  1886. cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.h +72 -0
  1887. cuda/cccl/headers/include/thrust/system/tbb/detail/gather.h +17 -0
  1888. cuda/cccl/headers/include/thrust/system/tbb/detail/generate.h +17 -0
  1889. cuda/cccl/headers/include/thrust/system/tbb/detail/get_value.h +17 -0
  1890. cuda/cccl/headers/include/thrust/system/tbb/detail/inner_product.h +17 -0
  1891. cuda/cccl/headers/include/thrust/system/tbb/detail/iter_swap.h +17 -0
  1892. cuda/cccl/headers/include/thrust/system/tbb/detail/logical.h +17 -0
  1893. cuda/cccl/headers/include/thrust/system/tbb/detail/malloc_and_free.h +17 -0
  1894. cuda/cccl/headers/include/thrust/system/tbb/detail/merge.h +302 -0
  1895. cuda/cccl/headers/include/thrust/system/tbb/detail/mismatch.h +17 -0
  1896. cuda/cccl/headers/include/thrust/system/tbb/detail/partition.h +79 -0
  1897. cuda/cccl/headers/include/thrust/system/tbb/detail/per_device_resource.h +16 -0
  1898. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.h +121 -0
  1899. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.h +378 -0
  1900. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_intervals.h +118 -0
  1901. cuda/cccl/headers/include/thrust/system/tbb/detail/remove.h +66 -0
  1902. cuda/cccl/headers/include/thrust/system/tbb/detail/replace.h +17 -0
  1903. cuda/cccl/headers/include/thrust/system/tbb/detail/reverse.h +17 -0
  1904. cuda/cccl/headers/include/thrust/system/tbb/detail/scan.h +294 -0
  1905. cuda/cccl/headers/include/thrust/system/tbb/detail/scan_by_key.h +20 -0
  1906. cuda/cccl/headers/include/thrust/system/tbb/detail/scatter.h +17 -0
  1907. cuda/cccl/headers/include/thrust/system/tbb/detail/sequence.h +17 -0
  1908. cuda/cccl/headers/include/thrust/system/tbb/detail/set_operations.h +17 -0
  1909. cuda/cccl/headers/include/thrust/system/tbb/detail/sort.h +273 -0
  1910. cuda/cccl/headers/include/thrust/system/tbb/detail/swap_ranges.h +17 -0
  1911. cuda/cccl/headers/include/thrust/system/tbb/detail/tabulate.h +17 -0
  1912. cuda/cccl/headers/include/thrust/system/tbb/detail/temporary_buffer.h +16 -0
  1913. cuda/cccl/headers/include/thrust/system/tbb/detail/transform.h +17 -0
  1914. cuda/cccl/headers/include/thrust/system/tbb/detail/transform_reduce.h +17 -0
  1915. cuda/cccl/headers/include/thrust/system/tbb/detail/transform_scan.h +17 -0
  1916. cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_copy.h +17 -0
  1917. cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_fill.h +17 -0
  1918. cuda/cccl/headers/include/thrust/system/tbb/detail/unique.h +49 -0
  1919. cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.h +55 -0
  1920. cuda/cccl/headers/include/thrust/system/tbb/execution_policy.h +62 -0
  1921. cuda/cccl/headers/include/thrust/system/tbb/memory.h +139 -0
  1922. cuda/cccl/headers/include/thrust/system/tbb/memory_resource.h +57 -0
  1923. cuda/cccl/headers/include/thrust/system/tbb/pointer.h +106 -0
  1924. cuda/cccl/headers/include/thrust/system/tbb/vector.h +82 -0
  1925. cuda/cccl/headers/include/thrust/system_error.h +57 -0
  1926. cuda/cccl/headers/include/thrust/tabulate.h +125 -0
  1927. cuda/cccl/headers/include/thrust/transform.h +1056 -0
  1928. cuda/cccl/headers/include/thrust/transform_reduce.h +190 -0
  1929. cuda/cccl/headers/include/thrust/transform_scan.h +442 -0
  1930. cuda/cccl/headers/include/thrust/tuple.h +139 -0
  1931. cuda/cccl/headers/include/thrust/type_traits/integer_sequence.h +256 -0
  1932. cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +156 -0
  1933. cuda/cccl/headers/include/thrust/type_traits/is_execution_policy.h +65 -0
  1934. cuda/cccl/headers/include/thrust/type_traits/is_operator_less_or_greater_function_object.h +180 -0
  1935. cuda/cccl/headers/include/thrust/type_traits/is_operator_plus_function_object.h +112 -0
  1936. cuda/cccl/headers/include/thrust/type_traits/is_trivially_relocatable.h +333 -0
  1937. cuda/cccl/headers/include/thrust/type_traits/logical_metafunctions.h +42 -0
  1938. cuda/cccl/headers/include/thrust/type_traits/unwrap_contiguous_iterator.h +63 -0
  1939. cuda/cccl/headers/include/thrust/uninitialized_copy.h +300 -0
  1940. cuda/cccl/headers/include/thrust/uninitialized_fill.h +268 -0
  1941. cuda/cccl/headers/include/thrust/unique.h +1089 -0
  1942. cuda/cccl/headers/include/thrust/universal_allocator.h +101 -0
  1943. cuda/cccl/headers/include/thrust/universal_ptr.h +34 -0
  1944. cuda/cccl/headers/include/thrust/universal_vector.h +80 -0
  1945. cuda/cccl/headers/include/thrust/version.h +93 -0
  1946. cuda/cccl/headers/include/thrust/zip_function.h +149 -0
  1947. cuda/cccl/headers/include_paths.py +51 -0
  1948. cuda/cccl/headers/lib/cmake/cccl/cccl-config-version.cmake +25 -0
  1949. cuda/cccl/headers/lib/cmake/cccl/cccl-config.cmake +143 -0
  1950. cuda/cccl/headers/lib/cmake/cub/cub-config-version.cmake +29 -0
  1951. cuda/cccl/headers/lib/cmake/cub/cub-config.cmake +172 -0
  1952. cuda/cccl/headers/lib/cmake/cub/cub-header-search.cmake +15 -0
  1953. cuda/cccl/headers/lib/cmake/libcudacxx/libcudacxx-config-version.cmake +37 -0
  1954. cuda/cccl/headers/lib/cmake/libcudacxx/libcudacxx-config.cmake +297 -0
  1955. cuda/cccl/headers/lib/cmake/libcudacxx/libcudacxx-header-search.cmake +15 -0
  1956. cuda/cccl/headers/lib/cmake/thrust/FindTBB.cmake +498 -0
  1957. cuda/cccl/headers/lib/cmake/thrust/README.md +258 -0
  1958. cuda/cccl/headers/lib/cmake/thrust/thrust-config-version.cmake +37 -0
  1959. cuda/cccl/headers/lib/cmake/thrust/thrust-config.cmake +983 -0
  1960. cuda/cccl/headers/lib/cmake/thrust/thrust-header-search.cmake +15 -0
  1961. cuda/cccl/parallel/__init__.py +9 -0
  1962. cuda/cccl/parallel/experimental/__init__.py +24 -0
  1963. cuda/cccl/py.typed +0 -0
  1964. cuda/compute/__init__.py +91 -0
  1965. cuda/compute/_bindings.py +79 -0
  1966. cuda/compute/_bindings.pyi +516 -0
  1967. cuda/compute/_bindings_impl.pyx +2470 -0
  1968. cuda/compute/_caching.py +83 -0
  1969. cuda/compute/_cccl_interop.py +354 -0
  1970. cuda/compute/_odr_helpers.py +238 -0
  1971. cuda/compute/_utils/__init__.py +0 -0
  1972. cuda/compute/_utils/protocols.py +145 -0
  1973. cuda/compute/_utils/temp_storage_buffer.py +87 -0
  1974. cuda/compute/algorithms/__init__.py +62 -0
  1975. cuda/compute/algorithms/_histogram.py +243 -0
  1976. cuda/compute/algorithms/_reduce.py +205 -0
  1977. cuda/compute/algorithms/_scan.py +344 -0
  1978. cuda/compute/algorithms/_segmented_reduce.py +265 -0
  1979. cuda/compute/algorithms/_select.py +196 -0
  1980. cuda/compute/algorithms/_sort/__init__.py +23 -0
  1981. cuda/compute/algorithms/_sort/_merge_sort.py +235 -0
  1982. cuda/compute/algorithms/_sort/_radix_sort.py +263 -0
  1983. cuda/compute/algorithms/_sort/_segmented_sort.py +288 -0
  1984. cuda/compute/algorithms/_sort/_sort_common.py +52 -0
  1985. cuda/compute/algorithms/_three_way_partition.py +292 -0
  1986. cuda/compute/algorithms/_transform.py +317 -0
  1987. cuda/compute/algorithms/_unique_by_key.py +259 -0
  1988. cuda/compute/cccl/.gitkeep +0 -0
  1989. cuda/compute/cu12/_bindings_impl.cpython-312-x86_64-linux-gnu.so +0 -0
  1990. cuda/compute/cu12/cccl/libcccl.c.parallel.so +0 -0
  1991. cuda/compute/cu13/_bindings_impl.cpython-312-x86_64-linux-gnu.so +0 -0
  1992. cuda/compute/cu13/cccl/libcccl.c.parallel.so +0 -0
  1993. cuda/compute/determinism.py +3 -0
  1994. cuda/compute/iterators/__init__.py +23 -0
  1995. cuda/compute/iterators/_factories.py +251 -0
  1996. cuda/compute/iterators/_iterators.py +680 -0
  1997. cuda/compute/iterators/_permutation_iterator.py +266 -0
  1998. cuda/compute/iterators/_zip_iterator.py +268 -0
  1999. cuda/compute/numba_utils.py +54 -0
  2000. cuda/compute/op.py +140 -0
  2001. cuda/compute/struct.py +520 -0
  2002. cuda/compute/typing.py +36 -0
  2003. cuda/coop/__init__.py +8 -0
  2004. cuda/coop/_caching.py +48 -0
  2005. cuda/coop/_common.py +275 -0
  2006. cuda/coop/_nvrtc.py +92 -0
  2007. cuda/coop/_scan_op.py +181 -0
  2008. cuda/coop/_types.py +937 -0
  2009. cuda/coop/_typing.py +107 -0
  2010. cuda/coop/block/__init__.py +39 -0
  2011. cuda/coop/block/_block_exchange.py +251 -0
  2012. cuda/coop/block/_block_load_store.py +215 -0
  2013. cuda/coop/block/_block_merge_sort.py +125 -0
  2014. cuda/coop/block/_block_radix_sort.py +214 -0
  2015. cuda/coop/block/_block_reduce.py +294 -0
  2016. cuda/coop/block/_block_scan.py +983 -0
  2017. cuda/coop/warp/__init__.py +9 -0
  2018. cuda/coop/warp/_warp_merge_sort.py +92 -0
  2019. cuda/coop/warp/_warp_reduce.py +153 -0
  2020. cuda/coop/warp/_warp_scan.py +78 -0
  2021. cuda_cccl-0.4.3.dist-info/METADATA +84 -0
  2022. cuda_cccl-0.4.3.dist-info/RECORD +2024 -0
  2023. cuda_cccl-0.4.3.dist-info/WHEEL +5 -0
  2024. cuda_cccl-0.4.3.dist-info/licenses/LICENSE +1 -0
@@ -0,0 +1,2129 @@
1
+ // SPDX-FileCopyrightText: Copyright (c) 2011, Duane Merrill. All rights reserved.
2
+ // SPDX-FileCopyrightText: Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
3
+ // SPDX-License-Identifier: BSD-3
4
+
5
+ /**
6
+ * @file
7
+ * The cub::BlockRadixSort class provides [<em>collective</em>](../index.html#sec0) methods for radix
8
+ * sorting of items partitioned across a CUDA thread block.
9
+ */
10
+
11
+ #pragma once
12
+
13
+ #include <cub/config.cuh>
14
+
15
+ #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
16
+ # pragma GCC system_header
17
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
18
+ # pragma clang system_header
19
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
20
+ # pragma system_header
21
+ #endif // no system header
22
+
23
+ #include <cub/block/block_exchange.cuh>
24
+ #include <cub/block/block_radix_rank.cuh>
25
+ #include <cub/block/radix_rank_sort_operations.cuh>
26
+ #include <cub/util_ptx.cuh>
27
+ #include <cub/util_type.cuh>
28
+
29
+ #include <cuda/std/__algorithm/min.h>
30
+ #include <cuda/std/__type_traits/enable_if.h>
31
+ #include <cuda/std/__type_traits/integral_constant.h>
32
+ #include <cuda/std/__type_traits/is_convertible.h>
33
+ #include <cuda/std/__type_traits/is_same.h>
34
+
35
+ CUB_NAMESPACE_BEGIN
36
+
37
+ //! @rst
38
+ //! BlockRadixSort class provides :ref:`collective <collective-primitives>` methods for sorting
39
+ //! items partitioned across a CUDA thread block using a radix sorting method.
40
+ //!
41
+ //! .. image:: ../../img/sorting_logo.png
42
+ //! :align: center
43
+ //!
44
+ //! Overview
45
+ //! --------------------------------------------------
46
+ //!
47
+ //! The `radix sorting method <http://en.wikipedia.org/wiki/Radix_sort>`_ arranges
48
+ //! items into ascending order. It relies upon a positional representation for
49
+ //! keys, i.e., each key is comprised of an ordered sequence of symbols (e.g., digits,
50
+ //! characters, etc.) specified from least-significant to most-significant. For a
51
+ //! given input sequence of keys and a set of rules specifying a total ordering
52
+ //! of the symbolic alphabet, the radix sorting method produces a lexicographic
53
+ //! ordering of those keys.
54
+ //!
55
+ //! @rowmajor
56
+ //!
57
+ //! Supported Types
58
+ //! --------------------------------------------------
59
+ //!
60
+ //! BlockRadixSort can sort all of the built-in C++ numeric primitive types
61
+ //! (``unsigned char``, ``int``, ``double``, etc.) as well as CUDA's ``__half``
62
+ //! half-precision floating-point type. User-defined types are supported as long
63
+ //! as a decomposer object is provided.
64
+ //!
65
+ //! Floating-Point Special Cases
66
+ //! --------------------------------------------------
67
+ //!
68
+ //! - Positive and negative zeros are considered equivalent, and will be treated
69
+ //! as such in the output.
70
+ //! - No special handling is implemented for NaN values; these are sorted
71
+ //! according to their bit representations after any transformations.
72
+ //!
73
+ //! Bitwise Key Transformations
74
+ //! --------------------------------------------------
75
+ //!
76
+ //! Although the direct radix sorting method can only be applied to unsigned
77
+ //! integral types, BlockRadixSort is able to sort signed and floating-point
78
+ //! types via simple bit-wise transformations that ensure lexicographic key
79
+ //! ordering.
80
+ //!
81
+ //! These transformations must be considered when restricting the
82
+ //! ``[begin_bit, end_bit)`` range, as the bitwise transformations will occur
83
+ //! before the bit-range truncation.
84
+ //!
85
+ //! Any transformations applied to the keys prior to sorting are reversed
86
+ //! while writing to the final output buffer.
87
+ //!
88
+ //! Type Specific Bitwise Transformations
89
+ //! --------------------------------------------------
90
+ //!
91
+ //! To convert the input values into a radix-sortable bitwise representation,
92
+ //! the following transformations take place prior to sorting:
93
+ //!
94
+ //! * For unsigned integral values, the keys are used directly.
95
+ //! * For signed integral values, the sign bit is inverted.
96
+ //! * For positive floating point values, the sign bit is inverted.
97
+ //! * For negative floating point values, the full key is inverted.
98
+ //!
99
+ //! No Descending Sort Transformations
100
+ //! --------------------------------------------------
101
+ //!
102
+ //! Unlike ``DeviceRadixSort``, ``BlockRadixSort`` does not invert the input key bits
103
+ //! when performing a descending sort. Instead, it has special logic to reverse
104
+ //! the order of the keys while sorting.
105
+ //!
106
+ //! Stability
107
+ //! --------------------------------------------------
108
+ //!
109
+ //! BlockRadixSort is stable. For floating-point types -0.0 and +0.0
110
+ //! are considered equal and appear in the result in the same order as they
111
+ //! appear in the input.
112
+ //!
113
+ //!
114
+ //! Performance Considerations
115
+ //! --------------------------------------------------
116
+ //!
117
+ //! * @granularity
118
+ //!
119
+ //! A Simple Example
120
+ //! --------------------------------------------------
121
+ //!
122
+ //! @blockcollective{BlockRadixSort}
123
+ //!
124
+ //! The code snippet below illustrates a sort of 512 integer keys that
125
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128 threads
126
+ //! where each thread owns 4 consecutive items.
127
+ //!
128
+ //! .. tab-set-code::
129
+ //!
130
+ //! .. code-block:: c++
131
+ //!
132
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_radix_sort.cuh>
133
+ //!
134
+ //! __global__ void kernel(...)
135
+ //! {
136
+ //! // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer items each
137
+ //! using BlockRadixSort = cub::BlockRadixSort<int, 128, 4>;
138
+ //!
139
+ //! // Allocate shared memory for BlockRadixSort
140
+ //! __shared__ typename BlockRadixSort::TempStorage temp_storage;
141
+ //!
142
+ //! // Obtain a segment of consecutive items that are blocked across threads
143
+ //! int thread_keys[4];
144
+ //! ...
145
+ //!
146
+ //! // Collectively sort the keys
147
+ //! BlockRadixSort(temp_storage).Sort(thread_keys);
148
+ //!
149
+ //! ...
150
+ //!
151
+ //! .. code-block:: python
152
+ //!
153
+ //! from cuda import coop
154
+ //! from pynvjitlink import patch
155
+ //! patch.patch_numba_linker(lto=True)
156
+ //!
157
+ //! # Specialize radix sort for a 1D block of 128 threads owning 4 integer items each
158
+ //! block_radix_sort = coop.block.radix_sort_keys(numba.int32, 128, 4)
159
+ //! temp_storage_bytes = block_radix_sort.temp_storage_bytes
160
+ //!
161
+ //! @cuda.jit(link=block_radix_sort.files)
162
+ //! def kernel():
163
+ //! # Allocate shared memory for radix sort
164
+ //! temp_storage = cuda.shared.array(shape=temp_storage_bytes, dtype='uint8')
165
+ //!
166
+ //! # Obtain a segment of consecutive items that are blocked across threads
167
+ //! thread_keys = cuda.local.array(shape=items_per_thread, dtype=numba.int32)
168
+ //! # ...
169
+ //!
170
+ //! # Collectively sort the keys
171
+ //! block_radix_sort(temp_storage, thread_keys)
172
+ //! # ...
173
+ //!
174
+ //! Suppose the set of input ``thread_keys`` across the block of threads is
175
+ //! ``{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }``.
176
+ //! The corresponding output ``thread_keys`` in those threads will be
177
+ //! ``{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }``.
178
+ //!
179
+ //! Re-using dynamically allocated shared memory
180
+ //! --------------------------------------------------
181
+ //!
182
+ //! The ``block/example_block_reduce_dyn_smem.cu`` example illustrates usage of dynamic shared memory with
183
+ //! BlockReduce and how to re-purpose the same memory region.
184
+ //!
185
+ //! This example can be easily adapted to the storage required by BlockRadixSort.
186
+ //! @endrst
187
+ //!
188
+ //! @tparam KeyT
189
+ //! KeyT type
190
+ //!
191
+ //! @tparam BlockDimX
192
+ //! The thread block length in threads along the X dimension
193
+ //!
194
+ //! @tparam ItemsPerThread
195
+ //! The number of items per thread
196
+ //!
197
+ //! @tparam ValueT
198
+ //! **[optional]** ValueT type (default: cub::NullType, which indicates a keys-only sort)
199
+ //!
200
+ //! @tparam RadixBits
201
+ //! **[optional]** The number of radix bits per digit place (default: 4 bits)
202
+ //!
203
+ //! @tparam MemoizeOuterScan
204
+ //! **[optional]** Whether or not to buffer outer raking scan partials to incur fewer shared memory
205
+ //! reads at the expense of higher register pressure (default: true, independent of the
205
+ //! target architecture).
207
+ //!
208
+ //! @tparam InnerScanAlgorithm
209
+ //! **[optional]** The cub::BlockScanAlgorithm algorithm to use
210
+ //! (default: cub::BLOCK_SCAN_WARP_SCANS)
211
+ //!
212
+ //! @tparam SMemConfig
213
+ //! **[optional]** Shared memory bank mode (default: `cudaSharedMemBankSizeFourByte`)
214
+ //!
215
+ //! @tparam BlockDimY
216
+ //! **[optional]** The thread block length in threads along the Y dimension (default: 1)
217
+ //!
218
+ //! @tparam BlockDimZ
219
+ //! **[optional]** The thread block length in threads along the Z dimension (default: 1)
220
+ //!
221
+ template <typename KeyT,
222
+ int BlockDimX,
223
+ int ItemsPerThread,
224
+ typename ValueT = NullType,
225
+ int RadixBits = 4,
226
+ bool MemoizeOuterScan = true,
227
+ BlockScanAlgorithm InnerScanAlgorithm = BLOCK_SCAN_WARP_SCANS,
228
+ cudaSharedMemConfig SMemConfig = cudaSharedMemBankSizeFourByte,
229
+ int BlockDimY = 1,
230
+ int BlockDimZ = 1>
231
+ class BlockRadixSort
232
+ {
233
+ private:
234
+ /******************************************************************************
235
+ * Constants and type definitions
236
+ ******************************************************************************/
237
+
238
+ // Total number of threads in the block (product of the X, Y and Z dimensions)
239
+ static constexpr int BLOCK_THREADS = BlockDimX * BlockDimY * BlockDimZ;
240
+
241
+ // True when ValueT is NullType, i.e. a keys-only sort with no values trucked along
242
+ static constexpr bool KEYS_ONLY = ::cuda::std::is_same_v<ValueT, NullType>;
243
+
244
+ // Radix traits for KeyT, its bit-ordered representation, and the policy converting to/from it
245
+ using traits = detail::radix::traits_t<KeyT>;
246
+ using bit_ordered_type = typename traits::bit_ordered_type;
247
+ using bit_ordered_conversion = typename traits::bit_ordered_conversion_policy;
248
+
249
+ /// BlockRadixRank specialization used by the ascending RankKeys overload
250
+ using AscendingBlockRadixRank =
251
+ BlockRadixRank<BlockDimX, RadixBits, false, MemoizeOuterScan, InnerScanAlgorithm, SMemConfig, BlockDimY, BlockDimZ>;
252
+
253
+ /// BlockRadixRank specialization used by the descending RankKeys overload
254
+ using DescendingBlockRadixRank =
255
+ BlockRadixRank<BlockDimX, RadixBits, true, MemoizeOuterScan, InnerScanAlgorithm, SMemConfig, BlockDimY, BlockDimZ>;
256
+
257
+ /// Digit extractor type handed to traits::digit_extractor when each pass builds its extractor
258
+ using fundamental_digit_extractor_t = BFEDigitExtractor<KeyT>;
259
+
260
+ /// BlockExchange utility type used to scatter keys by rank
261
+ using BlockExchangeKeys = BlockExchange<KeyT, BlockDimX, ItemsPerThread, false, BlockDimY, BlockDimZ>;
262
+
263
+ /// BlockExchange utility type used to scatter values by rank
264
+ using BlockExchangeValues = BlockExchange<ValueT, BlockDimX, ItemsPerThread, false, BlockDimY, BlockDimZ>;
265
+
266
+ #ifndef _CCCL_DOXYGEN_INVOKED // Do not document
267
+ /// Shared memory storage layout type (a union: the ranking and exchange phases alias the same storage)
268
+ union _TempStorage
269
+ {
270
+ typename AscendingBlockRadixRank::TempStorage asending_ranking_storage; // NOTE(review): spelled "asending"; RankKeys refers to it by this spelling
271
+ typename DescendingBlockRadixRank::TempStorage descending_ranking_storage;
272
+ typename BlockExchangeKeys::TempStorage exchange_keys;
273
+ typename BlockExchangeValues::TempStorage exchange_values;
274
+ };
275
+ #endif // _CCCL_DOXYGEN_INVOKED
276
+
277
+ /******************************************************************************
278
+ * Thread fields
279
+ ******************************************************************************/
280
+
281
+ /// Shared storage reference
282
+ _TempStorage& temp_storage;
283
+
284
+ /// Linear thread-id
285
+ unsigned int linear_tid;
286
+
287
+ /******************************************************************************
288
+ * Utility methods
289
+ ******************************************************************************/
290
+
291
+ /// Internal storage allocator: returns a reference to a function-local __shared__ _TempStorage (used by the default constructor)
292
+ _CCCL_DEVICE _CCCL_FORCEINLINE _TempStorage& PrivateStorage()
293
+ {
294
+ __shared__ _TempStorage private_storage;
295
+ return private_storage;
296
+ }
297
+
298
+ /// Rank keys (ascending overload, selected by the false_type tag); delegates to AscendingBlockRadixRank
299
+ template <class DigitExtractorT>
300
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
301
+ RankKeys(bit_ordered_type (&unsigned_keys)[ItemsPerThread],
302
+ int (&ranks)[ItemsPerThread],
303
+ DigitExtractorT digit_extractor,
304
+ ::cuda::std::false_type /*is_descending*/)
305
+ {
306
+ AscendingBlockRadixRank(temp_storage.asending_ranking_storage).RankKeys(unsigned_keys, ranks, digit_extractor); // writes one rank per key into ranks
307
+ }
308
+
309
+ /// Rank keys (descending overload, selected by the true_type tag); delegates to DescendingBlockRadixRank
310
+ template <class DigitExtractorT>
311
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
312
+ RankKeys(bit_ordered_type (&unsigned_keys)[ItemsPerThread],
313
+ int (&ranks)[ItemsPerThread],
314
+ DigitExtractorT digit_extractor,
315
+ ::cuda::std::true_type /*is_descending*/)
316
+ {
317
+ DescendingBlockRadixRank(temp_storage.descending_ranking_storage).RankKeys(unsigned_keys, ranks, digit_extractor); // writes one rank per key into ranks
318
+ }
319
+
320
+ /// ExchangeValues (specialized for key-value sort, to-blocked arrangement): scatters values by rank
321
+ _CCCL_DEVICE _CCCL_FORCEINLINE void ExchangeValues(
322
+ ValueT (&values)[ItemsPerThread],
323
+ int (&ranks)[ItemsPerThread],
324
+ ::cuda::std::false_type /*is_keys_only*/,
325
+ ::cuda::std::true_type /*is_blocked*/)
326
+ {
327
+ __syncthreads(); // exchange_values aliases exchange_keys in the _TempStorage union; callers scatter keys just before this
328
+
329
+ // Exchange values through shared memory in blocked arrangement
330
+ BlockExchangeValues(temp_storage.exchange_values).ScatterToBlocked(values, ranks);
331
+ }
332
+
333
+ /// ExchangeValues (specialized for key-value sort, to-striped arrangement): scatters values by rank
334
+ _CCCL_DEVICE _CCCL_FORCEINLINE void ExchangeValues(
335
+ ValueT (&values)[ItemsPerThread],
336
+ int (&ranks)[ItemsPerThread],
337
+ ::cuda::std::false_type /*is_keys_only*/,
338
+ ::cuda::std::false_type /*is_blocked*/)
339
+ {
340
+ __syncthreads(); // exchange_values aliases exchange_keys in the _TempStorage union; callers scatter keys just before this
341
+
342
+ // Exchange values through shared memory in striped arrangement
343
+ BlockExchangeValues(temp_storage.exchange_values).ScatterToStriped(values, ranks);
344
+ }
345
+
346
+ /// ExchangeValues (specialized for keys-only sort): intentionally empty, for either target arrangement
347
+ template <bool IS_BLOCKED>
348
+ _CCCL_DEVICE _CCCL_FORCEINLINE void ExchangeValues(
349
+ ValueT (& /*values*/)[ItemsPerThread],
350
+ int (& /*ranks*/)[ItemsPerThread],
351
+ ::cuda::std::true_type /*is_keys_only*/,
352
+ ::cuda::std::bool_constant<IS_BLOCKED> /*is_blocked*/)
353
+ {}
354
+
355
+ /**
356
+ * @brief Radix-sorts a blocked arrangement of keys (and values, unless keys-only), leaving the result blocked
357
+ * The optional @p decomposer is forwarded to the bit-ordered conversion and to the digit extractor.
358
+ * @param keys
359
+ * Keys to sort (converted to their bit-ordered form in place for the duration of the sort)
360
+ *
361
+ * @param values
362
+ * Values to sort
363
+ *
364
+ * @param begin_bit
365
+ * The beginning (least-significant) bit index needed for key comparison
366
+ *
367
+ * @param end_bit
368
+ * The past-the-end (most-significant) bit index needed for key comparison
369
+ *
370
+ * @param is_descending
371
+ * Tag selecting the descending (true_type) or ascending (false_type) RankKeys overload
372
+ *
373
+ * @param is_keys_only
374
+ * Tag selecting the keys-only (true_type) or key-value (false_type) ExchangeValues overload
375
+ */
376
+ template <bool DESCENDING, bool KEYS_ONLY, class DecomposerT = detail::identity_decomposer_t>
377
+ _CCCL_DEVICE _CCCL_FORCEINLINE void SortBlocked(
378
+ KeyT (&keys)[ItemsPerThread],
379
+ ValueT (&values)[ItemsPerThread],
380
+ int begin_bit,
381
+ int end_bit,
382
+ ::cuda::std::bool_constant<DESCENDING> is_descending,
383
+ ::cuda::std::bool_constant<KEYS_ONLY> is_keys_only,
384
+ DecomposerT decomposer = {})
385
+ {
386
+ bit_ordered_type(&unsigned_keys)[ItemsPerThread] = reinterpret_cast<bit_ordered_type(&)[ItemsPerThread]>(keys); // same storage as keys, viewed as bit_ordered_type
387
+
388
+ _CCCL_PRAGMA_UNROLL_FULL()
389
+ for (int KEY = 0; KEY < ItemsPerThread; KEY++)
390
+ {
391
+ unsigned_keys[KEY] = bit_ordered_conversion::to_bit_ordered(decomposer, unsigned_keys[KEY]);
392
+ }
393
+
394
+ // Radix sorting passes: each iteration handles up to RadixBits bits starting at begin_bit
395
+ while (true) // NOTE(review): the exit test follows the exchange, so one pass always executes
396
+ {
397
+ int pass_bits = ::cuda::std::min(RadixBits, end_bit - begin_bit);
398
+ auto digit_extractor =
399
+ traits::template digit_extractor<fundamental_digit_extractor_t>(begin_bit, pass_bits, decomposer);
400
+
401
+ // Rank the blocked keys
402
+ int ranks[ItemsPerThread];
403
+ RankKeys(unsigned_keys, ranks, digit_extractor, is_descending);
404
+ begin_bit += RadixBits;
405
+
406
+ __syncthreads(); // ranking storage and exchange storage share the _TempStorage union
407
+
408
+ // Exchange keys through shared memory in blocked arrangement
409
+ BlockExchangeKeys(temp_storage.exchange_keys).ScatterToBlocked(keys, ranks);
410
+
411
+ // Exchange values through shared memory in blocked arrangement (empty overload when keys-only)
412
+ ExchangeValues(values, ranks, is_keys_only, ::cuda::std::true_type());
413
+
414
+ // Quit if done
415
+ if (begin_bit >= end_bit)
416
+ {
417
+ break;
418
+ }
419
+
420
+ __syncthreads(); // the next pass ranks through the same union storage the exchange just used
421
+ }
422
+
423
+ // Convert the keys back from their bit-ordered form
424
+ _CCCL_PRAGMA_UNROLL_FULL()
425
+ for (int KEY = 0; KEY < ItemsPerThread; KEY++)
426
+ {
427
+ unsigned_keys[KEY] = bit_ordered_conversion::from_bit_ordered(decomposer, unsigned_keys[KEY]);
428
+ }
429
+ }
430
+
431
+ public:
432
+ #ifndef _CCCL_DOXYGEN_INVOKED // Do not document
433
+
434
+ /**
435
+ * @brief Radix-sorts a blocked arrangement of keys (and values, unless keys-only), leaving the result striped
436
+ * Intermediate passes scatter to a blocked arrangement; only the last pass scatters to striped.
437
+ * @param keys
438
+ * Keys to sort (converted to their bit-ordered form in place for the duration of the sort)
439
+ *
440
+ * @param values
441
+ * Values to sort
442
+ *
443
+ * @param begin_bit
444
+ * The beginning (least-significant) bit index needed for key comparison
445
+ *
446
+ * @param end_bit
447
+ * The past-the-end (most-significant) bit index needed for key comparison
448
+ *
449
+ * @param is_descending
450
+ * Tag selecting the descending (true_type) or ascending (false_type) RankKeys overload
451
+ *
452
+ * @param is_keys_only
453
+ * Tag selecting the keys-only (true_type) or key-value (false_type) ExchangeValues overload
454
+ */
455
+ template <bool DESCENDING, bool KEYS_ONLY, class DecomposerT = detail::identity_decomposer_t>
456
+ _CCCL_DEVICE _CCCL_FORCEINLINE void SortBlockedToStriped(
457
+ KeyT (&keys)[ItemsPerThread],
458
+ ValueT (&values)[ItemsPerThread],
459
+ int begin_bit,
460
+ int end_bit,
461
+ ::cuda::std::bool_constant<DESCENDING> is_descending,
462
+ ::cuda::std::bool_constant<KEYS_ONLY> is_keys_only,
463
+ DecomposerT decomposer = {})
464
+ {
465
+ bit_ordered_type(&unsigned_keys)[ItemsPerThread] = reinterpret_cast<bit_ordered_type(&)[ItemsPerThread]>(keys); // same storage as keys, viewed as bit_ordered_type
466
+
467
+ _CCCL_PRAGMA_UNROLL_FULL()
468
+ for (int KEY = 0; KEY < ItemsPerThread; KEY++)
469
+ {
470
+ unsigned_keys[KEY] = bit_ordered_conversion::to_bit_ordered(decomposer, unsigned_keys[KEY]);
471
+ }
472
+
473
+ // Radix sorting passes: each iteration handles up to RadixBits bits starting at begin_bit
474
+ while (true) // NOTE(review): the exit test follows the ranking, so one pass always executes
475
+ {
476
+ int pass_bits = ::cuda::std::min(RadixBits, end_bit - begin_bit);
477
+ auto digit_extractor =
478
+ traits::template digit_extractor<fundamental_digit_extractor_t>(begin_bit, pass_bits, decomposer);
479
+
480
+ // Rank the blocked keys
481
+ int ranks[ItemsPerThread];
482
+ RankKeys(unsigned_keys, ranks, digit_extractor, is_descending);
483
+ begin_bit += RadixBits;
484
+
485
+ __syncthreads(); // ranking storage and exchange storage share the _TempStorage union
486
+
487
+ // Check if this is the last pass
488
+ if (begin_bit >= end_bit)
489
+ {
490
+ // Last pass exchanges keys through shared memory in striped arrangement
491
+ BlockExchangeKeys(temp_storage.exchange_keys).ScatterToStriped(keys, ranks);
492
+
493
+ // Last pass exchanges values through shared memory in striped arrangement (empty overload when keys-only)
494
+ ExchangeValues(values, ranks, is_keys_only, ::cuda::std::false_type());
495
+
496
+ // Quit
497
+ break;
498
+ }
499
+
500
+ // Exchange keys through shared memory in blocked arrangement
501
+ BlockExchangeKeys(temp_storage.exchange_keys).ScatterToBlocked(keys, ranks);
502
+
503
+ // Exchange values through shared memory in blocked arrangement (empty overload when keys-only)
504
+ ExchangeValues(values, ranks, is_keys_only, ::cuda::std::true_type());
505
+
506
+ __syncthreads(); // the next pass ranks through the same union storage the exchange just used
507
+ }
508
+
509
+ // Convert the keys back from their bit-ordered form
510
+ _CCCL_PRAGMA_UNROLL_FULL()
511
+ for (int KEY = 0; KEY < ItemsPerThread; KEY++)
512
+ {
513
+ unsigned_keys[KEY] = bit_ordered_conversion::from_bit_ordered(decomposer, unsigned_keys[KEY]);
514
+ }
515
+ }
516
+
517
+ #endif // _CCCL_DOXYGEN_INVOKED
518
+
519
+ /// @smemstorage{BlockRadixSort}
520
+ struct TempStorage : Uninitialized<_TempStorage> // the storage constructor recovers the _TempStorage reference via Alias()
521
+ {};
522
+
523
+ //! @name Collective constructors
524
+ //! @{
525
+
526
+ //! @brief Collective constructor using a private static allocation of shared memory as temporary storage.
527
+ _CCCL_DEVICE _CCCL_FORCEINLINE BlockRadixSort()
528
+ : temp_storage(PrivateStorage()) // binds to the function-local __shared__ instance declared in PrivateStorage()
529
+ , linear_tid(RowMajorTid(BlockDimX, BlockDimY, BlockDimZ)) // linear thread id derived from the block dimensions
530
+ {}
531
+
532
+ /**
533
+ * @brief Collective constructor using the specified memory allocation as temporary storage.
534
+ *
535
+ * @param[in] temp_storage
536
+ * Reference to memory allocation having layout type TempStorage; the sorter keeps a reference to it
537
+ */
538
+ _CCCL_DEVICE _CCCL_FORCEINLINE BlockRadixSort(TempStorage& temp_storage)
539
+ : temp_storage(temp_storage.Alias()) // inside the parentheses temp_storage names the parameter, not the member
540
+ , linear_tid(RowMajorTid(BlockDimX, BlockDimY, BlockDimZ)) // linear thread id derived from the block dimensions
541
+ {}
542
+
543
+ //! @} end member group
544
+ //! @name Sorting (blocked arrangements)
545
+ //! @{
546
+
547
+ //! @rst
548
+ //! Performs an ascending block-wide radix sort over a
549
+ //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys.
550
+ //!
551
+ //! - @granularity
552
+ //! - @smemreuse
553
+ //!
554
+ //! Snippet
555
+ //! +++++++
556
+ //!
557
+ //! The code snippet below illustrates a sort of 512 integer keys that
558
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128 threads
559
+ //! where each thread owns 4 consecutive keys.
560
+ //!
561
+ //! .. code-block:: c++
562
+ //!
563
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_radix_sort.cuh>
564
+ //!
565
+ //! __global__ void ExampleKernel(...)
566
+ //! {
567
+ //! // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys each
568
+ //! using BlockRadixSort = cub::BlockRadixSort<int, 128, 4>;
569
+ //!
570
+ //! // Allocate shared memory for BlockRadixSort
571
+ //! __shared__ typename BlockRadixSort::TempStorage temp_storage;
572
+ //!
573
+ //! // Obtain a segment of consecutive items that are blocked across threads
574
+ //! int thread_keys[4];
575
+ //! ...
576
+ //!
577
+ //! // Collectively sort the keys
578
+ //! BlockRadixSort(temp_storage).Sort(thread_keys);
579
+ //!
580
+ //! Suppose the set of input ``thread_keys`` across the block of threads is
581
+ //! ``{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }``.
582
+ //! The corresponding output ``thread_keys`` in those threads will be
583
+ //! ``{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }``.
584
+ //! @endrst
585
+ //!
586
+ //! @param[in,out] keys
587
+ //! Keys to sort
588
+ //!
589
+ //! @param[in] begin_bit
590
+ //! **[optional]** The beginning (least-significant) bit index needed for key comparison
591
+ //!
592
+ //! @param[in] end_bit
593
+ //! **[optional]** The past-the-end (most-significant) bit index needed for key comparison
594
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
595
+ Sort(KeyT (&keys)[ItemsPerThread], int begin_bit = 0, int end_bit = sizeof(KeyT) * 8)
596
+ {
597
+ NullType values[ItemsPerThread]; // placeholder: SortBlocked always takes a values array
598
+
599
+ SortBlocked(keys, values, begin_bit, end_bit, ::cuda::std::false_type(), detail::bool_constant_v<KEYS_ONLY>); // false_type selects the ascending RankKeys overload
600
+ }
601
+
602
+ //! @rst
603
+ //! Performs an ascending block-wide radix sort over a
604
+ //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys.
605
+ //!
606
+ //! * @granularity
607
+ //! * @smemreuse
608
+ //!
609
+ //! Snippet
610
+ //! ==========================================================================
611
+ //!
612
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
613
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
614
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
615
+ //! tuple of references to relevant members of the key.
616
+ //!
617
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
618
+ //! :language: c++
619
+ //! :dedent:
620
+ //! :start-after: example-begin custom-type
621
+ //! :end-before: example-end custom-type
622
+ //!
623
+ //! The code snippet below illustrates a sort of 2 keys that
624
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 2 threads
625
+ //! where each thread owns 1 key.
626
+ //!
627
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
628
+ //! :language: c++
629
+ //! :dedent:
630
+ //! :start-after: example-begin keys-bits
631
+ //! :end-before: example-end keys-bits
632
+ //!
633
+ //! @endrst
634
+ //!
635
+ //! @tparam DecomposerT
636
+ //! **[inferred]** Type of a callable object responsible for decomposing a
637
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
638
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
639
+ //! The leftmost element of the tuple is considered the most significant.
640
+ //! The call operator must not modify members of the key.
641
+ //!
642
+ //! @param[in,out] keys
643
+ //! Keys to sort
644
+ //!
645
+ //! @param decomposer
646
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
647
+ //! references to its constituent arithmetic types. The leftmost element of
648
+ //! the tuple is considered the most significant. The call operator must not
649
+ //! modify members of the key.
650
+ //!
651
+ //! @param[in] begin_bit
652
+ //! The least-significant bit index (inclusive) needed for
653
+ //! key comparison
654
+ //!
655
+ //! @param[in] end_bit
656
+ //! The most-significant bit index (exclusive) needed for key
657
+ //! comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`)
658
+ template <class DecomposerT>
659
+ _CCCL_DEVICE _CCCL_FORCEINLINE //
660
+ ::cuda::std::enable_if_t< // overload participates only when DecomposerT is not convertible to int
661
+ !::cuda::std::is_convertible_v<DecomposerT, int>>
662
+ Sort(KeyT (&keys)[ItemsPerThread], DecomposerT decomposer, int begin_bit, int end_bit)
663
+ {
664
+ NullType values[ItemsPerThread]; // placeholder: SortBlocked always takes a values array
665
+
666
+ SortBlocked( // false_type selects the ascending RankKeys overload
667
+ keys, values, begin_bit, end_bit, ::cuda::std::false_type(), detail::bool_constant_v<KEYS_ONLY>, decomposer);
668
+ }
669
+
670
+ //! @rst
671
+ //! Performs an ascending block-wide radix sort over a
672
+ //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys.
673
+ //!
674
+ //! * @granularity
675
+ //! * @smemreuse
676
+ //!
677
+ //! Snippet
678
+ //! ==========================================================================
679
+ //!
680
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
681
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
682
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
683
+ //! tuple of references to relevant members of the key.
684
+ //!
685
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
686
+ //! :language: c++
687
+ //! :dedent:
688
+ //! :start-after: example-begin custom-type
689
+ //! :end-before: example-end custom-type
690
+ //!
691
+ //! The code snippet below illustrates a sort of 6 keys that
692
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 2 threads
693
+ //! where each thread owns 3 consecutive keys.
694
+ //!
695
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
696
+ //! :language: c++
697
+ //! :dedent:
698
+ //! :start-after: example-begin keys
699
+ //! :end-before: example-end keys
700
+ //!
701
+ //! @endrst
702
+ //!
703
+ //! @tparam DecomposerT
704
+ //! **[inferred]** Type of a callable object responsible for decomposing a
705
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
706
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
707
+ //! The leftmost element of the tuple is considered the most significant.
708
+ //! The call operator must not modify members of the key.
709
+ //!
710
+ //! @param[in,out] keys
711
+ //! Keys to sort
712
+ //!
713
+ //! @param decomposer
714
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
715
+ //! references to its constituent arithmetic types. The leftmost element of
716
+ //! the tuple is considered the most significant. The call operator must not
717
+ //! modify members of the key.
718
+ template <class DecomposerT>
719
+ _CCCL_DEVICE _CCCL_FORCEINLINE //
720
+ ::cuda::std::enable_if_t< // overload participates only when DecomposerT is not convertible to int
721
+ !::cuda::std::is_convertible_v<DecomposerT, int>>
722
+ Sort(KeyT (&keys)[ItemsPerThread], DecomposerT decomposer)
723
+ {
724
+ Sort(keys, decomposer, 0, detail::radix::traits_t<KeyT>::default_end_bit(decomposer)); // bit range [0, default_end_bit); presumably the full decomposed key width - TODO confirm
725
+ }
726
+
727
+ //! @rst
728
+ //! Performs an ascending block-wide radix sort across a :ref:`blocked arrangement <flexible-data-arrangement>`
729
+ //! of keys and values.
730
+ //!
731
+ //! - BlockRadixSort can only accommodate one associated tile of values. To "truck along"
732
+ //! more than one tile of values, simply perform a key-value sort of the keys paired
733
+ //! with a temporary value array that enumerates the key indices. The reordered indices
734
+ //! can then be used as a gather-vector for exchanging other associated tile data through
735
+ //! shared memory.
736
+ //! - @granularity
737
+ //! - @smemreuse
738
+ //!
739
+ //! Snippet
740
+ //! +++++++
741
+ //!
742
+ //! The code snippet below illustrates a sort of 512 integer keys and values that
743
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128 threads
744
+ //! where each thread owns 4 consecutive pairs.
745
+ //!
746
+ //! .. code-block:: c++
747
+ //!
748
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_radix_sort.cuh>
749
+ //!
750
+ //! __global__ void ExampleKernel(...)
751
+ //! {
752
+ //! // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and values each
753
+ //! using BlockRadixSort = cub::BlockRadixSort<int, 128, 4, int>;
754
+ //!
755
+ //! // Allocate shared memory for BlockRadixSort
756
+ //! __shared__ typename BlockRadixSort::TempStorage temp_storage;
757
+ //!
758
+ //! // Obtain a segment of consecutive items that are blocked across threads
759
+ //! int thread_keys[4];
760
+ //! int thread_values[4];
761
+ //! ...
762
+ //!
763
+ //! // Collectively sort the keys and values among block threads
764
+ //! BlockRadixSort(temp_storage).Sort(thread_keys, thread_values);
765
+ //!
766
+ //!
767
+ //!
768
+ //! Suppose the set of input ``thread_keys`` across the block of threads is
769
+ //! ``{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }``. The
770
+ //! corresponding output ``thread_keys`` in those threads will be
771
+ //! ``{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }``.
772
+ //!
773
+ //! @endrst
774
+ //!
775
+ //! @param[in,out] keys
776
+ //! Keys to sort
777
+ //!
778
+ //! @param[in,out] values
779
+ //! Values to sort
780
+ //!
781
+ //! @param[in] begin_bit
782
+ //! **[optional]** The beginning (least-significant) bit index needed for key comparison
783
+ //!
784
+ //! @param[in] end_bit
785
+ //! **[optional]** The past-the-end (most-significant) bit index needed for key comparison
786
+ _CCCL_DEVICE _CCCL_FORCEINLINE void Sort(
787
+ KeyT (&keys)[ItemsPerThread], ValueT (&values)[ItemsPerThread], int begin_bit = 0, int end_bit = sizeof(KeyT) * 8)
788
+ {
789
+ SortBlocked(keys, values, begin_bit, end_bit, ::cuda::std::false_type(), detail::bool_constant_v<KEYS_ONLY>); // false_type selects the ascending RankKeys overload
790
+ }
791
+
792
+ //! @rst
793
+ //! Performs an ascending block-wide radix sort over a
794
+ //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys and values.
795
+ //!
796
+ //! * BlockRadixSort can only accommodate one associated tile of values. To "truck along"
797
+ //! more than one tile of values, simply perform a key-value sort of the keys paired
798
+ //! with a temporary value array that enumerates the key indices. The reordered indices
799
+ //! can then be used as a gather-vector for exchanging other associated tile data through
800
+ //! shared memory.
801
+ //! * @granularity
802
+ //! * @smemreuse
803
+ //!
804
+ //! Snippet
805
+ //! ==========================================================================
806
+ //!
807
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
808
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
809
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
810
+ //! tuple of references to relevant members of the key.
811
+ //!
812
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
813
+ //! :language: c++
814
+ //! :dedent:
815
+ //! :start-after: example-begin custom-type
816
+ //! :end-before: example-end custom-type
817
+ //!
818
+ //! The code snippet below illustrates a sort of 2 keys and values that
819
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 2 threads
820
+ //! where each thread owns 1 pair.
821
+ //!
822
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
823
+ //! :language: c++
824
+ //! :dedent:
825
+ //! :start-after: example-begin pairs-bits
826
+ //! :end-before: example-end pairs-bits
827
+ //!
828
+ //! @endrst
829
+ //!
830
+ //! @tparam DecomposerT
831
+ //! **[inferred]** Type of a callable object responsible for decomposing a
832
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
833
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
834
+ //! The leftmost element of the tuple is considered the most significant.
835
+ //! The call operator must not modify members of the key.
836
+ //!
837
+ //! @param[in,out] keys
838
+ //! Keys to sort
839
+ //!
840
+ //! @param[in,out] values
841
+ //! Values to sort
842
+ //!
843
+ //! @param decomposer
844
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
845
+ //! references to its constituent arithmetic types. The leftmost element of
846
+ //! the tuple is considered the most significant. The call operator must not
847
+ //! modify members of the key.
848
+ //!
849
+ //! @param[in] begin_bit
850
+ //! The least-significant bit index (inclusive) needed for
851
+ //! key comparison
852
+ //!
853
+ //! @param[in] end_bit
854
+ //! The most-significant bit index (exclusive) needed for key
855
+ //! comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`)
856
+ template <class DecomposerT>
857
+ _CCCL_DEVICE _CCCL_FORCEINLINE //
858
+ ::cuda::std::enable_if_t< // overload participates only when DecomposerT is not convertible to int
859
+ !::cuda::std::is_convertible_v<DecomposerT, int>>
860
+ Sort(
861
+ KeyT (&keys)[ItemsPerThread], ValueT (&values)[ItemsPerThread], DecomposerT decomposer, int begin_bit, int end_bit)
862
+ {
863
+ SortBlocked( // false_type selects the ascending RankKeys overload
864
+ keys, values, begin_bit, end_bit, ::cuda::std::false_type(), detail::bool_constant_v<KEYS_ONLY>, decomposer);
865
+ }
866
+
867
+ //! @rst
868
+ //! Performs an ascending block-wide radix sort over a
869
+ //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys and values.
870
+ //!
871
+ //! * BlockRadixSort can only accommodate one associated tile of values. To "truck along"
872
+ //! more than one tile of values, simply perform a key-value sort of the keys paired
873
+ //! with a temporary value array that enumerates the key indices. The reordered indices
874
+ //! can then be used as a gather-vector for exchanging other associated tile data through
875
+ //! shared memory.
876
+ //! * @granularity
877
+ //! * @smemreuse
878
+ //!
879
+ //! Snippet
880
+ //! ==========================================================================
881
+ //!
882
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
883
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
884
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
885
+ //! tuple of references to relevant members of the key.
886
+ //!
887
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
888
+ //! :language: c++
889
+ //! :dedent:
890
+ //! :start-after: example-begin custom-type
891
+ //! :end-before: example-end custom-type
892
+ //!
893
+ //! The code snippet below illustrates a sort of 6 keys and values that
894
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 2 threads
895
+ //! where each thread owns 3 consecutive pairs.
896
+ //!
897
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
898
+ //! :language: c++
899
+ //! :dedent:
900
+ //! :start-after: example-begin pairs
901
+ //! :end-before: example-end pairs
902
+ //!
903
+ //! @endrst
904
+ //!
905
+ //! @tparam DecomposerT
906
+ //! **[inferred]** Type of a callable object responsible for decomposing a
907
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
908
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
909
+ //! The leftmost element of the tuple is considered the most significant.
910
+ //! The call operator must not modify members of the key.
911
+ //!
912
+ //! @param[in,out] keys
913
+ //! Keys to sort
914
+ //!
915
+ //! @param[in,out] values
916
+ //! Values to sort
917
+ //!
918
+ //! @param decomposer
919
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
920
+ //! references to its constituent arithmetic types. The leftmost element of
921
+ //! the tuple is considered the most significant. The call operator must not
922
+ //! modify members of the key.
923
+ template <class DecomposerT>
924
+ _CCCL_DEVICE _CCCL_FORCEINLINE //
925
+ ::cuda::std::enable_if_t< // overload participates only when DecomposerT is not convertible to int
926
+ !::cuda::std::is_convertible_v<DecomposerT, int>>
927
+ Sort(KeyT (&keys)[ItemsPerThread], ValueT (&values)[ItemsPerThread], DecomposerT decomposer)
928
+ {
929
+ Sort(keys, values, decomposer, 0, detail::radix::traits_t<KeyT>::default_end_bit(decomposer)); // bit range [0, default_end_bit); presumably the full decomposed key width - TODO confirm
930
+ }
931
+
932
+ //! @rst
933
+ //! Performs a descending block-wide radix sort over a :ref:`blocked arrangement <flexible-data-arrangement>`
934
+ //! of keys.
935
+ //!
936
+ //! - @granularity
937
+ //! - @smemreuse
938
+ //!
939
+ //! Snippet
940
+ //! +++++++
941
+ //!
942
+ //! The code snippet below illustrates a sort of 512 integer keys that
943
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128 threads
944
+ //! where each thread owns 4 consecutive keys.
945
+ //!
946
+ //! .. code-block:: c++
947
+ //!
948
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_radix_sort.cuh>
949
+ //!
950
+ //! __global__ void ExampleKernel(...)
951
+ //! {
952
+ //! // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys each
953
+ //! using BlockRadixSort = cub::BlockRadixSort<int, 128, 4>;
954
+ //!
955
+ //! // Allocate shared memory for BlockRadixSort
956
+ //! __shared__ typename BlockRadixSort::TempStorage temp_storage;
957
+ //!
958
+ //! // Obtain a segment of consecutive items that are blocked across threads
959
+ //! int thread_keys[4];
960
+ //! ...
961
+ //!
962
+ //! // Collectively sort the keys
963
+ //! BlockRadixSort(temp_storage).SortDescending(thread_keys);
964
+ //!
965
+ //! Suppose the set of input ``thread_keys`` across the block of threads is
966
+ //! ``{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }``.
967
+ //! The corresponding output ``thread_keys`` in those threads will be
968
+ //! ``{ [511,510,509,508], [507,506,505,504], [503,502,501,500], ..., [3,2,1,0] }``.
969
+ //!
970
+ //! @endrst
971
+ //!
972
+ //! @param[in,out] keys
973
+ //! Keys to sort
974
+ //!
975
+ //! @param[in] begin_bit
976
+ //! **[optional]** The beginning (least-significant) bit index needed for key comparison
977
+ //!
978
+ //! @param[in] end_bit
979
+ //! **[optional]** The past-the-end (most-significant) bit index needed for key comparison
980
+ _CCCL_DEVICE _CCCL_FORCEINLINE void // Keys-only descending sort over bits [begin_bit, end_bit).
981
+ SortDescending(KeyT (&keys)[ItemsPerThread], int begin_bit = 0, int end_bit = sizeof(KeyT) * 8) // end_bit defaults to the bit width of KeyT.
982
+ {
983
+ NullType values[ItemsPerThread]; // Placeholder array: this overload receives no values, but SortBlocked takes a values argument.
984
+
985
+ SortBlocked(keys, values, begin_bit, end_bit, ::cuda::std::true_type(), detail::bool_constant_v<KEYS_ONLY>); // true_type is presumably the "descending" tag (the ascending overloads pass false_type).
986
+ }
987
+
988
+ //! @rst
989
+ //! Performs a descending block-wide radix sort over a
990
+ //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys.
991
+ //!
992
+ //! * @granularity
993
+ //! * @smemreuse
994
+ //!
995
+ //! Snippet
996
+ //! ==========================================================================
997
+ //!
998
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
999
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
1000
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
1001
+ //! tuple of references to relevant members of the key.
1002
+ //!
1003
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
1004
+ //! :language: c++
1005
+ //! :dedent:
1006
+ //! :start-after: example-begin custom-type
1007
+ //! :end-before: example-end custom-type
1008
+ //!
1009
+ //! The code snippet below illustrates a sort of 2 keys that
1010
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 2 threads
1011
+ //! where each thread owns 1 key.
1012
+ //!
1013
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
1014
+ //! :language: c++
1015
+ //! :dedent:
1016
+ //! :start-after: example-begin keys-descending-bits
1017
+ //! :end-before: example-end keys-descending-bits
1018
+ //!
1019
+ //! @endrst
1020
+ //!
1021
+ //! @tparam DecomposerT
1022
+ //! **[inferred]** Type of a callable object responsible for decomposing a
1023
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
1024
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
1025
+ //! The leftmost element of the tuple is considered the most significant.
1026
+ //! The call operator must not modify members of the key.
1027
+ //!
1028
+ //! @param[in,out] keys
1029
+ //! Keys to sort
1030
+ //!
1031
+ //! @param decomposer
1032
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
1033
+ //! references to its constituent arithmetic types. The leftmost element of
1034
+ //! the tuple is considered the most significant. The call operator must not
1035
+ //! modify members of the key.
1036
+ //!
1037
+ //! @param[in] begin_bit
1038
+ //! The least-significant bit index (inclusive) needed for
1039
+ //! key comparison
1040
+ //!
1041
+ //! @param[in] end_bit
1042
+ //! The most-significant bit index (exclusive) needed for key
1043
+ //! comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`)
1044
+ template <class DecomposerT> // Keys-only descending sort of decomposable keys over an explicit bit range.
1045
+ _CCCL_DEVICE _CCCL_FORCEINLINE //
1046
+ ::cuda::std::enable_if_t< // Enabled only when DecomposerT is not convertible to int; the return type is void.
1047
+ !::cuda::std::is_convertible_v<DecomposerT, int>>
1048
+ SortDescending(KeyT (&keys)[ItemsPerThread], DecomposerT decomposer, int begin_bit, int end_bit)
1049
+ {
1050
+ NullType values[ItemsPerThread]; // Placeholder values array; no values are supplied by the caller.
1051
+
1052
+ SortBlocked( // Forward the caller's bit range and the decomposer unchanged.
1053
+ keys, values, begin_bit, end_bit, ::cuda::std::true_type(), detail::bool_constant_v<KEYS_ONLY>, decomposer);
1054
+ }
1055
+
1056
+ //! @rst
1057
+ //! Performs a descending block-wide radix sort over a
1058
+ //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys.
1059
+ //!
1060
+ //! * @granularity
1061
+ //! * @smemreuse
1062
+ //!
1063
+ //! Snippet
1064
+ //! ==========================================================================
1065
+ //!
1066
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
1067
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
1068
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
1069
+ //! tuple of references to relevant members of the key.
1070
+ //!
1071
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
1072
+ //! :language: c++
1073
+ //! :dedent:
1074
+ //! :start-after: example-begin custom-type
1075
+ //! :end-before: example-end custom-type
1076
+ //!
1077
+ //! The code snippet below illustrates a sort of 6 keys that
1078
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 2 threads
1079
+ //! where each thread owns 3 consecutive keys.
1080
+ //!
1081
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
1082
+ //! :language: c++
1083
+ //! :dedent:
1084
+ //! :start-after: example-begin keys-descending
1085
+ //! :end-before: example-end keys-descending
1086
+ //!
1087
+ //! @endrst
1088
+ //!
1089
+ //! @tparam DecomposerT
1090
+ //! **[inferred]** Type of a callable object responsible for decomposing a
1091
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
1092
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
1093
+ //! The leftmost element of the tuple is considered the most significant.
1094
+ //! The call operator must not modify members of the key.
1095
+ //!
1096
+ //! @param[in,out] keys
1097
+ //! Keys to sort
1098
+ //!
1099
+ //! @param decomposer
1100
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
1101
+ //! references to its constituent arithmetic types. The leftmost element of
1102
+ //! the tuple is considered the most significant. The call operator must not
1103
+ //! modify members of the key.
1104
+ template <class DecomposerT> // Keys-only descending sort of decomposable keys using the default bit range.
1105
+ _CCCL_DEVICE _CCCL_FORCEINLINE //
1106
+ ::cuda::std::enable_if_t< // Enabled only when DecomposerT is not convertible to int; the return type is void.
1107
+ !::cuda::std::is_convertible_v<DecomposerT, int>>
1108
+ SortDescending(KeyT (&keys)[ItemsPerThread], DecomposerT decomposer)
1109
+ {
1110
+ NullType values[ItemsPerThread]; // Placeholder values array; no values are supplied by the caller.
1111
+
1112
+ SortBlocked(
1113
+ keys,
1114
+ values,
1115
+ 0, // begin_bit
1116
+ detail::radix::traits_t<KeyT>::default_end_bit(decomposer), // end_bit computed from the decomposer
1117
+ ::cuda::std::true_type(), // presumably the "descending" tag (ascending overloads pass false_type)
1118
+ detail::bool_constant_v<KEYS_ONLY>,
1119
+ decomposer);
1120
+ }
1121
+
1122
+ //! @rst
1123
+ //! Performs a descending block-wide radix sort across a :ref:`blocked arrangement <flexible-data-arrangement>`
1124
+ //! of keys and values.
1125
+ //!
1126
+ //! - BlockRadixSort can only accommodate one associated tile of values. To "truck along"
1127
+ //! more than one tile of values, simply perform a key-value sort of the keys paired
1128
+ //! with a temporary value array that enumerates the key indices. The reordered indices
1129
+ //! can then be used as a gather-vector for exchanging other associated tile data through
1130
+ //! shared memory.
1131
+ //! - @granularity
1132
+ //! - @smemreuse
1133
+ //!
1134
+ //! Snippet
1135
+ //! +++++++
1136
+ //!
1137
+ //! The code snippet below illustrates a sort of 512 integer keys and values that
1138
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128 threads
1139
+ //! where each thread owns 4 consecutive pairs.
1140
+ //!
1141
+ //! .. code-block:: c++
1142
+ //!
1143
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_radix_sort.cuh>
1144
+ //!
1145
+ //! __global__ void ExampleKernel(...)
1146
+ //! {
1147
+ //! // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and values each
1148
+ //! using BlockRadixSort = cub::BlockRadixSort<int, 128, 4, int>;
1149
+ //!
1150
+ //! // Allocate shared memory for BlockRadixSort
1151
+ //! __shared__ typename BlockRadixSort::TempStorage temp_storage;
1152
+ //!
1153
+ //! // Obtain a segment of consecutive items that are blocked across threads
1154
+ //! int thread_keys[4];
1155
+ //! int thread_values[4];
1156
+ //! ...
1157
+ //!
1158
+ //! // Collectively sort the keys and values among block threads
1159
+ //! BlockRadixSort(temp_storage).SortDescending(thread_keys, thread_values);
1160
+ //!
1161
+ //! Suppose the set of input ``thread_keys`` across the block of threads is
1162
+ //! ``{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }``. The
1163
+ //! corresponding output ``thread_keys`` in those threads will be
1164
+ //! ``{ [511,510,509,508], [507,506,505,504], [503,502,501,500], ..., [3,2,1,0] }``.
1165
+ //!
1166
+ //! @endrst
1167
+ //!
1168
+ //! @param[in,out] keys
1169
+ //! Keys to sort
1170
+ //!
1171
+ //! @param[in,out] values
1172
+ //! Values to sort
1173
+ //!
1174
+ //! @param[in] begin_bit
1175
+ //! **[optional]** The beginning (least-significant) bit index needed for key comparison
1176
+ //!
1177
+ //! @param[in] end_bit
1178
+ //! **[optional]** The past-the-end (most-significant) bit index needed for key comparison
1179
+ _CCCL_DEVICE _CCCL_FORCEINLINE void SortDescending( // Key-value descending sort over bits [begin_bit, end_bit).
1180
+ KeyT (&keys)[ItemsPerThread], ValueT (&values)[ItemsPerThread], int begin_bit = 0, int end_bit = sizeof(KeyT) * 8) // end_bit defaults to the bit width of KeyT.
1181
+ {
1182
+ SortBlocked(keys, values, begin_bit, end_bit, ::cuda::std::true_type(), detail::bool_constant_v<KEYS_ONLY>); // Caller's values are forwarded directly; true_type is presumably the "descending" tag.
1183
+ }
1184
+
1185
+ //! @rst
1186
+ //! Performs a descending block-wide radix sort over a
1187
+ //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys and values.
1188
+ //!
1189
+ //! * BlockRadixSort can only accommodate one associated tile of values. To "truck along"
1190
+ //! more than one tile of values, simply perform a key-value sort of the keys paired
1191
+ //! with a temporary value array that enumerates the key indices. The reordered indices
1192
+ //! can then be used as a gather-vector for exchanging other associated tile data through
1193
+ //! shared memory.
1194
+ //! * @granularity
1195
+ //! * @smemreuse
1196
+ //!
1197
+ //! Snippet
1198
+ //! ==========================================================================
1199
+ //!
1200
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
1201
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
1202
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
1203
+ //! tuple of references to relevant members of the key.
1204
+ //!
1205
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
1206
+ //! :language: c++
1207
+ //! :dedent:
1208
+ //! :start-after: example-begin custom-type
1209
+ //! :end-before: example-end custom-type
1210
+ //!
1211
+ //! The code snippet below illustrates a sort of 2 pairs that
1212
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 2 threads
1213
+ //! where each thread owns 1 pair.
1214
+ //!
1215
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
1216
+ //! :language: c++
1217
+ //! :dedent:
1218
+ //! :start-after: example-begin pairs-descending-bits
1219
+ //! :end-before: example-end pairs-descending-bits
1220
+ //!
1221
+ //! @endrst
1222
+ //!
1223
+ //! @tparam DecomposerT
1224
+ //! **[inferred]** Type of a callable object responsible for decomposing a
1225
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
1226
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
1227
+ //! The leftmost element of the tuple is considered the most significant.
1228
+ //! The call operator must not modify members of the key.
1229
+ //!
1230
+ //! @param[in,out] keys
1231
+ //! Keys to sort
1232
+ //!
1233
+ //! @param[in,out] values
1234
+ //! Values to sort
1235
+ //!
1236
+ //! @param decomposer
1237
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
1238
+ //! references to its constituent arithmetic types. The leftmost element of
1239
+ //! the tuple is considered the most significant. The call operator must not
1240
+ //! modify members of the key.
1241
+ //!
1242
+ //! @param[in] begin_bit
1243
+ //! The least-significant bit index (inclusive) needed for
1244
+ //! key comparison
1245
+ //!
1246
+ //! @param[in] end_bit
1247
+ //! The most-significant bit index (exclusive) needed for key
1248
+ //! comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`)
1249
+ template <class DecomposerT> // Key-value descending sort of decomposable keys over an explicit bit range.
1250
+ _CCCL_DEVICE _CCCL_FORCEINLINE //
1251
+ ::cuda::std::enable_if_t< // Enabled only when DecomposerT is not convertible to int; the return type is void.
1252
+ !::cuda::std::is_convertible_v<DecomposerT, int>>
1253
+ SortDescending(
1254
+ KeyT (&keys)[ItemsPerThread], ValueT (&values)[ItemsPerThread], DecomposerT decomposer, int begin_bit, int end_bit)
1255
+ {
1256
+ SortBlocked( // Forward keys, values, bit range and decomposer unchanged.
1257
+ keys, values, begin_bit, end_bit, ::cuda::std::true_type(), detail::bool_constant_v<KEYS_ONLY>, decomposer);
1258
+ }
1259
+
1260
+ //! @rst
1261
+ //! Performs a descending block-wide radix sort over a
1262
+ //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys and values.
1263
+ //!
1264
+ //! * BlockRadixSort can only accommodate one associated tile of values. To "truck along"
1265
+ //! more than one tile of values, simply perform a key-value sort of the keys paired
1266
+ //! with a temporary value array that enumerates the key indices. The reordered indices
1267
+ //! can then be used as a gather-vector for exchanging other associated tile data through
1268
+ //! shared memory.
1269
+ //! * @granularity
1270
+ //! * @smemreuse
1271
+ //!
1272
+ //! Snippet
1273
+ //! ==========================================================================
1274
+ //!
1275
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
1276
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
1277
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
1278
+ //! tuple of references to relevant members of the key.
1279
+ //!
1280
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
1281
+ //! :language: c++
1282
+ //! :dedent:
1283
+ //! :start-after: example-begin custom-type
1284
+ //! :end-before: example-end custom-type
1285
+ //!
1286
+ //! The code snippet below illustrates a sort of 6 keys and values that
1287
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 2 threads
1288
+ //! where each thread owns 3 consecutive pairs.
1289
+ //!
1290
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
1291
+ //! :language: c++
1292
+ //! :dedent:
1293
+ //! :start-after: example-begin pairs-descending
1294
+ //! :end-before: example-end pairs-descending
1295
+ //!
1296
+ //! @endrst
1297
+ //!
1298
+ //! @tparam DecomposerT
1299
+ //! **[inferred]** Type of a callable object responsible for decomposing a
1300
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
1301
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
1302
+ //! The leftmost element of the tuple is considered the most significant.
1303
+ //! The call operator must not modify members of the key.
1304
+ //!
1305
+ //! @param[in,out] keys
1306
+ //! Keys to sort
1307
+ //!
1308
+ //! @param[in,out] values
1309
+ //! Values to sort
1310
+ //!
1311
+ //! @param decomposer
1312
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
1313
+ //! references to its constituent arithmetic types. The leftmost element of
1314
+ //! the tuple is considered the most significant. The call operator must not
1315
+ //! modify members of the key.
1316
+ template <class DecomposerT> // Key-value descending sort of decomposable keys using the default bit range.
1317
+ _CCCL_DEVICE _CCCL_FORCEINLINE //
1318
+ ::cuda::std::enable_if_t< // Enabled only when DecomposerT is not convertible to int; the return type is void.
1319
+ !::cuda::std::is_convertible_v<DecomposerT, int>>
1320
+ SortDescending(KeyT (&keys)[ItemsPerThread], ValueT (&values)[ItemsPerThread], DecomposerT decomposer)
1321
+ {
1322
+ SortBlocked(
1323
+ keys,
1324
+ values,
1325
+ 0, // begin_bit
1326
+ detail::radix::traits_t<KeyT>::default_end_bit(decomposer), // end_bit computed from the decomposer
1327
+ ::cuda::std::true_type(), // presumably the "descending" tag (ascending overloads pass false_type)
1328
+ detail::bool_constant_v<KEYS_ONLY>,
1329
+ decomposer);
1330
+ }
1331
+
1332
+ //! @} end member group
1333
+ //! @name Sorting (blocked arrangement -> striped arrangement)
1334
+ //! @{
1335
+
1336
+ //! @rst
1337
+ //! Performs an ascending radix sort across a :ref:`blocked arrangement <flexible-data-arrangement>` of keys,
1338
+ //! leaving them in a :ref:`striped arrangement <flexible-data-arrangement>`.
1339
+ //!
1340
+ //! - @granularity
1341
+ //! - @smemreuse
1342
+ //!
1343
+ //! Snippet
1344
+ //! +++++++
1345
+ //!
1346
+ //! The code snippet below illustrates a sort of 512 integer keys that
1347
+ //! are initially partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128
1348
+ //! threads where each thread owns 4 consecutive keys. The final partitioning is striped.
1349
+ //!
1350
+ //! .. code-block:: c++
1351
+ //!
1352
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_radix_sort.cuh>
1353
+ //!
1354
+ //! __global__ void ExampleKernel(...)
1355
+ //! {
1356
+ //! // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys each
1357
+ //! using BlockRadixSort = cub::BlockRadixSort<int, 128, 4>;
1358
+ //!
1359
+ //! // Allocate shared memory for BlockRadixSort
1360
+ //! __shared__ typename BlockRadixSort::TempStorage temp_storage;
1361
+ //!
1362
+ //! // Obtain a segment of consecutive items that are blocked across threads
1363
+ //! int thread_keys[4];
1364
+ //! ...
1365
+ //!
1366
+ //! // Collectively sort the keys
1367
+ //! BlockRadixSort(temp_storage).SortBlockedToStriped(thread_keys);
1368
+ //!
1369
+ //! Suppose the set of input ``thread_keys`` across the block of threads is
1370
+ //! ``{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }``.
1371
+ //! The corresponding output ``thread_keys`` in those threads will be
1372
+ //! ``{ [0,128,256,384], [1,129,257,385], [2,130,258,386], ..., [127,255,383,511] }``.
1373
+ //!
1374
+ //! @endrst
1375
+ //!
1376
+ //! @param[in,out] keys
1377
+ //! Keys to sort
1378
+ //!
1379
+ //! @param[in] begin_bit
1380
+ //! **[optional]** The beginning (least-significant) bit index needed for key comparison
1381
+ //!
1382
+ //! @param[in] end_bit
1383
+ //! **[optional]** The past-the-end (most-significant) bit index needed for key comparison
1384
+ _CCCL_DEVICE _CCCL_FORCEINLINE void // Keys-only ascending blocked-to-striped sort over bits [begin_bit, end_bit).
1385
+ SortBlockedToStriped(KeyT (&keys)[ItemsPerThread], int begin_bit = 0, int end_bit = sizeof(KeyT) * 8) // end_bit defaults to the bit width of KeyT.
1386
+ {
1387
+ NullType values[ItemsPerThread]; // Placeholder array: this overload receives no values, but the callee takes a values argument.
1388
+
1389
+ SortBlockedToStriped( // Calls the overload that additionally takes two tag arguments (defined elsewhere).
1390
+ keys, values, begin_bit, end_bit, ::cuda::std::false_type(), detail::bool_constant_v<KEYS_ONLY>); // false_type is presumably the "not descending" tag.
1391
+ }
1392
+
1393
+ //! @rst
1394
+ //! Performs an ascending block-wide radix sort over a
1395
+ //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys, leaving them in a
1396
+ //! :ref:`striped arrangement <flexible-data-arrangement>`.
1397
+ //!
1398
+ //! * @granularity
1399
+ //! * @smemreuse
1400
+ //!
1401
+ //! Snippet
1402
+ //! ==========================================================================
1403
+ //!
1404
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
1405
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
1406
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
1407
+ //! tuple of references to relevant members of the key.
1408
+ //!
1409
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
1410
+ //! :language: c++
1411
+ //! :dedent:
1412
+ //! :start-after: example-begin custom-type
1413
+ //! :end-before: example-end custom-type
1414
+ //!
1415
+ //! The code snippet below illustrates a sort of 4 keys that
1416
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 2 threads
1417
+ //! where each thread owns 2 consecutive keys. The final partitioning is striped.
1418
+ //!
1419
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
1420
+ //! :language: c++
1421
+ //! :dedent:
1422
+ //! :start-after: example-begin keys-striped-bits
1423
+ //! :end-before: example-end keys-striped-bits
1424
+ //!
1425
+ //! @endrst
1426
+ //!
1427
+ //! @tparam DecomposerT
1428
+ //! **[inferred]** Type of a callable object responsible for decomposing a
1429
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
1430
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
1431
+ //! The leftmost element of the tuple is considered the most significant.
1432
+ //! The call operator must not modify members of the key.
1433
+ //!
1434
+ //! @param[in,out] keys
1435
+ //! Keys to sort
1436
+ //!
1437
+ //! @param decomposer
1438
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
1439
+ //! references to its constituent arithmetic types. The leftmost element of
1440
+ //! the tuple is considered the most significant. The call operator must not
1441
+ //! modify members of the key.
1442
+ //!
1443
+ //! @param[in] begin_bit
1444
+ //! The least-significant bit index (inclusive) needed for
1445
+ //! key comparison
1446
+ //!
1447
+ //! @param[in] end_bit
1448
+ //! The most-significant bit index (exclusive) needed for key
1449
+ //! comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`)
1450
+ template <class DecomposerT> // Keys-only ascending blocked-to-striped sort of decomposable keys over an explicit bit range.
1451
+ _CCCL_DEVICE _CCCL_FORCEINLINE //
1452
+ ::cuda::std::enable_if_t< // Enabled only when DecomposerT is not convertible to int; the return type is void.
1453
+ !::cuda::std::is_convertible_v<DecomposerT, int>>
1454
+ SortBlockedToStriped(KeyT (&keys)[ItemsPerThread], DecomposerT decomposer, int begin_bit, int end_bit)
1455
+ {
1456
+ NullType values[ItemsPerThread]; // Placeholder values array; no values are supplied by the caller.
1457
+
1458
+ SortBlockedToStriped( // Calls the overload that additionally takes two tag arguments and the decomposer (defined elsewhere).
1459
+ keys, values, begin_bit, end_bit, ::cuda::std::false_type(), detail::bool_constant_v<KEYS_ONLY>, decomposer);
1460
+ }
1461
+
1462
+ //! @rst
1463
+ //! Performs an ascending block-wide radix sort over a
1464
+ //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys, leaving them in a
1465
+ //! :ref:`striped arrangement <flexible-data-arrangement>`.
1466
+ //!
1467
+ //! * @granularity
1468
+ //! * @smemreuse
1469
+ //!
1470
+ //! Snippet
1471
+ //! ==========================================================================
1472
+ //!
1473
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
1474
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
1475
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
1476
+ //! tuple of references to relevant members of the key.
1477
+ //!
1478
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
1479
+ //! :language: c++
1480
+ //! :dedent:
1481
+ //! :start-after: example-begin custom-type
1482
+ //! :end-before: example-end custom-type
1483
+ //!
1484
+ //! The code snippet below illustrates a sort of 6 keys that
1485
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 2 threads
1486
+ //! where each thread owns 3 consecutive keys. The final partitioning is striped.
1487
+ //!
1488
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
1489
+ //! :language: c++
1490
+ //! :dedent:
1491
+ //! :start-after: example-begin keys-striped
1492
+ //! :end-before: example-end keys-striped
1493
+ //!
1494
+ //! @endrst
1495
+ //!
1496
+ //! @tparam DecomposerT
1497
+ //! **[inferred]** Type of a callable object responsible for decomposing a
1498
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
1499
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
1500
+ //! The leftmost element of the tuple is considered the most significant.
1501
+ //! The call operator must not modify members of the key.
1502
+ //!
1503
+ //! @param[in,out] keys
1504
+ //! Keys to sort
1505
+ //!
1506
+ //! @param decomposer
1507
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
1508
+ //! references to its constituent arithmetic types. The leftmost element of
1509
+ //! the tuple is considered the most significant. The call operator must not
1510
+ //! modify members of the key.
1511
+ template <class DecomposerT> // Keys-only ascending blocked-to-striped sort of decomposable keys using the default bit range.
1512
+ _CCCL_DEVICE _CCCL_FORCEINLINE //
1513
+ ::cuda::std::enable_if_t< // Enabled only when DecomposerT is not convertible to int; the return type is void.
1514
+ !::cuda::std::is_convertible_v<DecomposerT, int>>
1515
+ SortBlockedToStriped(KeyT (&keys)[ItemsPerThread], DecomposerT decomposer)
1516
+ {
1517
+ NullType values[ItemsPerThread]; // Placeholder values array; no values are supplied by the caller.
1518
+
1519
+ SortBlockedToStriped(
1520
+ keys,
1521
+ values,
1522
+ 0, // begin_bit
1523
+ detail::radix::traits_t<KeyT>::default_end_bit(decomposer), // end_bit computed from the decomposer
1524
+ ::cuda::std::false_type(), // presumably the "not descending" tag (SortDescending passes true_type)
1525
+ detail::bool_constant_v<KEYS_ONLY>,
1526
+ decomposer);
1527
+ }
1528
+
1529
+ //! @rst
1530
+ //! Performs an ascending radix sort across a :ref:`blocked arrangement <flexible-data-arrangement>` of keys and
1531
+ //! values, leaving them in a :ref:`striped arrangement <flexible-data-arrangement>`.
1532
+ //!
1533
+ //! - BlockRadixSort can only accommodate one associated tile of values. To "truck along"
1534
+ //! more than one tile of values, simply perform a key-value sort of the keys paired
1535
+ //! with a temporary value array that enumerates the key indices. The reordered indices
1536
+ //! can then be used as a gather-vector for exchanging other associated tile data through
1537
+ //! shared memory.
1538
+ //! - @granularity
1539
+ //! - @smemreuse
1540
+ //!
1541
+ //! Snippet
1542
+ //! +++++++
1543
+ //!
1544
+ //! The code snippet below illustrates a sort of 512 integer keys and values that
1545
+ //! are initially partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128
1546
+ //! threads where each thread owns 4 consecutive pairs. The final partitioning is striped.
1547
+ //!
1548
+ //! .. code-block:: c++
1549
+ //!
1550
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_radix_sort.cuh>
1551
+ //!
1552
+ //! __global__ void ExampleKernel(...)
1553
+ //! {
1554
+ //! // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and values each
1555
+ //! using BlockRadixSort = cub::BlockRadixSort<int, 128, 4, int>;
1556
+ //!
1557
+ //! // Allocate shared memory for BlockRadixSort
1558
+ //! __shared__ typename BlockRadixSort::TempStorage temp_storage;
1559
+ //!
1560
+ //! // Obtain a segment of consecutive items that are blocked across threads
1561
+ //! int thread_keys[4];
1562
+ //! int thread_values[4];
1563
+ //! ...
1564
+ //!
1565
+ //! // Collectively sort the keys and values among block threads
1566
+ //! BlockRadixSort(temp_storage).SortBlockedToStriped(thread_keys, thread_values);
1567
+ //!
1568
+ //! Suppose the set of input ``thread_keys`` across the block of threads is
1569
+ //! ``{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }``.
1570
+ //! The corresponding output ``thread_keys`` in those threads will be
1571
+ //! ``{ [0,128,256,384], [1,129,257,385], [2,130,258,386], ..., [127,255,383,511] }``.
1572
+ //!
1573
+ //! @endrst
1574
+ //!
1575
+ //! @param[in,out] keys
1576
+ //! Keys to sort
1577
+ //!
1578
+ //! @param[in,out] values
1579
+ //! Values to sort
1580
+ //!
1581
+ //! @param[in] begin_bit
1582
+ //! **[optional]** The beginning (least-significant) bit index needed for key comparison
1583
+ //!
1584
+ //! @param[in] end_bit
1585
+ //! **[optional]** The past-the-end (most-significant) bit index needed for key comparison
1586
+ _CCCL_DEVICE _CCCL_FORCEINLINE void SortBlockedToStriped( // Key-value ascending blocked-to-striped sort over bits [begin_bit, end_bit).
1587
+ KeyT (&keys)[ItemsPerThread], ValueT (&values)[ItemsPerThread], int begin_bit = 0, int end_bit = sizeof(KeyT) * 8) // end_bit defaults to the bit width of KeyT.
1588
+ {
1589
+ SortBlockedToStriped( // Calls the overload that additionally takes two tag arguments (defined elsewhere).
1590
+ keys, values, begin_bit, end_bit, ::cuda::std::false_type(), detail::bool_constant_v<KEYS_ONLY>); // false_type is presumably the "not descending" tag.
1591
+ }
1592
+
1593
+ //! @rst
1594
+ //! Performs an ascending block-wide radix sort over a
1595
+ //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys and values, leaving them in a
1596
+ //! :ref:`striped arrangement <flexible-data-arrangement>`.
1597
+ //!
1598
+ //! * @granularity
1599
+ //! * @smemreuse
1600
+ //!
1601
+ //! Snippet
1602
+ //! ==========================================================================
1603
+ //!
1604
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
1605
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
1606
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
1607
+ //! tuple of references to relevant members of the key.
1608
+ //!
1609
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
1610
+ //! :language: c++
1611
+ //! :dedent:
1612
+ //! :start-after: example-begin custom-type
1613
+ //! :end-before: example-end custom-type
1614
+ //!
1615
+ //! The code snippet below illustrates a sort of 4 pairs that
1616
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 2 threads
1617
+ //! where each thread owns 2 consecutive pairs. The final partitioning is striped.
1618
+ //!
1619
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
1620
+ //! :language: c++
1621
+ //! :dedent:
1622
+ //! :start-after: example-begin pairs-striped-bits
1623
+ //! :end-before: example-end pairs-striped-bits
1624
+ //!
1625
+ //! @endrst
1626
+ //!
1627
+ //! @tparam DecomposerT
1628
+ //! **[inferred]** Type of a callable object responsible for decomposing a
1629
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
1630
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
1631
+ //! The leftmost element of the tuple is considered the most significant.
1632
+ //! The call operator must not modify members of the key.
1633
+ //!
1634
+ //! @param[in,out] keys
1635
+ //! Keys to sort
1636
+ //!
1637
+ //! @param[in,out] values
1638
+ //! Values to sort
1639
+ //!
1640
+ //! @param decomposer
1641
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
1642
+ //! references to its constituent arithmetic types. The leftmost element of
1643
+ //! the tuple is considered the most significant. The call operator must not
1644
+ //! modify members of the key.
1645
+ //!
1646
+ //! @param[in] begin_bit
1647
+ //! The least-significant bit index (inclusive) needed for
1648
+ //! key comparison
1649
+ //!
1650
+ //! @param[in] end_bit
1651
+ //! The most-significant bit index (exclusive) needed for key
1652
+ //! comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`)
1653
+ template <class DecomposerT> // Key-value ascending blocked-to-striped sort of decomposable keys over an explicit bit range.
1654
+ _CCCL_DEVICE _CCCL_FORCEINLINE //
1655
+ ::cuda::std::enable_if_t< // Enabled only when DecomposerT is not convertible to int; the return type is void.
1656
+ !::cuda::std::is_convertible_v<DecomposerT, int>>
1657
+ SortBlockedToStriped(
1658
+ KeyT (&keys)[ItemsPerThread], ValueT (&values)[ItemsPerThread], DecomposerT decomposer, int begin_bit, int end_bit)
1659
+ {
1660
+ SortBlockedToStriped( // Calls the overload that additionally takes two tag arguments and the decomposer (defined elsewhere).
1661
+ keys, values, begin_bit, end_bit, ::cuda::std::false_type(), detail::bool_constant_v<KEYS_ONLY>, decomposer);
1662
+ }
1663
+
1664
+ //! @rst
1665
+ //! Performs an ascending block-wide radix sort over a
1666
+ //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys and values, leaving them in a
1667
+ //! :ref:`striped arrangement <flexible-data-arrangement>`.
1668
+ //!
1669
+ //! * @granularity
1670
+ //! * @smemreuse
1671
+ //!
1672
+ //! Snippet
1673
+ //! ==========================================================================
1674
+ //!
1675
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
1676
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
1677
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
1678
+ //! tuple of references to relevant members of the key.
1679
+ //!
1680
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
1681
+ //! :language: c++
1682
+ //! :dedent:
1683
+ //! :start-after: example-begin custom-type
1684
+ //! :end-before: example-end custom-type
1685
+ //!
1686
+ //! The code snippet below illustrates a sort of 6 pairs that
1687
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 2 threads
1688
+ //! where each thread owns 3 consecutive pairs. The final partitioning is striped.
1689
+ //!
1690
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
1691
+ //! :language: c++
1692
+ //! :dedent:
1693
+ //! :start-after: example-begin pairs-striped
1694
+ //! :end-before: example-end pairs-striped
1695
+ //!
1696
+ //! @endrst
1697
+ //!
1698
+ //! @tparam DecomposerT
1699
+ //! **[inferred]** Type of a callable object responsible for decomposing a
1700
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
1701
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
1702
+ //! The leftmost element of the tuple is considered the most significant.
1703
+ //! The call operator must not modify members of the key.
1704
+ //!
1705
+ //! @param[in,out] keys
1706
+ //! Keys to sort
1707
+ //!
1708
+ //! @param[in,out] values
1709
+ //! Values to sort
1710
+ //!
1711
+ //! @param decomposer
1712
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
1713
+ //! references to its constituent arithmetic types. The leftmost element of
1714
+ //! the tuple is considered the most significant. The call operator must not
1715
+ //! modify members of the key.
1716
+ template <class DecomposerT> // Ascending key-value sort, blocked -> striped, with a custom decomposer over the default bit range
1717
+ _CCCL_DEVICE _CCCL_FORCEINLINE //
1718
+ ::cuda::std::enable_if_t< // Return type is void; the overload drops out of resolution when the condition below is false
1719
+ !::cuda::std::is_convertible_v<DecomposerT, int>> // Participates only when DecomposerT is not convertible to int
1720
+ SortBlockedToStriped(KeyT (&keys)[ItemsPerThread], ValueT (&values)[ItemsPerThread], DecomposerT decomposer)
1721
+ {
1722
+ SortBlockedToStriped( // Forwards to the tag-dispatched SortBlockedToStriped overload (declared outside this excerpt)
1723
+ keys,
1724
+ values,
1725
+ 0, // begin_bit: start at bit 0
1726
+ detail::radix::traits_t<KeyT>::default_end_bit(decomposer), // end_bit: default obtained from the decomposer (presumably the full decomposed key width -- confirm in traits_t)
1727
+ ::cuda::std::false_type(), // false_type here vs. true_type in the SortDescending* wrappers: presumably the "descending" flag
1728
+ detail::bool_constant_v<KEYS_ONLY>, // KEYS_ONLY is defined outside this excerpt
1729
+ decomposer);
1730
+ }
1731
+
1732
+ //! @rst
1733
+ //! Performs a descending radix sort across a :ref:`blocked arrangement <flexible-data-arrangement>`
1734
+ //! of keys, leaving them in a :ref:`striped arrangement <flexible-data-arrangement>`.
1735
+ //!
1736
+ //! - @granularity
1737
+ //! - @smemreuse
1738
+ //!
1739
+ //! Snippet
1740
+ //! +++++++
1741
+ //!
1742
+ //! The code snippet below illustrates a sort of 512 integer keys that
1743
+ //! are initially partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128
1744
+ //! threads where each thread owns 4 consecutive keys. The final partitioning is striped.
1745
+ //!
1746
+ //! .. code-block:: c++
1747
+ //!
1748
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_radix_sort.cuh>
1749
+ //!
1750
+ //! __global__ void ExampleKernel(...)
1751
+ //! {
1752
+ //! // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys each
1753
+ //! using BlockRadixSort = cub::BlockRadixSort<int, 128, 4>;
1754
+ //!
1755
+ //! // Allocate shared memory for BlockRadixSort
1756
+ //! __shared__ typename BlockRadixSort::TempStorage temp_storage;
1757
+ //!
1758
+ //! // Obtain a segment of consecutive items that are blocked across threads
1759
+ //! int thread_keys[4];
1760
+ //! ...
1761
+ //!
1762
+ //! // Collectively sort the keys
1763
+ //! BlockRadixSort(temp_storage).SortDescendingBlockedToStriped(thread_keys);
1764
+ //!
1765
+ //! Suppose the set of input ``thread_keys`` across the block of threads is
1766
+ //! ``{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }``.
1767
+ //! The corresponding output ``thread_keys`` in those threads will be
1768
+ //! ``{ [511,383,255,127], [510,382,254,126], [509,381,253,125], ..., [384,256,128,0] }``.
1769
+ //!
1770
+ //! @endrst
1771
+ //!
1772
+ //! @param[in,out] keys
1773
+ //! Keys to sort
1774
+ //!
1775
+ //! @param[in] begin_bit
1776
+ //! **[optional]** The beginning (least-significant) bit index needed for key comparison
1777
+ //!
1778
+ //! @param[in] end_bit
1779
+ //! **[optional]** The past-the-end (most-significant) bit index needed for key comparison
1780
+ _CCCL_DEVICE _CCCL_FORCEINLINE void // Descending keys-only sort, blocked -> striped
1781
+ SortDescendingBlockedToStriped(KeyT (&keys)[ItemsPerThread], int begin_bit = 0, int end_bit = sizeof(KeyT) * 8) // Defaults cover bits [0, sizeof(KeyT) * 8)
1782
+ {
1783
+ NullType values[ItemsPerThread]; // Placeholder values array so the keys-only call can reuse the key-value forwarding target
1784
+
1785
+ SortBlockedToStriped(keys, values, begin_bit, end_bit, ::cuda::std::true_type(), detail::bool_constant_v<KEYS_ONLY>); // true_type here vs. false_type in the ascending wrappers: presumably the "descending" flag; KEYS_ONLY is defined outside this excerpt
1786
+ }
1787
+
1788
+ //! @rst
1789
+ //! Performs a descending block-wide radix sort over a
1790
+ //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys, leaving them in a
1791
+ //! :ref:`striped arrangement <flexible-data-arrangement>`.
1792
+ //!
1793
+ //! * @granularity
1794
+ //! * @smemreuse
1795
+ //!
1796
+ //! Snippet
1797
+ //! ==========================================================================
1798
+ //!
1799
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
1800
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
1801
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
1802
+ //! tuple of references to relevant members of the key.
1803
+ //!
1804
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
1805
+ //! :language: c++
1806
+ //! :dedent:
1807
+ //! :start-after: example-begin custom-type
1808
+ //! :end-before: example-end custom-type
1809
+ //!
1810
+ //! The code snippet below illustrates a sort of 4 keys that
1811
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 2 threads
1812
+ //! where each thread owns 2 consecutive keys. The final partitioning is striped.
1813
+ //!
1814
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
1815
+ //! :language: c++
1816
+ //! :dedent:
1817
+ //! :start-after: example-begin keys-striped-descending-bits
1818
+ //! :end-before: example-end keys-striped-descending-bits
1819
+ //!
1820
+ //! @endrst
1821
+ //!
1822
+ //! @tparam DecomposerT
1823
+ //! **[inferred]** Type of a callable object responsible for decomposing a
1824
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
1825
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
1826
+ //! The leftmost element of the tuple is considered the most significant.
1827
+ //! The call operator must not modify members of the key.
1828
+ //!
1829
+ //! @param[in,out] keys
1830
+ //! Keys to sort
1831
+ //!
1832
+ //! @param decomposer
1833
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
1834
+ //! references to its constituent arithmetic types. The leftmost element of
1835
+ //! the tuple is considered the most significant. The call operator must not
1836
+ //! modify members of the key.
1837
+ //!
1838
+ //! @param[in] begin_bit
1839
+ //! The least-significant bit index (inclusive) needed for
1840
+ //! key comparison
1841
+ //!
1842
+ //! @param[in] end_bit
1843
+ //! The most-significant bit index (exclusive) needed for key
1844
+ //! comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`)
1845
+ template <class DecomposerT> // Descending keys-only sort, blocked -> striped, with a custom decomposer and an explicit bit range
1846
+ _CCCL_DEVICE _CCCL_FORCEINLINE //
1847
+ ::cuda::std::enable_if_t< // Return type is void; the overload drops out of resolution when the condition below is false
1848
+ !::cuda::std::is_convertible_v<DecomposerT, int>> // Participates only when DecomposerT is not convertible to int
1849
+ SortDescendingBlockedToStriped(KeyT (&keys)[ItemsPerThread], DecomposerT decomposer, int begin_bit, int end_bit)
1850
+ {
1851
+ NullType values[ItemsPerThread]; // Placeholder values array so the keys-only call can reuse the key-value forwarding target
1852
+
1853
+ SortBlockedToStriped( // Forwards to the tag-dispatched SortBlockedToStriped overload (declared outside this excerpt)
1854
+ keys, values, begin_bit, end_bit, ::cuda::std::true_type(), detail::bool_constant_v<KEYS_ONLY>, decomposer); // true_type here vs. false_type in the ascending wrappers: presumably the "descending" flag; KEYS_ONLY is defined outside this excerpt
1855
+ }
1856
+
1857
+ //! @rst
1858
+ //! Performs a descending block-wide radix sort over a
1859
+ //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys, leaving them in a
1860
+ //! :ref:`striped arrangement <flexible-data-arrangement>`.
1861
+ //!
1862
+ //! * @granularity
1863
+ //! * @smemreuse
1864
+ //!
1865
+ //! Snippet
1866
+ //! ==========================================================================
1867
+ //!
1868
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
1869
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
1870
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
1871
+ //! tuple of references to relevant members of the key.
1872
+ //!
1873
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
1874
+ //! :language: c++
1875
+ //! :dedent:
1876
+ //! :start-after: example-begin custom-type
1877
+ //! :end-before: example-end custom-type
1878
+ //!
1879
+ //! The code snippet below illustrates a sort of 6 keys that
1880
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 2 threads
1881
+ //! where each thread owns 3 consecutive keys. The final partitioning is striped.
1882
+ //!
1883
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
1884
+ //! :language: c++
1885
+ //! :dedent:
1886
+ //! :start-after: example-begin keys-striped-descending
1887
+ //! :end-before: example-end keys-striped-descending
1888
+ //!
1889
+ //! @endrst
1890
+ //!
1891
+ //! @tparam DecomposerT
1892
+ //! **[inferred]** Type of a callable object responsible for decomposing a
1893
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
1894
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
1895
+ //! The leftmost element of the tuple is considered the most significant.
1896
+ //! The call operator must not modify members of the key.
1897
+ //!
1898
+ //! @param[in,out] keys
1899
+ //! Keys to sort
1900
+ //!
1901
+ //! @param decomposer
1902
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
1903
+ //! references to its constituent arithmetic types. The leftmost element of
1904
+ //! the tuple is considered the most significant. The call operator must not
1905
+ //! modify members of the key.
1906
+ template <class DecomposerT> // Descending keys-only sort, blocked -> striped, with a custom decomposer over the default bit range
1907
+ _CCCL_DEVICE _CCCL_FORCEINLINE //
1908
+ ::cuda::std::enable_if_t< // Return type is void; the overload drops out of resolution when the condition below is false
1909
+ !::cuda::std::is_convertible_v<DecomposerT, int>> // Participates only when DecomposerT is not convertible to int
1910
+ SortDescendingBlockedToStriped(KeyT (&keys)[ItemsPerThread], DecomposerT decomposer)
1911
+ {
1912
+ NullType values[ItemsPerThread]; // Placeholder values array so the keys-only call can reuse the key-value forwarding target
1913
+
1914
+ SortBlockedToStriped( // Forwards to the tag-dispatched SortBlockedToStriped overload (declared outside this excerpt)
1915
+ keys,
1916
+ values,
1917
+ 0, // begin_bit: start at bit 0
1918
+ detail::radix::traits_t<KeyT>::default_end_bit(decomposer), // end_bit: default obtained from the decomposer (presumably the full decomposed key width -- confirm in traits_t)
1919
+ ::cuda::std::true_type(), // true_type here vs. false_type in the ascending wrappers: presumably the "descending" flag
1920
+ detail::bool_constant_v<KEYS_ONLY>, // KEYS_ONLY is defined outside this excerpt
1921
+ decomposer);
1922
+ }
1923
+
1924
+ //! @rst
1925
+ //! Performs a descending radix sort across a :ref:`blocked arrangement <flexible-data-arrangement>`
1926
+ //! of keys and values, leaving them in a :ref:`striped arrangement <flexible-data-arrangement>`.
1927
+ //!
1928
+ //! - BlockRadixSort can only accommodate one associated tile of values. To "truck along"
1929
+ //! more than one tile of values, simply perform a key-value sort of the keys paired
1930
+ //! with a temporary value array that enumerates the key indices. The reordered indices
1931
+ //! can then be used as a gather-vector for exchanging other associated tile data through
1932
+ //! shared memory.
1933
+ //! - @granularity
1934
+ //! - @smemreuse
1935
+ //!
1936
+ //! Snippet
1937
+ //! +++++++
1938
+ //!
1939
+ //! The code snippet below illustrates a sort of 512 integer keys and values that
1940
+ //! are initially partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128
1941
+ //! threads where each thread owns 4 consecutive pairs. The final partitioning is striped.
1942
+ //!
1943
+ //! .. code-block:: c++
1944
+ //!
1945
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_radix_sort.cuh>
1946
+ //!
1947
+ //! __global__ void ExampleKernel(...)
1948
+ //! {
1949
+ //! // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and values each
1950
+ //! using BlockRadixSort = cub::BlockRadixSort<int, 128, 4, int>;
1951
+ //!
1952
+ //! // Allocate shared memory for BlockRadixSort
1953
+ //! __shared__ typename BlockRadixSort::TempStorage temp_storage;
1954
+ //!
1955
+ //! // Obtain a segment of consecutive items that are blocked across threads
1956
+ //! int thread_keys[4];
1957
+ //! int thread_values[4];
1958
+ //! ...
1959
+ //!
1960
+ //! // Collectively sort the keys and values among block threads
1961
+ //! BlockRadixSort(temp_storage).SortDescendingBlockedToStriped(thread_keys, thread_values);
1962
+ //!
1963
+ //! Suppose the set of input ``thread_keys`` across the block of threads is
1964
+ //! ``{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }``.
1965
+ //! The corresponding output ``thread_keys`` in those threads will be
1966
+ //! ``{ [511,383,255,127], [510,382,254,126], [509,381,253,125], ..., [384,256,128,0] }``.
1967
+ //!
1968
+ //! @endrst
1969
+ //!
1970
+ //! @param[in,out] keys
1971
+ //! Keys to sort
1972
+ //!
1973
+ //! @param[in,out] values
1974
+ //! Values to sort
1975
+ //!
1976
+ //! @param[in] begin_bit
1977
+ //! **[optional]** The beginning (least-significant) bit index needed for key comparison
1978
+ //!
1979
+ //! @param[in] end_bit
1980
+ //! **[optional]** The past-the-end (most-significant) bit index needed for key comparison
1981
+ _CCCL_DEVICE _CCCL_FORCEINLINE void SortDescendingBlockedToStriped( // Descending key-value sort, blocked -> striped
1982
+ KeyT (&keys)[ItemsPerThread], ValueT (&values)[ItemsPerThread], int begin_bit = 0, int end_bit = sizeof(KeyT) * 8) // Defaults cover bits [0, sizeof(KeyT) * 8)
1983
+ {
1984
+ SortBlockedToStriped(keys, values, begin_bit, end_bit, ::cuda::std::true_type(), detail::bool_constant_v<KEYS_ONLY>); // true_type here vs. false_type in the ascending wrappers: presumably the "descending" flag; KEYS_ONLY is defined outside this excerpt
1985
+ }
1986
+
1987
+ //! @rst
1988
+ //! Performs a descending block-wide radix sort over a
1989
+ //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys and values, leaving them in a
1990
+ //! :ref:`striped arrangement <flexible-data-arrangement>`.
1991
+ //!
1992
+ //! * @granularity
1993
+ //! * @smemreuse
1994
+ //!
1995
+ //! Snippet
1996
+ //! ==========================================================================
1997
+ //!
1998
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
1999
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
2000
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
2001
+ //! tuple of references to relevant members of the key.
2002
+ //!
2003
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
2004
+ //! :language: c++
2005
+ //! :dedent:
2006
+ //! :start-after: example-begin custom-type
2007
+ //! :end-before: example-end custom-type
2008
+ //!
2009
+ //! The code snippet below illustrates a sort of 4 keys and values that
2010
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 2 threads
2011
+ //! where each thread owns 2 consecutive pairs. The final partitioning is striped.
2012
+ //!
2013
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
2014
+ //! :language: c++
2015
+ //! :dedent:
2016
+ //! :start-after: example-begin pairs-striped-descending-bits
2017
+ //! :end-before: example-end pairs-striped-descending-bits
2018
+ //!
2019
+ //! @endrst
2020
+ //!
2021
+ //! @tparam DecomposerT
2022
+ //! **[inferred]** Type of a callable object responsible for decomposing a
2023
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
2024
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
2025
+ //! The leftmost element of the tuple is considered the most significant.
2026
+ //! The call operator must not modify members of the key.
2027
+ //!
2028
+ //! @param[in,out] keys
2029
+ //! Keys to sort
2030
+ //!
2031
+ //! @param[in,out] values
2032
+ //! Values to sort
2033
+ //!
2034
+ //! @param decomposer
2035
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
2036
+ //! references to its constituent arithmetic types. The leftmost element of
2037
+ //! the tuple is considered the most significant. The call operator must not
2038
+ //! modify members of the key.
2039
+ //!
2040
+ //! @param[in] begin_bit
2041
+ //! The least-significant bit index (inclusive) needed for
2042
+ //! key comparison
2043
+ //!
2044
+ //! @param[in] end_bit
2045
+ //! The most-significant bit index (exclusive) needed for key
2046
+ //! comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`)
2047
+ template <class DecomposerT> // Descending key-value sort, blocked -> striped, with a custom decomposer and an explicit bit range
2048
+ _CCCL_DEVICE _CCCL_FORCEINLINE //
2049
+ ::cuda::std::enable_if_t< // Return type is void; the overload drops out of resolution when the condition below is false
2050
+ !::cuda::std::is_convertible_v<DecomposerT, int>> // Participates only when DecomposerT is not convertible to int
2051
+ SortDescendingBlockedToStriped(
2052
+ KeyT (&keys)[ItemsPerThread], ValueT (&values)[ItemsPerThread], DecomposerT decomposer, int begin_bit, int end_bit)
2053
+ {
2054
+ SortBlockedToStriped( // Forwards to the tag-dispatched SortBlockedToStriped overload (declared outside this excerpt)
2055
+ keys, values, begin_bit, end_bit, ::cuda::std::true_type(), detail::bool_constant_v<KEYS_ONLY>, decomposer); // true_type here vs. false_type in the ascending wrappers: presumably the "descending" flag; KEYS_ONLY is defined outside this excerpt
2056
+ }
2057
+
2058
+ //! @rst
2059
+ //! Performs a descending block-wide radix sort over a
2060
+ //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys and values, leaving them in a
2061
+ //! :ref:`striped arrangement <flexible-data-arrangement>`.
2062
+ //!
2063
+ //! * @granularity
2064
+ //! * @smemreuse
2065
+ //!
2066
+ //! Snippet
2067
+ //! ==========================================================================
2068
+ //!
2069
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
2070
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
2071
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
2072
+ //! tuple of references to relevant members of the key.
2073
+ //!
2074
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
2075
+ //! :language: c++
2076
+ //! :dedent:
2077
+ //! :start-after: example-begin custom-type
2078
+ //! :end-before: example-end custom-type
2079
+ //!
2080
+ //! The code snippet below illustrates a sort of 6 keys and values that
2081
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 2 threads
2082
+ //! where each thread owns 3 consecutive pairs. The final partitioning is striped.
2083
+ //!
2084
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
2085
+ //! :language: c++
2086
+ //! :dedent:
2087
+ //! :start-after: example-begin pairs-striped-descending
2088
+ //! :end-before: example-end pairs-striped-descending
2089
+ //!
2090
+ //! @endrst
2091
+ //!
2092
+ //! @tparam DecomposerT
2093
+ //! **[inferred]** Type of a callable object responsible for decomposing a
2094
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
2095
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
2096
+ //! The leftmost element of the tuple is considered the most significant.
2097
+ //! The call operator must not modify members of the key.
2098
+ //!
2099
+ //! @param[in,out] keys
2100
+ //! Keys to sort
2101
+ //!
2102
+ //! @param[in,out] values
2103
+ //! Values to sort
2104
+ //!
2105
+ //! @param decomposer
2106
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
2107
+ //! references to its constituent arithmetic types. The leftmost element of
2108
+ //! the tuple is considered the most significant. The call operator must not
2109
+ //! modify members of the key.
2110
+ template <class DecomposerT> // Descending key-value sort, blocked -> striped, with a custom decomposer over the default bit range
2111
+ _CCCL_DEVICE _CCCL_FORCEINLINE //
2112
+ ::cuda::std::enable_if_t< // Return type is void; the overload drops out of resolution when the condition below is false
2113
+ !::cuda::std::is_convertible_v<DecomposerT, int>> // Participates only when DecomposerT is not convertible to int
2114
+ SortDescendingBlockedToStriped(KeyT (&keys)[ItemsPerThread], ValueT (&values)[ItemsPerThread], DecomposerT decomposer)
2115
+ {
2116
+ SortBlockedToStriped( // Forwards to the tag-dispatched SortBlockedToStriped overload (declared outside this excerpt)
2117
+ keys,
2118
+ values,
2119
+ 0, // begin_bit: start at bit 0
2120
+ detail::radix::traits_t<KeyT>::default_end_bit(decomposer), // end_bit: default obtained from the decomposer (presumably the full decomposed key width -- confirm in traits_t)
2121
+ ::cuda::std::true_type(), // true_type here vs. false_type in the ascending wrappers: presumably the "descending" flag
2122
+ detail::bool_constant_v<KEYS_ONLY>, // KEYS_ONLY is defined outside this excerpt
2123
+ decomposer);
2124
+ }
2125
+
2126
+ //@} end member group
2127
+ };
2128
+
2129
+ CUB_NAMESPACE_END