cuda-cccl 0.3.3__cp313-cp313-win_amd64.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cuda-cccl might be problematic. Click here for more details.

Files changed (1968):
  1. cuda/cccl/__init__.py +27 -0
  2. cuda/cccl/_cuda_version_utils.py +24 -0
  3. cuda/cccl/cooperative/__init__.py +9 -0
  4. cuda/cccl/cooperative/experimental/__init__.py +24 -0
  5. cuda/cccl/headers/__init__.py +7 -0
  6. cuda/cccl/headers/include/__init__.py +1 -0
  7. cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +259 -0
  8. cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +1182 -0
  9. cuda/cccl/headers/include/cub/agent/agent_for.cuh +81 -0
  10. cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +709 -0
  11. cuda/cccl/headers/include/cub/agent/agent_merge.cuh +234 -0
  12. cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +748 -0
  13. cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +786 -0
  14. cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +286 -0
  15. cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +703 -0
  16. cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +555 -0
  17. cuda/cccl/headers/include/cub/agent/agent_reduce.cuh +619 -0
  18. cuda/cccl/headers/include/cub/agent/agent_reduce_by_key.cuh +806 -0
  19. cuda/cccl/headers/include/cub/agent/agent_rle.cuh +1124 -0
  20. cuda/cccl/headers/include/cub/agent/agent_scan.cuh +589 -0
  21. cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +474 -0
  22. cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +289 -0
  23. cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +1117 -0
  24. cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +346 -0
  25. cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +606 -0
  26. cuda/cccl/headers/include/cub/agent/agent_topk.cuh +764 -0
  27. cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +631 -0
  28. cuda/cccl/headers/include/cub/agent/single_pass_scan_operators.cuh +1424 -0
  29. cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +963 -0
  30. cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +1227 -0
  31. cuda/cccl/headers/include/cub/block/block_exchange.cuh +1313 -0
  32. cuda/cccl/headers/include/cub/block/block_histogram.cuh +424 -0
  33. cuda/cccl/headers/include/cub/block/block_load.cuh +1264 -0
  34. cuda/cccl/headers/include/cub/block/block_load_to_shared.cuh +432 -0
  35. cuda/cccl/headers/include/cub/block/block_merge_sort.cuh +800 -0
  36. cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +1225 -0
  37. cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +2196 -0
  38. cuda/cccl/headers/include/cub/block/block_raking_layout.cuh +150 -0
  39. cuda/cccl/headers/include/cub/block/block_reduce.cuh +667 -0
  40. cuda/cccl/headers/include/cub/block/block_run_length_decode.cuh +434 -0
  41. cuda/cccl/headers/include/cub/block/block_scan.cuh +2315 -0
  42. cuda/cccl/headers/include/cub/block/block_shuffle.cuh +346 -0
  43. cuda/cccl/headers/include/cub/block/block_store.cuh +1247 -0
  44. cuda/cccl/headers/include/cub/block/radix_rank_sort_operations.cuh +624 -0
  45. cuda/cccl/headers/include/cub/block/specializations/block_histogram_atomic.cuh +86 -0
  46. cuda/cccl/headers/include/cub/block/specializations/block_histogram_sort.cuh +240 -0
  47. cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking.cuh +252 -0
  48. cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking_commutative_only.cuh +238 -0
  49. cuda/cccl/headers/include/cub/block/specializations/block_reduce_warp_reductions.cuh +281 -0
  50. cuda/cccl/headers/include/cub/block/specializations/block_scan_raking.cuh +790 -0
  51. cuda/cccl/headers/include/cub/block/specializations/block_scan_warp_scans.cuh +538 -0
  52. cuda/cccl/headers/include/cub/config.cuh +53 -0
  53. cuda/cccl/headers/include/cub/cub.cuh +120 -0
  54. cuda/cccl/headers/include/cub/detail/array_utils.cuh +78 -0
  55. cuda/cccl/headers/include/cub/detail/choose_offset.cuh +161 -0
  56. cuda/cccl/headers/include/cub/detail/detect_cuda_runtime.cuh +74 -0
  57. cuda/cccl/headers/include/cub/detail/device_double_buffer.cuh +96 -0
  58. cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +62 -0
  59. cuda/cccl/headers/include/cub/detail/fast_modulo_division.cuh +253 -0
  60. cuda/cccl/headers/include/cub/detail/integer_utils.cuh +88 -0
  61. cuda/cccl/headers/include/cub/detail/launcher/cuda_driver.cuh +142 -0
  62. cuda/cccl/headers/include/cub/detail/launcher/cuda_runtime.cuh +100 -0
  63. cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +114 -0
  64. cuda/cccl/headers/include/cub/detail/ptx-json/README.md +71 -0
  65. cuda/cccl/headers/include/cub/detail/ptx-json/array.h +68 -0
  66. cuda/cccl/headers/include/cub/detail/ptx-json/json.h +62 -0
  67. cuda/cccl/headers/include/cub/detail/ptx-json/object.h +100 -0
  68. cuda/cccl/headers/include/cub/detail/ptx-json/string.h +53 -0
  69. cuda/cccl/headers/include/cub/detail/ptx-json/value.h +95 -0
  70. cuda/cccl/headers/include/cub/detail/ptx-json-parser.h +63 -0
  71. cuda/cccl/headers/include/cub/detail/rfa.cuh +731 -0
  72. cuda/cccl/headers/include/cub/detail/strong_load.cuh +189 -0
  73. cuda/cccl/headers/include/cub/detail/strong_store.cuh +220 -0
  74. cuda/cccl/headers/include/cub/detail/temporary_storage.cuh +384 -0
  75. cuda/cccl/headers/include/cub/detail/type_traits.cuh +187 -0
  76. cuda/cccl/headers/include/cub/detail/uninitialized_copy.cuh +73 -0
  77. cuda/cccl/headers/include/cub/detail/unsafe_bitcast.cuh +56 -0
  78. cuda/cccl/headers/include/cub/device/device_adjacent_difference.cuh +596 -0
  79. cuda/cccl/headers/include/cub/device/device_copy.cuh +276 -0
  80. cuda/cccl/headers/include/cub/device/device_for.cuh +1063 -0
  81. cuda/cccl/headers/include/cub/device/device_histogram.cuh +1509 -0
  82. cuda/cccl/headers/include/cub/device/device_memcpy.cuh +195 -0
  83. cuda/cccl/headers/include/cub/device/device_merge.cuh +203 -0
  84. cuda/cccl/headers/include/cub/device/device_merge_sort.cuh +979 -0
  85. cuda/cccl/headers/include/cub/device/device_partition.cuh +668 -0
  86. cuda/cccl/headers/include/cub/device/device_radix_sort.cuh +3437 -0
  87. cuda/cccl/headers/include/cub/device/device_reduce.cuh +2518 -0
  88. cuda/cccl/headers/include/cub/device/device_run_length_encode.cuh +370 -0
  89. cuda/cccl/headers/include/cub/device/device_scan.cuh +2212 -0
  90. cuda/cccl/headers/include/cub/device/device_segmented_radix_sort.cuh +1496 -0
  91. cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +1430 -0
  92. cuda/cccl/headers/include/cub/device/device_segmented_sort.cuh +2811 -0
  93. cuda/cccl/headers/include/cub/device/device_select.cuh +1228 -0
  94. cuda/cccl/headers/include/cub/device/device_topk.cuh +511 -0
  95. cuda/cccl/headers/include/cub/device/device_transform.cuh +668 -0
  96. cuda/cccl/headers/include/cub/device/dispatch/dispatch_adjacent_difference.cuh +315 -0
  97. cuda/cccl/headers/include/cub/device/dispatch/dispatch_batch_memcpy.cuh +719 -0
  98. cuda/cccl/headers/include/cub/device/dispatch/dispatch_common.cuh +43 -0
  99. cuda/cccl/headers/include/cub/device/dispatch/dispatch_copy_mdspan.cuh +79 -0
  100. cuda/cccl/headers/include/cub/device/dispatch/dispatch_for.cuh +198 -0
  101. cuda/cccl/headers/include/cub/device/dispatch/dispatch_histogram.cuh +1046 -0
  102. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +303 -0
  103. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge_sort.cuh +473 -0
  104. cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +1744 -0
  105. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +1310 -0
  106. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_by_key.cuh +655 -0
  107. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +531 -0
  108. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +313 -0
  109. cuda/cccl/headers/include/cub/device/dispatch/dispatch_rle.cuh +615 -0
  110. cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan.cuh +517 -0
  111. cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan_by_key.cuh +602 -0
  112. cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +975 -0
  113. cuda/cccl/headers/include/cub/device/dispatch/dispatch_select_if.cuh +842 -0
  114. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +341 -0
  115. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +440 -0
  116. cuda/cccl/headers/include/cub/device/dispatch/dispatch_three_way_partition.cuh +389 -0
  117. cuda/cccl/headers/include/cub/device/dispatch/dispatch_topk.cuh +627 -0
  118. cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +569 -0
  119. cuda/cccl/headers/include/cub/device/dispatch/dispatch_unique_by_key.cuh +545 -0
  120. cuda/cccl/headers/include/cub/device/dispatch/kernels/for_each.cuh +261 -0
  121. cuda/cccl/headers/include/cub/device/dispatch/kernels/histogram.cuh +505 -0
  122. cuda/cccl/headers/include/cub/device/dispatch/kernels/merge_sort.cuh +334 -0
  123. cuda/cccl/headers/include/cub/device/dispatch/kernels/radix_sort.cuh +803 -0
  124. cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +583 -0
  125. cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +189 -0
  126. cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +321 -0
  127. cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_sort.cuh +522 -0
  128. cuda/cccl/headers/include/cub/device/dispatch/kernels/three_way_partition.cuh +201 -0
  129. cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +1028 -0
  130. cuda/cccl/headers/include/cub/device/dispatch/kernels/unique_by_key.cuh +176 -0
  131. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +67 -0
  132. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +118 -0
  133. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +60 -0
  134. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +275 -0
  135. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +76 -0
  136. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +126 -0
  137. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +1065 -0
  138. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce.cuh +493 -0
  139. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +942 -0
  140. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +673 -0
  141. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +618 -0
  142. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +1010 -0
  143. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +398 -0
  144. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_select_if.cuh +1588 -0
  145. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +440 -0
  146. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_topk.cuh +85 -0
  147. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +481 -0
  148. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +884 -0
  149. cuda/cccl/headers/include/cub/grid/grid_even_share.cuh +227 -0
  150. cuda/cccl/headers/include/cub/grid/grid_mapping.cuh +106 -0
  151. cuda/cccl/headers/include/cub/grid/grid_queue.cuh +202 -0
  152. cuda/cccl/headers/include/cub/iterator/arg_index_input_iterator.cuh +254 -0
  153. cuda/cccl/headers/include/cub/iterator/cache_modified_input_iterator.cuh +259 -0
  154. cuda/cccl/headers/include/cub/iterator/cache_modified_output_iterator.cuh +250 -0
  155. cuda/cccl/headers/include/cub/iterator/tex_obj_input_iterator.cuh +320 -0
  156. cuda/cccl/headers/include/cub/thread/thread_load.cuh +349 -0
  157. cuda/cccl/headers/include/cub/thread/thread_operators.cuh +688 -0
  158. cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +548 -0
  159. cuda/cccl/headers/include/cub/thread/thread_scan.cuh +498 -0
  160. cuda/cccl/headers/include/cub/thread/thread_search.cuh +199 -0
  161. cuda/cccl/headers/include/cub/thread/thread_simd.cuh +458 -0
  162. cuda/cccl/headers/include/cub/thread/thread_sort.cuh +102 -0
  163. cuda/cccl/headers/include/cub/thread/thread_store.cuh +365 -0
  164. cuda/cccl/headers/include/cub/util_allocator.cuh +921 -0
  165. cuda/cccl/headers/include/cub/util_arch.cuh +167 -0
  166. cuda/cccl/headers/include/cub/util_cpp_dialect.cuh +95 -0
  167. cuda/cccl/headers/include/cub/util_debug.cuh +207 -0
  168. cuda/cccl/headers/include/cub/util_device.cuh +800 -0
  169. cuda/cccl/headers/include/cub/util_macro.cuh +97 -0
  170. cuda/cccl/headers/include/cub/util_math.cuh +118 -0
  171. cuda/cccl/headers/include/cub/util_namespace.cuh +176 -0
  172. cuda/cccl/headers/include/cub/util_policy_wrapper_t.cuh +55 -0
  173. cuda/cccl/headers/include/cub/util_ptx.cuh +513 -0
  174. cuda/cccl/headers/include/cub/util_temporary_storage.cuh +122 -0
  175. cuda/cccl/headers/include/cub/util_type.cuh +1120 -0
  176. cuda/cccl/headers/include/cub/util_vsmem.cuh +253 -0
  177. cuda/cccl/headers/include/cub/version.cuh +89 -0
  178. cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_shfl.cuh +329 -0
  179. cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_smem.cuh +177 -0
  180. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +737 -0
  181. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +408 -0
  182. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +952 -0
  183. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_smem.cuh +715 -0
  184. cuda/cccl/headers/include/cub/warp/warp_exchange.cuh +405 -0
  185. cuda/cccl/headers/include/cub/warp/warp_load.cuh +614 -0
  186. cuda/cccl/headers/include/cub/warp/warp_merge_sort.cuh +169 -0
  187. cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +829 -0
  188. cuda/cccl/headers/include/cub/warp/warp_scan.cuh +1890 -0
  189. cuda/cccl/headers/include/cub/warp/warp_store.cuh +521 -0
  190. cuda/cccl/headers/include/cub/warp/warp_utils.cuh +61 -0
  191. cuda/cccl/headers/include/cuda/__algorithm/common.h +68 -0
  192. cuda/cccl/headers/include/cuda/__algorithm/copy.h +196 -0
  193. cuda/cccl/headers/include/cuda/__algorithm/fill.h +107 -0
  194. cuda/cccl/headers/include/cuda/__annotated_ptr/access_property.h +165 -0
  195. cuda/cccl/headers/include/cuda/__annotated_ptr/access_property_encoding.h +172 -0
  196. cuda/cccl/headers/include/cuda/__annotated_ptr/annotated_ptr.h +217 -0
  197. cuda/cccl/headers/include/cuda/__annotated_ptr/annotated_ptr_base.h +100 -0
  198. cuda/cccl/headers/include/cuda/__annotated_ptr/apply_access_property.h +83 -0
  199. cuda/cccl/headers/include/cuda/__annotated_ptr/associate_access_property.h +128 -0
  200. cuda/cccl/headers/include/cuda/__annotated_ptr/createpolicy.h +210 -0
  201. cuda/cccl/headers/include/cuda/__atomic/atomic.h +145 -0
  202. cuda/cccl/headers/include/cuda/__barrier/async_contract_fulfillment.h +39 -0
  203. cuda/cccl/headers/include/cuda/__barrier/barrier.h +65 -0
  204. cuda/cccl/headers/include/cuda/__barrier/barrier_arrive_tx.h +102 -0
  205. cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +487 -0
  206. cuda/cccl/headers/include/cuda/__barrier/barrier_expect_tx.h +74 -0
  207. cuda/cccl/headers/include/cuda/__barrier/barrier_native_handle.h +45 -0
  208. cuda/cccl/headers/include/cuda/__barrier/barrier_thread_scope.h +60 -0
  209. cuda/cccl/headers/include/cuda/__bit/bit_reverse.h +171 -0
  210. cuda/cccl/headers/include/cuda/__bit/bitfield.h +122 -0
  211. cuda/cccl/headers/include/cuda/__bit/bitmask.h +90 -0
  212. cuda/cccl/headers/include/cuda/__cccl_config +37 -0
  213. cuda/cccl/headers/include/cuda/__cmath/ceil_div.h +124 -0
  214. cuda/cccl/headers/include/cuda/__cmath/fast_modulo_division.h +178 -0
  215. cuda/cccl/headers/include/cuda/__cmath/ilog.h +195 -0
  216. cuda/cccl/headers/include/cuda/__cmath/ipow.h +107 -0
  217. cuda/cccl/headers/include/cuda/__cmath/isqrt.h +80 -0
  218. cuda/cccl/headers/include/cuda/__cmath/mul_hi.h +146 -0
  219. cuda/cccl/headers/include/cuda/__cmath/neg.h +47 -0
  220. cuda/cccl/headers/include/cuda/__cmath/pow2.h +74 -0
  221. cuda/cccl/headers/include/cuda/__cmath/round_down.h +102 -0
  222. cuda/cccl/headers/include/cuda/__cmath/round_up.h +104 -0
  223. cuda/cccl/headers/include/cuda/__cmath/uabs.h +57 -0
  224. cuda/cccl/headers/include/cuda/__complex/complex.h +238 -0
  225. cuda/cccl/headers/include/cuda/__complex/get_real_imag.h +89 -0
  226. cuda/cccl/headers/include/cuda/__complex/traits.h +64 -0
  227. cuda/cccl/headers/include/cuda/__complex_ +28 -0
  228. cuda/cccl/headers/include/cuda/__device/all_devices.h +140 -0
  229. cuda/cccl/headers/include/cuda/__device/arch_id.h +176 -0
  230. cuda/cccl/headers/include/cuda/__device/arch_traits.h +537 -0
  231. cuda/cccl/headers/include/cuda/__device/attributes.h +772 -0
  232. cuda/cccl/headers/include/cuda/__device/compute_capability.h +171 -0
  233. cuda/cccl/headers/include/cuda/__device/device_ref.h +156 -0
  234. cuda/cccl/headers/include/cuda/__device/physical_device.h +172 -0
  235. cuda/cccl/headers/include/cuda/__driver/driver_api.h +835 -0
  236. cuda/cccl/headers/include/cuda/__event/event.h +171 -0
  237. cuda/cccl/headers/include/cuda/__event/event_ref.h +157 -0
  238. cuda/cccl/headers/include/cuda/__event/timed_event.h +120 -0
  239. cuda/cccl/headers/include/cuda/__execution/determinism.h +91 -0
  240. cuda/cccl/headers/include/cuda/__execution/output_ordering.h +89 -0
  241. cuda/cccl/headers/include/cuda/__execution/require.h +75 -0
  242. cuda/cccl/headers/include/cuda/__execution/tune.h +70 -0
  243. cuda/cccl/headers/include/cuda/__functional/address_stability.h +131 -0
  244. cuda/cccl/headers/include/cuda/__functional/for_each_canceled.h +321 -0
  245. cuda/cccl/headers/include/cuda/__functional/maximum.h +58 -0
  246. cuda/cccl/headers/include/cuda/__functional/minimum.h +58 -0
  247. cuda/cccl/headers/include/cuda/__functional/proclaim_return_type.h +108 -0
  248. cuda/cccl/headers/include/cuda/__fwd/barrier.h +38 -0
  249. cuda/cccl/headers/include/cuda/__fwd/barrier_native_handle.h +42 -0
  250. cuda/cccl/headers/include/cuda/__fwd/complex.h +48 -0
  251. cuda/cccl/headers/include/cuda/__fwd/devices.h +44 -0
  252. cuda/cccl/headers/include/cuda/__fwd/get_stream.h +38 -0
  253. cuda/cccl/headers/include/cuda/__fwd/pipeline.h +37 -0
  254. cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +58 -0
  255. cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +315 -0
  256. cuda/cccl/headers/include/cuda/__iterator/counting_iterator.h +483 -0
  257. cuda/cccl/headers/include/cuda/__iterator/discard_iterator.h +324 -0
  258. cuda/cccl/headers/include/cuda/__iterator/permutation_iterator.h +456 -0
  259. cuda/cccl/headers/include/cuda/__iterator/shuffle_iterator.h +334 -0
  260. cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +418 -0
  261. cuda/cccl/headers/include/cuda/__iterator/tabulate_output_iterator.h +367 -0
  262. cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +528 -0
  263. cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +527 -0
  264. cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +486 -0
  265. cuda/cccl/headers/include/cuda/__iterator/zip_common.h +148 -0
  266. cuda/cccl/headers/include/cuda/__iterator/zip_function.h +112 -0
  267. cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +557 -0
  268. cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +592 -0
  269. cuda/cccl/headers/include/cuda/__latch/latch.h +44 -0
  270. cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +533 -0
  271. cuda/cccl/headers/include/cuda/__mdspan/host_device_mdspan.h +238 -0
  272. cuda/cccl/headers/include/cuda/__mdspan/restrict_accessor.h +152 -0
  273. cuda/cccl/headers/include/cuda/__mdspan/restrict_mdspan.h +117 -0
  274. cuda/cccl/headers/include/cuda/__memcpy_async/check_preconditions.h +79 -0
  275. cuda/cccl/headers/include/cuda/__memcpy_async/completion_mechanism.h +47 -0
  276. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_bulk_shared_global.h +60 -0
  277. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_fallback.h +72 -0
  278. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_shared_global.h +148 -0
  279. cuda/cccl/headers/include/cuda/__memcpy_async/dispatch_memcpy_async.h +165 -0
  280. cuda/cccl/headers/include/cuda/__memcpy_async/is_local_smem_barrier.h +53 -0
  281. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async.h +179 -0
  282. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_barrier.h +99 -0
  283. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_tx.h +104 -0
  284. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_completion.h +170 -0
  285. cuda/cccl/headers/include/cuda/__memcpy_async/try_get_barrier_handle.h +59 -0
  286. cuda/cccl/headers/include/cuda/__memory/address_space.h +227 -0
  287. cuda/cccl/headers/include/cuda/__memory/align_down.h +56 -0
  288. cuda/cccl/headers/include/cuda/__memory/align_up.h +56 -0
  289. cuda/cccl/headers/include/cuda/__memory/aligned_size.h +61 -0
  290. cuda/cccl/headers/include/cuda/__memory/check_address.h +111 -0
  291. cuda/cccl/headers/include/cuda/__memory/discard_memory.h +64 -0
  292. cuda/cccl/headers/include/cuda/__memory/get_device_address.h +58 -0
  293. cuda/cccl/headers/include/cuda/__memory/is_aligned.h +47 -0
  294. cuda/cccl/headers/include/cuda/__memory/ptr_in_range.h +93 -0
  295. cuda/cccl/headers/include/cuda/__memory/ptr_rebind.h +75 -0
  296. cuda/cccl/headers/include/cuda/__memory_resource/get_memory_resource.h +82 -0
  297. cuda/cccl/headers/include/cuda/__memory_resource/get_property.h +153 -0
  298. cuda/cccl/headers/include/cuda/__memory_resource/properties.h +113 -0
  299. cuda/cccl/headers/include/cuda/__memory_resource/resource.h +125 -0
  300. cuda/cccl/headers/include/cuda/__memory_resource/resource_ref.h +652 -0
  301. cuda/cccl/headers/include/cuda/__numeric/add_overflow.h +306 -0
  302. cuda/cccl/headers/include/cuda/__numeric/narrow.h +108 -0
  303. cuda/cccl/headers/include/cuda/__numeric/overflow_cast.h +59 -0
  304. cuda/cccl/headers/include/cuda/__numeric/overflow_result.h +43 -0
  305. cuda/cccl/headers/include/cuda/__nvtx/nvtx.h +120 -0
  306. cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +2983 -0
  307. cuda/cccl/headers/include/cuda/__ptx/instructions/barrier_cluster.h +43 -0
  308. cuda/cccl/headers/include/cuda/__ptx/instructions/bfind.h +41 -0
  309. cuda/cccl/headers/include/cuda/__ptx/instructions/bmsk.h +41 -0
  310. cuda/cccl/headers/include/cuda/__ptx/instructions/clusterlaunchcontrol.h +41 -0
  311. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk.h +44 -0
  312. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_commit_group.h +43 -0
  313. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h +45 -0
  314. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_wait_group.h +43 -0
  315. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_mbarrier_arrive.h +42 -0
  316. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_reduce_async_bulk.h +60 -0
  317. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_reduce_async_bulk_tensor.h +43 -0
  318. cuda/cccl/headers/include/cuda/__ptx/instructions/elect_sync.h +41 -0
  319. cuda/cccl/headers/include/cuda/__ptx/instructions/exit.h +41 -0
  320. cuda/cccl/headers/include/cuda/__ptx/instructions/fence.h +49 -0
  321. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/barrier_cluster.h +115 -0
  322. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/bfind.h +190 -0
  323. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/bmsk.h +54 -0
  324. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/clusterlaunchcontrol.h +242 -0
  325. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk.h +197 -0
  326. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.h +25 -0
  327. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.h +54 -0
  328. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.h +997 -0
  329. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_gather_scatter.h +318 -0
  330. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.h +671 -0
  331. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.h +46 -0
  332. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive.h +26 -0
  333. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive_noinc.h +26 -0
  334. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.h +1470 -0
  335. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.h +132 -0
  336. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.h +132 -0
  337. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.h +601 -0
  338. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/elect_sync.h +36 -0
  339. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/exit.h +25 -0
  340. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence.h +208 -0
  341. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.h +31 -0
  342. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_alias.h +25 -0
  343. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_async.h +58 -0
  344. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_async_generic_sync_restrict.h +64 -0
  345. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.h +102 -0
  346. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_sync_restrict.h +64 -0
  347. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/get_sreg.h +949 -0
  348. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/getctarank.h +32 -0
  349. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/ld.h +5542 -0
  350. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive.h +399 -0
  351. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.h +184 -0
  352. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.h +34 -0
  353. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_expect_tx.h +102 -0
  354. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_init.h +27 -0
  355. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.h +143 -0
  356. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.h +144 -0
  357. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.h +286 -0
  358. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.h +290 -0
  359. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_ld_reduce.h +2202 -0
  360. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_red.h +1362 -0
  361. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_st.h +236 -0
  362. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/prmt.h +230 -0
  363. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/red_async.h +460 -0
  364. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/shl.h +96 -0
  365. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/shr.h +168 -0
  366. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st.h +1490 -0
  367. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st_async.h +123 -0
  368. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st_bulk.h +31 -0
  369. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_alloc.h +132 -0
  370. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_commit.h +99 -0
  371. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_cp.h +765 -0
  372. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_fence.h +58 -0
  373. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_ld.h +4927 -0
  374. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_mma.h +4291 -0
  375. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_mma_ws.h +7110 -0
  376. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_shift.h +42 -0
  377. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_st.h +5063 -0
  378. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_wait.h +56 -0
  379. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.h +71 -0
  380. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tensormap_replace.h +1030 -0
  381. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/trap.h +25 -0
  382. cuda/cccl/headers/include/cuda/__ptx/instructions/get_sreg.h +43 -0
  383. cuda/cccl/headers/include/cuda/__ptx/instructions/getctarank.h +43 -0
  384. cuda/cccl/headers/include/cuda/__ptx/instructions/ld.h +41 -0
  385. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_arrive.h +45 -0
  386. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_expect_tx.h +41 -0
  387. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_init.h +43 -0
  388. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_wait.h +46 -0
  389. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_ld_reduce.h +41 -0
  390. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_red.h +41 -0
  391. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_st.h +41 -0
  392. cuda/cccl/headers/include/cuda/__ptx/instructions/prmt.h +41 -0
  393. cuda/cccl/headers/include/cuda/__ptx/instructions/red_async.h +43 -0
  394. cuda/cccl/headers/include/cuda/__ptx/instructions/shfl_sync.h +244 -0
  395. cuda/cccl/headers/include/cuda/__ptx/instructions/shl.h +41 -0
  396. cuda/cccl/headers/include/cuda/__ptx/instructions/shr.h +41 -0
  397. cuda/cccl/headers/include/cuda/__ptx/instructions/st.h +41 -0
  398. cuda/cccl/headers/include/cuda/__ptx/instructions/st_async.h +43 -0
  399. cuda/cccl/headers/include/cuda/__ptx/instructions/st_bulk.h +41 -0
  400. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_alloc.h +41 -0
  401. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_commit.h +41 -0
  402. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_cp.h +41 -0
  403. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_fence.h +41 -0
  404. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_ld.h +41 -0
  405. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_mma.h +41 -0
  406. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_mma_ws.h +41 -0
  407. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_shift.h +41 -0
  408. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_st.h +41 -0
  409. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_wait.h +41 -0
  410. cuda/cccl/headers/include/cuda/__ptx/instructions/tensormap_cp_fenceproxy.h +43 -0
  411. cuda/cccl/headers/include/cuda/__ptx/instructions/tensormap_replace.h +43 -0
  412. cuda/cccl/headers/include/cuda/__ptx/instructions/trap.h +41 -0
  413. cuda/cccl/headers/include/cuda/__ptx/pragmas/enable_smem_spilling.h +47 -0
  414. cuda/cccl/headers/include/cuda/__ptx/ptx_dot_variants.h +230 -0
  415. cuda/cccl/headers/include/cuda/__ptx/ptx_helper_functions.h +176 -0
  416. cuda/cccl/headers/include/cuda/__random/feistel_bijection.h +105 -0
  417. cuda/cccl/headers/include/cuda/__random/random_bijection.h +88 -0
  418. cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +99 -0
  419. cuda/cccl/headers/include/cuda/__runtime/types.h +41 -0
  420. cuda/cccl/headers/include/cuda/__semaphore/counting_semaphore.h +53 -0
  421. cuda/cccl/headers/include/cuda/__stream/get_stream.h +110 -0
  422. cuda/cccl/headers/include/cuda/__stream/stream.h +141 -0
  423. cuda/cccl/headers/include/cuda/__stream/stream_ref.h +303 -0
  424. cuda/cccl/headers/include/cuda/__type_traits/is_floating_point.h +47 -0
  425. cuda/cccl/headers/include/cuda/__type_traits/is_specialization_of.h +37 -0
  426. cuda/cccl/headers/include/cuda/__utility/__basic_any/access.h +88 -0
  427. cuda/cccl/headers/include/cuda/__utility/__basic_any/any_cast.h +83 -0
  428. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_base.h +148 -0
  429. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_from.h +96 -0
  430. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_fwd.h +128 -0
  431. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_ptr.h +304 -0
  432. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_ref.h +337 -0
  433. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_value.h +590 -0
  434. cuda/cccl/headers/include/cuda/__utility/__basic_any/conversions.h +169 -0
  435. cuda/cccl/headers/include/cuda/__utility/__basic_any/dynamic_any_cast.h +107 -0
  436. cuda/cccl/headers/include/cuda/__utility/__basic_any/interfaces.h +359 -0
  437. cuda/cccl/headers/include/cuda/__utility/__basic_any/iset.h +142 -0
  438. cuda/cccl/headers/include/cuda/__utility/__basic_any/overrides.h +64 -0
  439. cuda/cccl/headers/include/cuda/__utility/__basic_any/rtti.h +257 -0
  440. cuda/cccl/headers/include/cuda/__utility/__basic_any/semiregular.h +322 -0
  441. cuda/cccl/headers/include/cuda/__utility/__basic_any/storage.h +79 -0
  442. cuda/cccl/headers/include/cuda/__utility/__basic_any/tagged_ptr.h +58 -0
  443. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtcall.h +162 -0
  444. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_functions.h +184 -0
  445. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_ptrs.h +80 -0
  446. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_tables.h +155 -0
  447. cuda/cccl/headers/include/cuda/__utility/basic_any.h +507 -0
  448. cuda/cccl/headers/include/cuda/__utility/immovable.h +50 -0
  449. cuda/cccl/headers/include/cuda/__utility/in_range.h +65 -0
  450. cuda/cccl/headers/include/cuda/__utility/inherit.h +36 -0
  451. cuda/cccl/headers/include/cuda/__utility/no_init.h +29 -0
  452. cuda/cccl/headers/include/cuda/__utility/static_for.h +79 -0
  453. cuda/cccl/headers/include/cuda/__warp/lane_mask.h +326 -0
  454. cuda/cccl/headers/include/cuda/__warp/warp_match_all.h +65 -0
  455. cuda/cccl/headers/include/cuda/__warp/warp_shuffle.h +251 -0
  456. cuda/cccl/headers/include/cuda/access_property +26 -0
  457. cuda/cccl/headers/include/cuda/algorithm +27 -0
  458. cuda/cccl/headers/include/cuda/annotated_ptr +29 -0
  459. cuda/cccl/headers/include/cuda/atomic +27 -0
  460. cuda/cccl/headers/include/cuda/barrier +267 -0
  461. cuda/cccl/headers/include/cuda/bit +29 -0
  462. cuda/cccl/headers/include/cuda/cmath +37 -0
  463. cuda/cccl/headers/include/cuda/devices +33 -0
  464. cuda/cccl/headers/include/cuda/discard_memory +32 -0
  465. cuda/cccl/headers/include/cuda/functional +32 -0
  466. cuda/cccl/headers/include/cuda/iterator +39 -0
  467. cuda/cccl/headers/include/cuda/latch +27 -0
  468. cuda/cccl/headers/include/cuda/mdspan +28 -0
  469. cuda/cccl/headers/include/cuda/memory +35 -0
  470. cuda/cccl/headers/include/cuda/memory_resource +35 -0
  471. cuda/cccl/headers/include/cuda/numeric +29 -0
  472. cuda/cccl/headers/include/cuda/pipeline +579 -0
  473. cuda/cccl/headers/include/cuda/ptx +129 -0
  474. cuda/cccl/headers/include/cuda/semaphore +31 -0
  475. cuda/cccl/headers/include/cuda/std/__algorithm/adjacent_find.h +59 -0
  476. cuda/cccl/headers/include/cuda/std/__algorithm/all_of.h +45 -0
  477. cuda/cccl/headers/include/cuda/std/__algorithm/any_of.h +45 -0
  478. cuda/cccl/headers/include/cuda/std/__algorithm/binary_search.h +53 -0
  479. cuda/cccl/headers/include/cuda/std/__algorithm/clamp.h +48 -0
  480. cuda/cccl/headers/include/cuda/std/__algorithm/comp.h +58 -0
  481. cuda/cccl/headers/include/cuda/std/__algorithm/comp_ref_type.h +85 -0
  482. cuda/cccl/headers/include/cuda/std/__algorithm/copy.h +142 -0
  483. cuda/cccl/headers/include/cuda/std/__algorithm/copy_backward.h +80 -0
  484. cuda/cccl/headers/include/cuda/std/__algorithm/copy_if.h +47 -0
  485. cuda/cccl/headers/include/cuda/std/__algorithm/copy_n.h +73 -0
  486. cuda/cccl/headers/include/cuda/std/__algorithm/count.h +49 -0
  487. cuda/cccl/headers/include/cuda/std/__algorithm/count_if.h +49 -0
  488. cuda/cccl/headers/include/cuda/std/__algorithm/equal.h +128 -0
  489. cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +101 -0
  490. cuda/cccl/headers/include/cuda/std/__algorithm/fill.h +58 -0
  491. cuda/cccl/headers/include/cuda/std/__algorithm/fill_n.h +51 -0
  492. cuda/cccl/headers/include/cuda/std/__algorithm/find.h +62 -0
  493. cuda/cccl/headers/include/cuda/std/__algorithm/find_end.h +225 -0
  494. cuda/cccl/headers/include/cuda/std/__algorithm/find_first_of.h +73 -0
  495. cuda/cccl/headers/include/cuda/std/__algorithm/find_if.h +46 -0
  496. cuda/cccl/headers/include/cuda/std/__algorithm/find_if_not.h +46 -0
  497. cuda/cccl/headers/include/cuda/std/__algorithm/for_each.h +42 -0
  498. cuda/cccl/headers/include/cuda/std/__algorithm/for_each_n.h +48 -0
  499. cuda/cccl/headers/include/cuda/std/__algorithm/generate.h +41 -0
  500. cuda/cccl/headers/include/cuda/std/__algorithm/generate_n.h +46 -0
  501. cuda/cccl/headers/include/cuda/std/__algorithm/half_positive.h +49 -0
  502. cuda/cccl/headers/include/cuda/std/__algorithm/in_fun_result.h +55 -0
  503. cuda/cccl/headers/include/cuda/std/__algorithm/includes.h +90 -0
  504. cuda/cccl/headers/include/cuda/std/__algorithm/is_heap.h +50 -0
  505. cuda/cccl/headers/include/cuda/std/__algorithm/is_heap_until.h +83 -0
  506. cuda/cccl/headers/include/cuda/std/__algorithm/is_partitioned.h +57 -0
  507. cuda/cccl/headers/include/cuda/std/__algorithm/is_permutation.h +252 -0
  508. cuda/cccl/headers/include/cuda/std/__algorithm/is_sorted.h +49 -0
  509. cuda/cccl/headers/include/cuda/std/__algorithm/is_sorted_until.h +68 -0
  510. cuda/cccl/headers/include/cuda/std/__algorithm/iter_swap.h +82 -0
  511. cuda/cccl/headers/include/cuda/std/__algorithm/iterator_operations.h +185 -0
  512. cuda/cccl/headers/include/cuda/std/__algorithm/lexicographical_compare.h +68 -0
  513. cuda/cccl/headers/include/cuda/std/__algorithm/lower_bound.h +82 -0
  514. cuda/cccl/headers/include/cuda/std/__algorithm/make_heap.h +70 -0
  515. cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +88 -0
  516. cuda/cccl/headers/include/cuda/std/__algorithm/max.h +62 -0
  517. cuda/cccl/headers/include/cuda/std/__algorithm/max_element.h +67 -0
  518. cuda/cccl/headers/include/cuda/std/__algorithm/merge.h +89 -0
  519. cuda/cccl/headers/include/cuda/std/__algorithm/min.h +62 -0
  520. cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +87 -0
  521. cuda/cccl/headers/include/cuda/std/__algorithm/minmax.h +66 -0
  522. cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +139 -0
  523. cuda/cccl/headers/include/cuda/std/__algorithm/mismatch.h +83 -0
  524. cuda/cccl/headers/include/cuda/std/__algorithm/move.h +86 -0
  525. cuda/cccl/headers/include/cuda/std/__algorithm/move_backward.h +84 -0
  526. cuda/cccl/headers/include/cuda/std/__algorithm/next_permutation.h +88 -0
  527. cuda/cccl/headers/include/cuda/std/__algorithm/none_of.h +45 -0
  528. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort.h +102 -0
  529. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort_copy.h +122 -0
  530. cuda/cccl/headers/include/cuda/std/__algorithm/partition.h +120 -0
  531. cuda/cccl/headers/include/cuda/std/__algorithm/partition_copy.h +59 -0
  532. cuda/cccl/headers/include/cuda/std/__algorithm/partition_point.h +61 -0
  533. cuda/cccl/headers/include/cuda/std/__algorithm/pop_heap.h +93 -0
  534. cuda/cccl/headers/include/cuda/std/__algorithm/prev_permutation.h +88 -0
  535. cuda/cccl/headers/include/cuda/std/__algorithm/push_heap.h +100 -0
  536. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_for_each.h +84 -0
  537. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_for_each_n.h +68 -0
  538. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_iterator_concept.h +65 -0
  539. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_min.h +98 -0
  540. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_min_element.h +68 -0
  541. cuda/cccl/headers/include/cuda/std/__algorithm/remove.h +55 -0
  542. cuda/cccl/headers/include/cuda/std/__algorithm/remove_copy.h +47 -0
  543. cuda/cccl/headers/include/cuda/std/__algorithm/remove_copy_if.h +47 -0
  544. cuda/cccl/headers/include/cuda/std/__algorithm/remove_if.h +56 -0
  545. cuda/cccl/headers/include/cuda/std/__algorithm/replace.h +45 -0
  546. cuda/cccl/headers/include/cuda/std/__algorithm/replace_copy.h +54 -0
  547. cuda/cccl/headers/include/cuda/std/__algorithm/replace_copy_if.h +50 -0
  548. cuda/cccl/headers/include/cuda/std/__algorithm/replace_if.h +45 -0
  549. cuda/cccl/headers/include/cuda/std/__algorithm/reverse.h +81 -0
  550. cuda/cccl/headers/include/cuda/std/__algorithm/reverse_copy.h +43 -0
  551. cuda/cccl/headers/include/cuda/std/__algorithm/rotate.h +261 -0
  552. cuda/cccl/headers/include/cuda/std/__algorithm/rotate_copy.h +40 -0
  553. cuda/cccl/headers/include/cuda/std/__algorithm/search.h +185 -0
  554. cuda/cccl/headers/include/cuda/std/__algorithm/search_n.h +163 -0
  555. cuda/cccl/headers/include/cuda/std/__algorithm/set_difference.h +95 -0
  556. cuda/cccl/headers/include/cuda/std/__algorithm/set_intersection.h +122 -0
  557. cuda/cccl/headers/include/cuda/std/__algorithm/set_symmetric_difference.h +134 -0
  558. cuda/cccl/headers/include/cuda/std/__algorithm/set_union.h +128 -0
  559. cuda/cccl/headers/include/cuda/std/__algorithm/shift_left.h +84 -0
  560. cuda/cccl/headers/include/cuda/std/__algorithm/shift_right.h +144 -0
  561. cuda/cccl/headers/include/cuda/std/__algorithm/sift_down.h +139 -0
  562. cuda/cccl/headers/include/cuda/std/__algorithm/sort_heap.h +70 -0
  563. cuda/cccl/headers/include/cuda/std/__algorithm/swap_ranges.h +78 -0
  564. cuda/cccl/headers/include/cuda/std/__algorithm/transform.h +59 -0
  565. cuda/cccl/headers/include/cuda/std/__algorithm/unique.h +76 -0
  566. cuda/cccl/headers/include/cuda/std/__algorithm/unique_copy.h +155 -0
  567. cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_iter.h +95 -0
  568. cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_range.h +126 -0
  569. cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +83 -0
  570. cuda/cccl/headers/include/cuda/std/__algorithm_ +26 -0
  571. cuda/cccl/headers/include/cuda/std/__atomic/api/common.h +192 -0
  572. cuda/cccl/headers/include/cuda/std/__atomic/api/owned.h +136 -0
  573. cuda/cccl/headers/include/cuda/std/__atomic/api/reference.h +118 -0
  574. cuda/cccl/headers/include/cuda/std/__atomic/functions/common.h +58 -0
  575. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_local.h +208 -0
  576. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_derived.h +401 -0
  577. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated.h +3971 -0
  578. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h +177 -0
  579. cuda/cccl/headers/include/cuda/std/__atomic/functions/host.h +211 -0
  580. cuda/cccl/headers/include/cuda/std/__atomic/functions.h +33 -0
  581. cuda/cccl/headers/include/cuda/std/__atomic/order.h +159 -0
  582. cuda/cccl/headers/include/cuda/std/__atomic/platform/msvc_to_builtins.h +654 -0
  583. cuda/cccl/headers/include/cuda/std/__atomic/platform.h +93 -0
  584. cuda/cccl/headers/include/cuda/std/__atomic/scopes.h +105 -0
  585. cuda/cccl/headers/include/cuda/std/__atomic/types/base.h +249 -0
  586. cuda/cccl/headers/include/cuda/std/__atomic/types/common.h +104 -0
  587. cuda/cccl/headers/include/cuda/std/__atomic/types/locked.h +225 -0
  588. cuda/cccl/headers/include/cuda/std/__atomic/types/reference.h +72 -0
  589. cuda/cccl/headers/include/cuda/std/__atomic/types/small.h +228 -0
  590. cuda/cccl/headers/include/cuda/std/__atomic/types.h +52 -0
  591. cuda/cccl/headers/include/cuda/std/__atomic/wait/notify_wait.h +95 -0
  592. cuda/cccl/headers/include/cuda/std/__atomic/wait/polling.h +65 -0
  593. cuda/cccl/headers/include/cuda/std/__barrier/barrier.h +227 -0
  594. cuda/cccl/headers/include/cuda/std/__barrier/empty_completion.h +37 -0
  595. cuda/cccl/headers/include/cuda/std/__barrier/poll_tester.h +82 -0
  596. cuda/cccl/headers/include/cuda/std/__bit/bit_cast.h +76 -0
  597. cuda/cccl/headers/include/cuda/std/__bit/byteswap.h +185 -0
  598. cuda/cccl/headers/include/cuda/std/__bit/countl.h +174 -0
  599. cuda/cccl/headers/include/cuda/std/__bit/countr.h +185 -0
  600. cuda/cccl/headers/include/cuda/std/__bit/endian.h +39 -0
  601. cuda/cccl/headers/include/cuda/std/__bit/has_single_bit.h +43 -0
  602. cuda/cccl/headers/include/cuda/std/__bit/integral.h +126 -0
  603. cuda/cccl/headers/include/cuda/std/__bit/popcount.h +154 -0
  604. cuda/cccl/headers/include/cuda/std/__bit/reference.h +1272 -0
  605. cuda/cccl/headers/include/cuda/std/__bit/rotate.h +94 -0
  606. cuda/cccl/headers/include/cuda/std/__cccl/algorithm_wrapper.h +36 -0
  607. cuda/cccl/headers/include/cuda/std/__cccl/architecture.h +78 -0
  608. cuda/cccl/headers/include/cuda/std/__cccl/assert.h +161 -0
  609. cuda/cccl/headers/include/cuda/std/__cccl/attributes.h +206 -0
  610. cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +673 -0
  611. cuda/cccl/headers/include/cuda/std/__cccl/compiler.h +217 -0
  612. cuda/cccl/headers/include/cuda/std/__cccl/cuda_capabilities.h +51 -0
  613. cuda/cccl/headers/include/cuda/std/__cccl/cuda_toolkit.h +56 -0
  614. cuda/cccl/headers/include/cuda/std/__cccl/deprecated.h +88 -0
  615. cuda/cccl/headers/include/cuda/std/__cccl/diagnostic.h +131 -0
  616. cuda/cccl/headers/include/cuda/std/__cccl/dialect.h +123 -0
  617. cuda/cccl/headers/include/cuda/std/__cccl/epilogue.h +344 -0
  618. cuda/cccl/headers/include/cuda/std/__cccl/exceptions.h +91 -0
  619. cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +74 -0
  620. cuda/cccl/headers/include/cuda/std/__cccl/extended_data_types.h +160 -0
  621. cuda/cccl/headers/include/cuda/std/__cccl/host_std_lib.h +52 -0
  622. cuda/cccl/headers/include/cuda/std/__cccl/is_non_narrowing_convertible.h +73 -0
  623. cuda/cccl/headers/include/cuda/std/__cccl/memory_wrapper.h +36 -0
  624. cuda/cccl/headers/include/cuda/std/__cccl/numeric_wrapper.h +36 -0
  625. cuda/cccl/headers/include/cuda/std/__cccl/os.h +54 -0
  626. cuda/cccl/headers/include/cuda/std/__cccl/preprocessor.h +1286 -0
  627. cuda/cccl/headers/include/cuda/std/__cccl/prologue.h +281 -0
  628. cuda/cccl/headers/include/cuda/std/__cccl/ptx_isa.h +253 -0
  629. cuda/cccl/headers/include/cuda/std/__cccl/rtti.h +72 -0
  630. cuda/cccl/headers/include/cuda/std/__cccl/sequence_access.h +87 -0
  631. cuda/cccl/headers/include/cuda/std/__cccl/system_header.h +38 -0
  632. cuda/cccl/headers/include/cuda/std/__cccl/unreachable.h +31 -0
  633. cuda/cccl/headers/include/cuda/std/__cccl/version.h +26 -0
  634. cuda/cccl/headers/include/cuda/std/__cccl/visibility.h +171 -0
  635. cuda/cccl/headers/include/cuda/std/__charconv/chars_format.h +81 -0
  636. cuda/cccl/headers/include/cuda/std/__charconv/from_chars.h +154 -0
  637. cuda/cccl/headers/include/cuda/std/__charconv/from_chars_result.h +56 -0
  638. cuda/cccl/headers/include/cuda/std/__charconv/to_chars.h +148 -0
  639. cuda/cccl/headers/include/cuda/std/__charconv/to_chars_result.h +56 -0
  640. cuda/cccl/headers/include/cuda/std/__charconv_ +31 -0
  641. cuda/cccl/headers/include/cuda/std/__chrono/calendar.h +54 -0
  642. cuda/cccl/headers/include/cuda/std/__chrono/day.h +162 -0
  643. cuda/cccl/headers/include/cuda/std/__chrono/duration.h +503 -0
  644. cuda/cccl/headers/include/cuda/std/__chrono/file_clock.h +55 -0
  645. cuda/cccl/headers/include/cuda/std/__chrono/high_resolution_clock.h +46 -0
  646. cuda/cccl/headers/include/cuda/std/__chrono/month.h +187 -0
  647. cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +60 -0
  648. cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +80 -0
  649. cuda/cccl/headers/include/cuda/std/__chrono/time_point.h +259 -0
  650. cuda/cccl/headers/include/cuda/std/__chrono/year.h +186 -0
  651. cuda/cccl/headers/include/cuda/std/__cmath/abs.h +127 -0
  652. cuda/cccl/headers/include/cuda/std/__cmath/copysign.h +88 -0
  653. cuda/cccl/headers/include/cuda/std/__cmath/error_functions.h +200 -0
  654. cuda/cccl/headers/include/cuda/std/__cmath/exponential_functions.h +784 -0
  655. cuda/cccl/headers/include/cuda/std/__cmath/fdim.h +118 -0
  656. cuda/cccl/headers/include/cuda/std/__cmath/fma.h +125 -0
  657. cuda/cccl/headers/include/cuda/std/__cmath/fpclassify.h +231 -0
  658. cuda/cccl/headers/include/cuda/std/__cmath/gamma.h +205 -0
  659. cuda/cccl/headers/include/cuda/std/__cmath/hyperbolic_functions.h +286 -0
  660. cuda/cccl/headers/include/cuda/std/__cmath/hypot.h +221 -0
  661. cuda/cccl/headers/include/cuda/std/__cmath/inverse_hyperbolic_functions.h +286 -0
  662. cuda/cccl/headers/include/cuda/std/__cmath/inverse_trigonometric_functions.h +371 -0
  663. cuda/cccl/headers/include/cuda/std/__cmath/isfinite.h +167 -0
  664. cuda/cccl/headers/include/cuda/std/__cmath/isinf.h +205 -0
  665. cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +186 -0
  666. cuda/cccl/headers/include/cuda/std/__cmath/isnormal.h +138 -0
  667. cuda/cccl/headers/include/cuda/std/__cmath/lerp.h +101 -0
  668. cuda/cccl/headers/include/cuda/std/__cmath/logarithms.h +534 -0
  669. cuda/cccl/headers/include/cuda/std/__cmath/min_max.h +287 -0
  670. cuda/cccl/headers/include/cuda/std/__cmath/modulo.h +208 -0
  671. cuda/cccl/headers/include/cuda/std/__cmath/nan.h +54 -0
  672. cuda/cccl/headers/include/cuda/std/__cmath/remainder.h +206 -0
  673. cuda/cccl/headers/include/cuda/std/__cmath/roots.h +199 -0
  674. cuda/cccl/headers/include/cuda/std/__cmath/rounding_functions.h +984 -0
  675. cuda/cccl/headers/include/cuda/std/__cmath/signbit.h +56 -0
  676. cuda/cccl/headers/include/cuda/std/__cmath/traits.h +238 -0
  677. cuda/cccl/headers/include/cuda/std/__cmath/trigonometric_functions.h +328 -0
  678. cuda/cccl/headers/include/cuda/std/__complex/arg.h +84 -0
  679. cuda/cccl/headers/include/cuda/std/__complex/complex.h +669 -0
  680. cuda/cccl/headers/include/cuda/std/__complex/exponential_functions.h +411 -0
  681. cuda/cccl/headers/include/cuda/std/__complex/hyperbolic_functions.h +117 -0
  682. cuda/cccl/headers/include/cuda/std/__complex/inverse_hyperbolic_functions.h +216 -0
  683. cuda/cccl/headers/include/cuda/std/__complex/inverse_trigonometric_functions.h +131 -0
  684. cuda/cccl/headers/include/cuda/std/__complex/literals.h +86 -0
  685. cuda/cccl/headers/include/cuda/std/__complex/logarithms.h +303 -0
  686. cuda/cccl/headers/include/cuda/std/__complex/math.h +159 -0
  687. cuda/cccl/headers/include/cuda/std/__complex/nvbf16.h +323 -0
  688. cuda/cccl/headers/include/cuda/std/__complex/nvfp16.h +322 -0
  689. cuda/cccl/headers/include/cuda/std/__complex/roots.h +214 -0
  690. cuda/cccl/headers/include/cuda/std/__complex/trigonometric_functions.h +61 -0
  691. cuda/cccl/headers/include/cuda/std/__complex/tuple.h +107 -0
  692. cuda/cccl/headers/include/cuda/std/__complex/vector_support.h +130 -0
  693. cuda/cccl/headers/include/cuda/std/__concepts/arithmetic.h +56 -0
  694. cuda/cccl/headers/include/cuda/std/__concepts/assignable.h +64 -0
  695. cuda/cccl/headers/include/cuda/std/__concepts/boolean_testable.h +63 -0
  696. cuda/cccl/headers/include/cuda/std/__concepts/class_or_enum.h +45 -0
  697. cuda/cccl/headers/include/cuda/std/__concepts/common_reference_with.h +69 -0
  698. cuda/cccl/headers/include/cuda/std/__concepts/common_with.h +82 -0
  699. cuda/cccl/headers/include/cuda/std/__concepts/concept_macros.h +341 -0
  700. cuda/cccl/headers/include/cuda/std/__concepts/constructible.h +174 -0
  701. cuda/cccl/headers/include/cuda/std/__concepts/convertible_to.h +70 -0
  702. cuda/cccl/headers/include/cuda/std/__concepts/copyable.h +60 -0
  703. cuda/cccl/headers/include/cuda/std/__concepts/derived_from.h +56 -0
  704. cuda/cccl/headers/include/cuda/std/__concepts/destructible.h +76 -0
  705. cuda/cccl/headers/include/cuda/std/__concepts/different_from.h +38 -0
  706. cuda/cccl/headers/include/cuda/std/__concepts/equality_comparable.h +100 -0
  707. cuda/cccl/headers/include/cuda/std/__concepts/invocable.h +80 -0
  708. cuda/cccl/headers/include/cuda/std/__concepts/movable.h +58 -0
  709. cuda/cccl/headers/include/cuda/std/__concepts/predicate.h +54 -0
  710. cuda/cccl/headers/include/cuda/std/__concepts/regular.h +54 -0
  711. cuda/cccl/headers/include/cuda/std/__concepts/relation.h +77 -0
  712. cuda/cccl/headers/include/cuda/std/__concepts/same_as.h +39 -0
  713. cuda/cccl/headers/include/cuda/std/__concepts/semiregular.h +54 -0
  714. cuda/cccl/headers/include/cuda/std/__concepts/swappable.h +206 -0
  715. cuda/cccl/headers/include/cuda/std/__concepts/totally_ordered.h +101 -0
  716. cuda/cccl/headers/include/cuda/std/__cstddef/byte.h +113 -0
  717. cuda/cccl/headers/include/cuda/std/__cstddef/types.h +52 -0
  718. cuda/cccl/headers/include/cuda/std/__cstdlib/abs.h +57 -0
  719. cuda/cccl/headers/include/cuda/std/__cstdlib/aligned_alloc.h +66 -0
  720. cuda/cccl/headers/include/cuda/std/__cstdlib/div.h +96 -0
  721. cuda/cccl/headers/include/cuda/std/__cstdlib/malloc.h +70 -0
  722. cuda/cccl/headers/include/cuda/std/__cstring/memcpy.h +61 -0
  723. cuda/cccl/headers/include/cuda/std/__cstring/memset.h +46 -0
  724. cuda/cccl/headers/include/cuda/std/__cuda/api_wrapper.h +62 -0
  725. cuda/cccl/headers/include/cuda/std/__exception/cuda_error.h +139 -0
  726. cuda/cccl/headers/include/cuda/std/__exception/terminate.h +73 -0
  727. cuda/cccl/headers/include/cuda/std/__execution/env.h +455 -0
  728. cuda/cccl/headers/include/cuda/std/__execution/policy.h +88 -0
  729. cuda/cccl/headers/include/cuda/std/__expected/bad_expected_access.h +127 -0
  730. cuda/cccl/headers/include/cuda/std/__expected/expected.h +1941 -0
  731. cuda/cccl/headers/include/cuda/std/__expected/expected_base.h +1050 -0
  732. cuda/cccl/headers/include/cuda/std/__expected/unexpect.h +37 -0
  733. cuda/cccl/headers/include/cuda/std/__expected/unexpected.h +165 -0
  734. cuda/cccl/headers/include/cuda/std/__floating_point/arithmetic.h +56 -0
  735. cuda/cccl/headers/include/cuda/std/__floating_point/cast.h +812 -0
  736. cuda/cccl/headers/include/cuda/std/__floating_point/cccl_fp.h +125 -0
  737. cuda/cccl/headers/include/cuda/std/__floating_point/common_type.h +48 -0
  738. cuda/cccl/headers/include/cuda/std/__floating_point/constants.h +376 -0
  739. cuda/cccl/headers/include/cuda/std/__floating_point/conversion_rank_order.h +124 -0
  740. cuda/cccl/headers/include/cuda/std/__floating_point/cuda_fp_types.h +116 -0
  741. cuda/cccl/headers/include/cuda/std/__floating_point/decompose.h +69 -0
  742. cuda/cccl/headers/include/cuda/std/__floating_point/format.h +162 -0
  743. cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +40 -0
  744. cuda/cccl/headers/include/cuda/std/__floating_point/mask.h +78 -0
  745. cuda/cccl/headers/include/cuda/std/__floating_point/native_type.h +81 -0
  746. cuda/cccl/headers/include/cuda/std/__floating_point/overflow_handler.h +139 -0
  747. cuda/cccl/headers/include/cuda/std/__floating_point/properties.h +229 -0
  748. cuda/cccl/headers/include/cuda/std/__floating_point/storage.h +248 -0
  749. cuda/cccl/headers/include/cuda/std/__floating_point/traits.h +172 -0
  750. cuda/cccl/headers/include/cuda/std/__format/buffer.h +48 -0
  751. cuda/cccl/headers/include/cuda/std/__format/concepts.h +69 -0
  752. cuda/cccl/headers/include/cuda/std/__format/format_arg.h +282 -0
  753. cuda/cccl/headers/include/cuda/std/__format/format_arg_store.h +279 -0
  754. cuda/cccl/headers/include/cuda/std/__format/format_args.h +122 -0
  755. cuda/cccl/headers/include/cuda/std/__format/format_context.h +92 -0
  756. cuda/cccl/headers/include/cuda/std/__format/format_error.h +76 -0
  757. cuda/cccl/headers/include/cuda/std/__format/format_integral.h +237 -0
  758. cuda/cccl/headers/include/cuda/std/__format/format_parse_context.h +124 -0
  759. cuda/cccl/headers/include/cuda/std/__format/format_spec_parser.h +1230 -0
  760. cuda/cccl/headers/include/cuda/std/__format/formatter.h +59 -0
  761. cuda/cccl/headers/include/cuda/std/__format/formatters/bool.h +101 -0
  762. cuda/cccl/headers/include/cuda/std/__format/formatters/char.h +124 -0
  763. cuda/cccl/headers/include/cuda/std/__format/formatters/fp.h +101 -0
  764. cuda/cccl/headers/include/cuda/std/__format/formatters/int.h +174 -0
  765. cuda/cccl/headers/include/cuda/std/__format/formatters/ptr.h +104 -0
  766. cuda/cccl/headers/include/cuda/std/__format/formatters/str.h +178 -0
  767. cuda/cccl/headers/include/cuda/std/__format/output_utils.h +272 -0
  768. cuda/cccl/headers/include/cuda/std/__format/parse_arg_id.h +138 -0
  769. cuda/cccl/headers/include/cuda/std/__format_ +45 -0
  770. cuda/cccl/headers/include/cuda/std/__functional/binary_function.h +63 -0
  771. cuda/cccl/headers/include/cuda/std/__functional/binary_negate.h +65 -0
  772. cuda/cccl/headers/include/cuda/std/__functional/bind.h +334 -0
  773. cuda/cccl/headers/include/cuda/std/__functional/bind_back.h +80 -0
  774. cuda/cccl/headers/include/cuda/std/__functional/bind_front.h +73 -0
  775. cuda/cccl/headers/include/cuda/std/__functional/binder1st.h +74 -0
  776. cuda/cccl/headers/include/cuda/std/__functional/binder2nd.h +74 -0
  777. cuda/cccl/headers/include/cuda/std/__functional/compose.h +68 -0
  778. cuda/cccl/headers/include/cuda/std/__functional/default_searcher.h +75 -0
  779. cuda/cccl/headers/include/cuda/std/__functional/function.h +1275 -0
  780. cuda/cccl/headers/include/cuda/std/__functional/hash.h +649 -0
  781. cuda/cccl/headers/include/cuda/std/__functional/identity.h +57 -0
  782. cuda/cccl/headers/include/cuda/std/__functional/invoke.h +296 -0
  783. cuda/cccl/headers/include/cuda/std/__functional/is_transparent.h +41 -0
  784. cuda/cccl/headers/include/cuda/std/__functional/mem_fn.h +66 -0
  785. cuda/cccl/headers/include/cuda/std/__functional/mem_fun_ref.h +211 -0
  786. cuda/cccl/headers/include/cuda/std/__functional/not_fn.h +120 -0
  787. cuda/cccl/headers/include/cuda/std/__functional/operations.h +534 -0
  788. cuda/cccl/headers/include/cuda/std/__functional/perfect_forward.h +128 -0
  789. cuda/cccl/headers/include/cuda/std/__functional/pointer_to_binary_function.h +64 -0
  790. cuda/cccl/headers/include/cuda/std/__functional/pointer_to_unary_function.h +63 -0
  791. cuda/cccl/headers/include/cuda/std/__functional/ranges_operations.h +113 -0
  792. cuda/cccl/headers/include/cuda/std/__functional/reference_wrapper.h +113 -0
  793. cuda/cccl/headers/include/cuda/std/__functional/unary_function.h +62 -0
  794. cuda/cccl/headers/include/cuda/std/__functional/unary_negate.h +65 -0
  795. cuda/cccl/headers/include/cuda/std/__functional/unwrap_ref.h +56 -0
  796. cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +262 -0
  797. cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +53 -0
  798. cuda/cccl/headers/include/cuda/std/__fwd/array.h +42 -0
  799. cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +74 -0
  800. cuda/cccl/headers/include/cuda/std/__fwd/complex.h +75 -0
  801. cuda/cccl/headers/include/cuda/std/__fwd/expected.h +46 -0
  802. cuda/cccl/headers/include/cuda/std/__fwd/format.h +84 -0
  803. cuda/cccl/headers/include/cuda/std/__fwd/fp.h +37 -0
  804. cuda/cccl/headers/include/cuda/std/__fwd/get.h +123 -0
  805. cuda/cccl/headers/include/cuda/std/__fwd/hash.h +34 -0
  806. cuda/cccl/headers/include/cuda/std/__fwd/iterator.h +43 -0
  807. cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +122 -0
  808. cuda/cccl/headers/include/cuda/std/__fwd/memory_resource.h +37 -0
  809. cuda/cccl/headers/include/cuda/std/__fwd/optional.h +39 -0
  810. cuda/cccl/headers/include/cuda/std/__fwd/pair.h +47 -0
  811. cuda/cccl/headers/include/cuda/std/__fwd/reference_wrapper.h +34 -0
  812. cuda/cccl/headers/include/cuda/std/__fwd/span.h +45 -0
  813. cuda/cccl/headers/include/cuda/std/__fwd/string.h +112 -0
  814. cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +91 -0
  815. cuda/cccl/headers/include/cuda/std/__fwd/subrange.h +55 -0
  816. cuda/cccl/headers/include/cuda/std/__fwd/tuple.h +34 -0
  817. cuda/cccl/headers/include/cuda/std/__fwd/unexpected.h +40 -0
  818. cuda/cccl/headers/include/cuda/std/__internal/cpp_dialect.h +44 -0
  819. cuda/cccl/headers/include/cuda/std/__internal/features.h +72 -0
  820. cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +143 -0
  821. cuda/cccl/headers/include/cuda/std/__iterator/access.h +128 -0
  822. cuda/cccl/headers/include/cuda/std/__iterator/advance.h +228 -0
  823. cuda/cccl/headers/include/cuda/std/__iterator/back_insert_iterator.h +163 -0
  824. cuda/cccl/headers/include/cuda/std/__iterator/bounded_iter.h +253 -0
  825. cuda/cccl/headers/include/cuda/std/__iterator/concepts.h +645 -0
  826. cuda/cccl/headers/include/cuda/std/__iterator/counted_iterator.h +464 -0
  827. cuda/cccl/headers/include/cuda/std/__iterator/data.h +61 -0
  828. cuda/cccl/headers/include/cuda/std/__iterator/default_sentinel.h +36 -0
  829. cuda/cccl/headers/include/cuda/std/__iterator/distance.h +126 -0
  830. cuda/cccl/headers/include/cuda/std/__iterator/empty.h +53 -0
  831. cuda/cccl/headers/include/cuda/std/__iterator/erase_if_container.h +53 -0
  832. cuda/cccl/headers/include/cuda/std/__iterator/front_insert_iterator.h +99 -0
  833. cuda/cccl/headers/include/cuda/std/__iterator/incrementable_traits.h +143 -0
  834. cuda/cccl/headers/include/cuda/std/__iterator/indirectly_comparable.h +55 -0
  835. cuda/cccl/headers/include/cuda/std/__iterator/insert_iterator.h +107 -0
  836. cuda/cccl/headers/include/cuda/std/__iterator/istream_iterator.h +146 -0
  837. cuda/cccl/headers/include/cuda/std/__iterator/istreambuf_iterator.h +161 -0
  838. cuda/cccl/headers/include/cuda/std/__iterator/iter_move.h +161 -0
  839. cuda/cccl/headers/include/cuda/std/__iterator/iter_swap.h +163 -0
  840. cuda/cccl/headers/include/cuda/std/__iterator/iterator.h +44 -0
  841. cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +847 -0
  842. cuda/cccl/headers/include/cuda/std/__iterator/mergeable.h +72 -0
  843. cuda/cccl/headers/include/cuda/std/__iterator/move_iterator.h +432 -0
  844. cuda/cccl/headers/include/cuda/std/__iterator/move_sentinel.h +73 -0
  845. cuda/cccl/headers/include/cuda/std/__iterator/next.h +101 -0
  846. cuda/cccl/headers/include/cuda/std/__iterator/ostream_iterator.h +95 -0
  847. cuda/cccl/headers/include/cuda/std/__iterator/ostreambuf_iterator.h +100 -0
  848. cuda/cccl/headers/include/cuda/std/__iterator/permutable.h +54 -0
  849. cuda/cccl/headers/include/cuda/std/__iterator/prev.h +90 -0
  850. cuda/cccl/headers/include/cuda/std/__iterator/projected.h +61 -0
  851. cuda/cccl/headers/include/cuda/std/__iterator/readable_traits.h +156 -0
  852. cuda/cccl/headers/include/cuda/std/__iterator/reverse_access.h +142 -0
  853. cuda/cccl/headers/include/cuda/std/__iterator/reverse_iterator.h +371 -0
  854. cuda/cccl/headers/include/cuda/std/__iterator/size.h +69 -0
  855. cuda/cccl/headers/include/cuda/std/__iterator/sortable.h +55 -0
  856. cuda/cccl/headers/include/cuda/std/__iterator/unreachable_sentinel.h +84 -0
  857. cuda/cccl/headers/include/cuda/std/__iterator/wrap_iter.h +245 -0
  858. cuda/cccl/headers/include/cuda/std/__latch/latch.h +88 -0
  859. cuda/cccl/headers/include/cuda/std/__limits/numeric_limits.h +617 -0
  860. cuda/cccl/headers/include/cuda/std/__limits/numeric_limits_ext.h +753 -0
  861. cuda/cccl/headers/include/cuda/std/__linalg/conj_if_needed.h +78 -0
  862. cuda/cccl/headers/include/cuda/std/__linalg/conjugate_transposed.h +54 -0
  863. cuda/cccl/headers/include/cuda/std/__linalg/conjugated.h +139 -0
  864. cuda/cccl/headers/include/cuda/std/__linalg/scaled.h +132 -0
  865. cuda/cccl/headers/include/cuda/std/__linalg/transposed.h +321 -0
  866. cuda/cccl/headers/include/cuda/std/__mdspan/aligned_accessor.h +97 -0
  867. cuda/cccl/headers/include/cuda/std/__mdspan/concepts.h +139 -0
  868. cuda/cccl/headers/include/cuda/std/__mdspan/default_accessor.h +73 -0
  869. cuda/cccl/headers/include/cuda/std/__mdspan/empty_base.h +352 -0
  870. cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +759 -0
  871. cuda/cccl/headers/include/cuda/std/__mdspan/layout_left.h +314 -0
  872. cuda/cccl/headers/include/cuda/std/__mdspan/layout_right.h +307 -0
  873. cuda/cccl/headers/include/cuda/std/__mdspan/layout_stride.h +605 -0
  874. cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +512 -0
  875. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_extents.h +193 -0
  876. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_helper.h +189 -0
  877. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_mapping.h +344 -0
  878. cuda/cccl/headers/include/cuda/std/__memory/addressof.h +67 -0
  879. cuda/cccl/headers/include/cuda/std/__memory/align.h +67 -0
  880. cuda/cccl/headers/include/cuda/std/__memory/allocate_at_least.h +81 -0
  881. cuda/cccl/headers/include/cuda/std/__memory/allocation_guard.h +100 -0
  882. cuda/cccl/headers/include/cuda/std/__memory/allocator.h +320 -0
  883. cuda/cccl/headers/include/cuda/std/__memory/allocator_arg_t.h +84 -0
  884. cuda/cccl/headers/include/cuda/std/__memory/allocator_destructor.h +59 -0
  885. cuda/cccl/headers/include/cuda/std/__memory/allocator_traits.h +525 -0
  886. cuda/cccl/headers/include/cuda/std/__memory/assume_aligned.h +60 -0
  887. cuda/cccl/headers/include/cuda/std/__memory/builtin_new_allocator.h +87 -0
  888. cuda/cccl/headers/include/cuda/std/__memory/compressed_pair.h +225 -0
  889. cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +246 -0
  890. cuda/cccl/headers/include/cuda/std/__memory/destruct_n.h +91 -0
  891. cuda/cccl/headers/include/cuda/std/__memory/is_sufficiently_aligned.h +46 -0
  892. cuda/cccl/headers/include/cuda/std/__memory/pointer_traits.h +246 -0
  893. cuda/cccl/headers/include/cuda/std/__memory/runtime_assume_aligned.h +62 -0
  894. cuda/cccl/headers/include/cuda/std/__memory/temporary_buffer.h +92 -0
  895. cuda/cccl/headers/include/cuda/std/__memory/uninitialized_algorithms.h +678 -0
  896. cuda/cccl/headers/include/cuda/std/__memory/unique_ptr.h +765 -0
  897. cuda/cccl/headers/include/cuda/std/__memory/uses_allocator.h +54 -0
  898. cuda/cccl/headers/include/cuda/std/__memory/voidify.h +41 -0
  899. cuda/cccl/headers/include/cuda/std/__memory_ +34 -0
  900. cuda/cccl/headers/include/cuda/std/__new/allocate.h +126 -0
  901. cuda/cccl/headers/include/cuda/std/__new/bad_alloc.h +57 -0
  902. cuda/cccl/headers/include/cuda/std/__new/launder.h +53 -0
  903. cuda/cccl/headers/include/cuda/std/__new_ +29 -0
  904. cuda/cccl/headers/include/cuda/std/__numeric/accumulate.h +56 -0
  905. cuda/cccl/headers/include/cuda/std/__numeric/adjacent_difference.h +72 -0
  906. cuda/cccl/headers/include/cuda/std/__numeric/exclusive_scan.h +66 -0
  907. cuda/cccl/headers/include/cuda/std/__numeric/gcd_lcm.h +78 -0
  908. cuda/cccl/headers/include/cuda/std/__numeric/inclusive_scan.h +73 -0
  909. cuda/cccl/headers/include/cuda/std/__numeric/inner_product.h +62 -0
  910. cuda/cccl/headers/include/cuda/std/__numeric/iota.h +42 -0
  911. cuda/cccl/headers/include/cuda/std/__numeric/midpoint.h +97 -0
  912. cuda/cccl/headers/include/cuda/std/__numeric/partial_sum.h +69 -0
  913. cuda/cccl/headers/include/cuda/std/__numeric/reduce.h +60 -0
  914. cuda/cccl/headers/include/cuda/std/__numeric/transform_exclusive_scan.h +51 -0
  915. cuda/cccl/headers/include/cuda/std/__numeric/transform_inclusive_scan.h +65 -0
  916. cuda/cccl/headers/include/cuda/std/__numeric/transform_reduce.h +72 -0
  917. cuda/cccl/headers/include/cuda/std/__optional/bad_optional_access.h +74 -0
  918. cuda/cccl/headers/include/cuda/std/__optional/hash.h +53 -0
  919. cuda/cccl/headers/include/cuda/std/__optional/make_optional.h +61 -0
  920. cuda/cccl/headers/include/cuda/std/__optional/nullopt.h +43 -0
  921. cuda/cccl/headers/include/cuda/std/__optional/optional.h +859 -0
  922. cuda/cccl/headers/include/cuda/std/__optional/optional_base.h +433 -0
  923. cuda/cccl/headers/include/cuda/std/__optional/optional_ref.h +324 -0
  924. cuda/cccl/headers/include/cuda/std/__random/generate_canonical.h +56 -0
  925. cuda/cccl/headers/include/cuda/std/__random/is_seed_sequence.h +39 -0
  926. cuda/cccl/headers/include/cuda/std/__random/is_valid.h +106 -0
  927. cuda/cccl/headers/include/cuda/std/__random/linear_congruential_engine.h +398 -0
  928. cuda/cccl/headers/include/cuda/std/__random/uniform_int_distribution.h +335 -0
  929. cuda/cccl/headers/include/cuda/std/__random/uniform_real_distribution.h +183 -0
  930. cuda/cccl/headers/include/cuda/std/__random_ +29 -0
  931. cuda/cccl/headers/include/cuda/std/__ranges/access.h +303 -0
  932. cuda/cccl/headers/include/cuda/std/__ranges/all.h +98 -0
  933. cuda/cccl/headers/include/cuda/std/__ranges/compressed_movable_box.h +892 -0
  934. cuda/cccl/headers/include/cuda/std/__ranges/concepts.h +302 -0
  935. cuda/cccl/headers/include/cuda/std/__ranges/counted.h +90 -0
  936. cuda/cccl/headers/include/cuda/std/__ranges/dangling.h +54 -0
  937. cuda/cccl/headers/include/cuda/std/__ranges/data.h +136 -0
  938. cuda/cccl/headers/include/cuda/std/__ranges/empty.h +109 -0
  939. cuda/cccl/headers/include/cuda/std/__ranges/empty_view.h +77 -0
  940. cuda/cccl/headers/include/cuda/std/__ranges/enable_borrowed_range.h +41 -0
  941. cuda/cccl/headers/include/cuda/std/__ranges/enable_view.h +78 -0
  942. cuda/cccl/headers/include/cuda/std/__ranges/from_range.h +36 -0
  943. cuda/cccl/headers/include/cuda/std/__ranges/iota_view.h +266 -0
  944. cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +410 -0
  945. cuda/cccl/headers/include/cuda/std/__ranges/owning_view.h +162 -0
  946. cuda/cccl/headers/include/cuda/std/__ranges/range_adaptor.h +110 -0
  947. cuda/cccl/headers/include/cuda/std/__ranges/rbegin.h +175 -0
  948. cuda/cccl/headers/include/cuda/std/__ranges/ref_view.h +121 -0
  949. cuda/cccl/headers/include/cuda/std/__ranges/rend.h +182 -0
  950. cuda/cccl/headers/include/cuda/std/__ranges/repeat_view.h +345 -0
  951. cuda/cccl/headers/include/cuda/std/__ranges/single_view.h +155 -0
  952. cuda/cccl/headers/include/cuda/std/__ranges/size.h +201 -0
  953. cuda/cccl/headers/include/cuda/std/__ranges/subrange.h +513 -0
  954. cuda/cccl/headers/include/cuda/std/__ranges/take_view.h +476 -0
  955. cuda/cccl/headers/include/cuda/std/__ranges/take_while_view.h +259 -0
  956. cuda/cccl/headers/include/cuda/std/__ranges/transform_view.h +522 -0
  957. cuda/cccl/headers/include/cuda/std/__ranges/unwrap_end.h +53 -0
  958. cuda/cccl/headers/include/cuda/std/__ranges/view_interface.h +183 -0
  959. cuda/cccl/headers/include/cuda/std/__ranges/views.h +38 -0
  960. cuda/cccl/headers/include/cuda/std/__semaphore/atomic_semaphore.h +234 -0
  961. cuda/cccl/headers/include/cuda/std/__semaphore/counting_semaphore.h +51 -0
  962. cuda/cccl/headers/include/cuda/std/__string/char_traits.h +191 -0
  963. cuda/cccl/headers/include/cuda/std/__string/constexpr_c_functions.h +581 -0
  964. cuda/cccl/headers/include/cuda/std/__string/helper_functions.h +296 -0
  965. cuda/cccl/headers/include/cuda/std/__string/string_view.h +244 -0
  966. cuda/cccl/headers/include/cuda/std/__string_ +29 -0
  967. cuda/cccl/headers/include/cuda/std/__system_error/errc.h +51 -0
  968. cuda/cccl/headers/include/cuda/std/__system_error_ +26 -0
  969. cuda/cccl/headers/include/cuda/std/__thread/threading_support.h +106 -0
  970. cuda/cccl/headers/include/cuda/std/__thread/threading_support_cuda.h +47 -0
  971. cuda/cccl/headers/include/cuda/std/__thread/threading_support_external.h +41 -0
  972. cuda/cccl/headers/include/cuda/std/__thread/threading_support_pthread.h +143 -0
  973. cuda/cccl/headers/include/cuda/std/__thread/threading_support_win32.h +87 -0
  974. cuda/cccl/headers/include/cuda/std/__tuple_dir/ignore.h +51 -0
  975. cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +120 -0
  976. cuda/cccl/headers/include/cuda/std/__tuple_dir/sfinae_helpers.h +260 -0
  977. cuda/cccl/headers/include/cuda/std/__tuple_dir/structured_bindings.h +212 -0
  978. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_element.h +70 -0
  979. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_indices.h +44 -0
  980. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +84 -0
  981. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +68 -0
  982. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_size.h +79 -0
  983. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_types.h +35 -0
  984. cuda/cccl/headers/include/cuda/std/__tuple_dir/vector_types.h +290 -0
  985. cuda/cccl/headers/include/cuda/std/__type_traits/add_const.h +40 -0
  986. cuda/cccl/headers/include/cuda/std/__type_traits/add_cv.h +40 -0
  987. cuda/cccl/headers/include/cuda/std/__type_traits/add_lvalue_reference.h +62 -0
  988. cuda/cccl/headers/include/cuda/std/__type_traits/add_pointer.h +65 -0
  989. cuda/cccl/headers/include/cuda/std/__type_traits/add_rvalue_reference.h +62 -0
  990. cuda/cccl/headers/include/cuda/std/__type_traits/add_volatile.h +40 -0
  991. cuda/cccl/headers/include/cuda/std/__type_traits/aligned_storage.h +149 -0
  992. cuda/cccl/headers/include/cuda/std/__type_traits/aligned_union.h +62 -0
  993. cuda/cccl/headers/include/cuda/std/__type_traits/alignment_of.h +41 -0
  994. cuda/cccl/headers/include/cuda/std/__type_traits/always_false.h +35 -0
  995. cuda/cccl/headers/include/cuda/std/__type_traits/can_extract_key.h +68 -0
  996. cuda/cccl/headers/include/cuda/std/__type_traits/common_reference.h +262 -0
  997. cuda/cccl/headers/include/cuda/std/__type_traits/common_type.h +173 -0
  998. cuda/cccl/headers/include/cuda/std/__type_traits/conditional.h +65 -0
  999. cuda/cccl/headers/include/cuda/std/__type_traits/conjunction.h +67 -0
  1000. cuda/cccl/headers/include/cuda/std/__type_traits/copy_cv.h +50 -0
  1001. cuda/cccl/headers/include/cuda/std/__type_traits/copy_cvref.h +148 -0
  1002. cuda/cccl/headers/include/cuda/std/__type_traits/decay.h +83 -0
  1003. cuda/cccl/headers/include/cuda/std/__type_traits/dependent_type.h +35 -0
  1004. cuda/cccl/headers/include/cuda/std/__type_traits/disjunction.h +77 -0
  1005. cuda/cccl/headers/include/cuda/std/__type_traits/enable_if.h +43 -0
  1006. cuda/cccl/headers/include/cuda/std/__type_traits/extent.h +68 -0
  1007. cuda/cccl/headers/include/cuda/std/__type_traits/fold.h +47 -0
  1008. cuda/cccl/headers/include/cuda/std/__type_traits/has_unique_object_representation.h +46 -0
  1009. cuda/cccl/headers/include/cuda/std/__type_traits/has_virtual_destructor.h +42 -0
  1010. cuda/cccl/headers/include/cuda/std/__type_traits/integral_constant.h +62 -0
  1011. cuda/cccl/headers/include/cuda/std/__type_traits/is_abstract.h +42 -0
  1012. cuda/cccl/headers/include/cuda/std/__type_traits/is_aggregate.h +42 -0
  1013. cuda/cccl/headers/include/cuda/std/__type_traits/is_allocator.h +46 -0
  1014. cuda/cccl/headers/include/cuda/std/__type_traits/is_arithmetic.h +42 -0
  1015. cuda/cccl/headers/include/cuda/std/__type_traits/is_array.h +62 -0
  1016. cuda/cccl/headers/include/cuda/std/__type_traits/is_assignable.h +78 -0
  1017. cuda/cccl/headers/include/cuda/std/__type_traits/is_base_of.h +42 -0
  1018. cuda/cccl/headers/include/cuda/std/__type_traits/is_bounded_array.h +44 -0
  1019. cuda/cccl/headers/include/cuda/std/__type_traits/is_callable.h +60 -0
  1020. cuda/cccl/headers/include/cuda/std/__type_traits/is_char_like_type.h +38 -0
  1021. cuda/cccl/headers/include/cuda/std/__type_traits/is_class.h +42 -0
  1022. cuda/cccl/headers/include/cuda/std/__type_traits/is_compound.h +58 -0
  1023. cuda/cccl/headers/include/cuda/std/__type_traits/is_const.h +56 -0
  1024. cuda/cccl/headers/include/cuda/std/__type_traits/is_constant_evaluated.h +51 -0
  1025. cuda/cccl/headers/include/cuda/std/__type_traits/is_constructible.h +174 -0
  1026. cuda/cccl/headers/include/cuda/std/__type_traits/is_convertible.h +211 -0
  1027. cuda/cccl/headers/include/cuda/std/__type_traits/is_copy_assignable.h +43 -0
  1028. cuda/cccl/headers/include/cuda/std/__type_traits/is_copy_constructible.h +43 -0
  1029. cuda/cccl/headers/include/cuda/std/__type_traits/is_core_convertible.h +47 -0
  1030. cuda/cccl/headers/include/cuda/std/__type_traits/is_corresponding_member.h +42 -0
  1031. cuda/cccl/headers/include/cuda/std/__type_traits/is_default_constructible.h +40 -0
  1032. cuda/cccl/headers/include/cuda/std/__type_traits/is_destructible.h +115 -0
  1033. cuda/cccl/headers/include/cuda/std/__type_traits/is_empty.h +42 -0
  1034. cuda/cccl/headers/include/cuda/std/__type_traits/is_enum.h +42 -0
  1035. cuda/cccl/headers/include/cuda/std/__type_traits/is_execution_policy.h +81 -0
  1036. cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_arithmetic.h +38 -0
  1037. cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_floating_point.h +79 -0
  1038. cuda/cccl/headers/include/cuda/std/__type_traits/is_final.h +42 -0
  1039. cuda/cccl/headers/include/cuda/std/__type_traits/is_floating_point.h +53 -0
  1040. cuda/cccl/headers/include/cuda/std/__type_traits/is_function.h +61 -0
  1041. cuda/cccl/headers/include/cuda/std/__type_traits/is_fundamental.h +56 -0
  1042. cuda/cccl/headers/include/cuda/std/__type_traits/is_implicitly_default_constructible.h +57 -0
  1043. cuda/cccl/headers/include/cuda/std/__type_traits/is_integer.h +45 -0
  1044. cuda/cccl/headers/include/cuda/std/__type_traits/is_integral.h +123 -0
  1045. cuda/cccl/headers/include/cuda/std/__type_traits/is_layout_compatible.h +45 -0
  1046. cuda/cccl/headers/include/cuda/std/__type_traits/is_literal_type.h +42 -0
  1047. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_function_pointer.h +79 -0
  1048. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_object_pointer.h +57 -0
  1049. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_pointer.h +57 -0
  1050. cuda/cccl/headers/include/cuda/std/__type_traits/is_move_assignable.h +43 -0
  1051. cuda/cccl/headers/include/cuda/std/__type_traits/is_move_constructible.h +42 -0
  1052. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_assignable.h +70 -0
  1053. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_constructible.h +84 -0
  1054. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_convertible.h +59 -0
  1055. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_copy_assignable.h +60 -0
  1056. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_copy_constructible.h +43 -0
  1057. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_default_constructible.h +54 -0
  1058. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_destructible.h +82 -0
  1059. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_move_assignable.h +60 -0
  1060. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_move_constructible.h +42 -0
  1061. cuda/cccl/headers/include/cuda/std/__type_traits/is_null_pointer.h +43 -0
  1062. cuda/cccl/headers/include/cuda/std/__type_traits/is_object.h +57 -0
  1063. cuda/cccl/headers/include/cuda/std/__type_traits/is_one_of.h +37 -0
  1064. cuda/cccl/headers/include/cuda/std/__type_traits/is_pod.h +42 -0
  1065. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer.h +60 -0
  1066. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer_interconvertible_base_of.h +84 -0
  1067. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer_interconvertible_with_class.h +42 -0
  1068. cuda/cccl/headers/include/cuda/std/__type_traits/is_polymorphic.h +42 -0
  1069. cuda/cccl/headers/include/cuda/std/__type_traits/is_primary_template.h +121 -0
  1070. cuda/cccl/headers/include/cuda/std/__type_traits/is_reference.h +95 -0
  1071. cuda/cccl/headers/include/cuda/std/__type_traits/is_reference_wrapper.h +50 -0
  1072. cuda/cccl/headers/include/cuda/std/__type_traits/is_referenceable.h +55 -0
  1073. cuda/cccl/headers/include/cuda/std/__type_traits/is_same.h +88 -0
  1074. cuda/cccl/headers/include/cuda/std/__type_traits/is_scalar.h +60 -0
  1075. cuda/cccl/headers/include/cuda/std/__type_traits/is_scoped_enum.h +49 -0
  1076. cuda/cccl/headers/include/cuda/std/__type_traits/is_signed.h +65 -0
  1077. cuda/cccl/headers/include/cuda/std/__type_traits/is_signed_integer.h +59 -0
  1078. cuda/cccl/headers/include/cuda/std/__type_traits/is_standard_layout.h +42 -0
  1079. cuda/cccl/headers/include/cuda/std/__type_traits/is_swappable.h +202 -0
  1080. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivial.h +42 -0
  1081. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_assignable.h +43 -0
  1082. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_constructible.h +43 -0
  1083. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_assignable.h +46 -0
  1084. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_constructible.h +45 -0
  1085. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copyable.h +42 -0
  1086. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_default_constructible.h +42 -0
  1087. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_destructible.h +58 -0
  1088. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_assignable.h +45 -0
  1089. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_constructible.h +44 -0
  1090. cuda/cccl/headers/include/cuda/std/__type_traits/is_unbounded_array.h +43 -0
  1091. cuda/cccl/headers/include/cuda/std/__type_traits/is_union.h +42 -0
  1092. cuda/cccl/headers/include/cuda/std/__type_traits/is_unsigned.h +66 -0
  1093. cuda/cccl/headers/include/cuda/std/__type_traits/is_unsigned_integer.h +59 -0
  1094. cuda/cccl/headers/include/cuda/std/__type_traits/is_valid_expansion.h +41 -0
  1095. cuda/cccl/headers/include/cuda/std/__type_traits/is_void.h +55 -0
  1096. cuda/cccl/headers/include/cuda/std/__type_traits/is_volatile.h +56 -0
  1097. cuda/cccl/headers/include/cuda/std/__type_traits/lazy.h +35 -0
  1098. cuda/cccl/headers/include/cuda/std/__type_traits/make_const_lvalue_ref.h +36 -0
  1099. cuda/cccl/headers/include/cuda/std/__type_traits/make_nbit_int.h +107 -0
  1100. cuda/cccl/headers/include/cuda/std/__type_traits/make_signed.h +140 -0
  1101. cuda/cccl/headers/include/cuda/std/__type_traits/make_unsigned.h +151 -0
  1102. cuda/cccl/headers/include/cuda/std/__type_traits/maybe_const.h +36 -0
  1103. cuda/cccl/headers/include/cuda/std/__type_traits/nat.h +39 -0
  1104. cuda/cccl/headers/include/cuda/std/__type_traits/negation.h +44 -0
  1105. cuda/cccl/headers/include/cuda/std/__type_traits/num_bits.h +122 -0
  1106. cuda/cccl/headers/include/cuda/std/__type_traits/promote.h +163 -0
  1107. cuda/cccl/headers/include/cuda/std/__type_traits/rank.h +60 -0
  1108. cuda/cccl/headers/include/cuda/std/__type_traits/reference_constructs_from_temporary.h +57 -0
  1109. cuda/cccl/headers/include/cuda/std/__type_traits/reference_converts_from_temporary.h +56 -0
  1110. cuda/cccl/headers/include/cuda/std/__type_traits/remove_all_extents.h +66 -0
  1111. cuda/cccl/headers/include/cuda/std/__type_traits/remove_const.h +59 -0
  1112. cuda/cccl/headers/include/cuda/std/__type_traits/remove_const_ref.h +37 -0
  1113. cuda/cccl/headers/include/cuda/std/__type_traits/remove_cv.h +57 -0
  1114. cuda/cccl/headers/include/cuda/std/__type_traits/remove_cvref.h +57 -0
  1115. cuda/cccl/headers/include/cuda/std/__type_traits/remove_extent.h +65 -0
  1116. cuda/cccl/headers/include/cuda/std/__type_traits/remove_pointer.h +73 -0
  1117. cuda/cccl/headers/include/cuda/std/__type_traits/remove_reference.h +72 -0
  1118. cuda/cccl/headers/include/cuda/std/__type_traits/remove_volatile.h +58 -0
  1119. cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +47 -0
  1120. cuda/cccl/headers/include/cuda/std/__type_traits/type_identity.h +40 -0
  1121. cuda/cccl/headers/include/cuda/std/__type_traits/type_list.h +1067 -0
  1122. cuda/cccl/headers/include/cuda/std/__type_traits/type_set.h +131 -0
  1123. cuda/cccl/headers/include/cuda/std/__type_traits/underlying_type.h +52 -0
  1124. cuda/cccl/headers/include/cuda/std/__type_traits/void_t.h +34 -0
  1125. cuda/cccl/headers/include/cuda/std/__utility/as_const.h +52 -0
  1126. cuda/cccl/headers/include/cuda/std/__utility/auto_cast.h +34 -0
  1127. cuda/cccl/headers/include/cuda/std/__utility/cmp.h +116 -0
  1128. cuda/cccl/headers/include/cuda/std/__utility/convert_to_integral.h +101 -0
  1129. cuda/cccl/headers/include/cuda/std/__utility/declval.h +76 -0
  1130. cuda/cccl/headers/include/cuda/std/__utility/exception_guard.h +161 -0
  1131. cuda/cccl/headers/include/cuda/std/__utility/exchange.h +46 -0
  1132. cuda/cccl/headers/include/cuda/std/__utility/forward.h +59 -0
  1133. cuda/cccl/headers/include/cuda/std/__utility/forward_like.h +55 -0
  1134. cuda/cccl/headers/include/cuda/std/__utility/in_place.h +86 -0
  1135. cuda/cccl/headers/include/cuda/std/__utility/integer_sequence.h +251 -0
  1136. cuda/cccl/headers/include/cuda/std/__utility/monostate.h +99 -0
  1137. cuda/cccl/headers/include/cuda/std/__utility/move.h +74 -0
  1138. cuda/cccl/headers/include/cuda/std/__utility/pair.h +791 -0
  1139. cuda/cccl/headers/include/cuda/std/__utility/piecewise_construct.h +37 -0
  1140. cuda/cccl/headers/include/cuda/std/__utility/pod_tuple.h +527 -0
  1141. cuda/cccl/headers/include/cuda/std/__utility/priority_tag.h +40 -0
  1142. cuda/cccl/headers/include/cuda/std/__utility/rel_ops.h +63 -0
  1143. cuda/cccl/headers/include/cuda/std/__utility/swap.h +64 -0
  1144. cuda/cccl/headers/include/cuda/std/__utility/to_underlying.h +40 -0
  1145. cuda/cccl/headers/include/cuda/std/__utility/typeid.h +421 -0
  1146. cuda/cccl/headers/include/cuda/std/__utility/undefined.h +34 -0
  1147. cuda/cccl/headers/include/cuda/std/__utility/unreachable.h +37 -0
  1148. cuda/cccl/headers/include/cuda/std/array +518 -0
  1149. cuda/cccl/headers/include/cuda/std/atomic +810 -0
  1150. cuda/cccl/headers/include/cuda/std/barrier +42 -0
  1151. cuda/cccl/headers/include/cuda/std/bit +35 -0
  1152. cuda/cccl/headers/include/cuda/std/bitset +994 -0
  1153. cuda/cccl/headers/include/cuda/std/cassert +28 -0
  1154. cuda/cccl/headers/include/cuda/std/ccomplex +15 -0
  1155. cuda/cccl/headers/include/cuda/std/cfloat +59 -0
  1156. cuda/cccl/headers/include/cuda/std/chrono +26 -0
  1157. cuda/cccl/headers/include/cuda/std/climits +61 -0
  1158. cuda/cccl/headers/include/cuda/std/cmath +87 -0
  1159. cuda/cccl/headers/include/cuda/std/complex +50 -0
  1160. cuda/cccl/headers/include/cuda/std/concepts +48 -0
  1161. cuda/cccl/headers/include/cuda/std/cstddef +28 -0
  1162. cuda/cccl/headers/include/cuda/std/cstdint +178 -0
  1163. cuda/cccl/headers/include/cuda/std/cstdlib +30 -0
  1164. cuda/cccl/headers/include/cuda/std/cstring +110 -0
  1165. cuda/cccl/headers/include/cuda/std/ctime +154 -0
  1166. cuda/cccl/headers/include/cuda/std/detail/__config +45 -0
  1167. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +207 -0
  1168. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/algorithm +1721 -0
  1169. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/chrono +2509 -0
  1170. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/iosfwd +128 -0
  1171. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/stdexcept +120 -0
  1172. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/tuple +1365 -0
  1173. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/variant +2144 -0
  1174. cuda/cccl/headers/include/cuda/std/execution +29 -0
  1175. cuda/cccl/headers/include/cuda/std/expected +30 -0
  1176. cuda/cccl/headers/include/cuda/std/functional +56 -0
  1177. cuda/cccl/headers/include/cuda/std/initializer_list +44 -0
  1178. cuda/cccl/headers/include/cuda/std/inplace_vector +2170 -0
  1179. cuda/cccl/headers/include/cuda/std/iterator +70 -0
  1180. cuda/cccl/headers/include/cuda/std/latch +34 -0
  1181. cuda/cccl/headers/include/cuda/std/limits +28 -0
  1182. cuda/cccl/headers/include/cuda/std/linalg +30 -0
  1183. cuda/cccl/headers/include/cuda/std/mdspan +38 -0
  1184. cuda/cccl/headers/include/cuda/std/memory +39 -0
  1185. cuda/cccl/headers/include/cuda/std/numbers +346 -0
  1186. cuda/cccl/headers/include/cuda/std/numeric +41 -0
  1187. cuda/cccl/headers/include/cuda/std/optional +31 -0
  1188. cuda/cccl/headers/include/cuda/std/ranges +69 -0
  1189. cuda/cccl/headers/include/cuda/std/ratio +416 -0
  1190. cuda/cccl/headers/include/cuda/std/semaphore +31 -0
  1191. cuda/cccl/headers/include/cuda/std/source_location +83 -0
  1192. cuda/cccl/headers/include/cuda/std/span +628 -0
  1193. cuda/cccl/headers/include/cuda/std/string_view +925 -0
  1194. cuda/cccl/headers/include/cuda/std/tuple +26 -0
  1195. cuda/cccl/headers/include/cuda/std/type_traits +177 -0
  1196. cuda/cccl/headers/include/cuda/std/utility +70 -0
  1197. cuda/cccl/headers/include/cuda/std/variant +25 -0
  1198. cuda/cccl/headers/include/cuda/std/version +240 -0
  1199. cuda/cccl/headers/include/cuda/stream +31 -0
  1200. cuda/cccl/headers/include/cuda/stream_ref +59 -0
  1201. cuda/cccl/headers/include/cuda/type_traits +27 -0
  1202. cuda/cccl/headers/include/cuda/utility +28 -0
  1203. cuda/cccl/headers/include/cuda/version +16 -0
  1204. cuda/cccl/headers/include/cuda/warp +28 -0
  1205. cuda/cccl/headers/include/cuda/work_stealing +26 -0
  1206. cuda/cccl/headers/include/nv/detail/__preprocessor +169 -0
  1207. cuda/cccl/headers/include/nv/detail/__target_macros +718 -0
  1208. cuda/cccl/headers/include/nv/target +240 -0
  1209. cuda/cccl/headers/include/thrust/addressof.h +22 -0
  1210. cuda/cccl/headers/include/thrust/adjacent_difference.h +254 -0
  1211. cuda/cccl/headers/include/thrust/advance.h +57 -0
  1212. cuda/cccl/headers/include/thrust/allocate_unique.h +299 -0
  1213. cuda/cccl/headers/include/thrust/binary_search.h +1910 -0
  1214. cuda/cccl/headers/include/thrust/complex.h +858 -0
  1215. cuda/cccl/headers/include/thrust/copy.h +506 -0
  1216. cuda/cccl/headers/include/thrust/count.h +245 -0
  1217. cuda/cccl/headers/include/thrust/detail/adjacent_difference.inl +95 -0
  1218. cuda/cccl/headers/include/thrust/detail/alignment.h +81 -0
  1219. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +626 -0
  1220. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +192 -0
  1221. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +96 -0
  1222. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +81 -0
  1223. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +78 -0
  1224. cuda/cccl/headers/include/thrust/detail/allocator/no_throw_allocator.h +76 -0
  1225. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +115 -0
  1226. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +116 -0
  1227. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.h +77 -0
  1228. cuda/cccl/headers/include/thrust/detail/allocator_aware_execution_policy.h +99 -0
  1229. cuda/cccl/headers/include/thrust/detail/binary_search.inl +525 -0
  1230. cuda/cccl/headers/include/thrust/detail/caching_allocator.h +47 -0
  1231. cuda/cccl/headers/include/thrust/detail/complex/arithmetic.h +255 -0
  1232. cuda/cccl/headers/include/thrust/detail/complex/c99math.h +64 -0
  1233. cuda/cccl/headers/include/thrust/detail/complex/catrig.h +875 -0
  1234. cuda/cccl/headers/include/thrust/detail/complex/catrigf.h +589 -0
  1235. cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +233 -0
  1236. cuda/cccl/headers/include/thrust/detail/complex/ccoshf.h +161 -0
  1237. cuda/cccl/headers/include/thrust/detail/complex/cexp.h +195 -0
  1238. cuda/cccl/headers/include/thrust/detail/complex/cexpf.h +173 -0
  1239. cuda/cccl/headers/include/thrust/detail/complex/clog.h +223 -0
  1240. cuda/cccl/headers/include/thrust/detail/complex/clogf.h +210 -0
  1241. cuda/cccl/headers/include/thrust/detail/complex/complex.inl +263 -0
  1242. cuda/cccl/headers/include/thrust/detail/complex/cpow.h +50 -0
  1243. cuda/cccl/headers/include/thrust/detail/complex/cproj.h +81 -0
  1244. cuda/cccl/headers/include/thrust/detail/complex/csinh.h +228 -0
  1245. cuda/cccl/headers/include/thrust/detail/complex/csinhf.h +168 -0
  1246. cuda/cccl/headers/include/thrust/detail/complex/csqrt.h +178 -0
  1247. cuda/cccl/headers/include/thrust/detail/complex/csqrtf.h +174 -0
  1248. cuda/cccl/headers/include/thrust/detail/complex/ctanh.h +208 -0
  1249. cuda/cccl/headers/include/thrust/detail/complex/ctanhf.h +133 -0
  1250. cuda/cccl/headers/include/thrust/detail/complex/math_private.h +138 -0
  1251. cuda/cccl/headers/include/thrust/detail/complex/stream.h +73 -0
  1252. cuda/cccl/headers/include/thrust/detail/config/compiler.h +38 -0
  1253. cuda/cccl/headers/include/thrust/detail/config/config.h +43 -0
  1254. cuda/cccl/headers/include/thrust/detail/config/cpp_dialect.h +78 -0
  1255. cuda/cccl/headers/include/thrust/detail/config/device_system.h +55 -0
  1256. cuda/cccl/headers/include/thrust/detail/config/host_system.h +48 -0
  1257. cuda/cccl/headers/include/thrust/detail/config/memory_resource.h +41 -0
  1258. cuda/cccl/headers/include/thrust/detail/config/namespace.h +162 -0
  1259. cuda/cccl/headers/include/thrust/detail/config/simple_defines.h +48 -0
  1260. cuda/cccl/headers/include/thrust/detail/config.h +36 -0
  1261. cuda/cccl/headers/include/thrust/detail/contiguous_storage.h +228 -0
  1262. cuda/cccl/headers/include/thrust/detail/contiguous_storage.inl +273 -0
  1263. cuda/cccl/headers/include/thrust/detail/copy.h +72 -0
  1264. cuda/cccl/headers/include/thrust/detail/copy.inl +129 -0
  1265. cuda/cccl/headers/include/thrust/detail/copy_if.h +62 -0
  1266. cuda/cccl/headers/include/thrust/detail/copy_if.inl +102 -0
  1267. cuda/cccl/headers/include/thrust/detail/count.h +55 -0
  1268. cuda/cccl/headers/include/thrust/detail/count.inl +89 -0
  1269. cuda/cccl/headers/include/thrust/detail/device_ptr.inl +48 -0
  1270. cuda/cccl/headers/include/thrust/detail/equal.inl +93 -0
  1271. cuda/cccl/headers/include/thrust/detail/event_error.h +160 -0
  1272. cuda/cccl/headers/include/thrust/detail/execute_with_allocator.h +81 -0
  1273. cuda/cccl/headers/include/thrust/detail/execute_with_allocator_fwd.h +61 -0
  1274. cuda/cccl/headers/include/thrust/detail/execution_policy.h +120 -0
  1275. cuda/cccl/headers/include/thrust/detail/extrema.inl +184 -0
  1276. cuda/cccl/headers/include/thrust/detail/fill.inl +86 -0
  1277. cuda/cccl/headers/include/thrust/detail/find.inl +113 -0
  1278. cuda/cccl/headers/include/thrust/detail/for_each.inl +84 -0
  1279. cuda/cccl/headers/include/thrust/detail/function.h +49 -0
  1280. cuda/cccl/headers/include/thrust/detail/functional/actor.h +214 -0
  1281. cuda/cccl/headers/include/thrust/detail/functional/operators.h +386 -0
  1282. cuda/cccl/headers/include/thrust/detail/gather.inl +173 -0
  1283. cuda/cccl/headers/include/thrust/detail/generate.inl +86 -0
  1284. cuda/cccl/headers/include/thrust/detail/get_iterator_value.h +62 -0
  1285. cuda/cccl/headers/include/thrust/detail/inner_product.inl +118 -0
  1286. cuda/cccl/headers/include/thrust/detail/internal_functional.h +328 -0
  1287. cuda/cccl/headers/include/thrust/detail/logical.inl +113 -0
  1288. cuda/cccl/headers/include/thrust/detail/malloc_and_free.h +77 -0
  1289. cuda/cccl/headers/include/thrust/detail/malloc_and_free_fwd.h +45 -0
  1290. cuda/cccl/headers/include/thrust/detail/memory_algorithms.h +209 -0
  1291. cuda/cccl/headers/include/thrust/detail/merge.inl +276 -0
  1292. cuda/cccl/headers/include/thrust/detail/mismatch.inl +94 -0
  1293. cuda/cccl/headers/include/thrust/detail/overlapped_copy.h +124 -0
  1294. cuda/cccl/headers/include/thrust/detail/partition.inl +378 -0
  1295. cuda/cccl/headers/include/thrust/detail/pointer.h +309 -0
  1296. cuda/cccl/headers/include/thrust/detail/preprocessor.h +652 -0
  1297. cuda/cccl/headers/include/thrust/detail/random_bijection.h +177 -0
  1298. cuda/cccl/headers/include/thrust/detail/range/head_flags.h +116 -0
  1299. cuda/cccl/headers/include/thrust/detail/range/tail_flags.h +130 -0
  1300. cuda/cccl/headers/include/thrust/detail/raw_pointer_cast.h +52 -0
  1301. cuda/cccl/headers/include/thrust/detail/raw_reference_cast.h +192 -0
  1302. cuda/cccl/headers/include/thrust/detail/reduce.inl +377 -0
  1303. cuda/cccl/headers/include/thrust/detail/reference.h +494 -0
  1304. cuda/cccl/headers/include/thrust/detail/reference_forward_declaration.h +35 -0
  1305. cuda/cccl/headers/include/thrust/detail/remove.inl +213 -0
  1306. cuda/cccl/headers/include/thrust/detail/replace.inl +231 -0
  1307. cuda/cccl/headers/include/thrust/detail/reverse.inl +88 -0
  1308. cuda/cccl/headers/include/thrust/detail/scan.inl +518 -0
  1309. cuda/cccl/headers/include/thrust/detail/scatter.inl +157 -0
  1310. cuda/cccl/headers/include/thrust/detail/seq.h +66 -0
  1311. cuda/cccl/headers/include/thrust/detail/sequence.inl +109 -0
  1312. cuda/cccl/headers/include/thrust/detail/set_operations.inl +981 -0
  1313. cuda/cccl/headers/include/thrust/detail/shuffle.inl +86 -0
  1314. cuda/cccl/headers/include/thrust/detail/sort.inl +373 -0
  1315. cuda/cccl/headers/include/thrust/detail/static_assert.h +58 -0
  1316. cuda/cccl/headers/include/thrust/detail/static_map.h +167 -0
  1317. cuda/cccl/headers/include/thrust/detail/swap_ranges.inl +65 -0
  1318. cuda/cccl/headers/include/thrust/detail/tabulate.inl +62 -0
  1319. cuda/cccl/headers/include/thrust/detail/temporary_array.h +153 -0
  1320. cuda/cccl/headers/include/thrust/detail/temporary_array.inl +120 -0
  1321. cuda/cccl/headers/include/thrust/detail/temporary_buffer.h +81 -0
  1322. cuda/cccl/headers/include/thrust/detail/transform_reduce.inl +69 -0
  1323. cuda/cccl/headers/include/thrust/detail/transform_scan.inl +161 -0
  1324. cuda/cccl/headers/include/thrust/detail/trivial_sequence.h +130 -0
  1325. cuda/cccl/headers/include/thrust/detail/tuple_meta_transform.h +61 -0
  1326. cuda/cccl/headers/include/thrust/detail/type_deduction.h +62 -0
  1327. cuda/cccl/headers/include/thrust/detail/type_traits/has_member_function.h +47 -0
  1328. cuda/cccl/headers/include/thrust/detail/type_traits/has_nested_type.h +43 -0
  1329. cuda/cccl/headers/include/thrust/detail/type_traits/is_call_possible.h +167 -0
  1330. cuda/cccl/headers/include/thrust/detail/type_traits/is_commutative.h +69 -0
  1331. cuda/cccl/headers/include/thrust/detail/type_traits/is_metafunction_defined.h +39 -0
  1332. cuda/cccl/headers/include/thrust/detail/type_traits/is_thrust_pointer.h +59 -0
  1333. cuda/cccl/headers/include/thrust/detail/type_traits/iterator/is_output_iterator.h +46 -0
  1334. cuda/cccl/headers/include/thrust/detail/type_traits/minimum_type.h +89 -0
  1335. cuda/cccl/headers/include/thrust/detail/type_traits/pointer_traits.h +332 -0
  1336. cuda/cccl/headers/include/thrust/detail/type_traits.h +136 -0
  1337. cuda/cccl/headers/include/thrust/detail/uninitialized_copy.inl +90 -0
  1338. cuda/cccl/headers/include/thrust/detail/uninitialized_fill.inl +86 -0
  1339. cuda/cccl/headers/include/thrust/detail/unique.inl +373 -0
  1340. cuda/cccl/headers/include/thrust/detail/use_default.h +34 -0
  1341. cuda/cccl/headers/include/thrust/detail/vector_base.h +613 -0
  1342. cuda/cccl/headers/include/thrust/detail/vector_base.inl +1210 -0
  1343. cuda/cccl/headers/include/thrust/device_allocator.h +134 -0
  1344. cuda/cccl/headers/include/thrust/device_delete.h +74 -0
  1345. cuda/cccl/headers/include/thrust/device_free.h +85 -0
  1346. cuda/cccl/headers/include/thrust/device_make_unique.h +56 -0
  1347. cuda/cccl/headers/include/thrust/device_malloc.h +84 -0
  1348. cuda/cccl/headers/include/thrust/device_malloc_allocator.h +190 -0
  1349. cuda/cccl/headers/include/thrust/device_new.h +112 -0
  1350. cuda/cccl/headers/include/thrust/device_new_allocator.h +179 -0
  1351. cuda/cccl/headers/include/thrust/device_ptr.h +196 -0
  1352. cuda/cccl/headers/include/thrust/device_reference.h +983 -0
  1353. cuda/cccl/headers/include/thrust/device_vector.h +576 -0
  1354. cuda/cccl/headers/include/thrust/distance.h +43 -0
  1355. cuda/cccl/headers/include/thrust/equal.h +247 -0
  1356. cuda/cccl/headers/include/thrust/execution_policy.h +251 -0
  1357. cuda/cccl/headers/include/thrust/extrema.h +657 -0
  1358. cuda/cccl/headers/include/thrust/fill.h +200 -0
  1359. cuda/cccl/headers/include/thrust/find.h +382 -0
  1360. cuda/cccl/headers/include/thrust/for_each.h +261 -0
  1361. cuda/cccl/headers/include/thrust/functional.h +395 -0
  1362. cuda/cccl/headers/include/thrust/gather.h +464 -0
  1363. cuda/cccl/headers/include/thrust/generate.h +193 -0
  1364. cuda/cccl/headers/include/thrust/host_vector.h +576 -0
  1365. cuda/cccl/headers/include/thrust/inner_product.h +264 -0
  1366. cuda/cccl/headers/include/thrust/iterator/constant_iterator.h +221 -0
  1367. cuda/cccl/headers/include/thrust/iterator/counting_iterator.h +335 -0
  1368. cuda/cccl/headers/include/thrust/iterator/detail/any_assign.h +48 -0
  1369. cuda/cccl/headers/include/thrust/iterator/detail/any_system_tag.h +43 -0
  1370. cuda/cccl/headers/include/thrust/iterator/detail/device_system_tag.h +38 -0
  1371. cuda/cccl/headers/include/thrust/iterator/detail/host_system_tag.h +38 -0
  1372. cuda/cccl/headers/include/thrust/iterator/detail/iterator_adaptor_base.h +81 -0
  1373. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_system.h +60 -0
  1374. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_traversal.h +65 -0
  1375. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h +57 -0
  1376. cuda/cccl/headers/include/thrust/iterator/detail/iterator_facade_category.h +182 -0
  1377. cuda/cccl/headers/include/thrust/iterator/detail/minimum_system.h +58 -0
  1378. cuda/cccl/headers/include/thrust/iterator/detail/normal_iterator.h +69 -0
  1379. cuda/cccl/headers/include/thrust/iterator/detail/retag.h +104 -0
  1380. cuda/cccl/headers/include/thrust/iterator/detail/tagged_iterator.h +81 -0
  1381. cuda/cccl/headers/include/thrust/iterator/detail/tuple_of_iterator_references.h +174 -0
  1382. cuda/cccl/headers/include/thrust/iterator/discard_iterator.h +163 -0
  1383. cuda/cccl/headers/include/thrust/iterator/iterator_adaptor.h +251 -0
  1384. cuda/cccl/headers/include/thrust/iterator/iterator_categories.h +211 -0
  1385. cuda/cccl/headers/include/thrust/iterator/iterator_facade.h +659 -0
  1386. cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +334 -0
  1387. cuda/cccl/headers/include/thrust/iterator/iterator_traversal_tags.h +64 -0
  1388. cuda/cccl/headers/include/thrust/iterator/offset_iterator.h +194 -0
  1389. cuda/cccl/headers/include/thrust/iterator/permutation_iterator.h +204 -0
  1390. cuda/cccl/headers/include/thrust/iterator/retag.h +72 -0
  1391. cuda/cccl/headers/include/thrust/iterator/reverse_iterator.h +51 -0
  1392. cuda/cccl/headers/include/thrust/iterator/shuffle_iterator.h +185 -0
  1393. cuda/cccl/headers/include/thrust/iterator/strided_iterator.h +152 -0
  1394. cuda/cccl/headers/include/thrust/iterator/tabulate_output_iterator.h +152 -0
  1395. cuda/cccl/headers/include/thrust/iterator/transform_input_output_iterator.h +226 -0
  1396. cuda/cccl/headers/include/thrust/iterator/transform_iterator.h +351 -0
  1397. cuda/cccl/headers/include/thrust/iterator/transform_output_iterator.h +190 -0
  1398. cuda/cccl/headers/include/thrust/iterator/zip_iterator.h +359 -0
  1399. cuda/cccl/headers/include/thrust/logical.h +290 -0
  1400. cuda/cccl/headers/include/thrust/memory.h +299 -0
  1401. cuda/cccl/headers/include/thrust/merge.h +725 -0
  1402. cuda/cccl/headers/include/thrust/mismatch.h +261 -0
  1403. cuda/cccl/headers/include/thrust/mr/allocator.h +229 -0
  1404. cuda/cccl/headers/include/thrust/mr/device_memory_resource.h +41 -0
  1405. cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +528 -0
  1406. cuda/cccl/headers/include/thrust/mr/disjoint_sync_pool.h +118 -0
  1407. cuda/cccl/headers/include/thrust/mr/disjoint_tls_pool.h +67 -0
  1408. cuda/cccl/headers/include/thrust/mr/fancy_pointer_resource.h +67 -0
  1409. cuda/cccl/headers/include/thrust/mr/host_memory_resource.h +38 -0
  1410. cuda/cccl/headers/include/thrust/mr/memory_resource.h +217 -0
  1411. cuda/cccl/headers/include/thrust/mr/new.h +100 -0
  1412. cuda/cccl/headers/include/thrust/mr/polymorphic_adaptor.h +63 -0
  1413. cuda/cccl/headers/include/thrust/mr/pool.h +528 -0
  1414. cuda/cccl/headers/include/thrust/mr/pool_options.h +174 -0
  1415. cuda/cccl/headers/include/thrust/mr/sync_pool.h +114 -0
  1416. cuda/cccl/headers/include/thrust/mr/tls_pool.h +64 -0
  1417. cuda/cccl/headers/include/thrust/mr/universal_memory_resource.h +29 -0
  1418. cuda/cccl/headers/include/thrust/mr/validator.h +56 -0
  1419. cuda/cccl/headers/include/thrust/pair.h +99 -0
  1420. cuda/cccl/headers/include/thrust/partition.h +1391 -0
  1421. cuda/cccl/headers/include/thrust/per_device_resource.h +98 -0
  1422. cuda/cccl/headers/include/thrust/random/detail/discard_block_engine.inl +184 -0
  1423. cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine.inl +155 -0
  1424. cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine_discard.h +104 -0
  1425. cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine.inl +151 -0
  1426. cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h +53 -0
  1427. cuda/cccl/headers/include/thrust/random/detail/mod.h +101 -0
  1428. cuda/cccl/headers/include/thrust/random/detail/normal_distribution.inl +187 -0
  1429. cuda/cccl/headers/include/thrust/random/detail/normal_distribution_base.h +160 -0
  1430. cuda/cccl/headers/include/thrust/random/detail/random_core_access.h +63 -0
  1431. cuda/cccl/headers/include/thrust/random/detail/subtract_with_carry_engine.inl +201 -0
  1432. cuda/cccl/headers/include/thrust/random/detail/uniform_int_distribution.inl +198 -0
  1433. cuda/cccl/headers/include/thrust/random/detail/uniform_real_distribution.inl +200 -0
  1434. cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine.inl +183 -0
  1435. cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine_max.h +187 -0
  1436. cuda/cccl/headers/include/thrust/random/discard_block_engine.h +240 -0
  1437. cuda/cccl/headers/include/thrust/random/linear_congruential_engine.h +289 -0
  1438. cuda/cccl/headers/include/thrust/random/linear_feedback_shift_engine.h +217 -0
  1439. cuda/cccl/headers/include/thrust/random/normal_distribution.h +257 -0
  1440. cuda/cccl/headers/include/thrust/random/subtract_with_carry_engine.h +247 -0
  1441. cuda/cccl/headers/include/thrust/random/uniform_int_distribution.h +261 -0
  1442. cuda/cccl/headers/include/thrust/random/uniform_real_distribution.h +258 -0
  1443. cuda/cccl/headers/include/thrust/random/xor_combine_engine.h +255 -0
  1444. cuda/cccl/headers/include/thrust/random.h +120 -0
  1445. cuda/cccl/headers/include/thrust/reduce.h +1113 -0
  1446. cuda/cccl/headers/include/thrust/remove.h +768 -0
  1447. cuda/cccl/headers/include/thrust/replace.h +826 -0
  1448. cuda/cccl/headers/include/thrust/reverse.h +215 -0
  1449. cuda/cccl/headers/include/thrust/scan.h +1671 -0
  1450. cuda/cccl/headers/include/thrust/scatter.h +446 -0
  1451. cuda/cccl/headers/include/thrust/sequence.h +277 -0
  1452. cuda/cccl/headers/include/thrust/set_operations.h +3026 -0
  1453. cuda/cccl/headers/include/thrust/shuffle.h +182 -0
  1454. cuda/cccl/headers/include/thrust/sort.h +1320 -0
  1455. cuda/cccl/headers/include/thrust/swap.h +147 -0
  1456. cuda/cccl/headers/include/thrust/system/cpp/detail/adjacent_difference.h +30 -0
  1457. cuda/cccl/headers/include/thrust/system/cpp/detail/assign_value.h +30 -0
  1458. cuda/cccl/headers/include/thrust/system/cpp/detail/binary_search.h +32 -0
  1459. cuda/cccl/headers/include/thrust/system/cpp/detail/copy.h +30 -0
  1460. cuda/cccl/headers/include/thrust/system/cpp/detail/copy_if.h +30 -0
  1461. cuda/cccl/headers/include/thrust/system/cpp/detail/count.h +29 -0
  1462. cuda/cccl/headers/include/thrust/system/cpp/detail/equal.h +29 -0
  1463. cuda/cccl/headers/include/thrust/system/cpp/detail/execution_policy.h +109 -0
  1464. cuda/cccl/headers/include/thrust/system/cpp/detail/extrema.h +30 -0
  1465. cuda/cccl/headers/include/thrust/system/cpp/detail/fill.h +29 -0
  1466. cuda/cccl/headers/include/thrust/system/cpp/detail/find.h +30 -0
  1467. cuda/cccl/headers/include/thrust/system/cpp/detail/for_each.h +30 -0
  1468. cuda/cccl/headers/include/thrust/system/cpp/detail/gather.h +29 -0
  1469. cuda/cccl/headers/include/thrust/system/cpp/detail/generate.h +29 -0
  1470. cuda/cccl/headers/include/thrust/system/cpp/detail/get_value.h +30 -0
  1471. cuda/cccl/headers/include/thrust/system/cpp/detail/inner_product.h +29 -0
  1472. cuda/cccl/headers/include/thrust/system/cpp/detail/iter_swap.h +30 -0
  1473. cuda/cccl/headers/include/thrust/system/cpp/detail/logical.h +29 -0
  1474. cuda/cccl/headers/include/thrust/system/cpp/detail/malloc_and_free.h +30 -0
  1475. cuda/cccl/headers/include/thrust/system/cpp/detail/memory.inl +60 -0
  1476. cuda/cccl/headers/include/thrust/system/cpp/detail/merge.h +30 -0
  1477. cuda/cccl/headers/include/thrust/system/cpp/detail/mismatch.h +29 -0
  1478. cuda/cccl/headers/include/thrust/system/cpp/detail/partition.h +30 -0
  1479. cuda/cccl/headers/include/thrust/system/cpp/detail/per_device_resource.h +29 -0
  1480. cuda/cccl/headers/include/thrust/system/cpp/detail/reduce.h +30 -0
  1481. cuda/cccl/headers/include/thrust/system/cpp/detail/reduce_by_key.h +30 -0
  1482. cuda/cccl/headers/include/thrust/system/cpp/detail/remove.h +30 -0
  1483. cuda/cccl/headers/include/thrust/system/cpp/detail/replace.h +29 -0
  1484. cuda/cccl/headers/include/thrust/system/cpp/detail/reverse.h +29 -0
  1485. cuda/cccl/headers/include/thrust/system/cpp/detail/scan.h +30 -0
  1486. cuda/cccl/headers/include/thrust/system/cpp/detail/scan_by_key.h +30 -0
  1487. cuda/cccl/headers/include/thrust/system/cpp/detail/scatter.h +29 -0
  1488. cuda/cccl/headers/include/thrust/system/cpp/detail/sequence.h +29 -0
  1489. cuda/cccl/headers/include/thrust/system/cpp/detail/set_operations.h +30 -0
  1490. cuda/cccl/headers/include/thrust/system/cpp/detail/sort.h +30 -0
  1491. cuda/cccl/headers/include/thrust/system/cpp/detail/swap_ranges.h +29 -0
  1492. cuda/cccl/headers/include/thrust/system/cpp/detail/tabulate.h +29 -0
  1493. cuda/cccl/headers/include/thrust/system/cpp/detail/temporary_buffer.h +29 -0
  1494. cuda/cccl/headers/include/thrust/system/cpp/detail/transform.h +29 -0
  1495. cuda/cccl/headers/include/thrust/system/cpp/detail/transform_reduce.h +29 -0
  1496. cuda/cccl/headers/include/thrust/system/cpp/detail/transform_scan.h +29 -0
  1497. cuda/cccl/headers/include/thrust/system/cpp/detail/uninitialized_copy.h +29 -0
  1498. cuda/cccl/headers/include/thrust/system/cpp/detail/uninitialized_fill.h +29 -0
  1499. cuda/cccl/headers/include/thrust/system/cpp/detail/unique.h +30 -0
  1500. cuda/cccl/headers/include/thrust/system/cpp/detail/unique_by_key.h +30 -0
  1501. cuda/cccl/headers/include/thrust/system/cpp/execution_policy.h +63 -0
  1502. cuda/cccl/headers/include/thrust/system/cpp/memory.h +106 -0
  1503. cuda/cccl/headers/include/thrust/system/cpp/memory_resource.h +72 -0
  1504. cuda/cccl/headers/include/thrust/system/cpp/pointer.h +120 -0
  1505. cuda/cccl/headers/include/thrust/system/cpp/vector.h +96 -0
  1506. cuda/cccl/headers/include/thrust/system/cuda/config.h +126 -0
  1507. cuda/cccl/headers/include/thrust/system/cuda/detail/adjacent_difference.h +219 -0
  1508. cuda/cccl/headers/include/thrust/system/cuda/detail/assign_value.h +124 -0
  1509. cuda/cccl/headers/include/thrust/system/cuda/detail/binary_search.h +29 -0
  1510. cuda/cccl/headers/include/thrust/system/cuda/detail/cdp_dispatch.h +72 -0
  1511. cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +273 -0
  1512. cuda/cccl/headers/include/thrust/system/cuda/detail/copy_if.h +255 -0
  1513. cuda/cccl/headers/include/thrust/system/cuda/detail/core/agent_launcher.h +289 -0
  1514. cuda/cccl/headers/include/thrust/system/cuda/detail/core/triple_chevron_launch.h +191 -0
  1515. cuda/cccl/headers/include/thrust/system/cuda/detail/core/util.h +593 -0
  1516. cuda/cccl/headers/include/thrust/system/cuda/detail/count.h +75 -0
  1517. cuda/cccl/headers/include/thrust/system/cuda/detail/cross_system.h +243 -0
  1518. cuda/cccl/headers/include/thrust/system/cuda/detail/dispatch.h +233 -0
  1519. cuda/cccl/headers/include/thrust/system/cuda/detail/equal.h +64 -0
  1520. cuda/cccl/headers/include/thrust/system/cuda/detail/error.inl +96 -0
  1521. cuda/cccl/headers/include/thrust/system/cuda/detail/execution_policy.h +264 -0
  1522. cuda/cccl/headers/include/thrust/system/cuda/detail/extrema.h +476 -0
  1523. cuda/cccl/headers/include/thrust/system/cuda/detail/fill.h +100 -0
  1524. cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +170 -0
  1525. cuda/cccl/headers/include/thrust/system/cuda/detail/for_each.h +83 -0
  1526. cuda/cccl/headers/include/thrust/system/cuda/detail/gather.h +91 -0
  1527. cuda/cccl/headers/include/thrust/system/cuda/detail/generate.h +60 -0
  1528. cuda/cccl/headers/include/thrust/system/cuda/detail/get_value.h +65 -0
  1529. cuda/cccl/headers/include/thrust/system/cuda/detail/inner_product.h +75 -0
  1530. cuda/cccl/headers/include/thrust/system/cuda/detail/iter_swap.h +80 -0
  1531. cuda/cccl/headers/include/thrust/system/cuda/detail/logical.h +29 -0
  1532. cuda/cccl/headers/include/thrust/system/cuda/detail/make_unsigned_special.h +61 -0
  1533. cuda/cccl/headers/include/thrust/system/cuda/detail/malloc_and_free.h +121 -0
  1534. cuda/cccl/headers/include/thrust/system/cuda/detail/memory.inl +57 -0
  1535. cuda/cccl/headers/include/thrust/system/cuda/detail/merge.h +228 -0
  1536. cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +223 -0
  1537. cuda/cccl/headers/include/thrust/system/cuda/detail/parallel_for.h +81 -0
  1538. cuda/cccl/headers/include/thrust/system/cuda/detail/partition.h +405 -0
  1539. cuda/cccl/headers/include/thrust/system/cuda/detail/per_device_resource.h +72 -0
  1540. cuda/cccl/headers/include/thrust/system/cuda/detail/reduce.h +785 -0
  1541. cuda/cccl/headers/include/thrust/system/cuda/detail/reduce_by_key.h +1001 -0
  1542. cuda/cccl/headers/include/thrust/system/cuda/detail/remove.h +107 -0
  1543. cuda/cccl/headers/include/thrust/system/cuda/detail/replace.h +122 -0
  1544. cuda/cccl/headers/include/thrust/system/cuda/detail/reverse.h +87 -0
  1545. cuda/cccl/headers/include/thrust/system/cuda/detail/scan.h +341 -0
  1546. cuda/cccl/headers/include/thrust/system/cuda/detail/scan_by_key.h +414 -0
  1547. cuda/cccl/headers/include/thrust/system/cuda/detail/scatter.h +91 -0
  1548. cuda/cccl/headers/include/thrust/system/cuda/detail/sequence.h +29 -0
  1549. cuda/cccl/headers/include/thrust/system/cuda/detail/set_operations.h +1734 -0
  1550. cuda/cccl/headers/include/thrust/system/cuda/detail/sort.h +469 -0
  1551. cuda/cccl/headers/include/thrust/system/cuda/detail/swap_ranges.h +98 -0
  1552. cuda/cccl/headers/include/thrust/system/cuda/detail/tabulate.h +61 -0
  1553. cuda/cccl/headers/include/thrust/system/cuda/detail/temporary_buffer.h +132 -0
  1554. cuda/cccl/headers/include/thrust/system/cuda/detail/terminate.h +53 -0
  1555. cuda/cccl/headers/include/thrust/system/cuda/detail/transform.h +429 -0
  1556. cuda/cccl/headers/include/thrust/system/cuda/detail/transform_reduce.h +143 -0
  1557. cuda/cccl/headers/include/thrust/system/cuda/detail/transform_scan.h +119 -0
  1558. cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_copy.h +117 -0
  1559. cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_fill.h +105 -0
  1560. cuda/cccl/headers/include/thrust/system/cuda/detail/unique.h +289 -0
  1561. cuda/cccl/headers/include/thrust/system/cuda/detail/unique_by_key.h +310 -0
  1562. cuda/cccl/headers/include/thrust/system/cuda/detail/util.h +253 -0
  1563. cuda/cccl/headers/include/thrust/system/cuda/error.h +168 -0
  1564. cuda/cccl/headers/include/thrust/system/cuda/execution_policy.h +15 -0
  1565. cuda/cccl/headers/include/thrust/system/cuda/memory.h +122 -0
  1566. cuda/cccl/headers/include/thrust/system/cuda/memory_resource.h +122 -0
  1567. cuda/cccl/headers/include/thrust/system/cuda/pointer.h +160 -0
  1568. cuda/cccl/headers/include/thrust/system/cuda/vector.h +108 -0
  1569. cuda/cccl/headers/include/thrust/system/detail/adl/adjacent_difference.h +51 -0
  1570. cuda/cccl/headers/include/thrust/system/detail/adl/assign_value.h +51 -0
  1571. cuda/cccl/headers/include/thrust/system/detail/adl/binary_search.h +51 -0
  1572. cuda/cccl/headers/include/thrust/system/detail/adl/copy.h +51 -0
  1573. cuda/cccl/headers/include/thrust/system/detail/adl/copy_if.h +52 -0
  1574. cuda/cccl/headers/include/thrust/system/detail/adl/count.h +51 -0
  1575. cuda/cccl/headers/include/thrust/system/detail/adl/equal.h +51 -0
  1576. cuda/cccl/headers/include/thrust/system/detail/adl/extrema.h +51 -0
  1577. cuda/cccl/headers/include/thrust/system/detail/adl/fill.h +51 -0
  1578. cuda/cccl/headers/include/thrust/system/detail/adl/find.h +51 -0
  1579. cuda/cccl/headers/include/thrust/system/detail/adl/for_each.h +51 -0
  1580. cuda/cccl/headers/include/thrust/system/detail/adl/gather.h +51 -0
  1581. cuda/cccl/headers/include/thrust/system/detail/adl/generate.h +51 -0
  1582. cuda/cccl/headers/include/thrust/system/detail/adl/get_value.h +51 -0
  1583. cuda/cccl/headers/include/thrust/system/detail/adl/inner_product.h +51 -0
  1584. cuda/cccl/headers/include/thrust/system/detail/adl/iter_swap.h +51 -0
  1585. cuda/cccl/headers/include/thrust/system/detail/adl/logical.h +51 -0
  1586. cuda/cccl/headers/include/thrust/system/detail/adl/malloc_and_free.h +51 -0
  1587. cuda/cccl/headers/include/thrust/system/detail/adl/merge.h +51 -0
  1588. cuda/cccl/headers/include/thrust/system/detail/adl/mismatch.h +51 -0
  1589. cuda/cccl/headers/include/thrust/system/detail/adl/partition.h +51 -0
  1590. cuda/cccl/headers/include/thrust/system/detail/adl/per_device_resource.h +51 -0
  1591. cuda/cccl/headers/include/thrust/system/detail/adl/reduce.h +51 -0
  1592. cuda/cccl/headers/include/thrust/system/detail/adl/reduce_by_key.h +51 -0
  1593. cuda/cccl/headers/include/thrust/system/detail/adl/remove.h +51 -0
  1594. cuda/cccl/headers/include/thrust/system/detail/adl/replace.h +51 -0
  1595. cuda/cccl/headers/include/thrust/system/detail/adl/reverse.h +51 -0
  1596. cuda/cccl/headers/include/thrust/system/detail/adl/scan.h +51 -0
  1597. cuda/cccl/headers/include/thrust/system/detail/adl/scan_by_key.h +51 -0
  1598. cuda/cccl/headers/include/thrust/system/detail/adl/scatter.h +51 -0
  1599. cuda/cccl/headers/include/thrust/system/detail/adl/sequence.h +51 -0
  1600. cuda/cccl/headers/include/thrust/system/detail/adl/set_operations.h +51 -0
  1601. cuda/cccl/headers/include/thrust/system/detail/adl/sort.h +51 -0
  1602. cuda/cccl/headers/include/thrust/system/detail/adl/swap_ranges.h +51 -0
  1603. cuda/cccl/headers/include/thrust/system/detail/adl/tabulate.h +51 -0
  1604. cuda/cccl/headers/include/thrust/system/detail/adl/temporary_buffer.h +51 -0
  1605. cuda/cccl/headers/include/thrust/system/detail/adl/transform.h +51 -0
  1606. cuda/cccl/headers/include/thrust/system/detail/adl/transform_reduce.h +51 -0
  1607. cuda/cccl/headers/include/thrust/system/detail/adl/transform_scan.h +51 -0
  1608. cuda/cccl/headers/include/thrust/system/detail/adl/uninitialized_copy.h +51 -0
  1609. cuda/cccl/headers/include/thrust/system/detail/adl/uninitialized_fill.h +51 -0
  1610. cuda/cccl/headers/include/thrust/system/detail/adl/unique.h +51 -0
  1611. cuda/cccl/headers/include/thrust/system/detail/adl/unique_by_key.h +51 -0
  1612. cuda/cccl/headers/include/thrust/system/detail/bad_alloc.h +61 -0
  1613. cuda/cccl/headers/include/thrust/system/detail/errno.h +120 -0
  1614. cuda/cccl/headers/include/thrust/system/detail/error_category.inl +302 -0
  1615. cuda/cccl/headers/include/thrust/system/detail/error_code.inl +173 -0
  1616. cuda/cccl/headers/include/thrust/system/detail/error_condition.inl +121 -0
  1617. cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.h +53 -0
  1618. cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.inl +79 -0
  1619. cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.h +161 -0
  1620. cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.inl +384 -0
  1621. cuda/cccl/headers/include/thrust/system/detail/generic/copy.h +45 -0
  1622. cuda/cccl/headers/include/thrust/system/detail/generic/copy.inl +64 -0
  1623. cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.h +58 -0
  1624. cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.inl +146 -0
  1625. cuda/cccl/headers/include/thrust/system/detail/generic/count.h +48 -0
  1626. cuda/cccl/headers/include/thrust/system/detail/generic/count.inl +84 -0
  1627. cuda/cccl/headers/include/thrust/system/detail/generic/equal.h +49 -0
  1628. cuda/cccl/headers/include/thrust/system/detail/generic/equal.inl +60 -0
  1629. cuda/cccl/headers/include/thrust/system/detail/generic/extrema.h +66 -0
  1630. cuda/cccl/headers/include/thrust/system/detail/generic/extrema.inl +252 -0
  1631. cuda/cccl/headers/include/thrust/system/detail/generic/fill.h +54 -0
  1632. cuda/cccl/headers/include/thrust/system/detail/generic/find.h +49 -0
  1633. cuda/cccl/headers/include/thrust/system/detail/generic/find.inl +137 -0
  1634. cuda/cccl/headers/include/thrust/system/detail/generic/for_each.h +58 -0
  1635. cuda/cccl/headers/include/thrust/system/detail/generic/gather.h +73 -0
  1636. cuda/cccl/headers/include/thrust/system/detail/generic/gather.inl +96 -0
  1637. cuda/cccl/headers/include/thrust/system/detail/generic/generate.h +45 -0
  1638. cuda/cccl/headers/include/thrust/system/detail/generic/generate.inl +63 -0
  1639. cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.h +60 -0
  1640. cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.inl +72 -0
  1641. cuda/cccl/headers/include/thrust/system/detail/generic/logical.h +59 -0
  1642. cuda/cccl/headers/include/thrust/system/detail/generic/memory.h +64 -0
  1643. cuda/cccl/headers/include/thrust/system/detail/generic/memory.inl +86 -0
  1644. cuda/cccl/headers/include/thrust/system/detail/generic/merge.h +99 -0
  1645. cuda/cccl/headers/include/thrust/system/detail/generic/merge.inl +148 -0
  1646. cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.h +49 -0
  1647. cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.inl +68 -0
  1648. cuda/cccl/headers/include/thrust/system/detail/generic/partition.h +129 -0
  1649. cuda/cccl/headers/include/thrust/system/detail/generic/partition.inl +207 -0
  1650. cuda/cccl/headers/include/thrust/system/detail/generic/per_device_resource.h +43 -0
  1651. cuda/cccl/headers/include/thrust/system/detail/generic/reduce.h +71 -0
  1652. cuda/cccl/headers/include/thrust/system/detail/generic/reduce.inl +100 -0
  1653. cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.h +83 -0
  1654. cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.inl +186 -0
  1655. cuda/cccl/headers/include/thrust/system/detail/generic/remove.h +86 -0
  1656. cuda/cccl/headers/include/thrust/system/detail/generic/remove.inl +121 -0
  1657. cuda/cccl/headers/include/thrust/system/detail/generic/replace.h +95 -0
  1658. cuda/cccl/headers/include/thrust/system/detail/generic/replace.inl +175 -0
  1659. cuda/cccl/headers/include/thrust/system/detail/generic/reverse.h +48 -0
  1660. cuda/cccl/headers/include/thrust/system/detail/generic/reverse.inl +67 -0
  1661. cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.h +63 -0
  1662. cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.inl +126 -0
  1663. cuda/cccl/headers/include/thrust/system/detail/generic/scan.h +72 -0
  1664. cuda/cccl/headers/include/thrust/system/detail/generic/scan.inl +85 -0
  1665. cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.h +126 -0
  1666. cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.inl +232 -0
  1667. cuda/cccl/headers/include/thrust/system/detail/generic/scatter.h +73 -0
  1668. cuda/cccl/headers/include/thrust/system/detail/generic/scatter.inl +85 -0
  1669. cuda/cccl/headers/include/thrust/system/detail/generic/select_system.h +104 -0
  1670. cuda/cccl/headers/include/thrust/system/detail/generic/sequence.h +70 -0
  1671. cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.h +282 -0
  1672. cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.inl +476 -0
  1673. cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.h +54 -0
  1674. cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.inl +125 -0
  1675. cuda/cccl/headers/include/thrust/system/detail/generic/sort.h +113 -0
  1676. cuda/cccl/headers/include/thrust/system/detail/generic/sort.inl +175 -0
  1677. cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.h +44 -0
  1678. cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.inl +76 -0
  1679. cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.h +41 -0
  1680. cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.inl +54 -0
  1681. cuda/cccl/headers/include/thrust/system/detail/generic/tag.h +47 -0
  1682. cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.h +54 -0
  1683. cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.inl +82 -0
  1684. cuda/cccl/headers/include/thrust/system/detail/generic/transform.h +395 -0
  1685. cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.h +50 -0
  1686. cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.inl +56 -0
  1687. cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.h +80 -0
  1688. cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.inl +113 -0
  1689. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.h +45 -0
  1690. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.inl +166 -0
  1691. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.h +45 -0
  1692. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.inl +115 -0
  1693. cuda/cccl/headers/include/thrust/system/detail/generic/unique.h +71 -0
  1694. cuda/cccl/headers/include/thrust/system/detail/generic/unique.inl +113 -0
  1695. cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.h +81 -0
  1696. cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.inl +126 -0
  1697. cuda/cccl/headers/include/thrust/system/detail/internal/decompose.h +117 -0
  1698. cuda/cccl/headers/include/thrust/system/detail/sequential/adjacent_difference.h +70 -0
  1699. cuda/cccl/headers/include/thrust/system/detail/sequential/assign_value.h +42 -0
  1700. cuda/cccl/headers/include/thrust/system/detail/sequential/binary_search.h +136 -0
  1701. cuda/cccl/headers/include/thrust/system/detail/sequential/copy.h +49 -0
  1702. cuda/cccl/headers/include/thrust/system/detail/sequential/copy.inl +119 -0
  1703. cuda/cccl/headers/include/thrust/system/detail/sequential/copy_backward.h +49 -0
  1704. cuda/cccl/headers/include/thrust/system/detail/sequential/copy_if.h +71 -0
  1705. cuda/cccl/headers/include/thrust/system/detail/sequential/count.h +29 -0
  1706. cuda/cccl/headers/include/thrust/system/detail/sequential/equal.h +29 -0
  1707. cuda/cccl/headers/include/thrust/system/detail/sequential/execution_policy.h +52 -0
  1708. cuda/cccl/headers/include/thrust/system/detail/sequential/extrema.h +110 -0
  1709. cuda/cccl/headers/include/thrust/system/detail/sequential/fill.h +29 -0
  1710. cuda/cccl/headers/include/thrust/system/detail/sequential/find.h +62 -0
  1711. cuda/cccl/headers/include/thrust/system/detail/sequential/for_each.h +74 -0
  1712. cuda/cccl/headers/include/thrust/system/detail/sequential/gather.h +29 -0
  1713. cuda/cccl/headers/include/thrust/system/detail/sequential/general_copy.h +123 -0
  1714. cuda/cccl/headers/include/thrust/system/detail/sequential/generate.h +29 -0
  1715. cuda/cccl/headers/include/thrust/system/detail/sequential/get_value.h +43 -0
  1716. cuda/cccl/headers/include/thrust/system/detail/sequential/inner_product.h +29 -0
  1717. cuda/cccl/headers/include/thrust/system/detail/sequential/insertion_sort.h +141 -0
  1718. cuda/cccl/headers/include/thrust/system/detail/sequential/iter_swap.h +45 -0
  1719. cuda/cccl/headers/include/thrust/system/detail/sequential/logical.h +29 -0
  1720. cuda/cccl/headers/include/thrust/system/detail/sequential/malloc_and_free.h +50 -0
  1721. cuda/cccl/headers/include/thrust/system/detail/sequential/merge.h +75 -0
  1722. cuda/cccl/headers/include/thrust/system/detail/sequential/merge.inl +145 -0
  1723. cuda/cccl/headers/include/thrust/system/detail/sequential/mismatch.h +29 -0
  1724. cuda/cccl/headers/include/thrust/system/detail/sequential/partition.h +301 -0
  1725. cuda/cccl/headers/include/thrust/system/detail/sequential/per_device_resource.h +29 -0
  1726. cuda/cccl/headers/include/thrust/system/detail/sequential/reduce.h +64 -0
  1727. cuda/cccl/headers/include/thrust/system/detail/sequential/reduce_by_key.h +98 -0
  1728. cuda/cccl/headers/include/thrust/system/detail/sequential/remove.h +179 -0
  1729. cuda/cccl/headers/include/thrust/system/detail/sequential/replace.h +29 -0
  1730. cuda/cccl/headers/include/thrust/system/detail/sequential/reverse.h +29 -0
  1731. cuda/cccl/headers/include/thrust/system/detail/sequential/scan.h +154 -0
  1732. cuda/cccl/headers/include/thrust/system/detail/sequential/scan_by_key.h +145 -0
  1733. cuda/cccl/headers/include/thrust/system/detail/sequential/scatter.h +29 -0
  1734. cuda/cccl/headers/include/thrust/system/detail/sequential/sequence.h +29 -0
  1735. cuda/cccl/headers/include/thrust/system/detail/sequential/set_operations.h +206 -0
  1736. cuda/cccl/headers/include/thrust/system/detail/sequential/sort.h +59 -0
  1737. cuda/cccl/headers/include/thrust/system/detail/sequential/sort.inl +116 -0
  1738. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.h +55 -0
  1739. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.inl +356 -0
  1740. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.h +48 -0
  1741. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.inl +124 -0
  1742. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.h +48 -0
  1743. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.inl +586 -0
  1744. cuda/cccl/headers/include/thrust/system/detail/sequential/swap_ranges.h +29 -0
  1745. cuda/cccl/headers/include/thrust/system/detail/sequential/tabulate.h +29 -0
  1746. cuda/cccl/headers/include/thrust/system/detail/sequential/temporary_buffer.h +29 -0
  1747. cuda/cccl/headers/include/thrust/system/detail/sequential/transform.h +29 -0
  1748. cuda/cccl/headers/include/thrust/system/detail/sequential/transform_reduce.h +29 -0
  1749. cuda/cccl/headers/include/thrust/system/detail/sequential/transform_scan.h +29 -0
  1750. cuda/cccl/headers/include/thrust/system/detail/sequential/trivial_copy.h +58 -0
  1751. cuda/cccl/headers/include/thrust/system/detail/sequential/uninitialized_copy.h +29 -0
  1752. cuda/cccl/headers/include/thrust/system/detail/sequential/uninitialized_fill.h +29 -0
  1753. cuda/cccl/headers/include/thrust/system/detail/sequential/unique.h +115 -0
  1754. cuda/cccl/headers/include/thrust/system/detail/sequential/unique_by_key.h +106 -0
  1755. cuda/cccl/headers/include/thrust/system/detail/system_error.inl +108 -0
  1756. cuda/cccl/headers/include/thrust/system/error_code.h +512 -0
  1757. cuda/cccl/headers/include/thrust/system/omp/detail/adjacent_difference.h +54 -0
  1758. cuda/cccl/headers/include/thrust/system/omp/detail/assign_value.h +30 -0
  1759. cuda/cccl/headers/include/thrust/system/omp/detail/binary_search.h +77 -0
  1760. cuda/cccl/headers/include/thrust/system/omp/detail/copy.h +50 -0
  1761. cuda/cccl/headers/include/thrust/system/omp/detail/copy.inl +74 -0
  1762. cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.h +56 -0
  1763. cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.inl +59 -0
  1764. cuda/cccl/headers/include/thrust/system/omp/detail/count.h +30 -0
  1765. cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.h +50 -0
  1766. cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.inl +65 -0
  1767. cuda/cccl/headers/include/thrust/system/omp/detail/equal.h +30 -0
  1768. cuda/cccl/headers/include/thrust/system/omp/detail/execution_policy.h +127 -0
  1769. cuda/cccl/headers/include/thrust/system/omp/detail/extrema.h +66 -0
  1770. cuda/cccl/headers/include/thrust/system/omp/detail/fill.h +30 -0
  1771. cuda/cccl/headers/include/thrust/system/omp/detail/find.h +53 -0
  1772. cuda/cccl/headers/include/thrust/system/omp/detail/for_each.h +56 -0
  1773. cuda/cccl/headers/include/thrust/system/omp/detail/for_each.inl +87 -0
  1774. cuda/cccl/headers/include/thrust/system/omp/detail/gather.h +30 -0
  1775. cuda/cccl/headers/include/thrust/system/omp/detail/generate.h +30 -0
  1776. cuda/cccl/headers/include/thrust/system/omp/detail/get_value.h +30 -0
  1777. cuda/cccl/headers/include/thrust/system/omp/detail/inner_product.h +30 -0
  1778. cuda/cccl/headers/include/thrust/system/omp/detail/iter_swap.h +30 -0
  1779. cuda/cccl/headers/include/thrust/system/omp/detail/logical.h +30 -0
  1780. cuda/cccl/headers/include/thrust/system/omp/detail/malloc_and_free.h +30 -0
  1781. cuda/cccl/headers/include/thrust/system/omp/detail/memory.inl +93 -0
  1782. cuda/cccl/headers/include/thrust/system/omp/detail/merge.h +30 -0
  1783. cuda/cccl/headers/include/thrust/system/omp/detail/mismatch.h +30 -0
  1784. cuda/cccl/headers/include/thrust/system/omp/detail/partition.h +88 -0
  1785. cuda/cccl/headers/include/thrust/system/omp/detail/partition.inl +102 -0
  1786. cuda/cccl/headers/include/thrust/system/omp/detail/per_device_resource.h +29 -0
  1787. cuda/cccl/headers/include/thrust/system/omp/detail/pragma_omp.h +54 -0
  1788. cuda/cccl/headers/include/thrust/system/omp/detail/reduce.h +54 -0
  1789. cuda/cccl/headers/include/thrust/system/omp/detail/reduce.inl +78 -0
  1790. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.h +64 -0
  1791. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.inl +65 -0
  1792. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.h +59 -0
  1793. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.inl +103 -0
  1794. cuda/cccl/headers/include/thrust/system/omp/detail/remove.h +72 -0
  1795. cuda/cccl/headers/include/thrust/system/omp/detail/remove.inl +87 -0
  1796. cuda/cccl/headers/include/thrust/system/omp/detail/replace.h +30 -0
  1797. cuda/cccl/headers/include/thrust/system/omp/detail/reverse.h +30 -0
  1798. cuda/cccl/headers/include/thrust/system/omp/detail/scan.h +73 -0
  1799. cuda/cccl/headers/include/thrust/system/omp/detail/scan.inl +172 -0
  1800. cuda/cccl/headers/include/thrust/system/omp/detail/scan_by_key.h +36 -0
  1801. cuda/cccl/headers/include/thrust/system/omp/detail/scatter.h +30 -0
  1802. cuda/cccl/headers/include/thrust/system/omp/detail/sequence.h +30 -0
  1803. cuda/cccl/headers/include/thrust/system/omp/detail/set_operations.h +30 -0
  1804. cuda/cccl/headers/include/thrust/system/omp/detail/sort.h +60 -0
  1805. cuda/cccl/headers/include/thrust/system/omp/detail/sort.inl +265 -0
  1806. cuda/cccl/headers/include/thrust/system/omp/detail/swap_ranges.h +30 -0
  1807. cuda/cccl/headers/include/thrust/system/omp/detail/tabulate.h +30 -0
  1808. cuda/cccl/headers/include/thrust/system/omp/detail/temporary_buffer.h +29 -0
  1809. cuda/cccl/headers/include/thrust/system/omp/detail/transform.h +30 -0
  1810. cuda/cccl/headers/include/thrust/system/omp/detail/transform_reduce.h +30 -0
  1811. cuda/cccl/headers/include/thrust/system/omp/detail/transform_scan.h +30 -0
  1812. cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_copy.h +30 -0
  1813. cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_fill.h +30 -0
  1814. cuda/cccl/headers/include/thrust/system/omp/detail/unique.h +60 -0
  1815. cuda/cccl/headers/include/thrust/system/omp/detail/unique.inl +71 -0
  1816. cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.h +67 -0
  1817. cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.inl +75 -0
  1818. cuda/cccl/headers/include/thrust/system/omp/execution_policy.h +62 -0
  1819. cuda/cccl/headers/include/thrust/system/omp/memory.h +111 -0
  1820. cuda/cccl/headers/include/thrust/system/omp/memory_resource.h +75 -0
  1821. cuda/cccl/headers/include/thrust/system/omp/pointer.h +124 -0
  1822. cuda/cccl/headers/include/thrust/system/omp/vector.h +99 -0
  1823. cuda/cccl/headers/include/thrust/system/system_error.h +185 -0
  1824. cuda/cccl/headers/include/thrust/system/tbb/detail/adjacent_difference.h +54 -0
  1825. cuda/cccl/headers/include/thrust/system/tbb/detail/assign_value.h +30 -0
  1826. cuda/cccl/headers/include/thrust/system/tbb/detail/binary_search.h +30 -0
  1827. cuda/cccl/headers/include/thrust/system/tbb/detail/copy.h +50 -0
  1828. cuda/cccl/headers/include/thrust/system/tbb/detail/copy.inl +73 -0
  1829. cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.h +47 -0
  1830. cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.inl +136 -0
  1831. cuda/cccl/headers/include/thrust/system/tbb/detail/count.h +30 -0
  1832. cuda/cccl/headers/include/thrust/system/tbb/detail/equal.h +30 -0
  1833. cuda/cccl/headers/include/thrust/system/tbb/detail/execution_policy.h +109 -0
  1834. cuda/cccl/headers/include/thrust/system/tbb/detail/extrema.h +66 -0
  1835. cuda/cccl/headers/include/thrust/system/tbb/detail/fill.h +30 -0
  1836. cuda/cccl/headers/include/thrust/system/tbb/detail/find.h +49 -0
  1837. cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.h +51 -0
  1838. cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.inl +91 -0
  1839. cuda/cccl/headers/include/thrust/system/tbb/detail/gather.h +30 -0
  1840. cuda/cccl/headers/include/thrust/system/tbb/detail/generate.h +30 -0
  1841. cuda/cccl/headers/include/thrust/system/tbb/detail/get_value.h +30 -0
  1842. cuda/cccl/headers/include/thrust/system/tbb/detail/inner_product.h +30 -0
  1843. cuda/cccl/headers/include/thrust/system/tbb/detail/iter_swap.h +30 -0
  1844. cuda/cccl/headers/include/thrust/system/tbb/detail/logical.h +30 -0
  1845. cuda/cccl/headers/include/thrust/system/tbb/detail/malloc_and_free.h +30 -0
  1846. cuda/cccl/headers/include/thrust/system/tbb/detail/memory.inl +94 -0
  1847. cuda/cccl/headers/include/thrust/system/tbb/detail/merge.h +77 -0
  1848. cuda/cccl/headers/include/thrust/system/tbb/detail/merge.inl +327 -0
  1849. cuda/cccl/headers/include/thrust/system/tbb/detail/mismatch.h +30 -0
  1850. cuda/cccl/headers/include/thrust/system/tbb/detail/partition.h +84 -0
  1851. cuda/cccl/headers/include/thrust/system/tbb/detail/partition.inl +98 -0
  1852. cuda/cccl/headers/include/thrust/system/tbb/detail/per_device_resource.h +29 -0
  1853. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.h +54 -0
  1854. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.inl +137 -0
  1855. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.h +61 -0
  1856. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.inl +400 -0
  1857. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_intervals.h +140 -0
  1858. cuda/cccl/headers/include/thrust/system/tbb/detail/remove.h +76 -0
  1859. cuda/cccl/headers/include/thrust/system/tbb/detail/remove.inl +87 -0
  1860. cuda/cccl/headers/include/thrust/system/tbb/detail/replace.h +30 -0
  1861. cuda/cccl/headers/include/thrust/system/tbb/detail/reverse.h +30 -0
  1862. cuda/cccl/headers/include/thrust/system/tbb/detail/scan.h +59 -0
  1863. cuda/cccl/headers/include/thrust/system/tbb/detail/scan.inl +312 -0
  1864. cuda/cccl/headers/include/thrust/system/tbb/detail/scan_by_key.h +33 -0
  1865. cuda/cccl/headers/include/thrust/system/tbb/detail/scatter.h +30 -0
  1866. cuda/cccl/headers/include/thrust/system/tbb/detail/sequence.h +30 -0
  1867. cuda/cccl/headers/include/thrust/system/tbb/detail/set_operations.h +30 -0
  1868. cuda/cccl/headers/include/thrust/system/tbb/detail/sort.h +60 -0
  1869. cuda/cccl/headers/include/thrust/system/tbb/detail/sort.inl +295 -0
  1870. cuda/cccl/headers/include/thrust/system/tbb/detail/swap_ranges.h +30 -0
  1871. cuda/cccl/headers/include/thrust/system/tbb/detail/tabulate.h +30 -0
  1872. cuda/cccl/headers/include/thrust/system/tbb/detail/temporary_buffer.h +29 -0
  1873. cuda/cccl/headers/include/thrust/system/tbb/detail/transform.h +30 -0
  1874. cuda/cccl/headers/include/thrust/system/tbb/detail/transform_reduce.h +30 -0
  1875. cuda/cccl/headers/include/thrust/system/tbb/detail/transform_scan.h +30 -0
  1876. cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_copy.h +30 -0
  1877. cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_fill.h +30 -0
  1878. cuda/cccl/headers/include/thrust/system/tbb/detail/unique.h +60 -0
  1879. cuda/cccl/headers/include/thrust/system/tbb/detail/unique.inl +71 -0
  1880. cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.h +67 -0
  1881. cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.inl +75 -0
  1882. cuda/cccl/headers/include/thrust/system/tbb/execution_policy.h +62 -0
  1883. cuda/cccl/headers/include/thrust/system/tbb/memory.h +111 -0
  1884. cuda/cccl/headers/include/thrust/system/tbb/memory_resource.h +75 -0
  1885. cuda/cccl/headers/include/thrust/system/tbb/pointer.h +124 -0
  1886. cuda/cccl/headers/include/thrust/system/tbb/vector.h +99 -0
  1887. cuda/cccl/headers/include/thrust/system_error.h +57 -0
  1888. cuda/cccl/headers/include/thrust/tabulate.h +125 -0
  1889. cuda/cccl/headers/include/thrust/transform.h +1045 -0
  1890. cuda/cccl/headers/include/thrust/transform_reduce.h +190 -0
  1891. cuda/cccl/headers/include/thrust/transform_scan.h +442 -0
  1892. cuda/cccl/headers/include/thrust/tuple.h +139 -0
  1893. cuda/cccl/headers/include/thrust/type_traits/integer_sequence.h +261 -0
  1894. cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +154 -0
  1895. cuda/cccl/headers/include/thrust/type_traits/is_execution_policy.h +65 -0
  1896. cuda/cccl/headers/include/thrust/type_traits/is_operator_less_or_greater_function_object.h +184 -0
  1897. cuda/cccl/headers/include/thrust/type_traits/is_operator_plus_function_object.h +116 -0
  1898. cuda/cccl/headers/include/thrust/type_traits/is_trivially_relocatable.h +336 -0
  1899. cuda/cccl/headers/include/thrust/type_traits/logical_metafunctions.h +42 -0
  1900. cuda/cccl/headers/include/thrust/type_traits/unwrap_contiguous_iterator.h +63 -0
  1901. cuda/cccl/headers/include/thrust/uninitialized_copy.h +300 -0
  1902. cuda/cccl/headers/include/thrust/uninitialized_fill.h +268 -0
  1903. cuda/cccl/headers/include/thrust/unique.h +1088 -0
  1904. cuda/cccl/headers/include/thrust/universal_allocator.h +93 -0
  1905. cuda/cccl/headers/include/thrust/universal_ptr.h +34 -0
  1906. cuda/cccl/headers/include/thrust/universal_vector.h +71 -0
  1907. cuda/cccl/headers/include/thrust/version.h +93 -0
  1908. cuda/cccl/headers/include/thrust/zip_function.h +176 -0
  1909. cuda/cccl/headers/include_paths.py +51 -0
  1910. cuda/cccl/parallel/__init__.py +9 -0
  1911. cuda/cccl/parallel/experimental/__init__.py +24 -0
  1912. cuda/cccl/py.typed +0 -0
  1913. cuda/compute/__init__.py +79 -0
  1914. cuda/compute/_bindings.py +79 -0
  1915. cuda/compute/_bindings.pyi +475 -0
  1916. cuda/compute/_bindings_impl.pyx +2273 -0
  1917. cuda/compute/_caching.py +71 -0
  1918. cuda/compute/_cccl_interop.py +422 -0
  1919. cuda/compute/_utils/__init__.py +0 -0
  1920. cuda/compute/_utils/protocols.py +132 -0
  1921. cuda/compute/_utils/temp_storage_buffer.py +86 -0
  1922. cuda/compute/algorithms/__init__.py +54 -0
  1923. cuda/compute/algorithms/_histogram.py +243 -0
  1924. cuda/compute/algorithms/_merge_sort.py +225 -0
  1925. cuda/compute/algorithms/_radix_sort.py +312 -0
  1926. cuda/compute/algorithms/_reduce.py +182 -0
  1927. cuda/compute/algorithms/_scan.py +331 -0
  1928. cuda/compute/algorithms/_segmented_reduce.py +257 -0
  1929. cuda/compute/algorithms/_three_way_partition.py +261 -0
  1930. cuda/compute/algorithms/_transform.py +329 -0
  1931. cuda/compute/algorithms/_unique_by_key.py +252 -0
  1932. cuda/compute/cccl/.gitkeep +0 -0
  1933. cuda/compute/cu12/_bindings_impl.cp313-win_amd64.pyd +0 -0
  1934. cuda/compute/cu12/cccl/cccl.c.parallel.dll +0 -0
  1935. cuda/compute/cu12/cccl/cccl.c.parallel.lib +0 -0
  1936. cuda/compute/cu13/_bindings_impl.cp313-win_amd64.pyd +0 -0
  1937. cuda/compute/cu13/cccl/cccl.c.parallel.dll +0 -0
  1938. cuda/compute/cu13/cccl/cccl.c.parallel.lib +0 -0
  1939. cuda/compute/iterators/__init__.py +21 -0
  1940. cuda/compute/iterators/_factories.py +219 -0
  1941. cuda/compute/iterators/_iterators.py +817 -0
  1942. cuda/compute/iterators/_zip_iterator.py +199 -0
  1943. cuda/compute/numba_utils.py +53 -0
  1944. cuda/compute/op.py +3 -0
  1945. cuda/compute/struct.py +272 -0
  1946. cuda/compute/typing.py +37 -0
  1947. cuda/coop/__init__.py +8 -0
  1948. cuda/coop/_caching.py +48 -0
  1949. cuda/coop/_common.py +275 -0
  1950. cuda/coop/_nvrtc.py +92 -0
  1951. cuda/coop/_scan_op.py +181 -0
  1952. cuda/coop/_types.py +937 -0
  1953. cuda/coop/_typing.py +107 -0
  1954. cuda/coop/block/__init__.py +39 -0
  1955. cuda/coop/block/_block_exchange.py +251 -0
  1956. cuda/coop/block/_block_load_store.py +215 -0
  1957. cuda/coop/block/_block_merge_sort.py +125 -0
  1958. cuda/coop/block/_block_radix_sort.py +214 -0
  1959. cuda/coop/block/_block_reduce.py +294 -0
  1960. cuda/coop/block/_block_scan.py +983 -0
  1961. cuda/coop/warp/__init__.py +9 -0
  1962. cuda/coop/warp/_warp_merge_sort.py +92 -0
  1963. cuda/coop/warp/_warp_reduce.py +153 -0
  1964. cuda/coop/warp/_warp_scan.py +78 -0
  1965. cuda_cccl-0.3.3.dist-info/METADATA +41 -0
  1966. cuda_cccl-0.3.3.dist-info/RECORD +1968 -0
  1967. cuda_cccl-0.3.3.dist-info/WHEEL +5 -0
  1968. cuda_cccl-0.3.3.dist-info/licenses/LICENSE +1 -0
@@ -0,0 +1,2196 @@
1
+ /******************************************************************************
2
+ * Copyright (c) 2011, Duane Merrill. All rights reserved.
3
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ * * Redistributions of source code must retain the above copyright
8
+ * notice, this list of conditions and the following disclaimer.
9
+ * * Redistributions in binary form must reproduce the above copyright
10
+ * notice, this list of conditions and the following disclaimer in the
11
+ * documentation and/or other materials provided with the distribution.
12
+ * * Neither the name of the NVIDIA CORPORATION nor the
13
+ * names of its contributors may be used to endorse or promote products
14
+ * derived from this software without specific prior written permission.
15
+ *
16
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
+ *
27
+ ******************************************************************************/
28
+
29
+ /**
30
+ * @file
31
+ * The cub::BlockRadixSort class provides [<em>collective</em>](../index.html#sec0) methods for radix
32
+ * sorting of items partitioned across a CUDA thread block.
33
+ */
34
+
35
+ #pragma once
36
+
37
+ #include <cub/config.cuh>
38
+
39
+ #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
40
+ # pragma GCC system_header
41
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
42
+ # pragma clang system_header
43
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
44
+ # pragma system_header
45
+ #endif // no system header
46
+
47
+ #include <cub/block/block_exchange.cuh>
48
+ #include <cub/block/block_radix_rank.cuh>
49
+ #include <cub/block/radix_rank_sort_operations.cuh>
50
+ #include <cub/util_ptx.cuh>
51
+ #include <cub/util_type.cuh>
52
+
53
+ #include <cuda/std/__algorithm/min.h>
54
+ #include <cuda/std/__type_traits/enable_if.h>
55
+ #include <cuda/std/__type_traits/integral_constant.h>
56
+ #include <cuda/std/__type_traits/is_convertible.h>
57
+ #include <cuda/std/__type_traits/is_same.h>
58
+
59
+ CUB_NAMESPACE_BEGIN
60
+
61
+ //! @rst
62
+ //! BlockRadixSort class provides :ref:`collective <collective-primitives>` methods for sorting
63
+ //! items partitioned across a CUDA thread block using a radix sorting method.
64
+ //!
65
+ //! .. image:: ../../img/sorting_logo.png
66
+ //! :align: center
67
+ //!
68
+ //! Overview
69
+ //! --------------------------------------------------
70
+ //!
71
+ //! The `radix sorting method <http://en.wikipedia.org/wiki/Radix_sort>`_ arranges
72
+ //! items into ascending order. It relies upon a positional representation for
73
+ //! keys, i.e., each key is comprised of an ordered sequence of symbols (e.g., digits,
74
+ //! characters, etc.) specified from least-significant to most-significant. For a
75
+ //! given input sequence of keys and a set of rules specifying a total ordering
76
+ //! of the symbolic alphabet, the radix sorting method produces a lexicographic
77
+ //! ordering of those keys.
78
+ //!
79
+ //! @rowmajor
80
+ //!
81
+ //! Supported Types
82
+ //! --------------------------------------------------
83
+ //!
84
+ //! BlockRadixSort can sort all of the built-in C++ numeric primitive types
85
+ //! (``unsigned char``, ``int``, ``double``, etc.) as well as CUDA's ``__half``
86
+ //! half-precision floating-point type. User-defined types are supported as long
87
+ //! as a decomposer object is provided.
88
+ //!
89
+ //! Floating-Point Special Cases
90
+ //! --------------------------------------------------
91
+ //!
92
+ //! - Positive and negative zeros are considered equivalent, and will be treated
93
+ //! as such in the output.
94
+ //! - No special handling is implemented for NaN values; these are sorted
95
+ //! according to their bit representations after any transformations.
96
+ //!
97
+ //! Bitwise Key Transformations
98
+ //! --------------------------------------------------
99
+ //!
100
+ //! Although the direct radix sorting method can only be applied to unsigned
101
+ //! integral types, BlockRadixSort is able to sort signed and floating-point
102
+ //! types via simple bit-wise transformations that ensure lexicographic key
103
+ //! ordering.
104
+ //!
105
+ //! These transformations must be considered when restricting the
106
+ //! ``[begin_bit, end_bit)`` range, as the bitwise transformations will occur
107
+ //! before the bit-range truncation.
108
+ //!
109
+ //! Any transformations applied to the keys prior to sorting are reversed
110
+ //! while writing to the final output buffer.
111
+ //!
112
+ //! Type Specific Bitwise Transformations
113
+ //! --------------------------------------------------
114
+ //!
115
+ //! To convert the input values into a radix-sortable bitwise representation,
116
+ //! the following transformations take place prior to sorting:
117
+ //!
118
+ //! * For unsigned integral values, the keys are used directly.
119
+ //! * For signed integral values, the sign bit is inverted.
120
+ //! * For positive floating point values, the sign bit is inverted.
121
+ //! * For negative floating point values, the full key is inverted.
122
+ //!
123
+ //! No Descending Sort Transformations
124
+ //! --------------------------------------------------
125
+ //!
126
+ //! Unlike ``DeviceRadixSort``, ``BlockRadixSort`` does not invert the input key bits
127
+ //! when performing a descending sort. Instead, it has special logic to reverse
128
+ //! the order of the keys while sorting.
129
+ //!
130
+ //! Stability
131
+ //! --------------------------------------------------
132
+ //!
133
+ //! BlockRadixSort is stable. For floating-point types -0.0 and +0.0
134
+ //! are considered equal and appear in the result in the same order as they
135
+ //! appear in the input.
136
+ //!
137
+ //!
138
+ //! Performance Considerations
139
+ //! --------------------------------------------------
140
+ //!
141
+ //! * @granularity
142
+ //!
143
+ //! A Simple Example
144
+ //! --------------------------------------------------
145
+ //!
146
+ //! @blockcollective{BlockRadixSort}
147
+ //!
148
+ //! The code snippet below illustrates a sort of 512 integer keys that
149
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128 threads
150
+ //! where each thread owns 4 consecutive items.
151
+ //!
152
+ //! .. tab-set-code::
153
+ //!
154
+ //! .. code-block:: c++
155
+ //!
156
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_radix_sort.cuh>
157
+ //!
158
+ //! __global__ void kernel(...)
159
+ //! {
160
+ //! // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer items each
161
+ //! using BlockRadixSort = cub::BlockRadixSort<int, 128, 4>;
162
+ //!
163
+ //! // Allocate shared memory for BlockRadixSort
164
+ //! __shared__ typename BlockRadixSort::TempStorage temp_storage;
165
+ //!
166
+ //! // Obtain a segment of consecutive items that are blocked across threads
167
+ //! int thread_keys[4];
168
+ //! ...
169
+ //!
170
+ //! // Collectively sort the keys
171
+ //! BlockRadixSort(temp_storage).Sort(thread_keys);
172
+ //!
173
+ //! ...
174
+ //!
175
+ //! .. code-block:: python
176
+ //!
177
+ //! from cuda import coop
178
+ //! from pynvjitlink import patch
179
+ //! patch.patch_numba_linker(lto=True)
180
+ //!
181
+ //! # Specialize radix sort for a 1D block of 128 threads owning 4 integer items each
182
+ //! block_radix_sort = coop.block.radix_sort_keys(numba.int32, 128, 4)
183
+ //! temp_storage_bytes = block_radix_sort.temp_storage_bytes
184
+ //!
185
+ //! @cuda.jit(link=block_radix_sort.files)
186
+ //! def kernel():
187
+ //! # Allocate shared memory for radix sort
188
+ //! temp_storage = cuda.shared.array(shape=temp_storage_bytes, dtype='uint8')
189
+ //!
190
+ //! # Obtain a segment of consecutive items that are blocked across threads
191
+ //! thread_keys = cuda.local.array(shape=items_per_thread, dtype=numba.int32)
192
+ //! # ...
193
+ //!
194
+ //! # Collectively sort the keys
195
+ //! block_radix_sort(temp_storage, thread_keys)
196
+ //! # ...
197
+ //!
198
+ //! Suppose the set of input ``thread_keys`` across the block of threads is
199
+ //! ``{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }``.
200
+ //! The corresponding output ``thread_keys`` in those threads will be
201
+ //! ``{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }``.
202
+ //!
203
+ //! Re-using dynamically allocated shared memory
204
+ //! --------------------------------------------------
205
+ //!
206
+ //! The ``block/example_block_reduce_dyn_smem.cu`` example illustrates usage of dynamically allocated shared memory with
207
+ //! BlockReduce and how to re-purpose the same memory region.
208
+ //!
209
+ //! This example can be easily adapted to the storage required by BlockRadixSort.
210
+ //! @endrst
211
+ //!
212
+ //! @tparam KeyT
213
+ //! KeyT type
214
+ //!
215
+ //! @tparam BLOCK_DIM_X
216
+ //! The thread block length in threads along the X dimension
217
+ //!
218
+ //! @tparam ITEMS_PER_THREAD
219
+ //! The number of items per thread
220
+ //!
221
+ //! @tparam ValueT
222
+ //! **[optional]** ValueT type (default: cub::NullType, which indicates a keys-only sort)
223
+ //!
224
+ //! @tparam RADIX_BITS
225
+ //! **[optional]** The number of radix bits per digit place (default: 4 bits)
226
+ //!
227
+ //! @tparam MEMOIZE_OUTER_SCAN
228
+ //! **[optional]** Whether or not to buffer outer raking scan partials to incur fewer shared memory
229
+ //! reads at the expense of higher register pressure
230
+ //! (default: true)
231
+ //!
232
+ //! @tparam INNER_SCAN_ALGORITHM
233
+ //! **[optional]** The cub::BlockScanAlgorithm algorithm to use
234
+ //! (default: cub::BLOCK_SCAN_WARP_SCANS)
235
+ //!
236
+ //! @tparam SMEM_CONFIG
237
+ //! **[optional]** Shared memory bank mode (default: `cudaSharedMemBankSizeFourByte`)
238
+ //!
239
+ //! @tparam BLOCK_DIM_Y
240
+ //! **[optional]** The thread block length in threads along the Y dimension (default: 1)
241
+ //!
242
+ //! @tparam BLOCK_DIM_Z
243
+ //! **[optional]** The thread block length in threads along the Z dimension (default: 1)
244
+ //!
245
+ template <typename KeyT,
246
+ int BLOCK_DIM_X,
247
+ int ITEMS_PER_THREAD,
248
+ typename ValueT = NullType,
249
+ int RADIX_BITS = 4,
250
+ bool MEMOIZE_OUTER_SCAN = true,
251
+ BlockScanAlgorithm INNER_SCAN_ALGORITHM = BLOCK_SCAN_WARP_SCANS,
252
+ cudaSharedMemConfig SMEM_CONFIG = cudaSharedMemBankSizeFourByte,
253
+ int BLOCK_DIM_Y = 1,
254
+ int BLOCK_DIM_Z = 1>
255
+ class BlockRadixSort
256
+ {
257
+ private:
258
+ /******************************************************************************
259
+ * Constants and type definitions
260
+ ******************************************************************************/
261
+
262
+ enum
263
+ {
264
+ // The thread block size in threads (product of the three block-dimension template parameters)
265
+ BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
266
+
267
+ // Whether this is a keys-only sort, i.e. there are no values to be trucked along with keys (true when ValueT is NullType)
268
+ KEYS_ONLY = ::cuda::std::is_same_v<ValueT, NullType>,
269
+ };
270
+
271
+ // KeyT traits and unsigned bits type
272
+ using traits = detail::radix::traits_t<KeyT>;
273
+ using bit_ordered_type = typename traits::bit_ordered_type;
274
+ using bit_ordered_conversion = typename traits::bit_ordered_conversion_policy;
275
+
276
+ /// Ascending BlockRadixRank utility type
277
+ using AscendingBlockRadixRank =
278
+ BlockRadixRank<BLOCK_DIM_X,
279
+ RADIX_BITS,
280
+ false,
281
+ MEMOIZE_OUTER_SCAN,
282
+ INNER_SCAN_ALGORITHM,
283
+ SMEM_CONFIG,
284
+ BLOCK_DIM_Y,
285
+ BLOCK_DIM_Z>;
286
+
287
+ /// Descending BlockRadixRank utility type
288
+ using DescendingBlockRadixRank =
289
+ BlockRadixRank<BLOCK_DIM_X,
290
+ RADIX_BITS,
291
+ true,
292
+ MEMOIZE_OUTER_SCAN,
293
+ INNER_SCAN_ALGORITHM,
294
+ SMEM_CONFIG,
295
+ BLOCK_DIM_Y,
296
+ BLOCK_DIM_Z>;
297
+
298
+ /// Digit extractor type
299
+ using fundamental_digit_extractor_t = BFEDigitExtractor<KeyT>;
300
+
301
+ /// BlockExchange utility type for keys
302
+ using BlockExchangeKeys = BlockExchange<KeyT, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z>;
303
+
304
+ /// BlockExchange utility type for values
305
+ using BlockExchangeValues = BlockExchange<ValueT, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z>;
306
+
307
+ #ifndef _CCCL_DOXYGEN_INVOKED // Do not document
308
+ /// Shared memory storage layout type (a union: the ranking and exchange storage members alias the same bytes, so only one may be live at a time)
309
+ union _TempStorage
310
+ {
311
+ typename AscendingBlockRadixRank::TempStorage asending_ranking_storage; // NOTE(review): "asending" is a typo for "ascending"; a rename must also update the ascending RankKeys overload
312
+ typename DescendingBlockRadixRank::TempStorage descending_ranking_storage; // scratch for the descending RankKeys overload
313
+ typename BlockExchangeKeys::TempStorage exchange_keys; // scratch for scattering keys by rank
314
+ typename BlockExchangeValues::TempStorage exchange_values; // scratch for scattering values by rank
315
+ };
316
+ #endif // _CCCL_DOXYGEN_INVOKED
317
+
318
+ /******************************************************************************
319
+ * Thread fields
320
+ ******************************************************************************/
321
+
322
+ /// Shared storage reference
323
+ _TempStorage& temp_storage;
324
+
325
+ /// Linear thread-id
326
+ unsigned int linear_tid;
327
+
328
+ /******************************************************************************
329
+ * Utility methods
330
+ ******************************************************************************/
331
+
332
+ /// Internal storage allocator: returns a reference to a function-local ``__shared__`` _TempStorage (used by the default constructor, which takes no caller-provided storage)
333
+ _CCCL_DEVICE _CCCL_FORCEINLINE _TempStorage& PrivateStorage()
334
+ {
335
+ __shared__ _TempStorage private_storage; // statically declared shared-memory scratch space
336
+ return private_storage;
337
+ }
338
+
339
+ /// Rank keys (specialized for ascending sort): overload chosen by the ``false_type`` tag; forwards the keys, ranks and digit extractor to AscendingBlockRadixRank::RankKeys
340
+ template <class DigitExtractorT>
341
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
342
+ RankKeys(bit_ordered_type (&unsigned_keys)[ITEMS_PER_THREAD],
343
+ int (&ranks)[ITEMS_PER_THREAD],
344
+ DigitExtractorT digit_extractor,
345
+ ::cuda::std::false_type /*is_descending*/)
346
+ {
347
+ AscendingBlockRadixRank(temp_storage.asending_ranking_storage).RankKeys(unsigned_keys, ranks, digit_extractor); // uses the ascending member of the _TempStorage union
348
+ }
349
+
350
+ /// Rank keys (specialized for descending sort): overload chosen by the ``true_type`` tag; forwards the keys, ranks and digit extractor to DescendingBlockRadixRank::RankKeys
351
+ template <class DigitExtractorT>
352
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
353
+ RankKeys(bit_ordered_type (&unsigned_keys)[ITEMS_PER_THREAD],
354
+ int (&ranks)[ITEMS_PER_THREAD],
355
+ DigitExtractorT digit_extractor,
356
+ ::cuda::std::true_type /*is_descending*/)
357
+ {
358
+ DescendingBlockRadixRank(temp_storage.descending_ranking_storage).RankKeys(unsigned_keys, ranks, digit_extractor); // uses the descending member of the _TempStorage union
359
+ }
360
+
361
+ /// ExchangeValues (specialized for key-value sort, to-blocked arrangement): overload chosen by the (false_type, true_type) tag pair
362
+ _CCCL_DEVICE _CCCL_FORCEINLINE void ExchangeValues(
363
+ ValueT (&values)[ITEMS_PER_THREAD],
364
+ int (&ranks)[ITEMS_PER_THREAD],
365
+ ::cuda::std::false_type /*is_keys_only*/,
366
+ ::cuda::std::true_type /*is_blocked*/)
367
+ {
368
+ __syncthreads(); // barrier first: exchange_values aliases exchange_keys in the _TempStorage union, and callers invoke this right after the key exchange
369
+
370
+ // Exchange values through shared memory in blocked arrangement, using the same ranks that were used for the keys
371
+ BlockExchangeValues(temp_storage.exchange_values).ScatterToBlocked(values, ranks);
372
+ }
373
+
374
+ /// ExchangeValues (specialized for key-value sort, to-striped arrangement): overload chosen by the (false_type, false_type) tag pair
375
+ _CCCL_DEVICE _CCCL_FORCEINLINE void ExchangeValues(
376
+ ValueT (&values)[ITEMS_PER_THREAD],
377
+ int (&ranks)[ITEMS_PER_THREAD],
378
+ ::cuda::std::false_type /*is_keys_only*/,
379
+ ::cuda::std::false_type /*is_blocked*/)
380
+ {
381
+ __syncthreads(); // barrier first: exchange_values aliases exchange_keys in the _TempStorage union, and callers invoke this right after the key exchange
382
+
383
+ // Exchange values through shared memory in striped arrangement, using the same ranks that were used for the keys
384
+ BlockExchangeValues(temp_storage.exchange_values).ScatterToStriped(values, ranks);
385
+ }
386
+
387
+ /// ExchangeValues (specialized for keys-only sort): a no-op for either arrangement; both array parameters are unnamed and unused
388
+ template <bool IS_BLOCKED>
389
+ _CCCL_DEVICE _CCCL_FORCEINLINE void ExchangeValues(
390
+ ValueT (& /*values*/)[ITEMS_PER_THREAD],
391
+ int (& /*ranks*/)[ITEMS_PER_THREAD],
392
+ ::cuda::std::true_type /*is_keys_only*/,
393
+ ::cuda::std::bool_constant<IS_BLOCKED> /*is_blocked*/)
394
+ {} // intentionally empty: no shared-memory access and no barrier on the keys-only path
395
+
396
+ /**
397
+ * @brief Sort blocked arrangement
398
+ *
399
+ * @param keys
400
+ * Keys to sort
401
+ *
402
+ * @param values
403
+ * Values to sort
404
+ *
405
+ * @param begin_bit
406
+ * The beginning (least-significant) bit index needed for key comparison
407
+ *
408
+ * @param end_bit
409
+ * The past-the-end (most-significant) bit index needed for key comparison
410
+ *
411
+ * @param is_descending
412
+ * Tag whether is a descending-order sort
413
+ *
414
+ * @param is_keys_only
415
+ * Tag whether is keys-only sort
416
+ */
417
+ template <bool DESCENDING, bool KEYS_ONLY, class DecomposerT = detail::identity_decomposer_t> // NOTE(review): the KEYS_ONLY template parameter has the same name as the class-level enum constant
418
+ _CCCL_DEVICE _CCCL_FORCEINLINE void SortBlocked(
419
+ KeyT (&keys)[ITEMS_PER_THREAD],
420
+ ValueT (&values)[ITEMS_PER_THREAD],
421
+ int begin_bit,
422
+ int end_bit,
423
+ ::cuda::std::bool_constant<DESCENDING> is_descending,
424
+ ::cuda::std::bool_constant<KEYS_ONLY> is_keys_only,
425
+ DecomposerT decomposer = {})
426
+ {
427
+ bit_ordered_type(&unsigned_keys)[ITEMS_PER_THREAD] = reinterpret_cast<bit_ordered_type(&)[ITEMS_PER_THREAD]>(keys); // in-place view of the caller's keys as bit_ordered_type; no copy is made
428
+
429
+ _CCCL_PRAGMA_UNROLL_FULL()
430
+ for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++)
431
+ {
432
+ unsigned_keys[KEY] = bit_ordered_conversion::to_bit_ordered(decomposer, unsigned_keys[KEY]);
433
+ }
434
+
435
+ // Radix sorting passes: each pass handles one digit starting at begin_bit, which then advances by RADIX_BITS
436
+ while (true) // NOTE(review): the exit test sits at the bottom, so one pass always runs even when end_bit <= begin_bit
437
+ {
438
+ int pass_bits = ::cuda::std::min(RADIX_BITS, end_bit - begin_bit); // digit width for this pass: RADIX_BITS, or whatever remains before end_bit if that is smaller
439
+ auto digit_extractor =
440
+ traits::template digit_extractor<fundamental_digit_extractor_t>(begin_bit, pass_bits, decomposer);
441
+
442
+ // Rank the blocked keys (the is_descending tag picks the ascending or descending RankKeys overload)
443
+ int ranks[ITEMS_PER_THREAD];
444
+ RankKeys(unsigned_keys, ranks, digit_extractor, is_descending);
445
+ begin_bit += RADIX_BITS;
446
+
447
+ __syncthreads(); // ranking storage and exchange storage are members of the same _TempStorage union
448
+
449
+ // Exchange keys through shared memory in blocked arrangement
450
+ BlockExchangeKeys(temp_storage.exchange_keys).ScatterToBlocked(keys, ranks);
451
+
452
+ // Exchange values through shared memory in blocked arrangement (no-op overload when is_keys_only is true_type)
453
+ ExchangeValues(values, ranks, is_keys_only, ::cuda::std::true_type());
454
+
455
+ // Quit if done (all bits up to end_bit have been processed)
456
+ if (begin_bit >= end_bit)
457
+ {
458
+ break;
459
+ }
460
+
461
+ __syncthreads(); // barrier before the next pass ranks keys using the aliased union storage again
462
+ }
463
+
464
+ // Untwiddle bits if necessary (inverse of the to_bit_ordered conversion applied on entry)
465
+ _CCCL_PRAGMA_UNROLL_FULL()
466
+ for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++)
467
+ {
468
+ unsigned_keys[KEY] = bit_ordered_conversion::from_bit_ordered(decomposer, unsigned_keys[KEY]);
469
+ }
470
+ }
471
+
472
+ public:
473
+ #ifndef _CCCL_DOXYGEN_INVOKED // Do not document
474
+
475
+ /**
476
+ * @brief Sort blocked -> striped arrangement
477
+ *
478
+ * @param keys
479
+ * Keys to sort
480
+ *
481
+ * @param values
482
+ * Values to sort
483
+ *
484
+ * @param begin_bit
485
+ * The beginning (least-significant) bit index needed for key comparison
486
+ *
487
+ * @param end_bit
488
+ * The past-the-end (most-significant) bit index needed for key comparison
489
+ *
490
+ * @param is_descending
491
+ * Tag whether is a descending-order sort
492
+ *
493
+ * @param is_keys_only
494
+ * Tag whether is keys-only sort
495
+ */
496
+ template <bool DESCENDING, bool KEYS_ONLY, class DecomposerT = detail::identity_decomposer_t> // NOTE(review): the KEYS_ONLY template parameter has the same name as the class-level enum constant
497
+ _CCCL_DEVICE _CCCL_FORCEINLINE void SortBlockedToStriped(
498
+ KeyT (&keys)[ITEMS_PER_THREAD],
499
+ ValueT (&values)[ITEMS_PER_THREAD],
500
+ int begin_bit,
501
+ int end_bit,
502
+ ::cuda::std::bool_constant<DESCENDING> is_descending,
503
+ ::cuda::std::bool_constant<KEYS_ONLY> is_keys_only,
504
+ DecomposerT decomposer = {})
505
+ {
506
+ bit_ordered_type(&unsigned_keys)[ITEMS_PER_THREAD] = reinterpret_cast<bit_ordered_type(&)[ITEMS_PER_THREAD]>(keys); // in-place view of the caller's keys as bit_ordered_type; no copy is made
507
+
508
+ _CCCL_PRAGMA_UNROLL_FULL()
509
+ for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++)
510
+ {
511
+ unsigned_keys[KEY] = bit_ordered_conversion::to_bit_ordered(decomposer, unsigned_keys[KEY]);
512
+ }
513
+
514
+ // Radix sorting passes: each pass handles one digit starting at begin_bit, which then advances by RADIX_BITS
515
+ while (true) // NOTE(review): the only exit is the last-pass branch below, so one pass always runs even when end_bit <= begin_bit
516
+ {
517
+ int pass_bits = ::cuda::std::min(RADIX_BITS, end_bit - begin_bit); // digit width for this pass: RADIX_BITS, or whatever remains before end_bit if that is smaller
518
+ auto digit_extractor =
519
+ traits::template digit_extractor<fundamental_digit_extractor_t>(begin_bit, pass_bits, decomposer);
520
+
521
+ // Rank the blocked keys (the is_descending tag picks the ascending or descending RankKeys overload)
522
+ int ranks[ITEMS_PER_THREAD];
523
+ RankKeys(unsigned_keys, ranks, digit_extractor, is_descending);
524
+ begin_bit += RADIX_BITS;
525
+
526
+ __syncthreads(); // ranking storage and exchange storage are members of the same _TempStorage union
527
+
528
+ // Check if this is the last pass (begin_bit has already been advanced past this pass's digit)
529
+ if (begin_bit >= end_bit)
530
+ {
531
+ // Last pass exchanges keys through shared memory in striped arrangement
532
+ BlockExchangeKeys(temp_storage.exchange_keys).ScatterToStriped(keys, ranks);
533
+
534
+ // Last pass exchanges values through shared memory in striped arrangement (no-op overload when is_keys_only is true_type)
535
+ ExchangeValues(values, ranks, is_keys_only, ::cuda::std::false_type());
536
+
537
+ // Quit
538
+ break;
539
+ }
540
+
541
+ // Exchange keys through shared memory in blocked arrangement (every pass except the last)
542
+ BlockExchangeKeys(temp_storage.exchange_keys).ScatterToBlocked(keys, ranks);
543
+
544
+ // Exchange values through shared memory in blocked arrangement (no-op overload when is_keys_only is true_type)
545
+ ExchangeValues(values, ranks, is_keys_only, ::cuda::std::true_type());
546
+
547
+ __syncthreads(); // barrier before the next pass ranks keys using the aliased union storage again
548
+ }
549
+
550
+ // Untwiddle bits if necessary (inverse of the to_bit_ordered conversion applied on entry)
551
+ _CCCL_PRAGMA_UNROLL_FULL()
552
+ for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++)
553
+ {
554
+ unsigned_keys[KEY] = bit_ordered_conversion::from_bit_ordered(decomposer, unsigned_keys[KEY]);
555
+ }
556
+ }
557
+
558
+ #endif // _CCCL_DOXYGEN_INVOKED
559
+
560
+ /// @smemstorage{BlockRadixSort}
561
+ struct TempStorage : Uninitialized<_TempStorage> // public storage type handed to the constructor, which calls Alias() on it to bind the private _TempStorage reference
562
+ {};
563
+
564
+ //! @name Collective constructors
565
+ //! @{
566
+
567
+ //! @brief Collective constructor using a private static allocation of shared memory as temporary storage.
568
+ _CCCL_DEVICE _CCCL_FORCEINLINE BlockRadixSort()
569
+ : temp_storage(PrivateStorage()) // bind to the function-local __shared__ storage returned by PrivateStorage()
570
+ , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) // thread id computed from the block-dimension template parameters
571
+ {}
572
+
573
+ /**
574
+ * @brief Collective constructor using the specified memory allocation as temporary storage.
575
+ *
576
+ * @param[in] temp_storage
577
+ * Reference to memory allocation having layout type TempStorage
578
+ */
579
+ _CCCL_DEVICE _CCCL_FORCEINLINE BlockRadixSort(TempStorage& temp_storage)
580
+ : temp_storage(temp_storage.Alias()) // the parameter shadows the member here; Alias() is called on the caller-provided TempStorage
581
+ , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) // thread id computed from the block-dimension template parameters
582
+ {}
583
+
584
+ //! @} end member group
585
+ //! @name Sorting (blocked arrangements)
586
+ //! @{
587
+
588
+ //! @rst
589
+ //! Performs an ascending block-wide radix sort over a
590
+ //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys.
591
+ //!
592
+ //! - @granularity
593
+ //! - @smemreuse
594
+ //!
595
+ //! Snippet
596
+ //! +++++++
597
+ //!
598
+ //! The code snippet below illustrates a sort of 512 integer keys that
599
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128 threads
600
+ //! where each thread owns 4 consecutive keys.
601
+ //!
602
+ //! .. code-block:: c++
603
+ //!
604
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_radix_sort.cuh>
605
+ //!
606
+ //! __global__ void ExampleKernel(...)
607
+ //! {
608
+ //! // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys each
609
+ //! using BlockRadixSort = cub::BlockRadixSort<int, 128, 4>;
610
+ //!
611
+ //! // Allocate shared memory for BlockRadixSort
612
+ //! __shared__ typename BlockRadixSort::TempStorage temp_storage;
613
+ //!
614
+ //! // Obtain a segment of consecutive items that are blocked across threads
615
+ //! int thread_keys[4];
616
+ //! ...
617
+ //!
618
+ //! // Collectively sort the keys
619
+ //! BlockRadixSort(temp_storage).Sort(thread_keys);
620
+ //!
621
+ //! Suppose the set of input ``thread_keys`` across the block of threads is
622
+ //! ``{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }``.
623
+ //! The corresponding output ``thread_keys`` in those threads will be
624
+ //! ``{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }``.
625
+ //! @endrst
626
+ //!
627
+ //! @param[in,out] keys
628
+ //! Keys to sort
629
+ //!
630
+ //! @param[in] begin_bit
631
+ //! **[optional]** The beginning (least-significant) bit index needed for key comparison
632
+ //!
633
+ //! @param[in] end_bit
634
+ //! **[optional]** The past-the-end (most-significant) bit index needed for key comparison
635
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
636
+ Sort(KeyT (&keys)[ITEMS_PER_THREAD], int begin_bit = 0, int end_bit = sizeof(KeyT) * 8)
637
+ {
638
+ NullType values[ITEMS_PER_THREAD]; // placeholder value array; it binds to SortBlocked's ValueT parameter only when ValueT is NullType
639
+
640
+ SortBlocked(keys, values, begin_bit, end_bit, ::cuda::std::false_type(), detail::bool_constant_v<KEYS_ONLY>); // false_type tag = ascending order
641
+ }
642
+
643
+ //! @rst
644
+ //! Performs an ascending block-wide radix sort over a
645
+ //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys.
646
+ //!
647
+ //! * @granularity
648
+ //! * @smemreuse
649
+ //!
650
+ //! Snippet
651
+ //! ==========================================================================
652
+ //!
653
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
654
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
655
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
656
+ //! tuple of references to relevant members of the key.
657
+ //!
658
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
659
+ //! :language: c++
660
+ //! :dedent:
661
+ //! :start-after: example-begin custom-type
662
+ //! :end-before: example-end custom-type
663
+ //!
664
+ //! The code snippet below illustrates a sort of 2 keys that
665
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 2 threads
666
+ //! where each thread owns 1 key.
667
+ //!
668
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
669
+ //! :language: c++
670
+ //! :dedent:
671
+ //! :start-after: example-begin keys-bits
672
+ //! :end-before: example-end keys-bits
673
+ //!
674
+ //! @endrst
675
+ //!
676
+ //! @tparam DecomposerT
677
+ //! **[inferred]** Type of a callable object responsible for decomposing a
678
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
679
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
680
+ //! The leftmost element of the tuple is considered the most significant.
681
+ //! The call operator must not modify members of the key.
682
+ //!
683
+ //! @param[in,out] keys
684
+ //! Keys to sort
685
+ //!
686
+ //! @param decomposer
687
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
688
+ //! references to its constituent arithmetic types. The leftmost element of
689
+ //! the tuple is considered the most significant. The call operator must not
690
+ //! modify members of the key.
691
+ //!
692
+ //! @param[in] begin_bit
693
+ //! The least-significant bit index (inclusive) needed for
694
+ //! key comparison
695
+ //!
696
+ //! @param[in] end_bit
697
+ //! The most-significant bit index (exclusive) needed for key
698
+ //! comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`)
699
+ template <class DecomposerT>
700
+ _CCCL_DEVICE _CCCL_FORCEINLINE //
701
+ ::cuda::std::enable_if_t< //
702
+ !::cuda::std::is_convertible_v<DecomposerT, int>> // removes this overload when DecomposerT converts to int (presumably so integer arguments select the begin_bit/end_bit overloads; confirm)
703
+ Sort(KeyT (&keys)[ITEMS_PER_THREAD], DecomposerT decomposer, int begin_bit, int end_bit)
704
+ {
705
+ NullType values[ITEMS_PER_THREAD]; // placeholder value array; it binds to SortBlocked's ValueT parameter only when ValueT is NullType
706
+
707
+ SortBlocked(
708
+ keys, values, begin_bit, end_bit, ::cuda::std::false_type(), detail::bool_constant_v<KEYS_ONLY>, decomposer); // false_type tag = ascending order
709
+ }
710
+
711
+ //! @rst
712
+ //! Performs an ascending block-wide radix sort over a
713
+ //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys.
714
+ //!
715
+ //! * @granularity
716
+ //! * @smemreuse
717
+ //!
718
+ //! Snippet
719
+ //! ==========================================================================
720
+ //!
721
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
722
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
723
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
724
+ //! tuple of references to relevant members of the key.
725
+ //!
726
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
727
+ //! :language: c++
728
+ //! :dedent:
729
+ //! :start-after: example-begin custom-type
730
+ //! :end-before: example-end custom-type
731
+ //!
732
+ //! The code snippet below illustrates a sort of 6 keys that
733
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 2 threads
734
+ //! where each thread owns 3 consecutive keys.
735
+ //!
736
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
737
+ //! :language: c++
738
+ //! :dedent:
739
+ //! :start-after: example-begin keys
740
+ //! :end-before: example-end keys
741
+ //!
742
+ //! @endrst
743
+ //!
744
+ //! @tparam DecomposerT
745
+ //! **[inferred]** Type of a callable object responsible for decomposing a
746
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
747
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
748
+ //! The leftmost element of the tuple is considered the most significant.
749
+ //! The call operator must not modify members of the key.
750
+ //!
751
+ //! @param[in,out] keys
752
+ //! Keys to sort
753
+ //!
754
+ //! @param decomposer
755
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
756
+ //! references to its constituent arithmetic types. The leftmost element of
757
+ //! the tuple is considered the most significant. The call operator must not
758
+ //! modify members of the key.
759
+ template <class DecomposerT>
760
+ _CCCL_DEVICE _CCCL_FORCEINLINE //
761
+ ::cuda::std::enable_if_t< //
762
+ !::cuda::std::is_convertible_v<DecomposerT, int>> // removes this overload when DecomposerT converts to int (presumably so integer arguments select the begin_bit/end_bit overloads; confirm)
763
+ Sort(KeyT (&keys)[ITEMS_PER_THREAD], DecomposerT decomposer)
764
+ {
765
+ Sort(keys, decomposer, 0, detail::radix::traits_t<KeyT>::default_end_bit(decomposer)); // delegates to the bit-range overload with begin_bit 0 and the end bit reported by the key traits for this decomposer
766
+ }
767
+
768
+ //! @rst
769
+ //! Performs an ascending block-wide radix sort across a :ref:`blocked arrangement <flexible-data-arrangement>`
770
+ //! of keys and values.
771
+ //!
772
+ //! - BlockRadixSort can only accommodate one associated tile of values. To "truck along"
773
+ //! more than one tile of values, simply perform a key-value sort of the keys paired
774
+ //! with a temporary value array that enumerates the key indices. The reordered indices
775
+ //! can then be used as a gather-vector for exchanging other associated tile data through
776
+ //! shared memory.
777
+ //! - @granularity
778
+ //! - @smemreuse
779
+ //!
780
+ //! Snippet
781
+ //! +++++++
782
+ //!
783
+ //! The code snippet below illustrates a sort of 512 integer keys and values that
784
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128 threads
785
+ //! where each thread owns 4 consecutive pairs.
786
+ //!
787
+ //! .. code-block:: c++
788
+ //!
789
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_radix_sort.cuh>
790
+ //!
791
+ //! __global__ void ExampleKernel(...)
792
+ //! {
793
+ //! // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and values each
794
+ //! using BlockRadixSort = cub::BlockRadixSort<int, 128, 4, int>;
795
+ //!
796
+ //! // Allocate shared memory for BlockRadixSort
797
+ //! __shared__ typename BlockRadixSort::TempStorage temp_storage;
798
+ //!
799
+ //! // Obtain a segment of consecutive items that are blocked across threads
800
+ //! int thread_keys[4];
801
+ //! int thread_values[4];
802
+ //! ...
803
+ //!
804
+ //! // Collectively sort the keys and values among block threads
805
+ //! BlockRadixSort(temp_storage).Sort(thread_keys, thread_values);
806
+ //!
807
+ //!
808
+ //!
809
+ //! Suppose the set of input ``thread_keys`` across the block of threads is
810
+ //! ``{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }``. The
811
+ //! corresponding output ``thread_keys`` in those threads will be
812
+ //! ``{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }``.
813
+ //!
814
+ //! @endrst
815
+ //!
816
+ //! @param[in,out] keys
817
+ //! Keys to sort
818
+ //!
819
+ //! @param[in,out] values
820
+ //! Values to sort
821
+ //!
822
+ //! @param[in] begin_bit
823
+ //! **[optional]** The beginning (least-significant) bit index needed for key comparison
824
+ //!
825
+ //! @param[in] end_bit
826
+ //! **[optional]** The past-the-end (most-significant) bit index needed for key comparison
827
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
828
+ Sort(KeyT (&keys)[ITEMS_PER_THREAD],
829
+ ValueT (&values)[ITEMS_PER_THREAD],
830
+ int begin_bit = 0,
831
+ int end_bit = sizeof(KeyT) * 8)
832
+ {
833
+ SortBlocked(keys, values, begin_bit, end_bit, ::cuda::std::false_type(), detail::bool_constant_v<KEYS_ONLY>); // false_type tag = ascending order; the KEYS_ONLY tag comes from the class-level enum
834
+ }
835
+
836
+ //! @rst
837
+ //! Performs an ascending block-wide radix sort over a
838
+ //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys and values.
839
+ //!
840
+ //! * BlockRadixSort can only accommodate one associated tile of values. To "truck along"
841
+ //! more than one tile of values, simply perform a key-value sort of the keys paired
842
+ //! with a temporary value array that enumerates the key indices. The reordered indices
843
+ //! can then be used as a gather-vector for exchanging other associated tile data through
844
+ //! shared memory.
845
+ //! * @granularity
846
+ //! * @smemreuse
847
+ //!
848
+ //! Snippet
849
+ //! ==========================================================================
850
+ //!
851
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
852
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
853
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
854
+ //! tuple of references to relevant members of the key.
855
+ //!
856
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
857
+ //! :language: c++
858
+ //! :dedent:
859
+ //! :start-after: example-begin custom-type
860
+ //! :end-before: example-end custom-type
861
+ //!
862
+ //! The code snippet below illustrates a sort of 2 keys and values that
863
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 2 threads
864
+ //! where each thread owns 1 pair.
865
+ //!
866
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
867
+ //! :language: c++
868
+ //! :dedent:
869
+ //! :start-after: example-begin pairs-bits
870
+ //! :end-before: example-end pairs-bits
871
+ //!
872
+ //! @endrst
873
+ //!
874
+ //! @tparam DecomposerT
875
+ //! **[inferred]** Type of a callable object responsible for decomposing a
876
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
877
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
878
+ //! The leftmost element of the tuple is considered the most significant.
879
+ //! The call operator must not modify members of the key.
880
+ //!
881
+ //! @param[in,out] keys
882
+ //! Keys to sort
883
+ //!
884
+ //! @param[in,out] values
885
+ //! Values to sort
886
+ //!
887
+ //! @param decomposer
888
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
889
+ //! references to its constituent arithmetic types. The leftmost element of
890
+ //! the tuple is considered the most significant. The call operator must not
891
+ //! modify members of the key.
892
+ //!
893
+ //! @param[in] begin_bit
894
+ //! The least-significant bit index (inclusive) needed for
895
+ //! key comparison
896
+ //!
897
+ //! @param[in] end_bit
898
+ //! The most-significant bit index (exclusive) needed for key
899
+ //! comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`)
900
+ template <class DecomposerT>
901
+ _CCCL_DEVICE _CCCL_FORCEINLINE //
902
+ ::cuda::std::enable_if_t< //
903
+ !::cuda::std::is_convertible_v<DecomposerT, int>> // removes this overload when DecomposerT converts to int (presumably so integer arguments select the begin_bit/end_bit overloads; confirm)
904
+ Sort(KeyT (&keys)[ITEMS_PER_THREAD],
905
+ ValueT (&values)[ITEMS_PER_THREAD],
906
+ DecomposerT decomposer,
907
+ int begin_bit,
908
+ int end_bit)
909
+ {
910
+ SortBlocked(
911
+ keys, values, begin_bit, end_bit, ::cuda::std::false_type(), detail::bool_constant_v<KEYS_ONLY>, decomposer); // false_type tag = ascending order
912
+ }
913
+
914
+ //! @rst
915
+ //! Performs an ascending block-wide radix sort over a
916
+ //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys and values.
917
+ //!
918
+ //! * BlockRadixSort can only accommodate one associated tile of values. To "truck along"
919
+ //! more than one tile of values, simply perform a key-value sort of the keys paired
920
+ //! with a temporary value array that enumerates the key indices. The reordered indices
921
+ //! can then be used as a gather-vector for exchanging other associated tile data through
922
+ //! shared memory.
923
+ //! * @granularity
924
+ //! * @smemreuse
925
+ //!
926
+ //! Snippet
927
+ //! ==========================================================================
928
+ //!
929
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
930
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
931
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
932
+ //! tuple of references to relevant members of the key.
933
+ //!
934
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
935
+ //! :language: c++
936
+ //! :dedent:
937
+ //! :start-after: example-begin custom-type
938
+ //! :end-before: example-end custom-type
939
+ //!
940
+ //! The code snippet below illustrates a sort of 6 keys and values that
941
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 2 threads
942
+ //! where each thread owns 3 consecutive pairs.
943
+ //!
944
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
945
+ //! :language: c++
946
+ //! :dedent:
947
+ //! :start-after: example-begin pairs
948
+ //! :end-before: example-end pairs
949
+ //!
950
+ //! @endrst
951
+ //!
952
+ //! @tparam DecomposerT
953
+ //! **[inferred]** Type of a callable object responsible for decomposing a
954
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
955
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
956
+ //! The leftmost element of the tuple is considered the most significant.
957
+ //! The call operator must not modify members of the key.
958
+ //!
959
+ //! @param[in,out] keys
960
+ //! Keys to sort
961
+ //!
962
+ //! @param[in,out] values
963
+ //! Values to sort
964
+ //!
965
+ //! @param decomposer
966
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
967
+ //! references to its constituent arithmetic types. The leftmost element of
968
+ //! the tuple is considered the most significant. The call operator must not
969
+ //! modify members of the key.
970
+ template <class DecomposerT>
971
+ _CCCL_DEVICE _CCCL_FORCEINLINE //
972
+ ::cuda::std::enable_if_t< //
973
+ !::cuda::std::is_convertible_v<DecomposerT, int>> // removes this overload when DecomposerT converts to int (presumably so integer arguments select the begin_bit/end_bit overloads; confirm)
974
+ Sort(KeyT (&keys)[ITEMS_PER_THREAD], ValueT (&values)[ITEMS_PER_THREAD], DecomposerT decomposer)
975
+ {
976
+ Sort(keys, values, decomposer, 0, detail::radix::traits_t<KeyT>::default_end_bit(decomposer)); // delegates to the bit-range overload with begin_bit 0 and the end bit reported by the key traits for this decomposer
977
+ }
978
+
979
+ //! @rst
980
+ //! Performs a descending block-wide radix sort over a :ref:`blocked arrangement <flexible-data-arrangement>`
981
+ //! of keys.
982
+ //!
983
+ //! - @granularity
984
+ //! - @smemreuse
985
+ //!
986
+ //! Snippet
987
+ //! +++++++
988
+ //!
989
+ //! The code snippet below illustrates a sort of 512 integer keys that
990
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128 threads
991
+ //! where each thread owns 4 consecutive keys.
992
+ //!
993
+ //! .. code-block:: c++
994
+ //!
995
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_radix_sort.cuh>
996
+ //!
997
+ //! __global__ void ExampleKernel(...)
998
+ //! {
999
+ //! // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys each
1000
+ //! using BlockRadixSort = cub::BlockRadixSort<int, 128, 4>;
1001
+ //!
1002
+ //! // Allocate shared memory for BlockRadixSort
1003
+ //! __shared__ typename BlockRadixSort::TempStorage temp_storage;
1004
+ //!
1005
+ //! // Obtain a segment of consecutive items that are blocked across threads
1006
+ //! int thread_keys[4];
1007
+ //! ...
1008
+ //!
1009
+ //! // Collectively sort the keys
1010
+ //! BlockRadixSort(temp_storage).SortDescending(thread_keys);
1011
+ //!
1012
+ //! Suppose the set of input ``thread_keys`` across the block of threads is
1013
+ //! ``{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }``.
1014
+ //! The corresponding output ``thread_keys`` in those threads will be
1015
+ //! ``{ [511,510,509,508], [507,506,505,504], [503,502,501,500], ..., [3,2,1,0] }``.
1016
+ //!
1017
+ //! @endrst
1018
+ //!
1019
+ //! @param[in,out] keys
1020
+ //! Keys to sort
1021
+ //!
1022
+ //! @param[in] begin_bit
1023
+ //! **[optional]** The beginning (least-significant) bit index needed for key comparison
1024
+ //!
1025
+ //! @param[in] end_bit
1026
+ //! **[optional]** The past-the-end (most-significant) bit index needed for key comparison
1027
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
1028
+ SortDescending(KeyT (&keys)[ITEMS_PER_THREAD], int begin_bit = 0, int end_bit = sizeof(KeyT) * 8)
1029
+ {
1030
+ NullType values[ITEMS_PER_THREAD]; // placeholder value array; it binds to SortBlocked's ValueT parameter only when ValueT is NullType
1031
+
1032
+ SortBlocked(keys, values, begin_bit, end_bit, ::cuda::std::true_type(), detail::bool_constant_v<KEYS_ONLY>); // true_type tag = descending order
1033
+ }
1034
+
1035
+ //! @rst
1036
+ //! Performs a descending block-wide radix sort over a
1037
+ //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys.
1038
+ //!
1039
+ //! * @granularity
1040
+ //! * @smemreuse
1041
+ //!
1042
+ //! Snippet
1043
+ //! ==========================================================================
1044
+ //!
1045
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
1046
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
1047
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
1048
+ //! tuple of references to relevant members of the key.
1049
+ //!
1050
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
1051
+ //! :language: c++
1052
+ //! :dedent:
1053
+ //! :start-after: example-begin custom-type
1054
+ //! :end-before: example-end custom-type
1055
+ //!
1056
+ //! The code snippet below illustrates a sort of 2 keys that
1057
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 2 threads
1058
+ //! where each thread owns 1 key.
1059
+ //!
1060
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
1061
+ //! :language: c++
1062
+ //! :dedent:
1063
+ //! :start-after: example-begin keys-descending-bits
1064
+ //! :end-before: example-end keys-descending-bits
1065
+ //!
1066
+ //! @endrst
1067
+ //!
1068
+ //! @tparam DecomposerT
1069
+ //! **[inferred]** Type of a callable object responsible for decomposing a
1070
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
1071
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
1072
+ //! The leftmost element of the tuple is considered the most significant.
1073
+ //! The call operator must not modify members of the key.
1074
+ //!
1075
+ //! @param[in,out] keys
1076
+ //! Keys to sort
1077
+ //!
1078
+ //! @param decomposer
1079
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
1080
+ //! references to its constituent arithmetic types. The leftmost element of
1081
+ //! the tuple is considered the most significant. The call operator must not
1082
+ //! modify members of the key.
1083
+ //!
1084
+ //! @param[in] begin_bit
1085
+ //! The least-significant bit index (inclusive) needed for
1086
+ //! key comparison
1087
+ //!
1088
+ //! @param[in] end_bit
1089
+ //! The most-significant bit index (exclusive) needed for key
1090
+ //! comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`)
1091
+ template <class DecomposerT>
1092
+ _CCCL_DEVICE _CCCL_FORCEINLINE //
1093
+ ::cuda::std::enable_if_t< //
1094
+ !::cuda::std::is_convertible_v<DecomposerT, int>> // enabled only when DecomposerT is not convertible to int; return type is void. NOTE(review): presumably keeps an int argument from selecting this overload over the begin_bit/end_bit one - confirm
1095
+ SortDescending(KeyT (&keys)[ITEMS_PER_THREAD], DecomposerT decomposer, int begin_bit, int end_bit)
1096
+ { // Keys-only descending overload with a decomposer and an explicit bit range.
1097
+ NullType values[ITEMS_PER_THREAD]; // stand-in for the values argument; this overload takes keys only
1098
+
1099
+ SortBlocked(
1100
+ keys, values, begin_bit, end_bit, ::cuda::std::true_type(), detail::bool_constant_v<KEYS_ONLY>, decomposer); // same call as the non-decomposer keys-only overload, with the decomposer appended as the last argument
1101
+ }
1102
+
1103
+ //! @rst
1104
+ //! Performs a descending block-wide radix sort over a
1105
+ //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys.
1106
+ //!
1107
+ //! * @granularity
1108
+ //! * @smemreuse
1109
+ //!
1110
+ //! Snippet
1111
+ //! ==========================================================================
1112
+ //!
1113
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
1114
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
1115
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
1116
+ //! tuple of references to relevant members of the key.
1117
+ //!
1118
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
1119
+ //! :language: c++
1120
+ //! :dedent:
1121
+ //! :start-after: example-begin custom-type
1122
+ //! :end-before: example-end custom-type
1123
+ //!
1124
+ //! The code snippet below illustrates a sort of 6 keys that
1125
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 2 threads
1126
+ //! where each thread owns 3 consecutive keys.
1127
+ //!
1128
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
1129
+ //! :language: c++
1130
+ //! :dedent:
1131
+ //! :start-after: example-begin keys-descending
1132
+ //! :end-before: example-end keys-descending
1133
+ //!
1134
+ //! @endrst
1135
+ //!
1136
+ //! @tparam DecomposerT
1137
+ //! **[inferred]** Type of a callable object responsible for decomposing a
1138
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
1139
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
1140
+ //! The leftmost element of the tuple is considered the most significant.
1141
+ //! The call operator must not modify members of the key.
1142
+ //!
1143
+ //! @param[in,out] keys
1144
+ //! Keys to sort
1145
+ //!
1146
+ //! @param decomposer
1147
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
1148
+ //! references to its constituent arithmetic types. The leftmost element of
1149
+ //! the tuple is considered the most significant. The call operator must not
1150
+ //! modify members of the key.
1151
+ template <class DecomposerT>
1152
+ _CCCL_DEVICE _CCCL_FORCEINLINE //
1153
+ ::cuda::std::enable_if_t< //
1154
+ !::cuda::std::is_convertible_v<DecomposerT, int>> // enabled only when DecomposerT is not convertible to int; return type is void
1155
+ SortDescending(KeyT (&keys)[ITEMS_PER_THREAD], DecomposerT decomposer)
1156
+ { // Keys-only descending overload with a decomposer and no bit range: the range is supplied here.
1157
+ NullType values[ITEMS_PER_THREAD]; // stand-in for the values argument; this overload takes keys only
1158
+
1159
+ SortBlocked(
1160
+ keys,
1161
+ values,
1162
+ 0, // begin bit is fixed at 0 because this overload has no begin_bit parameter
1163
+ detail::radix::traits_t<KeyT>::default_end_bit(decomposer), // end bit is obtained from traits_t<KeyT> for this decomposer
1164
+ ::cuda::std::true_type(), // presumably the descending flag (the ascending SortBlockedToStriped overloads pass false_type here)
1165
+ detail::bool_constant_v<KEYS_ONLY>, // KEYS_ONLY tag forwarded as-is; KEYS_ONLY is defined outside this view
1166
+ decomposer);
1167
+ }
1168
+
1169
+ //! @rst
1170
+ //! Performs a descending block-wide radix sort across a :ref:`blocked arrangement <flexible-data-arrangement>`
1171
+ //! of keys and values.
1172
+ //!
1173
+ //! - BlockRadixSort can only accommodate one associated tile of values. To "truck along"
1174
+ //! more than one tile of values, simply perform a key-value sort of the keys paired
1175
+ //! with a temporary value array that enumerates the key indices. The reordered indices
1176
+ //! can then be used as a gather-vector for exchanging other associated tile data through
1177
+ //! shared memory.
1178
+ //! - @granularity
1179
+ //! - @smemreuse
1180
+ //!
1181
+ //! Snippet
1182
+ //! +++++++
1183
+ //!
1184
+ //! The code snippet below illustrates a sort of 512 integer keys and values that
1185
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128 threads
1186
+ //! where each thread owns 4 consecutive pairs.
1187
+ //!
1188
+ //! .. code-block:: c++
1189
+ //!
1190
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_radix_sort.cuh>
1191
+ //!
1192
+ //! __global__ void ExampleKernel(...)
1193
+ //! {
1194
+ //! // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and values each
1195
+ //! using BlockRadixSort = cub::BlockRadixSort<int, 128, 4, int>;
1196
+ //!
1197
+ //! // Allocate shared memory for BlockRadixSort
1198
+ //! __shared__ typename BlockRadixSort::TempStorage temp_storage;
1199
+ //!
1200
+ //! // Obtain a segment of consecutive items that are blocked across threads
1201
+ //! int thread_keys[4];
1202
+ //! int thread_values[4];
1203
+ //! ...
1204
+ //!
1205
+ //! // Collectively sort the keys and values among block threads
1206
+ //! BlockRadixSort(temp_storage).SortDescending(thread_keys, thread_values);
1207
+ //!
1208
+ //! Suppose the set of input ``thread_keys`` across the block of threads is
1209
+ //! ``{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }``. The
1210
+ //! corresponding output ``thread_keys`` in those threads will be
1211
+ //! ``{ [511,510,509,508], [507,506,505,504], [503,502,501,500], ..., [3,2,1,0] }``.
1212
+ //!
1213
+ //! @endrst
1214
+ //!
1215
+ //! @param[in,out] keys
1216
+ //! Keys to sort
1217
+ //!
1218
+ //! @param[in,out] values
1219
+ //! Values to sort
1220
+ //!
1221
+ //! @param[in] begin_bit
1222
+ //! **[optional]** The beginning (least-significant) bit index needed for key comparison
1223
+ //!
1224
+ //! @param[in] end_bit
1225
+ //! **[optional]** The past-the-end (most-significant) bit index needed for key comparison
1226
+ _CCCL_DEVICE _CCCL_FORCEINLINE void SortDescending(
1227
+ KeyT (&keys)[ITEMS_PER_THREAD],
1228
+ ValueT (&values)[ITEMS_PER_THREAD],
1229
+ int begin_bit = 0,
1230
+ int end_bit = sizeof(KeyT) * 8) // default end bit is the full bit width of KeyT
1231
+ { // Key-value descending overload: forwards the caller's keys and values to SortBlocked.
1232
+ SortBlocked(keys, values, begin_bit, end_bit, ::cuda::std::true_type(), detail::bool_constant_v<KEYS_ONLY>); // true_type sits where the ascending SortBlockedToStriped overloads pass false_type; presumably the descending flag (SortBlocked is defined outside this view)
1233
+ }
1234
+
1235
+ //! @rst
1236
+ //! Performs a descending block-wide radix sort over a
1237
+ //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys and values.
1238
+ //!
1239
+ //! * BlockRadixSort can only accommodate one associated tile of values. To "truck along"
1240
+ //! more than one tile of values, simply perform a key-value sort of the keys paired
1241
+ //! with a temporary value array that enumerates the key indices. The reordered indices
1242
+ //! can then be used as a gather-vector for exchanging other associated tile data through
1243
+ //! shared memory.
1244
+ //! * @granularity
1245
+ //! * @smemreuse
1246
+ //!
1247
+ //! Snippet
1248
+ //! ==========================================================================
1249
+ //!
1250
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
1251
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
1252
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
1253
+ //! tuple of references to relevant members of the key.
1254
+ //!
1255
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
1256
+ //! :language: c++
1257
+ //! :dedent:
1258
+ //! :start-after: example-begin custom-type
1259
+ //! :end-before: example-end custom-type
1260
+ //!
1261
+ //! The code snippet below illustrates a sort of 2 pairs that
1262
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 2 threads
1263
+ //! where each thread owns 1 pair.
1264
+ //!
1265
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
1266
+ //! :language: c++
1267
+ //! :dedent:
1268
+ //! :start-after: example-begin pairs-descending-bits
1269
+ //! :end-before: example-end pairs-descending-bits
1270
+ //!
1271
+ //! @endrst
1272
+ //!
1273
+ //! @tparam DecomposerT
1274
+ //! **[inferred]** Type of a callable object responsible for decomposing a
1275
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
1276
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
1277
+ //! The leftmost element of the tuple is considered the most significant.
1278
+ //! The call operator must not modify members of the key.
1279
+ //!
1280
+ //! @param[in,out] keys
1281
+ //! Keys to sort
1282
+ //!
1283
+ //! @param[in,out] values
1284
+ //! Values to sort
1285
+ //!
1286
+ //! @param decomposer
1287
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
1288
+ //! references to its constituent arithmetic types. The leftmost element of
1289
+ //! the tuple is considered the most significant. The call operator must not
1290
+ //! modify members of the key.
1291
+ //!
1292
+ //! @param[in] begin_bit
1293
+ //! The least-significant bit index (inclusive) needed for
1294
+ //! key comparison
1295
+ //!
1296
+ //! @param[in] end_bit
1297
+ //! The most-significant bit index (exclusive) needed for key
1298
+ //! comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`)
1299
+ template <class DecomposerT>
1300
+ _CCCL_DEVICE _CCCL_FORCEINLINE //
1301
+ ::cuda::std::enable_if_t< //
1302
+ !::cuda::std::is_convertible_v<DecomposerT, int>> // enabled only when DecomposerT is not convertible to int; return type is void. NOTE(review): presumably keeps an int third argument from selecting this overload - confirm
1303
+ SortDescending(KeyT (&keys)[ITEMS_PER_THREAD],
1304
+ ValueT (&values)[ITEMS_PER_THREAD],
1305
+ DecomposerT decomposer,
1306
+ int begin_bit,
1307
+ int end_bit)
1308
+ { // Key-value descending overload with a decomposer and an explicit bit range.
1309
+ SortBlocked(
1310
+ keys, values, begin_bit, end_bit, ::cuda::std::true_type(), detail::bool_constant_v<KEYS_ONLY>, decomposer); // caller's arguments forwarded unchanged; true_type is presumably the descending flag
1311
+ }
1312
+
1313
+ //! @rst
1314
+ //! Performs a descending block-wide radix sort over a
1315
+ //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys and values.
1316
+ //!
1317
+ //! * BlockRadixSort can only accommodate one associated tile of values. To "truck along"
1318
+ //! more than one tile of values, simply perform a key-value sort of the keys paired
1319
+ //! with a temporary value array that enumerates the key indices. The reordered indices
1320
+ //! can then be used as a gather-vector for exchanging other associated tile data through
1321
+ //! shared memory.
1322
+ //! * @granularity
1323
+ //! * @smemreuse
1324
+ //!
1325
+ //! Snippet
1326
+ //! ==========================================================================
1327
+ //!
1328
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
1329
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
1330
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
1331
+ //! tuple of references to relevant members of the key.
1332
+ //!
1333
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
1334
+ //! :language: c++
1335
+ //! :dedent:
1336
+ //! :start-after: example-begin custom-type
1337
+ //! :end-before: example-end custom-type
1338
+ //!
1339
+ //! The code snippet below illustrates a sort of 6 keys and values that
1340
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 2 threads
1341
+ //! where each thread owns 3 consecutive pairs.
1342
+ //!
1343
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
1344
+ //! :language: c++
1345
+ //! :dedent:
1346
+ //! :start-after: example-begin pairs-descending
1347
+ //! :end-before: example-end pairs-descending
1348
+ //!
1349
+ //! @endrst
1350
+ //!
1351
+ //! @tparam DecomposerT
1352
+ //! **[inferred]** Type of a callable object responsible for decomposing a
1353
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
1354
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
1355
+ //! The leftmost element of the tuple is considered the most significant.
1356
+ //! The call operator must not modify members of the key.
1357
+ //!
1358
+ //! @param[in,out] keys
1359
+ //! Keys to sort
1360
+ //!
1361
+ //! @param[in,out] values
1362
+ //! Values to sort
1363
+ //!
1364
+ //! @param decomposer
1365
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
1366
+ //! references to its constituent arithmetic types. The leftmost element of
1367
+ //! the tuple is considered the most significant. The call operator must not
1368
+ //! modify members of the key.
1369
+ template <class DecomposerT>
1370
+ _CCCL_DEVICE _CCCL_FORCEINLINE //
1371
+ ::cuda::std::enable_if_t< //
1372
+ !::cuda::std::is_convertible_v<DecomposerT, int>> // enabled only when DecomposerT is not convertible to int; return type is void
1373
+ SortDescending(KeyT (&keys)[ITEMS_PER_THREAD], ValueT (&values)[ITEMS_PER_THREAD], DecomposerT decomposer)
1374
+ { // Key-value descending overload with a decomposer and no bit range: the range is supplied here.
1375
+ SortBlocked(
1376
+ keys,
1377
+ values,
1378
+ 0, // begin bit is fixed at 0 because this overload has no begin_bit parameter
1379
+ detail::radix::traits_t<KeyT>::default_end_bit(decomposer), // end bit is obtained from traits_t<KeyT> for this decomposer
1380
+ ::cuda::std::true_type(), // presumably the descending flag (the ascending SortBlockedToStriped overloads pass false_type here)
1381
+ detail::bool_constant_v<KEYS_ONLY>, // KEYS_ONLY tag forwarded as-is; KEYS_ONLY is defined outside this view
1382
+ decomposer);
1383
+ }
1384
+
1385
+ //! @} end member group
1386
+ //! @name Sorting (blocked arrangement -> striped arrangement)
1387
+ //! @{
1388
+
1389
+ //! @rst
1390
+ //! Performs an ascending radix sort across a :ref:`blocked arrangement <flexible-data-arrangement>` of keys,
1391
+ //! leaving them in a :ref:`striped arrangement <flexible-data-arrangement>`.
1392
+ //!
1393
+ //! - @granularity
1394
+ //! - @smemreuse
1395
+ //!
1396
+ //! Snippet
1397
+ //! +++++++
1398
+ //!
1399
+ //! The code snippet below illustrates a sort of 512 integer keys that
1400
+ //! are initially partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128
1401
+ //! threads where each thread owns 4 consecutive keys. The final partitioning is striped.
1402
+ //!
1403
+ //! .. code-block:: c++
1404
+ //!
1405
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_radix_sort.cuh>
1406
+ //!
1407
+ //! __global__ void ExampleKernel(...)
1408
+ //! {
1409
+ //! // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys each
1410
+ //! using BlockRadixSort = cub::BlockRadixSort<int, 128, 4>;
1411
+ //!
1412
+ //! // Allocate shared memory for BlockRadixSort
1413
+ //! __shared__ typename BlockRadixSort::TempStorage temp_storage;
1414
+ //!
1415
+ //! // Obtain a segment of consecutive items that are blocked across threads
1416
+ //! int thread_keys[4];
1417
+ //! ...
1418
+ //!
1419
+ //! // Collectively sort the keys
1420
+ //! BlockRadixSort(temp_storage).SortBlockedToStriped(thread_keys);
1421
+ //!
1422
+ //! Suppose the set of input ``thread_keys`` across the block of threads is
1423
+ //! ``{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }``.
1424
+ //! The corresponding output ``thread_keys`` in those threads will be
1425
+ //! ``{ [0,128,256,384], [1,129,257,385], [2,130,258,386], ..., [127,255,383,511] }``.
1426
+ //!
1427
+ //! @endrst
1428
+ //!
1429
+ //! @param[in,out] keys
1430
+ //! Keys to sort
1431
+ //!
1432
+ //! @param[in] begin_bit
1433
+ //! **[optional]** The beginning (least-significant) bit index needed for key comparison
1434
+ //!
1435
+ //! @param[in] end_bit
1436
+ //! **[optional]** The past-the-end (most-significant) bit index needed for key comparison
1437
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
1438
+ SortBlockedToStriped(KeyT (&keys)[ITEMS_PER_THREAD], int begin_bit = 0, int end_bit = sizeof(KeyT) * 8)
1439
+ { // Keys-only ascending overload: forwards to the tag-taking SortBlockedToStriped overload (defined outside this view).
1440
+ NullType values[ITEMS_PER_THREAD]; // stand-in for the values argument; this overload takes keys only
1441
+
1442
+ SortBlockedToStriped(
1443
+ keys, values, begin_bit, end_bit, ::cuda::std::false_type(), detail::bool_constant_v<KEYS_ONLY>); // false_type sits where the SortDescending overloads pass true_type; presumably selects ascending order
1444
+ }
1445
+
1446
+ //! @rst
1447
+ //! Performs an ascending block-wide radix sort over a
1448
+ //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys, leaving them in a
1449
+ //! :ref:`striped arrangement <flexible-data-arrangement>`.
1450
+ //!
1451
+ //! * @granularity
1452
+ //! * @smemreuse
1453
+ //!
1454
+ //! Snippet
1455
+ //! ==========================================================================
1456
+ //!
1457
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
1458
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
1459
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
1460
+ //! tuple of references to relevant members of the key.
1461
+ //!
1462
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
1463
+ //! :language: c++
1464
+ //! :dedent:
1465
+ //! :start-after: example-begin custom-type
1466
+ //! :end-before: example-end custom-type
1467
+ //!
1468
+ //! The code snippet below illustrates a sort of 4 keys that
1469
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 2 threads
1470
+ //! where each thread owns 2 consecutive keys. The final partitioning is striped.
1471
+ //!
1472
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
1473
+ //! :language: c++
1474
+ //! :dedent:
1475
+ //! :start-after: example-begin keys-striped-bits
1476
+ //! :end-before: example-end keys-striped-bits
1477
+ //!
1478
+ //! @endrst
1479
+ //!
1480
+ //! @tparam DecomposerT
1481
+ //! **[inferred]** Type of a callable object responsible for decomposing a
1482
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
1483
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
1484
+ //! The leftmost element of the tuple is considered the most significant.
1485
+ //! The call operator must not modify members of the key.
1486
+ //!
1487
+ //! @param[in,out] keys
1488
+ //! Keys to sort
1489
+ //!
1490
+ //! @param decomposer
1491
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
1492
+ //! references to its constituent arithmetic types. The leftmost element of
1493
+ //! the tuple is considered the most significant. The call operator must not
1494
+ //! modify members of the key.
1495
+ //!
1496
+ //! @param[in] begin_bit
1497
+ //! The least-significant bit index (inclusive) needed for
1498
+ //! key comparison
1499
+ //!
1500
+ //! @param[in] end_bit
1501
+ //! The most-significant bit index (exclusive) needed for key
1502
+ //! comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`)
1503
+ template <class DecomposerT>
1504
+ _CCCL_DEVICE _CCCL_FORCEINLINE //
1505
+ ::cuda::std::enable_if_t< //
1506
+ !::cuda::std::is_convertible_v<DecomposerT, int>> // enabled only when DecomposerT is not convertible to int; return type is void. NOTE(review): presumably keeps an int argument from selecting this overload over the begin_bit/end_bit one - confirm
1507
+ SortBlockedToStriped(KeyT (&keys)[ITEMS_PER_THREAD], DecomposerT decomposer, int begin_bit, int end_bit)
1508
+ { // Keys-only ascending overload with a decomposer and an explicit bit range.
1509
+ NullType values[ITEMS_PER_THREAD]; // stand-in for the values argument; this overload takes keys only
1510
+
1511
+ SortBlockedToStriped(
1512
+ keys, values, begin_bit, end_bit, ::cuda::std::false_type(), detail::bool_constant_v<KEYS_ONLY>, decomposer); // calls the tag-taking overload (outside this view); false_type presumably selects ascending order
1513
+ }
1514
+
1515
+ //! @rst
1516
+ //! Performs an ascending block-wide radix sort over a
1517
+ //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys, leaving them in a
1518
+ //! :ref:`striped arrangement <flexible-data-arrangement>`.
1519
+ //!
1520
+ //! * @granularity
1521
+ //! * @smemreuse
1522
+ //!
1523
+ //! Snippet
1524
+ //! ==========================================================================
1525
+ //!
1526
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
1527
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
1528
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
1529
+ //! tuple of references to relevant members of the key.
1530
+ //!
1531
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
1532
+ //! :language: c++
1533
+ //! :dedent:
1534
+ //! :start-after: example-begin custom-type
1535
+ //! :end-before: example-end custom-type
1536
+ //!
1537
+ //! The code snippet below illustrates a sort of 6 keys that
1538
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 2 threads
1539
+ //! where each thread owns 3 consecutive keys. The final partitioning is striped.
1540
+ //!
1541
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
1542
+ //! :language: c++
1543
+ //! :dedent:
1544
+ //! :start-after: example-begin keys-striped
1545
+ //! :end-before: example-end keys-striped
1546
+ //!
1547
+ //! @endrst
1548
+ //!
1549
+ //! @tparam DecomposerT
1550
+ //! **[inferred]** Type of a callable object responsible for decomposing a
1551
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
1552
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
1553
+ //! The leftmost element of the tuple is considered the most significant.
1554
+ //! The call operator must not modify members of the key.
1555
+ //!
1556
+ //! @param[in,out] keys
1557
+ //! Keys to sort
1558
+ //!
1559
+ //! @param decomposer
1560
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
1561
+ //! references to its constituent arithmetic types. The leftmost element of
1562
+ //! the tuple is considered the most significant. The call operator must not
1563
+ //! modify members of the key.
1564
+ template <class DecomposerT>
1565
+ _CCCL_DEVICE _CCCL_FORCEINLINE //
1566
+ ::cuda::std::enable_if_t< //
1567
+ !::cuda::std::is_convertible_v<DecomposerT, int>> // enabled only when DecomposerT is not convertible to int; return type is void
1568
+ SortBlockedToStriped(KeyT (&keys)[ITEMS_PER_THREAD], DecomposerT decomposer)
1569
+ { // Keys-only ascending overload with a decomposer and no bit range: the range is supplied here.
1570
+ NullType values[ITEMS_PER_THREAD]; // stand-in for the values argument; this overload takes keys only
1571
+
1572
+ SortBlockedToStriped(
1573
+ keys,
1574
+ values,
1575
+ 0, // begin bit is fixed at 0 because this overload has no begin_bit parameter
1576
+ detail::radix::traits_t<KeyT>::default_end_bit(decomposer), // end bit is obtained from traits_t<KeyT> for this decomposer
1577
+ ::cuda::std::false_type(), // presumably selects ascending order (the SortDescending overloads pass true_type here)
1578
+ detail::bool_constant_v<KEYS_ONLY>, // KEYS_ONLY tag forwarded as-is; KEYS_ONLY is defined outside this view
1579
+ decomposer);
1580
+ }
1581
+
1582
+ //! @rst
1583
+ //! Performs an ascending radix sort across a :ref:`blocked arrangement <flexible-data-arrangement>` of keys and
1584
+ //! values, leaving them in a :ref:`striped arrangement <flexible-data-arrangement>`.
1585
+ //!
1586
+ //! - BlockRadixSort can only accommodate one associated tile of values. To "truck along"
1587
+ //! more than one tile of values, simply perform a key-value sort of the keys paired
1588
+ //! with a temporary value array that enumerates the key indices. The reordered indices
1589
+ //! can then be used as a gather-vector for exchanging other associated tile data through
1590
+ //! shared memory.
1591
+ //! - @granularity
1592
+ //! - @smemreuse
1593
+ //!
1594
+ //! Snippet
1595
+ //! +++++++
1596
+ //!
1597
+ //! The code snippet below illustrates a sort of 512 integer keys and values that
1598
+ //! are initially partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128
1599
+ //! threads where each thread owns 4 consecutive pairs. The final partitioning is striped.
1600
+ //!
1601
+ //! .. code-block:: c++
1602
+ //!
1603
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_radix_sort.cuh>
1604
+ //!
1605
+ //! __global__ void ExampleKernel(...)
1606
+ //! {
1607
+ //! // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and values each
1608
+ //! using BlockRadixSort = cub::BlockRadixSort<int, 128, 4, int>;
1609
+ //!
1610
+ //! // Allocate shared memory for BlockRadixSort
1611
+ //! __shared__ typename BlockRadixSort::TempStorage temp_storage;
1612
+ //!
1613
+ //! // Obtain a segment of consecutive items that are blocked across threads
1614
+ //! int thread_keys[4];
1615
+ //! int thread_values[4];
1616
+ //! ...
1617
+ //!
1618
+ //! // Collectively sort the keys and values among block threads
1619
+ //! BlockRadixSort(temp_storage).SortBlockedToStriped(thread_keys, thread_values);
1620
+ //!
1621
+ //! Suppose the set of input ``thread_keys`` across the block of threads is
1622
+ //! ``{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }``.
1623
+ //! The corresponding output ``thread_keys`` in those threads will be
1624
+ //! ``{ [0,128,256,384], [1,129,257,385], [2,130,258,386], ..., [127,255,383,511] }``.
1625
+ //!
1626
+ //! @endrst
1627
+ //!
1628
+ //! @param[in,out] keys
1629
+ //! Keys to sort
1630
+ //!
1631
+ //! @param[in,out] values
1632
+ //! Values to sort
1633
+ //!
1634
+ //! @param[in] begin_bit
1635
+ //! **[optional]** The beginning (least-significant) bit index needed for key comparison
1636
+ //!
1637
+ //! @param[in] end_bit
1638
+ //! **[optional]** The past-the-end (most-significant) bit index needed for key comparison
1639
+ _CCCL_DEVICE _CCCL_FORCEINLINE void SortBlockedToStriped(
1640
+ KeyT (&keys)[ITEMS_PER_THREAD],
1641
+ ValueT (&values)[ITEMS_PER_THREAD],
1642
+ int begin_bit = 0,
1643
+ int end_bit = sizeof(KeyT) * 8) // default end bit is the full bit width of KeyT
1644
+ { // Key-value ascending overload: forwards the caller's keys and values to the tag-taking overload (outside this view).
1645
+ SortBlockedToStriped(
1646
+ keys, values, begin_bit, end_bit, ::cuda::std::false_type(), detail::bool_constant_v<KEYS_ONLY>); // false_type sits where the SortDescending overloads pass true_type; presumably selects ascending order
1647
+ }
1648
+
1649
+ //! @rst
1650
+ //! Performs an ascending block-wide radix sort over a
1651
+ //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys and values, leaving them in a
1652
+ //! :ref:`striped arrangement <flexible-data-arrangement>`.
1653
+ //!
1654
+ //! * @granularity
1655
+ //! * @smemreuse
1656
+ //!
1657
+ //! Snippet
1658
+ //! ==========================================================================
1659
+ //!
1660
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
1661
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
1662
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
1663
+ //! tuple of references to relevant members of the key.
1664
+ //!
1665
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
1666
+ //! :language: c++
1667
+ //! :dedent:
1668
+ //! :start-after: example-begin custom-type
1669
+ //! :end-before: example-end custom-type
1670
+ //!
1671
+ //! The code snippet below illustrates a sort of 4 pairs that
1672
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 2 threads
1673
+ //! where each thread owns 2 consecutive pairs. The final partitioning is striped.
1674
+ //!
1675
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
1676
+ //! :language: c++
1677
+ //! :dedent:
1678
+ //! :start-after: example-begin pairs-striped-bits
1679
+ //! :end-before: example-end pairs-striped-bits
1680
+ //!
1681
+ //! @endrst
1682
+ //!
1683
+ //! @tparam DecomposerT
1684
+ //! **[inferred]** Type of a callable object responsible for decomposing a
1685
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
1686
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
1687
+ //! The leftmost element of the tuple is considered the most significant.
1688
+ //! The call operator must not modify members of the key.
1689
+ //!
1690
+ //! @param[in,out] keys
1691
+ //! Keys to sort
1692
+ //!
1693
+ //! @param[in,out] values
1694
+ //! Values to sort
1695
+ //!
1696
+ //! @param decomposer
1697
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
1698
+ //! references to its constituent arithmetic types. The leftmost element of
1699
+ //! the tuple is considered the most significant. The call operator must not
1700
+ //! modify members of the key.
1701
+ //!
1702
+ //! @param[in] begin_bit
1703
+ //! The least-significant bit index (inclusive) needed for
1704
+ //! key comparison
1705
+ //!
1706
+ //! @param[in] end_bit
1707
+ //! The most-significant bit index (exclusive) needed for key
1708
+ //! comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`)
1709
+ template <class DecomposerT>
1710
+ _CCCL_DEVICE _CCCL_FORCEINLINE //
1711
+ ::cuda::std::enable_if_t< //
1712
+ !::cuda::std::is_convertible_v<DecomposerT, int>> // enabled only when DecomposerT is not convertible to int; return type is void. NOTE(review): presumably keeps an int third argument from selecting this overload - confirm
1713
+ SortBlockedToStriped(KeyT (&keys)[ITEMS_PER_THREAD],
1714
+ ValueT (&values)[ITEMS_PER_THREAD],
1715
+ DecomposerT decomposer,
1716
+ int begin_bit,
1717
+ int end_bit)
1718
+ { // Key-value ascending overload with a decomposer and an explicit bit range.
1719
+ SortBlockedToStriped(
1720
+ keys, values, begin_bit, end_bit, ::cuda::std::false_type(), detail::bool_constant_v<KEYS_ONLY>, decomposer); // caller's arguments forwarded unchanged; false_type presumably selects ascending order
1721
+ }
1722
+
1723
+ //! @rst
1724
+ //! Performs an ascending block-wide radix sort over a
1725
+ //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys and values, leaving them in a
1726
+ //! :ref:`striped arrangement <flexible-data-arrangement>`.
1727
+ //!
1728
+ //! * @granularity
1729
+ //! * @smemreuse
1730
+ //!
1731
+ //! Snippet
1732
+ //! ==========================================================================
1733
+ //!
1734
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
1735
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
1736
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
1737
+ //! tuple of references to relevant members of the key.
1738
+ //!
1739
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
1740
+ //! :language: c++
1741
+ //! :dedent:
1742
+ //! :start-after: example-begin custom-type
1743
+ //! :end-before: example-end custom-type
1744
+ //!
1745
+ //! The code snippet below illustrates a sort of 6 pairs that
1746
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 2 threads
1747
+ //! where each thread owns 3 consecutive pairs. The final partitioning is striped.
1748
+ //!
1749
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
1750
+ //! :language: c++
1751
+ //! :dedent:
1752
+ //! :start-after: example-begin pairs-striped
1753
+ //! :end-before: example-end pairs-striped
1754
+ //!
1755
+ //! @endrst
1756
+ //!
1757
+ //! @tparam DecomposerT
1758
+ //! **[inferred]** Type of a callable object responsible for decomposing a
1759
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
1760
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
1761
+ //! The leftmost element of the tuple is considered the most significant.
1762
+ //! The call operator must not modify members of the key.
1763
+ //!
1764
+ //! @param[in,out] keys
1765
+ //! Keys to sort
1766
+ //!
1767
+ //! @param[in,out] values
1768
+ //! Values to sort
1769
+ //!
1770
+ //! @param decomposer
1771
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
1772
+ //! references to its constituent arithmetic types. The leftmost element of
1773
+ //! the tuple is considered the most significant. The call operator must not
1774
+ //! modify members of the key.
1775
+ template <class DecomposerT> // Ascending key-value overload that takes only a decomposer and supplies the bit range itself.
1776
+ _CCCL_DEVICE _CCCL_FORCEINLINE //
1777
+ ::cuda::std::enable_if_t< // No second type argument, so the return type is void whenever the condition below holds.
1778
+ !::cuda::std::is_convertible_v<DecomposerT, int>> // Removes this template from overload resolution for arguments convertible to int (presumably so a begin_bit argument reaches the begin_bit/end_bit overload - confirm).
1779
+ SortBlockedToStriped(KeyT (&keys)[ITEMS_PER_THREAD], ValueT (&values)[ITEMS_PER_THREAD], DecomposerT decomposer)
1780
+ {
1781
+ SortBlockedToStriped( // Forward to the overload that takes an explicit bit range plus two tag arguments.
1782
+ keys,
1783
+ values,
1784
+ 0, // begin_bit position: start at bit 0.
1785
+ detail::radix::traits_t<KeyT>::default_end_bit(decomposer), // end_bit position: obtained from the key traits for this decomposer.
1786
+ ::cuda::std::false_type(), // Ascending order (the descending overloads pass true_type in this position).
1787
+ detail::bool_constant_v<KEYS_ONLY>, // Compile-time tag built from the class-level KEYS_ONLY constant (declared outside this chunk).
1788
+ decomposer);
1789
+ }
1790
+
1791
+ //! @rst
1792
+ //! Performs a descending radix sort across a :ref:`blocked arrangement <flexible-data-arrangement>`
1793
+ //! of keys, leaving them in a :ref:`striped arrangement <flexible-data-arrangement>`.
1794
+ //!
1795
+ //! - @granularity
1796
+ //! - @smemreuse
1797
+ //!
1798
+ //! Snippet
1799
+ //! +++++++
1800
+ //!
1801
+ //! The code snippet below illustrates a sort of 512 integer keys that
1802
+ //! are initially partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128
1803
+ //! threads where each thread owns 4 consecutive keys. The final partitioning is striped.
1804
+ //!
1805
+ //! .. code-block:: c++
1806
+ //!
1807
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_radix_sort.cuh>
1808
+ //!
1809
+ //! __global__ void ExampleKernel(...)
1810
+ //! {
1811
+ //! // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys each
1812
+ //! using BlockRadixSort = cub::BlockRadixSort<int, 128, 4>;
1813
+ //!
1814
+ //! // Allocate shared memory for BlockRadixSort
1815
+ //! __shared__ typename BlockRadixSort::TempStorage temp_storage;
1816
+ //!
1817
+ //! // Obtain a segment of consecutive items that are blocked across threads
1818
+ //! int thread_keys[4];
1819
+ //! ...
1820
+ //!
1821
+ //! // Collectively sort the keys
1822
+ //! BlockRadixSort(temp_storage).SortDescendingBlockedToStriped(thread_keys);
1823
+ //!
1824
+ //! Suppose the set of input ``thread_keys`` across the block of threads is
1825
+ //! ``{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }``.
1826
+ //! The corresponding output ``thread_keys`` in those threads will be
1827
+ //! ``{ [511,383,255,127], [510,382,254,126], [509,381,253,125], ..., [384,256,128,0] }``.
1828
+ //!
1829
+ //! @endrst
1830
+ //!
1831
+ //! @param[in,out] keys
1832
+ //! Keys to sort
1833
+ //!
1834
+ //! @param[in] begin_bit
1835
+ //! **[optional]** The beginning (least-significant) bit index needed for key comparison
1836
+ //!
1837
+ //! @param[in] end_bit
1838
+ //! **[optional]** The past-the-end (most-significant) bit index needed for key comparison
1839
+ _CCCL_DEVICE _CCCL_FORCEINLINE void // Keys-only descending overload; forwards to SortBlockedToStriped with the descending tag.
1840
+ SortDescendingBlockedToStriped(KeyT (&keys)[ITEMS_PER_THREAD], int begin_bit = 0, int end_bit = sizeof(KeyT) * 8) // Defaults span bit 0 up to sizeof(KeyT) * 8.
1841
+ {
1842
+ NullType values[ITEMS_PER_THREAD]; // Placeholder values array so the key-value entry point can be reused; never handed back to the caller.
1843
+
1844
+ SortBlockedToStriped(keys, values, begin_bit, end_bit, ::cuda::std::true_type(), detail::bool_constant_v<KEYS_ONLY>); // true_type: descending order (the ascending overloads pass false_type here).
1845
+ }
1846
+
1847
+ //! @rst
1848
+ //! Performs a descending block-wide radix sort over a
1849
+ //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys, leaving them in a
1850
+ //! :ref:`striped arrangement <flexible-data-arrangement>`.
1851
+ //!
1852
+ //! * @granularity
1853
+ //! * @smemreuse
1854
+ //!
1855
+ //! Snippet
1856
+ //! ==========================================================================
1857
+ //!
1858
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
1859
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
1860
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
1861
+ //! tuple of references to relevant members of the key.
1862
+ //!
1863
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
1864
+ //! :language: c++
1865
+ //! :dedent:
1866
+ //! :start-after: example-begin custom-type
1867
+ //! :end-before: example-end custom-type
1868
+ //!
1869
+ //! The code snippet below illustrates a sort of 4 keys that
1870
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 2 threads
1871
+ //! where each thread owns 2 consecutive keys. The final partitioning is striped.
1872
+ //!
1873
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
1874
+ //! :language: c++
1875
+ //! :dedent:
1876
+ //! :start-after: example-begin keys-striped-descending-bits
1877
+ //! :end-before: example-end keys-striped-descending-bits
1878
+ //!
1879
+ //! @endrst
1880
+ //!
1881
+ //! @tparam DecomposerT
1882
+ //! **[inferred]** Type of a callable object responsible for decomposing a
1883
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
1884
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
1885
+ //! The leftmost element of the tuple is considered the most significant.
1886
+ //! The call operator must not modify members of the key.
1887
+ //!
1888
+ //! @param[in,out] keys
1889
+ //! Keys to sort
1890
+ //!
1891
+ //! @param decomposer
1892
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
1893
+ //! references to its constituent arithmetic types. The leftmost element of
1894
+ //! the tuple is considered the most significant. The call operator must not
1895
+ //! modify members of the key.
1896
+ //!
1897
+ //! @param[in] begin_bit
1898
+ //! The least-significant bit index (inclusive) needed for
1899
+ //! key comparison
1900
+ //!
1901
+ //! @param[in] end_bit
1902
+ //! The most-significant bit index (exclusive) needed for key
1903
+ //! comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`)
1904
+ template <class DecomposerT> // Keys-only descending overload for decomposed keys with a caller-supplied bit range.
1905
+ _CCCL_DEVICE _CCCL_FORCEINLINE //
1906
+ ::cuda::std::enable_if_t< // No second type argument, so the return type is void whenever the condition below holds.
1907
+ !::cuda::std::is_convertible_v<DecomposerT, int>> // Removes this template from overload resolution when the second argument is convertible to int, so such calls go to the (keys, begin_bit, end_bit) overload.
1908
+ SortDescendingBlockedToStriped(KeyT (&keys)[ITEMS_PER_THREAD], DecomposerT decomposer, int begin_bit, int end_bit)
1909
+ {
1910
+ NullType values[ITEMS_PER_THREAD]; // Placeholder values array so the key-value entry point can be reused; never handed back to the caller.
1911
+
1912
+ SortBlockedToStriped( // Forward the caller's bit range unchanged; true_type selects descending order (the ascending overloads pass false_type).
1913
+ keys, values, begin_bit, end_bit, ::cuda::std::true_type(), detail::bool_constant_v<KEYS_ONLY>, decomposer);
1914
+ }
1915
+
1916
+ //! @rst
1917
+ //! Performs a descending block-wide radix sort over a
1918
+ //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys, leaving them in a
1919
+ //! :ref:`striped arrangement <flexible-data-arrangement>`.
1920
+ //!
1921
+ //! * @granularity
1922
+ //! * @smemreuse
1923
+ //!
1924
+ //! Snippet
1925
+ //! ==========================================================================
1926
+ //!
1927
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
1928
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
1929
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
1930
+ //! tuple of references to relevant members of the key.
1931
+ //!
1932
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
1933
+ //! :language: c++
1934
+ //! :dedent:
1935
+ //! :start-after: example-begin custom-type
1936
+ //! :end-before: example-end custom-type
1937
+ //!
1938
+ //! The code snippet below illustrates a sort of 6 keys that
1939
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 2 threads
1940
+ //! where each thread owns 3 consecutive keys. The final partitioning is striped.
1941
+ //!
1942
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
1943
+ //! :language: c++
1944
+ //! :dedent:
1945
+ //! :start-after: example-begin keys-striped-descending
1946
+ //! :end-before: example-end keys-striped-descending
1947
+ //!
1948
+ //! @endrst
1949
+ //!
1950
+ //! @tparam DecomposerT
1951
+ //! **[inferred]** Type of a callable object responsible for decomposing a
1952
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
1953
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
1954
+ //! The leftmost element of the tuple is considered the most significant.
1955
+ //! The call operator must not modify members of the key.
1956
+ //!
1957
+ //! @param[in,out] keys
1958
+ //! Keys to sort
1959
+ //!
1960
+ //! @param decomposer
1961
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
1962
+ //! references to its constituent arithmetic types. The leftmost element of
1963
+ //! the tuple is considered the most significant. The call operator must not
1964
+ //! modify members of the key.
1965
+ template <class DecomposerT> // Keys-only descending overload that takes only a decomposer and supplies the bit range itself.
1966
+ _CCCL_DEVICE _CCCL_FORCEINLINE //
1967
+ ::cuda::std::enable_if_t< // No second type argument, so the return type is void whenever the condition below holds.
1968
+ !::cuda::std::is_convertible_v<DecomposerT, int>> // Removes this template from overload resolution when the second argument is convertible to int, so such calls go to the (keys, begin_bit, end_bit) overload.
1969
+ SortDescendingBlockedToStriped(KeyT (&keys)[ITEMS_PER_THREAD], DecomposerT decomposer)
1970
+ {
1971
+ NullType values[ITEMS_PER_THREAD]; // Placeholder values array so the key-value entry point can be reused; never handed back to the caller.
1972
+
1973
+ SortBlockedToStriped( // Forward to the overload that takes an explicit bit range plus two tag arguments.
1974
+ keys,
1975
+ values,
1976
+ 0, // begin_bit position: start at bit 0.
1977
+ detail::radix::traits_t<KeyT>::default_end_bit(decomposer), // end_bit position: obtained from the key traits for this decomposer.
1978
+ ::cuda::std::true_type(), // Descending order (the ascending overloads pass false_type in this position).
1979
+ detail::bool_constant_v<KEYS_ONLY>, // Compile-time tag built from the class-level KEYS_ONLY constant (declared outside this chunk).
1980
+ decomposer);
1981
+ }
1982
+
1983
+ //! @rst
1984
+ //! Performs a descending radix sort across a :ref:`blocked arrangement <flexible-data-arrangement>`
1985
+ //! of keys and values, leaving them in a :ref:`striped arrangement <flexible-data-arrangement>`.
1986
+ //!
1987
+ //! - BlockRadixSort can only accommodate one associated tile of values. To "truck along"
1988
+ //! more than one tile of values, simply perform a key-value sort of the keys paired
1989
+ //! with a temporary value array that enumerates the key indices. The reordered indices
1990
+ //! can then be used as a gather-vector for exchanging other associated tile data through
1991
+ //! shared memory.
1992
+ //! - @granularity
1993
+ //! - @smemreuse
1994
+ //!
1995
+ //! Snippet
1996
+ //! +++++++
1997
+ //!
1998
+ //! The code snippet below illustrates a sort of 512 integer keys and values that
1999
+ //! are initially partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128
2000
+ //! threads where each thread owns 4 consecutive pairs. The final partitioning is striped.
2001
+ //!
2002
+ //! .. code-block:: c++
2003
+ //!
2004
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_radix_sort.cuh>
2005
+ //!
2006
+ //! __global__ void ExampleKernel(...)
2007
+ //! {
2008
+ //! // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and values each
2009
+ //! using BlockRadixSort = cub::BlockRadixSort<int, 128, 4, int>;
2010
+ //!
2011
+ //! // Allocate shared memory for BlockRadixSort
2012
+ //! __shared__ typename BlockRadixSort::TempStorage temp_storage;
2013
+ //!
2014
+ //! // Obtain a segment of consecutive items that are blocked across threads
2015
+ //! int thread_keys[4];
2016
+ //! int thread_values[4];
2017
+ //! ...
2018
+ //!
2019
+ //! // Collectively sort the keys and values among block threads
2020
+ //! BlockRadixSort(temp_storage).SortDescendingBlockedToStriped(thread_keys, thread_values);
2021
+ //!
2022
+ //! Suppose the set of input ``thread_keys`` across the block of threads is
2023
+ //! ``{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }``.
2024
+ //! The corresponding output ``thread_keys`` in those threads will be
2025
+ //! ``{ [511,383,255,127], [510,382,254,126], [509,381,253,125], ..., [384,256,128,0] }``.
2026
+ //!
2027
+ //! @endrst
2028
+ //!
2029
+ //! @param[in,out] keys
2030
+ //! Keys to sort
2031
+ //!
2032
+ //! @param[in,out] values
2033
+ //! Values to sort
2034
+ //!
2035
+ //! @param[in] begin_bit
2036
+ //! **[optional]** The beginning (least-significant) bit index needed for key comparison
2037
+ //!
2038
+ //! @param[in] end_bit
2039
+ //! **[optional]** The past-the-end (most-significant) bit index needed for key comparison
2040
+ _CCCL_DEVICE _CCCL_FORCEINLINE void SortDescendingBlockedToStriped( // Key-value descending overload; forwards to SortBlockedToStriped with the descending tag.
2041
+ KeyT (&keys)[ITEMS_PER_THREAD],
2042
+ ValueT (&values)[ITEMS_PER_THREAD],
2043
+ int begin_bit = 0, // Default: start at bit 0.
2044
+ int end_bit = sizeof(KeyT) * 8) // Default: sizeof(KeyT) * 8.
2045
+ {
2046
+ SortBlockedToStriped(keys, values, begin_bit, end_bit, ::cuda::std::true_type(), detail::bool_constant_v<KEYS_ONLY>); // true_type: descending order (the ascending overloads pass false_type here).
2047
+ }
2048
+
2049
+ //! @rst
2050
+ //! Performs a descending block-wide radix sort over a
2051
+ //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys and values, leaving them in a
2052
+ //! :ref:`striped arrangement <flexible-data-arrangement>`.
2053
+ //!
2054
+ //! * @granularity
2055
+ //! * @smemreuse
2056
+ //!
2057
+ //! Snippet
2058
+ //! ==========================================================================
2059
+ //!
2060
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
2061
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
2062
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
2063
+ //! tuple of references to relevant members of the key.
2064
+ //!
2065
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
2066
+ //! :language: c++
2067
+ //! :dedent:
2068
+ //! :start-after: example-begin custom-type
2069
+ //! :end-before: example-end custom-type
2070
+ //!
2071
+ //! The code snippet below illustrates a sort of 4 keys and values that
2072
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 2 threads
2073
+ //! where each thread owns 2 consecutive pairs. The final partitioning is striped.
2074
+ //!
2075
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
2076
+ //! :language: c++
2077
+ //! :dedent:
2078
+ //! :start-after: example-begin pairs-striped-descending-bits
2079
+ //! :end-before: example-end pairs-striped-descending-bits
2080
+ //!
2081
+ //! @endrst
2082
+ //!
2083
+ //! @tparam DecomposerT
2084
+ //! **[inferred]** Type of a callable object responsible for decomposing a
2085
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
2086
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
2087
+ //! The leftmost element of the tuple is considered the most significant.
2088
+ //! The call operator must not modify members of the key.
2089
+ //!
2090
+ //! @param[in,out] keys
2091
+ //! Keys to sort
2092
+ //!
2093
+ //! @param[in,out] values
2094
+ //! Values to sort
2095
+ //!
2096
+ //! @param decomposer
2097
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
2098
+ //! references to its constituent arithmetic types. The leftmost element of
2099
+ //! the tuple is considered the most significant. The call operator must not
2100
+ //! modify members of the key.
2101
+ //!
2102
+ //! @param[in] begin_bit
2103
+ //! The least-significant bit index (inclusive) needed for
2104
+ //! key comparison
2105
+ //!
2106
+ //! @param[in] end_bit
2107
+ //! The most-significant bit index (exclusive) needed for key
2108
+ //! comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`)
2109
+ template <class DecomposerT> // Key-value descending overload for decomposed keys with a caller-supplied bit range.
2110
+ _CCCL_DEVICE _CCCL_FORCEINLINE //
2111
+ ::cuda::std::enable_if_t< // No second type argument, so the return type is void whenever the condition below holds.
2112
+ !::cuda::std::is_convertible_v<DecomposerT, int>> // Removes this template from overload resolution when the third argument is convertible to int, so such calls go to the (keys, values, begin_bit, end_bit) overload.
2113
+ SortDescendingBlockedToStriped(
2114
+ KeyT (&keys)[ITEMS_PER_THREAD],
2115
+ ValueT (&values)[ITEMS_PER_THREAD],
2116
+ DecomposerT decomposer,
2117
+ int begin_bit,
2118
+ int end_bit)
2119
+ {
2120
+ SortBlockedToStriped( // Forward the caller's bit range unchanged; true_type selects descending order (the ascending overloads pass false_type).
2121
+ keys, values, begin_bit, end_bit, ::cuda::std::true_type(), detail::bool_constant_v<KEYS_ONLY>, decomposer);
2122
+ }
2123
+
2124
+ //! @rst
2125
+ //! Performs a descending block-wide radix sort over a
2126
+ //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys and values, leaving them in a
2127
+ //! :ref:`striped arrangement <flexible-data-arrangement>`.
2128
+ //!
2129
+ //! * @granularity
2130
+ //! * @smemreuse
2131
+ //!
2132
+ //! Snippet
2133
+ //! ==========================================================================
2134
+ //!
2135
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
2136
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
2137
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
2138
+ //! tuple of references to relevant members of the key.
2139
+ //!
2140
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
2141
+ //! :language: c++
2142
+ //! :dedent:
2143
+ //! :start-after: example-begin custom-type
2144
+ //! :end-before: example-end custom-type
2145
+ //!
2146
+ //! The code snippet below illustrates a sort of 6 keys and values that
2147
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 2 threads
2148
+ //! where each thread owns 3 consecutive pairs. The final partitioning is striped.
2149
+ //!
2150
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
2151
+ //! :language: c++
2152
+ //! :dedent:
2153
+ //! :start-after: example-begin pairs-striped-descending
2154
+ //! :end-before: example-end pairs-striped-descending
2155
+ //!
2156
+ //! @endrst
2157
+ //!
2158
+ //! @tparam DecomposerT
2159
+ //! **[inferred]** Type of a callable object responsible for decomposing a
2160
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
2161
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
2162
+ //! The leftmost element of the tuple is considered the most significant.
2163
+ //! The call operator must not modify members of the key.
2164
+ //!
2165
+ //! @param[in,out] keys
2166
+ //! Keys to sort
2167
+ //!
2168
+ //! @param[in,out] values
2169
+ //! Values to sort
2170
+ //!
2171
+ //! @param decomposer
2172
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
2173
+ //! references to its constituent arithmetic types. The leftmost element of
2174
+ //! the tuple is considered the most significant. The call operator must not
2175
+ //! modify members of the key.
2176
+ template <class DecomposerT> // Key-value descending overload that takes only a decomposer and supplies the bit range itself.
2177
+ _CCCL_DEVICE _CCCL_FORCEINLINE //
2178
+ ::cuda::std::enable_if_t< // No second type argument, so the return type is void whenever the condition below holds.
2179
+ !::cuda::std::is_convertible_v<DecomposerT, int>> // Removes this template from overload resolution when the third argument is convertible to int, so such calls go to the (keys, values, begin_bit, end_bit) overload.
2180
+ SortDescendingBlockedToStriped(
2181
+ KeyT (&keys)[ITEMS_PER_THREAD], ValueT (&values)[ITEMS_PER_THREAD], DecomposerT decomposer)
2182
+ {
2183
+ SortBlockedToStriped( // Forward to the overload that takes an explicit bit range plus two tag arguments.
2184
+ keys,
2185
+ values,
2186
+ 0, // begin_bit position: start at bit 0.
2187
+ detail::radix::traits_t<KeyT>::default_end_bit(decomposer), // end_bit position: obtained from the key traits for this decomposer.
2188
+ ::cuda::std::true_type(), // Descending order (the ascending overloads pass false_type in this position).
2189
+ detail::bool_constant_v<KEYS_ONLY>, // Compile-time tag built from the class-level KEYS_ONLY constant (declared outside this chunk).
2190
+ decomposer);
2191
+ }
2192
+
2193
+ //@} end member group
2194
+ };
2195
+
2196
+ CUB_NAMESPACE_END