cuda-cccl 0.3.4__cp311-cp311-manylinux_2_24_aarch64.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cuda-cccl might be problematic. Click here for more details.

Files changed (1926)
  1. cuda/cccl/__init__.py +27 -0
  2. cuda/cccl/_cuda_version_utils.py +24 -0
  3. cuda/cccl/cooperative/__init__.py +9 -0
  4. cuda/cccl/cooperative/experimental/__init__.py +24 -0
  5. cuda/cccl/headers/__init__.py +7 -0
  6. cuda/cccl/headers/include/__init__.py +1 -0
  7. cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +233 -0
  8. cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +1158 -0
  9. cuda/cccl/headers/include/cub/agent/agent_for.cuh +55 -0
  10. cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +677 -0
  11. cuda/cccl/headers/include/cub/agent/agent_merge.cuh +234 -0
  12. cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +722 -0
  13. cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +761 -0
  14. cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +282 -0
  15. cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +702 -0
  16. cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +552 -0
  17. cuda/cccl/headers/include/cub/agent/agent_reduce.cuh +592 -0
  18. cuda/cccl/headers/include/cub/agent/agent_reduce_by_key.cuh +780 -0
  19. cuda/cccl/headers/include/cub/agent/agent_rle.cuh +1095 -0
  20. cuda/cccl/headers/include/cub/agent/agent_scan.cuh +562 -0
  21. cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +448 -0
  22. cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +263 -0
  23. cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +1088 -0
  24. cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +320 -0
  25. cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +584 -0
  26. cuda/cccl/headers/include/cub/agent/agent_topk.cuh +762 -0
  27. cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +605 -0
  28. cuda/cccl/headers/include/cub/agent/single_pass_scan_operators.cuh +1399 -0
  29. cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +939 -0
  30. cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +1203 -0
  31. cuda/cccl/headers/include/cub/block/block_exchange.cuh +1279 -0
  32. cuda/cccl/headers/include/cub/block/block_histogram.cuh +400 -0
  33. cuda/cccl/headers/include/cub/block/block_load.cuh +1242 -0
  34. cuda/cccl/headers/include/cub/block/block_load_to_shared.cuh +416 -0
  35. cuda/cccl/headers/include/cub/block/block_merge_sort.cuh +771 -0
  36. cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +1203 -0
  37. cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +2132 -0
  38. cuda/cccl/headers/include/cub/block/block_raking_layout.cuh +126 -0
  39. cuda/cccl/headers/include/cub/block/block_reduce.cuh +642 -0
  40. cuda/cccl/headers/include/cub/block/block_run_length_decode.cuh +406 -0
  41. cuda/cccl/headers/include/cub/block/block_scan.cuh +2287 -0
  42. cuda/cccl/headers/include/cub/block/block_shuffle.cuh +322 -0
  43. cuda/cccl/headers/include/cub/block/block_store.cuh +1223 -0
  44. cuda/cccl/headers/include/cub/block/radix_rank_sort_operations.cuh +597 -0
  45. cuda/cccl/headers/include/cub/block/specializations/block_histogram_atomic.cuh +62 -0
  46. cuda/cccl/headers/include/cub/block/specializations/block_histogram_sort.cuh +216 -0
  47. cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking.cuh +230 -0
  48. cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking_commutative_only.cuh +214 -0
  49. cuda/cccl/headers/include/cub/block/specializations/block_reduce_warp_reductions.cuh +257 -0
  50. cuda/cccl/headers/include/cub/block/specializations/block_scan_raking.cuh +766 -0
  51. cuda/cccl/headers/include/cub/block/specializations/block_scan_warp_scans.cuh +514 -0
  52. cuda/cccl/headers/include/cub/config.cuh +29 -0
  53. cuda/cccl/headers/include/cub/cub.cuh +96 -0
  54. cuda/cccl/headers/include/cub/detail/array_utils.cuh +54 -0
  55. cuda/cccl/headers/include/cub/detail/choose_offset.cuh +135 -0
  56. cuda/cccl/headers/include/cub/detail/detect_cuda_runtime.cuh +50 -0
  57. cuda/cccl/headers/include/cub/detail/device_double_buffer.cuh +94 -0
  58. cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +60 -0
  59. cuda/cccl/headers/include/cub/detail/fast_modulo_division.cuh +227 -0
  60. cuda/cccl/headers/include/cub/detail/integer_utils.cuh +86 -0
  61. cuda/cccl/headers/include/cub/detail/launcher/cuda_driver.cuh +140 -0
  62. cuda/cccl/headers/include/cub/detail/launcher/cuda_runtime.cuh +98 -0
  63. cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +112 -0
  64. cuda/cccl/headers/include/cub/detail/ptx-json/README.md +66 -0
  65. cuda/cccl/headers/include/cub/detail/ptx-json/array.h +41 -0
  66. cuda/cccl/headers/include/cub/detail/ptx-json/json.h +39 -0
  67. cuda/cccl/headers/include/cub/detail/ptx-json/object.h +71 -0
  68. cuda/cccl/headers/include/cub/detail/ptx-json/string.h +79 -0
  69. cuda/cccl/headers/include/cub/detail/ptx-json/value.h +95 -0
  70. cuda/cccl/headers/include/cub/detail/ptx-json-parser.h +39 -0
  71. cuda/cccl/headers/include/cub/detail/rfa.cuh +706 -0
  72. cuda/cccl/headers/include/cub/detail/strong_load.cuh +163 -0
  73. cuda/cccl/headers/include/cub/detail/strong_store.cuh +194 -0
  74. cuda/cccl/headers/include/cub/detail/temporary_storage.cuh +377 -0
  75. cuda/cccl/headers/include/cub/detail/type_traits.cuh +185 -0
  76. cuda/cccl/headers/include/cub/detail/uninitialized_copy.cuh +48 -0
  77. cuda/cccl/headers/include/cub/detail/unsafe_bitcast.cuh +33 -0
  78. cuda/cccl/headers/include/cub/device/device_adjacent_difference.cuh +572 -0
  79. cuda/cccl/headers/include/cub/device/device_copy.cuh +276 -0
  80. cuda/cccl/headers/include/cub/device/device_for.cuh +1061 -0
  81. cuda/cccl/headers/include/cub/device/device_histogram.cuh +1485 -0
  82. cuda/cccl/headers/include/cub/device/device_memcpy.cuh +171 -0
  83. cuda/cccl/headers/include/cub/device/device_merge.cuh +203 -0
  84. cuda/cccl/headers/include/cub/device/device_merge_sort.cuh +955 -0
  85. cuda/cccl/headers/include/cub/device/device_partition.cuh +644 -0
  86. cuda/cccl/headers/include/cub/device/device_radix_sort.cuh +3413 -0
  87. cuda/cccl/headers/include/cub/device/device_reduce.cuh +2497 -0
  88. cuda/cccl/headers/include/cub/device/device_run_length_encode.cuh +346 -0
  89. cuda/cccl/headers/include/cub/device/device_scan.cuh +2187 -0
  90. cuda/cccl/headers/include/cub/device/device_segmented_radix_sort.cuh +1472 -0
  91. cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +1406 -0
  92. cuda/cccl/headers/include/cub/device/device_segmented_sort.cuh +2787 -0
  93. cuda/cccl/headers/include/cub/device/device_select.cuh +1204 -0
  94. cuda/cccl/headers/include/cub/device/device_topk.cuh +511 -0
  95. cuda/cccl/headers/include/cub/device/device_transform.cuh +668 -0
  96. cuda/cccl/headers/include/cub/device/dispatch/dispatch_adjacent_difference.cuh +289 -0
  97. cuda/cccl/headers/include/cub/device/dispatch/dispatch_batch_memcpy.cuh +694 -0
  98. cuda/cccl/headers/include/cub/device/dispatch/dispatch_common.cuh +43 -0
  99. cuda/cccl/headers/include/cub/device/dispatch/dispatch_copy_mdspan.cuh +77 -0
  100. cuda/cccl/headers/include/cub/device/dispatch/dispatch_for.cuh +172 -0
  101. cuda/cccl/headers/include/cub/device/dispatch/dispatch_histogram.cuh +1026 -0
  102. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +303 -0
  103. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge_sort.cuh +449 -0
  104. cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +1719 -0
  105. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +1283 -0
  106. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_by_key.cuh +629 -0
  107. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +504 -0
  108. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +312 -0
  109. cuda/cccl/headers/include/cub/device/dispatch/dispatch_rle.cuh +603 -0
  110. cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan.cuh +491 -0
  111. cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan_by_key.cuh +577 -0
  112. cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +951 -0
  113. cuda/cccl/headers/include/cub/device/dispatch/dispatch_select_if.cuh +818 -0
  114. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +339 -0
  115. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +455 -0
  116. cuda/cccl/headers/include/cub/device/dispatch/dispatch_three_way_partition.cuh +364 -0
  117. cuda/cccl/headers/include/cub/device/dispatch/dispatch_topk.cuh +626 -0
  118. cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +541 -0
  119. cuda/cccl/headers/include/cub/device/dispatch/dispatch_unique_by_key.cuh +521 -0
  120. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_for_each.cuh +259 -0
  121. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_histogram.cuh +497 -0
  122. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_merge_sort.cuh +332 -0
  123. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_radix_sort.cuh +801 -0
  124. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_reduce.cuh +557 -0
  125. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_scan.cuh +163 -0
  126. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_segmented_reduce.cuh +295 -0
  127. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_segmented_sort.cuh +521 -0
  128. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_three_way_partition.cuh +200 -0
  129. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_transform.cuh +1028 -0
  130. cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_unique_by_key.cuh +175 -0
  131. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +43 -0
  132. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +94 -0
  133. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +34 -0
  134. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +255 -0
  135. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +52 -0
  136. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +100 -0
  137. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +1063 -0
  138. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce.cuh +468 -0
  139. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +918 -0
  140. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +647 -0
  141. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +594 -0
  142. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +986 -0
  143. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +373 -0
  144. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_select_if.cuh +1563 -0
  145. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +415 -0
  146. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_topk.cuh +84 -0
  147. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +456 -0
  148. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +858 -0
  149. cuda/cccl/headers/include/cub/grid/grid_even_share.cuh +203 -0
  150. cuda/cccl/headers/include/cub/grid/grid_mapping.cuh +82 -0
  151. cuda/cccl/headers/include/cub/grid/grid_queue.cuh +178 -0
  152. cuda/cccl/headers/include/cub/iterator/arg_index_input_iterator.cuh +230 -0
  153. cuda/cccl/headers/include/cub/iterator/cache_modified_input_iterator.cuh +235 -0
  154. cuda/cccl/headers/include/cub/iterator/cache_modified_output_iterator.cuh +226 -0
  155. cuda/cccl/headers/include/cub/iterator/tex_obj_input_iterator.cuh +296 -0
  156. cuda/cccl/headers/include/cub/thread/thread_load.cuh +324 -0
  157. cuda/cccl/headers/include/cub/thread/thread_operators.cuh +664 -0
  158. cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +525 -0
  159. cuda/cccl/headers/include/cub/thread/thread_scan.cuh +472 -0
  160. cuda/cccl/headers/include/cub/thread/thread_search.cuh +175 -0
  161. cuda/cccl/headers/include/cub/thread/thread_simd.cuh +456 -0
  162. cuda/cccl/headers/include/cub/thread/thread_sort.cuh +78 -0
  163. cuda/cccl/headers/include/cub/thread/thread_store.cuh +341 -0
  164. cuda/cccl/headers/include/cub/util_allocator.cuh +897 -0
  165. cuda/cccl/headers/include/cub/util_arch.cuh +141 -0
  166. cuda/cccl/headers/include/cub/util_cpp_dialect.cuh +71 -0
  167. cuda/cccl/headers/include/cub/util_debug.cuh +183 -0
  168. cuda/cccl/headers/include/cub/util_device.cuh +759 -0
  169. cuda/cccl/headers/include/cub/util_macro.cuh +73 -0
  170. cuda/cccl/headers/include/cub/util_math.cuh +92 -0
  171. cuda/cccl/headers/include/cub/util_namespace.cuh +151 -0
  172. cuda/cccl/headers/include/cub/util_policy_wrapper_t.cuh +31 -0
  173. cuda/cccl/headers/include/cub/util_ptx.cuh +489 -0
  174. cuda/cccl/headers/include/cub/util_temporary_storage.cuh +96 -0
  175. cuda/cccl/headers/include/cub/util_type.cuh +1093 -0
  176. cuda/cccl/headers/include/cub/util_vsmem.cuh +227 -0
  177. cuda/cccl/headers/include/cub/version.cuh +65 -0
  178. cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_shfl.cuh +304 -0
  179. cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_smem.cuh +152 -0
  180. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +713 -0
  181. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +378 -0
  182. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +928 -0
  183. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_smem.cuh +691 -0
  184. cuda/cccl/headers/include/cub/warp/warp_exchange.cuh +381 -0
  185. cuda/cccl/headers/include/cub/warp/warp_load.cuh +591 -0
  186. cuda/cccl/headers/include/cub/warp/warp_merge_sort.cuh +145 -0
  187. cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +810 -0
  188. cuda/cccl/headers/include/cub/warp/warp_scan.cuh +1866 -0
  189. cuda/cccl/headers/include/cub/warp/warp_store.cuh +498 -0
  190. cuda/cccl/headers/include/cub/warp/warp_utils.cuh +59 -0
  191. cuda/cccl/headers/include/cuda/__algorithm/common.h +68 -0
  192. cuda/cccl/headers/include/cuda/__algorithm/copy.h +196 -0
  193. cuda/cccl/headers/include/cuda/__algorithm/fill.h +107 -0
  194. cuda/cccl/headers/include/cuda/__annotated_ptr/access_property.h +165 -0
  195. cuda/cccl/headers/include/cuda/__annotated_ptr/access_property_encoding.h +172 -0
  196. cuda/cccl/headers/include/cuda/__annotated_ptr/annotated_ptr.h +217 -0
  197. cuda/cccl/headers/include/cuda/__annotated_ptr/annotated_ptr_base.h +100 -0
  198. cuda/cccl/headers/include/cuda/__annotated_ptr/apply_access_property.h +83 -0
  199. cuda/cccl/headers/include/cuda/__annotated_ptr/associate_access_property.h +128 -0
  200. cuda/cccl/headers/include/cuda/__annotated_ptr/createpolicy.h +210 -0
  201. cuda/cccl/headers/include/cuda/__atomic/atomic.h +145 -0
  202. cuda/cccl/headers/include/cuda/__barrier/async_contract_fulfillment.h +39 -0
  203. cuda/cccl/headers/include/cuda/__barrier/barrier.h +65 -0
  204. cuda/cccl/headers/include/cuda/__barrier/barrier_arrive_tx.h +101 -0
  205. cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +529 -0
  206. cuda/cccl/headers/include/cuda/__barrier/barrier_expect_tx.h +73 -0
  207. cuda/cccl/headers/include/cuda/__barrier/barrier_thread_scope.h +60 -0
  208. cuda/cccl/headers/include/cuda/__bit/bit_reverse.h +171 -0
  209. cuda/cccl/headers/include/cuda/__bit/bitfield.h +122 -0
  210. cuda/cccl/headers/include/cuda/__bit/bitmask.h +90 -0
  211. cuda/cccl/headers/include/cuda/__cccl_config +37 -0
  212. cuda/cccl/headers/include/cuda/__cmath/ceil_div.h +124 -0
  213. cuda/cccl/headers/include/cuda/__cmath/fast_modulo_division.h +178 -0
  214. cuda/cccl/headers/include/cuda/__cmath/ilog.h +195 -0
  215. cuda/cccl/headers/include/cuda/__cmath/ipow.h +107 -0
  216. cuda/cccl/headers/include/cuda/__cmath/isqrt.h +80 -0
  217. cuda/cccl/headers/include/cuda/__cmath/mul_hi.h +146 -0
  218. cuda/cccl/headers/include/cuda/__cmath/neg.h +47 -0
  219. cuda/cccl/headers/include/cuda/__cmath/pow2.h +74 -0
  220. cuda/cccl/headers/include/cuda/__cmath/round_down.h +102 -0
  221. cuda/cccl/headers/include/cuda/__cmath/round_up.h +104 -0
  222. cuda/cccl/headers/include/cuda/__cmath/uabs.h +57 -0
  223. cuda/cccl/headers/include/cuda/__complex/complex.h +238 -0
  224. cuda/cccl/headers/include/cuda/__complex/get_real_imag.h +89 -0
  225. cuda/cccl/headers/include/cuda/__complex/traits.h +64 -0
  226. cuda/cccl/headers/include/cuda/__complex_ +28 -0
  227. cuda/cccl/headers/include/cuda/__device/all_devices.h +140 -0
  228. cuda/cccl/headers/include/cuda/__device/arch_id.h +176 -0
  229. cuda/cccl/headers/include/cuda/__device/arch_traits.h +537 -0
  230. cuda/cccl/headers/include/cuda/__device/attributes.h +772 -0
  231. cuda/cccl/headers/include/cuda/__device/compute_capability.h +171 -0
  232. cuda/cccl/headers/include/cuda/__device/device_ref.h +156 -0
  233. cuda/cccl/headers/include/cuda/__device/physical_device.h +172 -0
  234. cuda/cccl/headers/include/cuda/__driver/driver_api.h +848 -0
  235. cuda/cccl/headers/include/cuda/__event/event.h +171 -0
  236. cuda/cccl/headers/include/cuda/__event/event_ref.h +157 -0
  237. cuda/cccl/headers/include/cuda/__event/timed_event.h +120 -0
  238. cuda/cccl/headers/include/cuda/__execution/determinism.h +89 -0
  239. cuda/cccl/headers/include/cuda/__execution/output_ordering.h +87 -0
  240. cuda/cccl/headers/include/cuda/__execution/require.h +75 -0
  241. cuda/cccl/headers/include/cuda/__execution/tune.h +70 -0
  242. cuda/cccl/headers/include/cuda/__functional/address_stability.h +131 -0
  243. cuda/cccl/headers/include/cuda/__functional/for_each_canceled.h +321 -0
  244. cuda/cccl/headers/include/cuda/__functional/maximum.h +76 -0
  245. cuda/cccl/headers/include/cuda/__functional/minimum.h +76 -0
  246. cuda/cccl/headers/include/cuda/__functional/minimum_maximum_common.h +52 -0
  247. cuda/cccl/headers/include/cuda/__functional/proclaim_return_type.h +106 -0
  248. cuda/cccl/headers/include/cuda/__fwd/barrier.h +38 -0
  249. cuda/cccl/headers/include/cuda/__fwd/complex.h +48 -0
  250. cuda/cccl/headers/include/cuda/__fwd/devices.h +44 -0
  251. cuda/cccl/headers/include/cuda/__fwd/get_stream.h +38 -0
  252. cuda/cccl/headers/include/cuda/__fwd/pipeline.h +37 -0
  253. cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +58 -0
  254. cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +315 -0
  255. cuda/cccl/headers/include/cuda/__iterator/counting_iterator.h +492 -0
  256. cuda/cccl/headers/include/cuda/__iterator/discard_iterator.h +324 -0
  257. cuda/cccl/headers/include/cuda/__iterator/permutation_iterator.h +456 -0
  258. cuda/cccl/headers/include/cuda/__iterator/shuffle_iterator.h +334 -0
  259. cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +418 -0
  260. cuda/cccl/headers/include/cuda/__iterator/tabulate_output_iterator.h +367 -0
  261. cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +528 -0
  262. cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +527 -0
  263. cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +486 -0
  264. cuda/cccl/headers/include/cuda/__iterator/zip_common.h +148 -0
  265. cuda/cccl/headers/include/cuda/__iterator/zip_function.h +114 -0
  266. cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +557 -0
  267. cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +592 -0
  268. cuda/cccl/headers/include/cuda/__latch/latch.h +44 -0
  269. cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +532 -0
  270. cuda/cccl/headers/include/cuda/__mdspan/host_device_mdspan.h +238 -0
  271. cuda/cccl/headers/include/cuda/__mdspan/restrict_accessor.h +152 -0
  272. cuda/cccl/headers/include/cuda/__mdspan/restrict_mdspan.h +117 -0
  273. cuda/cccl/headers/include/cuda/__memcpy_async/check_preconditions.h +79 -0
  274. cuda/cccl/headers/include/cuda/__memcpy_async/completion_mechanism.h +47 -0
  275. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_bulk_shared_global.h +81 -0
  276. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_fallback.h +72 -0
  277. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_shared_global.h +148 -0
  278. cuda/cccl/headers/include/cuda/__memcpy_async/dispatch_memcpy_async.h +165 -0
  279. cuda/cccl/headers/include/cuda/__memcpy_async/elect_one.h +52 -0
  280. cuda/cccl/headers/include/cuda/__memcpy_async/is_local_smem_barrier.h +53 -0
  281. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async.h +179 -0
  282. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_barrier.h +99 -0
  283. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_tx.h +103 -0
  284. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_completion.h +164 -0
  285. cuda/cccl/headers/include/cuda/__memcpy_async/try_get_barrier_handle.h +58 -0
  286. cuda/cccl/headers/include/cuda/__memory/address_space.h +227 -0
  287. cuda/cccl/headers/include/cuda/__memory/align_down.h +56 -0
  288. cuda/cccl/headers/include/cuda/__memory/align_up.h +56 -0
  289. cuda/cccl/headers/include/cuda/__memory/aligned_size.h +61 -0
  290. cuda/cccl/headers/include/cuda/__memory/check_address.h +111 -0
  291. cuda/cccl/headers/include/cuda/__memory/discard_memory.h +64 -0
  292. cuda/cccl/headers/include/cuda/__memory/get_device_address.h +58 -0
  293. cuda/cccl/headers/include/cuda/__memory/is_aligned.h +47 -0
  294. cuda/cccl/headers/include/cuda/__memory/ptr_in_range.h +93 -0
  295. cuda/cccl/headers/include/cuda/__memory/ptr_rebind.h +75 -0
  296. cuda/cccl/headers/include/cuda/__memory/ranges_overlap.h +126 -0
  297. cuda/cccl/headers/include/cuda/__memory_resource/any_resource.h +898 -0
  298. cuda/cccl/headers/include/cuda/__memory_resource/device_memory_pool.h +149 -0
  299. cuda/cccl/headers/include/cuda/__memory_resource/get_memory_resource.h +82 -0
  300. cuda/cccl/headers/include/cuda/__memory_resource/get_property.h +153 -0
  301. cuda/cccl/headers/include/cuda/__memory_resource/legacy_managed_memory_resource.h +148 -0
  302. cuda/cccl/headers/include/cuda/__memory_resource/legacy_pinned_memory_resource.h +139 -0
  303. cuda/cccl/headers/include/cuda/__memory_resource/managed_memory_pool.h +146 -0
  304. cuda/cccl/headers/include/cuda/__memory_resource/memory_resource_base.h +578 -0
  305. cuda/cccl/headers/include/cuda/__memory_resource/pinned_memory_pool.h +188 -0
  306. cuda/cccl/headers/include/cuda/__memory_resource/properties.h +113 -0
  307. cuda/cccl/headers/include/cuda/__memory_resource/resource.h +159 -0
  308. cuda/cccl/headers/include/cuda/__numeric/add_overflow.h +316 -0
  309. cuda/cccl/headers/include/cuda/__numeric/div_overflow.h +150 -0
  310. cuda/cccl/headers/include/cuda/__numeric/narrow.h +108 -0
  311. cuda/cccl/headers/include/cuda/__numeric/overflow_cast.h +59 -0
  312. cuda/cccl/headers/include/cuda/__numeric/overflow_result.h +43 -0
  313. cuda/cccl/headers/include/cuda/__numeric/sub_overflow.h +344 -0
  314. cuda/cccl/headers/include/cuda/__nvtx/nvtx.h +120 -0
  315. cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +2977 -0
  316. cuda/cccl/headers/include/cuda/__ptx/instructions/barrier_cluster.h +43 -0
  317. cuda/cccl/headers/include/cuda/__ptx/instructions/bfind.h +41 -0
  318. cuda/cccl/headers/include/cuda/__ptx/instructions/bmsk.h +41 -0
  319. cuda/cccl/headers/include/cuda/__ptx/instructions/clusterlaunchcontrol.h +41 -0
  320. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk.h +44 -0
  321. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_commit_group.h +43 -0
  322. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h +45 -0
  323. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_wait_group.h +43 -0
  324. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_mbarrier_arrive.h +42 -0
  325. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_reduce_async_bulk.h +60 -0
  326. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_reduce_async_bulk_tensor.h +43 -0
  327. cuda/cccl/headers/include/cuda/__ptx/instructions/elect_sync.h +41 -0
  328. cuda/cccl/headers/include/cuda/__ptx/instructions/exit.h +41 -0
  329. cuda/cccl/headers/include/cuda/__ptx/instructions/fence.h +49 -0
  330. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/barrier_cluster.h +115 -0
  331. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/bfind.h +190 -0
  332. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/bmsk.h +54 -0
  333. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/clusterlaunchcontrol.h +242 -0
  334. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk.h +197 -0
  335. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.h +25 -0
  336. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.h +54 -0
  337. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.h +997 -0
  338. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_gather_scatter.h +318 -0
  339. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.h +671 -0
  340. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.h +46 -0
  341. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive.h +26 -0
  342. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive_noinc.h +26 -0
  343. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.h +1470 -0
  344. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.h +132 -0
  345. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.h +132 -0
  346. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.h +601 -0
  347. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/elect_sync.h +36 -0
  348. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/exit.h +25 -0
  349. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence.h +208 -0
  350. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.h +31 -0
  351. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_alias.h +25 -0
  352. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_async.h +58 -0
  353. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_async_generic_sync_restrict.h +64 -0
  354. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.h +102 -0
  355. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_sync_restrict.h +64 -0
  356. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/get_sreg.h +949 -0
  357. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/getctarank.h +32 -0
  358. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/ld.h +5542 -0
  359. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive.h +399 -0
  360. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.h +184 -0
  361. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.h +34 -0
  362. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_expect_tx.h +102 -0
  363. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_init.h +27 -0
  364. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.h +143 -0
  365. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.h +144 -0
  366. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.h +286 -0
  367. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.h +290 -0
  368. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_ld_reduce.h +2202 -0
  369. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_red.h +1362 -0
  370. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_st.h +236 -0
  371. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/prmt.h +230 -0
  372. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/red_async.h +460 -0
  373. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/shl.h +96 -0
  374. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/shr.h +168 -0
  375. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st.h +1490 -0
  376. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st_async.h +123 -0
  377. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st_bulk.h +31 -0
  378. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_alloc.h +132 -0
  379. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_commit.h +99 -0
  380. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_cp.h +765 -0
  381. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_fence.h +58 -0
  382. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_ld.h +4927 -0
  383. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_mma.h +4291 -0
  384. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_mma_ws.h +7110 -0
  385. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_shift.h +42 -0
  386. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_st.h +5063 -0
  387. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_wait.h +56 -0
  388. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.h +71 -0
  389. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tensormap_replace.h +1030 -0
  390. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/trap.h +25 -0
  391. cuda/cccl/headers/include/cuda/__ptx/instructions/get_sreg.h +43 -0
  392. cuda/cccl/headers/include/cuda/__ptx/instructions/getctarank.h +43 -0
  393. cuda/cccl/headers/include/cuda/__ptx/instructions/ld.h +41 -0
  394. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_arrive.h +45 -0
  395. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_expect_tx.h +41 -0
  396. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_init.h +43 -0
  397. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_wait.h +46 -0
  398. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_ld_reduce.h +41 -0
  399. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_red.h +41 -0
  400. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_st.h +41 -0
  401. cuda/cccl/headers/include/cuda/__ptx/instructions/prmt.h +41 -0
  402. cuda/cccl/headers/include/cuda/__ptx/instructions/red_async.h +43 -0
  403. cuda/cccl/headers/include/cuda/__ptx/instructions/shfl_sync.h +244 -0
  404. cuda/cccl/headers/include/cuda/__ptx/instructions/shl.h +41 -0
  405. cuda/cccl/headers/include/cuda/__ptx/instructions/shr.h +41 -0
  406. cuda/cccl/headers/include/cuda/__ptx/instructions/st.h +41 -0
  407. cuda/cccl/headers/include/cuda/__ptx/instructions/st_async.h +43 -0
  408. cuda/cccl/headers/include/cuda/__ptx/instructions/st_bulk.h +41 -0
  409. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_alloc.h +41 -0
  410. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_commit.h +41 -0
  411. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_cp.h +41 -0
  412. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_fence.h +41 -0
  413. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_ld.h +41 -0
  414. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_mma.h +41 -0
  415. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_mma_ws.h +41 -0
  416. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_shift.h +41 -0
  417. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_st.h +41 -0
  418. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_wait.h +41 -0
  419. cuda/cccl/headers/include/cuda/__ptx/instructions/tensormap_cp_fenceproxy.h +43 -0
  420. cuda/cccl/headers/include/cuda/__ptx/instructions/tensormap_replace.h +43 -0
  421. cuda/cccl/headers/include/cuda/__ptx/instructions/trap.h +41 -0
  422. cuda/cccl/headers/include/cuda/__ptx/pragmas/enable_smem_spilling.h +47 -0
  423. cuda/cccl/headers/include/cuda/__ptx/ptx_dot_variants.h +230 -0
  424. cuda/cccl/headers/include/cuda/__ptx/ptx_helper_functions.h +176 -0
  425. cuda/cccl/headers/include/cuda/__random/feistel_bijection.h +105 -0
  426. cuda/cccl/headers/include/cuda/__random/random_bijection.h +88 -0
  427. cuda/cccl/headers/include/cuda/__runtime/api_wrapper.h +62 -0
  428. cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +99 -0
  429. cuda/cccl/headers/include/cuda/__runtime/types.h +41 -0
  430. cuda/cccl/headers/include/cuda/__semaphore/counting_semaphore.h +53 -0
  431. cuda/cccl/headers/include/cuda/__stream/get_stream.h +109 -0
  432. cuda/cccl/headers/include/cuda/__stream/internal_streams.h +44 -0
  433. cuda/cccl/headers/include/cuda/__stream/stream.h +141 -0
  434. cuda/cccl/headers/include/cuda/__stream/stream_ref.h +303 -0
  435. cuda/cccl/headers/include/cuda/__type_traits/is_floating_point.h +47 -0
  436. cuda/cccl/headers/include/cuda/__type_traits/is_specialization_of.h +37 -0
  437. cuda/cccl/headers/include/cuda/__utility/__basic_any/access.h +88 -0
  438. cuda/cccl/headers/include/cuda/__utility/__basic_any/any_cast.h +83 -0
  439. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_base.h +148 -0
  440. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_from.h +96 -0
  441. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_fwd.h +128 -0
  442. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_ptr.h +304 -0
  443. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_ref.h +337 -0
  444. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_value.h +591 -0
  445. cuda/cccl/headers/include/cuda/__utility/__basic_any/conversions.h +169 -0
  446. cuda/cccl/headers/include/cuda/__utility/__basic_any/dynamic_any_cast.h +107 -0
  447. cuda/cccl/headers/include/cuda/__utility/__basic_any/interfaces.h +359 -0
  448. cuda/cccl/headers/include/cuda/__utility/__basic_any/iset.h +142 -0
  449. cuda/cccl/headers/include/cuda/__utility/__basic_any/overrides.h +64 -0
  450. cuda/cccl/headers/include/cuda/__utility/__basic_any/rtti.h +257 -0
  451. cuda/cccl/headers/include/cuda/__utility/__basic_any/semiregular.h +323 -0
  452. cuda/cccl/headers/include/cuda/__utility/__basic_any/storage.h +79 -0
  453. cuda/cccl/headers/include/cuda/__utility/__basic_any/tagged_ptr.h +58 -0
  454. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtcall.h +163 -0
  455. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_functions.h +184 -0
  456. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_ptrs.h +80 -0
  457. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_tables.h +155 -0
  458. cuda/cccl/headers/include/cuda/__utility/basic_any.h +507 -0
  459. cuda/cccl/headers/include/cuda/__utility/immovable.h +50 -0
  460. cuda/cccl/headers/include/cuda/__utility/in_range.h +65 -0
  461. cuda/cccl/headers/include/cuda/__utility/inherit.h +36 -0
  462. cuda/cccl/headers/include/cuda/__utility/no_init.h +29 -0
  463. cuda/cccl/headers/include/cuda/__utility/static_for.h +79 -0
  464. cuda/cccl/headers/include/cuda/__warp/lane_mask.h +326 -0
  465. cuda/cccl/headers/include/cuda/__warp/warp_match_all.h +65 -0
  466. cuda/cccl/headers/include/cuda/__warp/warp_shuffle.h +251 -0
  467. cuda/cccl/headers/include/cuda/access_property +26 -0
  468. cuda/cccl/headers/include/cuda/algorithm +27 -0
  469. cuda/cccl/headers/include/cuda/annotated_ptr +29 -0
  470. cuda/cccl/headers/include/cuda/atomic +27 -0
  471. cuda/cccl/headers/include/cuda/barrier +293 -0
  472. cuda/cccl/headers/include/cuda/bit +29 -0
  473. cuda/cccl/headers/include/cuda/cmath +37 -0
  474. cuda/cccl/headers/include/cuda/devices +33 -0
  475. cuda/cccl/headers/include/cuda/discard_memory +32 -0
  476. cuda/cccl/headers/include/cuda/functional +32 -0
  477. cuda/cccl/headers/include/cuda/iterator +39 -0
  478. cuda/cccl/headers/include/cuda/latch +27 -0
  479. cuda/cccl/headers/include/cuda/mdspan +28 -0
  480. cuda/cccl/headers/include/cuda/memory +36 -0
  481. cuda/cccl/headers/include/cuda/memory_resource +40 -0
  482. cuda/cccl/headers/include/cuda/numeric +31 -0
  483. cuda/cccl/headers/include/cuda/pipeline +580 -0
  484. cuda/cccl/headers/include/cuda/ptx +129 -0
  485. cuda/cccl/headers/include/cuda/semaphore +31 -0
  486. cuda/cccl/headers/include/cuda/std/__algorithm/adjacent_find.h +59 -0
  487. cuda/cccl/headers/include/cuda/std/__algorithm/all_of.h +45 -0
  488. cuda/cccl/headers/include/cuda/std/__algorithm/any_of.h +45 -0
  489. cuda/cccl/headers/include/cuda/std/__algorithm/binary_search.h +53 -0
  490. cuda/cccl/headers/include/cuda/std/__algorithm/clamp.h +48 -0
  491. cuda/cccl/headers/include/cuda/std/__algorithm/comp.h +58 -0
  492. cuda/cccl/headers/include/cuda/std/__algorithm/comp_ref_type.h +85 -0
  493. cuda/cccl/headers/include/cuda/std/__algorithm/copy.h +142 -0
  494. cuda/cccl/headers/include/cuda/std/__algorithm/copy_backward.h +80 -0
  495. cuda/cccl/headers/include/cuda/std/__algorithm/copy_if.h +47 -0
  496. cuda/cccl/headers/include/cuda/std/__algorithm/copy_n.h +73 -0
  497. cuda/cccl/headers/include/cuda/std/__algorithm/count.h +49 -0
  498. cuda/cccl/headers/include/cuda/std/__algorithm/count_if.h +49 -0
  499. cuda/cccl/headers/include/cuda/std/__algorithm/equal.h +128 -0
  500. cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +101 -0
  501. cuda/cccl/headers/include/cuda/std/__algorithm/fill.h +58 -0
  502. cuda/cccl/headers/include/cuda/std/__algorithm/fill_n.h +51 -0
  503. cuda/cccl/headers/include/cuda/std/__algorithm/find.h +62 -0
  504. cuda/cccl/headers/include/cuda/std/__algorithm/find_end.h +225 -0
  505. cuda/cccl/headers/include/cuda/std/__algorithm/find_first_of.h +73 -0
  506. cuda/cccl/headers/include/cuda/std/__algorithm/find_if.h +46 -0
  507. cuda/cccl/headers/include/cuda/std/__algorithm/find_if_not.h +46 -0
  508. cuda/cccl/headers/include/cuda/std/__algorithm/for_each.h +42 -0
  509. cuda/cccl/headers/include/cuda/std/__algorithm/for_each_n.h +48 -0
  510. cuda/cccl/headers/include/cuda/std/__algorithm/generate.h +41 -0
  511. cuda/cccl/headers/include/cuda/std/__algorithm/generate_n.h +46 -0
  512. cuda/cccl/headers/include/cuda/std/__algorithm/half_positive.h +49 -0
  513. cuda/cccl/headers/include/cuda/std/__algorithm/in_fun_result.h +55 -0
  514. cuda/cccl/headers/include/cuda/std/__algorithm/includes.h +90 -0
  515. cuda/cccl/headers/include/cuda/std/__algorithm/is_heap.h +50 -0
  516. cuda/cccl/headers/include/cuda/std/__algorithm/is_heap_until.h +83 -0
  517. cuda/cccl/headers/include/cuda/std/__algorithm/is_partitioned.h +57 -0
  518. cuda/cccl/headers/include/cuda/std/__algorithm/is_permutation.h +252 -0
  519. cuda/cccl/headers/include/cuda/std/__algorithm/is_sorted.h +49 -0
  520. cuda/cccl/headers/include/cuda/std/__algorithm/is_sorted_until.h +68 -0
  521. cuda/cccl/headers/include/cuda/std/__algorithm/iter_swap.h +82 -0
  522. cuda/cccl/headers/include/cuda/std/__algorithm/iterator_operations.h +185 -0
  523. cuda/cccl/headers/include/cuda/std/__algorithm/lexicographical_compare.h +68 -0
  524. cuda/cccl/headers/include/cuda/std/__algorithm/lower_bound.h +82 -0
  525. cuda/cccl/headers/include/cuda/std/__algorithm/make_heap.h +70 -0
  526. cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +88 -0
  527. cuda/cccl/headers/include/cuda/std/__algorithm/max.h +62 -0
  528. cuda/cccl/headers/include/cuda/std/__algorithm/max_element.h +67 -0
  529. cuda/cccl/headers/include/cuda/std/__algorithm/merge.h +89 -0
  530. cuda/cccl/headers/include/cuda/std/__algorithm/min.h +62 -0
  531. cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +87 -0
  532. cuda/cccl/headers/include/cuda/std/__algorithm/minmax.h +66 -0
  533. cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +139 -0
  534. cuda/cccl/headers/include/cuda/std/__algorithm/mismatch.h +83 -0
  535. cuda/cccl/headers/include/cuda/std/__algorithm/move.h +86 -0
  536. cuda/cccl/headers/include/cuda/std/__algorithm/move_backward.h +84 -0
  537. cuda/cccl/headers/include/cuda/std/__algorithm/next_permutation.h +88 -0
  538. cuda/cccl/headers/include/cuda/std/__algorithm/none_of.h +45 -0
  539. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort.h +102 -0
  540. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort_copy.h +122 -0
  541. cuda/cccl/headers/include/cuda/std/__algorithm/partition.h +120 -0
  542. cuda/cccl/headers/include/cuda/std/__algorithm/partition_copy.h +59 -0
  543. cuda/cccl/headers/include/cuda/std/__algorithm/partition_point.h +61 -0
  544. cuda/cccl/headers/include/cuda/std/__algorithm/pop_heap.h +93 -0
  545. cuda/cccl/headers/include/cuda/std/__algorithm/prev_permutation.h +88 -0
  546. cuda/cccl/headers/include/cuda/std/__algorithm/push_heap.h +100 -0
  547. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_for_each.h +84 -0
  548. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_for_each_n.h +68 -0
  549. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_iterator_concept.h +65 -0
  550. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_min.h +98 -0
  551. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_min_element.h +68 -0
  552. cuda/cccl/headers/include/cuda/std/__algorithm/remove.h +55 -0
  553. cuda/cccl/headers/include/cuda/std/__algorithm/remove_copy.h +47 -0
  554. cuda/cccl/headers/include/cuda/std/__algorithm/remove_copy_if.h +47 -0
  555. cuda/cccl/headers/include/cuda/std/__algorithm/remove_if.h +56 -0
  556. cuda/cccl/headers/include/cuda/std/__algorithm/replace.h +45 -0
  557. cuda/cccl/headers/include/cuda/std/__algorithm/replace_copy.h +54 -0
  558. cuda/cccl/headers/include/cuda/std/__algorithm/replace_copy_if.h +50 -0
  559. cuda/cccl/headers/include/cuda/std/__algorithm/replace_if.h +45 -0
  560. cuda/cccl/headers/include/cuda/std/__algorithm/reverse.h +81 -0
  561. cuda/cccl/headers/include/cuda/std/__algorithm/reverse_copy.h +43 -0
  562. cuda/cccl/headers/include/cuda/std/__algorithm/rotate.h +261 -0
  563. cuda/cccl/headers/include/cuda/std/__algorithm/rotate_copy.h +40 -0
  564. cuda/cccl/headers/include/cuda/std/__algorithm/search.h +185 -0
  565. cuda/cccl/headers/include/cuda/std/__algorithm/search_n.h +163 -0
  566. cuda/cccl/headers/include/cuda/std/__algorithm/set_difference.h +95 -0
  567. cuda/cccl/headers/include/cuda/std/__algorithm/set_intersection.h +122 -0
  568. cuda/cccl/headers/include/cuda/std/__algorithm/set_symmetric_difference.h +134 -0
  569. cuda/cccl/headers/include/cuda/std/__algorithm/set_union.h +128 -0
  570. cuda/cccl/headers/include/cuda/std/__algorithm/shift_left.h +84 -0
  571. cuda/cccl/headers/include/cuda/std/__algorithm/shift_right.h +144 -0
  572. cuda/cccl/headers/include/cuda/std/__algorithm/sift_down.h +139 -0
  573. cuda/cccl/headers/include/cuda/std/__algorithm/sort_heap.h +70 -0
  574. cuda/cccl/headers/include/cuda/std/__algorithm/swap_ranges.h +78 -0
  575. cuda/cccl/headers/include/cuda/std/__algorithm/transform.h +59 -0
  576. cuda/cccl/headers/include/cuda/std/__algorithm/unique.h +76 -0
  577. cuda/cccl/headers/include/cuda/std/__algorithm/unique_copy.h +153 -0
  578. cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_iter.h +95 -0
  579. cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_range.h +126 -0
  580. cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +83 -0
  581. cuda/cccl/headers/include/cuda/std/__algorithm_ +26 -0
  582. cuda/cccl/headers/include/cuda/std/__atomic/api/common.h +192 -0
  583. cuda/cccl/headers/include/cuda/std/__atomic/api/owned.h +136 -0
  584. cuda/cccl/headers/include/cuda/std/__atomic/api/reference.h +118 -0
  585. cuda/cccl/headers/include/cuda/std/__atomic/functions/common.h +58 -0
  586. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_local.h +208 -0
  587. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_derived.h +458 -0
  588. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated.h +4437 -0
  589. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h +184 -0
  590. cuda/cccl/headers/include/cuda/std/__atomic/functions/host.h +242 -0
  591. cuda/cccl/headers/include/cuda/std/__atomic/functions.h +33 -0
  592. cuda/cccl/headers/include/cuda/std/__atomic/order.h +159 -0
  593. cuda/cccl/headers/include/cuda/std/__atomic/platform/msvc_to_builtins.h +654 -0
  594. cuda/cccl/headers/include/cuda/std/__atomic/platform.h +93 -0
  595. cuda/cccl/headers/include/cuda/std/__atomic/scopes.h +105 -0
  596. cuda/cccl/headers/include/cuda/std/__atomic/types/base.h +249 -0
  597. cuda/cccl/headers/include/cuda/std/__atomic/types/common.h +104 -0
  598. cuda/cccl/headers/include/cuda/std/__atomic/types/locked.h +225 -0
  599. cuda/cccl/headers/include/cuda/std/__atomic/types/reference.h +72 -0
  600. cuda/cccl/headers/include/cuda/std/__atomic/types/small.h +238 -0
  601. cuda/cccl/headers/include/cuda/std/__atomic/types.h +51 -0
  602. cuda/cccl/headers/include/cuda/std/__atomic/wait/notify_wait.h +95 -0
  603. cuda/cccl/headers/include/cuda/std/__atomic/wait/polling.h +65 -0
  604. cuda/cccl/headers/include/cuda/std/__barrier/barrier.h +227 -0
  605. cuda/cccl/headers/include/cuda/std/__barrier/empty_completion.h +37 -0
  606. cuda/cccl/headers/include/cuda/std/__barrier/poll_tester.h +82 -0
  607. cuda/cccl/headers/include/cuda/std/__bit/bit_cast.h +76 -0
  608. cuda/cccl/headers/include/cuda/std/__bit/byteswap.h +209 -0
  609. cuda/cccl/headers/include/cuda/std/__bit/countl.h +174 -0
  610. cuda/cccl/headers/include/cuda/std/__bit/countr.h +185 -0
  611. cuda/cccl/headers/include/cuda/std/__bit/endian.h +39 -0
  612. cuda/cccl/headers/include/cuda/std/__bit/has_single_bit.h +43 -0
  613. cuda/cccl/headers/include/cuda/std/__bit/integral.h +126 -0
  614. cuda/cccl/headers/include/cuda/std/__bit/popcount.h +154 -0
  615. cuda/cccl/headers/include/cuda/std/__bit/reference.h +1272 -0
  616. cuda/cccl/headers/include/cuda/std/__bit/rotate.h +94 -0
  617. cuda/cccl/headers/include/cuda/std/__cccl/algorithm_wrapper.h +36 -0
  618. cuda/cccl/headers/include/cuda/std/__cccl/architecture.h +78 -0
  619. cuda/cccl/headers/include/cuda/std/__cccl/assert.h +161 -0
  620. cuda/cccl/headers/include/cuda/std/__cccl/attributes.h +206 -0
  621. cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +645 -0
  622. cuda/cccl/headers/include/cuda/std/__cccl/compiler.h +217 -0
  623. cuda/cccl/headers/include/cuda/std/__cccl/cuda_capabilities.h +51 -0
  624. cuda/cccl/headers/include/cuda/std/__cccl/cuda_toolkit.h +56 -0
  625. cuda/cccl/headers/include/cuda/std/__cccl/deprecated.h +88 -0
  626. cuda/cccl/headers/include/cuda/std/__cccl/diagnostic.h +131 -0
  627. cuda/cccl/headers/include/cuda/std/__cccl/dialect.h +130 -0
  628. cuda/cccl/headers/include/cuda/std/__cccl/epilogue.h +354 -0
  629. cuda/cccl/headers/include/cuda/std/__cccl/exceptions.h +36 -0
  630. cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +74 -0
  631. cuda/cccl/headers/include/cuda/std/__cccl/extended_data_types.h +160 -0
  632. cuda/cccl/headers/include/cuda/std/__cccl/host_std_lib.h +52 -0
  633. cuda/cccl/headers/include/cuda/std/__cccl/is_non_narrowing_convertible.h +71 -0
  634. cuda/cccl/headers/include/cuda/std/__cccl/memory_wrapper.h +36 -0
  635. cuda/cccl/headers/include/cuda/std/__cccl/numeric_wrapper.h +36 -0
  636. cuda/cccl/headers/include/cuda/std/__cccl/os.h +54 -0
  637. cuda/cccl/headers/include/cuda/std/__cccl/preprocessor.h +1286 -0
  638. cuda/cccl/headers/include/cuda/std/__cccl/prologue.h +289 -0
  639. cuda/cccl/headers/include/cuda/std/__cccl/ptx_isa.h +253 -0
  640. cuda/cccl/headers/include/cuda/std/__cccl/rtti.h +72 -0
  641. cuda/cccl/headers/include/cuda/std/__cccl/sequence_access.h +87 -0
  642. cuda/cccl/headers/include/cuda/std/__cccl/system_header.h +38 -0
  643. cuda/cccl/headers/include/cuda/std/__cccl/unreachable.h +31 -0
  644. cuda/cccl/headers/include/cuda/std/__cccl/version.h +26 -0
  645. cuda/cccl/headers/include/cuda/std/__cccl/visibility.h +171 -0
  646. cuda/cccl/headers/include/cuda/std/__charconv/chars_format.h +81 -0
  647. cuda/cccl/headers/include/cuda/std/__charconv/from_chars.h +154 -0
  648. cuda/cccl/headers/include/cuda/std/__charconv/from_chars_result.h +56 -0
  649. cuda/cccl/headers/include/cuda/std/__charconv/to_chars.h +148 -0
  650. cuda/cccl/headers/include/cuda/std/__charconv/to_chars_result.h +56 -0
  651. cuda/cccl/headers/include/cuda/std/__charconv_ +31 -0
  652. cuda/cccl/headers/include/cuda/std/__chrono/calendar.h +52 -0
  653. cuda/cccl/headers/include/cuda/std/__chrono/day.h +160 -0
  654. cuda/cccl/headers/include/cuda/std/__chrono/duration.h +499 -0
  655. cuda/cccl/headers/include/cuda/std/__chrono/file_clock.h +53 -0
  656. cuda/cccl/headers/include/cuda/std/__chrono/high_resolution_clock.h +44 -0
  657. cuda/cccl/headers/include/cuda/std/__chrono/month.h +185 -0
  658. cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +58 -0
  659. cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +78 -0
  660. cuda/cccl/headers/include/cuda/std/__chrono/time_point.h +255 -0
  661. cuda/cccl/headers/include/cuda/std/__chrono/year.h +184 -0
  662. cuda/cccl/headers/include/cuda/std/__cmath/abs.h +127 -0
  663. cuda/cccl/headers/include/cuda/std/__cmath/copysign.h +88 -0
  664. cuda/cccl/headers/include/cuda/std/__cmath/error_functions.h +204 -0
  665. cuda/cccl/headers/include/cuda/std/__cmath/exponential_functions.h +783 -0
  666. cuda/cccl/headers/include/cuda/std/__cmath/fdim.h +122 -0
  667. cuda/cccl/headers/include/cuda/std/__cmath/fma.h +129 -0
  668. cuda/cccl/headers/include/cuda/std/__cmath/fpclassify.h +230 -0
  669. cuda/cccl/headers/include/cuda/std/__cmath/gamma.h +204 -0
  670. cuda/cccl/headers/include/cuda/std/__cmath/hyperbolic_functions.h +285 -0
  671. cuda/cccl/headers/include/cuda/std/__cmath/hypot.h +220 -0
  672. cuda/cccl/headers/include/cuda/std/__cmath/inverse_hyperbolic_functions.h +285 -0
  673. cuda/cccl/headers/include/cuda/std/__cmath/inverse_trigonometric_functions.h +370 -0
  674. cuda/cccl/headers/include/cuda/std/__cmath/isfinite.h +166 -0
  675. cuda/cccl/headers/include/cuda/std/__cmath/isinf.h +204 -0
  676. cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +185 -0
  677. cuda/cccl/headers/include/cuda/std/__cmath/isnormal.h +138 -0
  678. cuda/cccl/headers/include/cuda/std/__cmath/lerp.h +101 -0
  679. cuda/cccl/headers/include/cuda/std/__cmath/logarithms.h +533 -0
  680. cuda/cccl/headers/include/cuda/std/__cmath/min_max.h +287 -0
  681. cuda/cccl/headers/include/cuda/std/__cmath/modulo.h +208 -0
  682. cuda/cccl/headers/include/cuda/std/__cmath/nan.h +54 -0
  683. cuda/cccl/headers/include/cuda/std/__cmath/remainder.h +210 -0
  684. cuda/cccl/headers/include/cuda/std/__cmath/roots.h +198 -0
  685. cuda/cccl/headers/include/cuda/std/__cmath/rounding_functions.h +983 -0
  686. cuda/cccl/headers/include/cuda/std/__cmath/signbit.h +56 -0
  687. cuda/cccl/headers/include/cuda/std/__cmath/traits.h +242 -0
  688. cuda/cccl/headers/include/cuda/std/__cmath/trigonometric_functions.h +327 -0
  689. cuda/cccl/headers/include/cuda/std/__complex/arg.h +84 -0
  690. cuda/cccl/headers/include/cuda/std/__complex/complex.h +669 -0
  691. cuda/cccl/headers/include/cuda/std/__complex/exponential_functions.h +411 -0
  692. cuda/cccl/headers/include/cuda/std/__complex/hyperbolic_functions.h +117 -0
  693. cuda/cccl/headers/include/cuda/std/__complex/inverse_hyperbolic_functions.h +216 -0
  694. cuda/cccl/headers/include/cuda/std/__complex/inverse_trigonometric_functions.h +131 -0
  695. cuda/cccl/headers/include/cuda/std/__complex/literals.h +86 -0
  696. cuda/cccl/headers/include/cuda/std/__complex/logarithms.h +303 -0
  697. cuda/cccl/headers/include/cuda/std/__complex/math.h +159 -0
  698. cuda/cccl/headers/include/cuda/std/__complex/nvbf16.h +323 -0
  699. cuda/cccl/headers/include/cuda/std/__complex/nvfp16.h +322 -0
  700. cuda/cccl/headers/include/cuda/std/__complex/roots.h +214 -0
  701. cuda/cccl/headers/include/cuda/std/__complex/trigonometric_functions.h +61 -0
  702. cuda/cccl/headers/include/cuda/std/__complex/tuple.h +107 -0
  703. cuda/cccl/headers/include/cuda/std/__complex/vector_support.h +130 -0
  704. cuda/cccl/headers/include/cuda/std/__concepts/arithmetic.h +56 -0
  705. cuda/cccl/headers/include/cuda/std/__concepts/assignable.h +64 -0
  706. cuda/cccl/headers/include/cuda/std/__concepts/boolean_testable.h +63 -0
  707. cuda/cccl/headers/include/cuda/std/__concepts/class_or_enum.h +45 -0
  708. cuda/cccl/headers/include/cuda/std/__concepts/common_reference_with.h +69 -0
  709. cuda/cccl/headers/include/cuda/std/__concepts/common_with.h +82 -0
  710. cuda/cccl/headers/include/cuda/std/__concepts/concept_macros.h +367 -0
  711. cuda/cccl/headers/include/cuda/std/__concepts/constructible.h +174 -0
  712. cuda/cccl/headers/include/cuda/std/__concepts/convertible_to.h +70 -0
  713. cuda/cccl/headers/include/cuda/std/__concepts/copyable.h +60 -0
  714. cuda/cccl/headers/include/cuda/std/__concepts/derived_from.h +56 -0
  715. cuda/cccl/headers/include/cuda/std/__concepts/destructible.h +76 -0
  716. cuda/cccl/headers/include/cuda/std/__concepts/different_from.h +38 -0
  717. cuda/cccl/headers/include/cuda/std/__concepts/equality_comparable.h +98 -0
  718. cuda/cccl/headers/include/cuda/std/__concepts/invocable.h +80 -0
  719. cuda/cccl/headers/include/cuda/std/__concepts/movable.h +58 -0
  720. cuda/cccl/headers/include/cuda/std/__concepts/predicate.h +54 -0
  721. cuda/cccl/headers/include/cuda/std/__concepts/regular.h +54 -0
  722. cuda/cccl/headers/include/cuda/std/__concepts/relation.h +77 -0
  723. cuda/cccl/headers/include/cuda/std/__concepts/same_as.h +39 -0
  724. cuda/cccl/headers/include/cuda/std/__concepts/semiregular.h +54 -0
  725. cuda/cccl/headers/include/cuda/std/__concepts/swappable.h +206 -0
  726. cuda/cccl/headers/include/cuda/std/__concepts/totally_ordered.h +101 -0
  727. cuda/cccl/headers/include/cuda/std/__cstddef/byte.h +113 -0
  728. cuda/cccl/headers/include/cuda/std/__cstddef/types.h +52 -0
  729. cuda/cccl/headers/include/cuda/std/__cstdlib/abs.h +57 -0
  730. cuda/cccl/headers/include/cuda/std/__cstdlib/aligned_alloc.h +66 -0
  731. cuda/cccl/headers/include/cuda/std/__cstdlib/div.h +96 -0
  732. cuda/cccl/headers/include/cuda/std/__cstdlib/malloc.h +70 -0
  733. cuda/cccl/headers/include/cuda/std/__cstring/memcpy.h +61 -0
  734. cuda/cccl/headers/include/cuda/std/__cstring/memset.h +46 -0
  735. cuda/cccl/headers/include/cuda/std/__exception/cuda_error.h +118 -0
  736. cuda/cccl/headers/include/cuda/std/__exception/exception_macros.h +93 -0
  737. cuda/cccl/headers/include/cuda/std/__exception/terminate.h +73 -0
  738. cuda/cccl/headers/include/cuda/std/__exception/throw_error.h +120 -0
  739. cuda/cccl/headers/include/cuda/std/__execution/env.h +455 -0
  740. cuda/cccl/headers/include/cuda/std/__execution/policy.h +88 -0
  741. cuda/cccl/headers/include/cuda/std/__expected/bad_expected_access.h +127 -0
  742. cuda/cccl/headers/include/cuda/std/__expected/expected.h +1941 -0
  743. cuda/cccl/headers/include/cuda/std/__expected/expected_base.h +1050 -0
  744. cuda/cccl/headers/include/cuda/std/__expected/unexpect.h +37 -0
  745. cuda/cccl/headers/include/cuda/std/__expected/unexpected.h +164 -0
  746. cuda/cccl/headers/include/cuda/std/__floating_point/arithmetic.h +56 -0
  747. cuda/cccl/headers/include/cuda/std/__floating_point/cast.h +812 -0
  748. cuda/cccl/headers/include/cuda/std/__floating_point/cccl_fp.h +125 -0
  749. cuda/cccl/headers/include/cuda/std/__floating_point/common_type.h +48 -0
  750. cuda/cccl/headers/include/cuda/std/__floating_point/constants.h +376 -0
  751. cuda/cccl/headers/include/cuda/std/__floating_point/conversion_rank_order.h +124 -0
  752. cuda/cccl/headers/include/cuda/std/__floating_point/cuda_fp_types.h +116 -0
  753. cuda/cccl/headers/include/cuda/std/__floating_point/decompose.h +69 -0
  754. cuda/cccl/headers/include/cuda/std/__floating_point/format.h +162 -0
  755. cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +40 -0
  756. cuda/cccl/headers/include/cuda/std/__floating_point/mask.h +78 -0
  757. cuda/cccl/headers/include/cuda/std/__floating_point/native_type.h +81 -0
  758. cuda/cccl/headers/include/cuda/std/__floating_point/overflow_handler.h +139 -0
  759. cuda/cccl/headers/include/cuda/std/__floating_point/properties.h +229 -0
  760. cuda/cccl/headers/include/cuda/std/__floating_point/storage.h +248 -0
  761. cuda/cccl/headers/include/cuda/std/__floating_point/traits.h +172 -0
  762. cuda/cccl/headers/include/cuda/std/__format/buffer.h +48 -0
  763. cuda/cccl/headers/include/cuda/std/__format/concepts.h +69 -0
  764. cuda/cccl/headers/include/cuda/std/__format/format_arg.h +282 -0
  765. cuda/cccl/headers/include/cuda/std/__format/format_arg_store.h +279 -0
  766. cuda/cccl/headers/include/cuda/std/__format/format_args.h +122 -0
  767. cuda/cccl/headers/include/cuda/std/__format/format_context.h +92 -0
  768. cuda/cccl/headers/include/cuda/std/__format/format_error.h +76 -0
  769. cuda/cccl/headers/include/cuda/std/__format/format_integral.h +237 -0
  770. cuda/cccl/headers/include/cuda/std/__format/format_parse_context.h +124 -0
  771. cuda/cccl/headers/include/cuda/std/__format/format_spec_parser.h +1230 -0
  772. cuda/cccl/headers/include/cuda/std/__format/formatter.h +59 -0
  773. cuda/cccl/headers/include/cuda/std/__format/formatters/bool.h +101 -0
  774. cuda/cccl/headers/include/cuda/std/__format/formatters/char.h +124 -0
  775. cuda/cccl/headers/include/cuda/std/__format/formatters/fp.h +101 -0
  776. cuda/cccl/headers/include/cuda/std/__format/formatters/int.h +174 -0
  777. cuda/cccl/headers/include/cuda/std/__format/formatters/ptr.h +104 -0
  778. cuda/cccl/headers/include/cuda/std/__format/formatters/str.h +178 -0
  779. cuda/cccl/headers/include/cuda/std/__format/output_utils.h +272 -0
  780. cuda/cccl/headers/include/cuda/std/__format/parse_arg_id.h +138 -0
  781. cuda/cccl/headers/include/cuda/std/__format_ +45 -0
  782. cuda/cccl/headers/include/cuda/std/__functional/binary_function.h +63 -0
  783. cuda/cccl/headers/include/cuda/std/__functional/binary_negate.h +65 -0
  784. cuda/cccl/headers/include/cuda/std/__functional/bind.h +334 -0
  785. cuda/cccl/headers/include/cuda/std/__functional/bind_back.h +80 -0
  786. cuda/cccl/headers/include/cuda/std/__functional/bind_front.h +73 -0
  787. cuda/cccl/headers/include/cuda/std/__functional/binder1st.h +74 -0
  788. cuda/cccl/headers/include/cuda/std/__functional/binder2nd.h +74 -0
  789. cuda/cccl/headers/include/cuda/std/__functional/compose.h +68 -0
  790. cuda/cccl/headers/include/cuda/std/__functional/default_searcher.h +75 -0
  791. cuda/cccl/headers/include/cuda/std/__functional/function.h +1271 -0
  792. cuda/cccl/headers/include/cuda/std/__functional/hash.h +649 -0
  793. cuda/cccl/headers/include/cuda/std/__functional/identity.h +57 -0
  794. cuda/cccl/headers/include/cuda/std/__functional/invoke.h +296 -0
  795. cuda/cccl/headers/include/cuda/std/__functional/is_transparent.h +41 -0
  796. cuda/cccl/headers/include/cuda/std/__functional/mem_fn.h +66 -0
  797. cuda/cccl/headers/include/cuda/std/__functional/mem_fun_ref.h +211 -0
  798. cuda/cccl/headers/include/cuda/std/__functional/not_fn.h +120 -0
  799. cuda/cccl/headers/include/cuda/std/__functional/operations.h +534 -0
  800. cuda/cccl/headers/include/cuda/std/__functional/perfect_forward.h +128 -0
  801. cuda/cccl/headers/include/cuda/std/__functional/pointer_to_binary_function.h +64 -0
  802. cuda/cccl/headers/include/cuda/std/__functional/pointer_to_unary_function.h +63 -0
  803. cuda/cccl/headers/include/cuda/std/__functional/ranges_operations.h +113 -0
  804. cuda/cccl/headers/include/cuda/std/__functional/reference_wrapper.h +113 -0
  805. cuda/cccl/headers/include/cuda/std/__functional/unary_function.h +62 -0
  806. cuda/cccl/headers/include/cuda/std/__functional/unary_negate.h +65 -0
  807. cuda/cccl/headers/include/cuda/std/__functional/unwrap_ref.h +56 -0
  808. cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +262 -0
  809. cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +53 -0
  810. cuda/cccl/headers/include/cuda/std/__fwd/array.h +42 -0
  811. cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +74 -0
  812. cuda/cccl/headers/include/cuda/std/__fwd/complex.h +75 -0
  813. cuda/cccl/headers/include/cuda/std/__fwd/expected.h +46 -0
  814. cuda/cccl/headers/include/cuda/std/__fwd/format.h +84 -0
  815. cuda/cccl/headers/include/cuda/std/__fwd/fp.h +37 -0
  816. cuda/cccl/headers/include/cuda/std/__fwd/get.h +122 -0
  817. cuda/cccl/headers/include/cuda/std/__fwd/hash.h +34 -0
  818. cuda/cccl/headers/include/cuda/std/__fwd/ios.h +123 -0
  819. cuda/cccl/headers/include/cuda/std/__fwd/iterator.h +43 -0
  820. cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +122 -0
  821. cuda/cccl/headers/include/cuda/std/__fwd/memory_resource.h +37 -0
  822. cuda/cccl/headers/include/cuda/std/__fwd/optional.h +39 -0
  823. cuda/cccl/headers/include/cuda/std/__fwd/pair.h +47 -0
  824. cuda/cccl/headers/include/cuda/std/__fwd/reference_wrapper.h +39 -0
  825. cuda/cccl/headers/include/cuda/std/__fwd/span.h +45 -0
  826. cuda/cccl/headers/include/cuda/std/__fwd/string.h +112 -0
  827. cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +91 -0
  828. cuda/cccl/headers/include/cuda/std/__fwd/subrange.h +55 -0
  829. cuda/cccl/headers/include/cuda/std/__fwd/tuple.h +37 -0
  830. cuda/cccl/headers/include/cuda/std/__fwd/unexpected.h +40 -0
  831. cuda/cccl/headers/include/cuda/std/__fwd/variant.h +51 -0
  832. cuda/cccl/headers/include/cuda/std/__internal/cpp_dialect.h +44 -0
  833. cuda/cccl/headers/include/cuda/std/__internal/features.h +86 -0
  834. cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +143 -0
  835. cuda/cccl/headers/include/cuda/std/__iterator/access.h +128 -0
  836. cuda/cccl/headers/include/cuda/std/__iterator/advance.h +228 -0
  837. cuda/cccl/headers/include/cuda/std/__iterator/back_insert_iterator.h +163 -0
  838. cuda/cccl/headers/include/cuda/std/__iterator/bounded_iter.h +253 -0
  839. cuda/cccl/headers/include/cuda/std/__iterator/concepts.h +645 -0
  840. cuda/cccl/headers/include/cuda/std/__iterator/counted_iterator.h +464 -0
  841. cuda/cccl/headers/include/cuda/std/__iterator/data.h +61 -0
  842. cuda/cccl/headers/include/cuda/std/__iterator/default_sentinel.h +36 -0
  843. cuda/cccl/headers/include/cuda/std/__iterator/distance.h +126 -0
  844. cuda/cccl/headers/include/cuda/std/__iterator/empty.h +53 -0
  845. cuda/cccl/headers/include/cuda/std/__iterator/erase_if_container.h +53 -0
  846. cuda/cccl/headers/include/cuda/std/__iterator/front_insert_iterator.h +99 -0
  847. cuda/cccl/headers/include/cuda/std/__iterator/incrementable_traits.h +143 -0
  848. cuda/cccl/headers/include/cuda/std/__iterator/indirectly_comparable.h +55 -0
  849. cuda/cccl/headers/include/cuda/std/__iterator/insert_iterator.h +107 -0
  850. cuda/cccl/headers/include/cuda/std/__iterator/istream_iterator.h +146 -0
  851. cuda/cccl/headers/include/cuda/std/__iterator/istreambuf_iterator.h +161 -0
  852. cuda/cccl/headers/include/cuda/std/__iterator/iter_move.h +161 -0
  853. cuda/cccl/headers/include/cuda/std/__iterator/iter_swap.h +181 -0
  854. cuda/cccl/headers/include/cuda/std/__iterator/iterator.h +44 -0
  855. cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +847 -0
  856. cuda/cccl/headers/include/cuda/std/__iterator/mergeable.h +72 -0
  857. cuda/cccl/headers/include/cuda/std/__iterator/move_iterator.h +432 -0
  858. cuda/cccl/headers/include/cuda/std/__iterator/move_sentinel.h +73 -0
  859. cuda/cccl/headers/include/cuda/std/__iterator/next.h +101 -0
  860. cuda/cccl/headers/include/cuda/std/__iterator/ostream_iterator.h +95 -0
  861. cuda/cccl/headers/include/cuda/std/__iterator/ostreambuf_iterator.h +100 -0
  862. cuda/cccl/headers/include/cuda/std/__iterator/permutable.h +54 -0
  863. cuda/cccl/headers/include/cuda/std/__iterator/prev.h +90 -0
  864. cuda/cccl/headers/include/cuda/std/__iterator/projected.h +61 -0
  865. cuda/cccl/headers/include/cuda/std/__iterator/readable_traits.h +156 -0
  866. cuda/cccl/headers/include/cuda/std/__iterator/reverse_access.h +142 -0
  867. cuda/cccl/headers/include/cuda/std/__iterator/reverse_iterator.h +366 -0
  868. cuda/cccl/headers/include/cuda/std/__iterator/size.h +69 -0
  869. cuda/cccl/headers/include/cuda/std/__iterator/sortable.h +55 -0
  870. cuda/cccl/headers/include/cuda/std/__iterator/unreachable_sentinel.h +84 -0
  871. cuda/cccl/headers/include/cuda/std/__iterator/wrap_iter.h +245 -0
  872. cuda/cccl/headers/include/cuda/std/__latch/latch.h +88 -0
  873. cuda/cccl/headers/include/cuda/std/__limits/numeric_limits.h +603 -0
  874. cuda/cccl/headers/include/cuda/std/__limits/numeric_limits_ext.h +753 -0
  875. cuda/cccl/headers/include/cuda/std/__linalg/conj_if_needed.h +77 -0
  876. cuda/cccl/headers/include/cuda/std/__linalg/conjugate_transposed.h +52 -0
  877. cuda/cccl/headers/include/cuda/std/__linalg/conjugated.h +137 -0
  878. cuda/cccl/headers/include/cuda/std/__linalg/scaled.h +128 -0
  879. cuda/cccl/headers/include/cuda/std/__linalg/transposed.h +316 -0
  880. cuda/cccl/headers/include/cuda/std/__mdspan/aligned_accessor.h +97 -0
  881. cuda/cccl/headers/include/cuda/std/__mdspan/concepts.h +137 -0
  882. cuda/cccl/headers/include/cuda/std/__mdspan/default_accessor.h +73 -0
  883. cuda/cccl/headers/include/cuda/std/__mdspan/empty_base.h +352 -0
  884. cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +753 -0
  885. cuda/cccl/headers/include/cuda/std/__mdspan/layout_left.h +314 -0
  886. cuda/cccl/headers/include/cuda/std/__mdspan/layout_right.h +307 -0
  887. cuda/cccl/headers/include/cuda/std/__mdspan/layout_stride.h +603 -0
  888. cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +512 -0
  889. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_extents.h +193 -0
  890. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_helper.h +189 -0
  891. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_mapping.h +344 -0
  892. cuda/cccl/headers/include/cuda/std/__memory/addressof.h +67 -0
  893. cuda/cccl/headers/include/cuda/std/__memory/align.h +67 -0
  894. cuda/cccl/headers/include/cuda/std/__memory/allocate_at_least.h +81 -0
  895. cuda/cccl/headers/include/cuda/std/__memory/allocation_guard.h +100 -0
  896. cuda/cccl/headers/include/cuda/std/__memory/allocator.h +320 -0
  897. cuda/cccl/headers/include/cuda/std/__memory/allocator_arg_t.h +85 -0
  898. cuda/cccl/headers/include/cuda/std/__memory/allocator_destructor.h +59 -0
  899. cuda/cccl/headers/include/cuda/std/__memory/allocator_traits.h +526 -0
  900. cuda/cccl/headers/include/cuda/std/__memory/assume_aligned.h +60 -0
  901. cuda/cccl/headers/include/cuda/std/__memory/builtin_new_allocator.h +87 -0
  902. cuda/cccl/headers/include/cuda/std/__memory/compressed_pair.h +225 -0
  903. cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +242 -0
  904. cuda/cccl/headers/include/cuda/std/__memory/destruct_n.h +91 -0
  905. cuda/cccl/headers/include/cuda/std/__memory/is_sufficiently_aligned.h +51 -0
  906. cuda/cccl/headers/include/cuda/std/__memory/pointer_traits.h +246 -0
  907. cuda/cccl/headers/include/cuda/std/__memory/runtime_assume_aligned.h +62 -0
  908. cuda/cccl/headers/include/cuda/std/__memory/temporary_buffer.h +92 -0
  909. cuda/cccl/headers/include/cuda/std/__memory/uninitialized_algorithms.h +679 -0
  910. cuda/cccl/headers/include/cuda/std/__memory/unique_ptr.h +765 -0
  911. cuda/cccl/headers/include/cuda/std/__memory/uses_allocator.h +59 -0
  912. cuda/cccl/headers/include/cuda/std/__memory/voidify.h +41 -0
  913. cuda/cccl/headers/include/cuda/std/__memory_ +34 -0
  914. cuda/cccl/headers/include/cuda/std/__new/allocate.h +131 -0
  915. cuda/cccl/headers/include/cuda/std/__new/bad_alloc.h +57 -0
  916. cuda/cccl/headers/include/cuda/std/__new/device_new.h +30 -0
  917. cuda/cccl/headers/include/cuda/std/__new/launder.h +53 -0
  918. cuda/cccl/headers/include/cuda/std/__new_ +30 -0
  919. cuda/cccl/headers/include/cuda/std/__numeric/accumulate.h +56 -0
  920. cuda/cccl/headers/include/cuda/std/__numeric/adjacent_difference.h +72 -0
  921. cuda/cccl/headers/include/cuda/std/__numeric/exclusive_scan.h +66 -0
  922. cuda/cccl/headers/include/cuda/std/__numeric/gcd_lcm.h +78 -0
  923. cuda/cccl/headers/include/cuda/std/__numeric/inclusive_scan.h +73 -0
  924. cuda/cccl/headers/include/cuda/std/__numeric/inner_product.h +62 -0
  925. cuda/cccl/headers/include/cuda/std/__numeric/iota.h +42 -0
  926. cuda/cccl/headers/include/cuda/std/__numeric/midpoint.h +97 -0
  927. cuda/cccl/headers/include/cuda/std/__numeric/partial_sum.h +69 -0
  928. cuda/cccl/headers/include/cuda/std/__numeric/reduce.h +60 -0
  929. cuda/cccl/headers/include/cuda/std/__numeric/transform_exclusive_scan.h +51 -0
  930. cuda/cccl/headers/include/cuda/std/__numeric/transform_inclusive_scan.h +65 -0
  931. cuda/cccl/headers/include/cuda/std/__numeric/transform_reduce.h +72 -0
  932. cuda/cccl/headers/include/cuda/std/__optional/bad_optional_access.h +74 -0
  933. cuda/cccl/headers/include/cuda/std/__optional/hash.h +53 -0
  934. cuda/cccl/headers/include/cuda/std/__optional/make_optional.h +61 -0
  935. cuda/cccl/headers/include/cuda/std/__optional/nullopt.h +43 -0
  936. cuda/cccl/headers/include/cuda/std/__optional/optional.h +860 -0
  937. cuda/cccl/headers/include/cuda/std/__optional/optional_base.h +433 -0
  938. cuda/cccl/headers/include/cuda/std/__optional/optional_ref.h +324 -0
  939. cuda/cccl/headers/include/cuda/std/__random/generate_canonical.h +56 -0
  940. cuda/cccl/headers/include/cuda/std/__random/is_seed_sequence.h +39 -0
  941. cuda/cccl/headers/include/cuda/std/__random/is_valid.h +106 -0
  942. cuda/cccl/headers/include/cuda/std/__random/linear_congruential_engine.h +398 -0
  943. cuda/cccl/headers/include/cuda/std/__random/philox_engine.h +562 -0
  944. cuda/cccl/headers/include/cuda/std/__random/seed_seq.h +204 -0
  945. cuda/cccl/headers/include/cuda/std/__random/uniform_int_distribution.h +335 -0
  946. cuda/cccl/headers/include/cuda/std/__random/uniform_real_distribution.h +183 -0
  947. cuda/cccl/headers/include/cuda/std/__random_ +31 -0
  948. cuda/cccl/headers/include/cuda/std/__ranges/access.h +303 -0
  949. cuda/cccl/headers/include/cuda/std/__ranges/all.h +98 -0
  950. cuda/cccl/headers/include/cuda/std/__ranges/compressed_movable_box.h +892 -0
  951. cuda/cccl/headers/include/cuda/std/__ranges/concepts.h +302 -0
  952. cuda/cccl/headers/include/cuda/std/__ranges/counted.h +90 -0
  953. cuda/cccl/headers/include/cuda/std/__ranges/dangling.h +54 -0
  954. cuda/cccl/headers/include/cuda/std/__ranges/data.h +136 -0
  955. cuda/cccl/headers/include/cuda/std/__ranges/empty.h +109 -0
  956. cuda/cccl/headers/include/cuda/std/__ranges/empty_view.h +77 -0
  957. cuda/cccl/headers/include/cuda/std/__ranges/enable_borrowed_range.h +41 -0
  958. cuda/cccl/headers/include/cuda/std/__ranges/enable_view.h +78 -0
  959. cuda/cccl/headers/include/cuda/std/__ranges/from_range.h +36 -0
  960. cuda/cccl/headers/include/cuda/std/__ranges/iota_view.h +266 -0
  961. cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +408 -0
  962. cuda/cccl/headers/include/cuda/std/__ranges/owning_view.h +162 -0
  963. cuda/cccl/headers/include/cuda/std/__ranges/range_adaptor.h +110 -0
  964. cuda/cccl/headers/include/cuda/std/__ranges/rbegin.h +175 -0
  965. cuda/cccl/headers/include/cuda/std/__ranges/ref_view.h +121 -0
  966. cuda/cccl/headers/include/cuda/std/__ranges/rend.h +182 -0
  967. cuda/cccl/headers/include/cuda/std/__ranges/repeat_view.h +345 -0
  968. cuda/cccl/headers/include/cuda/std/__ranges/single_view.h +155 -0
  969. cuda/cccl/headers/include/cuda/std/__ranges/size.h +201 -0
  970. cuda/cccl/headers/include/cuda/std/__ranges/subrange.h +513 -0
  971. cuda/cccl/headers/include/cuda/std/__ranges/take_view.h +476 -0
  972. cuda/cccl/headers/include/cuda/std/__ranges/take_while_view.h +259 -0
  973. cuda/cccl/headers/include/cuda/std/__ranges/transform_view.h +522 -0
  974. cuda/cccl/headers/include/cuda/std/__ranges/unwrap_end.h +53 -0
  975. cuda/cccl/headers/include/cuda/std/__ranges/view_interface.h +183 -0
  976. cuda/cccl/headers/include/cuda/std/__ranges/views.h +38 -0
  977. cuda/cccl/headers/include/cuda/std/__semaphore/atomic_semaphore.h +234 -0
  978. cuda/cccl/headers/include/cuda/std/__semaphore/counting_semaphore.h +51 -0
  979. cuda/cccl/headers/include/cuda/std/__string/char_traits.h +191 -0
  980. cuda/cccl/headers/include/cuda/std/__string/constexpr_c_functions.h +581 -0
  981. cuda/cccl/headers/include/cuda/std/__string/helper_functions.h +296 -0
  982. cuda/cccl/headers/include/cuda/std/__string/string_view.h +244 -0
  983. cuda/cccl/headers/include/cuda/std/__string_ +29 -0
  984. cuda/cccl/headers/include/cuda/std/__system_error/errc.h +51 -0
  985. cuda/cccl/headers/include/cuda/std/__system_error_ +26 -0
  986. cuda/cccl/headers/include/cuda/std/__thread/threading_support.h +106 -0
  987. cuda/cccl/headers/include/cuda/std/__thread/threading_support_cuda.h +47 -0
  988. cuda/cccl/headers/include/cuda/std/__thread/threading_support_external.h +41 -0
  989. cuda/cccl/headers/include/cuda/std/__thread/threading_support_pthread.h +143 -0
  990. cuda/cccl/headers/include/cuda/std/__thread/threading_support_win32.h +87 -0
  991. cuda/cccl/headers/include/cuda/std/__tuple_dir/apply.h +82 -0
  992. cuda/cccl/headers/include/cuda/std/__tuple_dir/get.h +122 -0
  993. cuda/cccl/headers/include/cuda/std/__tuple_dir/ignore.h +51 -0
  994. cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +120 -0
  995. cuda/cccl/headers/include/cuda/std/__tuple_dir/sfinae_helpers.h +100 -0
  996. cuda/cccl/headers/include/cuda/std/__tuple_dir/structured_bindings.h +212 -0
  997. cuda/cccl/headers/include/cuda/std/__tuple_dir/tie.h +55 -0
  998. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple.h +457 -0
  999. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_cat.h +158 -0
  1000. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_constraints.h +286 -0
  1001. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_element.h +77 -0
  1002. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_indices.h +44 -0
  1003. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_leaf.h +452 -0
  1004. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +83 -0
  1005. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +68 -0
  1006. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_size.h +79 -0
  1007. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_types.h +35 -0
  1008. cuda/cccl/headers/include/cuda/std/__tuple_dir/vector_types.h +290 -0
  1009. cuda/cccl/headers/include/cuda/std/__type_traits/add_const.h +40 -0
  1010. cuda/cccl/headers/include/cuda/std/__type_traits/add_cv.h +40 -0
  1011. cuda/cccl/headers/include/cuda/std/__type_traits/add_lvalue_reference.h +62 -0
  1012. cuda/cccl/headers/include/cuda/std/__type_traits/add_pointer.h +65 -0
  1013. cuda/cccl/headers/include/cuda/std/__type_traits/add_rvalue_reference.h +62 -0
  1014. cuda/cccl/headers/include/cuda/std/__type_traits/add_volatile.h +40 -0
  1015. cuda/cccl/headers/include/cuda/std/__type_traits/aligned_storage.h +149 -0
  1016. cuda/cccl/headers/include/cuda/std/__type_traits/aligned_union.h +62 -0
  1017. cuda/cccl/headers/include/cuda/std/__type_traits/alignment_of.h +41 -0
  1018. cuda/cccl/headers/include/cuda/std/__type_traits/always_false.h +35 -0
  1019. cuda/cccl/headers/include/cuda/std/__type_traits/can_extract_key.h +68 -0
  1020. cuda/cccl/headers/include/cuda/std/__type_traits/common_reference.h +262 -0
  1021. cuda/cccl/headers/include/cuda/std/__type_traits/common_type.h +173 -0
  1022. cuda/cccl/headers/include/cuda/std/__type_traits/conditional.h +65 -0
  1023. cuda/cccl/headers/include/cuda/std/__type_traits/conjunction.h +67 -0
  1024. cuda/cccl/headers/include/cuda/std/__type_traits/copy_cv.h +50 -0
  1025. cuda/cccl/headers/include/cuda/std/__type_traits/copy_cvref.h +148 -0
  1026. cuda/cccl/headers/include/cuda/std/__type_traits/decay.h +83 -0
  1027. cuda/cccl/headers/include/cuda/std/__type_traits/dependent_type.h +35 -0
  1028. cuda/cccl/headers/include/cuda/std/__type_traits/disjunction.h +77 -0
  1029. cuda/cccl/headers/include/cuda/std/__type_traits/enable_if.h +43 -0
  1030. cuda/cccl/headers/include/cuda/std/__type_traits/extent.h +68 -0
  1031. cuda/cccl/headers/include/cuda/std/__type_traits/fold.h +47 -0
  1032. cuda/cccl/headers/include/cuda/std/__type_traits/has_unique_object_representation.h +46 -0
  1033. cuda/cccl/headers/include/cuda/std/__type_traits/has_virtual_destructor.h +42 -0
  1034. cuda/cccl/headers/include/cuda/std/__type_traits/integral_constant.h +62 -0
  1035. cuda/cccl/headers/include/cuda/std/__type_traits/is_abstract.h +42 -0
  1036. cuda/cccl/headers/include/cuda/std/__type_traits/is_aggregate.h +42 -0
  1037. cuda/cccl/headers/include/cuda/std/__type_traits/is_allocator.h +46 -0
  1038. cuda/cccl/headers/include/cuda/std/__type_traits/is_arithmetic.h +42 -0
  1039. cuda/cccl/headers/include/cuda/std/__type_traits/is_array.h +62 -0
  1040. cuda/cccl/headers/include/cuda/std/__type_traits/is_assignable.h +78 -0
  1041. cuda/cccl/headers/include/cuda/std/__type_traits/is_base_of.h +42 -0
  1042. cuda/cccl/headers/include/cuda/std/__type_traits/is_bounded_array.h +44 -0
  1043. cuda/cccl/headers/include/cuda/std/__type_traits/is_callable.h +60 -0
  1044. cuda/cccl/headers/include/cuda/std/__type_traits/is_char_like_type.h +38 -0
  1045. cuda/cccl/headers/include/cuda/std/__type_traits/is_class.h +42 -0
  1046. cuda/cccl/headers/include/cuda/std/__type_traits/is_comparable.h +78 -0
  1047. cuda/cccl/headers/include/cuda/std/__type_traits/is_compound.h +58 -0
  1048. cuda/cccl/headers/include/cuda/std/__type_traits/is_const.h +56 -0
  1049. cuda/cccl/headers/include/cuda/std/__type_traits/is_constant_evaluated.h +51 -0
  1050. cuda/cccl/headers/include/cuda/std/__type_traits/is_constructible.h +174 -0
  1051. cuda/cccl/headers/include/cuda/std/__type_traits/is_convertible.h +211 -0
  1052. cuda/cccl/headers/include/cuda/std/__type_traits/is_copy_assignable.h +43 -0
  1053. cuda/cccl/headers/include/cuda/std/__type_traits/is_copy_constructible.h +43 -0
  1054. cuda/cccl/headers/include/cuda/std/__type_traits/is_core_convertible.h +47 -0
  1055. cuda/cccl/headers/include/cuda/std/__type_traits/is_corresponding_member.h +42 -0
  1056. cuda/cccl/headers/include/cuda/std/__type_traits/is_default_constructible.h +40 -0
  1057. cuda/cccl/headers/include/cuda/std/__type_traits/is_destructible.h +115 -0
  1058. cuda/cccl/headers/include/cuda/std/__type_traits/is_empty.h +42 -0
  1059. cuda/cccl/headers/include/cuda/std/__type_traits/is_enum.h +42 -0
  1060. cuda/cccl/headers/include/cuda/std/__type_traits/is_execution_policy.h +81 -0
  1061. cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_arithmetic.h +38 -0
  1062. cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_floating_point.h +79 -0
  1063. cuda/cccl/headers/include/cuda/std/__type_traits/is_final.h +42 -0
  1064. cuda/cccl/headers/include/cuda/std/__type_traits/is_floating_point.h +53 -0
  1065. cuda/cccl/headers/include/cuda/std/__type_traits/is_fully_bounded_array.h +47 -0
  1066. cuda/cccl/headers/include/cuda/std/__type_traits/is_function.h +61 -0
  1067. cuda/cccl/headers/include/cuda/std/__type_traits/is_fundamental.h +56 -0
  1068. cuda/cccl/headers/include/cuda/std/__type_traits/is_implicitly_default_constructible.h +57 -0
  1069. cuda/cccl/headers/include/cuda/std/__type_traits/is_integer.h +45 -0
  1070. cuda/cccl/headers/include/cuda/std/__type_traits/is_integral.h +123 -0
  1071. cuda/cccl/headers/include/cuda/std/__type_traits/is_layout_compatible.h +45 -0
  1072. cuda/cccl/headers/include/cuda/std/__type_traits/is_literal_type.h +42 -0
  1073. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_function_pointer.h +79 -0
  1074. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_object_pointer.h +57 -0
  1075. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_pointer.h +57 -0
  1076. cuda/cccl/headers/include/cuda/std/__type_traits/is_move_assignable.h +43 -0
  1077. cuda/cccl/headers/include/cuda/std/__type_traits/is_move_constructible.h +42 -0
  1078. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_assignable.h +70 -0
  1079. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_constructible.h +84 -0
  1080. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_convertible.h +59 -0
  1081. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_copy_assignable.h +60 -0
  1082. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_copy_constructible.h +43 -0
  1083. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_default_constructible.h +54 -0
  1084. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_destructible.h +82 -0
  1085. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_move_assignable.h +60 -0
  1086. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_move_constructible.h +42 -0
  1087. cuda/cccl/headers/include/cuda/std/__type_traits/is_null_pointer.h +43 -0
  1088. cuda/cccl/headers/include/cuda/std/__type_traits/is_object.h +57 -0
  1089. cuda/cccl/headers/include/cuda/std/__type_traits/is_one_of.h +37 -0
  1090. cuda/cccl/headers/include/cuda/std/__type_traits/is_pod.h +42 -0
  1091. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer.h +60 -0
  1092. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer_interconvertible_base_of.h +84 -0
  1093. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer_interconvertible_with_class.h +42 -0
  1094. cuda/cccl/headers/include/cuda/std/__type_traits/is_polymorphic.h +42 -0
  1095. cuda/cccl/headers/include/cuda/std/__type_traits/is_primary_template.h +121 -0
  1096. cuda/cccl/headers/include/cuda/std/__type_traits/is_reference.h +95 -0
  1097. cuda/cccl/headers/include/cuda/std/__type_traits/is_referenceable.h +55 -0
  1098. cuda/cccl/headers/include/cuda/std/__type_traits/is_same.h +88 -0
  1099. cuda/cccl/headers/include/cuda/std/__type_traits/is_scalar.h +60 -0
  1100. cuda/cccl/headers/include/cuda/std/__type_traits/is_scoped_enum.h +49 -0
  1101. cuda/cccl/headers/include/cuda/std/__type_traits/is_signed.h +65 -0
  1102. cuda/cccl/headers/include/cuda/std/__type_traits/is_signed_integer.h +59 -0
  1103. cuda/cccl/headers/include/cuda/std/__type_traits/is_standard_layout.h +42 -0
  1104. cuda/cccl/headers/include/cuda/std/__type_traits/is_swappable.h +200 -0
  1105. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivial.h +42 -0
  1106. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_assignable.h +43 -0
  1107. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_constructible.h +43 -0
  1108. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_assignable.h +46 -0
  1109. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_constructible.h +45 -0
  1110. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copyable.h +42 -0
  1111. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_default_constructible.h +42 -0
  1112. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_destructible.h +58 -0
  1113. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_assignable.h +45 -0
  1114. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_constructible.h +44 -0
  1115. cuda/cccl/headers/include/cuda/std/__type_traits/is_unbounded_array.h +43 -0
  1116. cuda/cccl/headers/include/cuda/std/__type_traits/is_union.h +42 -0
  1117. cuda/cccl/headers/include/cuda/std/__type_traits/is_unsigned.h +66 -0
  1118. cuda/cccl/headers/include/cuda/std/__type_traits/is_unsigned_integer.h +59 -0
  1119. cuda/cccl/headers/include/cuda/std/__type_traits/is_valid_expansion.h +41 -0
  1120. cuda/cccl/headers/include/cuda/std/__type_traits/is_void.h +55 -0
  1121. cuda/cccl/headers/include/cuda/std/__type_traits/is_volatile.h +56 -0
  1122. cuda/cccl/headers/include/cuda/std/__type_traits/lazy.h +35 -0
  1123. cuda/cccl/headers/include/cuda/std/__type_traits/make_const_lvalue_ref.h +36 -0
  1124. cuda/cccl/headers/include/cuda/std/__type_traits/make_nbit_int.h +107 -0
  1125. cuda/cccl/headers/include/cuda/std/__type_traits/make_signed.h +140 -0
  1126. cuda/cccl/headers/include/cuda/std/__type_traits/make_unsigned.h +151 -0
  1127. cuda/cccl/headers/include/cuda/std/__type_traits/maybe_const.h +36 -0
  1128. cuda/cccl/headers/include/cuda/std/__type_traits/nat.h +39 -0
  1129. cuda/cccl/headers/include/cuda/std/__type_traits/negation.h +44 -0
  1130. cuda/cccl/headers/include/cuda/std/__type_traits/num_bits.h +122 -0
  1131. cuda/cccl/headers/include/cuda/std/__type_traits/promote.h +163 -0
  1132. cuda/cccl/headers/include/cuda/std/__type_traits/rank.h +60 -0
  1133. cuda/cccl/headers/include/cuda/std/__type_traits/reference_constructs_from_temporary.h +57 -0
  1134. cuda/cccl/headers/include/cuda/std/__type_traits/reference_converts_from_temporary.h +56 -0
  1135. cuda/cccl/headers/include/cuda/std/__type_traits/remove_all_extents.h +66 -0
  1136. cuda/cccl/headers/include/cuda/std/__type_traits/remove_const.h +59 -0
  1137. cuda/cccl/headers/include/cuda/std/__type_traits/remove_const_ref.h +37 -0
  1138. cuda/cccl/headers/include/cuda/std/__type_traits/remove_cv.h +57 -0
  1139. cuda/cccl/headers/include/cuda/std/__type_traits/remove_cvref.h +57 -0
  1140. cuda/cccl/headers/include/cuda/std/__type_traits/remove_extent.h +65 -0
  1141. cuda/cccl/headers/include/cuda/std/__type_traits/remove_pointer.h +73 -0
  1142. cuda/cccl/headers/include/cuda/std/__type_traits/remove_reference.h +72 -0
  1143. cuda/cccl/headers/include/cuda/std/__type_traits/remove_volatile.h +58 -0
  1144. cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +47 -0
  1145. cuda/cccl/headers/include/cuda/std/__type_traits/type_identity.h +40 -0
  1146. cuda/cccl/headers/include/cuda/std/__type_traits/type_list.h +1067 -0
  1147. cuda/cccl/headers/include/cuda/std/__type_traits/type_set.h +131 -0
  1148. cuda/cccl/headers/include/cuda/std/__type_traits/underlying_type.h +52 -0
  1149. cuda/cccl/headers/include/cuda/std/__type_traits/void_t.h +34 -0
  1150. cuda/cccl/headers/include/cuda/std/__utility/as_const.h +52 -0
  1151. cuda/cccl/headers/include/cuda/std/__utility/auto_cast.h +34 -0
  1152. cuda/cccl/headers/include/cuda/std/__utility/cmp.h +116 -0
  1153. cuda/cccl/headers/include/cuda/std/__utility/convert_to_integral.h +101 -0
  1154. cuda/cccl/headers/include/cuda/std/__utility/declval.h +76 -0
  1155. cuda/cccl/headers/include/cuda/std/__utility/exception_guard.h +161 -0
  1156. cuda/cccl/headers/include/cuda/std/__utility/exchange.h +46 -0
  1157. cuda/cccl/headers/include/cuda/std/__utility/forward.h +59 -0
  1158. cuda/cccl/headers/include/cuda/std/__utility/forward_like.h +55 -0
  1159. cuda/cccl/headers/include/cuda/std/__utility/in_place.h +86 -0
  1160. cuda/cccl/headers/include/cuda/std/__utility/integer_sequence.h +249 -0
  1161. cuda/cccl/headers/include/cuda/std/__utility/monostate.h +99 -0
  1162. cuda/cccl/headers/include/cuda/std/__utility/move.h +74 -0
  1163. cuda/cccl/headers/include/cuda/std/__utility/pair.h +791 -0
  1164. cuda/cccl/headers/include/cuda/std/__utility/piecewise_construct.h +37 -0
  1165. cuda/cccl/headers/include/cuda/std/__utility/pod_tuple.h +527 -0
  1166. cuda/cccl/headers/include/cuda/std/__utility/priority_tag.h +40 -0
  1167. cuda/cccl/headers/include/cuda/std/__utility/rel_ops.h +61 -0
  1168. cuda/cccl/headers/include/cuda/std/__utility/swap.h +64 -0
  1169. cuda/cccl/headers/include/cuda/std/__utility/to_underlying.h +40 -0
  1170. cuda/cccl/headers/include/cuda/std/__utility/typeid.h +421 -0
  1171. cuda/cccl/headers/include/cuda/std/__utility/undefined.h +34 -0
  1172. cuda/cccl/headers/include/cuda/std/__utility/unreachable.h +37 -0
  1173. cuda/cccl/headers/include/cuda/std/__variant/bad_variant_access.h +74 -0
  1174. cuda/cccl/headers/include/cuda/std/__variant/comparison.h +207 -0
  1175. cuda/cccl/headers/include/cuda/std/__variant/get.h +192 -0
  1176. cuda/cccl/headers/include/cuda/std/__variant/hash.h +82 -0
  1177. cuda/cccl/headers/include/cuda/std/__variant/sfinae_helpers.h +89 -0
  1178. cuda/cccl/headers/include/cuda/std/__variant/variant.h +250 -0
  1179. cuda/cccl/headers/include/cuda/std/__variant/variant_access.h +70 -0
  1180. cuda/cccl/headers/include/cuda/std/__variant/variant_base.h +683 -0
  1181. cuda/cccl/headers/include/cuda/std/__variant/variant_constraints.h +135 -0
  1182. cuda/cccl/headers/include/cuda/std/__variant/variant_match.h +126 -0
  1183. cuda/cccl/headers/include/cuda/std/__variant/variant_traits.h +184 -0
  1184. cuda/cccl/headers/include/cuda/std/__variant/variant_visit.h +225 -0
  1185. cuda/cccl/headers/include/cuda/std/__variant/visit.h +148 -0
  1186. cuda/cccl/headers/include/cuda/std/array +518 -0
  1187. cuda/cccl/headers/include/cuda/std/atomic +810 -0
  1188. cuda/cccl/headers/include/cuda/std/barrier +42 -0
  1189. cuda/cccl/headers/include/cuda/std/bit +35 -0
  1190. cuda/cccl/headers/include/cuda/std/bitset +986 -0
  1191. cuda/cccl/headers/include/cuda/std/cassert +28 -0
  1192. cuda/cccl/headers/include/cuda/std/ccomplex +15 -0
  1193. cuda/cccl/headers/include/cuda/std/cfloat +59 -0
  1194. cuda/cccl/headers/include/cuda/std/chrono +26 -0
  1195. cuda/cccl/headers/include/cuda/std/climits +61 -0
  1196. cuda/cccl/headers/include/cuda/std/cmath +87 -0
  1197. cuda/cccl/headers/include/cuda/std/complex +50 -0
  1198. cuda/cccl/headers/include/cuda/std/concepts +48 -0
  1199. cuda/cccl/headers/include/cuda/std/cstddef +28 -0
  1200. cuda/cccl/headers/include/cuda/std/cstdint +178 -0
  1201. cuda/cccl/headers/include/cuda/std/cstdlib +30 -0
  1202. cuda/cccl/headers/include/cuda/std/cstring +110 -0
  1203. cuda/cccl/headers/include/cuda/std/ctime +154 -0
  1204. cuda/cccl/headers/include/cuda/std/detail/__config +45 -0
  1205. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +207 -0
  1206. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/algorithm +1718 -0
  1207. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/chrono +2506 -0
  1208. cuda/cccl/headers/include/cuda/std/execution +29 -0
  1209. cuda/cccl/headers/include/cuda/std/expected +30 -0
  1210. cuda/cccl/headers/include/cuda/std/functional +56 -0
  1211. cuda/cccl/headers/include/cuda/std/initializer_list +44 -0
  1212. cuda/cccl/headers/include/cuda/std/inplace_vector +2171 -0
  1213. cuda/cccl/headers/include/cuda/std/iterator +70 -0
  1214. cuda/cccl/headers/include/cuda/std/latch +34 -0
  1215. cuda/cccl/headers/include/cuda/std/limits +28 -0
  1216. cuda/cccl/headers/include/cuda/std/linalg +30 -0
  1217. cuda/cccl/headers/include/cuda/std/mdspan +38 -0
  1218. cuda/cccl/headers/include/cuda/std/memory +39 -0
  1219. cuda/cccl/headers/include/cuda/std/numbers +344 -0
  1220. cuda/cccl/headers/include/cuda/std/numeric +41 -0
  1221. cuda/cccl/headers/include/cuda/std/optional +31 -0
  1222. cuda/cccl/headers/include/cuda/std/ranges +69 -0
  1223. cuda/cccl/headers/include/cuda/std/ratio +416 -0
  1224. cuda/cccl/headers/include/cuda/std/semaphore +31 -0
  1225. cuda/cccl/headers/include/cuda/std/source_location +83 -0
  1226. cuda/cccl/headers/include/cuda/std/span +628 -0
  1227. cuda/cccl/headers/include/cuda/std/string_view +923 -0
  1228. cuda/cccl/headers/include/cuda/std/tuple +43 -0
  1229. cuda/cccl/headers/include/cuda/std/type_traits +176 -0
  1230. cuda/cccl/headers/include/cuda/std/utility +70 -0
  1231. cuda/cccl/headers/include/cuda/std/variant +32 -0
  1232. cuda/cccl/headers/include/cuda/std/version +240 -0
  1233. cuda/cccl/headers/include/cuda/stream +31 -0
  1234. cuda/cccl/headers/include/cuda/stream_ref +59 -0
  1235. cuda/cccl/headers/include/cuda/type_traits +27 -0
  1236. cuda/cccl/headers/include/cuda/utility +28 -0
  1237. cuda/cccl/headers/include/cuda/version +16 -0
  1238. cuda/cccl/headers/include/cuda/warp +28 -0
  1239. cuda/cccl/headers/include/cuda/work_stealing +26 -0
  1240. cuda/cccl/headers/include/nv/detail/__preprocessor +169 -0
  1241. cuda/cccl/headers/include/nv/detail/__target_macros +718 -0
  1242. cuda/cccl/headers/include/nv/target +236 -0
  1243. cuda/cccl/headers/include/thrust/addressof.h +22 -0
  1244. cuda/cccl/headers/include/thrust/adjacent_difference.h +254 -0
  1245. cuda/cccl/headers/include/thrust/advance.h +57 -0
  1246. cuda/cccl/headers/include/thrust/allocate_unique.h +299 -0
  1247. cuda/cccl/headers/include/thrust/binary_search.h +1910 -0
  1248. cuda/cccl/headers/include/thrust/complex.h +858 -0
  1249. cuda/cccl/headers/include/thrust/copy.h +506 -0
  1250. cuda/cccl/headers/include/thrust/count.h +245 -0
  1251. cuda/cccl/headers/include/thrust/detail/adjacent_difference.inl +108 -0
  1252. cuda/cccl/headers/include/thrust/detail/alignment.h +81 -0
  1253. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +624 -0
  1254. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +191 -0
  1255. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +95 -0
  1256. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +79 -0
  1257. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +76 -0
  1258. cuda/cccl/headers/include/thrust/detail/allocator/no_throw_allocator.h +74 -0
  1259. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +113 -0
  1260. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +114 -0
  1261. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.h +77 -0
  1262. cuda/cccl/headers/include/thrust/detail/allocator_aware_execution_policy.h +95 -0
  1263. cuda/cccl/headers/include/thrust/detail/binary_search.inl +537 -0
  1264. cuda/cccl/headers/include/thrust/detail/caching_allocator.h +47 -0
  1265. cuda/cccl/headers/include/thrust/detail/complex/arithmetic.h +250 -0
  1266. cuda/cccl/headers/include/thrust/detail/complex/c99math.h +58 -0
  1267. cuda/cccl/headers/include/thrust/detail/complex/catrig.h +869 -0
  1268. cuda/cccl/headers/include/thrust/detail/complex/catrigf.h +583 -0
  1269. cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +227 -0
  1270. cuda/cccl/headers/include/thrust/detail/complex/ccoshf.h +155 -0
  1271. cuda/cccl/headers/include/thrust/detail/complex/cexp.h +190 -0
  1272. cuda/cccl/headers/include/thrust/detail/complex/cexpf.h +167 -0
  1273. cuda/cccl/headers/include/thrust/detail/complex/clog.h +217 -0
  1274. cuda/cccl/headers/include/thrust/detail/complex/clogf.h +204 -0
  1275. cuda/cccl/headers/include/thrust/detail/complex/complex.inl +263 -0
  1276. cuda/cccl/headers/include/thrust/detail/complex/cpow.h +50 -0
  1277. cuda/cccl/headers/include/thrust/detail/complex/cproj.h +76 -0
  1278. cuda/cccl/headers/include/thrust/detail/complex/csinh.h +222 -0
  1279. cuda/cccl/headers/include/thrust/detail/complex/csinhf.h +162 -0
  1280. cuda/cccl/headers/include/thrust/detail/complex/csqrt.h +172 -0
  1281. cuda/cccl/headers/include/thrust/detail/complex/csqrtf.h +168 -0
  1282. cuda/cccl/headers/include/thrust/detail/complex/ctanh.h +202 -0
  1283. cuda/cccl/headers/include/thrust/detail/complex/ctanhf.h +127 -0
  1284. cuda/cccl/headers/include/thrust/detail/complex/math_private.h +132 -0
  1285. cuda/cccl/headers/include/thrust/detail/complex/stream.h +73 -0
  1286. cuda/cccl/headers/include/thrust/detail/config/compiler.h +38 -0
  1287. cuda/cccl/headers/include/thrust/detail/config/config.h +43 -0
  1288. cuda/cccl/headers/include/thrust/detail/config/cpp_dialect.h +78 -0
  1289. cuda/cccl/headers/include/thrust/detail/config/device_system.h +57 -0
  1290. cuda/cccl/headers/include/thrust/detail/config/host_system.h +50 -0
  1291. cuda/cccl/headers/include/thrust/detail/config/memory_resource.h +41 -0
  1292. cuda/cccl/headers/include/thrust/detail/config/namespace.h +161 -0
  1293. cuda/cccl/headers/include/thrust/detail/config/simple_defines.h +48 -0
  1294. cuda/cccl/headers/include/thrust/detail/config.h +36 -0
  1295. cuda/cccl/headers/include/thrust/detail/contiguous_storage.h +226 -0
  1296. cuda/cccl/headers/include/thrust/detail/contiguous_storage.inl +271 -0
  1297. cuda/cccl/headers/include/thrust/detail/copy.h +70 -0
  1298. cuda/cccl/headers/include/thrust/detail/copy.inl +139 -0
  1299. cuda/cccl/headers/include/thrust/detail/copy_if.h +62 -0
  1300. cuda/cccl/headers/include/thrust/detail/copy_if.inl +114 -0
  1301. cuda/cccl/headers/include/thrust/detail/count.h +55 -0
  1302. cuda/cccl/headers/include/thrust/detail/count.inl +101 -0
  1303. cuda/cccl/headers/include/thrust/detail/device_ptr.inl +48 -0
  1304. cuda/cccl/headers/include/thrust/detail/equal.inl +105 -0
  1305. cuda/cccl/headers/include/thrust/detail/event_error.h +160 -0
  1306. cuda/cccl/headers/include/thrust/detail/execute_with_allocator.h +79 -0
  1307. cuda/cccl/headers/include/thrust/detail/execute_with_allocator_fwd.h +61 -0
  1308. cuda/cccl/headers/include/thrust/detail/execution_policy.h +120 -0
  1309. cuda/cccl/headers/include/thrust/detail/extrema.inl +196 -0
  1310. cuda/cccl/headers/include/thrust/detail/fill.inl +98 -0
  1311. cuda/cccl/headers/include/thrust/detail/find.inl +125 -0
  1312. cuda/cccl/headers/include/thrust/detail/for_each.inl +96 -0
  1313. cuda/cccl/headers/include/thrust/detail/function.h +49 -0
  1314. cuda/cccl/headers/include/thrust/detail/functional/actor.h +211 -0
  1315. cuda/cccl/headers/include/thrust/detail/functional/operators.h +383 -0
  1316. cuda/cccl/headers/include/thrust/detail/gather.inl +185 -0
  1317. cuda/cccl/headers/include/thrust/detail/generate.inl +98 -0
  1318. cuda/cccl/headers/include/thrust/detail/get_iterator_value.h +60 -0
  1319. cuda/cccl/headers/include/thrust/detail/inner_product.inl +130 -0
  1320. cuda/cccl/headers/include/thrust/detail/internal_functional.h +329 -0
  1321. cuda/cccl/headers/include/thrust/detail/logical.inl +125 -0
  1322. cuda/cccl/headers/include/thrust/detail/malloc_and_free.h +89 -0
  1323. cuda/cccl/headers/include/thrust/detail/malloc_and_free_fwd.h +45 -0
  1324. cuda/cccl/headers/include/thrust/detail/memory_algorithms.h +209 -0
  1325. cuda/cccl/headers/include/thrust/detail/merge.inl +288 -0
  1326. cuda/cccl/headers/include/thrust/detail/mismatch.inl +106 -0
  1327. cuda/cccl/headers/include/thrust/detail/overlapped_copy.h +120 -0
  1328. cuda/cccl/headers/include/thrust/detail/partition.inl +390 -0
  1329. cuda/cccl/headers/include/thrust/detail/pointer.h +309 -0
  1330. cuda/cccl/headers/include/thrust/detail/preprocessor.h +652 -0
  1331. cuda/cccl/headers/include/thrust/detail/random_bijection.h +175 -0
  1332. cuda/cccl/headers/include/thrust/detail/range/head_flags.h +114 -0
  1333. cuda/cccl/headers/include/thrust/detail/range/tail_flags.h +128 -0
  1334. cuda/cccl/headers/include/thrust/detail/raw_pointer_cast.h +52 -0
  1335. cuda/cccl/headers/include/thrust/detail/raw_reference_cast.h +186 -0
  1336. cuda/cccl/headers/include/thrust/detail/reduce.inl +395 -0
  1337. cuda/cccl/headers/include/thrust/detail/reference.h +518 -0
  1338. cuda/cccl/headers/include/thrust/detail/reference_forward_declaration.h +35 -0
  1339. cuda/cccl/headers/include/thrust/detail/remove.inl +225 -0
  1340. cuda/cccl/headers/include/thrust/detail/replace.inl +243 -0
  1341. cuda/cccl/headers/include/thrust/detail/reverse.inl +100 -0
  1342. cuda/cccl/headers/include/thrust/detail/scan.inl +536 -0
  1343. cuda/cccl/headers/include/thrust/detail/scatter.inl +169 -0
  1344. cuda/cccl/headers/include/thrust/detail/seq.h +66 -0
  1345. cuda/cccl/headers/include/thrust/detail/sequence.inl +121 -0
  1346. cuda/cccl/headers/include/thrust/detail/set_operations.inl +993 -0
  1347. cuda/cccl/headers/include/thrust/detail/shuffle.inl +86 -0
  1348. cuda/cccl/headers/include/thrust/detail/sort.inl +385 -0
  1349. cuda/cccl/headers/include/thrust/detail/static_assert.h +56 -0
  1350. cuda/cccl/headers/include/thrust/detail/static_map.h +164 -0
  1351. cuda/cccl/headers/include/thrust/detail/swap_ranges.inl +77 -0
  1352. cuda/cccl/headers/include/thrust/detail/tabulate.inl +74 -0
  1353. cuda/cccl/headers/include/thrust/detail/temporary_array.h +149 -0
  1354. cuda/cccl/headers/include/thrust/detail/temporary_array.inl +119 -0
  1355. cuda/cccl/headers/include/thrust/detail/temporary_buffer.h +92 -0
  1356. cuda/cccl/headers/include/thrust/detail/transform_reduce.inl +81 -0
  1357. cuda/cccl/headers/include/thrust/detail/transform_scan.inl +173 -0
  1358. cuda/cccl/headers/include/thrust/detail/trivial_sequence.h +128 -0
  1359. cuda/cccl/headers/include/thrust/detail/tuple_meta_transform.h +59 -0
  1360. cuda/cccl/headers/include/thrust/detail/type_deduction.h +62 -0
  1361. cuda/cccl/headers/include/thrust/detail/type_traits/has_member_function.h +47 -0
  1362. cuda/cccl/headers/include/thrust/detail/type_traits/has_nested_type.h +43 -0
  1363. cuda/cccl/headers/include/thrust/detail/type_traits/is_call_possible.h +162 -0
  1364. cuda/cccl/headers/include/thrust/detail/type_traits/is_commutative.h +67 -0
  1365. cuda/cccl/headers/include/thrust/detail/type_traits/is_metafunction_defined.h +39 -0
  1366. cuda/cccl/headers/include/thrust/detail/type_traits/is_thrust_pointer.h +55 -0
  1367. cuda/cccl/headers/include/thrust/detail/type_traits/iterator/is_output_iterator.h +46 -0
  1368. cuda/cccl/headers/include/thrust/detail/type_traits/minimum_type.h +89 -0
  1369. cuda/cccl/headers/include/thrust/detail/type_traits/pointer_traits.h +328 -0
  1370. cuda/cccl/headers/include/thrust/detail/type_traits.h +136 -0
  1371. cuda/cccl/headers/include/thrust/detail/uninitialized_copy.inl +102 -0
  1372. cuda/cccl/headers/include/thrust/detail/uninitialized_fill.inl +98 -0
  1373. cuda/cccl/headers/include/thrust/detail/unique.inl +391 -0
  1374. cuda/cccl/headers/include/thrust/detail/use_default.h +34 -0
  1375. cuda/cccl/headers/include/thrust/detail/vector_base.h +611 -0
  1376. cuda/cccl/headers/include/thrust/detail/vector_base.inl +1208 -0
  1377. cuda/cccl/headers/include/thrust/device_allocator.h +134 -0
  1378. cuda/cccl/headers/include/thrust/device_delete.h +74 -0
  1379. cuda/cccl/headers/include/thrust/device_free.h +85 -0
  1380. cuda/cccl/headers/include/thrust/device_make_unique.h +56 -0
  1381. cuda/cccl/headers/include/thrust/device_malloc.h +84 -0
  1382. cuda/cccl/headers/include/thrust/device_malloc_allocator.h +190 -0
  1383. cuda/cccl/headers/include/thrust/device_new.h +112 -0
  1384. cuda/cccl/headers/include/thrust/device_new_allocator.h +179 -0
  1385. cuda/cccl/headers/include/thrust/device_ptr.h +196 -0
  1386. cuda/cccl/headers/include/thrust/device_reference.h +983 -0
  1387. cuda/cccl/headers/include/thrust/device_vector.h +576 -0
  1388. cuda/cccl/headers/include/thrust/distance.h +43 -0
  1389. cuda/cccl/headers/include/thrust/equal.h +247 -0
  1390. cuda/cccl/headers/include/thrust/execution_policy.h +252 -0
  1391. cuda/cccl/headers/include/thrust/extrema.h +657 -0
  1392. cuda/cccl/headers/include/thrust/fill.h +200 -0
  1393. cuda/cccl/headers/include/thrust/find.h +382 -0
  1394. cuda/cccl/headers/include/thrust/for_each.h +261 -0
  1395. cuda/cccl/headers/include/thrust/functional.h +393 -0
  1396. cuda/cccl/headers/include/thrust/gather.h +464 -0
  1397. cuda/cccl/headers/include/thrust/generate.h +193 -0
  1398. cuda/cccl/headers/include/thrust/host_vector.h +576 -0
  1399. cuda/cccl/headers/include/thrust/inner_product.h +264 -0
  1400. cuda/cccl/headers/include/thrust/iterator/constant_iterator.h +221 -0
  1401. cuda/cccl/headers/include/thrust/iterator/counting_iterator.h +335 -0
  1402. cuda/cccl/headers/include/thrust/iterator/detail/any_assign.h +48 -0
  1403. cuda/cccl/headers/include/thrust/iterator/detail/any_system_tag.h +43 -0
  1404. cuda/cccl/headers/include/thrust/iterator/detail/device_system_tag.h +43 -0
  1405. cuda/cccl/headers/include/thrust/iterator/detail/host_system_tag.h +42 -0
  1406. cuda/cccl/headers/include/thrust/iterator/detail/iterator_adaptor_base.h +80 -0
  1407. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_system.h +60 -0
  1408. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_traversal.h +65 -0
  1409. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h +56 -0
  1410. cuda/cccl/headers/include/thrust/iterator/detail/iterator_facade_category.h +181 -0
  1411. cuda/cccl/headers/include/thrust/iterator/detail/minimum_system.h +57 -0
  1412. cuda/cccl/headers/include/thrust/iterator/detail/normal_iterator.h +69 -0
  1413. cuda/cccl/headers/include/thrust/iterator/detail/retag.h +104 -0
  1414. cuda/cccl/headers/include/thrust/iterator/detail/tagged_iterator.h +80 -0
  1415. cuda/cccl/headers/include/thrust/iterator/detail/tuple_of_iterator_references.h +170 -0
  1416. cuda/cccl/headers/include/thrust/iterator/discard_iterator.h +163 -0
  1417. cuda/cccl/headers/include/thrust/iterator/iterator_adaptor.h +251 -0
  1418. cuda/cccl/headers/include/thrust/iterator/iterator_categories.h +211 -0
  1419. cuda/cccl/headers/include/thrust/iterator/iterator_facade.h +659 -0
  1420. cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +334 -0
  1421. cuda/cccl/headers/include/thrust/iterator/iterator_traversal_tags.h +64 -0
  1422. cuda/cccl/headers/include/thrust/iterator/offset_iterator.h +194 -0
  1423. cuda/cccl/headers/include/thrust/iterator/permutation_iterator.h +204 -0
  1424. cuda/cccl/headers/include/thrust/iterator/retag.h +72 -0
  1425. cuda/cccl/headers/include/thrust/iterator/reverse_iterator.h +51 -0
  1426. cuda/cccl/headers/include/thrust/iterator/shuffle_iterator.h +185 -0
  1427. cuda/cccl/headers/include/thrust/iterator/strided_iterator.h +152 -0
  1428. cuda/cccl/headers/include/thrust/iterator/tabulate_output_iterator.h +152 -0
  1429. cuda/cccl/headers/include/thrust/iterator/transform_input_output_iterator.h +225 -0
  1430. cuda/cccl/headers/include/thrust/iterator/transform_iterator.h +349 -0
  1431. cuda/cccl/headers/include/thrust/iterator/transform_output_iterator.h +190 -0
  1432. cuda/cccl/headers/include/thrust/iterator/zip_iterator.h +359 -0
  1433. cuda/cccl/headers/include/thrust/logical.h +290 -0
  1434. cuda/cccl/headers/include/thrust/memory.h +299 -0
  1435. cuda/cccl/headers/include/thrust/merge.h +725 -0
  1436. cuda/cccl/headers/include/thrust/mismatch.h +261 -0
  1437. cuda/cccl/headers/include/thrust/mr/allocator.h +227 -0
  1438. cuda/cccl/headers/include/thrust/mr/device_memory_resource.h +46 -0
  1439. cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +526 -0
  1440. cuda/cccl/headers/include/thrust/mr/disjoint_sync_pool.h +116 -0
  1441. cuda/cccl/headers/include/thrust/mr/disjoint_tls_pool.h +65 -0
  1442. cuda/cccl/headers/include/thrust/mr/fancy_pointer_resource.h +65 -0
  1443. cuda/cccl/headers/include/thrust/mr/host_memory_resource.h +42 -0
  1444. cuda/cccl/headers/include/thrust/mr/memory_resource.h +215 -0
  1445. cuda/cccl/headers/include/thrust/mr/new.h +98 -0
  1446. cuda/cccl/headers/include/thrust/mr/polymorphic_adaptor.h +61 -0
  1447. cuda/cccl/headers/include/thrust/mr/pool.h +526 -0
  1448. cuda/cccl/headers/include/thrust/mr/pool_options.h +172 -0
  1449. cuda/cccl/headers/include/thrust/mr/sync_pool.h +112 -0
  1450. cuda/cccl/headers/include/thrust/mr/tls_pool.h +62 -0
  1451. cuda/cccl/headers/include/thrust/mr/universal_memory_resource.h +29 -0
  1452. cuda/cccl/headers/include/thrust/mr/validator.h +54 -0
  1453. cuda/cccl/headers/include/thrust/pair.h +99 -0
  1454. cuda/cccl/headers/include/thrust/partition.h +1391 -0
  1455. cuda/cccl/headers/include/thrust/per_device_resource.h +110 -0
  1456. cuda/cccl/headers/include/thrust/random/detail/discard_block_engine.inl +182 -0
  1457. cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine.inl +153 -0
  1458. cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine_discard.h +97 -0
  1459. cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine.inl +149 -0
  1460. cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h +46 -0
  1461. cuda/cccl/headers/include/thrust/random/detail/mod.h +94 -0
  1462. cuda/cccl/headers/include/thrust/random/detail/normal_distribution.inl +185 -0
  1463. cuda/cccl/headers/include/thrust/random/detail/normal_distribution_base.h +155 -0
  1464. cuda/cccl/headers/include/thrust/random/detail/random_core_access.h +56 -0
  1465. cuda/cccl/headers/include/thrust/random/detail/subtract_with_carry_engine.inl +199 -0
  1466. cuda/cccl/headers/include/thrust/random/detail/uniform_int_distribution.inl +196 -0
  1467. cuda/cccl/headers/include/thrust/random/detail/uniform_real_distribution.inl +198 -0
  1468. cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine.inl +181 -0
  1469. cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine_max.h +187 -0
  1470. cuda/cccl/headers/include/thrust/random/discard_block_engine.h +238 -0
  1471. cuda/cccl/headers/include/thrust/random/linear_congruential_engine.h +287 -0
  1472. cuda/cccl/headers/include/thrust/random/linear_feedback_shift_engine.h +215 -0
  1473. cuda/cccl/headers/include/thrust/random/normal_distribution.h +255 -0
  1474. cuda/cccl/headers/include/thrust/random/subtract_with_carry_engine.h +245 -0
  1475. cuda/cccl/headers/include/thrust/random/uniform_int_distribution.h +259 -0
  1476. cuda/cccl/headers/include/thrust/random/uniform_real_distribution.h +256 -0
  1477. cuda/cccl/headers/include/thrust/random/xor_combine_engine.h +253 -0
  1478. cuda/cccl/headers/include/thrust/random.h +118 -0
  1479. cuda/cccl/headers/include/thrust/reduce.h +1113 -0
  1480. cuda/cccl/headers/include/thrust/remove.h +768 -0
  1481. cuda/cccl/headers/include/thrust/replace.h +826 -0
  1482. cuda/cccl/headers/include/thrust/reverse.h +215 -0
  1483. cuda/cccl/headers/include/thrust/scan.h +1671 -0
  1484. cuda/cccl/headers/include/thrust/scatter.h +446 -0
  1485. cuda/cccl/headers/include/thrust/sequence.h +277 -0
  1486. cuda/cccl/headers/include/thrust/set_operations.h +3026 -0
  1487. cuda/cccl/headers/include/thrust/shuffle.h +182 -0
  1488. cuda/cccl/headers/include/thrust/sort.h +1320 -0
  1489. cuda/cccl/headers/include/thrust/swap.h +147 -0
  1490. cuda/cccl/headers/include/thrust/system/cpp/detail/adjacent_difference.h +30 -0
  1491. cuda/cccl/headers/include/thrust/system/cpp/detail/assign_value.h +30 -0
  1492. cuda/cccl/headers/include/thrust/system/cpp/detail/binary_search.h +32 -0
  1493. cuda/cccl/headers/include/thrust/system/cpp/detail/copy.h +30 -0
  1494. cuda/cccl/headers/include/thrust/system/cpp/detail/copy_if.h +30 -0
  1495. cuda/cccl/headers/include/thrust/system/cpp/detail/count.h +29 -0
  1496. cuda/cccl/headers/include/thrust/system/cpp/detail/equal.h +29 -0
  1497. cuda/cccl/headers/include/thrust/system/cpp/detail/execution_policy.h +113 -0
  1498. cuda/cccl/headers/include/thrust/system/cpp/detail/extrema.h +30 -0
  1499. cuda/cccl/headers/include/thrust/system/cpp/detail/fill.h +29 -0
  1500. cuda/cccl/headers/include/thrust/system/cpp/detail/find.h +30 -0
  1501. cuda/cccl/headers/include/thrust/system/cpp/detail/for_each.h +30 -0
  1502. cuda/cccl/headers/include/thrust/system/cpp/detail/gather.h +29 -0
  1503. cuda/cccl/headers/include/thrust/system/cpp/detail/generate.h +29 -0
  1504. cuda/cccl/headers/include/thrust/system/cpp/detail/get_value.h +30 -0
  1505. cuda/cccl/headers/include/thrust/system/cpp/detail/inner_product.h +29 -0
  1506. cuda/cccl/headers/include/thrust/system/cpp/detail/iter_swap.h +30 -0
  1507. cuda/cccl/headers/include/thrust/system/cpp/detail/logical.h +29 -0
  1508. cuda/cccl/headers/include/thrust/system/cpp/detail/malloc_and_free.h +30 -0
  1509. cuda/cccl/headers/include/thrust/system/cpp/detail/memory.inl +55 -0
  1510. cuda/cccl/headers/include/thrust/system/cpp/detail/merge.h +30 -0
  1511. cuda/cccl/headers/include/thrust/system/cpp/detail/mismatch.h +29 -0
  1512. cuda/cccl/headers/include/thrust/system/cpp/detail/partition.h +30 -0
  1513. cuda/cccl/headers/include/thrust/system/cpp/detail/per_device_resource.h +29 -0
  1514. cuda/cccl/headers/include/thrust/system/cpp/detail/reduce.h +30 -0
  1515. cuda/cccl/headers/include/thrust/system/cpp/detail/reduce_by_key.h +30 -0
  1516. cuda/cccl/headers/include/thrust/system/cpp/detail/remove.h +30 -0
  1517. cuda/cccl/headers/include/thrust/system/cpp/detail/replace.h +29 -0
  1518. cuda/cccl/headers/include/thrust/system/cpp/detail/reverse.h +29 -0
  1519. cuda/cccl/headers/include/thrust/system/cpp/detail/scan.h +30 -0
  1520. cuda/cccl/headers/include/thrust/system/cpp/detail/scan_by_key.h +30 -0
  1521. cuda/cccl/headers/include/thrust/system/cpp/detail/scatter.h +29 -0
  1522. cuda/cccl/headers/include/thrust/system/cpp/detail/sequence.h +29 -0
  1523. cuda/cccl/headers/include/thrust/system/cpp/detail/set_operations.h +30 -0
  1524. cuda/cccl/headers/include/thrust/system/cpp/detail/sort.h +30 -0
  1525. cuda/cccl/headers/include/thrust/system/cpp/detail/swap_ranges.h +29 -0
  1526. cuda/cccl/headers/include/thrust/system/cpp/detail/tabulate.h +29 -0
  1527. cuda/cccl/headers/include/thrust/system/cpp/detail/temporary_buffer.h +29 -0
  1528. cuda/cccl/headers/include/thrust/system/cpp/detail/transform.h +29 -0
  1529. cuda/cccl/headers/include/thrust/system/cpp/detail/transform_reduce.h +29 -0
  1530. cuda/cccl/headers/include/thrust/system/cpp/detail/transform_scan.h +29 -0
  1531. cuda/cccl/headers/include/thrust/system/cpp/detail/uninitialized_copy.h +29 -0
  1532. cuda/cccl/headers/include/thrust/system/cpp/detail/uninitialized_fill.h +29 -0
  1533. cuda/cccl/headers/include/thrust/system/cpp/detail/unique.h +30 -0
  1534. cuda/cccl/headers/include/thrust/system/cpp/detail/unique_by_key.h +30 -0
  1535. cuda/cccl/headers/include/thrust/system/cpp/execution_policy.h +63 -0
  1536. cuda/cccl/headers/include/thrust/system/cpp/memory.h +105 -0
  1537. cuda/cccl/headers/include/thrust/system/cpp/memory_resource.h +70 -0
  1538. cuda/cccl/headers/include/thrust/system/cpp/pointer.h +118 -0
  1539. cuda/cccl/headers/include/thrust/system/cpp/vector.h +95 -0
  1540. cuda/cccl/headers/include/thrust/system/cuda/config.h +126 -0
  1541. cuda/cccl/headers/include/thrust/system/cuda/detail/adjacent_difference.h +215 -0
  1542. cuda/cccl/headers/include/thrust/system/cuda/detail/assign_value.h +124 -0
  1543. cuda/cccl/headers/include/thrust/system/cuda/detail/binary_search.h +29 -0
  1544. cuda/cccl/headers/include/thrust/system/cuda/detail/cdp_dispatch.h +72 -0
  1545. cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +272 -0
  1546. cuda/cccl/headers/include/thrust/system/cuda/detail/copy_if.h +251 -0
  1547. cuda/cccl/headers/include/thrust/system/cuda/detail/core/agent_launcher.h +282 -0
  1548. cuda/cccl/headers/include/thrust/system/cuda/detail/core/triple_chevron_launch.h +163 -0
  1549. cuda/cccl/headers/include/thrust/system/cuda/detail/core/util.h +586 -0
  1550. cuda/cccl/headers/include/thrust/system/cuda/detail/count.h +73 -0
  1551. cuda/cccl/headers/include/thrust/system/cuda/detail/cross_system.h +241 -0
  1552. cuda/cccl/headers/include/thrust/system/cuda/detail/dispatch.h +231 -0
  1553. cuda/cccl/headers/include/thrust/system/cuda/detail/equal.h +62 -0
  1554. cuda/cccl/headers/include/thrust/system/cuda/detail/error.inl +87 -0
  1555. cuda/cccl/headers/include/thrust/system/cuda/detail/execution_policy.h +266 -0
  1556. cuda/cccl/headers/include/thrust/system/cuda/detail/extrema.h +472 -0
  1557. cuda/cccl/headers/include/thrust/system/cuda/detail/fill.h +99 -0
  1558. cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +165 -0
  1559. cuda/cccl/headers/include/thrust/system/cuda/detail/for_each.h +82 -0
  1560. cuda/cccl/headers/include/thrust/system/cuda/detail/gather.h +89 -0
  1561. cuda/cccl/headers/include/thrust/system/cuda/detail/generate.h +58 -0
  1562. cuda/cccl/headers/include/thrust/system/cuda/detail/get_value.h +65 -0
  1563. cuda/cccl/headers/include/thrust/system/cuda/detail/inner_product.h +75 -0
  1564. cuda/cccl/headers/include/thrust/system/cuda/detail/iter_swap.h +79 -0
  1565. cuda/cccl/headers/include/thrust/system/cuda/detail/logical.h +29 -0
  1566. cuda/cccl/headers/include/thrust/system/cuda/detail/make_unsigned_special.h +55 -0
  1567. cuda/cccl/headers/include/thrust/system/cuda/detail/malloc_and_free.h +119 -0
  1568. cuda/cccl/headers/include/thrust/system/cuda/detail/memory.inl +55 -0
  1569. cuda/cccl/headers/include/thrust/system/cuda/detail/merge.h +204 -0
  1570. cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +88 -0
  1571. cuda/cccl/headers/include/thrust/system/cuda/detail/parallel_for.h +79 -0
  1572. cuda/cccl/headers/include/thrust/system/cuda/detail/partition.h +401 -0
  1573. cuda/cccl/headers/include/thrust/system/cuda/detail/per_device_resource.h +70 -0
  1574. cuda/cccl/headers/include/thrust/system/cuda/detail/reduce.h +780 -0
  1575. cuda/cccl/headers/include/thrust/system/cuda/detail/reduce_by_key.h +997 -0
  1576. cuda/cccl/headers/include/thrust/system/cuda/detail/remove.h +105 -0
  1577. cuda/cccl/headers/include/thrust/system/cuda/detail/replace.h +121 -0
  1578. cuda/cccl/headers/include/thrust/system/cuda/detail/reverse.h +83 -0
  1579. cuda/cccl/headers/include/thrust/system/cuda/detail/scan.h +338 -0
  1580. cuda/cccl/headers/include/thrust/system/cuda/detail/scan_by_key.h +411 -0
  1581. cuda/cccl/headers/include/thrust/system/cuda/detail/scatter.h +89 -0
  1582. cuda/cccl/headers/include/thrust/system/cuda/detail/sequence.h +29 -0
  1583. cuda/cccl/headers/include/thrust/system/cuda/detail/set_operations.h +1732 -0
  1584. cuda/cccl/headers/include/thrust/system/cuda/detail/sort.h +468 -0
  1585. cuda/cccl/headers/include/thrust/system/cuda/detail/swap_ranges.h +98 -0
  1586. cuda/cccl/headers/include/thrust/system/cuda/detail/tabulate.h +61 -0
  1587. cuda/cccl/headers/include/thrust/system/cuda/detail/temporary_buffer.h +130 -0
  1588. cuda/cccl/headers/include/thrust/system/cuda/detail/terminate.h +53 -0
  1589. cuda/cccl/headers/include/thrust/system/cuda/detail/transform.h +428 -0
  1590. cuda/cccl/headers/include/thrust/system/cuda/detail/transform_reduce.h +139 -0
  1591. cuda/cccl/headers/include/thrust/system/cuda/detail/transform_scan.h +117 -0
  1592. cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_copy.h +111 -0
  1593. cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_fill.h +100 -0
  1594. cuda/cccl/headers/include/thrust/system/cuda/detail/unique.h +286 -0
  1595. cuda/cccl/headers/include/thrust/system/cuda/detail/unique_by_key.h +306 -0
  1596. cuda/cccl/headers/include/thrust/system/cuda/detail/util.h +253 -0
  1597. cuda/cccl/headers/include/thrust/system/cuda/error.h +159 -0
  1598. cuda/cccl/headers/include/thrust/system/cuda/execution_policy.h +15 -0
  1599. cuda/cccl/headers/include/thrust/system/cuda/memory.h +118 -0
  1600. cuda/cccl/headers/include/thrust/system/cuda/memory_resource.h +115 -0
  1601. cuda/cccl/headers/include/thrust/system/cuda/pointer.h +155 -0
  1602. cuda/cccl/headers/include/thrust/system/cuda/vector.h +104 -0
  1603. cuda/cccl/headers/include/thrust/system/detail/bad_alloc.h +59 -0
  1604. cuda/cccl/headers/include/thrust/system/detail/errno.h +118 -0
  1605. cuda/cccl/headers/include/thrust/system/detail/error_category.inl +298 -0
  1606. cuda/cccl/headers/include/thrust/system/detail/error_code.inl +171 -0
  1607. cuda/cccl/headers/include/thrust/system/detail/error_condition.inl +119 -0
  1608. cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.h +51 -0
  1609. cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.inl +77 -0
  1610. cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.h +159 -0
  1611. cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.inl +381 -0
  1612. cuda/cccl/headers/include/thrust/system/detail/generic/copy.h +43 -0
  1613. cuda/cccl/headers/include/thrust/system/detail/generic/copy.inl +62 -0
  1614. cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.h +56 -0
  1615. cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.inl +143 -0
  1616. cuda/cccl/headers/include/thrust/system/detail/generic/count.h +46 -0
  1617. cuda/cccl/headers/include/thrust/system/detail/generic/count.inl +82 -0
  1618. cuda/cccl/headers/include/thrust/system/detail/generic/equal.h +47 -0
  1619. cuda/cccl/headers/include/thrust/system/detail/generic/equal.inl +58 -0
  1620. cuda/cccl/headers/include/thrust/system/detail/generic/extrema.h +64 -0
  1621. cuda/cccl/headers/include/thrust/system/detail/generic/extrema.inl +249 -0
  1622. cuda/cccl/headers/include/thrust/system/detail/generic/fill.h +52 -0
  1623. cuda/cccl/headers/include/thrust/system/detail/generic/find.h +47 -0
  1624. cuda/cccl/headers/include/thrust/system/detail/generic/find.inl +135 -0
  1625. cuda/cccl/headers/include/thrust/system/detail/generic/for_each.h +56 -0
  1626. cuda/cccl/headers/include/thrust/system/detail/generic/gather.h +71 -0
  1627. cuda/cccl/headers/include/thrust/system/detail/generic/gather.inl +94 -0
  1628. cuda/cccl/headers/include/thrust/system/detail/generic/generate.h +43 -0
  1629. cuda/cccl/headers/include/thrust/system/detail/generic/generate.inl +63 -0
  1630. cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.h +58 -0
  1631. cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.inl +70 -0
  1632. cuda/cccl/headers/include/thrust/system/detail/generic/logical.h +57 -0
  1633. cuda/cccl/headers/include/thrust/system/detail/generic/memory.h +62 -0
  1634. cuda/cccl/headers/include/thrust/system/detail/generic/memory.inl +83 -0
  1635. cuda/cccl/headers/include/thrust/system/detail/generic/merge.h +97 -0
  1636. cuda/cccl/headers/include/thrust/system/detail/generic/merge.inl +146 -0
  1637. cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.h +47 -0
  1638. cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.inl +66 -0
  1639. cuda/cccl/headers/include/thrust/system/detail/generic/partition.h +127 -0
  1640. cuda/cccl/headers/include/thrust/system/detail/generic/partition.inl +205 -0
  1641. cuda/cccl/headers/include/thrust/system/detail/generic/per_device_resource.h +41 -0
  1642. cuda/cccl/headers/include/thrust/system/detail/generic/reduce.h +69 -0
  1643. cuda/cccl/headers/include/thrust/system/detail/generic/reduce.inl +98 -0
  1644. cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.h +81 -0
  1645. cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.inl +183 -0
  1646. cuda/cccl/headers/include/thrust/system/detail/generic/remove.h +84 -0
  1647. cuda/cccl/headers/include/thrust/system/detail/generic/remove.inl +119 -0
  1648. cuda/cccl/headers/include/thrust/system/detail/generic/replace.h +93 -0
  1649. cuda/cccl/headers/include/thrust/system/detail/generic/replace.inl +172 -0
  1650. cuda/cccl/headers/include/thrust/system/detail/generic/reverse.h +46 -0
  1651. cuda/cccl/headers/include/thrust/system/detail/generic/reverse.inl +65 -0
  1652. cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.h +61 -0
  1653. cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.inl +124 -0
  1654. cuda/cccl/headers/include/thrust/system/detail/generic/scan.h +86 -0
  1655. cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.h +124 -0
  1656. cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.inl +229 -0
  1657. cuda/cccl/headers/include/thrust/system/detail/generic/scatter.h +71 -0
  1658. cuda/cccl/headers/include/thrust/system/detail/generic/scatter.inl +83 -0
  1659. cuda/cccl/headers/include/thrust/system/detail/generic/select_system.h +103 -0
  1660. cuda/cccl/headers/include/thrust/system/detail/generic/sequence.h +70 -0
  1661. cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.h +280 -0
  1662. cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.inl +474 -0
  1663. cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.h +52 -0
  1664. cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.inl +123 -0
  1665. cuda/cccl/headers/include/thrust/system/detail/generic/sort.h +111 -0
  1666. cuda/cccl/headers/include/thrust/system/detail/generic/sort.inl +173 -0
  1667. cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.h +42 -0
  1668. cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.inl +73 -0
  1669. cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.h +39 -0
  1670. cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.inl +52 -0
  1671. cuda/cccl/headers/include/thrust/system/detail/generic/tag.h +45 -0
  1672. cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.h +52 -0
  1673. cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.inl +80 -0
  1674. cuda/cccl/headers/include/thrust/system/detail/generic/transform.h +393 -0
  1675. cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.h +48 -0
  1676. cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.inl +54 -0
  1677. cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.h +78 -0
  1678. cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.inl +111 -0
  1679. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.h +43 -0
  1680. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.inl +164 -0
  1681. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.h +43 -0
  1682. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.inl +112 -0
  1683. cuda/cccl/headers/include/thrust/system/detail/generic/unique.h +69 -0
  1684. cuda/cccl/headers/include/thrust/system/detail/generic/unique.inl +111 -0
  1685. cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.h +79 -0
  1686. cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.inl +124 -0
  1687. cuda/cccl/headers/include/thrust/system/detail/internal/decompose.h +115 -0
  1688. cuda/cccl/headers/include/thrust/system/detail/sequential/adjacent_difference.h +68 -0
  1689. cuda/cccl/headers/include/thrust/system/detail/sequential/assign_value.h +40 -0
  1690. cuda/cccl/headers/include/thrust/system/detail/sequential/binary_search.h +134 -0
  1691. cuda/cccl/headers/include/thrust/system/detail/sequential/copy.h +120 -0
  1692. cuda/cccl/headers/include/thrust/system/detail/sequential/copy_backward.h +47 -0
  1693. cuda/cccl/headers/include/thrust/system/detail/sequential/copy_if.h +69 -0
  1694. cuda/cccl/headers/include/thrust/system/detail/sequential/count.h +29 -0
  1695. cuda/cccl/headers/include/thrust/system/detail/sequential/equal.h +29 -0
  1696. cuda/cccl/headers/include/thrust/system/detail/sequential/execution_policy.h +52 -0
  1697. cuda/cccl/headers/include/thrust/system/detail/sequential/extrema.h +108 -0
  1698. cuda/cccl/headers/include/thrust/system/detail/sequential/fill.h +29 -0
  1699. cuda/cccl/headers/include/thrust/system/detail/sequential/find.h +60 -0
  1700. cuda/cccl/headers/include/thrust/system/detail/sequential/for_each.h +72 -0
  1701. cuda/cccl/headers/include/thrust/system/detail/sequential/gather.h +29 -0
  1702. cuda/cccl/headers/include/thrust/system/detail/sequential/general_copy.h +120 -0
  1703. cuda/cccl/headers/include/thrust/system/detail/sequential/generate.h +29 -0
  1704. cuda/cccl/headers/include/thrust/system/detail/sequential/get_value.h +41 -0
  1705. cuda/cccl/headers/include/thrust/system/detail/sequential/inner_product.h +29 -0
  1706. cuda/cccl/headers/include/thrust/system/detail/sequential/insertion_sort.h +139 -0
  1707. cuda/cccl/headers/include/thrust/system/detail/sequential/iter_swap.h +43 -0
  1708. cuda/cccl/headers/include/thrust/system/detail/sequential/logical.h +29 -0
  1709. cuda/cccl/headers/include/thrust/system/detail/sequential/malloc_and_free.h +48 -0
  1710. cuda/cccl/headers/include/thrust/system/detail/sequential/merge.h +147 -0
  1711. cuda/cccl/headers/include/thrust/system/detail/sequential/mismatch.h +29 -0
  1712. cuda/cccl/headers/include/thrust/system/detail/sequential/partition.h +297 -0
  1713. cuda/cccl/headers/include/thrust/system/detail/sequential/per_device_resource.h +29 -0
  1714. cuda/cccl/headers/include/thrust/system/detail/sequential/reduce.h +62 -0
  1715. cuda/cccl/headers/include/thrust/system/detail/sequential/reduce_by_key.h +96 -0
  1716. cuda/cccl/headers/include/thrust/system/detail/sequential/remove.h +177 -0
  1717. cuda/cccl/headers/include/thrust/system/detail/sequential/replace.h +29 -0
  1718. cuda/cccl/headers/include/thrust/system/detail/sequential/reverse.h +29 -0
  1719. cuda/cccl/headers/include/thrust/system/detail/sequential/scan.h +152 -0
  1720. cuda/cccl/headers/include/thrust/system/detail/sequential/scan_by_key.h +143 -0
  1721. cuda/cccl/headers/include/thrust/system/detail/sequential/scatter.h +29 -0
  1722. cuda/cccl/headers/include/thrust/system/detail/sequential/sequence.h +29 -0
  1723. cuda/cccl/headers/include/thrust/system/detail/sequential/set_operations.h +204 -0
  1724. cuda/cccl/headers/include/thrust/system/detail/sequential/sort.h +120 -0
  1725. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.h +354 -0
  1726. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.h +121 -0
  1727. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.h +584 -0
  1728. cuda/cccl/headers/include/thrust/system/detail/sequential/swap_ranges.h +29 -0
  1729. cuda/cccl/headers/include/thrust/system/detail/sequential/tabulate.h +29 -0
  1730. cuda/cccl/headers/include/thrust/system/detail/sequential/temporary_buffer.h +29 -0
  1731. cuda/cccl/headers/include/thrust/system/detail/sequential/transform.h +29 -0
  1732. cuda/cccl/headers/include/thrust/system/detail/sequential/transform_reduce.h +29 -0
  1733. cuda/cccl/headers/include/thrust/system/detail/sequential/transform_scan.h +29 -0
  1734. cuda/cccl/headers/include/thrust/system/detail/sequential/trivial_copy.h +56 -0
  1735. cuda/cccl/headers/include/thrust/system/detail/sequential/uninitialized_copy.h +29 -0
  1736. cuda/cccl/headers/include/thrust/system/detail/sequential/uninitialized_fill.h +29 -0
  1737. cuda/cccl/headers/include/thrust/system/detail/sequential/unique.h +113 -0
  1738. cuda/cccl/headers/include/thrust/system/detail/sequential/unique_by_key.h +104 -0
  1739. cuda/cccl/headers/include/thrust/system/detail/system_error.inl +106 -0
  1740. cuda/cccl/headers/include/thrust/system/error_code.h +508 -0
  1741. cuda/cccl/headers/include/thrust/system/omp/detail/adjacent_difference.h +34 -0
  1742. cuda/cccl/headers/include/thrust/system/omp/detail/assign_value.h +17 -0
  1743. cuda/cccl/headers/include/thrust/system/omp/detail/binary_search.h +57 -0
  1744. cuda/cccl/headers/include/thrust/system/omp/detail/copy.h +61 -0
  1745. cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.h +39 -0
  1746. cuda/cccl/headers/include/thrust/system/omp/detail/count.h +17 -0
  1747. cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.h +48 -0
  1748. cuda/cccl/headers/include/thrust/system/omp/detail/equal.h +17 -0
  1749. cuda/cccl/headers/include/thrust/system/omp/detail/execution_policy.h +132 -0
  1750. cuda/cccl/headers/include/thrust/system/omp/detail/extrema.h +46 -0
  1751. cuda/cccl/headers/include/thrust/system/omp/detail/fill.h +17 -0
  1752. cuda/cccl/headers/include/thrust/system/omp/detail/find.h +33 -0
  1753. cuda/cccl/headers/include/thrust/system/omp/detail/for_each.h +73 -0
  1754. cuda/cccl/headers/include/thrust/system/omp/detail/gather.h +17 -0
  1755. cuda/cccl/headers/include/thrust/system/omp/detail/generate.h +17 -0
  1756. cuda/cccl/headers/include/thrust/system/omp/detail/get_value.h +17 -0
  1757. cuda/cccl/headers/include/thrust/system/omp/detail/inner_product.h +17 -0
  1758. cuda/cccl/headers/include/thrust/system/omp/detail/iter_swap.h +17 -0
  1759. cuda/cccl/headers/include/thrust/system/omp/detail/logical.h +17 -0
  1760. cuda/cccl/headers/include/thrust/system/omp/detail/malloc_and_free.h +17 -0
  1761. cuda/cccl/headers/include/thrust/system/omp/detail/merge.h +17 -0
  1762. cuda/cccl/headers/include/thrust/system/omp/detail/mismatch.h +17 -0
  1763. cuda/cccl/headers/include/thrust/system/omp/detail/partition.h +83 -0
  1764. cuda/cccl/headers/include/thrust/system/omp/detail/per_device_resource.h +16 -0
  1765. cuda/cccl/headers/include/thrust/system/omp/detail/pragma_omp.h +30 -0
  1766. cuda/cccl/headers/include/thrust/system/omp/detail/reduce.h +62 -0
  1767. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.h +49 -0
  1768. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.h +87 -0
  1769. cuda/cccl/headers/include/thrust/system/omp/detail/remove.h +67 -0
  1770. cuda/cccl/headers/include/thrust/system/omp/detail/replace.h +17 -0
  1771. cuda/cccl/headers/include/thrust/system/omp/detail/reverse.h +17 -0
  1772. cuda/cccl/headers/include/thrust/system/omp/detail/scan.h +189 -0
  1773. cuda/cccl/headers/include/thrust/system/omp/detail/scan_by_key.h +23 -0
  1774. cuda/cccl/headers/include/thrust/system/omp/detail/scatter.h +17 -0
  1775. cuda/cccl/headers/include/thrust/system/omp/detail/sequence.h +17 -0
  1776. cuda/cccl/headers/include/thrust/system/omp/detail/set_operations.h +17 -0
  1777. cuda/cccl/headers/include/thrust/system/omp/detail/sort.h +245 -0
  1778. cuda/cccl/headers/include/thrust/system/omp/detail/swap_ranges.h +17 -0
  1779. cuda/cccl/headers/include/thrust/system/omp/detail/tabulate.h +17 -0
  1780. cuda/cccl/headers/include/thrust/system/omp/detail/temporary_buffer.h +16 -0
  1781. cuda/cccl/headers/include/thrust/system/omp/detail/transform.h +17 -0
  1782. cuda/cccl/headers/include/thrust/system/omp/detail/transform_reduce.h +17 -0
  1783. cuda/cccl/headers/include/thrust/system/omp/detail/transform_scan.h +17 -0
  1784. cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_copy.h +17 -0
  1785. cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_fill.h +17 -0
  1786. cuda/cccl/headers/include/thrust/system/omp/detail/unique.h +51 -0
  1787. cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.h +55 -0
  1788. cuda/cccl/headers/include/thrust/system/omp/execution_policy.h +62 -0
  1789. cuda/cccl/headers/include/thrust/system/omp/memory.h +153 -0
  1790. cuda/cccl/headers/include/thrust/system/omp/memory_resource.h +71 -0
  1791. cuda/cccl/headers/include/thrust/system/omp/pointer.h +120 -0
  1792. cuda/cccl/headers/include/thrust/system/omp/vector.h +96 -0
  1793. cuda/cccl/headers/include/thrust/system/system_error.h +183 -0
  1794. cuda/cccl/headers/include/thrust/system/tbb/detail/adjacent_difference.h +33 -0
  1795. cuda/cccl/headers/include/thrust/system/tbb/detail/assign_value.h +17 -0
  1796. cuda/cccl/headers/include/thrust/system/tbb/detail/binary_search.h +17 -0
  1797. cuda/cccl/headers/include/thrust/system/tbb/detail/copy.h +59 -0
  1798. cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.h +114 -0
  1799. cuda/cccl/headers/include/thrust/system/tbb/detail/count.h +17 -0
  1800. cuda/cccl/headers/include/thrust/system/tbb/detail/equal.h +17 -0
  1801. cuda/cccl/headers/include/thrust/system/tbb/detail/execution_policy.h +113 -0
  1802. cuda/cccl/headers/include/thrust/system/tbb/detail/extrema.h +45 -0
  1803. cuda/cccl/headers/include/thrust/system/tbb/detail/fill.h +17 -0
  1804. cuda/cccl/headers/include/thrust/system/tbb/detail/find.h +28 -0
  1805. cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.h +70 -0
  1806. cuda/cccl/headers/include/thrust/system/tbb/detail/gather.h +17 -0
  1807. cuda/cccl/headers/include/thrust/system/tbb/detail/generate.h +17 -0
  1808. cuda/cccl/headers/include/thrust/system/tbb/detail/get_value.h +17 -0
  1809. cuda/cccl/headers/include/thrust/system/tbb/detail/inner_product.h +17 -0
  1810. cuda/cccl/headers/include/thrust/system/tbb/detail/iter_swap.h +17 -0
  1811. cuda/cccl/headers/include/thrust/system/tbb/detail/logical.h +17 -0
  1812. cuda/cccl/headers/include/thrust/system/tbb/detail/malloc_and_free.h +17 -0
  1813. cuda/cccl/headers/include/thrust/system/tbb/detail/merge.h +302 -0
  1814. cuda/cccl/headers/include/thrust/system/tbb/detail/mismatch.h +17 -0
  1815. cuda/cccl/headers/include/thrust/system/tbb/detail/partition.h +78 -0
  1816. cuda/cccl/headers/include/thrust/system/tbb/detail/per_device_resource.h +16 -0
  1817. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.h +120 -0
  1818. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.h +378 -0
  1819. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_intervals.h +118 -0
  1820. cuda/cccl/headers/include/thrust/system/tbb/detail/remove.h +66 -0
  1821. cuda/cccl/headers/include/thrust/system/tbb/detail/replace.h +17 -0
  1822. cuda/cccl/headers/include/thrust/system/tbb/detail/reverse.h +17 -0
  1823. cuda/cccl/headers/include/thrust/system/tbb/detail/scan.h +294 -0
  1824. cuda/cccl/headers/include/thrust/system/tbb/detail/scan_by_key.h +20 -0
  1825. cuda/cccl/headers/include/thrust/system/tbb/detail/scatter.h +17 -0
  1826. cuda/cccl/headers/include/thrust/system/tbb/detail/sequence.h +17 -0
  1827. cuda/cccl/headers/include/thrust/system/tbb/detail/set_operations.h +17 -0
  1828. cuda/cccl/headers/include/thrust/system/tbb/detail/sort.h +272 -0
  1829. cuda/cccl/headers/include/thrust/system/tbb/detail/swap_ranges.h +17 -0
  1830. cuda/cccl/headers/include/thrust/system/tbb/detail/tabulate.h +17 -0
  1831. cuda/cccl/headers/include/thrust/system/tbb/detail/temporary_buffer.h +16 -0
  1832. cuda/cccl/headers/include/thrust/system/tbb/detail/transform.h +17 -0
  1833. cuda/cccl/headers/include/thrust/system/tbb/detail/transform_reduce.h +17 -0
  1834. cuda/cccl/headers/include/thrust/system/tbb/detail/transform_scan.h +17 -0
  1835. cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_copy.h +17 -0
  1836. cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_fill.h +17 -0
  1837. cuda/cccl/headers/include/thrust/system/tbb/detail/unique.h +50 -0
  1838. cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.h +54 -0
  1839. cuda/cccl/headers/include/thrust/system/tbb/execution_policy.h +62 -0
  1840. cuda/cccl/headers/include/thrust/system/tbb/memory.h +139 -0
  1841. cuda/cccl/headers/include/thrust/system/tbb/memory_resource.h +57 -0
  1842. cuda/cccl/headers/include/thrust/system/tbb/pointer.h +106 -0
  1843. cuda/cccl/headers/include/thrust/system/tbb/vector.h +82 -0
  1844. cuda/cccl/headers/include/thrust/system_error.h +57 -0
  1845. cuda/cccl/headers/include/thrust/tabulate.h +125 -0
  1846. cuda/cccl/headers/include/thrust/transform.h +1056 -0
  1847. cuda/cccl/headers/include/thrust/transform_reduce.h +190 -0
  1848. cuda/cccl/headers/include/thrust/transform_scan.h +442 -0
  1849. cuda/cccl/headers/include/thrust/tuple.h +139 -0
  1850. cuda/cccl/headers/include/thrust/type_traits/integer_sequence.h +257 -0
  1851. cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +153 -0
  1852. cuda/cccl/headers/include/thrust/type_traits/is_execution_policy.h +65 -0
  1853. cuda/cccl/headers/include/thrust/type_traits/is_operator_less_or_greater_function_object.h +180 -0
  1854. cuda/cccl/headers/include/thrust/type_traits/is_operator_plus_function_object.h +112 -0
  1855. cuda/cccl/headers/include/thrust/type_traits/is_trivially_relocatable.h +332 -0
  1856. cuda/cccl/headers/include/thrust/type_traits/logical_metafunctions.h +42 -0
  1857. cuda/cccl/headers/include/thrust/type_traits/unwrap_contiguous_iterator.h +63 -0
  1858. cuda/cccl/headers/include/thrust/uninitialized_copy.h +300 -0
  1859. cuda/cccl/headers/include/thrust/uninitialized_fill.h +268 -0
  1860. cuda/cccl/headers/include/thrust/unique.h +1088 -0
  1861. cuda/cccl/headers/include/thrust/universal_allocator.h +101 -0
  1862. cuda/cccl/headers/include/thrust/universal_ptr.h +34 -0
  1863. cuda/cccl/headers/include/thrust/universal_vector.h +80 -0
  1864. cuda/cccl/headers/include/thrust/version.h +93 -0
  1865. cuda/cccl/headers/include/thrust/zip_function.h +150 -0
  1866. cuda/cccl/headers/include_paths.py +51 -0
  1867. cuda/cccl/parallel/__init__.py +9 -0
  1868. cuda/cccl/parallel/experimental/__init__.py +24 -0
  1869. cuda/cccl/py.typed +0 -0
  1870. cuda/compute/__init__.py +83 -0
  1871. cuda/compute/_bindings.py +79 -0
  1872. cuda/compute/_bindings.pyi +498 -0
  1873. cuda/compute/_bindings_impl.pyx +2415 -0
  1874. cuda/compute/_caching.py +71 -0
  1875. cuda/compute/_cccl_interop.py +422 -0
  1876. cuda/compute/_utils/__init__.py +0 -0
  1877. cuda/compute/_utils/protocols.py +132 -0
  1878. cuda/compute/_utils/temp_storage_buffer.py +86 -0
  1879. cuda/compute/algorithms/__init__.py +58 -0
  1880. cuda/compute/algorithms/_histogram.py +243 -0
  1881. cuda/compute/algorithms/_reduce.py +182 -0
  1882. cuda/compute/algorithms/_scan.py +331 -0
  1883. cuda/compute/algorithms/_segmented_reduce.py +257 -0
  1884. cuda/compute/algorithms/_sort/__init__.py +23 -0
  1885. cuda/compute/algorithms/_sort/_merge_sort.py +225 -0
  1886. cuda/compute/algorithms/_sort/_radix_sort.py +263 -0
  1887. cuda/compute/algorithms/_sort/_segmented_sort.py +288 -0
  1888. cuda/compute/algorithms/_sort/_sort_common.py +52 -0
  1889. cuda/compute/algorithms/_three_way_partition.py +261 -0
  1890. cuda/compute/algorithms/_transform.py +329 -0
  1891. cuda/compute/algorithms/_unique_by_key.py +252 -0
  1892. cuda/compute/cccl/.gitkeep +0 -0
  1893. cuda/compute/cu12/_bindings_impl.cpython-311-aarch64-linux-gnu.so +0 -0
  1894. cuda/compute/cu12/cccl/libcccl.c.parallel.so +0 -0
  1895. cuda/compute/cu13/_bindings_impl.cpython-311-aarch64-linux-gnu.so +0 -0
  1896. cuda/compute/cu13/cccl/libcccl.c.parallel.so +0 -0
  1897. cuda/compute/iterators/__init__.py +21 -0
  1898. cuda/compute/iterators/_factories.py +219 -0
  1899. cuda/compute/iterators/_iterators.py +817 -0
  1900. cuda/compute/iterators/_zip_iterator.py +199 -0
  1901. cuda/compute/numba_utils.py +53 -0
  1902. cuda/compute/op.py +3 -0
  1903. cuda/compute/struct.py +272 -0
  1904. cuda/compute/typing.py +37 -0
  1905. cuda/coop/__init__.py +8 -0
  1906. cuda/coop/_caching.py +48 -0
  1907. cuda/coop/_common.py +275 -0
  1908. cuda/coop/_nvrtc.py +92 -0
  1909. cuda/coop/_scan_op.py +181 -0
  1910. cuda/coop/_types.py +937 -0
  1911. cuda/coop/_typing.py +107 -0
  1912. cuda/coop/block/__init__.py +39 -0
  1913. cuda/coop/block/_block_exchange.py +251 -0
  1914. cuda/coop/block/_block_load_store.py +215 -0
  1915. cuda/coop/block/_block_merge_sort.py +125 -0
  1916. cuda/coop/block/_block_radix_sort.py +214 -0
  1917. cuda/coop/block/_block_reduce.py +294 -0
  1918. cuda/coop/block/_block_scan.py +983 -0
  1919. cuda/coop/warp/__init__.py +9 -0
  1920. cuda/coop/warp/_warp_merge_sort.py +92 -0
  1921. cuda/coop/warp/_warp_reduce.py +153 -0
  1922. cuda/coop/warp/_warp_scan.py +78 -0
  1923. cuda_cccl-0.3.4.dist-info/METADATA +78 -0
  1924. cuda_cccl-0.3.4.dist-info/RECORD +1926 -0
  1925. cuda_cccl-0.3.4.dist-info/WHEEL +5 -0
  1926. cuda_cccl-0.3.4.dist-info/licenses/LICENSE +1 -0
@@ -0,0 +1,2132 @@
1
+ // SPDX-FileCopyrightText: Copyright (c) 2011, Duane Merrill. All rights reserved.
2
+ // SPDX-FileCopyrightText: Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
3
+ // SPDX-License-Identifier: BSD-3
4
+
5
+ /**
6
+ * @file
7
+ * The cub::BlockRadixSort class provides [<em>collective</em>](../index.html#sec0) methods for radix
8
+ * sorting of items partitioned across a CUDA thread block.
9
+ */
10
+
11
+ #pragma once
12
+
13
+ #include <cub/config.cuh>
14
+
15
+ #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
16
+ # pragma GCC system_header
17
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
18
+ # pragma clang system_header
19
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
20
+ # pragma system_header
21
+ #endif // no system header
22
+
23
+ #include <cub/block/block_exchange.cuh>
24
+ #include <cub/block/block_radix_rank.cuh>
25
+ #include <cub/block/radix_rank_sort_operations.cuh>
26
+ #include <cub/util_ptx.cuh>
27
+ #include <cub/util_type.cuh>
28
+
29
+ #include <cuda/std/__algorithm/min.h>
30
+ #include <cuda/std/__type_traits/enable_if.h>
31
+ #include <cuda/std/__type_traits/integral_constant.h>
32
+ #include <cuda/std/__type_traits/is_convertible.h>
33
+ #include <cuda/std/__type_traits/is_same.h>
34
+
35
+ CUB_NAMESPACE_BEGIN
36
+
37
+ //! @rst
38
+ //! BlockRadixSort class provides :ref:`collective <collective-primitives>` methods for sorting
39
+ //! items partitioned across a CUDA thread block using a radix sorting method.
40
+ //!
41
+ //! .. image:: ../../img/sorting_logo.png
42
+ //! :align: center
43
+ //!
44
+ //! Overview
45
+ //! --------------------------------------------------
46
+ //!
47
+ //! The `radix sorting method <http://en.wikipedia.org/wiki/Radix_sort>`_ arranges
48
+ //! items into ascending order. It relies upon a positional representation for
49
+ //! keys, i.e., each key is comprised of an ordered sequence of symbols (e.g., digits,
50
+ //! characters, etc.) specified from least-significant to most-significant. For a
51
+ //! given input sequence of keys and a set of rules specifying a total ordering
52
+ //! of the symbolic alphabet, the radix sorting method produces a lexicographic
53
+ //! ordering of those keys.
54
+ //!
55
+ //! @rowmajor
56
+ //!
57
+ //! Supported Types
58
+ //! --------------------------------------------------
59
+ //!
60
+ //! BlockRadixSort can sort all of the built-in C++ numeric primitive types
61
+ //! (``unsigned char``, ``int``, ``double``, etc.) as well as CUDA's ``__half``
62
+ //! half-precision floating-point type. User-defined types are supported as long
63
+ //! as a decomposer object is provided.
64
+ //!
65
+ //! Floating-Point Special Cases
66
+ //! --------------------------------------------------
67
+ //!
68
+ //! - Positive and negative zeros are considered equivalent, and will be treated
69
+ //! as such in the output.
70
+ //! - No special handling is implemented for NaN values; these are sorted
71
+ //! according to their bit representations after any transformations.
72
+ //!
73
+ //! Bitwise Key Transformations
74
+ //! --------------------------------------------------
75
+ //!
76
+ //! Although the direct radix sorting method can only be applied to unsigned
77
+ //! integral types, BlockRadixSort is able to sort signed and floating-point
78
+ //! types via simple bit-wise transformations that ensure lexicographic key
79
+ //! ordering.
80
+ //!
81
+ //! These transformations must be considered when restricting the
82
+ //! ``[begin_bit, end_bit)`` range, as the bitwise transformations will occur
83
+ //! before the bit-range truncation.
84
+ //!
85
+ //! Any transformations applied to the keys prior to sorting are reversed
86
+ //! while writing to the final output buffer.
87
+ //!
88
+ //! Type Specific Bitwise Transformations
89
+ //! --------------------------------------------------
90
+ //!
91
+ //! To convert the input values into a radix-sortable bitwise representation,
92
+ //! the following transformations take place prior to sorting:
93
+ //!
94
+ //! * For unsigned integral values, the keys are used directly.
95
+ //! * For signed integral values, the sign bit is inverted.
96
+ //! * For positive floating point values, the sign bit is inverted.
97
+ //! * For negative floating point values, the full key is inverted.
98
+ //!
99
+ //! No Descending Sort Transformations
100
+ //! --------------------------------------------------
101
+ //!
102
+ //! Unlike ``DeviceRadixSort``, ``BlockRadixSort`` does not invert the input key bits
103
+ //! when performing a descending sort. Instead, it has special logic to reverse
104
+ //! the order of the keys while sorting.
105
+ //!
106
+ //! Stability
107
+ //! --------------------------------------------------
108
+ //!
109
+ //! BlockRadixSort is stable. For floating-point types -0.0 and +0.0
110
+ //! are considered equal and appear in the result in the same order as they
111
+ //! appear in the input.
112
+ //!
113
+ //!
114
+ //! Performance Considerations
115
+ //! --------------------------------------------------
116
+ //!
117
+ //! * @granularity
118
+ //!
119
+ //! A Simple Example
120
+ //! --------------------------------------------------
121
+ //!
122
+ //! @blockcollective{BlockRadixSort}
123
+ //!
124
+ //! The code snippet below illustrates a sort of 512 integer keys that
125
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128 threads
126
+ //! where each thread owns 4 consecutive items.
127
+ //!
128
+ //! .. tab-set-code::
129
+ //!
130
+ //! .. code-block:: c++
131
+ //!
132
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_radix_sort.cuh>
133
+ //!
134
+ //! __global__ void kernel(...)
135
+ //! {
136
+ //! // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer items each
137
+ //! using BlockRadixSort = cub::BlockRadixSort<int, 128, 4>;
138
+ //!
139
+ //! // Allocate shared memory for BlockRadixSort
140
+ //! __shared__ typename BlockRadixSort::TempStorage temp_storage;
141
+ //!
142
+ //! // Obtain a segment of consecutive items that are blocked across threads
143
+ //! int thread_keys[4];
144
+ //! ...
145
+ //!
146
+ //! // Collectively sort the keys
147
+ //! BlockRadixSort(temp_storage).Sort(thread_keys);
148
+ //!
149
+ //! ...
150
+ //!
151
+ //! .. code-block:: python
152
+ //!
153
+ //! from cuda import coop
154
+ //! from pynvjitlink import patch
155
+ //! patch.patch_numba_linker(lto=True)
156
+ //!
157
+ //! # Specialize radix sort for a 1D block of 128 threads owning 4 integer items each
158
+ //! block_radix_sort = coop.block.radix_sort_keys(numba.int32, 128, 4)
159
+ //! temp_storage_bytes = block_radix_sort.temp_storage_bytes
160
+ //!
161
+ //! @cuda.jit(link=block_radix_sort.files)
162
+ //! def kernel():
163
+ //! # Allocate shared memory for radix sort
164
+ //! temp_storage = cuda.shared.array(shape=temp_storage_bytes, dtype='uint8')
165
+ //!
166
+ //! # Obtain a segment of consecutive items that are blocked across threads
167
+ //! thread_keys = cuda.local.array(shape=items_per_thread, dtype=numba.int32)
168
+ //! # ...
169
+ //!
170
+ //! # Collectively sort the keys
171
+ //! block_radix_sort(temp_storage, thread_keys)
172
+ //! # ...
173
+ //!
174
+ //! Suppose the set of input ``thread_keys`` across the block of threads is
175
+ //! ``{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }``.
176
+ //! The corresponding output ``thread_keys`` in those threads will be
177
+ //! ``{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }``.
178
+ //!
179
+ //! Re-using dynamically allocated shared memory
180
+ //! --------------------------------------------------
181
+ //!
182
+ //! The ``block/example_block_reduce_dyn_smem.cu`` example illustrates usage of dynamic shared memory with
183
+ //! BlockReduce and how to re-purpose the same memory region.
184
+ //!
185
+ //! This example can be easily adapted to the storage required by BlockRadixSort.
186
+ //! @endrst
187
+ //!
188
+ //! @tparam KeyT
189
+ //! KeyT type
190
+ //!
191
+ //! @tparam BlockDimX
192
+ //! The thread block length in threads along the X dimension
193
+ //!
194
+ //! @tparam ItemsPerThread
195
+ //! The number of items per thread
196
+ //!
197
+ //! @tparam ValueT
198
+ //! **[optional]** ValueT type (default: cub::NullType, which indicates a keys-only sort)
199
+ //!
200
+ //! @tparam RadixBits
201
+ //! **[optional]** The number of radix bits per digit place (default: 4 bits)
202
+ //!
203
+ //! @tparam MemoizeOuterScan
204
+ //! **[optional]** Whether or not to buffer outer raking scan partials to incur fewer shared memory
205
+ //! reads at the expense of higher register pressure
206
+ //! (default: true).
207
+ //!
208
+ //! @tparam InnerScanAlgorithm
209
+ //! **[optional]** The cub::BlockScanAlgorithm algorithm to use
210
+ //! (default: cub::BLOCK_SCAN_WARP_SCANS)
211
+ //!
212
+ //! @tparam SMemConfig
213
+ //! **[optional]** Shared memory bank mode (default: `cudaSharedMemBankSizeFourByte`)
214
+ //!
215
+ //! @tparam BlockDimY
216
+ //! **[optional]** The thread block length in threads along the Y dimension (default: 1)
217
+ //!
218
+ //! @tparam BlockDimZ
219
+ //! **[optional]** The thread block length in threads along the Z dimension (default: 1)
220
+ //!
221
+ template <typename KeyT,
222
+ int BlockDimX,
223
+ int ItemsPerThread,
224
+ typename ValueT = NullType,
225
+ int RadixBits = 4,
226
+ bool MemoizeOuterScan = true,
227
+ BlockScanAlgorithm InnerScanAlgorithm = BLOCK_SCAN_WARP_SCANS,
228
+ cudaSharedMemConfig SMemConfig = cudaSharedMemBankSizeFourByte,
229
+ int BlockDimY = 1,
230
+ int BlockDimZ = 1>
231
+ class BlockRadixSort
232
+ {
233
+ private:
234
+ /******************************************************************************
235
+ * Constants and type definitions
236
+ ******************************************************************************/
237
+
238
+ enum
239
+ {
240
+ // The thread block size in threads
241
+ BLOCK_THREADS = BlockDimX * BlockDimY * BlockDimZ,
242
+
243
+ // Whether or not there are values to be trucked along with keys
244
+ KEYS_ONLY = ::cuda::std::is_same_v<ValueT, NullType>,
245
+ };
246
+
247
+ // KeyT traits and unsigned bits type
248
+ using traits = detail::radix::traits_t<KeyT>;
249
+ using bit_ordered_type = typename traits::bit_ordered_type;
250
+ using bit_ordered_conversion = typename traits::bit_ordered_conversion_policy;
251
+
252
+ /// Ascending BlockRadixRank utility type
253
+ using AscendingBlockRadixRank =
254
+ BlockRadixRank<BlockDimX, RadixBits, false, MemoizeOuterScan, InnerScanAlgorithm, SMemConfig, BlockDimY, BlockDimZ>;
255
+
256
+ /// Descending BlockRadixRank utility type
257
+ using DescendingBlockRadixRank =
258
+ BlockRadixRank<BlockDimX, RadixBits, true, MemoizeOuterScan, InnerScanAlgorithm, SMemConfig, BlockDimY, BlockDimZ>;
259
+
260
+ /// Digit extractor type
261
+ using fundamental_digit_extractor_t = BFEDigitExtractor<KeyT>;
262
+
263
+ /// BlockExchange utility type for keys
264
+ using BlockExchangeKeys = BlockExchange<KeyT, BlockDimX, ItemsPerThread, false, BlockDimY, BlockDimZ>;
265
+
266
+ /// BlockExchange utility type for values
267
+ using BlockExchangeValues = BlockExchange<ValueT, BlockDimX, ItemsPerThread, false, BlockDimY, BlockDimZ>;
268
+
269
+ #ifndef _CCCL_DOXYGEN_INVOKED // Do not document
270
+ /// Shared memory storage layout type
271
+ union _TempStorage
272
+ {
273
+ typename AscendingBlockRadixRank::TempStorage asending_ranking_storage;
274
+ typename DescendingBlockRadixRank::TempStorage descending_ranking_storage;
275
+ typename BlockExchangeKeys::TempStorage exchange_keys;
276
+ typename BlockExchangeValues::TempStorage exchange_values;
277
+ };
278
+ #endif // _CCCL_DOXYGEN_INVOKED
279
+
280
+ /******************************************************************************
281
+ * Thread fields
282
+ ******************************************************************************/
283
+
284
+ /// Shared storage reference
285
+ _TempStorage& temp_storage;
286
+
287
+ /// Linear thread-id
288
+ unsigned int linear_tid;
289
+
290
+ /******************************************************************************
291
+ * Utility methods
292
+ ******************************************************************************/
293
+
294
+ /// Internal storage allocator
295
+ _CCCL_DEVICE _CCCL_FORCEINLINE _TempStorage& PrivateStorage()
296
+ {
297
+ __shared__ _TempStorage private_storage;
298
+ return private_storage;
299
+ }
300
+
301
+ /// Rank keys (specialized for ascending sort)
302
+ template <class DigitExtractorT>
303
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
304
+ RankKeys(bit_ordered_type (&unsigned_keys)[ItemsPerThread],
305
+ int (&ranks)[ItemsPerThread],
306
+ DigitExtractorT digit_extractor,
307
+ ::cuda::std::false_type /*is_descending*/)
308
+ {
309
+ AscendingBlockRadixRank(temp_storage.asending_ranking_storage).RankKeys(unsigned_keys, ranks, digit_extractor);
310
+ }
311
+
312
+ /// Rank keys (specialized for descending sort)
313
+ template <class DigitExtractorT>
314
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
315
+ RankKeys(bit_ordered_type (&unsigned_keys)[ItemsPerThread],
316
+ int (&ranks)[ItemsPerThread],
317
+ DigitExtractorT digit_extractor,
318
+ ::cuda::std::true_type /*is_descending*/)
319
+ {
320
+ DescendingBlockRadixRank(temp_storage.descending_ranking_storage).RankKeys(unsigned_keys, ranks, digit_extractor);
321
+ }
322
+
323
+ /// ExchangeValues (specialized for key-value sort, to-blocked arrangement)
324
+ _CCCL_DEVICE _CCCL_FORCEINLINE void ExchangeValues(
325
+ ValueT (&values)[ItemsPerThread],
326
+ int (&ranks)[ItemsPerThread],
327
+ ::cuda::std::false_type /*is_keys_only*/,
328
+ ::cuda::std::true_type /*is_blocked*/)
329
+ {
330
+ __syncthreads();
331
+
332
+ // Exchange values through shared memory in blocked arrangement
333
+ BlockExchangeValues(temp_storage.exchange_values).ScatterToBlocked(values, ranks);
334
+ }
335
+
336
+ /// ExchangeValues (specialized for key-value sort, to-striped arrangement)
337
+ _CCCL_DEVICE _CCCL_FORCEINLINE void ExchangeValues(
338
+ ValueT (&values)[ItemsPerThread],
339
+ int (&ranks)[ItemsPerThread],
340
+ ::cuda::std::false_type /*is_keys_only*/,
341
+ ::cuda::std::false_type /*is_blocked*/)
342
+ {
343
+ __syncthreads();
344
+
345
+ // Exchange values through shared memory in striped arrangement
346
+ BlockExchangeValues(temp_storage.exchange_values).ScatterToStriped(values, ranks);
347
+ }
348
+
349
+ /// ExchangeValues (specialized for keys-only sort)
350
+ template <bool IS_BLOCKED>
351
+ _CCCL_DEVICE _CCCL_FORCEINLINE void ExchangeValues(
352
+ ValueT (& /*values*/)[ItemsPerThread],
353
+ int (& /*ranks*/)[ItemsPerThread],
354
+ ::cuda::std::true_type /*is_keys_only*/,
355
+ ::cuda::std::bool_constant<IS_BLOCKED> /*is_blocked*/)
356
+ {}
357
+
358
+ /**
359
+ * @brief Sort blocked arrangement
360
+ *
361
+ * @param keys
362
+ * Keys to sort
363
+ *
364
+ * @param values
365
+ * Values to sort
366
+ *
367
+ * @param begin_bit
368
+ * The beginning (least-significant) bit index needed for key comparison
369
+ *
370
+ * @param end_bit
371
+ * The past-the-end (most-significant) bit index needed for key comparison
372
+ *
373
+ * @param is_descending
374
+ * Tag whether is a descending-order sort
375
+ *
376
+ * @param is_keys_only
377
+ * Tag whether is keys-only sort
378
+ */
379
+ template <bool DESCENDING, bool KEYS_ONLY, class DecomposerT = detail::identity_decomposer_t>
380
+ _CCCL_DEVICE _CCCL_FORCEINLINE void SortBlocked(
381
+ KeyT (&keys)[ItemsPerThread],
382
+ ValueT (&values)[ItemsPerThread],
383
+ int begin_bit,
384
+ int end_bit,
385
+ ::cuda::std::bool_constant<DESCENDING> is_descending,
386
+ ::cuda::std::bool_constant<KEYS_ONLY> is_keys_only,
387
+ DecomposerT decomposer = {})
388
+ {
389
+ bit_ordered_type(&unsigned_keys)[ItemsPerThread] = reinterpret_cast<bit_ordered_type(&)[ItemsPerThread]>(keys);
390
+
391
+ _CCCL_PRAGMA_UNROLL_FULL()
392
+ for (int KEY = 0; KEY < ItemsPerThread; KEY++)
393
+ {
394
+ unsigned_keys[KEY] = bit_ordered_conversion::to_bit_ordered(decomposer, unsigned_keys[KEY]);
395
+ }
396
+
397
+ // Radix sorting passes
398
+ while (true)
399
+ {
400
+ int pass_bits = ::cuda::std::min(RadixBits, end_bit - begin_bit);
401
+ auto digit_extractor =
402
+ traits::template digit_extractor<fundamental_digit_extractor_t>(begin_bit, pass_bits, decomposer);
403
+
404
+ // Rank the blocked keys
405
+ int ranks[ItemsPerThread];
406
+ RankKeys(unsigned_keys, ranks, digit_extractor, is_descending);
407
+ begin_bit += RadixBits;
408
+
409
+ __syncthreads();
410
+
411
+ // Exchange keys through shared memory in blocked arrangement
412
+ BlockExchangeKeys(temp_storage.exchange_keys).ScatterToBlocked(keys, ranks);
413
+
414
+ // Exchange values through shared memory in blocked arrangement
415
+ ExchangeValues(values, ranks, is_keys_only, ::cuda::std::true_type());
416
+
417
+ // Quit if done
418
+ if (begin_bit >= end_bit)
419
+ {
420
+ break;
421
+ }
422
+
423
+ __syncthreads();
424
+ }
425
+
426
+ // Untwiddle bits if necessary
427
+ _CCCL_PRAGMA_UNROLL_FULL()
428
+ for (int KEY = 0; KEY < ItemsPerThread; KEY++)
429
+ {
430
+ unsigned_keys[KEY] = bit_ordered_conversion::from_bit_ordered(decomposer, unsigned_keys[KEY]);
431
+ }
432
+ }
433
+
434
+ public:
435
+ #ifndef _CCCL_DOXYGEN_INVOKED // Do not document
436
+
437
+ /**
438
+ * @brief Sort blocked -> striped arrangement
439
+ *
440
+ * @param keys
441
+ * Keys to sort
442
+ *
443
+ * @param values
444
+ * Values to sort
445
+ *
446
+ * @param begin_bit
447
+ * The beginning (least-significant) bit index needed for key comparison
448
+ *
449
+ * @param end_bit
450
+ * The past-the-end (most-significant) bit index needed for key comparison
451
+ *
452
+ * @param is_descending
453
+ * Tag whether is a descending-order sort
454
+ *
455
+ * @param is_keys_only
456
+ * Tag whether is keys-only sort
457
+ */
458
+ template <bool DESCENDING, bool KEYS_ONLY, class DecomposerT = detail::identity_decomposer_t>
459
+ _CCCL_DEVICE _CCCL_FORCEINLINE void SortBlockedToStriped(
460
+ KeyT (&keys)[ItemsPerThread],
461
+ ValueT (&values)[ItemsPerThread],
462
+ int begin_bit,
463
+ int end_bit,
464
+ ::cuda::std::bool_constant<DESCENDING> is_descending,
465
+ ::cuda::std::bool_constant<KEYS_ONLY> is_keys_only,
466
+ DecomposerT decomposer = {})
467
+ {
468
+ bit_ordered_type(&unsigned_keys)[ItemsPerThread] = reinterpret_cast<bit_ordered_type(&)[ItemsPerThread]>(keys);
469
+
470
+ _CCCL_PRAGMA_UNROLL_FULL()
471
+ for (int KEY = 0; KEY < ItemsPerThread; KEY++)
472
+ {
473
+ unsigned_keys[KEY] = bit_ordered_conversion::to_bit_ordered(decomposer, unsigned_keys[KEY]);
474
+ }
475
+
476
+ // Radix sorting passes
477
+ while (true)
478
+ {
479
+ int pass_bits = ::cuda::std::min(RadixBits, end_bit - begin_bit);
480
+ auto digit_extractor =
481
+ traits::template digit_extractor<fundamental_digit_extractor_t>(begin_bit, pass_bits, decomposer);
482
+
483
+ // Rank the blocked keys
484
+ int ranks[ItemsPerThread];
485
+ RankKeys(unsigned_keys, ranks, digit_extractor, is_descending);
486
+ begin_bit += RadixBits;
487
+
488
+ __syncthreads();
489
+
490
+ // Check if this is the last pass
491
+ if (begin_bit >= end_bit)
492
+ {
493
+ // Last pass exchanges keys through shared memory in striped arrangement
494
+ BlockExchangeKeys(temp_storage.exchange_keys).ScatterToStriped(keys, ranks);
495
+
496
+ // Last pass exchanges through shared memory in striped arrangement
497
+ ExchangeValues(values, ranks, is_keys_only, ::cuda::std::false_type());
498
+
499
+ // Quit
500
+ break;
501
+ }
502
+
503
+ // Exchange keys through shared memory in blocked arrangement
504
+ BlockExchangeKeys(temp_storage.exchange_keys).ScatterToBlocked(keys, ranks);
505
+
506
+ // Exchange values through shared memory in blocked arrangement
507
+ ExchangeValues(values, ranks, is_keys_only, ::cuda::std::true_type());
508
+
509
+ __syncthreads();
510
+ }
511
+
512
+ // Untwiddle bits if necessary
513
+ _CCCL_PRAGMA_UNROLL_FULL()
514
+ for (int KEY = 0; KEY < ItemsPerThread; KEY++)
515
+ {
516
+ unsigned_keys[KEY] = bit_ordered_conversion::from_bit_ordered(decomposer, unsigned_keys[KEY]);
517
+ }
518
+ }
519
+
520
+ #endif // _CCCL_DOXYGEN_INVOKED
521
+
522
+ /// @smemstorage{BlockRadixSort}
523
+ struct TempStorage : Uninitialized<_TempStorage>
524
+ {};
525
+
526
+ //! @name Collective constructors
527
+ //! @{
528
+
529
+ //! @brief Collective constructor using a private static allocation of shared memory as temporary storage.
530
+ _CCCL_DEVICE _CCCL_FORCEINLINE BlockRadixSort()
531
+ : temp_storage(PrivateStorage())
532
+ , linear_tid(RowMajorTid(BlockDimX, BlockDimY, BlockDimZ))
533
+ {}
534
+
535
+ /**
536
+ * @brief Collective constructor using the specified memory allocation as temporary storage.
537
+ *
538
+ * @param[in] temp_storage
539
+ * Reference to memory allocation having layout type TempStorage
540
+ */
541
+ _CCCL_DEVICE _CCCL_FORCEINLINE BlockRadixSort(TempStorage& temp_storage)
542
+ : temp_storage(temp_storage.Alias())
543
+ , linear_tid(RowMajorTid(BlockDimX, BlockDimY, BlockDimZ))
544
+ {}
545
+
546
+ //! @} end member group
547
+ //! @name Sorting (blocked arrangements)
548
+ //! @{
549
+
550
+ //! @rst
551
+ //! Performs an ascending block-wide radix sort over a
552
+ //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys.
553
+ //!
554
+ //! - @granularity
555
+ //! - @smemreuse
556
+ //!
557
+ //! Snippet
558
+ //! +++++++
559
+ //!
560
+ //! The code snippet below illustrates a sort of 512 integer keys that
561
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128 threads
562
+ //! where each thread owns 4 consecutive keys.
563
+ //!
564
+ //! .. code-block:: c++
565
+ //!
566
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_radix_sort.cuh>
567
+ //!
568
+ //! __global__ void ExampleKernel(...)
569
+ //! {
570
+ //! // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys each
571
+ //! using BlockRadixSort = cub::BlockRadixSort<int, 128, 4>;
572
+ //!
573
+ //! // Allocate shared memory for BlockRadixSort
574
+ //! __shared__ typename BlockRadixSort::TempStorage temp_storage;
575
+ //!
576
+ //! // Obtain a segment of consecutive items that are blocked across threads
577
+ //! int thread_keys[4];
578
+ //! ...
579
+ //!
580
+ //! // Collectively sort the keys
581
+ //! BlockRadixSort(temp_storage).Sort(thread_keys);
582
+ //!
583
+ //! Suppose the set of input ``thread_keys`` across the block of threads is
584
+ //! ``{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }``.
585
+ //! The corresponding output ``thread_keys`` in those threads will be
586
+ //! ``{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }``.
587
+ //! @endrst
588
+ //!
589
+ //! @param[in,out] keys
590
+ //! Keys to sort
591
+ //!
592
+ //! @param[in] begin_bit
593
+ //! **[optional]** The beginning (least-significant) bit index needed for key comparison
594
+ //!
595
+ //! @param[in] end_bit
596
+ //! **[optional]** The past-the-end (most-significant) bit index needed for key comparison
597
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
598
+ Sort(KeyT (&keys)[ItemsPerThread], int begin_bit = 0, int end_bit = sizeof(KeyT) * 8)
599
+ {
600
+ NullType values[ItemsPerThread];
601
+
602
+ SortBlocked(keys, values, begin_bit, end_bit, ::cuda::std::false_type(), detail::bool_constant_v<KEYS_ONLY>);
603
+ }
604
+
605
+ //! @rst
606
+ //! Performs an ascending block-wide radix sort over a
607
+ //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys.
608
+ //!
609
+ //! * @granularity
610
+ //! * @smemreuse
611
+ //!
612
+ //! Snippet
613
+ //! ==========================================================================
614
+ //!
615
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
616
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
617
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
618
+ //! tuple of references to relevant members of the key.
619
+ //!
620
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
621
+ //! :language: c++
622
+ //! :dedent:
623
+ //! :start-after: example-begin custom-type
624
+ //! :end-before: example-end custom-type
625
+ //!
626
+ //! The code snippet below illustrates a sort of 2 keys that
627
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 2 threads
628
+ //! where each thread owns 1 key.
629
+ //!
630
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
631
+ //! :language: c++
632
+ //! :dedent:
633
+ //! :start-after: example-begin keys-bits
634
+ //! :end-before: example-end keys-bits
635
+ //!
636
+ //! @endrst
637
+ //!
638
+ //! @tparam DecomposerT
639
+ //! **[inferred]** Type of a callable object responsible for decomposing a
640
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
641
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
642
+ //! The leftmost element of the tuple is considered the most significant.
643
+ //! The call operator must not modify members of the key.
644
+ //!
645
+ //! @param[in,out] keys
646
+ //! Keys to sort
647
+ //!
648
+ //! @param decomposer
649
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
650
+ //! references to its constituent arithmetic types. The leftmost element of
651
+ //! the tuple is considered the most significant. The call operator must not
652
+ //! modify members of the key.
653
+ //!
654
+ //! @param[in] begin_bit
655
+ //! The least-significant bit index (inclusive) needed for
656
+ //! key comparison
657
+ //!
658
+ //! @param[in] end_bit
659
+ //! The most-significant bit index (exclusive) needed for key
660
+ //! comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`)
661
+ template <class DecomposerT>
662
+ _CCCL_DEVICE _CCCL_FORCEINLINE //
663
+ ::cuda::std::enable_if_t< //
664
+ !::cuda::std::is_convertible_v<DecomposerT, int>>
665
+ Sort(KeyT (&keys)[ItemsPerThread], DecomposerT decomposer, int begin_bit, int end_bit)
666
+ {
667
+ NullType values[ItemsPerThread];
668
+
669
+ SortBlocked(
670
+ keys, values, begin_bit, end_bit, ::cuda::std::false_type(), detail::bool_constant_v<KEYS_ONLY>, decomposer);
671
+ }
672
+
673
+ //! @rst
674
+ //! Performs an ascending block-wide radix sort over a
675
+ //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys.
676
+ //!
677
+ //! * @granularity
678
+ //! * @smemreuse
679
+ //!
680
+ //! Snippet
681
+ //! ==========================================================================
682
+ //!
683
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
684
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
685
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
686
+ //! tuple of references to relevant members of the key.
687
+ //!
688
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
689
+ //! :language: c++
690
+ //! :dedent:
691
+ //! :start-after: example-begin custom-type
692
+ //! :end-before: example-end custom-type
693
+ //!
694
+ //! The code snippet below illustrates a sort of 6 keys that
695
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 2 threads
696
+ //! where each thread owns 3 consecutive keys.
697
+ //!
698
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
699
+ //! :language: c++
700
+ //! :dedent:
701
+ //! :start-after: example-begin keys
702
+ //! :end-before: example-end keys
703
+ //!
704
+ //! @endrst
705
+ //!
706
+ //! @tparam DecomposerT
707
+ //! **[inferred]** Type of a callable object responsible for decomposing a
708
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
709
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
710
+ //! The leftmost element of the tuple is considered the most significant.
711
+ //! The call operator must not modify members of the key.
712
+ //!
713
+ //! @param[in,out] keys
714
+ //! Keys to sort
715
+ //!
716
+ //! @param decomposer
717
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
718
+ //! references to its constituent arithmetic types. The leftmost element of
719
+ //! the tuple is considered the most significant. The call operator must not
720
+ //! modify members of the key.
721
+ template <class DecomposerT>
722
+ _CCCL_DEVICE _CCCL_FORCEINLINE //
723
+ ::cuda::std::enable_if_t< //
724
+ !::cuda::std::is_convertible_v<DecomposerT, int>>
725
+ Sort(KeyT (&keys)[ItemsPerThread], DecomposerT decomposer)
726
+ {
727
+ Sort(keys, decomposer, 0, detail::radix::traits_t<KeyT>::default_end_bit(decomposer));
728
+ }
729
+
730
+ //! @rst
731
+ //! Performs an ascending block-wide radix sort across a :ref:`blocked arrangement <flexible-data-arrangement>`
732
+ //! of keys and values.
733
+ //!
734
+ //! - BlockRadixSort can only accommodate one associated tile of values. To "truck along"
735
+ //! more than one tile of values, simply perform a key-value sort of the keys paired
736
+ //! with a temporary value array that enumerates the key indices. The reordered indices
737
+ //! can then be used as a gather-vector for exchanging other associated tile data through
738
+ //! shared memory.
739
+ //! - @granularity
740
+ //! - @smemreuse
741
+ //!
742
+ //! Snippet
743
+ //! +++++++
744
+ //!
745
+ //! The code snippet below illustrates a sort of 512 integer keys and values that
746
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128 threads
747
+ //! where each thread owns 4 consecutive pairs.
748
+ //!
749
+ //! .. code-block:: c++
750
+ //!
751
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_radix_sort.cuh>
752
+ //!
753
+ //! __global__ void ExampleKernel(...)
754
+ //! {
755
+ //! // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and values each
756
+ //! using BlockRadixSort = cub::BlockRadixSort<int, 128, 4, int>;
757
+ //!
758
+ //! // Allocate shared memory for BlockRadixSort
759
+ //! __shared__ typename BlockRadixSort::TempStorage temp_storage;
760
+ //!
761
+ //! // Obtain a segment of consecutive items that are blocked across threads
762
+ //! int thread_keys[4];
763
+ //! int thread_values[4];
764
+ //! ...
765
+ //!
766
+ //! // Collectively sort the keys and values among block threads
767
+ //! BlockRadixSort(temp_storage).Sort(thread_keys, thread_values);
768
+ //!
769
+ //!
770
+ //!
771
+ //! Suppose the set of input ``thread_keys`` across the block of threads is
772
+ //! ``{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }``. The
773
+ //! corresponding output ``thread_keys`` in those threads will be
774
+ //! ``{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }``.
775
+ //!
776
+ //! @endrst
777
+ //!
778
+ //! @param[in,out] keys
779
+ //! Keys to sort
780
+ //!
781
+ //! @param[in,out] values
782
+ //! Values to sort
783
+ //!
784
+ //! @param[in] begin_bit
785
+ //! **[optional]** The beginning (least-significant) bit index needed for key comparison
786
+ //!
787
+ //! @param[in] end_bit
788
+ //! **[optional]** The past-the-end (most-significant) bit index needed for key comparison
789
+ _CCCL_DEVICE _CCCL_FORCEINLINE void Sort(
790
+ KeyT (&keys)[ItemsPerThread], ValueT (&values)[ItemsPerThread], int begin_bit = 0, int end_bit = sizeof(KeyT) * 8)
791
+ {
792
+ SortBlocked(keys, values, begin_bit, end_bit, ::cuda::std::false_type(), detail::bool_constant_v<KEYS_ONLY>);
793
+ }
794
+
795
+ //! @rst
796
+ //! Performs an ascending block-wide radix sort over a
797
+ //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys and values.
798
+ //!
799
+ //! * BlockRadixSort can only accommodate one associated tile of values. To "truck along"
800
+ //! more than one tile of values, simply perform a key-value sort of the keys paired
801
+ //! with a temporary value array that enumerates the key indices. The reordered indices
802
+ //! can then be used as a gather-vector for exchanging other associated tile data through
803
+ //! shared memory.
804
+ //! * @granularity
805
+ //! * @smemreuse
806
+ //!
807
+ //! Snippet
808
+ //! ==========================================================================
809
+ //!
810
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
811
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
812
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
813
+ //! tuple of references to relevant members of the key.
814
+ //!
815
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
816
+ //! :language: c++
817
+ //! :dedent:
818
+ //! :start-after: example-begin custom-type
819
+ //! :end-before: example-end custom-type
820
+ //!
821
+ //! The code snippet below illustrates a sort of 2 keys and values that
822
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 2 threads
823
+ //! where each thread owns 1 pair.
824
+ //!
825
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
826
+ //! :language: c++
827
+ //! :dedent:
828
+ //! :start-after: example-begin pairs-bits
829
+ //! :end-before: example-end pairs-bits
830
+ //!
831
+ //! @endrst
832
+ //!
833
+ //! @tparam DecomposerT
834
+ //! **[inferred]** Type of a callable object responsible for decomposing a
835
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
836
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
837
+ //! The leftmost element of the tuple is considered the most significant.
838
+ //! The call operator must not modify members of the key.
839
+ //!
840
+ //! @param[in,out] keys
841
+ //! Keys to sort
842
+ //!
843
+ //! @param[in,out] values
844
+ //! Values to sort
845
+ //!
846
+ //! @param decomposer
847
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
848
+ //! references to its constituent arithmetic types. The leftmost element of
849
+ //! the tuple is considered the most significant. The call operator must not
850
+ //! modify members of the key.
851
+ //!
852
+ //! @param[in] begin_bit
853
+ //! The least-significant bit index (inclusive) needed for
854
+ //! key comparison
855
+ //!
856
+ //! @param[in] end_bit
857
+ //! The most-significant bit index (exclusive) needed for key
858
+ //! comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`)
859
+ template <class DecomposerT>
860
+ _CCCL_DEVICE _CCCL_FORCEINLINE //
861
+ ::cuda::std::enable_if_t< //
862
+ !::cuda::std::is_convertible_v<DecomposerT, int>>
863
+ Sort(
864
+ KeyT (&keys)[ItemsPerThread], ValueT (&values)[ItemsPerThread], DecomposerT decomposer, int begin_bit, int end_bit)
865
+ {
866
+ SortBlocked(
867
+ keys, values, begin_bit, end_bit, ::cuda::std::false_type(), detail::bool_constant_v<KEYS_ONLY>, decomposer);
868
+ }
869
+
870
+ //! @rst
871
+ //! Performs an ascending block-wide radix sort over a
872
+ //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys and values.
873
+ //!
874
+ //! * BlockRadixSort can only accommodate one associated tile of values. To "truck along"
875
+ //! more than one tile of values, simply perform a key-value sort of the keys paired
876
+ //! with a temporary value array that enumerates the key indices. The reordered indices
877
+ //! can then be used as a gather-vector for exchanging other associated tile data through
878
+ //! shared memory.
879
+ //! * @granularity
880
+ //! * @smemreuse
881
+ //!
882
+ //! Snippet
883
+ //! ==========================================================================
884
+ //!
885
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
886
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
887
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
888
+ //! tuple of references to relevant members of the key.
889
+ //!
890
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
891
+ //! :language: c++
892
+ //! :dedent:
893
+ //! :start-after: example-begin custom-type
894
+ //! :end-before: example-end custom-type
895
+ //!
896
+ //! The code snippet below illustrates a sort of 6 keys and values that
897
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 2 threads
898
+ //! where each thread owns 3 consecutive pairs.
899
+ //!
900
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
901
+ //! :language: c++
902
+ //! :dedent:
903
+ //! :start-after: example-begin pairs
904
+ //! :end-before: example-end pairs
905
+ //!
906
+ //! @endrst
907
+ //!
908
+ //! @tparam DecomposerT
909
+ //! **[inferred]** Type of a callable object responsible for decomposing a
910
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
911
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
912
+ //! The leftmost element of the tuple is considered the most significant.
913
+ //! The call operator must not modify members of the key.
914
+ //!
915
+ //! @param[in,out] keys
916
+ //! Keys to sort
917
+ //!
918
+ //! @param[in,out] values
919
+ //! Values to sort
920
+ //!
921
+ //! @param decomposer
922
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
923
+ //! references to its constituent arithmetic types. The leftmost element of
924
+ //! the tuple is considered the most significant. The call operator must not
925
+ //! modify members of the key.
926
+ template <class DecomposerT>
927
+ _CCCL_DEVICE _CCCL_FORCEINLINE //
928
+ ::cuda::std::enable_if_t< //
929
+ !::cuda::std::is_convertible_v<DecomposerT, int>>
930
+ Sort(KeyT (&keys)[ItemsPerThread], ValueT (&values)[ItemsPerThread], DecomposerT decomposer)
931
+ {
932
+ Sort(keys, values, decomposer, 0, detail::radix::traits_t<KeyT>::default_end_bit(decomposer));
933
+ }
934
+
935
+ //! @rst
936
+ //! Performs a descending block-wide radix sort over a :ref:`blocked arrangement <flexible-data-arrangement>`
937
+ //! of keys.
938
+ //!
939
+ //! - @granularity
940
+ //! - @smemreuse
941
+ //!
942
+ //! Snippet
943
+ //! +++++++
944
+ //!
945
+ //! The code snippet below illustrates a sort of 512 integer keys that
946
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128 threads
947
+ //! where each thread owns 4 consecutive keys.
948
+ //!
949
+ //! .. code-block:: c++
950
+ //!
951
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_radix_sort.cuh>
952
+ //!
953
+ //! __global__ void ExampleKernel(...)
954
+ //! {
955
+ //! // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys each
956
+ //! using BlockRadixSort = cub::BlockRadixSort<int, 128, 4>;
957
+ //!
958
+ //! // Allocate shared memory for BlockRadixSort
959
+ //! __shared__ typename BlockRadixSort::TempStorage temp_storage;
960
+ //!
961
+ //! // Obtain a segment of consecutive items that are blocked across threads
962
+ //! int thread_keys[4];
963
+ //! ...
964
+ //!
965
+ //! // Collectively sort the keys
966
+ //! BlockRadixSort(temp_storage).SortDescending(thread_keys);
967
+ //!
968
+ //! Suppose the set of input ``thread_keys`` across the block of threads is
969
+ //! ``{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }``.
970
+ //! The corresponding output ``thread_keys`` in those threads will be
971
+ //! ``{ [511,510,509,508], [507,506,505,504], [503,502,501,500], ..., [3,2,1,0] }``.
972
+ //!
973
+ //! @endrst
974
+ //!
975
+ //! @param[in,out] keys
976
+ //! Keys to sort
977
+ //!
978
+ //! @param[in] begin_bit
979
+ //! **[optional]** The beginning (least-significant) bit index needed for key comparison
980
+ //!
981
+ //! @param[in] end_bit
982
+ //! **[optional]** The past-the-end (most-significant) bit index needed for key comparison
983
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
984
+ SortDescending(KeyT (&keys)[ItemsPerThread], int begin_bit = 0, int end_bit = sizeof(KeyT) * 8)
985
+ {
986
+ NullType values[ItemsPerThread];
987
+
988
+ SortBlocked(keys, values, begin_bit, end_bit, ::cuda::std::true_type(), detail::bool_constant_v<KEYS_ONLY>);
989
+ }
990
+
991
+ //! @rst
992
+ //! Performs a descending block-wide radix sort over a
993
+ //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys.
994
+ //!
995
+ //! * @granularity
996
+ //! * @smemreuse
997
+ //!
998
+ //! Snippet
999
+ //! ==========================================================================
1000
+ //!
1001
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
1002
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
1003
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
1004
+ //! tuple of references to relevant members of the key.
1005
+ //!
1006
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
1007
+ //! :language: c++
1008
+ //! :dedent:
1009
+ //! :start-after: example-begin custom-type
1010
+ //! :end-before: example-end custom-type
1011
+ //!
1012
+ //! The code snippet below illustrates a sort of 2 keys that
1013
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 2 threads
1014
+ //! where each thread owns 1 key.
1015
+ //!
1016
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
1017
+ //! :language: c++
1018
+ //! :dedent:
1019
+ //! :start-after: example-begin keys-descending-bits
1020
+ //! :end-before: example-end keys-descending-bits
1021
+ //!
1022
+ //! @endrst
1023
+ //!
1024
+ //! @tparam DecomposerT
1025
+ //! **[inferred]** Type of a callable object responsible for decomposing a
1026
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
1027
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
1028
+ //! The leftmost element of the tuple is considered the most significant.
1029
+ //! The call operator must not modify members of the key.
1030
+ //!
1031
+ //! @param[in,out] keys
1032
+ //! Keys to sort
1033
+ //!
1034
+ //! @param decomposer
1035
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
1036
+ //! references to its constituent arithmetic types. The leftmost element of
1037
+ //! the tuple is considered the most significant. The call operator must not
1038
+ //! modify members of the key.
1039
+ //!
1040
+ //! @param[in] begin_bit
1041
+ //! The least-significant bit index (inclusive) needed for
1042
+ //! key comparison
1043
+ //!
1044
+ //! @param[in] end_bit
1045
+ //! The most-significant bit index (exclusive) needed for key
1046
+ //! comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`)
1047
+ template <class DecomposerT>
1048
+ _CCCL_DEVICE _CCCL_FORCEINLINE //
1049
+ ::cuda::std::enable_if_t< //
1050
+ !::cuda::std::is_convertible_v<DecomposerT, int>>
1051
+ SortDescending(KeyT (&keys)[ItemsPerThread], DecomposerT decomposer, int begin_bit, int end_bit)
1052
+ {
1053
+ NullType values[ItemsPerThread];
1054
+
1055
+ SortBlocked(
1056
+ keys, values, begin_bit, end_bit, ::cuda::std::true_type(), detail::bool_constant_v<KEYS_ONLY>, decomposer);
1057
+ }
1058
+
1059
+ //! @rst
1060
+ //! Performs a descending block-wide radix sort over a
1061
+ //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys.
1062
+ //!
1063
+ //! * @granularity
1064
+ //! * @smemreuse
1065
+ //!
1066
+ //! Snippet
1067
+ //! ==========================================================================
1068
+ //!
1069
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
1070
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
1071
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
1072
+ //! tuple of references to relevant members of the key.
1073
+ //!
1074
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
1075
+ //! :language: c++
1076
+ //! :dedent:
1077
+ //! :start-after: example-begin custom-type
1078
+ //! :end-before: example-end custom-type
1079
+ //!
1080
+ //! The code snippet below illustrates a sort of 6 keys that
1081
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 2 threads
1082
+ //! where each thread owns 3 consecutive keys.
1083
+ //!
1084
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
1085
+ //! :language: c++
1086
+ //! :dedent:
1087
+ //! :start-after: example-begin keys-descending
1088
+ //! :end-before: example-end keys-descending
1089
+ //!
1090
+ //! @endrst
1091
+ //!
1092
+ //! @tparam DecomposerT
1093
+ //! **[inferred]** Type of a callable object responsible for decomposing a
1094
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
1095
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
1096
+ //! The leftmost element of the tuple is considered the most significant.
1097
+ //! The call operator must not modify members of the key.
1098
+ //!
1099
+ //! @param[in,out] keys
1100
+ //! Keys to sort
1101
+ //!
1102
+ //! @param decomposer
1103
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
1104
+ //! references to its constituent arithmetic types. The leftmost element of
1105
+ //! the tuple is considered the most significant. The call operator must not
1106
+ //! modify members of the key.
1107
+ template <class DecomposerT>
1108
+ _CCCL_DEVICE _CCCL_FORCEINLINE //
1109
+ ::cuda::std::enable_if_t< //
1110
+ !::cuda::std::is_convertible_v<DecomposerT, int>>
1111
+ SortDescending(KeyT (&keys)[ItemsPerThread], DecomposerT decomposer)
1112
+ {
1113
+ NullType values[ItemsPerThread];
1114
+
1115
+ SortBlocked(
1116
+ keys,
1117
+ values,
1118
+ 0,
1119
+ detail::radix::traits_t<KeyT>::default_end_bit(decomposer),
1120
+ ::cuda::std::true_type(),
1121
+ detail::bool_constant_v<KEYS_ONLY>,
1122
+ decomposer);
1123
+ }
1124
+
1125
+ //! @rst
1126
+ //! Performs a descending block-wide radix sort across a :ref:`blocked arrangement <flexible-data-arrangement>`
1127
+ //! of keys and values.
1128
+ //!
1129
+ //! - BlockRadixSort can only accommodate one associated tile of values. To "truck along"
1130
+ //! more than one tile of values, simply perform a key-value sort of the keys paired
1131
+ //! with a temporary value array that enumerates the key indices. The reordered indices
1132
+ //! can then be used as a gather-vector for exchanging other associated tile data through
1133
+ //! shared memory.
1134
+ //! - @granularity
1135
+ //! - @smemreuse
1136
+ //!
1137
+ //! Snippet
1138
+ //! +++++++
1139
+ //!
1140
+ //! The code snippet below illustrates a sort of 512 integer keys and values that
1141
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128 threads
1142
+ //! where each thread owns 4 consecutive pairs.
1143
+ //!
1144
+ //! .. code-block:: c++
1145
+ //!
1146
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_radix_sort.cuh>
1147
+ //!
1148
+ //! __global__ void ExampleKernel(...)
1149
+ //! {
1150
+ //! // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and values each
1151
+ //! using BlockRadixSort = cub::BlockRadixSort<int, 128, 4, int>;
1152
+ //!
1153
+ //! // Allocate shared memory for BlockRadixSort
1154
+ //! __shared__ typename BlockRadixSort::TempStorage temp_storage;
1155
+ //!
1156
+ //! // Obtain a segment of consecutive items that are blocked across threads
1157
+ //! int thread_keys[4];
1158
+ //! int thread_values[4];
1159
+ //! ...
1160
+ //!
1161
+ //! // Collectively sort the keys and values among block threads
1162
+ //! BlockRadixSort(temp_storage).SortDescending(thread_keys, thread_values);
1163
+ //!
1164
+ //! Suppose the set of input ``thread_keys`` across the block of threads is
1165
+ //! ``{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }``. The
1166
+ //! corresponding output ``thread_keys`` in those threads will be
1167
+ //! ``{ [511,510,509,508], [507,506,505,504], [503,502,501,500], ..., [3,2,1,0] }``.
1168
+ //!
1169
+ //! @endrst
1170
+ //!
1171
+ //! @param[in,out] keys
1172
+ //! Keys to sort
1173
+ //!
1174
+ //! @param[in,out] values
1175
+ //! Values to sort
1176
+ //!
1177
+ //! @param[in] begin_bit
1178
+ //! **[optional]** The beginning (least-significant) bit index needed for key comparison
1179
+ //!
1180
+ //! @param[in] end_bit
1181
+ //! **[optional]** The past-the-end (most-significant) bit index needed for key comparison
1182
+ _CCCL_DEVICE _CCCL_FORCEINLINE void SortDescending(
1183
+ KeyT (&keys)[ItemsPerThread], ValueT (&values)[ItemsPerThread], int begin_bit = 0, int end_bit = sizeof(KeyT) * 8) // defaults cover every bit of KeyT
1184
+ {
1185
+ SortBlocked(keys, values, begin_bit, end_bit, ::cuda::std::true_type(), detail::bool_constant_v<KEYS_ONLY>); // true_type tag selects descending order (the ascending wrappers pass false_type)
1186
+ }
1187
+
1188
+ //! @rst
1189
+ //! Performs a descending block-wide radix sort over a
1190
+ //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys and values.
1191
+ //!
1192
+ //! * BlockRadixSort can only accommodate one associated tile of values. To "truck along"
1193
+ //! more than one tile of values, simply perform a key-value sort of the keys paired
1194
+ //! with a temporary value array that enumerates the key indices. The reordered indices
1195
+ //! can then be used as a gather-vector for exchanging other associated tile data through
1196
+ //! shared memory.
1197
+ //! * @granularity
1198
+ //! * @smemreuse
1199
+ //!
1200
+ //! Snippet
1201
+ //! ==========================================================================
1202
+ //!
1203
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
1204
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
1205
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
1206
+ //! tuple of references to relevant members of the key.
1207
+ //!
1208
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
1209
+ //! :language: c++
1210
+ //! :dedent:
1211
+ //! :start-after: example-begin custom-type
1212
+ //! :end-before: example-end custom-type
1213
+ //!
1214
+ //! The code snippet below illustrates a sort of 2 pairs that
1215
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 2 threads
1216
+ //! where each thread owns 1 pair.
1217
+ //!
1218
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
1219
+ //! :language: c++
1220
+ //! :dedent:
1221
+ //! :start-after: example-begin pairs-descending-bits
1222
+ //! :end-before: example-end pairs-descending-bits
1223
+ //!
1224
+ //! @endrst
1225
+ //!
1226
+ //! @tparam DecomposerT
1227
+ //! **[inferred]** Type of a callable object responsible for decomposing a
1228
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
1229
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
1230
+ //! The leftmost element of the tuple is considered the most significant.
1231
+ //! The call operator must not modify members of the key.
1232
+ //!
1233
+ //! @param[in,out] keys
1234
+ //! Keys to sort
1235
+ //!
1236
+ //! @param[in,out] values
1237
+ //! Values to sort
1238
+ //!
1239
+ //! @param decomposer
1240
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
1241
+ //! references to its constituent arithmetic types. The leftmost element of
1242
+ //! the tuple is considered the most significant. The call operator must not
1243
+ //! modify members of the key.
1244
+ //!
1245
+ //! @param[in] begin_bit
1246
+ //! The least-significant bit index (inclusive) needed for
1247
+ //! key comparison
1248
+ //!
1249
+ //! @param[in] end_bit
1250
+ //! The most-significant bit index (exclusive) needed for key
1251
+ //! comparison (e.g., ``(sizeof(float) + sizeof(long long int)) * 8``)
1252
+ template <class DecomposerT>
1253
+ _CCCL_DEVICE _CCCL_FORCEINLINE //
1254
+ ::cuda::std::enable_if_t< //
1255
+ !::cuda::std::is_convertible_v<DecomposerT, int>> // overload participates only when DecomposerT is not convertible to int
1256
+ SortDescending(
1257
+ KeyT (&keys)[ItemsPerThread], ValueT (&values)[ItemsPerThread], DecomposerT decomposer, int begin_bit, int end_bit) // key-value descending sort over the caller-supplied bit range
1258
+ {
1259
+ SortBlocked(
1260
+ keys, values, begin_bit, end_bit, ::cuda::std::true_type(), detail::bool_constant_v<KEYS_ONLY>, decomposer); // true_type tag selects descending order; decomposer is forwarded unchanged
1261
+ }
1262
+
1263
+ //! @rst
1264
+ //! Performs a descending block-wide radix sort over a
1265
+ //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys and values.
1266
+ //!
1267
+ //! * BlockRadixSort can only accommodate one associated tile of values. To "truck along"
1268
+ //! more than one tile of values, simply perform a key-value sort of the keys paired
1269
+ //! with a temporary value array that enumerates the key indices. The reordered indices
1270
+ //! can then be used as a gather-vector for exchanging other associated tile data through
1271
+ //! shared memory.
1272
+ //! * @granularity
1273
+ //! * @smemreuse
1274
+ //!
1275
+ //! Snippet
1276
+ //! ==========================================================================
1277
+ //!
1278
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
1279
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
1280
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
1281
+ //! tuple of references to relevant members of the key.
1282
+ //!
1283
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
1284
+ //! :language: c++
1285
+ //! :dedent:
1286
+ //! :start-after: example-begin custom-type
1287
+ //! :end-before: example-end custom-type
1288
+ //!
1289
+ //! The code snippet below illustrates a sort of 6 keys and values that
1290
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 2 threads
1291
+ //! where each thread owns 3 consecutive pairs.
1292
+ //!
1293
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
1294
+ //! :language: c++
1295
+ //! :dedent:
1296
+ //! :start-after: example-begin pairs-descending
1297
+ //! :end-before: example-end pairs-descending
1298
+ //!
1299
+ //! @endrst
1300
+ //!
1301
+ //! @tparam DecomposerT
1302
+ //! **[inferred]** Type of a callable object responsible for decomposing a
1303
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
1304
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
1305
+ //! The leftmost element of the tuple is considered the most significant.
1306
+ //! The call operator must not modify members of the key.
1307
+ //!
1308
+ //! @param[in,out] keys
1309
+ //! Keys to sort
1310
+ //!
1311
+ //! @param[in,out] values
1312
+ //! Values to sort
1313
+ //!
1314
+ //! @param decomposer
1315
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
1316
+ //! references to its constituent arithmetic types. The leftmost element of
1317
+ //! the tuple is considered the most significant. The call operator must not
1318
+ //! modify members of the key.
1319
+ template <class DecomposerT>
1320
+ _CCCL_DEVICE _CCCL_FORCEINLINE //
1321
+ ::cuda::std::enable_if_t< //
1322
+ !::cuda::std::is_convertible_v<DecomposerT, int>> // overload is removed when DecomposerT converts to int (presumably so integer bit arguments select the begin_bit/end_bit overloads)
1323
+ SortDescending(KeyT (&keys)[ItemsPerThread], ValueT (&values)[ItemsPerThread], DecomposerT decomposer) // key-value descending sort over the default bit range
1324
+ {
1325
+ SortBlocked(
1326
+ keys,
1327
+ values,
1328
+ 0, // begin_bit: start from the least-significant bit
1329
+ detail::radix::traits_t<KeyT>::default_end_bit(decomposer), // end_bit: default obtained from the key traits for this decomposer
1330
+ ::cuda::std::true_type(), // tag selecting descending order (the ascending wrappers pass false_type)
1331
+ detail::bool_constant_v<KEYS_ONLY>, // class-level KEYS_ONLY flag forwarded as a compile-time tag
1332
+ decomposer);
1333
+ }
1334
+
1335
+ //! @} end member group
1336
+ //! @name Sorting (blocked arrangement -> striped arrangement)
1337
+ //! @{
1338
+
1339
+ //! @rst
1340
+ //! Performs an ascending radix sort across a :ref:`blocked arrangement <flexible-data-arrangement>` of keys,
1341
+ //! leaving them in a :ref:`striped arrangement <flexible-data-arrangement>`.
1342
+ //!
1343
+ //! - @granularity
1344
+ //! - @smemreuse
1345
+ //!
1346
+ //! Snippet
1347
+ //! +++++++
1348
+ //!
1349
+ //! The code snippet below illustrates a sort of 512 integer keys that
1350
+ //! are initially partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128
1351
+ //! threads where each thread owns 4 consecutive keys. The final partitioning is striped.
1352
+ //!
1353
+ //! .. code-block:: c++
1354
+ //!
1355
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_radix_sort.cuh>
1356
+ //!
1357
+ //! __global__ void ExampleKernel(...)
1358
+ //! {
1359
+ //! // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys each
1360
+ //! using BlockRadixSort = cub::BlockRadixSort<int, 128, 4>;
1361
+ //!
1362
+ //! // Allocate shared memory for BlockRadixSort
1363
+ //! __shared__ typename BlockRadixSort::TempStorage temp_storage;
1364
+ //!
1365
+ //! // Obtain a segment of consecutive items that are blocked across threads
1366
+ //! int thread_keys[4];
1367
+ //! ...
1368
+ //!
1369
+ //! // Collectively sort the keys
1370
+ //! BlockRadixSort(temp_storage).SortBlockedToStriped(thread_keys);
1371
+ //!
1372
+ //! Suppose the set of input ``thread_keys`` across the block of threads is
1373
+ //! ``{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }``.
1374
+ //! The corresponding output ``thread_keys`` in those threads will be
1375
+ //! ``{ [0,128,256,384], [1,129,257,385], [2,130,258,386], ..., [127,255,383,511] }``.
1376
+ //!
1377
+ //! @endrst
1378
+ //!
1379
+ //! @param[in,out] keys
1380
+ //! Keys to sort
1381
+ //!
1382
+ //! @param[in] begin_bit
1383
+ //! **[optional]** The beginning (least-significant) bit index needed for key comparison
1384
+ //!
1385
+ //! @param[in] end_bit
1386
+ //! **[optional]** The past-the-end (most-significant) bit index needed for key comparison
1387
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
1388
+ SortBlockedToStriped(KeyT (&keys)[ItemsPerThread], int begin_bit = 0, int end_bit = sizeof(KeyT) * 8) // keys-only ascending sort; defaults cover every bit of KeyT
1389
+ {
1390
+ NullType values[ItemsPerThread]; // keys-only: placeholder values array for the tag-taking overload
1391
+
1392
+ SortBlockedToStriped(
1393
+ keys, values, begin_bit, end_bit, ::cuda::std::false_type(), detail::bool_constant_v<KEYS_ONLY>); // false_type tag selects ascending order (the descending wrappers pass true_type)
1394
+ }
1395
+
1396
+ //! @rst
1397
+ //! Performs an ascending block-wide radix sort over a
1398
+ //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys, leaving them in a
1399
+ //! :ref:`striped arrangement <flexible-data-arrangement>`.
1400
+ //!
1401
+ //! * @granularity
1402
+ //! * @smemreuse
1403
+ //!
1404
+ //! Snippet
1405
+ //! ==========================================================================
1406
+ //!
1407
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
1408
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
1409
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
1410
+ //! tuple of references to relevant members of the key.
1411
+ //!
1412
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
1413
+ //! :language: c++
1414
+ //! :dedent:
1415
+ //! :start-after: example-begin custom-type
1416
+ //! :end-before: example-end custom-type
1417
+ //!
1418
+ //! The code snippet below illustrates a sort of 4 keys that
1419
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 2 threads
1420
+ //! where each thread owns 2 consecutive keys. The final partitioning is striped.
1421
+ //!
1422
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
1423
+ //! :language: c++
1424
+ //! :dedent:
1425
+ //! :start-after: example-begin keys-striped-bits
1426
+ //! :end-before: example-end keys-striped-bits
1427
+ //!
1428
+ //! @endrst
1429
+ //!
1430
+ //! @tparam DecomposerT
1431
+ //! **[inferred]** Type of a callable object responsible for decomposing a
1432
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
1433
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
1434
+ //! The leftmost element of the tuple is considered the most significant.
1435
+ //! The call operator must not modify members of the key.
1436
+ //!
1437
+ //! @param[in,out] keys
1438
+ //! Keys to sort
1439
+ //!
1440
+ //! @param decomposer
1441
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
1442
+ //! references to its constituent arithmetic types. The leftmost element of
1443
+ //! the tuple is considered the most significant. The call operator must not
1444
+ //! modify members of the key.
1445
+ //!
1446
+ //! @param[in] begin_bit
1447
+ //! The least-significant bit index (inclusive) needed for
1448
+ //! key comparison
1449
+ //!
1450
+ //! @param[in] end_bit
1451
+ //! The most-significant bit index (exclusive) needed for key
1452
+ //! comparison (e.g., ``(sizeof(float) + sizeof(long long int)) * 8``)
1453
+ template <class DecomposerT>
1454
+ _CCCL_DEVICE _CCCL_FORCEINLINE //
1455
+ ::cuda::std::enable_if_t< //
1456
+ !::cuda::std::is_convertible_v<DecomposerT, int>> // overload participates only when DecomposerT is not convertible to int
1457
+ SortBlockedToStriped(KeyT (&keys)[ItemsPerThread], DecomposerT decomposer, int begin_bit, int end_bit) // keys-only ascending sort over the caller-supplied bit range
1458
+ {
1459
+ NullType values[ItemsPerThread]; // keys-only: placeholder values array for the tag-taking overload
1460
+
1461
+ SortBlockedToStriped(
1462
+ keys, values, begin_bit, end_bit, ::cuda::std::false_type(), detail::bool_constant_v<KEYS_ONLY>, decomposer); // false_type tag selects ascending order; decomposer is forwarded unchanged
1463
+ }
1464
+
1465
+ //! @rst
1466
+ //! Performs an ascending block-wide radix sort over a
1467
+ //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys, leaving them in a
1468
+ //! :ref:`striped arrangement <flexible-data-arrangement>`.
1469
+ //!
1470
+ //! * @granularity
1471
+ //! * @smemreuse
1472
+ //!
1473
+ //! Snippet
1474
+ //! ==========================================================================
1475
+ //!
1476
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
1477
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
1478
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
1479
+ //! tuple of references to relevant members of the key.
1480
+ //!
1481
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
1482
+ //! :language: c++
1483
+ //! :dedent:
1484
+ //! :start-after: example-begin custom-type
1485
+ //! :end-before: example-end custom-type
1486
+ //!
1487
+ //! The code snippet below illustrates a sort of 6 keys that
1488
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 2 threads
1489
+ //! where each thread owns 3 consecutive keys. The final partitioning is striped.
1490
+ //!
1491
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
1492
+ //! :language: c++
1493
+ //! :dedent:
1494
+ //! :start-after: example-begin keys-striped
1495
+ //! :end-before: example-end keys-striped
1496
+ //!
1497
+ //! @endrst
1498
+ //!
1499
+ //! @tparam DecomposerT
1500
+ //! **[inferred]** Type of a callable object responsible for decomposing a
1501
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
1502
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
1503
+ //! The leftmost element of the tuple is considered the most significant.
1504
+ //! The call operator must not modify members of the key.
1505
+ //!
1506
+ //! @param[in,out] keys
1507
+ //! Keys to sort
1508
+ //!
1509
+ //! @param decomposer
1510
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
1511
+ //! references to its constituent arithmetic types. The leftmost element of
1512
+ //! the tuple is considered the most significant. The call operator must not
1513
+ //! modify members of the key.
1514
+ template <class DecomposerT>
1515
+ _CCCL_DEVICE _CCCL_FORCEINLINE //
1516
+ ::cuda::std::enable_if_t< //
1517
+ !::cuda::std::is_convertible_v<DecomposerT, int>> // overload is removed when DecomposerT converts to int (presumably so integer bit arguments select the begin_bit/end_bit overloads)
1518
+ SortBlockedToStriped(KeyT (&keys)[ItemsPerThread], DecomposerT decomposer) // keys-only ascending sort over the default bit range
1519
+ {
1520
+ NullType values[ItemsPerThread]; // keys-only: placeholder values array for the tag-taking overload
1521
+
1522
+ SortBlockedToStriped(
1523
+ keys,
1524
+ values,
1525
+ 0, // begin_bit: start from the least-significant bit
1526
+ detail::radix::traits_t<KeyT>::default_end_bit(decomposer), // end_bit: default obtained from the key traits for this decomposer
1527
+ ::cuda::std::false_type(), // tag selecting ascending order (the descending wrappers pass true_type)
1528
+ detail::bool_constant_v<KEYS_ONLY>, // class-level KEYS_ONLY flag forwarded as a compile-time tag
1529
+ decomposer);
1530
+ }
1531
+
1532
+ //! @rst
1533
+ //! Performs an ascending radix sort across a :ref:`blocked arrangement <flexible-data-arrangement>` of keys and
1534
+ //! values, leaving them in a :ref:`striped arrangement <flexible-data-arrangement>`.
1535
+ //!
1536
+ //! - BlockRadixSort can only accommodate one associated tile of values. To "truck along"
1537
+ //! more than one tile of values, simply perform a key-value sort of the keys paired
1538
+ //! with a temporary value array that enumerates the key indices. The reordered indices
1539
+ //! can then be used as a gather-vector for exchanging other associated tile data through
1540
+ //! shared memory.
1541
+ //! - @granularity
1542
+ //! - @smemreuse
1543
+ //!
1544
+ //! Snippet
1545
+ //! +++++++
1546
+ //!
1547
+ //! The code snippet below illustrates a sort of 512 integer keys and values that
1548
+ //! are initially partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128
1549
+ //! threads where each thread owns 4 consecutive pairs. The final partitioning is striped.
1550
+ //!
1551
+ //! .. code-block:: c++
1552
+ //!
1553
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_radix_sort.cuh>
1554
+ //!
1555
+ //! __global__ void ExampleKernel(...)
1556
+ //! {
1557
+ //! // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and values each
1558
+ //! using BlockRadixSort = cub::BlockRadixSort<int, 128, 4, int>;
1559
+ //!
1560
+ //! // Allocate shared memory for BlockRadixSort
1561
+ //! __shared__ typename BlockRadixSort::TempStorage temp_storage;
1562
+ //!
1563
+ //! // Obtain a segment of consecutive items that are blocked across threads
1564
+ //! int thread_keys[4];
1565
+ //! int thread_values[4];
1566
+ //! ...
1567
+ //!
1568
+ //! // Collectively sort the keys and values among block threads
1569
+ //! BlockRadixSort(temp_storage).SortBlockedToStriped(thread_keys, thread_values);
1570
+ //!
1571
+ //! Suppose the set of input ``thread_keys`` across the block of threads is
1572
+ //! ``{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }``.
1573
+ //! The corresponding output ``thread_keys`` in those threads will be
1574
+ //! ``{ [0,128,256,384], [1,129,257,385], [2,130,258,386], ..., [127,255,383,511] }``.
1575
+ //!
1576
+ //! @endrst
1577
+ //!
1578
+ //! @param[in,out] keys
1579
+ //! Keys to sort
1580
+ //!
1581
+ //! @param[in,out] values
1582
+ //! Values to sort
1583
+ //!
1584
+ //! @param[in] begin_bit
1585
+ //! **[optional]** The beginning (least-significant) bit index needed for key comparison
1586
+ //!
1587
+ //! @param[in] end_bit
1588
+ //! **[optional]** The past-the-end (most-significant) bit index needed for key comparison
1589
+ _CCCL_DEVICE _CCCL_FORCEINLINE void SortBlockedToStriped(
1590
+ KeyT (&keys)[ItemsPerThread], ValueT (&values)[ItemsPerThread], int begin_bit = 0, int end_bit = sizeof(KeyT) * 8) // defaults cover every bit of KeyT
1591
+ {
1592
+ SortBlockedToStriped(
1593
+ keys, values, begin_bit, end_bit, ::cuda::std::false_type(), detail::bool_constant_v<KEYS_ONLY>); // false_type tag selects ascending order (the descending wrappers pass true_type)
1594
+ }
1595
+
1596
+ //! @rst
1597
+ //! Performs an ascending block-wide radix sort over a
1598
+ //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys and values, leaving them in a
1599
+ //! :ref:`striped arrangement <flexible-data-arrangement>`.
1600
+ //!
1601
+ //! * @granularity
1602
+ //! * @smemreuse
1603
+ //!
1604
+ //! Snippet
1605
+ //! ==========================================================================
1606
+ //!
1607
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
1608
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
1609
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
1610
+ //! tuple of references to relevant members of the key.
1611
+ //!
1612
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
1613
+ //! :language: c++
1614
+ //! :dedent:
1615
+ //! :start-after: example-begin custom-type
1616
+ //! :end-before: example-end custom-type
1617
+ //!
1618
+ //! The code snippet below illustrates a sort of 4 pairs that
1619
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 2 threads
1620
+ //! where each thread owns 2 consecutive pairs. The final partitioning is striped.
1621
+ //!
1622
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
1623
+ //! :language: c++
1624
+ //! :dedent:
1625
+ //! :start-after: example-begin pairs-striped-bits
1626
+ //! :end-before: example-end pairs-striped-bits
1627
+ //!
1628
+ //! @endrst
1629
+ //!
1630
+ //! @tparam DecomposerT
1631
+ //! **[inferred]** Type of a callable object responsible for decomposing a
1632
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
1633
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
1634
+ //! The leftmost element of the tuple is considered the most significant.
1635
+ //! The call operator must not modify members of the key.
1636
+ //!
1637
+ //! @param[in,out] keys
1638
+ //! Keys to sort
1639
+ //!
1640
+ //! @param[in,out] values
1641
+ //! Values to sort
1642
+ //!
1643
+ //! @param decomposer
1644
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
1645
+ //! references to its constituent arithmetic types. The leftmost element of
1646
+ //! the tuple is considered the most significant. The call operator must not
1647
+ //! modify members of the key.
1648
+ //!
1649
+ //! @param[in] begin_bit
1650
+ //! The least-significant bit index (inclusive) needed for
1651
+ //! key comparison
1652
+ //!
1653
+ //! @param[in] end_bit
1654
+ //! The most-significant bit index (exclusive) needed for key
1655
+ //! comparison (e.g., ``(sizeof(float) + sizeof(long long int)) * 8``)
1656
+ template <class DecomposerT>
1657
+ _CCCL_DEVICE _CCCL_FORCEINLINE //
1658
+ ::cuda::std::enable_if_t< //
1659
+ !::cuda::std::is_convertible_v<DecomposerT, int>> // overload participates only when DecomposerT is not convertible to int
1660
+ SortBlockedToStriped(
1661
+ KeyT (&keys)[ItemsPerThread], ValueT (&values)[ItemsPerThread], DecomposerT decomposer, int begin_bit, int end_bit) // key-value ascending sort over the caller-supplied bit range
1662
+ {
1663
+ SortBlockedToStriped(
1664
+ keys, values, begin_bit, end_bit, ::cuda::std::false_type(), detail::bool_constant_v<KEYS_ONLY>, decomposer); // false_type tag selects ascending order; decomposer is forwarded unchanged
1665
+ }
1666
+
1667
+ //! @rst
1668
+ //! Performs an ascending block-wide radix sort over a
1669
+ //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys and values, leaving them in a
1670
+ //! :ref:`striped arrangement <flexible-data-arrangement>`.
1671
+ //!
1672
+ //! * @granularity
1673
+ //! * @smemreuse
1674
+ //!
1675
+ //! Snippet
1676
+ //! ==========================================================================
1677
+ //!
1678
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
1679
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
1680
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
1681
+ //! tuple of references to relevant members of the key.
1682
+ //!
1683
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
1684
+ //! :language: c++
1685
+ //! :dedent:
1686
+ //! :start-after: example-begin custom-type
1687
+ //! :end-before: example-end custom-type
1688
+ //!
1689
+ //! The code snippet below illustrates a sort of 6 pairs that
1690
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 2 threads
1691
+ //! where each thread owns 3 consecutive pairs. The final partitioning is striped.
1692
+ //!
1693
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
1694
+ //! :language: c++
1695
+ //! :dedent:
1696
+ //! :start-after: example-begin pairs-striped
1697
+ //! :end-before: example-end pairs-striped
1698
+ //!
1699
+ //! @endrst
1700
+ //!
1701
+ //! @tparam DecomposerT
1702
+ //! **[inferred]** Type of a callable object responsible for decomposing a
1703
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
1704
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
1705
+ //! The leftmost element of the tuple is considered the most significant.
1706
+ //! The call operator must not modify members of the key.
1707
+ //!
1708
+ //! @param[in,out] keys
1709
+ //! Keys to sort
1710
+ //!
1711
+ //! @param[in,out] values
1712
+ //! Values to sort
1713
+ //!
1714
+ //! @param decomposer
1715
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
1716
+ //! references to its constituent arithmetic types. The leftmost element of
1717
+ //! the tuple is considered the most significant. The call operator must not
1718
+ //! modify members of the key.
1719
+ template <class DecomposerT>
1720
+ _CCCL_DEVICE _CCCL_FORCEINLINE //
1721
+ ::cuda::std::enable_if_t< //
1722
+ !::cuda::std::is_convertible_v<DecomposerT, int>> // overload is removed when DecomposerT converts to int (presumably so integer bit arguments select the begin_bit/end_bit overloads)
1723
+ SortBlockedToStriped(KeyT (&keys)[ItemsPerThread], ValueT (&values)[ItemsPerThread], DecomposerT decomposer) // key-value ascending sort over the default bit range
1724
+ {
1725
+ SortBlockedToStriped(
1726
+ keys,
1727
+ values,
1728
+ 0, // begin_bit: start from the least-significant bit
1729
+ detail::radix::traits_t<KeyT>::default_end_bit(decomposer), // end_bit: default obtained from the key traits for this decomposer
1730
+ ::cuda::std::false_type(), // tag selecting ascending order (the descending wrappers pass true_type)
1731
+ detail::bool_constant_v<KEYS_ONLY>, // class-level KEYS_ONLY flag forwarded as a compile-time tag
1732
+ decomposer);
1733
+ }
1734
+
1735
+ //! @rst
1736
+ //! Performs a descending radix sort across a :ref:`blocked arrangement <flexible-data-arrangement>`
1737
+ //! of keys, leaving them in a :ref:`striped arrangement <flexible-data-arrangement>`.
1738
+ //!
1739
+ //! - @granularity
1740
+ //! - @smemreuse
1741
+ //!
1742
+ //! Snippet
1743
+ //! +++++++
1744
+ //!
1745
+ //! The code snippet below illustrates a sort of 512 integer keys that
1746
+ //! are initially partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128
1747
+ //! threads where each thread owns 4 consecutive keys. The final partitioning is striped.
1748
+ //!
1749
+ //! .. code-block:: c++
1750
+ //!
1751
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_radix_sort.cuh>
1752
+ //!
1753
+ //! __global__ void ExampleKernel(...)
1754
+ //! {
1755
+ //! // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys each
1756
+ //! using BlockRadixSort = cub::BlockRadixSort<int, 128, 4>;
1757
+ //!
1758
+ //! // Allocate shared memory for BlockRadixSort
1759
+ //! __shared__ typename BlockRadixSort::TempStorage temp_storage;
1760
+ //!
1761
+ //! // Obtain a segment of consecutive items that are blocked across threads
1762
+ //! int thread_keys[4];
1763
+ //! ...
1764
+ //!
1765
+ //! // Collectively sort the keys
1766
+ //! BlockRadixSort(temp_storage).SortDescendingBlockedToStriped(thread_keys);
1767
+ //!
1768
+ //! Suppose the set of input ``thread_keys`` across the block of threads is
1769
+ //! ``{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }``.
1770
+ //! The corresponding output ``thread_keys`` in those threads will be
1771
+ //! ``{ [511,383,255,127], [510,382,254,126], [509,381,253,125], ..., [384,256,128,0] }``.
1772
+ //!
1773
+ //! @endrst
1774
+ //!
1775
+ //! @param[in,out] keys
1776
+ //! Keys to sort
1777
+ //!
1778
+ //! @param[in] begin_bit
1779
+ //! **[optional]** The beginning (least-significant) bit index needed for key comparison
1780
+ //!
1781
+ //! @param[in] end_bit
1782
+ //! **[optional]** The past-the-end (most-significant) bit index needed for key comparison
1783
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
1784
+ SortDescendingBlockedToStriped(KeyT (&keys)[ItemsPerThread], int begin_bit = 0, int end_bit = sizeof(KeyT) * 8) // keys-only descending sort; defaults cover every bit of KeyT
1785
+ {
1786
+ NullType values[ItemsPerThread]; // keys-only: placeholder values array for the tag-taking overload
1787
+
1788
+ SortBlockedToStriped(keys, values, begin_bit, end_bit, ::cuda::std::true_type(), detail::bool_constant_v<KEYS_ONLY>); // true_type tag selects descending order (the ascending wrappers pass false_type)
1789
+ }
1790
+
1791
+ //! @rst
1792
+ //! Performs a descending block-wide radix sort over a
1793
+ //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys, leaving them in a
1794
+ //! :ref:`striped arrangement <flexible-data-arrangement>`.
1795
+ //!
1796
+ //! * @granularity
1797
+ //! * @smemreuse
1798
+ //!
1799
+ //! Snippet
1800
+ //! ==========================================================================
1801
+ //!
1802
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
1803
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
1804
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
1805
+ //! tuple of references to relevant members of the key.
1806
+ //!
1807
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
1808
+ //! :language: c++
1809
+ //! :dedent:
1810
+ //! :start-after: example-begin custom-type
1811
+ //! :end-before: example-end custom-type
1812
+ //!
1813
+ //! The code snippet below illustrates a sort of 4 keys that
1814
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 2 threads
1815
+ //! where each thread owns 2 consecutive keys. The final partitioning is striped.
1816
+ //!
1817
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
1818
+ //! :language: c++
1819
+ //! :dedent:
1820
+ //! :start-after: example-begin keys-striped-descending-bits
1821
+ //! :end-before: example-end keys-striped-descending-bits
1822
+ //!
1823
+ //! @endrst
1824
+ //!
1825
+ //! @tparam DecomposerT
1826
+ //! **[inferred]** Type of a callable object responsible for decomposing a
1827
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
1828
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
1829
+ //! The leftmost element of the tuple is considered the most significant.
1830
+ //! The call operator must not modify members of the key.
1831
+ //!
1832
+ //! @param[in,out] keys
1833
+ //! Keys to sort
1834
+ //!
1835
+ //! @param decomposer
1836
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
1837
+ //! references to its constituent arithmetic types. The leftmost element of
1838
+ //! the tuple is considered the most significant. The call operator must not
1839
+ //! modify members of the key.
1840
+ //!
1841
+ //! @param[in] begin_bit
1842
+ //! The least-significant bit index (inclusive) needed for
1843
+ //! key comparison
1844
+ //!
1845
+ //! @param[in] end_bit
1846
+ //! The most-significant bit index (exclusive) needed for key
1847
+ //! comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`)
1848
+ template <class DecomposerT> // keys-only overload taking a decomposer plus an explicit [begin_bit, end_bit) range
1849
+ _CCCL_DEVICE _CCCL_FORCEINLINE //
1850
+ ::cuda::std::enable_if_t< // yields a void return type when the condition below holds; otherwise the overload is removed
1851
+ !::cuda::std::is_convertible_v<DecomposerT, int>> // participates only when DecomposerT is not convertible to int
1852
+ SortDescendingBlockedToStriped(KeyT (&keys)[ItemsPerThread], DecomposerT decomposer, int begin_bit, int end_bit)
1853
+ {
1854
+ NullType values[ItemsPerThread]; // no values in this overload: placeholder array passed to the shared implementation
1855
+
1856
+ SortBlockedToStriped( // all SortDescendingBlockedToStriped overloads forward to this routine
1857
+ keys, values, begin_bit, end_bit, ::cuda::std::true_type(), detail::bool_constant_v<KEYS_ONLY>, decomposer); // NOTE(review): true_type presumably selects descending order -- confirm against SortBlockedToStriped
1858
+ }
1859
+
1860
+ //! @rst
1861
+ //! Performs a descending block-wide radix sort over a
1862
+ //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys, leaving them in a
1863
+ //! :ref:`striped arrangement <flexible-data-arrangement>`.
1864
+ //!
1865
+ //! * @granularity
1866
+ //! * @smemreuse
1867
+ //!
1868
+ //! Snippet
1869
+ //! ==========================================================================
1870
+ //!
1871
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
1872
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
1873
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
1874
+ //! tuple of references to relevant members of the key.
1875
+ //!
1876
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
1877
+ //! :language: c++
1878
+ //! :dedent:
1879
+ //! :start-after: example-begin custom-type
1880
+ //! :end-before: example-end custom-type
1881
+ //!
1882
+ //! The code snippet below illustrates a sort of 6 keys that
1883
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 2 threads
1884
+ //! where each thread owns 3 consecutive keys. The final partitioning is striped.
1885
+ //!
1886
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
1887
+ //! :language: c++
1888
+ //! :dedent:
1889
+ //! :start-after: example-begin keys-striped-descending
1890
+ //! :end-before: example-end keys-striped-descending
1891
+ //!
1892
+ //! @endrst
1893
+ //!
1894
+ //! @tparam DecomposerT
1895
+ //! **[inferred]** Type of a callable object responsible for decomposing a
1896
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
1897
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
1898
+ //! The leftmost element of the tuple is considered the most significant.
1899
+ //! The call operator must not modify members of the key.
1900
+ //!
1901
+ //! @param[in,out] keys
1902
+ //! Keys to sort
1903
+ //!
1904
+ //! @param decomposer
1905
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
1906
+ //! references to its constituent arithmetic types. The leftmost element of
1907
+ //! the tuple is considered the most significant. The call operator must not
1908
+ //! modify members of the key.
1909
+ template <class DecomposerT> // keys-only overload taking a decomposer; the bit range is chosen internally
1910
+ _CCCL_DEVICE _CCCL_FORCEINLINE //
1911
+ ::cuda::std::enable_if_t< // yields a void return type when the condition below holds; otherwise the overload is removed
1912
+ !::cuda::std::is_convertible_v<DecomposerT, int>> // participates only when DecomposerT is not convertible to int
1913
+ SortDescendingBlockedToStriped(KeyT (&keys)[ItemsPerThread], DecomposerT decomposer)
1914
+ {
1915
+ NullType values[ItemsPerThread]; // no values in this overload: placeholder array passed to the shared implementation
1916
+
1917
+ SortBlockedToStriped( // all SortDescendingBlockedToStriped overloads forward to this routine
1918
+ keys,
1919
+ values,
1920
+ 0, // begin bit: start from bit 0
1921
+ detail::radix::traits_t<KeyT>::default_end_bit(decomposer), // end bit obtained from the key traits via the decomposer
1922
+ ::cuda::std::true_type(), // NOTE(review): presumably selects descending order -- confirm against SortBlockedToStriped
1923
+ detail::bool_constant_v<KEYS_ONLY>,
1924
+ decomposer);
1925
+ }
1926
+
1927
+ //! @rst
1928
+ //! Performs a descending radix sort across a :ref:`blocked arrangement <flexible-data-arrangement>`
1929
+ //! of keys and values, leaving them in a :ref:`striped arrangement <flexible-data-arrangement>`.
1930
+ //!
1931
+ //! - BlockRadixSort can only accommodate one associated tile of values. To "truck along"
1932
+ //! more than one tile of values, simply perform a key-value sort of the keys paired
1933
+ //! with a temporary value array that enumerates the key indices. The reordered indices
1934
+ //! can then be used as a gather-vector for exchanging other associated tile data through
1935
+ //! shared memory.
1936
+ //! - @granularity
1937
+ //! - @smemreuse
1938
+ //!
1939
+ //! Snippet
1940
+ //! +++++++
1941
+ //!
1942
+ //! The code snippet below illustrates a sort of 512 integer keys and values that
1943
+ //! are initially partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128
1944
+ //! threads where each thread owns 4 consecutive pairs. The final partitioning is striped.
1945
+ //!
1946
+ //! .. code-block:: c++
1947
+ //!
1948
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_radix_sort.cuh>
1949
+ //!
1950
+ //! __global__ void ExampleKernel(...)
1951
+ //! {
1952
+ //! // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and values each
1953
+ //! using BlockRadixSort = cub::BlockRadixSort<int, 128, 4, int>;
1954
+ //!
1955
+ //! // Allocate shared memory for BlockRadixSort
1956
+ //! __shared__ typename BlockRadixSort::TempStorage temp_storage;
1957
+ //!
1958
+ //! // Obtain a segment of consecutive items that are blocked across threads
1959
+ //! int thread_keys[4];
1960
+ //! int thread_values[4];
1961
+ //! ...
1962
+ //!
1963
+ //! // Collectively sort the keys and values among block threads
1964
+ //! BlockRadixSort(temp_storage).SortDescendingBlockedToStriped(thread_keys, thread_values);
1965
+ //!
1966
+ //! Suppose the set of input ``thread_keys`` across the block of threads is
1967
+ //! ``{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }``.
1968
+ //! The corresponding output ``thread_keys`` in those threads will be
1969
+ //! ``{ [511,383,255,127], [510,382,254,126], [509,381,253,125], ..., [384,256,128,0] }``.
1970
+ //!
1971
+ //! @endrst
1972
+ //!
1973
+ //! @param[in,out] keys
1974
+ //! Keys to sort
1975
+ //!
1976
+ //! @param[in,out] values
1977
+ //! Values to sort
1978
+ //!
1979
+ //! @param[in] begin_bit
1980
+ //! **[optional]** The beginning (least-significant) bit index needed for key comparison
1981
+ //!
1982
+ //! @param[in] end_bit
1983
+ //! **[optional]** The past-the-end (most-significant) bit index needed for key comparison
1984
+ _CCCL_DEVICE _CCCL_FORCEINLINE void SortDescendingBlockedToStriped( // key-value overload without a decomposer
1985
+ KeyT (&keys)[ItemsPerThread], ValueT (&values)[ItemsPerThread], int begin_bit = 0, int end_bit = sizeof(KeyT) * 8) // defaults span bits [0, sizeof(KeyT) * 8), i.e. every bit of KeyT
1986
+ {
1987
+ SortBlockedToStriped(keys, values, begin_bit, end_bit, ::cuda::std::true_type(), detail::bool_constant_v<KEYS_ONLY>); // NOTE(review): true_type presumably selects descending order -- confirm against SortBlockedToStriped
1988
+ }
1989
+
1990
+ //! @rst
1991
+ //! Performs a descending block-wide radix sort over a
1992
+ //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys and values, leaving them in a
1993
+ //! :ref:`striped arrangement <flexible-data-arrangement>`.
1994
+ //!
1995
+ //! * @granularity
1996
+ //! * @smemreuse
1997
+ //!
1998
+ //! Snippet
1999
+ //! ==========================================================================
2000
+ //!
2001
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
2002
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
2003
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
2004
+ //! tuple of references to relevant members of the key.
2005
+ //!
2006
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
2007
+ //! :language: c++
2008
+ //! :dedent:
2009
+ //! :start-after: example-begin custom-type
2010
+ //! :end-before: example-end custom-type
2011
+ //!
2012
+ //! The code snippet below illustrates a sort of 4 keys and values that
2013
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 2 threads
2014
+ //! where each thread owns 2 consecutive pairs. The final partitioning is striped.
2015
+ //!
2016
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
2017
+ //! :language: c++
2018
+ //! :dedent:
2019
+ //! :start-after: example-begin pairs-striped-descending-bits
2020
+ //! :end-before: example-end pairs-striped-descending-bits
2021
+ //!
2022
+ //! @endrst
2023
+ //!
2024
+ //! @tparam DecomposerT
2025
+ //! **[inferred]** Type of a callable object responsible for decomposing a
2026
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
2027
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
2028
+ //! The leftmost element of the tuple is considered the most significant.
2029
+ //! The call operator must not modify members of the key.
2030
+ //!
2031
+ //! @param[in,out] keys
2032
+ //! Keys to sort
2033
+ //!
2034
+ //! @param[in,out] values
2035
+ //! Values to sort
2036
+ //!
2037
+ //! @param decomposer
2038
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
2039
+ //! references to its constituent arithmetic types. The leftmost element of
2040
+ //! the tuple is considered the most significant. The call operator must not
2041
+ //! modify members of the key.
2042
+ //!
2043
+ //! @param[in] begin_bit
2044
+ //! The least-significant bit index (inclusive) needed for
2045
+ //! key comparison
2046
+ //!
2047
+ //! @param[in] end_bit
2048
+ //! The most-significant bit index (exclusive) needed for key
2049
+ //! comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`)
2050
+ template <class DecomposerT> // key-value overload taking a decomposer plus an explicit [begin_bit, end_bit) range
2051
+ _CCCL_DEVICE _CCCL_FORCEINLINE //
2052
+ ::cuda::std::enable_if_t< // yields a void return type when the condition below holds; otherwise the overload is removed
2053
+ !::cuda::std::is_convertible_v<DecomposerT, int>> // participates only when DecomposerT is not convertible to int, so integer bit arguments pick the (keys, values, begin_bit, end_bit) overload
2054
+ SortDescendingBlockedToStriped(
2055
+ KeyT (&keys)[ItemsPerThread], ValueT (&values)[ItemsPerThread], DecomposerT decomposer, int begin_bit, int end_bit)
2056
+ {
2057
+ SortBlockedToStriped( // all SortDescendingBlockedToStriped overloads forward to this routine
2058
+ keys, values, begin_bit, end_bit, ::cuda::std::true_type(), detail::bool_constant_v<KEYS_ONLY>, decomposer); // NOTE(review): true_type presumably selects descending order -- confirm against SortBlockedToStriped
2059
+ }
2060
+
2061
+ //! @rst
2062
+ //! Performs a descending block-wide radix sort over a
2063
+ //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys and values, leaving them in a
2064
+ //! :ref:`striped arrangement <flexible-data-arrangement>`.
2065
+ //!
2066
+ //! * @granularity
2067
+ //! * @smemreuse
2068
+ //!
2069
+ //! Snippet
2070
+ //! ==========================================================================
2071
+ //!
2072
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
2073
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
2074
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
2075
+ //! tuple of references to relevant members of the key.
2076
+ //!
2077
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
2078
+ //! :language: c++
2079
+ //! :dedent:
2080
+ //! :start-after: example-begin custom-type
2081
+ //! :end-before: example-end custom-type
2082
+ //!
2083
+ //! The code snippet below illustrates a sort of 6 keys and values that
2084
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 2 threads
2085
+ //! where each thread owns 3 consecutive pairs. The final partitioning is striped.
2086
+ //!
2087
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
2088
+ //! :language: c++
2089
+ //! :dedent:
2090
+ //! :start-after: example-begin pairs-striped-descending
2091
+ //! :end-before: example-end pairs-striped-descending
2092
+ //!
2093
+ //! @endrst
2094
+ //!
2095
+ //! @tparam DecomposerT
2096
+ //! **[inferred]** Type of a callable object responsible for decomposing a
2097
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
2098
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
2099
+ //! The leftmost element of the tuple is considered the most significant.
2100
+ //! The call operator must not modify members of the key.
2101
+ //!
2102
+ //! @param[in,out] keys
2103
+ //! Keys to sort
2104
+ //!
2105
+ //! @param[in,out] values
2106
+ //! Values to sort
2107
+ //!
2108
+ //! @param decomposer
2109
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
2110
+ //! references to its constituent arithmetic types. The leftmost element of
2111
+ //! the tuple is considered the most significant. The call operator must not
2112
+ //! modify members of the key.
2113
+ template <class DecomposerT> // key-value overload taking a decomposer; the bit range is chosen internally
2114
+ _CCCL_DEVICE _CCCL_FORCEINLINE //
2115
+ ::cuda::std::enable_if_t< // yields a void return type when the condition below holds; otherwise the overload is removed
2116
+ !::cuda::std::is_convertible_v<DecomposerT, int>> // participates only when DecomposerT is not convertible to int, so integer bit arguments pick the (keys, values, begin_bit, end_bit) overload
2117
+ SortDescendingBlockedToStriped(KeyT (&keys)[ItemsPerThread], ValueT (&values)[ItemsPerThread], DecomposerT decomposer)
2118
+ {
2119
+ SortBlockedToStriped( // all SortDescendingBlockedToStriped overloads forward to this routine
2120
+ keys,
2121
+ values,
2122
+ 0, // begin bit: start from bit 0
2123
+ detail::radix::traits_t<KeyT>::default_end_bit(decomposer), // end bit obtained from the key traits via the decomposer
2124
+ ::cuda::std::true_type(), // NOTE(review): presumably selects descending order -- confirm against SortBlockedToStriped
2125
+ detail::bool_constant_v<KEYS_ONLY>,
2126
+ decomposer);
2127
+ }
2128
+
2129
+ //@} end member group
2130
+ };
2131
+
2132
+ CUB_NAMESPACE_END