cuda-cccl 0.1.3.2.0.dev438__cp312-cp312-manylinux_2_24_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cuda-cccl might be problematic. Click here for more details.

Files changed (1962):
  1. cuda/cccl/__init__.py +27 -0
  2. cuda/cccl/_cuda_version_utils.py +24 -0
  3. cuda/cccl/cooperative/__init__.py +3 -0
  4. cuda/cccl/cooperative/experimental/__init__.py +8 -0
  5. cuda/cccl/cooperative/experimental/_caching.py +48 -0
  6. cuda/cccl/cooperative/experimental/_common.py +275 -0
  7. cuda/cccl/cooperative/experimental/_nvrtc.py +91 -0
  8. cuda/cccl/cooperative/experimental/_scan_op.py +181 -0
  9. cuda/cccl/cooperative/experimental/_types.py +937 -0
  10. cuda/cccl/cooperative/experimental/_typing.py +107 -0
  11. cuda/cccl/cooperative/experimental/block/__init__.py +39 -0
  12. cuda/cccl/cooperative/experimental/block/_block_exchange.py +251 -0
  13. cuda/cccl/cooperative/experimental/block/_block_load_store.py +215 -0
  14. cuda/cccl/cooperative/experimental/block/_block_merge_sort.py +125 -0
  15. cuda/cccl/cooperative/experimental/block/_block_radix_sort.py +214 -0
  16. cuda/cccl/cooperative/experimental/block/_block_reduce.py +294 -0
  17. cuda/cccl/cooperative/experimental/block/_block_scan.py +983 -0
  18. cuda/cccl/cooperative/experimental/warp/__init__.py +9 -0
  19. cuda/cccl/cooperative/experimental/warp/_warp_merge_sort.py +92 -0
  20. cuda/cccl/cooperative/experimental/warp/_warp_reduce.py +153 -0
  21. cuda/cccl/cooperative/experimental/warp/_warp_scan.py +78 -0
  22. cuda/cccl/headers/__init__.py +7 -0
  23. cuda/cccl/headers/include/__init__.py +1 -0
  24. cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +262 -0
  25. cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +1185 -0
  26. cuda/cccl/headers/include/cub/agent/agent_for.cuh +84 -0
  27. cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +927 -0
  28. cuda/cccl/headers/include/cub/agent/agent_merge.cuh +232 -0
  29. cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +730 -0
  30. cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +766 -0
  31. cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +289 -0
  32. cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +706 -0
  33. cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +558 -0
  34. cuda/cccl/headers/include/cub/agent/agent_reduce.cuh +619 -0
  35. cuda/cccl/headers/include/cub/agent/agent_reduce_by_key.cuh +806 -0
  36. cuda/cccl/headers/include/cub/agent/agent_rle.cuh +1127 -0
  37. cuda/cccl/headers/include/cub/agent/agent_scan.cuh +585 -0
  38. cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +477 -0
  39. cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +292 -0
  40. cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +1120 -0
  41. cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +341 -0
  42. cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +609 -0
  43. cuda/cccl/headers/include/cub/agent/agent_topk.cuh +764 -0
  44. cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +614 -0
  45. cuda/cccl/headers/include/cub/agent/single_pass_scan_operators.cuh +1424 -0
  46. cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +965 -0
  47. cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +1217 -0
  48. cuda/cccl/headers/include/cub/block/block_exchange.cuh +1308 -0
  49. cuda/cccl/headers/include/cub/block/block_histogram.cuh +420 -0
  50. cuda/cccl/headers/include/cub/block/block_load.cuh +1260 -0
  51. cuda/cccl/headers/include/cub/block/block_merge_sort.cuh +800 -0
  52. cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +1220 -0
  53. cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +2194 -0
  54. cuda/cccl/headers/include/cub/block/block_raking_layout.cuh +150 -0
  55. cuda/cccl/headers/include/cub/block/block_reduce.cuh +666 -0
  56. cuda/cccl/headers/include/cub/block/block_run_length_decode.cuh +434 -0
  57. cuda/cccl/headers/include/cub/block/block_scan.cuh +2584 -0
  58. cuda/cccl/headers/include/cub/block/block_shuffle.cuh +346 -0
  59. cuda/cccl/headers/include/cub/block/block_store.cuh +1246 -0
  60. cuda/cccl/headers/include/cub/block/radix_rank_sort_operations.cuh +624 -0
  61. cuda/cccl/headers/include/cub/block/specializations/block_histogram_atomic.cuh +86 -0
  62. cuda/cccl/headers/include/cub/block/specializations/block_histogram_sort.cuh +240 -0
  63. cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking.cuh +252 -0
  64. cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking_commutative_only.cuh +238 -0
  65. cuda/cccl/headers/include/cub/block/specializations/block_reduce_warp_reductions.cuh +281 -0
  66. cuda/cccl/headers/include/cub/block/specializations/block_scan_raking.cuh +790 -0
  67. cuda/cccl/headers/include/cub/block/specializations/block_scan_warp_scans.cuh +538 -0
  68. cuda/cccl/headers/include/cub/config.cuh +53 -0
  69. cuda/cccl/headers/include/cub/cub.cuh +120 -0
  70. cuda/cccl/headers/include/cub/detail/array_utils.cuh +78 -0
  71. cuda/cccl/headers/include/cub/detail/choose_offset.cuh +161 -0
  72. cuda/cccl/headers/include/cub/detail/detect_cuda_runtime.cuh +74 -0
  73. cuda/cccl/headers/include/cub/detail/device_double_buffer.cuh +96 -0
  74. cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +61 -0
  75. cuda/cccl/headers/include/cub/detail/fast_modulo_division.cuh +253 -0
  76. cuda/cccl/headers/include/cub/detail/integer_utils.cuh +88 -0
  77. cuda/cccl/headers/include/cub/detail/launcher/cuda_driver.cuh +142 -0
  78. cuda/cccl/headers/include/cub/detail/launcher/cuda_runtime.cuh +100 -0
  79. cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +118 -0
  80. cuda/cccl/headers/include/cub/detail/ptx-json/README.md +71 -0
  81. cuda/cccl/headers/include/cub/detail/ptx-json/array.h +68 -0
  82. cuda/cccl/headers/include/cub/detail/ptx-json/json.h +62 -0
  83. cuda/cccl/headers/include/cub/detail/ptx-json/object.h +100 -0
  84. cuda/cccl/headers/include/cub/detail/ptx-json/string.h +53 -0
  85. cuda/cccl/headers/include/cub/detail/ptx-json/value.h +95 -0
  86. cuda/cccl/headers/include/cub/detail/ptx-json-parser.h +63 -0
  87. cuda/cccl/headers/include/cub/detail/rfa.cuh +731 -0
  88. cuda/cccl/headers/include/cub/detail/strong_load.cuh +189 -0
  89. cuda/cccl/headers/include/cub/detail/strong_store.cuh +220 -0
  90. cuda/cccl/headers/include/cub/detail/temporary_storage.cuh +384 -0
  91. cuda/cccl/headers/include/cub/detail/type_traits.cuh +187 -0
  92. cuda/cccl/headers/include/cub/detail/uninitialized_copy.cuh +73 -0
  93. cuda/cccl/headers/include/cub/detail/unsafe_bitcast.cuh +56 -0
  94. cuda/cccl/headers/include/cub/device/device_adjacent_difference.cuh +596 -0
  95. cuda/cccl/headers/include/cub/device/device_copy.cuh +187 -0
  96. cuda/cccl/headers/include/cub/device/device_for.cuh +985 -0
  97. cuda/cccl/headers/include/cub/device/device_histogram.cuh +1509 -0
  98. cuda/cccl/headers/include/cub/device/device_memcpy.cuh +195 -0
  99. cuda/cccl/headers/include/cub/device/device_merge.cuh +203 -0
  100. cuda/cccl/headers/include/cub/device/device_merge_sort.cuh +979 -0
  101. cuda/cccl/headers/include/cub/device/device_partition.cuh +664 -0
  102. cuda/cccl/headers/include/cub/device/device_radix_sort.cuh +3437 -0
  103. cuda/cccl/headers/include/cub/device/device_reduce.cuh +2519 -0
  104. cuda/cccl/headers/include/cub/device/device_run_length_encode.cuh +370 -0
  105. cuda/cccl/headers/include/cub/device/device_scan.cuh +2205 -0
  106. cuda/cccl/headers/include/cub/device/device_segmented_radix_sort.cuh +1496 -0
  107. cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +1520 -0
  108. cuda/cccl/headers/include/cub/device/device_segmented_sort.cuh +2811 -0
  109. cuda/cccl/headers/include/cub/device/device_select.cuh +1228 -0
  110. cuda/cccl/headers/include/cub/device/device_topk.cuh +511 -0
  111. cuda/cccl/headers/include/cub/device/device_transform.cuh +637 -0
  112. cuda/cccl/headers/include/cub/device/dispatch/dispatch_adjacent_difference.cuh +315 -0
  113. cuda/cccl/headers/include/cub/device/dispatch/dispatch_advance_iterators.cuh +111 -0
  114. cuda/cccl/headers/include/cub/device/dispatch/dispatch_batch_memcpy.cuh +719 -0
  115. cuda/cccl/headers/include/cub/device/dispatch/dispatch_common.cuh +43 -0
  116. cuda/cccl/headers/include/cub/device/dispatch/dispatch_for.cuh +198 -0
  117. cuda/cccl/headers/include/cub/device/dispatch/dispatch_histogram.cuh +1046 -0
  118. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +304 -0
  119. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge_sort.cuh +474 -0
  120. cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +1753 -0
  121. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +1327 -0
  122. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_by_key.cuh +655 -0
  123. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +536 -0
  124. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +314 -0
  125. cuda/cccl/headers/include/cub/device/dispatch/dispatch_rle.cuh +615 -0
  126. cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan.cuh +500 -0
  127. cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan_by_key.cuh +602 -0
  128. cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +917 -0
  129. cuda/cccl/headers/include/cub/device/dispatch/dispatch_select_if.cuh +842 -0
  130. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +342 -0
  131. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +441 -0
  132. cuda/cccl/headers/include/cub/device/dispatch/dispatch_three_way_partition.cuh +389 -0
  133. cuda/cccl/headers/include/cub/device/dispatch/dispatch_topk.cuh +629 -0
  134. cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +561 -0
  135. cuda/cccl/headers/include/cub/device/dispatch/dispatch_unique_by_key.cuh +545 -0
  136. cuda/cccl/headers/include/cub/device/dispatch/kernels/for_each.cuh +226 -0
  137. cuda/cccl/headers/include/cub/device/dispatch/kernels/histogram.cuh +505 -0
  138. cuda/cccl/headers/include/cub/device/dispatch/kernels/merge_sort.cuh +334 -0
  139. cuda/cccl/headers/include/cub/device/dispatch/kernels/radix_sort.cuh +803 -0
  140. cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +578 -0
  141. cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +192 -0
  142. cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +324 -0
  143. cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_sort.cuh +475 -0
  144. cuda/cccl/headers/include/cub/device/dispatch/kernels/three_way_partition.cuh +201 -0
  145. cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +1009 -0
  146. cuda/cccl/headers/include/cub/device/dispatch/kernels/unique_by_key.cuh +176 -0
  147. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +70 -0
  148. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +121 -0
  149. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +63 -0
  150. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +278 -0
  151. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +79 -0
  152. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +118 -0
  153. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +1068 -0
  154. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce.cuh +493 -0
  155. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +945 -0
  156. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +676 -0
  157. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +621 -0
  158. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +1013 -0
  159. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +249 -0
  160. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_select_if.cuh +1588 -0
  161. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +443 -0
  162. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_topk.cuh +85 -0
  163. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +454 -0
  164. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +874 -0
  165. cuda/cccl/headers/include/cub/grid/grid_even_share.cuh +227 -0
  166. cuda/cccl/headers/include/cub/grid/grid_mapping.cuh +106 -0
  167. cuda/cccl/headers/include/cub/grid/grid_queue.cuh +202 -0
  168. cuda/cccl/headers/include/cub/iterator/arg_index_input_iterator.cuh +254 -0
  169. cuda/cccl/headers/include/cub/iterator/cache_modified_input_iterator.cuh +259 -0
  170. cuda/cccl/headers/include/cub/iterator/cache_modified_output_iterator.cuh +250 -0
  171. cuda/cccl/headers/include/cub/iterator/tex_obj_input_iterator.cuh +320 -0
  172. cuda/cccl/headers/include/cub/thread/thread_load.cuh +349 -0
  173. cuda/cccl/headers/include/cub/thread/thread_operators.cuh +688 -0
  174. cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +541 -0
  175. cuda/cccl/headers/include/cub/thread/thread_scan.cuh +498 -0
  176. cuda/cccl/headers/include/cub/thread/thread_search.cuh +199 -0
  177. cuda/cccl/headers/include/cub/thread/thread_simd.cuh +458 -0
  178. cuda/cccl/headers/include/cub/thread/thread_sort.cuh +102 -0
  179. cuda/cccl/headers/include/cub/thread/thread_store.cuh +365 -0
  180. cuda/cccl/headers/include/cub/util_allocator.cuh +921 -0
  181. cuda/cccl/headers/include/cub/util_arch.cuh +167 -0
  182. cuda/cccl/headers/include/cub/util_cpp_dialect.cuh +95 -0
  183. cuda/cccl/headers/include/cub/util_debug.cuh +207 -0
  184. cuda/cccl/headers/include/cub/util_device.cuh +784 -0
  185. cuda/cccl/headers/include/cub/util_macro.cuh +97 -0
  186. cuda/cccl/headers/include/cub/util_math.cuh +118 -0
  187. cuda/cccl/headers/include/cub/util_namespace.cuh +176 -0
  188. cuda/cccl/headers/include/cub/util_policy_wrapper_t.cuh +55 -0
  189. cuda/cccl/headers/include/cub/util_ptx.cuh +513 -0
  190. cuda/cccl/headers/include/cub/util_temporary_storage.cuh +122 -0
  191. cuda/cccl/headers/include/cub/util_type.cuh +1120 -0
  192. cuda/cccl/headers/include/cub/util_vsmem.cuh +253 -0
  193. cuda/cccl/headers/include/cub/version.cuh +89 -0
  194. cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_shfl.cuh +329 -0
  195. cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_smem.cuh +177 -0
  196. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +736 -0
  197. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +407 -0
  198. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +952 -0
  199. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_smem.cuh +715 -0
  200. cuda/cccl/headers/include/cub/warp/warp_exchange.cuh +405 -0
  201. cuda/cccl/headers/include/cub/warp/warp_load.cuh +614 -0
  202. cuda/cccl/headers/include/cub/warp/warp_merge_sort.cuh +169 -0
  203. cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +824 -0
  204. cuda/cccl/headers/include/cub/warp/warp_scan.cuh +1886 -0
  205. cuda/cccl/headers/include/cub/warp/warp_store.cuh +520 -0
  206. cuda/cccl/headers/include/cub/warp/warp_utils.cuh +61 -0
  207. cuda/cccl/headers/include/cuda/__algorithm/common.h +68 -0
  208. cuda/cccl/headers/include/cuda/__algorithm/copy.h +196 -0
  209. cuda/cccl/headers/include/cuda/__algorithm/fill.h +107 -0
  210. cuda/cccl/headers/include/cuda/__annotated_ptr/access_property.h +165 -0
  211. cuda/cccl/headers/include/cuda/__annotated_ptr/access_property_encoding.h +172 -0
  212. cuda/cccl/headers/include/cuda/__annotated_ptr/annotated_ptr.h +217 -0
  213. cuda/cccl/headers/include/cuda/__annotated_ptr/annotated_ptr_base.h +100 -0
  214. cuda/cccl/headers/include/cuda/__annotated_ptr/apply_access_property.h +83 -0
  215. cuda/cccl/headers/include/cuda/__annotated_ptr/associate_access_property.h +128 -0
  216. cuda/cccl/headers/include/cuda/__annotated_ptr/createpolicy.h +210 -0
  217. cuda/cccl/headers/include/cuda/__atomic/atomic.h +145 -0
  218. cuda/cccl/headers/include/cuda/__barrier/async_contract_fulfillment.h +39 -0
  219. cuda/cccl/headers/include/cuda/__barrier/barrier.h +65 -0
  220. cuda/cccl/headers/include/cuda/__barrier/barrier_arrive_tx.h +102 -0
  221. cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +468 -0
  222. cuda/cccl/headers/include/cuda/__barrier/barrier_expect_tx.h +74 -0
  223. cuda/cccl/headers/include/cuda/__barrier/barrier_native_handle.h +45 -0
  224. cuda/cccl/headers/include/cuda/__barrier/barrier_thread_scope.h +60 -0
  225. cuda/cccl/headers/include/cuda/__bit/bit_reverse.h +171 -0
  226. cuda/cccl/headers/include/cuda/__bit/bitfield.h +122 -0
  227. cuda/cccl/headers/include/cuda/__bit/bitmask.h +90 -0
  228. cuda/cccl/headers/include/cuda/__cccl_config +36 -0
  229. cuda/cccl/headers/include/cuda/__cmath/ceil_div.h +124 -0
  230. cuda/cccl/headers/include/cuda/__cmath/fast_modulo_division.h +249 -0
  231. cuda/cccl/headers/include/cuda/__cmath/ilog.h +195 -0
  232. cuda/cccl/headers/include/cuda/__cmath/ipow.h +107 -0
  233. cuda/cccl/headers/include/cuda/__cmath/isqrt.h +80 -0
  234. cuda/cccl/headers/include/cuda/__cmath/neg.h +47 -0
  235. cuda/cccl/headers/include/cuda/__cmath/pow2.h +74 -0
  236. cuda/cccl/headers/include/cuda/__cmath/round_down.h +102 -0
  237. cuda/cccl/headers/include/cuda/__cmath/round_up.h +104 -0
  238. cuda/cccl/headers/include/cuda/__cmath/uabs.h +57 -0
  239. cuda/cccl/headers/include/cuda/__complex/complex.h +238 -0
  240. cuda/cccl/headers/include/cuda/__complex/get_real_imag.h +93 -0
  241. cuda/cccl/headers/include/cuda/__complex/traits.h +64 -0
  242. cuda/cccl/headers/include/cuda/__complex_ +28 -0
  243. cuda/cccl/headers/include/cuda/__device/all_devices.h +240 -0
  244. cuda/cccl/headers/include/cuda/__device/arch_traits.h +613 -0
  245. cuda/cccl/headers/include/cuda/__device/attributes.h +721 -0
  246. cuda/cccl/headers/include/cuda/__device/device_ref.h +185 -0
  247. cuda/cccl/headers/include/cuda/__device/physical_device.h +168 -0
  248. cuda/cccl/headers/include/cuda/__driver/driver_api.h +541 -0
  249. cuda/cccl/headers/include/cuda/__event/event.h +171 -0
  250. cuda/cccl/headers/include/cuda/__event/event_ref.h +158 -0
  251. cuda/cccl/headers/include/cuda/__event/timed_event.h +118 -0
  252. cuda/cccl/headers/include/cuda/__execution/determinism.h +91 -0
  253. cuda/cccl/headers/include/cuda/__execution/output_ordering.h +89 -0
  254. cuda/cccl/headers/include/cuda/__execution/require.h +75 -0
  255. cuda/cccl/headers/include/cuda/__execution/tune.h +70 -0
  256. cuda/cccl/headers/include/cuda/__functional/address_stability.h +131 -0
  257. cuda/cccl/headers/include/cuda/__functional/for_each_canceled.h +321 -0
  258. cuda/cccl/headers/include/cuda/__functional/maximum.h +58 -0
  259. cuda/cccl/headers/include/cuda/__functional/minimum.h +58 -0
  260. cuda/cccl/headers/include/cuda/__functional/proclaim_return_type.h +108 -0
  261. cuda/cccl/headers/include/cuda/__fwd/barrier.h +38 -0
  262. cuda/cccl/headers/include/cuda/__fwd/barrier_native_handle.h +42 -0
  263. cuda/cccl/headers/include/cuda/__fwd/complex.h +48 -0
  264. cuda/cccl/headers/include/cuda/__fwd/get_stream.h +38 -0
  265. cuda/cccl/headers/include/cuda/__fwd/pipeline.h +37 -0
  266. cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +49 -0
  267. cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +300 -0
  268. cuda/cccl/headers/include/cuda/__iterator/counting_iterator.h +483 -0
  269. cuda/cccl/headers/include/cuda/__iterator/discard_iterator.h +324 -0
  270. cuda/cccl/headers/include/cuda/__iterator/permutation_iterator.h +456 -0
  271. cuda/cccl/headers/include/cuda/__iterator/shuffle_iterator.h +334 -0
  272. cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +386 -0
  273. cuda/cccl/headers/include/cuda/__iterator/tabulate_output_iterator.h +344 -0
  274. cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +498 -0
  275. cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +501 -0
  276. cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +461 -0
  277. cuda/cccl/headers/include/cuda/__iterator/zip_function.h +112 -0
  278. cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +673 -0
  279. cuda/cccl/headers/include/cuda/__latch/latch.h +44 -0
  280. cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +462 -0
  281. cuda/cccl/headers/include/cuda/__mdspan/host_device_mdspan.h +63 -0
  282. cuda/cccl/headers/include/cuda/__mdspan/restrict_accessor.h +122 -0
  283. cuda/cccl/headers/include/cuda/__mdspan/restrict_mdspan.h +51 -0
  284. cuda/cccl/headers/include/cuda/__memcpy_async/check_preconditions.h +79 -0
  285. cuda/cccl/headers/include/cuda/__memcpy_async/completion_mechanism.h +47 -0
  286. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_bulk_shared_global.h +60 -0
  287. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_fallback.h +72 -0
  288. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_shared_global.h +148 -0
  289. cuda/cccl/headers/include/cuda/__memcpy_async/dispatch_memcpy_async.h +165 -0
  290. cuda/cccl/headers/include/cuda/__memcpy_async/is_local_smem_barrier.h +53 -0
  291. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async.h +179 -0
  292. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_barrier.h +99 -0
  293. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_tx.h +104 -0
  294. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_completion.h +170 -0
  295. cuda/cccl/headers/include/cuda/__memcpy_async/try_get_barrier_handle.h +59 -0
  296. cuda/cccl/headers/include/cuda/__memory/address_space.h +227 -0
  297. cuda/cccl/headers/include/cuda/__memory/align_down.h +56 -0
  298. cuda/cccl/headers/include/cuda/__memory/align_up.h +56 -0
  299. cuda/cccl/headers/include/cuda/__memory/aligned_size.h +61 -0
  300. cuda/cccl/headers/include/cuda/__memory/check_address.h +111 -0
  301. cuda/cccl/headers/include/cuda/__memory/discard_memory.h +64 -0
  302. cuda/cccl/headers/include/cuda/__memory/get_device_address.h +58 -0
  303. cuda/cccl/headers/include/cuda/__memory/is_aligned.h +47 -0
  304. cuda/cccl/headers/include/cuda/__memory/ptr_rebind.h +75 -0
  305. cuda/cccl/headers/include/cuda/__memory_resource/get_memory_resource.h +82 -0
  306. cuda/cccl/headers/include/cuda/__memory_resource/get_property.h +153 -0
  307. cuda/cccl/headers/include/cuda/__memory_resource/properties.h +69 -0
  308. cuda/cccl/headers/include/cuda/__memory_resource/resource.h +125 -0
  309. cuda/cccl/headers/include/cuda/__memory_resource/resource_ref.h +654 -0
  310. cuda/cccl/headers/include/cuda/__numeric/add_overflow.h +306 -0
  311. cuda/cccl/headers/include/cuda/__numeric/narrow.h +108 -0
  312. cuda/cccl/headers/include/cuda/__numeric/overflow_cast.h +59 -0
  313. cuda/cccl/headers/include/cuda/__numeric/overflow_result.h +43 -0
  314. cuda/cccl/headers/include/cuda/__nvtx/nvtx.h +120 -0
  315. cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +2982 -0
  316. cuda/cccl/headers/include/cuda/__ptx/instructions/barrier_cluster.h +43 -0
  317. cuda/cccl/headers/include/cuda/__ptx/instructions/bfind.h +41 -0
  318. cuda/cccl/headers/include/cuda/__ptx/instructions/bmsk.h +41 -0
  319. cuda/cccl/headers/include/cuda/__ptx/instructions/clusterlaunchcontrol.h +41 -0
  320. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk.h +44 -0
  321. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_commit_group.h +43 -0
  322. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h +45 -0
  323. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_wait_group.h +43 -0
  324. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_mbarrier_arrive.h +42 -0
  325. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_reduce_async_bulk.h +60 -0
  326. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_reduce_async_bulk_tensor.h +43 -0
  327. cuda/cccl/headers/include/cuda/__ptx/instructions/elect_sync.h +41 -0
  328. cuda/cccl/headers/include/cuda/__ptx/instructions/exit.h +41 -0
  329. cuda/cccl/headers/include/cuda/__ptx/instructions/fence.h +49 -0
  330. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/barrier_cluster.h +115 -0
  331. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/bfind.h +190 -0
  332. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/bmsk.h +54 -0
  333. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/clusterlaunchcontrol.h +242 -0
  334. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk.h +197 -0
  335. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.h +25 -0
  336. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.h +54 -0
  337. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.h +997 -0
  338. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_gather_scatter.h +318 -0
  339. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.h +671 -0
  340. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.h +46 -0
  341. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive.h +26 -0
  342. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive_noinc.h +26 -0
  343. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.h +1470 -0
  344. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.h +132 -0
  345. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.h +132 -0
  346. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.h +601 -0
  347. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/elect_sync.h +36 -0
  348. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/exit.h +25 -0
  349. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence.h +208 -0
  350. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.h +31 -0
  351. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_alias.h +25 -0
  352. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_async.h +58 -0
  353. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_async_generic_sync_restrict.h +64 -0
  354. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.h +102 -0
  355. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_sync_restrict.h +64 -0
  356. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/get_sreg.h +949 -0
  357. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/getctarank.h +32 -0
  358. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/ld.h +5542 -0
  359. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive.h +399 -0
  360. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.h +184 -0
  361. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.h +34 -0
  362. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_expect_tx.h +102 -0
  363. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_init.h +27 -0
  364. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.h +143 -0
  365. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.h +144 -0
  366. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.h +286 -0
  367. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.h +290 -0
  368. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_ld_reduce.h +2202 -0
  369. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_red.h +1362 -0
  370. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_st.h +236 -0
  371. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/prmt.h +230 -0
  372. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/red_async.h +460 -0
  373. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/shl.h +96 -0
  374. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/shr.h +168 -0
  375. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st.h +1490 -0
  376. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st_async.h +123 -0
  377. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st_bulk.h +31 -0
  378. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_alloc.h +132 -0
  379. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_commit.h +99 -0
  380. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_cp.h +765 -0
  381. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_fence.h +58 -0
  382. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_ld.h +4927 -0
  383. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_mma.h +4291 -0
  384. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_mma_ws.h +7110 -0
  385. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_shift.h +42 -0
  386. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_st.h +5063 -0
  387. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_wait.h +56 -0
  388. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.h +71 -0
  389. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tensormap_replace.h +1030 -0
  390. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/trap.h +25 -0
  391. cuda/cccl/headers/include/cuda/__ptx/instructions/get_sreg.h +43 -0
  392. cuda/cccl/headers/include/cuda/__ptx/instructions/getctarank.h +43 -0
  393. cuda/cccl/headers/include/cuda/__ptx/instructions/ld.h +41 -0
  394. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_arrive.h +45 -0
  395. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_expect_tx.h +41 -0
  396. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_init.h +43 -0
  397. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_wait.h +46 -0
  398. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_ld_reduce.h +41 -0
  399. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_red.h +41 -0
  400. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_st.h +41 -0
  401. cuda/cccl/headers/include/cuda/__ptx/instructions/prmt.h +41 -0
  402. cuda/cccl/headers/include/cuda/__ptx/instructions/red_async.h +43 -0
  403. cuda/cccl/headers/include/cuda/__ptx/instructions/shfl_sync.h +244 -0
  404. cuda/cccl/headers/include/cuda/__ptx/instructions/shl.h +41 -0
  405. cuda/cccl/headers/include/cuda/__ptx/instructions/shr.h +41 -0
  406. cuda/cccl/headers/include/cuda/__ptx/instructions/st.h +41 -0
  407. cuda/cccl/headers/include/cuda/__ptx/instructions/st_async.h +43 -0
  408. cuda/cccl/headers/include/cuda/__ptx/instructions/st_bulk.h +41 -0
  409. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_alloc.h +41 -0
  410. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_commit.h +41 -0
  411. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_cp.h +41 -0
  412. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_fence.h +41 -0
  413. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_ld.h +41 -0
  414. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_mma.h +41 -0
  415. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_mma_ws.h +41 -0
  416. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_shift.h +41 -0
  417. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_st.h +41 -0
  418. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_wait.h +41 -0
  419. cuda/cccl/headers/include/cuda/__ptx/instructions/tensormap_cp_fenceproxy.h +43 -0
  420. cuda/cccl/headers/include/cuda/__ptx/instructions/tensormap_replace.h +43 -0
  421. cuda/cccl/headers/include/cuda/__ptx/instructions/trap.h +41 -0
  422. cuda/cccl/headers/include/cuda/__ptx/ptx_dot_variants.h +230 -0
  423. cuda/cccl/headers/include/cuda/__ptx/ptx_helper_functions.h +176 -0
  424. cuda/cccl/headers/include/cuda/__random/feistel_bijection.h +105 -0
  425. cuda/cccl/headers/include/cuda/__random/random_bijection.h +88 -0
  426. cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +97 -0
  427. cuda/cccl/headers/include/cuda/__runtime/types.h +41 -0
  428. cuda/cccl/headers/include/cuda/__semaphore/counting_semaphore.h +53 -0
  429. cuda/cccl/headers/include/cuda/__stream/get_stream.h +110 -0
  430. cuda/cccl/headers/include/cuda/__stream/stream.h +142 -0
  431. cuda/cccl/headers/include/cuda/__stream/stream_ref.h +296 -0
  432. cuda/cccl/headers/include/cuda/__type_traits/is_floating_point.h +47 -0
  433. cuda/cccl/headers/include/cuda/__type_traits/is_specialization_of.h +37 -0
  434. cuda/cccl/headers/include/cuda/__utility/__basic_any/access.h +88 -0
  435. cuda/cccl/headers/include/cuda/__utility/__basic_any/any_cast.h +83 -0
  436. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_base.h +148 -0
  437. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_from.h +96 -0
  438. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_fwd.h +128 -0
  439. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_ptr.h +304 -0
  440. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_ref.h +337 -0
  441. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_value.h +590 -0
  442. cuda/cccl/headers/include/cuda/__utility/__basic_any/conversions.h +169 -0
  443. cuda/cccl/headers/include/cuda/__utility/__basic_any/dynamic_any_cast.h +107 -0
  444. cuda/cccl/headers/include/cuda/__utility/__basic_any/interfaces.h +359 -0
  445. cuda/cccl/headers/include/cuda/__utility/__basic_any/iset.h +142 -0
  446. cuda/cccl/headers/include/cuda/__utility/__basic_any/overrides.h +64 -0
  447. cuda/cccl/headers/include/cuda/__utility/__basic_any/rtti.h +257 -0
  448. cuda/cccl/headers/include/cuda/__utility/__basic_any/semiregular.h +322 -0
  449. cuda/cccl/headers/include/cuda/__utility/__basic_any/storage.h +79 -0
  450. cuda/cccl/headers/include/cuda/__utility/__basic_any/tagged_ptr.h +58 -0
  451. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtcall.h +162 -0
  452. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_functions.h +184 -0
  453. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_ptrs.h +80 -0
  454. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_tables.h +155 -0
  455. cuda/cccl/headers/include/cuda/__utility/basic_any.h +507 -0
  456. cuda/cccl/headers/include/cuda/__utility/immovable.h +50 -0
  457. cuda/cccl/headers/include/cuda/__utility/inherit.h +36 -0
  458. cuda/cccl/headers/include/cuda/__utility/no_init.h +29 -0
  459. cuda/cccl/headers/include/cuda/__utility/static_for.h +79 -0
  460. cuda/cccl/headers/include/cuda/__warp/lane_mask.h +326 -0
  461. cuda/cccl/headers/include/cuda/__warp/warp_match_all.h +65 -0
  462. cuda/cccl/headers/include/cuda/__warp/warp_shuffle.h +251 -0
  463. cuda/cccl/headers/include/cuda/access_property +26 -0
  464. cuda/cccl/headers/include/cuda/algorithm +27 -0
  465. cuda/cccl/headers/include/cuda/annotated_ptr +29 -0
  466. cuda/cccl/headers/include/cuda/atomic +27 -0
  467. cuda/cccl/headers/include/cuda/barrier +267 -0
  468. cuda/cccl/headers/include/cuda/bit +29 -0
  469. cuda/cccl/headers/include/cuda/cmath +36 -0
  470. cuda/cccl/headers/include/cuda/devices +20 -0
  471. cuda/cccl/headers/include/cuda/discard_memory +32 -0
  472. cuda/cccl/headers/include/cuda/functional +32 -0
  473. cuda/cccl/headers/include/cuda/iterator +38 -0
  474. cuda/cccl/headers/include/cuda/latch +27 -0
  475. cuda/cccl/headers/include/cuda/mdspan +28 -0
  476. cuda/cccl/headers/include/cuda/memory +34 -0
  477. cuda/cccl/headers/include/cuda/memory_resource +35 -0
  478. cuda/cccl/headers/include/cuda/numeric +29 -0
  479. cuda/cccl/headers/include/cuda/pipeline +579 -0
  480. cuda/cccl/headers/include/cuda/ptx +128 -0
  481. cuda/cccl/headers/include/cuda/semaphore +31 -0
  482. cuda/cccl/headers/include/cuda/std/__algorithm/adjacent_find.h +59 -0
  483. cuda/cccl/headers/include/cuda/std/__algorithm/all_of.h +45 -0
  484. cuda/cccl/headers/include/cuda/std/__algorithm/any_of.h +45 -0
  485. cuda/cccl/headers/include/cuda/std/__algorithm/binary_search.h +53 -0
  486. cuda/cccl/headers/include/cuda/std/__algorithm/clamp.h +48 -0
  487. cuda/cccl/headers/include/cuda/std/__algorithm/comp.h +58 -0
  488. cuda/cccl/headers/include/cuda/std/__algorithm/comp_ref_type.h +85 -0
  489. cuda/cccl/headers/include/cuda/std/__algorithm/copy.h +142 -0
  490. cuda/cccl/headers/include/cuda/std/__algorithm/copy_backward.h +80 -0
  491. cuda/cccl/headers/include/cuda/std/__algorithm/copy_if.h +47 -0
  492. cuda/cccl/headers/include/cuda/std/__algorithm/copy_n.h +73 -0
  493. cuda/cccl/headers/include/cuda/std/__algorithm/count.h +49 -0
  494. cuda/cccl/headers/include/cuda/std/__algorithm/count_if.h +49 -0
  495. cuda/cccl/headers/include/cuda/std/__algorithm/equal.h +128 -0
  496. cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +101 -0
  497. cuda/cccl/headers/include/cuda/std/__algorithm/fill.h +58 -0
  498. cuda/cccl/headers/include/cuda/std/__algorithm/fill_n.h +51 -0
  499. cuda/cccl/headers/include/cuda/std/__algorithm/find.h +62 -0
  500. cuda/cccl/headers/include/cuda/std/__algorithm/find_end.h +225 -0
  501. cuda/cccl/headers/include/cuda/std/__algorithm/find_first_of.h +73 -0
  502. cuda/cccl/headers/include/cuda/std/__algorithm/find_if.h +46 -0
  503. cuda/cccl/headers/include/cuda/std/__algorithm/find_if_not.h +46 -0
  504. cuda/cccl/headers/include/cuda/std/__algorithm/for_each.h +42 -0
  505. cuda/cccl/headers/include/cuda/std/__algorithm/for_each_n.h +48 -0
  506. cuda/cccl/headers/include/cuda/std/__algorithm/generate.h +41 -0
  507. cuda/cccl/headers/include/cuda/std/__algorithm/generate_n.h +46 -0
  508. cuda/cccl/headers/include/cuda/std/__algorithm/half_positive.h +49 -0
  509. cuda/cccl/headers/include/cuda/std/__algorithm/in_fun_result.h +55 -0
  510. cuda/cccl/headers/include/cuda/std/__algorithm/includes.h +92 -0
  511. cuda/cccl/headers/include/cuda/std/__algorithm/is_heap.h +50 -0
  512. cuda/cccl/headers/include/cuda/std/__algorithm/is_heap_until.h +83 -0
  513. cuda/cccl/headers/include/cuda/std/__algorithm/is_partitioned.h +57 -0
  514. cuda/cccl/headers/include/cuda/std/__algorithm/is_permutation.h +252 -0
  515. cuda/cccl/headers/include/cuda/std/__algorithm/is_sorted.h +49 -0
  516. cuda/cccl/headers/include/cuda/std/__algorithm/is_sorted_until.h +68 -0
  517. cuda/cccl/headers/include/cuda/std/__algorithm/iter_swap.h +82 -0
  518. cuda/cccl/headers/include/cuda/std/__algorithm/iterator_operations.h +185 -0
  519. cuda/cccl/headers/include/cuda/std/__algorithm/lexicographical_compare.h +68 -0
  520. cuda/cccl/headers/include/cuda/std/__algorithm/lower_bound.h +82 -0
  521. cuda/cccl/headers/include/cuda/std/__algorithm/make_heap.h +70 -0
  522. cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +96 -0
  523. cuda/cccl/headers/include/cuda/std/__algorithm/max.h +62 -0
  524. cuda/cccl/headers/include/cuda/std/__algorithm/max_element.h +67 -0
  525. cuda/cccl/headers/include/cuda/std/__algorithm/merge.h +89 -0
  526. cuda/cccl/headers/include/cuda/std/__algorithm/min.h +62 -0
  527. cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +87 -0
  528. cuda/cccl/headers/include/cuda/std/__algorithm/minmax.h +66 -0
  529. cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +140 -0
  530. cuda/cccl/headers/include/cuda/std/__algorithm/mismatch.h +83 -0
  531. cuda/cccl/headers/include/cuda/std/__algorithm/move.h +86 -0
  532. cuda/cccl/headers/include/cuda/std/__algorithm/move_backward.h +84 -0
  533. cuda/cccl/headers/include/cuda/std/__algorithm/next_permutation.h +88 -0
  534. cuda/cccl/headers/include/cuda/std/__algorithm/none_of.h +45 -0
  535. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort.h +102 -0
  536. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort_copy.h +122 -0
  537. cuda/cccl/headers/include/cuda/std/__algorithm/partition.h +120 -0
  538. cuda/cccl/headers/include/cuda/std/__algorithm/partition_copy.h +59 -0
  539. cuda/cccl/headers/include/cuda/std/__algorithm/partition_point.h +61 -0
  540. cuda/cccl/headers/include/cuda/std/__algorithm/pop_heap.h +93 -0
  541. cuda/cccl/headers/include/cuda/std/__algorithm/prev_permutation.h +88 -0
  542. cuda/cccl/headers/include/cuda/std/__algorithm/push_heap.h +100 -0
  543. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_for_each.h +84 -0
  544. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_for_each_n.h +68 -0
  545. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_iterator_concept.h +65 -0
  546. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_min.h +98 -0
  547. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_min_element.h +68 -0
  548. cuda/cccl/headers/include/cuda/std/__algorithm/remove.h +55 -0
  549. cuda/cccl/headers/include/cuda/std/__algorithm/remove_copy.h +47 -0
  550. cuda/cccl/headers/include/cuda/std/__algorithm/remove_copy_if.h +47 -0
  551. cuda/cccl/headers/include/cuda/std/__algorithm/remove_if.h +56 -0
  552. cuda/cccl/headers/include/cuda/std/__algorithm/replace.h +45 -0
  553. cuda/cccl/headers/include/cuda/std/__algorithm/replace_copy.h +54 -0
  554. cuda/cccl/headers/include/cuda/std/__algorithm/replace_copy_if.h +50 -0
  555. cuda/cccl/headers/include/cuda/std/__algorithm/replace_if.h +45 -0
  556. cuda/cccl/headers/include/cuda/std/__algorithm/reverse.h +81 -0
  557. cuda/cccl/headers/include/cuda/std/__algorithm/reverse_copy.h +43 -0
  558. cuda/cccl/headers/include/cuda/std/__algorithm/rotate.h +261 -0
  559. cuda/cccl/headers/include/cuda/std/__algorithm/rotate_copy.h +40 -0
  560. cuda/cccl/headers/include/cuda/std/__algorithm/search.h +185 -0
  561. cuda/cccl/headers/include/cuda/std/__algorithm/search_n.h +163 -0
  562. cuda/cccl/headers/include/cuda/std/__algorithm/set_difference.h +95 -0
  563. cuda/cccl/headers/include/cuda/std/__algorithm/set_intersection.h +122 -0
  564. cuda/cccl/headers/include/cuda/std/__algorithm/set_symmetric_difference.h +134 -0
  565. cuda/cccl/headers/include/cuda/std/__algorithm/set_union.h +128 -0
  566. cuda/cccl/headers/include/cuda/std/__algorithm/shift_left.h +84 -0
  567. cuda/cccl/headers/include/cuda/std/__algorithm/shift_right.h +144 -0
  568. cuda/cccl/headers/include/cuda/std/__algorithm/sift_down.h +139 -0
  569. cuda/cccl/headers/include/cuda/std/__algorithm/sort_heap.h +70 -0
  570. cuda/cccl/headers/include/cuda/std/__algorithm/swap_ranges.h +78 -0
  571. cuda/cccl/headers/include/cuda/std/__algorithm/transform.h +59 -0
  572. cuda/cccl/headers/include/cuda/std/__algorithm/unique.h +76 -0
  573. cuda/cccl/headers/include/cuda/std/__algorithm/unique_copy.h +155 -0
  574. cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_iter.h +95 -0
  575. cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_range.h +126 -0
  576. cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +83 -0
  577. cuda/cccl/headers/include/cuda/std/__algorithm_ +26 -0
  578. cuda/cccl/headers/include/cuda/std/__atomic/api/common.h +192 -0
  579. cuda/cccl/headers/include/cuda/std/__atomic/api/owned.h +136 -0
  580. cuda/cccl/headers/include/cuda/std/__atomic/api/reference.h +118 -0
  581. cuda/cccl/headers/include/cuda/std/__atomic/functions/common.h +58 -0
  582. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_local.h +208 -0
  583. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_derived.h +401 -0
  584. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated.h +3971 -0
  585. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h +177 -0
  586. cuda/cccl/headers/include/cuda/std/__atomic/functions/host.h +211 -0
  587. cuda/cccl/headers/include/cuda/std/__atomic/functions.h +33 -0
  588. cuda/cccl/headers/include/cuda/std/__atomic/order.h +159 -0
  589. cuda/cccl/headers/include/cuda/std/__atomic/platform/msvc_to_builtins.h +654 -0
  590. cuda/cccl/headers/include/cuda/std/__atomic/platform.h +93 -0
  591. cuda/cccl/headers/include/cuda/std/__atomic/scopes.h +105 -0
  592. cuda/cccl/headers/include/cuda/std/__atomic/types/base.h +249 -0
  593. cuda/cccl/headers/include/cuda/std/__atomic/types/common.h +104 -0
  594. cuda/cccl/headers/include/cuda/std/__atomic/types/locked.h +225 -0
  595. cuda/cccl/headers/include/cuda/std/__atomic/types/reference.h +72 -0
  596. cuda/cccl/headers/include/cuda/std/__atomic/types/small.h +228 -0
  597. cuda/cccl/headers/include/cuda/std/__atomic/types.h +52 -0
  598. cuda/cccl/headers/include/cuda/std/__atomic/wait/notify_wait.h +95 -0
  599. cuda/cccl/headers/include/cuda/std/__atomic/wait/polling.h +65 -0
  600. cuda/cccl/headers/include/cuda/std/__barrier/barrier.h +227 -0
  601. cuda/cccl/headers/include/cuda/std/__barrier/empty_completion.h +37 -0
  602. cuda/cccl/headers/include/cuda/std/__barrier/poll_tester.h +82 -0
  603. cuda/cccl/headers/include/cuda/std/__bit/bit_cast.h +76 -0
  604. cuda/cccl/headers/include/cuda/std/__bit/byteswap.h +185 -0
  605. cuda/cccl/headers/include/cuda/std/__bit/countl.h +167 -0
  606. cuda/cccl/headers/include/cuda/std/__bit/countr.h +185 -0
  607. cuda/cccl/headers/include/cuda/std/__bit/endian.h +39 -0
  608. cuda/cccl/headers/include/cuda/std/__bit/has_single_bit.h +43 -0
  609. cuda/cccl/headers/include/cuda/std/__bit/integral.h +126 -0
  610. cuda/cccl/headers/include/cuda/std/__bit/popcount.h +154 -0
  611. cuda/cccl/headers/include/cuda/std/__bit/reference.h +1272 -0
  612. cuda/cccl/headers/include/cuda/std/__bit/rotate.h +94 -0
  613. cuda/cccl/headers/include/cuda/std/__cccl/architecture.h +78 -0
  614. cuda/cccl/headers/include/cuda/std/__cccl/assert.h +161 -0
  615. cuda/cccl/headers/include/cuda/std/__cccl/attributes.h +206 -0
  616. cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +676 -0
  617. cuda/cccl/headers/include/cuda/std/__cccl/compiler.h +217 -0
  618. cuda/cccl/headers/include/cuda/std/__cccl/cuda_capabilities.h +51 -0
  619. cuda/cccl/headers/include/cuda/std/__cccl/cuda_toolkit.h +56 -0
  620. cuda/cccl/headers/include/cuda/std/__cccl/deprecated.h +88 -0
  621. cuda/cccl/headers/include/cuda/std/__cccl/diagnostic.h +131 -0
  622. cuda/cccl/headers/include/cuda/std/__cccl/dialect.h +123 -0
  623. cuda/cccl/headers/include/cuda/std/__cccl/epilogue.h +344 -0
  624. cuda/cccl/headers/include/cuda/std/__cccl/exceptions.h +79 -0
  625. cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +68 -0
  626. cuda/cccl/headers/include/cuda/std/__cccl/extended_data_types.h +160 -0
  627. cuda/cccl/headers/include/cuda/std/__cccl/is_non_narrowing_convertible.h +73 -0
  628. cuda/cccl/headers/include/cuda/std/__cccl/os.h +54 -0
  629. cuda/cccl/headers/include/cuda/std/__cccl/preprocessor.h +1284 -0
  630. cuda/cccl/headers/include/cuda/std/__cccl/prologue.h +281 -0
  631. cuda/cccl/headers/include/cuda/std/__cccl/ptx_isa.h +253 -0
  632. cuda/cccl/headers/include/cuda/std/__cccl/rtti.h +72 -0
  633. cuda/cccl/headers/include/cuda/std/__cccl/sequence_access.h +87 -0
  634. cuda/cccl/headers/include/cuda/std/__cccl/system_header.h +38 -0
  635. cuda/cccl/headers/include/cuda/std/__cccl/unreachable.h +31 -0
  636. cuda/cccl/headers/include/cuda/std/__cccl/version.h +26 -0
  637. cuda/cccl/headers/include/cuda/std/__cccl/visibility.h +171 -0
  638. cuda/cccl/headers/include/cuda/std/__charconv/chars_format.h +81 -0
  639. cuda/cccl/headers/include/cuda/std/__charconv/from_chars.h +154 -0
  640. cuda/cccl/headers/include/cuda/std/__charconv/from_chars_result.h +56 -0
  641. cuda/cccl/headers/include/cuda/std/__charconv/to_chars.h +148 -0
  642. cuda/cccl/headers/include/cuda/std/__charconv/to_chars_result.h +56 -0
  643. cuda/cccl/headers/include/cuda/std/__charconv_ +31 -0
  644. cuda/cccl/headers/include/cuda/std/__chrono/calendar.h +54 -0
  645. cuda/cccl/headers/include/cuda/std/__chrono/day.h +162 -0
  646. cuda/cccl/headers/include/cuda/std/__chrono/duration.h +503 -0
  647. cuda/cccl/headers/include/cuda/std/__chrono/file_clock.h +55 -0
  648. cuda/cccl/headers/include/cuda/std/__chrono/high_resolution_clock.h +46 -0
  649. cuda/cccl/headers/include/cuda/std/__chrono/month.h +187 -0
  650. cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +60 -0
  651. cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +80 -0
  652. cuda/cccl/headers/include/cuda/std/__chrono/time_point.h +258 -0
  653. cuda/cccl/headers/include/cuda/std/__chrono/year.h +186 -0
  654. cuda/cccl/headers/include/cuda/std/__cmath/abs.h +127 -0
  655. cuda/cccl/headers/include/cuda/std/__cmath/copysign.h +88 -0
  656. cuda/cccl/headers/include/cuda/std/__cmath/error_functions.h +200 -0
  657. cuda/cccl/headers/include/cuda/std/__cmath/exponential_functions.h +784 -0
  658. cuda/cccl/headers/include/cuda/std/__cmath/fdim.h +118 -0
  659. cuda/cccl/headers/include/cuda/std/__cmath/fma.h +125 -0
  660. cuda/cccl/headers/include/cuda/std/__cmath/fpclassify.h +231 -0
  661. cuda/cccl/headers/include/cuda/std/__cmath/gamma.h +205 -0
  662. cuda/cccl/headers/include/cuda/std/__cmath/hyperbolic_functions.h +286 -0
  663. cuda/cccl/headers/include/cuda/std/__cmath/hypot.h +221 -0
  664. cuda/cccl/headers/include/cuda/std/__cmath/inverse_hyperbolic_functions.h +286 -0
  665. cuda/cccl/headers/include/cuda/std/__cmath/inverse_trigonometric_functions.h +371 -0
  666. cuda/cccl/headers/include/cuda/std/__cmath/isfinite.h +167 -0
  667. cuda/cccl/headers/include/cuda/std/__cmath/isinf.h +205 -0
  668. cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +180 -0
  669. cuda/cccl/headers/include/cuda/std/__cmath/isnormal.h +138 -0
  670. cuda/cccl/headers/include/cuda/std/__cmath/lerp.h +101 -0
  671. cuda/cccl/headers/include/cuda/std/__cmath/logarithms.h +534 -0
  672. cuda/cccl/headers/include/cuda/std/__cmath/min_max.h +260 -0
  673. cuda/cccl/headers/include/cuda/std/__cmath/modulo.h +208 -0
  674. cuda/cccl/headers/include/cuda/std/__cmath/nan.h +54 -0
  675. cuda/cccl/headers/include/cuda/std/__cmath/remainder.h +206 -0
  676. cuda/cccl/headers/include/cuda/std/__cmath/roots.h +199 -0
  677. cuda/cccl/headers/include/cuda/std/__cmath/rounding_functions.h +984 -0
  678. cuda/cccl/headers/include/cuda/std/__cmath/signbit.h +56 -0
  679. cuda/cccl/headers/include/cuda/std/__cmath/traits.h +238 -0
  680. cuda/cccl/headers/include/cuda/std/__cmath/trigonometric_functions.h +328 -0
  681. cuda/cccl/headers/include/cuda/std/__complex/arg.h +84 -0
  682. cuda/cccl/headers/include/cuda/std/__complex/complex.h +674 -0
  683. cuda/cccl/headers/include/cuda/std/__complex/exponential_functions.h +411 -0
  684. cuda/cccl/headers/include/cuda/std/__complex/hyperbolic_functions.h +117 -0
  685. cuda/cccl/headers/include/cuda/std/__complex/inverse_hyperbolic_functions.h +216 -0
  686. cuda/cccl/headers/include/cuda/std/__complex/inverse_trigonometric_functions.h +131 -0
  687. cuda/cccl/headers/include/cuda/std/__complex/literals.h +106 -0
  688. cuda/cccl/headers/include/cuda/std/__complex/logarithms.h +303 -0
  689. cuda/cccl/headers/include/cuda/std/__complex/math.h +159 -0
  690. cuda/cccl/headers/include/cuda/std/__complex/nvbf16.h +322 -0
  691. cuda/cccl/headers/include/cuda/std/__complex/nvfp16.h +321 -0
  692. cuda/cccl/headers/include/cuda/std/__complex/roots.h +214 -0
  693. cuda/cccl/headers/include/cuda/std/__complex/trigonometric_functions.h +61 -0
  694. cuda/cccl/headers/include/cuda/std/__complex/tuple.h +107 -0
  695. cuda/cccl/headers/include/cuda/std/__complex/vector_support.h +130 -0
  696. cuda/cccl/headers/include/cuda/std/__concepts/arithmetic.h +56 -0
  697. cuda/cccl/headers/include/cuda/std/__concepts/assignable.h +64 -0
  698. cuda/cccl/headers/include/cuda/std/__concepts/boolean_testable.h +63 -0
  699. cuda/cccl/headers/include/cuda/std/__concepts/class_or_enum.h +45 -0
  700. cuda/cccl/headers/include/cuda/std/__concepts/common_reference_with.h +69 -0
  701. cuda/cccl/headers/include/cuda/std/__concepts/common_with.h +82 -0
  702. cuda/cccl/headers/include/cuda/std/__concepts/concept_macros.h +341 -0
  703. cuda/cccl/headers/include/cuda/std/__concepts/constructible.h +174 -0
  704. cuda/cccl/headers/include/cuda/std/__concepts/convertible_to.h +70 -0
  705. cuda/cccl/headers/include/cuda/std/__concepts/copyable.h +60 -0
  706. cuda/cccl/headers/include/cuda/std/__concepts/derived_from.h +56 -0
  707. cuda/cccl/headers/include/cuda/std/__concepts/destructible.h +76 -0
  708. cuda/cccl/headers/include/cuda/std/__concepts/different_from.h +38 -0
  709. cuda/cccl/headers/include/cuda/std/__concepts/equality_comparable.h +100 -0
  710. cuda/cccl/headers/include/cuda/std/__concepts/invocable.h +80 -0
  711. cuda/cccl/headers/include/cuda/std/__concepts/movable.h +58 -0
  712. cuda/cccl/headers/include/cuda/std/__concepts/predicate.h +54 -0
  713. cuda/cccl/headers/include/cuda/std/__concepts/regular.h +54 -0
  714. cuda/cccl/headers/include/cuda/std/__concepts/relation.h +77 -0
  715. cuda/cccl/headers/include/cuda/std/__concepts/same_as.h +39 -0
  716. cuda/cccl/headers/include/cuda/std/__concepts/semiregular.h +54 -0
  717. cuda/cccl/headers/include/cuda/std/__concepts/swappable.h +206 -0
  718. cuda/cccl/headers/include/cuda/std/__concepts/totally_ordered.h +101 -0
  719. cuda/cccl/headers/include/cuda/std/__cstddef/byte.h +113 -0
  720. cuda/cccl/headers/include/cuda/std/__cstddef/types.h +52 -0
  721. cuda/cccl/headers/include/cuda/std/__cstdlib/abs.h +57 -0
  722. cuda/cccl/headers/include/cuda/std/__cstdlib/aligned_alloc.h +66 -0
  723. cuda/cccl/headers/include/cuda/std/__cstdlib/div.h +96 -0
  724. cuda/cccl/headers/include/cuda/std/__cstdlib/malloc.h +69 -0
  725. cuda/cccl/headers/include/cuda/std/__cstring/memcpy.h +61 -0
  726. cuda/cccl/headers/include/cuda/std/__cstring/memset.h +46 -0
  727. cuda/cccl/headers/include/cuda/std/__cuda/api_wrapper.h +62 -0
  728. cuda/cccl/headers/include/cuda/std/__cuda/ensure_current_device.h +72 -0
  729. cuda/cccl/headers/include/cuda/std/__exception/cuda_error.h +146 -0
  730. cuda/cccl/headers/include/cuda/std/__exception/terminate.h +73 -0
  731. cuda/cccl/headers/include/cuda/std/__execution/env.h +455 -0
  732. cuda/cccl/headers/include/cuda/std/__execution/policy.h +88 -0
  733. cuda/cccl/headers/include/cuda/std/__expected/bad_expected_access.h +127 -0
  734. cuda/cccl/headers/include/cuda/std/__expected/expected.h +1956 -0
  735. cuda/cccl/headers/include/cuda/std/__expected/expected_base.h +1050 -0
  736. cuda/cccl/headers/include/cuda/std/__expected/unexpect.h +37 -0
  737. cuda/cccl/headers/include/cuda/std/__expected/unexpected.h +172 -0
  738. cuda/cccl/headers/include/cuda/std/__floating_point/arithmetic.h +56 -0
  739. cuda/cccl/headers/include/cuda/std/__floating_point/cast.h +809 -0
  740. cuda/cccl/headers/include/cuda/std/__floating_point/cccl_fp.h +125 -0
  741. cuda/cccl/headers/include/cuda/std/__floating_point/common_type.h +48 -0
  742. cuda/cccl/headers/include/cuda/std/__floating_point/constants.h +376 -0
  743. cuda/cccl/headers/include/cuda/std/__floating_point/conversion_rank_order.h +124 -0
  744. cuda/cccl/headers/include/cuda/std/__floating_point/cuda_fp_types.h +113 -0
  745. cuda/cccl/headers/include/cuda/std/__floating_point/decompose.h +69 -0
  746. cuda/cccl/headers/include/cuda/std/__floating_point/format.h +162 -0
  747. cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +40 -0
  748. cuda/cccl/headers/include/cuda/std/__floating_point/mask.h +78 -0
  749. cuda/cccl/headers/include/cuda/std/__floating_point/native_type.h +81 -0
  750. cuda/cccl/headers/include/cuda/std/__floating_point/overflow_handler.h +139 -0
  751. cuda/cccl/headers/include/cuda/std/__floating_point/properties.h +229 -0
  752. cuda/cccl/headers/include/cuda/std/__floating_point/storage.h +248 -0
  753. cuda/cccl/headers/include/cuda/std/__floating_point/traits.h +172 -0
  754. cuda/cccl/headers/include/cuda/std/__format/buffer.h +48 -0
  755. cuda/cccl/headers/include/cuda/std/__format/concepts.h +69 -0
  756. cuda/cccl/headers/include/cuda/std/__format/format_arg.h +282 -0
  757. cuda/cccl/headers/include/cuda/std/__format/format_arg_store.h +279 -0
  758. cuda/cccl/headers/include/cuda/std/__format/format_args.h +122 -0
  759. cuda/cccl/headers/include/cuda/std/__format/format_context.h +92 -0
  760. cuda/cccl/headers/include/cuda/std/__format/format_error.h +76 -0
  761. cuda/cccl/headers/include/cuda/std/__format/format_integral.h +237 -0
  762. cuda/cccl/headers/include/cuda/std/__format/format_parse_context.h +124 -0
  763. cuda/cccl/headers/include/cuda/std/__format/format_spec_parser.h +1230 -0
  764. cuda/cccl/headers/include/cuda/std/__format/formatter.h +59 -0
  765. cuda/cccl/headers/include/cuda/std/__format/formatters/bool.h +101 -0
  766. cuda/cccl/headers/include/cuda/std/__format/formatters/char.h +124 -0
  767. cuda/cccl/headers/include/cuda/std/__format/formatters/fp.h +101 -0
  768. cuda/cccl/headers/include/cuda/std/__format/formatters/int.h +174 -0
  769. cuda/cccl/headers/include/cuda/std/__format/formatters/ptr.h +104 -0
  770. cuda/cccl/headers/include/cuda/std/__format/formatters/str.h +178 -0
  771. cuda/cccl/headers/include/cuda/std/__format/output_utils.h +272 -0
  772. cuda/cccl/headers/include/cuda/std/__format/parse_arg_id.h +138 -0
  773. cuda/cccl/headers/include/cuda/std/__format_ +45 -0
  774. cuda/cccl/headers/include/cuda/std/__functional/binary_function.h +63 -0
  775. cuda/cccl/headers/include/cuda/std/__functional/binary_negate.h +65 -0
  776. cuda/cccl/headers/include/cuda/std/__functional/bind.h +337 -0
  777. cuda/cccl/headers/include/cuda/std/__functional/bind_back.h +80 -0
  778. cuda/cccl/headers/include/cuda/std/__functional/bind_front.h +73 -0
  779. cuda/cccl/headers/include/cuda/std/__functional/binder1st.h +74 -0
  780. cuda/cccl/headers/include/cuda/std/__functional/binder2nd.h +74 -0
  781. cuda/cccl/headers/include/cuda/std/__functional/compose.h +68 -0
  782. cuda/cccl/headers/include/cuda/std/__functional/default_searcher.h +75 -0
  783. cuda/cccl/headers/include/cuda/std/__functional/function.h +1278 -0
  784. cuda/cccl/headers/include/cuda/std/__functional/hash.h +649 -0
  785. cuda/cccl/headers/include/cuda/std/__functional/identity.h +57 -0
  786. cuda/cccl/headers/include/cuda/std/__functional/invoke.h +560 -0
  787. cuda/cccl/headers/include/cuda/std/__functional/is_transparent.h +41 -0
  788. cuda/cccl/headers/include/cuda/std/__functional/mem_fn.h +67 -0
  789. cuda/cccl/headers/include/cuda/std/__functional/mem_fun_ref.h +211 -0
  790. cuda/cccl/headers/include/cuda/std/__functional/not_fn.h +120 -0
  791. cuda/cccl/headers/include/cuda/std/__functional/operations.h +534 -0
  792. cuda/cccl/headers/include/cuda/std/__functional/perfect_forward.h +128 -0
  793. cuda/cccl/headers/include/cuda/std/__functional/pointer_to_binary_function.h +64 -0
  794. cuda/cccl/headers/include/cuda/std/__functional/pointer_to_unary_function.h +63 -0
  795. cuda/cccl/headers/include/cuda/std/__functional/ranges_operations.h +113 -0
  796. cuda/cccl/headers/include/cuda/std/__functional/reference_wrapper.h +113 -0
  797. cuda/cccl/headers/include/cuda/std/__functional/unary_function.h +62 -0
  798. cuda/cccl/headers/include/cuda/std/__functional/unary_negate.h +65 -0
  799. cuda/cccl/headers/include/cuda/std/__functional/unwrap_ref.h +56 -0
  800. cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +268 -0
  801. cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +35 -0
  802. cuda/cccl/headers/include/cuda/std/__fwd/array.h +42 -0
  803. cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +49 -0
  804. cuda/cccl/headers/include/cuda/std/__fwd/complex.h +66 -0
  805. cuda/cccl/headers/include/cuda/std/__fwd/format.h +84 -0
  806. cuda/cccl/headers/include/cuda/std/__fwd/fp.h +37 -0
  807. cuda/cccl/headers/include/cuda/std/__fwd/get.h +123 -0
  808. cuda/cccl/headers/include/cuda/std/__fwd/hash.h +34 -0
  809. cuda/cccl/headers/include/cuda/std/__fwd/iterator.h +43 -0
  810. cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +90 -0
  811. cuda/cccl/headers/include/cuda/std/__fwd/memory_resource.h +37 -0
  812. cuda/cccl/headers/include/cuda/std/__fwd/optional.h +39 -0
  813. cuda/cccl/headers/include/cuda/std/__fwd/pair.h +34 -0
  814. cuda/cccl/headers/include/cuda/std/__fwd/reference_wrapper.h +34 -0
  815. cuda/cccl/headers/include/cuda/std/__fwd/span.h +45 -0
  816. cuda/cccl/headers/include/cuda/std/__fwd/string.h +83 -0
  817. cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +59 -0
  818. cuda/cccl/headers/include/cuda/std/__fwd/subrange.h +55 -0
  819. cuda/cccl/headers/include/cuda/std/__fwd/tuple.h +34 -0
  820. cuda/cccl/headers/include/cuda/std/__internal/cpp_dialect.h +44 -0
  821. cuda/cccl/headers/include/cuda/std/__internal/features.h +77 -0
  822. cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +122 -0
  823. cuda/cccl/headers/include/cuda/std/__iterator/access.h +128 -0
  824. cuda/cccl/headers/include/cuda/std/__iterator/advance.h +228 -0
  825. cuda/cccl/headers/include/cuda/std/__iterator/back_insert_iterator.h +163 -0
  826. cuda/cccl/headers/include/cuda/std/__iterator/bounded_iter.h +253 -0
  827. cuda/cccl/headers/include/cuda/std/__iterator/concepts.h +645 -0
  828. cuda/cccl/headers/include/cuda/std/__iterator/counted_iterator.h +464 -0
  829. cuda/cccl/headers/include/cuda/std/__iterator/data.h +61 -0
  830. cuda/cccl/headers/include/cuda/std/__iterator/default_sentinel.h +36 -0
  831. cuda/cccl/headers/include/cuda/std/__iterator/distance.h +126 -0
  832. cuda/cccl/headers/include/cuda/std/__iterator/empty.h +53 -0
  833. cuda/cccl/headers/include/cuda/std/__iterator/erase_if_container.h +53 -0
  834. cuda/cccl/headers/include/cuda/std/__iterator/front_insert_iterator.h +99 -0
  835. cuda/cccl/headers/include/cuda/std/__iterator/incrementable_traits.h +143 -0
  836. cuda/cccl/headers/include/cuda/std/__iterator/indirectly_comparable.h +55 -0
  837. cuda/cccl/headers/include/cuda/std/__iterator/insert_iterator.h +107 -0
  838. cuda/cccl/headers/include/cuda/std/__iterator/istream_iterator.h +146 -0
  839. cuda/cccl/headers/include/cuda/std/__iterator/istreambuf_iterator.h +161 -0
  840. cuda/cccl/headers/include/cuda/std/__iterator/iter_move.h +161 -0
  841. cuda/cccl/headers/include/cuda/std/__iterator/iter_swap.h +163 -0
  842. cuda/cccl/headers/include/cuda/std/__iterator/iterator.h +44 -0
  843. cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +847 -0
  844. cuda/cccl/headers/include/cuda/std/__iterator/mergeable.h +72 -0
  845. cuda/cccl/headers/include/cuda/std/__iterator/move_iterator.h +432 -0
  846. cuda/cccl/headers/include/cuda/std/__iterator/move_sentinel.h +73 -0
  847. cuda/cccl/headers/include/cuda/std/__iterator/next.h +101 -0
  848. cuda/cccl/headers/include/cuda/std/__iterator/ostream_iterator.h +95 -0
  849. cuda/cccl/headers/include/cuda/std/__iterator/ostreambuf_iterator.h +100 -0
  850. cuda/cccl/headers/include/cuda/std/__iterator/permutable.h +54 -0
  851. cuda/cccl/headers/include/cuda/std/__iterator/prev.h +90 -0
  852. cuda/cccl/headers/include/cuda/std/__iterator/projected.h +61 -0
  853. cuda/cccl/headers/include/cuda/std/__iterator/readable_traits.h +156 -0
  854. cuda/cccl/headers/include/cuda/std/__iterator/reverse_access.h +142 -0
  855. cuda/cccl/headers/include/cuda/std/__iterator/reverse_iterator.h +371 -0
  856. cuda/cccl/headers/include/cuda/std/__iterator/size.h +69 -0
  857. cuda/cccl/headers/include/cuda/std/__iterator/sortable.h +55 -0
  858. cuda/cccl/headers/include/cuda/std/__iterator/unreachable_sentinel.h +84 -0
  859. cuda/cccl/headers/include/cuda/std/__iterator/wrap_iter.h +245 -0
  860. cuda/cccl/headers/include/cuda/std/__latch/latch.h +88 -0
  861. cuda/cccl/headers/include/cuda/std/__limits/numeric_limits.h +617 -0
  862. cuda/cccl/headers/include/cuda/std/__limits/numeric_limits_ext.h +753 -0
  863. cuda/cccl/headers/include/cuda/std/__linalg/conj_if_needed.h +78 -0
  864. cuda/cccl/headers/include/cuda/std/__linalg/conjugate_transposed.h +54 -0
  865. cuda/cccl/headers/include/cuda/std/__linalg/conjugated.h +139 -0
  866. cuda/cccl/headers/include/cuda/std/__linalg/scaled.h +132 -0
  867. cuda/cccl/headers/include/cuda/std/__linalg/transposed.h +321 -0
  868. cuda/cccl/headers/include/cuda/std/__mdspan/aligned_accessor.h +97 -0
  869. cuda/cccl/headers/include/cuda/std/__mdspan/concepts.h +144 -0
  870. cuda/cccl/headers/include/cuda/std/__mdspan/default_accessor.h +73 -0
  871. cuda/cccl/headers/include/cuda/std/__mdspan/empty_base.h +352 -0
  872. cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +758 -0
  873. cuda/cccl/headers/include/cuda/std/__mdspan/layout_left.h +314 -0
  874. cuda/cccl/headers/include/cuda/std/__mdspan/layout_right.h +307 -0
  875. cuda/cccl/headers/include/cuda/std/__mdspan/layout_stride.h +605 -0
  876. cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +497 -0
  877. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_extents.h +193 -0
  878. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_helper.h +189 -0
  879. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_mapping.h +344 -0
  880. cuda/cccl/headers/include/cuda/std/__memory/addressof.h +67 -0
  881. cuda/cccl/headers/include/cuda/std/__memory/align.h +67 -0
  882. cuda/cccl/headers/include/cuda/std/__memory/allocate_at_least.h +81 -0
  883. cuda/cccl/headers/include/cuda/std/__memory/allocation_guard.h +100 -0
  884. cuda/cccl/headers/include/cuda/std/__memory/allocator.h +320 -0
  885. cuda/cccl/headers/include/cuda/std/__memory/allocator_arg_t.h +84 -0
  886. cuda/cccl/headers/include/cuda/std/__memory/allocator_destructor.h +59 -0
  887. cuda/cccl/headers/include/cuda/std/__memory/allocator_traits.h +532 -0
  888. cuda/cccl/headers/include/cuda/std/__memory/assume_aligned.h +60 -0
  889. cuda/cccl/headers/include/cuda/std/__memory/builtin_new_allocator.h +87 -0
  890. cuda/cccl/headers/include/cuda/std/__memory/compressed_pair.h +225 -0
  891. cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +248 -0
  892. cuda/cccl/headers/include/cuda/std/__memory/destruct_n.h +91 -0
  893. cuda/cccl/headers/include/cuda/std/__memory/is_sufficiently_aligned.h +46 -0
  894. cuda/cccl/headers/include/cuda/std/__memory/pointer_traits.h +246 -0
  895. cuda/cccl/headers/include/cuda/std/__memory/runtime_assume_aligned.h +62 -0
  896. cuda/cccl/headers/include/cuda/std/__memory/temporary_buffer.h +92 -0
  897. cuda/cccl/headers/include/cuda/std/__memory/uninitialized_algorithms.h +678 -0
  898. cuda/cccl/headers/include/cuda/std/__memory/unique_ptr.h +765 -0
  899. cuda/cccl/headers/include/cuda/std/__memory/uses_allocator.h +54 -0
  900. cuda/cccl/headers/include/cuda/std/__memory/voidify.h +41 -0
  901. cuda/cccl/headers/include/cuda/std/__memory_ +34 -0
  902. cuda/cccl/headers/include/cuda/std/__new/allocate.h +126 -0
  903. cuda/cccl/headers/include/cuda/std/__new/bad_alloc.h +57 -0
  904. cuda/cccl/headers/include/cuda/std/__new/launder.h +53 -0
  905. cuda/cccl/headers/include/cuda/std/__new_ +29 -0
  906. cuda/cccl/headers/include/cuda/std/__numeric/accumulate.h +56 -0
  907. cuda/cccl/headers/include/cuda/std/__numeric/adjacent_difference.h +72 -0
  908. cuda/cccl/headers/include/cuda/std/__numeric/exclusive_scan.h +66 -0
  909. cuda/cccl/headers/include/cuda/std/__numeric/gcd_lcm.h +78 -0
  910. cuda/cccl/headers/include/cuda/std/__numeric/inclusive_scan.h +73 -0
  911. cuda/cccl/headers/include/cuda/std/__numeric/inner_product.h +62 -0
  912. cuda/cccl/headers/include/cuda/std/__numeric/iota.h +42 -0
  913. cuda/cccl/headers/include/cuda/std/__numeric/midpoint.h +97 -0
  914. cuda/cccl/headers/include/cuda/std/__numeric/partial_sum.h +69 -0
  915. cuda/cccl/headers/include/cuda/std/__numeric/reduce.h +60 -0
  916. cuda/cccl/headers/include/cuda/std/__numeric/transform_exclusive_scan.h +51 -0
  917. cuda/cccl/headers/include/cuda/std/__numeric/transform_inclusive_scan.h +65 -0
  918. cuda/cccl/headers/include/cuda/std/__numeric/transform_reduce.h +72 -0
  919. cuda/cccl/headers/include/cuda/std/__optional/bad_optional_access.h +74 -0
  920. cuda/cccl/headers/include/cuda/std/__optional/hash.h +53 -0
  921. cuda/cccl/headers/include/cuda/std/__optional/make_optional.h +61 -0
  922. cuda/cccl/headers/include/cuda/std/__optional/nullopt.h +43 -0
  923. cuda/cccl/headers/include/cuda/std/__optional/optional.h +859 -0
  924. cuda/cccl/headers/include/cuda/std/__optional/optional_base.h +432 -0
  925. cuda/cccl/headers/include/cuda/std/__optional/optional_ref.h +324 -0
  926. cuda/cccl/headers/include/cuda/std/__random/generate_canonical.h +56 -0
  927. cuda/cccl/headers/include/cuda/std/__random/is_seed_sequence.h +39 -0
  928. cuda/cccl/headers/include/cuda/std/__random/is_valid.h +106 -0
  929. cuda/cccl/headers/include/cuda/std/__random/linear_congruential_engine.h +398 -0
  930. cuda/cccl/headers/include/cuda/std/__random/uniform_int_distribution.h +335 -0
  931. cuda/cccl/headers/include/cuda/std/__random/uniform_real_distribution.h +183 -0
  932. cuda/cccl/headers/include/cuda/std/__random_ +29 -0
  933. cuda/cccl/headers/include/cuda/std/__ranges/access.h +303 -0
  934. cuda/cccl/headers/include/cuda/std/__ranges/all.h +98 -0
  935. cuda/cccl/headers/include/cuda/std/__ranges/concepts.h +314 -0
  936. cuda/cccl/headers/include/cuda/std/__ranges/counted.h +90 -0
  937. cuda/cccl/headers/include/cuda/std/__ranges/dangling.h +54 -0
  938. cuda/cccl/headers/include/cuda/std/__ranges/data.h +136 -0
  939. cuda/cccl/headers/include/cuda/std/__ranges/empty.h +109 -0
  940. cuda/cccl/headers/include/cuda/std/__ranges/empty_view.h +77 -0
  941. cuda/cccl/headers/include/cuda/std/__ranges/enable_borrowed_range.h +41 -0
  942. cuda/cccl/headers/include/cuda/std/__ranges/enable_view.h +78 -0
  943. cuda/cccl/headers/include/cuda/std/__ranges/from_range.h +36 -0
  944. cuda/cccl/headers/include/cuda/std/__ranges/iota_view.h +266 -0
  945. cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +410 -0
  946. cuda/cccl/headers/include/cuda/std/__ranges/owning_view.h +161 -0
  947. cuda/cccl/headers/include/cuda/std/__ranges/range_adaptor.h +110 -0
  948. cuda/cccl/headers/include/cuda/std/__ranges/rbegin.h +175 -0
  949. cuda/cccl/headers/include/cuda/std/__ranges/ref_view.h +121 -0
  950. cuda/cccl/headers/include/cuda/std/__ranges/rend.h +182 -0
  951. cuda/cccl/headers/include/cuda/std/__ranges/repeat_view.h +345 -0
  952. cuda/cccl/headers/include/cuda/std/__ranges/single_view.h +155 -0
  953. cuda/cccl/headers/include/cuda/std/__ranges/size.h +201 -0
  954. cuda/cccl/headers/include/cuda/std/__ranges/subrange.h +513 -0
  955. cuda/cccl/headers/include/cuda/std/__ranges/take_view.h +476 -0
  956. cuda/cccl/headers/include/cuda/std/__ranges/take_while_view.h +259 -0
  957. cuda/cccl/headers/include/cuda/std/__ranges/transform_view.h +522 -0
  958. cuda/cccl/headers/include/cuda/std/__ranges/unwrap_end.h +53 -0
  959. cuda/cccl/headers/include/cuda/std/__ranges/view_interface.h +183 -0
  960. cuda/cccl/headers/include/cuda/std/__ranges/views.h +38 -0
  961. cuda/cccl/headers/include/cuda/std/__semaphore/atomic_semaphore.h +234 -0
  962. cuda/cccl/headers/include/cuda/std/__semaphore/counting_semaphore.h +51 -0
  963. cuda/cccl/headers/include/cuda/std/__string/char_traits.h +191 -0
  964. cuda/cccl/headers/include/cuda/std/__string/constexpr_c_functions.h +581 -0
  965. cuda/cccl/headers/include/cuda/std/__string/helper_functions.h +296 -0
  966. cuda/cccl/headers/include/cuda/std/__string/string_view.h +244 -0
  967. cuda/cccl/headers/include/cuda/std/__string_ +29 -0
  968. cuda/cccl/headers/include/cuda/std/__system_error/errc.h +51 -0
  969. cuda/cccl/headers/include/cuda/std/__system_error_ +26 -0
  970. cuda/cccl/headers/include/cuda/std/__thread/threading_support.h +106 -0
  971. cuda/cccl/headers/include/cuda/std/__thread/threading_support_cuda.h +47 -0
  972. cuda/cccl/headers/include/cuda/std/__thread/threading_support_external.h +41 -0
  973. cuda/cccl/headers/include/cuda/std/__thread/threading_support_pthread.h +143 -0
  974. cuda/cccl/headers/include/cuda/std/__thread/threading_support_win32.h +87 -0
  975. cuda/cccl/headers/include/cuda/std/__tuple_dir/ignore.h +51 -0
  976. cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +98 -0
  977. cuda/cccl/headers/include/cuda/std/__tuple_dir/sfinae_helpers.h +260 -0
  978. cuda/cccl/headers/include/cuda/std/__tuple_dir/structured_bindings.h +218 -0
  979. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_element.h +70 -0
  980. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_indices.h +44 -0
  981. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +80 -0
  982. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +64 -0
  983. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_size.h +79 -0
  984. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_types.h +35 -0
  985. cuda/cccl/headers/include/cuda/std/__tuple_dir/vector_types.h +290 -0
  986. cuda/cccl/headers/include/cuda/std/__type_traits/add_const.h +40 -0
  987. cuda/cccl/headers/include/cuda/std/__type_traits/add_cv.h +40 -0
  988. cuda/cccl/headers/include/cuda/std/__type_traits/add_lvalue_reference.h +62 -0
  989. cuda/cccl/headers/include/cuda/std/__type_traits/add_pointer.h +65 -0
  990. cuda/cccl/headers/include/cuda/std/__type_traits/add_rvalue_reference.h +62 -0
  991. cuda/cccl/headers/include/cuda/std/__type_traits/add_volatile.h +40 -0
  992. cuda/cccl/headers/include/cuda/std/__type_traits/aligned_storage.h +149 -0
  993. cuda/cccl/headers/include/cuda/std/__type_traits/aligned_union.h +62 -0
  994. cuda/cccl/headers/include/cuda/std/__type_traits/alignment_of.h +41 -0
  995. cuda/cccl/headers/include/cuda/std/__type_traits/always_false.h +35 -0
  996. cuda/cccl/headers/include/cuda/std/__type_traits/can_extract_key.h +68 -0
  997. cuda/cccl/headers/include/cuda/std/__type_traits/common_reference.h +262 -0
  998. cuda/cccl/headers/include/cuda/std/__type_traits/common_type.h +173 -0
  999. cuda/cccl/headers/include/cuda/std/__type_traits/conditional.h +65 -0
  1000. cuda/cccl/headers/include/cuda/std/__type_traits/conjunction.h +67 -0
  1001. cuda/cccl/headers/include/cuda/std/__type_traits/copy_cv.h +50 -0
  1002. cuda/cccl/headers/include/cuda/std/__type_traits/copy_cvref.h +148 -0
  1003. cuda/cccl/headers/include/cuda/std/__type_traits/decay.h +83 -0
  1004. cuda/cccl/headers/include/cuda/std/__type_traits/dependent_type.h +35 -0
  1005. cuda/cccl/headers/include/cuda/std/__type_traits/disjunction.h +77 -0
  1006. cuda/cccl/headers/include/cuda/std/__type_traits/enable_if.h +43 -0
  1007. cuda/cccl/headers/include/cuda/std/__type_traits/extent.h +68 -0
  1008. cuda/cccl/headers/include/cuda/std/__type_traits/fold.h +47 -0
  1009. cuda/cccl/headers/include/cuda/std/__type_traits/has_unique_object_representation.h +46 -0
  1010. cuda/cccl/headers/include/cuda/std/__type_traits/has_virtual_destructor.h +42 -0
  1011. cuda/cccl/headers/include/cuda/std/__type_traits/integral_constant.h +62 -0
  1012. cuda/cccl/headers/include/cuda/std/__type_traits/is_abstract.h +42 -0
  1013. cuda/cccl/headers/include/cuda/std/__type_traits/is_aggregate.h +42 -0
  1014. cuda/cccl/headers/include/cuda/std/__type_traits/is_allocator.h +46 -0
  1015. cuda/cccl/headers/include/cuda/std/__type_traits/is_arithmetic.h +42 -0
  1016. cuda/cccl/headers/include/cuda/std/__type_traits/is_array.h +62 -0
  1017. cuda/cccl/headers/include/cuda/std/__type_traits/is_assignable.h +78 -0
  1018. cuda/cccl/headers/include/cuda/std/__type_traits/is_base_of.h +42 -0
  1019. cuda/cccl/headers/include/cuda/std/__type_traits/is_bounded_array.h +44 -0
  1020. cuda/cccl/headers/include/cuda/std/__type_traits/is_callable.h +60 -0
  1021. cuda/cccl/headers/include/cuda/std/__type_traits/is_char_like_type.h +38 -0
  1022. cuda/cccl/headers/include/cuda/std/__type_traits/is_class.h +42 -0
  1023. cuda/cccl/headers/include/cuda/std/__type_traits/is_compound.h +58 -0
  1024. cuda/cccl/headers/include/cuda/std/__type_traits/is_const.h +56 -0
  1025. cuda/cccl/headers/include/cuda/std/__type_traits/is_constant_evaluated.h +51 -0
  1026. cuda/cccl/headers/include/cuda/std/__type_traits/is_constructible.h +174 -0
  1027. cuda/cccl/headers/include/cuda/std/__type_traits/is_convertible.h +211 -0
  1028. cuda/cccl/headers/include/cuda/std/__type_traits/is_copy_assignable.h +43 -0
  1029. cuda/cccl/headers/include/cuda/std/__type_traits/is_copy_constructible.h +43 -0
  1030. cuda/cccl/headers/include/cuda/std/__type_traits/is_core_convertible.h +47 -0
  1031. cuda/cccl/headers/include/cuda/std/__type_traits/is_corresponding_member.h +42 -0
  1032. cuda/cccl/headers/include/cuda/std/__type_traits/is_default_constructible.h +40 -0
  1033. cuda/cccl/headers/include/cuda/std/__type_traits/is_destructible.h +115 -0
  1034. cuda/cccl/headers/include/cuda/std/__type_traits/is_empty.h +42 -0
  1035. cuda/cccl/headers/include/cuda/std/__type_traits/is_enum.h +42 -0
  1036. cuda/cccl/headers/include/cuda/std/__type_traits/is_execution_policy.h +81 -0
  1037. cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_arithmetic.h +38 -0
  1038. cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_floating_point.h +79 -0
  1039. cuda/cccl/headers/include/cuda/std/__type_traits/is_final.h +42 -0
  1040. cuda/cccl/headers/include/cuda/std/__type_traits/is_floating_point.h +53 -0
  1041. cuda/cccl/headers/include/cuda/std/__type_traits/is_function.h +61 -0
  1042. cuda/cccl/headers/include/cuda/std/__type_traits/is_fundamental.h +56 -0
  1043. cuda/cccl/headers/include/cuda/std/__type_traits/is_implicitly_default_constructible.h +57 -0
  1044. cuda/cccl/headers/include/cuda/std/__type_traits/is_integer.h +45 -0
  1045. cuda/cccl/headers/include/cuda/std/__type_traits/is_integral.h +123 -0
  1046. cuda/cccl/headers/include/cuda/std/__type_traits/is_layout_compatible.h +45 -0
  1047. cuda/cccl/headers/include/cuda/std/__type_traits/is_literal_type.h +42 -0
  1048. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_function_pointer.h +79 -0
  1049. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_object_pointer.h +57 -0
  1050. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_pointer.h +57 -0
  1051. cuda/cccl/headers/include/cuda/std/__type_traits/is_move_assignable.h +43 -0
  1052. cuda/cccl/headers/include/cuda/std/__type_traits/is_move_constructible.h +42 -0
  1053. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_assignable.h +70 -0
  1054. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_constructible.h +84 -0
  1055. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_convertible.h +59 -0
  1056. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_copy_assignable.h +60 -0
  1057. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_copy_constructible.h +43 -0
  1058. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_default_constructible.h +54 -0
  1059. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_destructible.h +82 -0
  1060. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_move_assignable.h +60 -0
  1061. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_move_constructible.h +42 -0
  1062. cuda/cccl/headers/include/cuda/std/__type_traits/is_null_pointer.h +43 -0
  1063. cuda/cccl/headers/include/cuda/std/__type_traits/is_object.h +57 -0
  1064. cuda/cccl/headers/include/cuda/std/__type_traits/is_one_of.h +37 -0
  1065. cuda/cccl/headers/include/cuda/std/__type_traits/is_pod.h +42 -0
  1066. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer.h +60 -0
  1067. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer_interconvertible_base_of.h +84 -0
  1068. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer_interconvertible_with_class.h +42 -0
  1069. cuda/cccl/headers/include/cuda/std/__type_traits/is_polymorphic.h +42 -0
  1070. cuda/cccl/headers/include/cuda/std/__type_traits/is_primary_template.h +119 -0
  1071. cuda/cccl/headers/include/cuda/std/__type_traits/is_reference.h +95 -0
  1072. cuda/cccl/headers/include/cuda/std/__type_traits/is_reference_wrapper.h +50 -0
  1073. cuda/cccl/headers/include/cuda/std/__type_traits/is_referenceable.h +55 -0
  1074. cuda/cccl/headers/include/cuda/std/__type_traits/is_same.h +88 -0
  1075. cuda/cccl/headers/include/cuda/std/__type_traits/is_scalar.h +60 -0
  1076. cuda/cccl/headers/include/cuda/std/__type_traits/is_scoped_enum.h +49 -0
  1077. cuda/cccl/headers/include/cuda/std/__type_traits/is_signed.h +65 -0
  1078. cuda/cccl/headers/include/cuda/std/__type_traits/is_signed_integer.h +59 -0
  1079. cuda/cccl/headers/include/cuda/std/__type_traits/is_standard_layout.h +42 -0
  1080. cuda/cccl/headers/include/cuda/std/__type_traits/is_swappable.h +202 -0
  1081. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivial.h +42 -0
  1082. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_assignable.h +43 -0
  1083. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_constructible.h +43 -0
  1084. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_assignable.h +46 -0
  1085. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_constructible.h +45 -0
  1086. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copyable.h +42 -0
  1087. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_default_constructible.h +42 -0
  1088. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_destructible.h +58 -0
  1089. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_assignable.h +45 -0
  1090. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_constructible.h +44 -0
  1091. cuda/cccl/headers/include/cuda/std/__type_traits/is_unbounded_array.h +43 -0
  1092. cuda/cccl/headers/include/cuda/std/__type_traits/is_union.h +42 -0
  1093. cuda/cccl/headers/include/cuda/std/__type_traits/is_unsigned.h +66 -0
  1094. cuda/cccl/headers/include/cuda/std/__type_traits/is_unsigned_integer.h +59 -0
  1095. cuda/cccl/headers/include/cuda/std/__type_traits/is_valid_expansion.h +41 -0
  1096. cuda/cccl/headers/include/cuda/std/__type_traits/is_void.h +55 -0
  1097. cuda/cccl/headers/include/cuda/std/__type_traits/is_volatile.h +56 -0
  1098. cuda/cccl/headers/include/cuda/std/__type_traits/lazy.h +35 -0
  1099. cuda/cccl/headers/include/cuda/std/__type_traits/make_const_lvalue_ref.h +36 -0
  1100. cuda/cccl/headers/include/cuda/std/__type_traits/make_nbit_int.h +107 -0
  1101. cuda/cccl/headers/include/cuda/std/__type_traits/make_signed.h +140 -0
  1102. cuda/cccl/headers/include/cuda/std/__type_traits/make_unsigned.h +151 -0
  1103. cuda/cccl/headers/include/cuda/std/__type_traits/maybe_const.h +36 -0
  1104. cuda/cccl/headers/include/cuda/std/__type_traits/nat.h +39 -0
  1105. cuda/cccl/headers/include/cuda/std/__type_traits/negation.h +44 -0
  1106. cuda/cccl/headers/include/cuda/std/__type_traits/num_bits.h +122 -0
  1107. cuda/cccl/headers/include/cuda/std/__type_traits/promote.h +162 -0
  1108. cuda/cccl/headers/include/cuda/std/__type_traits/rank.h +60 -0
  1109. cuda/cccl/headers/include/cuda/std/__type_traits/reference_constructs_from_temporary.h +57 -0
  1110. cuda/cccl/headers/include/cuda/std/__type_traits/reference_converts_from_temporary.h +56 -0
  1111. cuda/cccl/headers/include/cuda/std/__type_traits/remove_all_extents.h +66 -0
  1112. cuda/cccl/headers/include/cuda/std/__type_traits/remove_const.h +59 -0
  1113. cuda/cccl/headers/include/cuda/std/__type_traits/remove_const_ref.h +37 -0
  1114. cuda/cccl/headers/include/cuda/std/__type_traits/remove_cv.h +57 -0
  1115. cuda/cccl/headers/include/cuda/std/__type_traits/remove_cvref.h +57 -0
  1116. cuda/cccl/headers/include/cuda/std/__type_traits/remove_extent.h +65 -0
  1117. cuda/cccl/headers/include/cuda/std/__type_traits/remove_pointer.h +73 -0
  1118. cuda/cccl/headers/include/cuda/std/__type_traits/remove_reference.h +72 -0
  1119. cuda/cccl/headers/include/cuda/std/__type_traits/remove_volatile.h +58 -0
  1120. cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +47 -0
  1121. cuda/cccl/headers/include/cuda/std/__type_traits/type_identity.h +40 -0
  1122. cuda/cccl/headers/include/cuda/std/__type_traits/type_list.h +1067 -0
  1123. cuda/cccl/headers/include/cuda/std/__type_traits/type_set.h +131 -0
  1124. cuda/cccl/headers/include/cuda/std/__type_traits/underlying_type.h +52 -0
  1125. cuda/cccl/headers/include/cuda/std/__type_traits/void_t.h +34 -0
  1126. cuda/cccl/headers/include/cuda/std/__utility/as_const.h +52 -0
  1127. cuda/cccl/headers/include/cuda/std/__utility/auto_cast.h +34 -0
  1128. cuda/cccl/headers/include/cuda/std/__utility/cmp.h +116 -0
  1129. cuda/cccl/headers/include/cuda/std/__utility/convert_to_integral.h +101 -0
  1130. cuda/cccl/headers/include/cuda/std/__utility/declval.h +76 -0
  1131. cuda/cccl/headers/include/cuda/std/__utility/exception_guard.h +161 -0
  1132. cuda/cccl/headers/include/cuda/std/__utility/exchange.h +46 -0
  1133. cuda/cccl/headers/include/cuda/std/__utility/forward.h +59 -0
  1134. cuda/cccl/headers/include/cuda/std/__utility/forward_like.h +55 -0
  1135. cuda/cccl/headers/include/cuda/std/__utility/in_place.h +106 -0
  1136. cuda/cccl/headers/include/cuda/std/__utility/integer_sequence.h +251 -0
  1137. cuda/cccl/headers/include/cuda/std/__utility/monostate.h +99 -0
  1138. cuda/cccl/headers/include/cuda/std/__utility/move.h +74 -0
  1139. cuda/cccl/headers/include/cuda/std/__utility/pair.h +796 -0
  1140. cuda/cccl/headers/include/cuda/std/__utility/piecewise_construct.h +37 -0
  1141. cuda/cccl/headers/include/cuda/std/__utility/pod_tuple.h +527 -0
  1142. cuda/cccl/headers/include/cuda/std/__utility/priority_tag.h +40 -0
  1143. cuda/cccl/headers/include/cuda/std/__utility/rel_ops.h +63 -0
  1144. cuda/cccl/headers/include/cuda/std/__utility/swap.h +64 -0
  1145. cuda/cccl/headers/include/cuda/std/__utility/to_underlying.h +40 -0
  1146. cuda/cccl/headers/include/cuda/std/__utility/typeid.h +421 -0
  1147. cuda/cccl/headers/include/cuda/std/__utility/undefined.h +34 -0
  1148. cuda/cccl/headers/include/cuda/std/__utility/unreachable.h +37 -0
  1149. cuda/cccl/headers/include/cuda/std/array +518 -0
  1150. cuda/cccl/headers/include/cuda/std/atomic +810 -0
  1151. cuda/cccl/headers/include/cuda/std/barrier +42 -0
  1152. cuda/cccl/headers/include/cuda/std/bit +35 -0
  1153. cuda/cccl/headers/include/cuda/std/bitset +994 -0
  1154. cuda/cccl/headers/include/cuda/std/cassert +28 -0
  1155. cuda/cccl/headers/include/cuda/std/ccomplex +15 -0
  1156. cuda/cccl/headers/include/cuda/std/cfloat +59 -0
  1157. cuda/cccl/headers/include/cuda/std/chrono +26 -0
  1158. cuda/cccl/headers/include/cuda/std/climits +61 -0
  1159. cuda/cccl/headers/include/cuda/std/cmath +87 -0
  1160. cuda/cccl/headers/include/cuda/std/complex +50 -0
  1161. cuda/cccl/headers/include/cuda/std/concepts +48 -0
  1162. cuda/cccl/headers/include/cuda/std/cstddef +28 -0
  1163. cuda/cccl/headers/include/cuda/std/cstdint +178 -0
  1164. cuda/cccl/headers/include/cuda/std/cstdlib +30 -0
  1165. cuda/cccl/headers/include/cuda/std/cstring +110 -0
  1166. cuda/cccl/headers/include/cuda/std/ctime +154 -0
  1167. cuda/cccl/headers/include/cuda/std/detail/__config +45 -0
  1168. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +204 -0
  1169. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/algorithm +1721 -0
  1170. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/chrono +2509 -0
  1171. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/iosfwd +128 -0
  1172. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/stdexcept +120 -0
  1173. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/tuple +1365 -0
  1174. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/variant +2142 -0
  1175. cuda/cccl/headers/include/cuda/std/execution +29 -0
  1176. cuda/cccl/headers/include/cuda/std/expected +30 -0
  1177. cuda/cccl/headers/include/cuda/std/functional +56 -0
  1178. cuda/cccl/headers/include/cuda/std/initializer_list +36 -0
  1179. cuda/cccl/headers/include/cuda/std/inplace_vector +2170 -0
  1180. cuda/cccl/headers/include/cuda/std/iterator +70 -0
  1181. cuda/cccl/headers/include/cuda/std/latch +34 -0
  1182. cuda/cccl/headers/include/cuda/std/limits +28 -0
  1183. cuda/cccl/headers/include/cuda/std/linalg +30 -0
  1184. cuda/cccl/headers/include/cuda/std/mdspan +38 -0
  1185. cuda/cccl/headers/include/cuda/std/memory +39 -0
  1186. cuda/cccl/headers/include/cuda/std/numbers +341 -0
  1187. cuda/cccl/headers/include/cuda/std/numeric +41 -0
  1188. cuda/cccl/headers/include/cuda/std/optional +31 -0
  1189. cuda/cccl/headers/include/cuda/std/ranges +69 -0
  1190. cuda/cccl/headers/include/cuda/std/ratio +416 -0
  1191. cuda/cccl/headers/include/cuda/std/semaphore +31 -0
  1192. cuda/cccl/headers/include/cuda/std/source_location +83 -0
  1193. cuda/cccl/headers/include/cuda/std/span +628 -0
  1194. cuda/cccl/headers/include/cuda/std/string_view +799 -0
  1195. cuda/cccl/headers/include/cuda/std/tuple +26 -0
  1196. cuda/cccl/headers/include/cuda/std/type_traits +177 -0
  1197. cuda/cccl/headers/include/cuda/std/utility +70 -0
  1198. cuda/cccl/headers/include/cuda/std/variant +25 -0
  1199. cuda/cccl/headers/include/cuda/std/version +243 -0
  1200. cuda/cccl/headers/include/cuda/stream +31 -0
  1201. cuda/cccl/headers/include/cuda/stream_ref +54 -0
  1202. cuda/cccl/headers/include/cuda/type_traits +27 -0
  1203. cuda/cccl/headers/include/cuda/utility +27 -0
  1204. cuda/cccl/headers/include/cuda/version +16 -0
  1205. cuda/cccl/headers/include/cuda/warp +28 -0
  1206. cuda/cccl/headers/include/cuda/work_stealing +26 -0
  1207. cuda/cccl/headers/include/nv/detail/__preprocessor +169 -0
  1208. cuda/cccl/headers/include/nv/detail/__target_macros +718 -0
  1209. cuda/cccl/headers/include/nv/target +235 -0
  1210. cuda/cccl/headers/include/thrust/addressof.h +22 -0
  1211. cuda/cccl/headers/include/thrust/adjacent_difference.h +254 -0
  1212. cuda/cccl/headers/include/thrust/advance.h +57 -0
  1213. cuda/cccl/headers/include/thrust/allocate_unique.h +299 -0
  1214. cuda/cccl/headers/include/thrust/binary_search.h +1910 -0
  1215. cuda/cccl/headers/include/thrust/complex.h +858 -0
  1216. cuda/cccl/headers/include/thrust/copy.h +506 -0
  1217. cuda/cccl/headers/include/thrust/count.h +245 -0
  1218. cuda/cccl/headers/include/thrust/detail/adjacent_difference.inl +95 -0
  1219. cuda/cccl/headers/include/thrust/detail/algorithm_wrapper.h +37 -0
  1220. cuda/cccl/headers/include/thrust/detail/alignment.h +81 -0
  1221. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +350 -0
  1222. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.inl +371 -0
  1223. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +45 -0
  1224. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.inl +242 -0
  1225. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +39 -0
  1226. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.inl +137 -0
  1227. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +39 -0
  1228. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.inl +99 -0
  1229. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +53 -0
  1230. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.inl +68 -0
  1231. cuda/cccl/headers/include/thrust/detail/allocator/no_throw_allocator.h +76 -0
  1232. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +102 -0
  1233. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.inl +86 -0
  1234. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +79 -0
  1235. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.inl +79 -0
  1236. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.h +39 -0
  1237. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.inl +98 -0
  1238. cuda/cccl/headers/include/thrust/detail/allocator_aware_execution_policy.h +99 -0
  1239. cuda/cccl/headers/include/thrust/detail/binary_search.inl +525 -0
  1240. cuda/cccl/headers/include/thrust/detail/caching_allocator.h +47 -0
  1241. cuda/cccl/headers/include/thrust/detail/complex/arithmetic.h +255 -0
  1242. cuda/cccl/headers/include/thrust/detail/complex/c99math.h +64 -0
  1243. cuda/cccl/headers/include/thrust/detail/complex/catrig.h +875 -0
  1244. cuda/cccl/headers/include/thrust/detail/complex/catrigf.h +589 -0
  1245. cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +233 -0
  1246. cuda/cccl/headers/include/thrust/detail/complex/ccoshf.h +161 -0
  1247. cuda/cccl/headers/include/thrust/detail/complex/cexp.h +195 -0
  1248. cuda/cccl/headers/include/thrust/detail/complex/cexpf.h +173 -0
  1249. cuda/cccl/headers/include/thrust/detail/complex/clog.h +223 -0
  1250. cuda/cccl/headers/include/thrust/detail/complex/clogf.h +210 -0
  1251. cuda/cccl/headers/include/thrust/detail/complex/complex.inl +263 -0
  1252. cuda/cccl/headers/include/thrust/detail/complex/cpow.h +50 -0
  1253. cuda/cccl/headers/include/thrust/detail/complex/cproj.h +81 -0
  1254. cuda/cccl/headers/include/thrust/detail/complex/csinh.h +228 -0
  1255. cuda/cccl/headers/include/thrust/detail/complex/csinhf.h +168 -0
  1256. cuda/cccl/headers/include/thrust/detail/complex/csqrt.h +178 -0
  1257. cuda/cccl/headers/include/thrust/detail/complex/csqrtf.h +174 -0
  1258. cuda/cccl/headers/include/thrust/detail/complex/ctanh.h +208 -0
  1259. cuda/cccl/headers/include/thrust/detail/complex/ctanhf.h +133 -0
  1260. cuda/cccl/headers/include/thrust/detail/complex/math_private.h +138 -0
  1261. cuda/cccl/headers/include/thrust/detail/complex/stream.h +73 -0
  1262. cuda/cccl/headers/include/thrust/detail/config/compiler.h +38 -0
  1263. cuda/cccl/headers/include/thrust/detail/config/config.h +43 -0
  1264. cuda/cccl/headers/include/thrust/detail/config/cpp_dialect.h +78 -0
  1265. cuda/cccl/headers/include/thrust/detail/config/device_system.h +55 -0
  1266. cuda/cccl/headers/include/thrust/detail/config/host_system.h +48 -0
  1267. cuda/cccl/headers/include/thrust/detail/config/memory_resource.h +41 -0
  1268. cuda/cccl/headers/include/thrust/detail/config/namespace.h +162 -0
  1269. cuda/cccl/headers/include/thrust/detail/config/simple_defines.h +48 -0
  1270. cuda/cccl/headers/include/thrust/detail/config.h +36 -0
  1271. cuda/cccl/headers/include/thrust/detail/contiguous_storage.h +228 -0
  1272. cuda/cccl/headers/include/thrust/detail/contiguous_storage.inl +273 -0
  1273. cuda/cccl/headers/include/thrust/detail/copy.h +72 -0
  1274. cuda/cccl/headers/include/thrust/detail/copy.inl +129 -0
  1275. cuda/cccl/headers/include/thrust/detail/copy_if.h +62 -0
  1276. cuda/cccl/headers/include/thrust/detail/copy_if.inl +102 -0
  1277. cuda/cccl/headers/include/thrust/detail/count.h +55 -0
  1278. cuda/cccl/headers/include/thrust/detail/count.inl +89 -0
  1279. cuda/cccl/headers/include/thrust/detail/device_delete.inl +52 -0
  1280. cuda/cccl/headers/include/thrust/detail/device_free.inl +47 -0
  1281. cuda/cccl/headers/include/thrust/detail/device_new.inl +61 -0
  1282. cuda/cccl/headers/include/thrust/detail/device_ptr.inl +48 -0
  1283. cuda/cccl/headers/include/thrust/detail/equal.inl +93 -0
  1284. cuda/cccl/headers/include/thrust/detail/event_error.h +160 -0
  1285. cuda/cccl/headers/include/thrust/detail/execute_with_allocator.h +80 -0
  1286. cuda/cccl/headers/include/thrust/detail/execute_with_allocator_fwd.h +61 -0
  1287. cuda/cccl/headers/include/thrust/detail/execution_policy.h +120 -0
  1288. cuda/cccl/headers/include/thrust/detail/extrema.inl +184 -0
  1289. cuda/cccl/headers/include/thrust/detail/fill.inl +86 -0
  1290. cuda/cccl/headers/include/thrust/detail/find.inl +113 -0
  1291. cuda/cccl/headers/include/thrust/detail/for_each.inl +84 -0
  1292. cuda/cccl/headers/include/thrust/detail/function.h +49 -0
  1293. cuda/cccl/headers/include/thrust/detail/functional/actor.h +214 -0
  1294. cuda/cccl/headers/include/thrust/detail/functional/operators.h +386 -0
  1295. cuda/cccl/headers/include/thrust/detail/gather.inl +173 -0
  1296. cuda/cccl/headers/include/thrust/detail/generate.inl +86 -0
  1297. cuda/cccl/headers/include/thrust/detail/get_iterator_value.h +62 -0
  1298. cuda/cccl/headers/include/thrust/detail/inner_product.inl +118 -0
  1299. cuda/cccl/headers/include/thrust/detail/integer_math.h +130 -0
  1300. cuda/cccl/headers/include/thrust/detail/internal_functional.h +328 -0
  1301. cuda/cccl/headers/include/thrust/detail/logical.inl +113 -0
  1302. cuda/cccl/headers/include/thrust/detail/malloc_and_free.h +77 -0
  1303. cuda/cccl/headers/include/thrust/detail/malloc_and_free_fwd.h +45 -0
  1304. cuda/cccl/headers/include/thrust/detail/memory_algorithms.h +209 -0
  1305. cuda/cccl/headers/include/thrust/detail/memory_wrapper.h +40 -0
  1306. cuda/cccl/headers/include/thrust/detail/merge.inl +276 -0
  1307. cuda/cccl/headers/include/thrust/detail/mismatch.inl +94 -0
  1308. cuda/cccl/headers/include/thrust/detail/numeric_wrapper.h +37 -0
  1309. cuda/cccl/headers/include/thrust/detail/overlapped_copy.h +124 -0
  1310. cuda/cccl/headers/include/thrust/detail/partition.inl +378 -0
  1311. cuda/cccl/headers/include/thrust/detail/pointer.h +309 -0
  1312. cuda/cccl/headers/include/thrust/detail/preprocessor.h +652 -0
  1313. cuda/cccl/headers/include/thrust/detail/random_bijection.h +177 -0
  1314. cuda/cccl/headers/include/thrust/detail/range/head_flags.h +116 -0
  1315. cuda/cccl/headers/include/thrust/detail/range/tail_flags.h +130 -0
  1316. cuda/cccl/headers/include/thrust/detail/raw_pointer_cast.h +52 -0
  1317. cuda/cccl/headers/include/thrust/detail/raw_reference_cast.h +192 -0
  1318. cuda/cccl/headers/include/thrust/detail/reduce.inl +377 -0
  1319. cuda/cccl/headers/include/thrust/detail/reference.h +494 -0
  1320. cuda/cccl/headers/include/thrust/detail/reference_forward_declaration.h +35 -0
  1321. cuda/cccl/headers/include/thrust/detail/remove.inl +213 -0
  1322. cuda/cccl/headers/include/thrust/detail/replace.inl +231 -0
  1323. cuda/cccl/headers/include/thrust/detail/reverse.inl +88 -0
  1324. cuda/cccl/headers/include/thrust/detail/scan.inl +518 -0
  1325. cuda/cccl/headers/include/thrust/detail/scatter.inl +157 -0
  1326. cuda/cccl/headers/include/thrust/detail/seq.h +66 -0
  1327. cuda/cccl/headers/include/thrust/detail/sequence.inl +109 -0
  1328. cuda/cccl/headers/include/thrust/detail/set_operations.inl +981 -0
  1329. cuda/cccl/headers/include/thrust/detail/shuffle.inl +86 -0
  1330. cuda/cccl/headers/include/thrust/detail/sort.inl +373 -0
  1331. cuda/cccl/headers/include/thrust/detail/static_assert.h +58 -0
  1332. cuda/cccl/headers/include/thrust/detail/static_map.h +167 -0
  1333. cuda/cccl/headers/include/thrust/detail/swap_ranges.inl +65 -0
  1334. cuda/cccl/headers/include/thrust/detail/tabulate.inl +62 -0
  1335. cuda/cccl/headers/include/thrust/detail/temporary_array.h +153 -0
  1336. cuda/cccl/headers/include/thrust/detail/temporary_array.inl +120 -0
  1337. cuda/cccl/headers/include/thrust/detail/temporary_buffer.h +81 -0
  1338. cuda/cccl/headers/include/thrust/detail/transform_reduce.inl +69 -0
  1339. cuda/cccl/headers/include/thrust/detail/transform_scan.inl +161 -0
  1340. cuda/cccl/headers/include/thrust/detail/trivial_sequence.h +130 -0
  1341. cuda/cccl/headers/include/thrust/detail/tuple_meta_transform.h +61 -0
  1342. cuda/cccl/headers/include/thrust/detail/type_deduction.h +62 -0
  1343. cuda/cccl/headers/include/thrust/detail/type_traits/has_member_function.h +47 -0
  1344. cuda/cccl/headers/include/thrust/detail/type_traits/has_nested_type.h +43 -0
  1345. cuda/cccl/headers/include/thrust/detail/type_traits/is_call_possible.h +167 -0
  1346. cuda/cccl/headers/include/thrust/detail/type_traits/is_commutative.h +69 -0
  1347. cuda/cccl/headers/include/thrust/detail/type_traits/is_metafunction_defined.h +39 -0
  1348. cuda/cccl/headers/include/thrust/detail/type_traits/is_thrust_pointer.h +59 -0
  1349. cuda/cccl/headers/include/thrust/detail/type_traits/iterator/is_output_iterator.h +46 -0
  1350. cuda/cccl/headers/include/thrust/detail/type_traits/minimum_type.h +89 -0
  1351. cuda/cccl/headers/include/thrust/detail/type_traits/pointer_traits.h +332 -0
  1352. cuda/cccl/headers/include/thrust/detail/type_traits.h +136 -0
  1353. cuda/cccl/headers/include/thrust/detail/uninitialized_copy.inl +90 -0
  1354. cuda/cccl/headers/include/thrust/detail/uninitialized_fill.inl +86 -0
  1355. cuda/cccl/headers/include/thrust/detail/unique.inl +373 -0
  1356. cuda/cccl/headers/include/thrust/detail/use_default.h +34 -0
  1357. cuda/cccl/headers/include/thrust/detail/vector_base.h +613 -0
  1358. cuda/cccl/headers/include/thrust/detail/vector_base.inl +1210 -0
  1359. cuda/cccl/headers/include/thrust/device_allocator.h +134 -0
  1360. cuda/cccl/headers/include/thrust/device_delete.h +59 -0
  1361. cuda/cccl/headers/include/thrust/device_free.h +72 -0
  1362. cuda/cccl/headers/include/thrust/device_make_unique.h +56 -0
  1363. cuda/cccl/headers/include/thrust/device_malloc.h +84 -0
  1364. cuda/cccl/headers/include/thrust/device_malloc_allocator.h +190 -0
  1365. cuda/cccl/headers/include/thrust/device_new.h +91 -0
  1366. cuda/cccl/headers/include/thrust/device_new_allocator.h +179 -0
  1367. cuda/cccl/headers/include/thrust/device_ptr.h +196 -0
  1368. cuda/cccl/headers/include/thrust/device_reference.h +983 -0
  1369. cuda/cccl/headers/include/thrust/device_vector.h +576 -0
  1370. cuda/cccl/headers/include/thrust/distance.h +43 -0
  1371. cuda/cccl/headers/include/thrust/equal.h +247 -0
  1372. cuda/cccl/headers/include/thrust/execution_policy.h +251 -0
  1373. cuda/cccl/headers/include/thrust/extrema.h +657 -0
  1374. cuda/cccl/headers/include/thrust/fill.h +200 -0
  1375. cuda/cccl/headers/include/thrust/find.h +382 -0
  1376. cuda/cccl/headers/include/thrust/for_each.h +261 -0
  1377. cuda/cccl/headers/include/thrust/functional.h +395 -0
  1378. cuda/cccl/headers/include/thrust/gather.h +464 -0
  1379. cuda/cccl/headers/include/thrust/generate.h +193 -0
  1380. cuda/cccl/headers/include/thrust/host_vector.h +576 -0
  1381. cuda/cccl/headers/include/thrust/inner_product.h +264 -0
  1382. cuda/cccl/headers/include/thrust/iterator/constant_iterator.h +221 -0
  1383. cuda/cccl/headers/include/thrust/iterator/counting_iterator.h +335 -0
  1384. cuda/cccl/headers/include/thrust/iterator/detail/any_assign.h +48 -0
  1385. cuda/cccl/headers/include/thrust/iterator/detail/any_system_tag.h +43 -0
  1386. cuda/cccl/headers/include/thrust/iterator/detail/device_system_tag.h +38 -0
  1387. cuda/cccl/headers/include/thrust/iterator/detail/host_system_tag.h +38 -0
  1388. cuda/cccl/headers/include/thrust/iterator/detail/iterator_adaptor_base.h +81 -0
  1389. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_system.h +60 -0
  1390. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_traversal.h +65 -0
  1391. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h +57 -0
  1392. cuda/cccl/headers/include/thrust/iterator/detail/iterator_facade_category.h +182 -0
  1393. cuda/cccl/headers/include/thrust/iterator/detail/minimum_system.h +58 -0
  1394. cuda/cccl/headers/include/thrust/iterator/detail/normal_iterator.h +69 -0
  1395. cuda/cccl/headers/include/thrust/iterator/detail/retag.h +104 -0
  1396. cuda/cccl/headers/include/thrust/iterator/detail/tagged_iterator.h +81 -0
  1397. cuda/cccl/headers/include/thrust/iterator/detail/tuple_of_iterator_references.h +174 -0
  1398. cuda/cccl/headers/include/thrust/iterator/discard_iterator.h +163 -0
  1399. cuda/cccl/headers/include/thrust/iterator/iterator_adaptor.h +251 -0
  1400. cuda/cccl/headers/include/thrust/iterator/iterator_categories.h +211 -0
  1401. cuda/cccl/headers/include/thrust/iterator/iterator_facade.h +659 -0
  1402. cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +323 -0
  1403. cuda/cccl/headers/include/thrust/iterator/iterator_traversal_tags.h +64 -0
  1404. cuda/cccl/headers/include/thrust/iterator/offset_iterator.h +194 -0
  1405. cuda/cccl/headers/include/thrust/iterator/permutation_iterator.h +204 -0
  1406. cuda/cccl/headers/include/thrust/iterator/retag.h +72 -0
  1407. cuda/cccl/headers/include/thrust/iterator/reverse_iterator.h +51 -0
  1408. cuda/cccl/headers/include/thrust/iterator/shuffle_iterator.h +185 -0
  1409. cuda/cccl/headers/include/thrust/iterator/strided_iterator.h +152 -0
  1410. cuda/cccl/headers/include/thrust/iterator/tabulate_output_iterator.h +149 -0
  1411. cuda/cccl/headers/include/thrust/iterator/transform_input_output_iterator.h +226 -0
  1412. cuda/cccl/headers/include/thrust/iterator/transform_iterator.h +351 -0
  1413. cuda/cccl/headers/include/thrust/iterator/transform_output_iterator.h +190 -0
  1414. cuda/cccl/headers/include/thrust/iterator/zip_iterator.h +359 -0
  1415. cuda/cccl/headers/include/thrust/logical.h +290 -0
  1416. cuda/cccl/headers/include/thrust/memory.h +299 -0
  1417. cuda/cccl/headers/include/thrust/merge.h +725 -0
  1418. cuda/cccl/headers/include/thrust/mismatch.h +261 -0
  1419. cuda/cccl/headers/include/thrust/mr/allocator.h +229 -0
  1420. cuda/cccl/headers/include/thrust/mr/device_memory_resource.h +41 -0
  1421. cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +526 -0
  1422. cuda/cccl/headers/include/thrust/mr/disjoint_sync_pool.h +118 -0
  1423. cuda/cccl/headers/include/thrust/mr/disjoint_tls_pool.h +67 -0
  1424. cuda/cccl/headers/include/thrust/mr/fancy_pointer_resource.h +67 -0
  1425. cuda/cccl/headers/include/thrust/mr/host_memory_resource.h +38 -0
  1426. cuda/cccl/headers/include/thrust/mr/memory_resource.h +217 -0
  1427. cuda/cccl/headers/include/thrust/mr/new.h +100 -0
  1428. cuda/cccl/headers/include/thrust/mr/polymorphic_adaptor.h +63 -0
  1429. cuda/cccl/headers/include/thrust/mr/pool.h +526 -0
  1430. cuda/cccl/headers/include/thrust/mr/pool_options.h +174 -0
  1431. cuda/cccl/headers/include/thrust/mr/sync_pool.h +114 -0
  1432. cuda/cccl/headers/include/thrust/mr/tls_pool.h +64 -0
  1433. cuda/cccl/headers/include/thrust/mr/universal_memory_resource.h +29 -0
  1434. cuda/cccl/headers/include/thrust/mr/validator.h +56 -0
  1435. cuda/cccl/headers/include/thrust/pair.h +99 -0
  1436. cuda/cccl/headers/include/thrust/partition.h +1391 -0
  1437. cuda/cccl/headers/include/thrust/per_device_resource.h +98 -0
  1438. cuda/cccl/headers/include/thrust/random/detail/discard_block_engine.inl +184 -0
  1439. cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine.inl +155 -0
  1440. cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine_discard.h +104 -0
  1441. cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine.inl +151 -0
  1442. cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h +53 -0
  1443. cuda/cccl/headers/include/thrust/random/detail/mod.h +101 -0
  1444. cuda/cccl/headers/include/thrust/random/detail/normal_distribution.inl +187 -0
  1445. cuda/cccl/headers/include/thrust/random/detail/normal_distribution_base.h +160 -0
  1446. cuda/cccl/headers/include/thrust/random/detail/random_core_access.h +63 -0
  1447. cuda/cccl/headers/include/thrust/random/detail/subtract_with_carry_engine.inl +201 -0
  1448. cuda/cccl/headers/include/thrust/random/detail/uniform_int_distribution.inl +198 -0
  1449. cuda/cccl/headers/include/thrust/random/detail/uniform_real_distribution.inl +200 -0
  1450. cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine.inl +183 -0
  1451. cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine_max.h +187 -0
  1452. cuda/cccl/headers/include/thrust/random/discard_block_engine.h +240 -0
  1453. cuda/cccl/headers/include/thrust/random/linear_congruential_engine.h +289 -0
  1454. cuda/cccl/headers/include/thrust/random/linear_feedback_shift_engine.h +217 -0
  1455. cuda/cccl/headers/include/thrust/random/normal_distribution.h +257 -0
  1456. cuda/cccl/headers/include/thrust/random/subtract_with_carry_engine.h +247 -0
  1457. cuda/cccl/headers/include/thrust/random/uniform_int_distribution.h +261 -0
  1458. cuda/cccl/headers/include/thrust/random/uniform_real_distribution.h +258 -0
  1459. cuda/cccl/headers/include/thrust/random/xor_combine_engine.h +255 -0
  1460. cuda/cccl/headers/include/thrust/random.h +120 -0
  1461. cuda/cccl/headers/include/thrust/reduce.h +1113 -0
  1462. cuda/cccl/headers/include/thrust/remove.h +768 -0
  1463. cuda/cccl/headers/include/thrust/replace.h +826 -0
  1464. cuda/cccl/headers/include/thrust/reverse.h +215 -0
  1465. cuda/cccl/headers/include/thrust/scan.h +1671 -0
  1466. cuda/cccl/headers/include/thrust/scatter.h +446 -0
  1467. cuda/cccl/headers/include/thrust/sequence.h +277 -0
  1468. cuda/cccl/headers/include/thrust/set_operations.h +3026 -0
  1469. cuda/cccl/headers/include/thrust/shuffle.h +182 -0
  1470. cuda/cccl/headers/include/thrust/sort.h +1320 -0
  1471. cuda/cccl/headers/include/thrust/swap.h +147 -0
  1472. cuda/cccl/headers/include/thrust/system/cpp/detail/adjacent_difference.h +30 -0
  1473. cuda/cccl/headers/include/thrust/system/cpp/detail/assign_value.h +30 -0
  1474. cuda/cccl/headers/include/thrust/system/cpp/detail/binary_search.h +32 -0
  1475. cuda/cccl/headers/include/thrust/system/cpp/detail/copy.h +30 -0
  1476. cuda/cccl/headers/include/thrust/system/cpp/detail/copy_if.h +30 -0
  1477. cuda/cccl/headers/include/thrust/system/cpp/detail/count.h +29 -0
  1478. cuda/cccl/headers/include/thrust/system/cpp/detail/equal.h +29 -0
  1479. cuda/cccl/headers/include/thrust/system/cpp/detail/execution_policy.h +109 -0
  1480. cuda/cccl/headers/include/thrust/system/cpp/detail/extrema.h +30 -0
  1481. cuda/cccl/headers/include/thrust/system/cpp/detail/fill.h +29 -0
  1482. cuda/cccl/headers/include/thrust/system/cpp/detail/find.h +30 -0
  1483. cuda/cccl/headers/include/thrust/system/cpp/detail/for_each.h +30 -0
  1484. cuda/cccl/headers/include/thrust/system/cpp/detail/gather.h +29 -0
  1485. cuda/cccl/headers/include/thrust/system/cpp/detail/generate.h +29 -0
  1486. cuda/cccl/headers/include/thrust/system/cpp/detail/get_value.h +30 -0
  1487. cuda/cccl/headers/include/thrust/system/cpp/detail/inner_product.h +29 -0
  1488. cuda/cccl/headers/include/thrust/system/cpp/detail/iter_swap.h +30 -0
  1489. cuda/cccl/headers/include/thrust/system/cpp/detail/logical.h +29 -0
  1490. cuda/cccl/headers/include/thrust/system/cpp/detail/malloc_and_free.h +30 -0
  1491. cuda/cccl/headers/include/thrust/system/cpp/detail/memory.inl +60 -0
  1492. cuda/cccl/headers/include/thrust/system/cpp/detail/merge.h +30 -0
  1493. cuda/cccl/headers/include/thrust/system/cpp/detail/mismatch.h +29 -0
  1494. cuda/cccl/headers/include/thrust/system/cpp/detail/partition.h +30 -0
  1495. cuda/cccl/headers/include/thrust/system/cpp/detail/per_device_resource.h +29 -0
  1496. cuda/cccl/headers/include/thrust/system/cpp/detail/reduce.h +30 -0
  1497. cuda/cccl/headers/include/thrust/system/cpp/detail/reduce_by_key.h +30 -0
  1498. cuda/cccl/headers/include/thrust/system/cpp/detail/remove.h +30 -0
  1499. cuda/cccl/headers/include/thrust/system/cpp/detail/replace.h +29 -0
  1500. cuda/cccl/headers/include/thrust/system/cpp/detail/reverse.h +29 -0
  1501. cuda/cccl/headers/include/thrust/system/cpp/detail/scan.h +30 -0
  1502. cuda/cccl/headers/include/thrust/system/cpp/detail/scan_by_key.h +30 -0
  1503. cuda/cccl/headers/include/thrust/system/cpp/detail/scatter.h +29 -0
  1504. cuda/cccl/headers/include/thrust/system/cpp/detail/sequence.h +29 -0
  1505. cuda/cccl/headers/include/thrust/system/cpp/detail/set_operations.h +30 -0
  1506. cuda/cccl/headers/include/thrust/system/cpp/detail/sort.h +30 -0
  1507. cuda/cccl/headers/include/thrust/system/cpp/detail/swap_ranges.h +29 -0
  1508. cuda/cccl/headers/include/thrust/system/cpp/detail/tabulate.h +29 -0
  1509. cuda/cccl/headers/include/thrust/system/cpp/detail/temporary_buffer.h +29 -0
  1510. cuda/cccl/headers/include/thrust/system/cpp/detail/transform.h +29 -0
  1511. cuda/cccl/headers/include/thrust/system/cpp/detail/transform_reduce.h +29 -0
  1512. cuda/cccl/headers/include/thrust/system/cpp/detail/transform_scan.h +29 -0
  1513. cuda/cccl/headers/include/thrust/system/cpp/detail/uninitialized_copy.h +29 -0
  1514. cuda/cccl/headers/include/thrust/system/cpp/detail/uninitialized_fill.h +29 -0
  1515. cuda/cccl/headers/include/thrust/system/cpp/detail/unique.h +30 -0
  1516. cuda/cccl/headers/include/thrust/system/cpp/detail/unique_by_key.h +30 -0
  1517. cuda/cccl/headers/include/thrust/system/cpp/execution_policy.h +63 -0
  1518. cuda/cccl/headers/include/thrust/system/cpp/memory.h +106 -0
  1519. cuda/cccl/headers/include/thrust/system/cpp/memory_resource.h +72 -0
  1520. cuda/cccl/headers/include/thrust/system/cpp/pointer.h +120 -0
  1521. cuda/cccl/headers/include/thrust/system/cpp/vector.h +96 -0
  1522. cuda/cccl/headers/include/thrust/system/cuda/config.h +126 -0
  1523. cuda/cccl/headers/include/thrust/system/cuda/detail/adjacent_difference.h +219 -0
  1524. cuda/cccl/headers/include/thrust/system/cuda/detail/assign_value.h +124 -0
  1525. cuda/cccl/headers/include/thrust/system/cuda/detail/binary_search.h +29 -0
  1526. cuda/cccl/headers/include/thrust/system/cuda/detail/cdp_dispatch.h +72 -0
  1527. cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +240 -0
  1528. cuda/cccl/headers/include/thrust/system/cuda/detail/copy_if.h +255 -0
  1529. cuda/cccl/headers/include/thrust/system/cuda/detail/core/agent_launcher.h +289 -0
  1530. cuda/cccl/headers/include/thrust/system/cuda/detail/core/triple_chevron_launch.h +191 -0
  1531. cuda/cccl/headers/include/thrust/system/cuda/detail/core/util.h +593 -0
  1532. cuda/cccl/headers/include/thrust/system/cuda/detail/count.h +75 -0
  1533. cuda/cccl/headers/include/thrust/system/cuda/detail/cross_system.h +243 -0
  1534. cuda/cccl/headers/include/thrust/system/cuda/detail/dispatch.h +210 -0
  1535. cuda/cccl/headers/include/thrust/system/cuda/detail/equal.h +64 -0
  1536. cuda/cccl/headers/include/thrust/system/cuda/detail/error.inl +96 -0
  1537. cuda/cccl/headers/include/thrust/system/cuda/detail/execution_policy.h +264 -0
  1538. cuda/cccl/headers/include/thrust/system/cuda/detail/extrema.h +476 -0
  1539. cuda/cccl/headers/include/thrust/system/cuda/detail/fill.h +100 -0
  1540. cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +272 -0
  1541. cuda/cccl/headers/include/thrust/system/cuda/detail/for_each.h +83 -0
  1542. cuda/cccl/headers/include/thrust/system/cuda/detail/gather.h +91 -0
  1543. cuda/cccl/headers/include/thrust/system/cuda/detail/generate.h +60 -0
  1544. cuda/cccl/headers/include/thrust/system/cuda/detail/get_value.h +65 -0
  1545. cuda/cccl/headers/include/thrust/system/cuda/detail/inner_product.h +75 -0
  1546. cuda/cccl/headers/include/thrust/system/cuda/detail/iter_swap.h +80 -0
  1547. cuda/cccl/headers/include/thrust/system/cuda/detail/logical.h +29 -0
  1548. cuda/cccl/headers/include/thrust/system/cuda/detail/make_unsigned_special.h +61 -0
  1549. cuda/cccl/headers/include/thrust/system/cuda/detail/malloc_and_free.h +121 -0
  1550. cuda/cccl/headers/include/thrust/system/cuda/detail/memory.inl +57 -0
  1551. cuda/cccl/headers/include/thrust/system/cuda/detail/merge.h +228 -0
  1552. cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +217 -0
  1553. cuda/cccl/headers/include/thrust/system/cuda/detail/parallel_for.h +81 -0
  1554. cuda/cccl/headers/include/thrust/system/cuda/detail/partition.h +405 -0
  1555. cuda/cccl/headers/include/thrust/system/cuda/detail/per_device_resource.h +72 -0
  1556. cuda/cccl/headers/include/thrust/system/cuda/detail/reduce.h +785 -0
  1557. cuda/cccl/headers/include/thrust/system/cuda/detail/reduce_by_key.h +1001 -0
  1558. cuda/cccl/headers/include/thrust/system/cuda/detail/remove.h +107 -0
  1559. cuda/cccl/headers/include/thrust/system/cuda/detail/replace.h +122 -0
  1560. cuda/cccl/headers/include/thrust/system/cuda/detail/reverse.h +87 -0
  1561. cuda/cccl/headers/include/thrust/system/cuda/detail/scan.h +342 -0
  1562. cuda/cccl/headers/include/thrust/system/cuda/detail/scan_by_key.h +414 -0
  1563. cuda/cccl/headers/include/thrust/system/cuda/detail/scatter.h +91 -0
  1564. cuda/cccl/headers/include/thrust/system/cuda/detail/sequence.h +29 -0
  1565. cuda/cccl/headers/include/thrust/system/cuda/detail/set_operations.h +1734 -0
  1566. cuda/cccl/headers/include/thrust/system/cuda/detail/sort.h +470 -0
  1567. cuda/cccl/headers/include/thrust/system/cuda/detail/swap_ranges.h +98 -0
  1568. cuda/cccl/headers/include/thrust/system/cuda/detail/tabulate.h +75 -0
  1569. cuda/cccl/headers/include/thrust/system/cuda/detail/temporary_buffer.h +132 -0
  1570. cuda/cccl/headers/include/thrust/system/cuda/detail/terminate.h +53 -0
  1571. cuda/cccl/headers/include/thrust/system/cuda/detail/transform.h +429 -0
  1572. cuda/cccl/headers/include/thrust/system/cuda/detail/transform_reduce.h +143 -0
  1573. cuda/cccl/headers/include/thrust/system/cuda/detail/transform_scan.h +119 -0
  1574. cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_copy.h +117 -0
  1575. cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_fill.h +105 -0
  1576. cuda/cccl/headers/include/thrust/system/cuda/detail/unique.h +289 -0
  1577. cuda/cccl/headers/include/thrust/system/cuda/detail/unique_by_key.h +310 -0
  1578. cuda/cccl/headers/include/thrust/system/cuda/detail/util.h +253 -0
  1579. cuda/cccl/headers/include/thrust/system/cuda/error.h +168 -0
  1580. cuda/cccl/headers/include/thrust/system/cuda/execution_policy.h +15 -0
  1581. cuda/cccl/headers/include/thrust/system/cuda/memory.h +122 -0
  1582. cuda/cccl/headers/include/thrust/system/cuda/memory_resource.h +122 -0
  1583. cuda/cccl/headers/include/thrust/system/cuda/pointer.h +160 -0
  1584. cuda/cccl/headers/include/thrust/system/cuda/vector.h +108 -0
  1585. cuda/cccl/headers/include/thrust/system/detail/adl/adjacent_difference.h +51 -0
  1586. cuda/cccl/headers/include/thrust/system/detail/adl/assign_value.h +51 -0
  1587. cuda/cccl/headers/include/thrust/system/detail/adl/binary_search.h +51 -0
  1588. cuda/cccl/headers/include/thrust/system/detail/adl/copy.h +51 -0
  1589. cuda/cccl/headers/include/thrust/system/detail/adl/copy_if.h +52 -0
  1590. cuda/cccl/headers/include/thrust/system/detail/adl/count.h +51 -0
  1591. cuda/cccl/headers/include/thrust/system/detail/adl/equal.h +51 -0
  1592. cuda/cccl/headers/include/thrust/system/detail/adl/extrema.h +51 -0
  1593. cuda/cccl/headers/include/thrust/system/detail/adl/fill.h +51 -0
  1594. cuda/cccl/headers/include/thrust/system/detail/adl/find.h +51 -0
  1595. cuda/cccl/headers/include/thrust/system/detail/adl/for_each.h +51 -0
  1596. cuda/cccl/headers/include/thrust/system/detail/adl/gather.h +51 -0
  1597. cuda/cccl/headers/include/thrust/system/detail/adl/generate.h +51 -0
  1598. cuda/cccl/headers/include/thrust/system/detail/adl/get_value.h +51 -0
  1599. cuda/cccl/headers/include/thrust/system/detail/adl/inner_product.h +51 -0
  1600. cuda/cccl/headers/include/thrust/system/detail/adl/iter_swap.h +51 -0
  1601. cuda/cccl/headers/include/thrust/system/detail/adl/logical.h +51 -0
  1602. cuda/cccl/headers/include/thrust/system/detail/adl/malloc_and_free.h +51 -0
  1603. cuda/cccl/headers/include/thrust/system/detail/adl/merge.h +51 -0
  1604. cuda/cccl/headers/include/thrust/system/detail/adl/mismatch.h +51 -0
  1605. cuda/cccl/headers/include/thrust/system/detail/adl/partition.h +51 -0
  1606. cuda/cccl/headers/include/thrust/system/detail/adl/per_device_resource.h +51 -0
  1607. cuda/cccl/headers/include/thrust/system/detail/adl/reduce.h +51 -0
  1608. cuda/cccl/headers/include/thrust/system/detail/adl/reduce_by_key.h +51 -0
  1609. cuda/cccl/headers/include/thrust/system/detail/adl/remove.h +51 -0
  1610. cuda/cccl/headers/include/thrust/system/detail/adl/replace.h +51 -0
  1611. cuda/cccl/headers/include/thrust/system/detail/adl/reverse.h +51 -0
  1612. cuda/cccl/headers/include/thrust/system/detail/adl/scan.h +51 -0
  1613. cuda/cccl/headers/include/thrust/system/detail/adl/scan_by_key.h +51 -0
  1614. cuda/cccl/headers/include/thrust/system/detail/adl/scatter.h +51 -0
  1615. cuda/cccl/headers/include/thrust/system/detail/adl/sequence.h +51 -0
  1616. cuda/cccl/headers/include/thrust/system/detail/adl/set_operations.h +51 -0
  1617. cuda/cccl/headers/include/thrust/system/detail/adl/sort.h +51 -0
  1618. cuda/cccl/headers/include/thrust/system/detail/adl/swap_ranges.h +51 -0
  1619. cuda/cccl/headers/include/thrust/system/detail/adl/tabulate.h +51 -0
  1620. cuda/cccl/headers/include/thrust/system/detail/adl/temporary_buffer.h +51 -0
  1621. cuda/cccl/headers/include/thrust/system/detail/adl/transform.h +51 -0
  1622. cuda/cccl/headers/include/thrust/system/detail/adl/transform_reduce.h +51 -0
  1623. cuda/cccl/headers/include/thrust/system/detail/adl/transform_scan.h +51 -0
  1624. cuda/cccl/headers/include/thrust/system/detail/adl/uninitialized_copy.h +51 -0
  1625. cuda/cccl/headers/include/thrust/system/detail/adl/uninitialized_fill.h +51 -0
  1626. cuda/cccl/headers/include/thrust/system/detail/adl/unique.h +51 -0
  1627. cuda/cccl/headers/include/thrust/system/detail/adl/unique_by_key.h +51 -0
  1628. cuda/cccl/headers/include/thrust/system/detail/bad_alloc.h +61 -0
  1629. cuda/cccl/headers/include/thrust/system/detail/errno.h +120 -0
  1630. cuda/cccl/headers/include/thrust/system/detail/error_category.inl +302 -0
  1631. cuda/cccl/headers/include/thrust/system/detail/error_code.inl +173 -0
  1632. cuda/cccl/headers/include/thrust/system/detail/error_condition.inl +121 -0
  1633. cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.h +53 -0
  1634. cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.inl +79 -0
  1635. cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.h +161 -0
  1636. cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.inl +384 -0
  1637. cuda/cccl/headers/include/thrust/system/detail/generic/copy.h +45 -0
  1638. cuda/cccl/headers/include/thrust/system/detail/generic/copy.inl +64 -0
  1639. cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.h +58 -0
  1640. cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.inl +146 -0
  1641. cuda/cccl/headers/include/thrust/system/detail/generic/count.h +48 -0
  1642. cuda/cccl/headers/include/thrust/system/detail/generic/count.inl +84 -0
  1643. cuda/cccl/headers/include/thrust/system/detail/generic/equal.h +49 -0
  1644. cuda/cccl/headers/include/thrust/system/detail/generic/equal.inl +60 -0
  1645. cuda/cccl/headers/include/thrust/system/detail/generic/extrema.h +66 -0
  1646. cuda/cccl/headers/include/thrust/system/detail/generic/extrema.inl +252 -0
  1647. cuda/cccl/headers/include/thrust/system/detail/generic/fill.h +54 -0
  1648. cuda/cccl/headers/include/thrust/system/detail/generic/find.h +49 -0
  1649. cuda/cccl/headers/include/thrust/system/detail/generic/find.inl +137 -0
  1650. cuda/cccl/headers/include/thrust/system/detail/generic/for_each.h +58 -0
  1651. cuda/cccl/headers/include/thrust/system/detail/generic/gather.h +73 -0
  1652. cuda/cccl/headers/include/thrust/system/detail/generic/gather.inl +96 -0
  1653. cuda/cccl/headers/include/thrust/system/detail/generic/generate.h +45 -0
  1654. cuda/cccl/headers/include/thrust/system/detail/generic/generate.inl +63 -0
  1655. cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.h +60 -0
  1656. cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.inl +72 -0
  1657. cuda/cccl/headers/include/thrust/system/detail/generic/logical.h +59 -0
  1658. cuda/cccl/headers/include/thrust/system/detail/generic/memory.h +64 -0
  1659. cuda/cccl/headers/include/thrust/system/detail/generic/memory.inl +86 -0
  1660. cuda/cccl/headers/include/thrust/system/detail/generic/merge.h +99 -0
  1661. cuda/cccl/headers/include/thrust/system/detail/generic/merge.inl +148 -0
  1662. cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.h +49 -0
  1663. cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.inl +68 -0
  1664. cuda/cccl/headers/include/thrust/system/detail/generic/partition.h +129 -0
  1665. cuda/cccl/headers/include/thrust/system/detail/generic/partition.inl +207 -0
  1666. cuda/cccl/headers/include/thrust/system/detail/generic/per_device_resource.h +43 -0
  1667. cuda/cccl/headers/include/thrust/system/detail/generic/reduce.h +71 -0
  1668. cuda/cccl/headers/include/thrust/system/detail/generic/reduce.inl +100 -0
  1669. cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.h +83 -0
  1670. cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.inl +186 -0
  1671. cuda/cccl/headers/include/thrust/system/detail/generic/remove.h +86 -0
  1672. cuda/cccl/headers/include/thrust/system/detail/generic/remove.inl +121 -0
  1673. cuda/cccl/headers/include/thrust/system/detail/generic/replace.h +95 -0
  1674. cuda/cccl/headers/include/thrust/system/detail/generic/replace.inl +175 -0
  1675. cuda/cccl/headers/include/thrust/system/detail/generic/reverse.h +48 -0
  1676. cuda/cccl/headers/include/thrust/system/detail/generic/reverse.inl +67 -0
  1677. cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.h +63 -0
  1678. cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.inl +126 -0
  1679. cuda/cccl/headers/include/thrust/system/detail/generic/scan.h +72 -0
  1680. cuda/cccl/headers/include/thrust/system/detail/generic/scan.inl +85 -0
  1681. cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.h +126 -0
  1682. cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.inl +232 -0
  1683. cuda/cccl/headers/include/thrust/system/detail/generic/scatter.h +73 -0
  1684. cuda/cccl/headers/include/thrust/system/detail/generic/scatter.inl +85 -0
  1685. cuda/cccl/headers/include/thrust/system/detail/generic/select_system.h +104 -0
  1686. cuda/cccl/headers/include/thrust/system/detail/generic/sequence.h +70 -0
  1687. cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.h +282 -0
  1688. cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.inl +476 -0
  1689. cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.h +54 -0
  1690. cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.inl +125 -0
  1691. cuda/cccl/headers/include/thrust/system/detail/generic/sort.h +113 -0
  1692. cuda/cccl/headers/include/thrust/system/detail/generic/sort.inl +175 -0
  1693. cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.h +44 -0
  1694. cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.inl +76 -0
  1695. cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.h +41 -0
  1696. cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.inl +54 -0
  1697. cuda/cccl/headers/include/thrust/system/detail/generic/tag.h +47 -0
  1698. cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.h +54 -0
  1699. cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.inl +82 -0
  1700. cuda/cccl/headers/include/thrust/system/detail/generic/transform.h +395 -0
  1701. cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.h +50 -0
  1702. cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.inl +56 -0
  1703. cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.h +80 -0
  1704. cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.inl +113 -0
  1705. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.h +45 -0
  1706. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.inl +166 -0
  1707. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.h +45 -0
  1708. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.inl +115 -0
  1709. cuda/cccl/headers/include/thrust/system/detail/generic/unique.h +71 -0
  1710. cuda/cccl/headers/include/thrust/system/detail/generic/unique.inl +113 -0
  1711. cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.h +81 -0
  1712. cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.inl +126 -0
  1713. cuda/cccl/headers/include/thrust/system/detail/internal/decompose.h +117 -0
  1714. cuda/cccl/headers/include/thrust/system/detail/sequential/adjacent_difference.h +70 -0
  1715. cuda/cccl/headers/include/thrust/system/detail/sequential/assign_value.h +42 -0
  1716. cuda/cccl/headers/include/thrust/system/detail/sequential/binary_search.h +136 -0
  1717. cuda/cccl/headers/include/thrust/system/detail/sequential/copy.h +49 -0
  1718. cuda/cccl/headers/include/thrust/system/detail/sequential/copy.inl +119 -0
  1719. cuda/cccl/headers/include/thrust/system/detail/sequential/copy_backward.h +49 -0
  1720. cuda/cccl/headers/include/thrust/system/detail/sequential/copy_if.h +71 -0
  1721. cuda/cccl/headers/include/thrust/system/detail/sequential/count.h +29 -0
  1722. cuda/cccl/headers/include/thrust/system/detail/sequential/equal.h +29 -0
  1723. cuda/cccl/headers/include/thrust/system/detail/sequential/execution_policy.h +52 -0
  1724. cuda/cccl/headers/include/thrust/system/detail/sequential/extrema.h +110 -0
  1725. cuda/cccl/headers/include/thrust/system/detail/sequential/fill.h +29 -0
  1726. cuda/cccl/headers/include/thrust/system/detail/sequential/find.h +62 -0
  1727. cuda/cccl/headers/include/thrust/system/detail/sequential/for_each.h +74 -0
  1728. cuda/cccl/headers/include/thrust/system/detail/sequential/gather.h +29 -0
  1729. cuda/cccl/headers/include/thrust/system/detail/sequential/general_copy.h +123 -0
  1730. cuda/cccl/headers/include/thrust/system/detail/sequential/generate.h +29 -0
  1731. cuda/cccl/headers/include/thrust/system/detail/sequential/get_value.h +43 -0
  1732. cuda/cccl/headers/include/thrust/system/detail/sequential/inner_product.h +29 -0
  1733. cuda/cccl/headers/include/thrust/system/detail/sequential/insertion_sort.h +141 -0
  1734. cuda/cccl/headers/include/thrust/system/detail/sequential/iter_swap.h +45 -0
  1735. cuda/cccl/headers/include/thrust/system/detail/sequential/logical.h +29 -0
  1736. cuda/cccl/headers/include/thrust/system/detail/sequential/malloc_and_free.h +50 -0
  1737. cuda/cccl/headers/include/thrust/system/detail/sequential/merge.h +75 -0
  1738. cuda/cccl/headers/include/thrust/system/detail/sequential/merge.inl +145 -0
  1739. cuda/cccl/headers/include/thrust/system/detail/sequential/mismatch.h +29 -0
  1740. cuda/cccl/headers/include/thrust/system/detail/sequential/partition.h +301 -0
  1741. cuda/cccl/headers/include/thrust/system/detail/sequential/per_device_resource.h +29 -0
  1742. cuda/cccl/headers/include/thrust/system/detail/sequential/reduce.h +64 -0
  1743. cuda/cccl/headers/include/thrust/system/detail/sequential/reduce_by_key.h +98 -0
  1744. cuda/cccl/headers/include/thrust/system/detail/sequential/remove.h +179 -0
  1745. cuda/cccl/headers/include/thrust/system/detail/sequential/replace.h +29 -0
  1746. cuda/cccl/headers/include/thrust/system/detail/sequential/reverse.h +29 -0
  1747. cuda/cccl/headers/include/thrust/system/detail/sequential/scan.h +154 -0
  1748. cuda/cccl/headers/include/thrust/system/detail/sequential/scan_by_key.h +145 -0
  1749. cuda/cccl/headers/include/thrust/system/detail/sequential/scatter.h +29 -0
  1750. cuda/cccl/headers/include/thrust/system/detail/sequential/sequence.h +29 -0
  1751. cuda/cccl/headers/include/thrust/system/detail/sequential/set_operations.h +206 -0
  1752. cuda/cccl/headers/include/thrust/system/detail/sequential/sort.h +59 -0
  1753. cuda/cccl/headers/include/thrust/system/detail/sequential/sort.inl +116 -0
  1754. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.h +55 -0
  1755. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.inl +356 -0
  1756. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.h +48 -0
  1757. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.inl +124 -0
  1758. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.h +48 -0
  1759. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.inl +586 -0
  1760. cuda/cccl/headers/include/thrust/system/detail/sequential/swap_ranges.h +29 -0
  1761. cuda/cccl/headers/include/thrust/system/detail/sequential/tabulate.h +29 -0
  1762. cuda/cccl/headers/include/thrust/system/detail/sequential/temporary_buffer.h +29 -0
  1763. cuda/cccl/headers/include/thrust/system/detail/sequential/transform.h +29 -0
  1764. cuda/cccl/headers/include/thrust/system/detail/sequential/transform_reduce.h +29 -0
  1765. cuda/cccl/headers/include/thrust/system/detail/sequential/transform_scan.h +29 -0
  1766. cuda/cccl/headers/include/thrust/system/detail/sequential/trivial_copy.h +58 -0
  1767. cuda/cccl/headers/include/thrust/system/detail/sequential/uninitialized_copy.h +29 -0
  1768. cuda/cccl/headers/include/thrust/system/detail/sequential/uninitialized_fill.h +29 -0
  1769. cuda/cccl/headers/include/thrust/system/detail/sequential/unique.h +115 -0
  1770. cuda/cccl/headers/include/thrust/system/detail/sequential/unique_by_key.h +106 -0
  1771. cuda/cccl/headers/include/thrust/system/detail/system_error.inl +108 -0
  1772. cuda/cccl/headers/include/thrust/system/error_code.h +512 -0
  1773. cuda/cccl/headers/include/thrust/system/omp/detail/adjacent_difference.h +54 -0
  1774. cuda/cccl/headers/include/thrust/system/omp/detail/assign_value.h +30 -0
  1775. cuda/cccl/headers/include/thrust/system/omp/detail/binary_search.h +77 -0
  1776. cuda/cccl/headers/include/thrust/system/omp/detail/copy.h +50 -0
  1777. cuda/cccl/headers/include/thrust/system/omp/detail/copy.inl +74 -0
  1778. cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.h +56 -0
  1779. cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.inl +59 -0
  1780. cuda/cccl/headers/include/thrust/system/omp/detail/count.h +30 -0
  1781. cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.h +50 -0
  1782. cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.inl +65 -0
  1783. cuda/cccl/headers/include/thrust/system/omp/detail/equal.h +30 -0
  1784. cuda/cccl/headers/include/thrust/system/omp/detail/execution_policy.h +127 -0
  1785. cuda/cccl/headers/include/thrust/system/omp/detail/extrema.h +66 -0
  1786. cuda/cccl/headers/include/thrust/system/omp/detail/fill.h +30 -0
  1787. cuda/cccl/headers/include/thrust/system/omp/detail/find.h +53 -0
  1788. cuda/cccl/headers/include/thrust/system/omp/detail/for_each.h +56 -0
  1789. cuda/cccl/headers/include/thrust/system/omp/detail/for_each.inl +87 -0
  1790. cuda/cccl/headers/include/thrust/system/omp/detail/gather.h +30 -0
  1791. cuda/cccl/headers/include/thrust/system/omp/detail/generate.h +30 -0
  1792. cuda/cccl/headers/include/thrust/system/omp/detail/get_value.h +30 -0
  1793. cuda/cccl/headers/include/thrust/system/omp/detail/inner_product.h +30 -0
  1794. cuda/cccl/headers/include/thrust/system/omp/detail/iter_swap.h +30 -0
  1795. cuda/cccl/headers/include/thrust/system/omp/detail/logical.h +30 -0
  1796. cuda/cccl/headers/include/thrust/system/omp/detail/malloc_and_free.h +30 -0
  1797. cuda/cccl/headers/include/thrust/system/omp/detail/memory.inl +93 -0
  1798. cuda/cccl/headers/include/thrust/system/omp/detail/merge.h +30 -0
  1799. cuda/cccl/headers/include/thrust/system/omp/detail/mismatch.h +30 -0
  1800. cuda/cccl/headers/include/thrust/system/omp/detail/partition.h +88 -0
  1801. cuda/cccl/headers/include/thrust/system/omp/detail/partition.inl +102 -0
  1802. cuda/cccl/headers/include/thrust/system/omp/detail/per_device_resource.h +29 -0
  1803. cuda/cccl/headers/include/thrust/system/omp/detail/pragma_omp.h +54 -0
  1804. cuda/cccl/headers/include/thrust/system/omp/detail/reduce.h +54 -0
  1805. cuda/cccl/headers/include/thrust/system/omp/detail/reduce.inl +78 -0
  1806. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.h +64 -0
  1807. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.inl +65 -0
  1808. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.h +59 -0
  1809. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.inl +103 -0
  1810. cuda/cccl/headers/include/thrust/system/omp/detail/remove.h +72 -0
  1811. cuda/cccl/headers/include/thrust/system/omp/detail/remove.inl +87 -0
  1812. cuda/cccl/headers/include/thrust/system/omp/detail/replace.h +30 -0
  1813. cuda/cccl/headers/include/thrust/system/omp/detail/reverse.h +30 -0
  1814. cuda/cccl/headers/include/thrust/system/omp/detail/scan.h +30 -0
  1815. cuda/cccl/headers/include/thrust/system/omp/detail/scan_by_key.h +30 -0
  1816. cuda/cccl/headers/include/thrust/system/omp/detail/scatter.h +30 -0
  1817. cuda/cccl/headers/include/thrust/system/omp/detail/sequence.h +30 -0
  1818. cuda/cccl/headers/include/thrust/system/omp/detail/set_operations.h +30 -0
  1819. cuda/cccl/headers/include/thrust/system/omp/detail/sort.h +60 -0
  1820. cuda/cccl/headers/include/thrust/system/omp/detail/sort.inl +265 -0
  1821. cuda/cccl/headers/include/thrust/system/omp/detail/swap_ranges.h +30 -0
  1822. cuda/cccl/headers/include/thrust/system/omp/detail/tabulate.h +30 -0
  1823. cuda/cccl/headers/include/thrust/system/omp/detail/temporary_buffer.h +29 -0
  1824. cuda/cccl/headers/include/thrust/system/omp/detail/transform.h +30 -0
  1825. cuda/cccl/headers/include/thrust/system/omp/detail/transform_reduce.h +30 -0
  1826. cuda/cccl/headers/include/thrust/system/omp/detail/transform_scan.h +30 -0
  1827. cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_copy.h +30 -0
  1828. cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_fill.h +30 -0
  1829. cuda/cccl/headers/include/thrust/system/omp/detail/unique.h +60 -0
  1830. cuda/cccl/headers/include/thrust/system/omp/detail/unique.inl +71 -0
  1831. cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.h +67 -0
  1832. cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.inl +75 -0
  1833. cuda/cccl/headers/include/thrust/system/omp/execution_policy.h +62 -0
  1834. cuda/cccl/headers/include/thrust/system/omp/memory.h +111 -0
  1835. cuda/cccl/headers/include/thrust/system/omp/memory_resource.h +75 -0
  1836. cuda/cccl/headers/include/thrust/system/omp/pointer.h +124 -0
  1837. cuda/cccl/headers/include/thrust/system/omp/vector.h +99 -0
  1838. cuda/cccl/headers/include/thrust/system/system_error.h +185 -0
  1839. cuda/cccl/headers/include/thrust/system/tbb/detail/adjacent_difference.h +54 -0
  1840. cuda/cccl/headers/include/thrust/system/tbb/detail/assign_value.h +30 -0
  1841. cuda/cccl/headers/include/thrust/system/tbb/detail/binary_search.h +30 -0
  1842. cuda/cccl/headers/include/thrust/system/tbb/detail/copy.h +50 -0
  1843. cuda/cccl/headers/include/thrust/system/tbb/detail/copy.inl +73 -0
  1844. cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.h +47 -0
  1845. cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.inl +136 -0
  1846. cuda/cccl/headers/include/thrust/system/tbb/detail/count.h +30 -0
  1847. cuda/cccl/headers/include/thrust/system/tbb/detail/equal.h +30 -0
  1848. cuda/cccl/headers/include/thrust/system/tbb/detail/execution_policy.h +109 -0
  1849. cuda/cccl/headers/include/thrust/system/tbb/detail/extrema.h +66 -0
  1850. cuda/cccl/headers/include/thrust/system/tbb/detail/fill.h +30 -0
  1851. cuda/cccl/headers/include/thrust/system/tbb/detail/find.h +49 -0
  1852. cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.h +51 -0
  1853. cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.inl +91 -0
  1854. cuda/cccl/headers/include/thrust/system/tbb/detail/gather.h +30 -0
  1855. cuda/cccl/headers/include/thrust/system/tbb/detail/generate.h +30 -0
  1856. cuda/cccl/headers/include/thrust/system/tbb/detail/get_value.h +30 -0
  1857. cuda/cccl/headers/include/thrust/system/tbb/detail/inner_product.h +30 -0
  1858. cuda/cccl/headers/include/thrust/system/tbb/detail/iter_swap.h +30 -0
  1859. cuda/cccl/headers/include/thrust/system/tbb/detail/logical.h +30 -0
  1860. cuda/cccl/headers/include/thrust/system/tbb/detail/malloc_and_free.h +30 -0
  1861. cuda/cccl/headers/include/thrust/system/tbb/detail/memory.inl +94 -0
  1862. cuda/cccl/headers/include/thrust/system/tbb/detail/merge.h +77 -0
  1863. cuda/cccl/headers/include/thrust/system/tbb/detail/merge.inl +327 -0
  1864. cuda/cccl/headers/include/thrust/system/tbb/detail/mismatch.h +30 -0
  1865. cuda/cccl/headers/include/thrust/system/tbb/detail/partition.h +84 -0
  1866. cuda/cccl/headers/include/thrust/system/tbb/detail/partition.inl +98 -0
  1867. cuda/cccl/headers/include/thrust/system/tbb/detail/per_device_resource.h +29 -0
  1868. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.h +54 -0
  1869. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.inl +137 -0
  1870. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.h +61 -0
  1871. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.inl +400 -0
  1872. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_intervals.h +140 -0
  1873. cuda/cccl/headers/include/thrust/system/tbb/detail/remove.h +76 -0
  1874. cuda/cccl/headers/include/thrust/system/tbb/detail/remove.inl +87 -0
  1875. cuda/cccl/headers/include/thrust/system/tbb/detail/replace.h +30 -0
  1876. cuda/cccl/headers/include/thrust/system/tbb/detail/reverse.h +30 -0
  1877. cuda/cccl/headers/include/thrust/system/tbb/detail/scan.h +59 -0
  1878. cuda/cccl/headers/include/thrust/system/tbb/detail/scan.inl +312 -0
  1879. cuda/cccl/headers/include/thrust/system/tbb/detail/scan_by_key.h +30 -0
  1880. cuda/cccl/headers/include/thrust/system/tbb/detail/scatter.h +30 -0
  1881. cuda/cccl/headers/include/thrust/system/tbb/detail/sequence.h +30 -0
  1882. cuda/cccl/headers/include/thrust/system/tbb/detail/set_operations.h +30 -0
  1883. cuda/cccl/headers/include/thrust/system/tbb/detail/sort.h +60 -0
  1884. cuda/cccl/headers/include/thrust/system/tbb/detail/sort.inl +295 -0
  1885. cuda/cccl/headers/include/thrust/system/tbb/detail/swap_ranges.h +30 -0
  1886. cuda/cccl/headers/include/thrust/system/tbb/detail/tabulate.h +30 -0
  1887. cuda/cccl/headers/include/thrust/system/tbb/detail/temporary_buffer.h +29 -0
  1888. cuda/cccl/headers/include/thrust/system/tbb/detail/transform.h +30 -0
  1889. cuda/cccl/headers/include/thrust/system/tbb/detail/transform_reduce.h +30 -0
  1890. cuda/cccl/headers/include/thrust/system/tbb/detail/transform_scan.h +30 -0
  1891. cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_copy.h +30 -0
  1892. cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_fill.h +30 -0
  1893. cuda/cccl/headers/include/thrust/system/tbb/detail/unique.h +60 -0
  1894. cuda/cccl/headers/include/thrust/system/tbb/detail/unique.inl +71 -0
  1895. cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.h +67 -0
  1896. cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.inl +75 -0
  1897. cuda/cccl/headers/include/thrust/system/tbb/execution_policy.h +62 -0
  1898. cuda/cccl/headers/include/thrust/system/tbb/memory.h +111 -0
  1899. cuda/cccl/headers/include/thrust/system/tbb/memory_resource.h +75 -0
  1900. cuda/cccl/headers/include/thrust/system/tbb/pointer.h +124 -0
  1901. cuda/cccl/headers/include/thrust/system/tbb/vector.h +99 -0
  1902. cuda/cccl/headers/include/thrust/system_error.h +57 -0
  1903. cuda/cccl/headers/include/thrust/tabulate.h +125 -0
  1904. cuda/cccl/headers/include/thrust/transform.h +1045 -0
  1905. cuda/cccl/headers/include/thrust/transform_reduce.h +190 -0
  1906. cuda/cccl/headers/include/thrust/transform_scan.h +442 -0
  1907. cuda/cccl/headers/include/thrust/tuple.h +139 -0
  1908. cuda/cccl/headers/include/thrust/type_traits/integer_sequence.h +261 -0
  1909. cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +154 -0
  1910. cuda/cccl/headers/include/thrust/type_traits/is_execution_policy.h +65 -0
  1911. cuda/cccl/headers/include/thrust/type_traits/is_operator_less_or_greater_function_object.h +184 -0
  1912. cuda/cccl/headers/include/thrust/type_traits/is_operator_plus_function_object.h +116 -0
  1913. cuda/cccl/headers/include/thrust/type_traits/is_trivially_relocatable.h +336 -0
  1914. cuda/cccl/headers/include/thrust/type_traits/logical_metafunctions.h +42 -0
  1915. cuda/cccl/headers/include/thrust/type_traits/unwrap_contiguous_iterator.h +96 -0
  1916. cuda/cccl/headers/include/thrust/uninitialized_copy.h +300 -0
  1917. cuda/cccl/headers/include/thrust/uninitialized_fill.h +268 -0
  1918. cuda/cccl/headers/include/thrust/unique.h +1088 -0
  1919. cuda/cccl/headers/include/thrust/universal_allocator.h +93 -0
  1920. cuda/cccl/headers/include/thrust/universal_ptr.h +34 -0
  1921. cuda/cccl/headers/include/thrust/universal_vector.h +71 -0
  1922. cuda/cccl/headers/include/thrust/version.h +93 -0
  1923. cuda/cccl/headers/include/thrust/zip_function.h +176 -0
  1924. cuda/cccl/headers/include_paths.py +51 -0
  1925. cuda/cccl/parallel/__init__.py +9 -0
  1926. cuda/cccl/parallel/experimental/.gitignore +4 -0
  1927. cuda/cccl/parallel/experimental/__init__.py +73 -0
  1928. cuda/cccl/parallel/experimental/_bindings.py +79 -0
  1929. cuda/cccl/parallel/experimental/_bindings.pyi +405 -0
  1930. cuda/cccl/parallel/experimental/_bindings_impl.pyx +1984 -0
  1931. cuda/cccl/parallel/experimental/_caching.py +71 -0
  1932. cuda/cccl/parallel/experimental/_cccl_interop.py +422 -0
  1933. cuda/cccl/parallel/experimental/_utils/__init__.py +0 -0
  1934. cuda/cccl/parallel/experimental/_utils/protocols.py +132 -0
  1935. cuda/cccl/parallel/experimental/_utils/temp_storage_buffer.py +86 -0
  1936. cuda/cccl/parallel/experimental/algorithms/__init__.py +50 -0
  1937. cuda/cccl/parallel/experimental/algorithms/_histogram.py +243 -0
  1938. cuda/cccl/parallel/experimental/algorithms/_merge_sort.py +225 -0
  1939. cuda/cccl/parallel/experimental/algorithms/_radix_sort.py +312 -0
  1940. cuda/cccl/parallel/experimental/algorithms/_reduce.py +184 -0
  1941. cuda/cccl/parallel/experimental/algorithms/_scan.py +261 -0
  1942. cuda/cccl/parallel/experimental/algorithms/_segmented_reduce.py +257 -0
  1943. cuda/cccl/parallel/experimental/algorithms/_transform.py +308 -0
  1944. cuda/cccl/parallel/experimental/algorithms/_unique_by_key.py +252 -0
  1945. cuda/cccl/parallel/experimental/cccl/.gitkeep +0 -0
  1946. cuda/cccl/parallel/experimental/cu12/_bindings_impl.cpython-312-aarch64-linux-gnu.so +0 -0
  1947. cuda/cccl/parallel/experimental/cu12/cccl/libcccl.c.parallel.so +0 -0
  1948. cuda/cccl/parallel/experimental/cu13/_bindings_impl.cpython-312-aarch64-linux-gnu.so +0 -0
  1949. cuda/cccl/parallel/experimental/cu13/cccl/libcccl.c.parallel.so +0 -0
  1950. cuda/cccl/parallel/experimental/iterators/__init__.py +19 -0
  1951. cuda/cccl/parallel/experimental/iterators/_factories.py +191 -0
  1952. cuda/cccl/parallel/experimental/iterators/_iterators.py +612 -0
  1953. cuda/cccl/parallel/experimental/iterators/_zip_iterator.py +199 -0
  1954. cuda/cccl/parallel/experimental/numba_utils.py +53 -0
  1955. cuda/cccl/parallel/experimental/op.py +3 -0
  1956. cuda/cccl/parallel/experimental/struct.py +272 -0
  1957. cuda/cccl/parallel/experimental/typing.py +35 -0
  1958. cuda/cccl/py.typed +0 -0
  1959. cuda_cccl-0.1.3.2.0.dev438.dist-info/METADATA +42 -0
  1960. cuda_cccl-0.1.3.2.0.dev438.dist-info/RECORD +1962 -0
  1961. cuda_cccl-0.1.3.2.0.dev438.dist-info/WHEEL +5 -0
  1962. cuda_cccl-0.1.3.2.0.dev438.dist-info/licenses/LICENSE +1 -0
@@ -0,0 +1,2194 @@
1
+ /******************************************************************************
2
+ * Copyright (c) 2011, Duane Merrill. All rights reserved.
3
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ * * Redistributions of source code must retain the above copyright
8
+ * notice, this list of conditions and the following disclaimer.
9
+ * * Redistributions in binary form must reproduce the above copyright
10
+ * notice, this list of conditions and the following disclaimer in the
11
+ * documentation and/or other materials provided with the distribution.
12
+ * * Neither the name of the NVIDIA CORPORATION nor the
13
+ * names of its contributors may be used to endorse or promote products
14
+ * derived from this software without specific prior written permission.
15
+ *
16
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
+ *
27
+ ******************************************************************************/
28
+
29
+ /**
30
+ * @file
31
+ * The cub::BlockRadixSort class provides [<em>collective</em>](../index.html#sec0) methods for radix
32
+ * sorting of items partitioned across a CUDA thread block.
33
+ */
34
+
35
+ #pragma once
36
+
37
+ #include <cub/config.cuh>
38
+
39
+ #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
40
+ # pragma GCC system_header
41
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
42
+ # pragma clang system_header
43
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
44
+ # pragma system_header
45
+ #endif // no system header
46
+
47
+ #include <cub/block/block_exchange.cuh>
48
+ #include <cub/block/block_radix_rank.cuh>
49
+ #include <cub/block/radix_rank_sort_operations.cuh>
50
+ #include <cub/util_ptx.cuh>
51
+ #include <cub/util_type.cuh>
52
+
53
+ #include <cuda/std/__algorithm/min.h>
54
+ #include <cuda/std/__type_traits/enable_if.h>
55
+ #include <cuda/std/__type_traits/integral_constant.h>
56
+ #include <cuda/std/__type_traits/is_convertible.h>
57
+ #include <cuda/std/__type_traits/is_same.h>
58
+
59
+ CUB_NAMESPACE_BEGIN
60
+
61
+ //! @rst
62
+ //! The BlockRadixSort class provides :ref:`collective <collective-primitives>` methods for sorting
63
+ //! items partitioned across a CUDA thread block using a radix sorting method.
64
+ //!
65
+ //! .. image:: ../../img/sorting_logo.png
66
+ //! :align: center
67
+ //!
68
+ //! Overview
69
+ //! --------------------------------------------------
70
+ //!
71
+ //! The `radix sorting method <http://en.wikipedia.org/wiki/Radix_sort>`_ arranges
72
+ //! items into ascending order. It relies upon a positional representation for
73
+ //! keys, i.e., each key is comprised of an ordered sequence of symbols (e.g., digits,
74
+ //! characters, etc.) specified from least-significant to most-significant. For a
75
+ //! given input sequence of keys and a set of rules specifying a total ordering
76
+ //! of the symbolic alphabet, the radix sorting method produces a lexicographic
77
+ //! ordering of those keys.
78
+ //!
79
+ //! @rowmajor
80
+ //!
81
+ //! Supported Types
82
+ //! --------------------------------------------------
83
+ //!
84
+ //! BlockRadixSort can sort all of the built-in C++ numeric primitive types
85
+ //! (``unsigned char``, ``int``, ``double``, etc.) as well as CUDA's ``__half``
86
+ //! half-precision floating-point type. User-defined types are supported as long
87
+ //! as a decomposer object is provided.
88
+ //!
89
+ //! Floating-Point Special Cases
90
+ //! --------------------------------------------------
91
+ //!
92
+ //! - Positive and negative zeros are considered equivalent, and will be treated
93
+ //! as such in the output.
94
+ //! - No special handling is implemented for NaN values; these are sorted
95
+ //! according to their bit representations after any transformations.
96
+ //!
97
+ //! Bitwise Key Transformations
98
+ //! --------------------------------------------------
99
+ //!
100
+ //! Although the direct radix sorting method can only be applied to unsigned
101
+ //! integral types, BlockRadixSort is able to sort signed and floating-point
102
+ //! types via simple bit-wise transformations that ensure lexicographic key
103
+ //! ordering.
104
+ //!
105
+ //! These transformations must be considered when restricting the
106
+ //! ``[begin_bit, end_bit)`` range, as the bitwise transformations will occur
107
+ //! before the bit-range truncation.
108
+ //!
109
+ //! Any transformations applied to the keys prior to sorting are reversed
110
+ //! while writing to the final output buffer.
111
+ //!
112
+ //! Type Specific Bitwise Transformations
113
+ //! --------------------------------------------------
114
+ //!
115
+ //! To convert the input values into a radix-sortable bitwise representation,
116
+ //! the following transformations take place prior to sorting:
117
+ //!
118
+ //! * For unsigned integral values, the keys are used directly.
119
+ //! * For signed integral values, the sign bit is inverted.
120
+ //! * For positive floating point values, the sign bit is inverted.
121
+ //! * For negative floating point values, the full key is inverted.
122
+ //!
123
+ //! No Descending Sort Transformations
124
+ //! --------------------------------------------------
125
+ //!
126
+ //! Unlike ``DeviceRadixSort``, ``BlockRadixSort`` does not invert the input key bits
127
+ //! when performing a descending sort. Instead, it has special logic to reverse
128
+ //! the order of the keys while sorting.
129
+ //!
130
+ //! Stability
131
+ //! --------------------------------------------------
132
+ //!
133
+ //! BlockRadixSort is stable. For floating-point types -0.0 and +0.0
134
+ //! are considered equal and appear in the result in the same order as they
135
+ //! appear in the input.
136
+ //!
137
+ //!
138
+ //! Performance Considerations
139
+ //! --------------------------------------------------
140
+ //!
141
+ //! * @granularity
142
+ //!
143
+ //! A Simple Example
144
+ //! --------------------------------------------------
145
+ //!
146
+ //! @blockcollective{BlockRadixSort}
147
+ //!
148
+ //! The code snippet below illustrates a sort of 512 integer keys that
149
+ //! are partitioned in a [<em>blocked arrangement</em>](../index.html#sec5sec3) across 128 threads
150
+ //! where each thread owns 4 consecutive items.
151
+ //!
152
+ //! .. tab-set-code::
153
+ //!
154
+ //! .. code-block:: c++
155
+ //!
156
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_radix_sort.cuh>
157
+ //!
158
+ //! __global__ void kernel(...)
159
+ //! {
160
+ //! // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer items each
161
+ //! using BlockRadixSort = cub::BlockRadixSort<int, 128, 4>;
162
+ //!
163
+ //! // Allocate shared memory for BlockRadixSort
164
+ //! __shared__ typename BlockRadixSort::TempStorage temp_storage;
165
+ //!
166
+ //! // Obtain a segment of consecutive items that are blocked across threads
167
+ //! int thread_keys[4];
168
+ //! ...
169
+ //!
170
+ //! // Collectively sort the keys
171
+ //! BlockRadixSort(temp_storage).Sort(thread_keys);
172
+ //!
173
+ //! ...
174
+ //!
175
+ //! .. code-block:: python
176
+ //!
177
+ //! import cuda.cccl.cooperative.experimental as cudax
178
+ //!
179
+ //! # Specialize radix sort for a 1D block of 128 threads owning 4 integer items each
180
+ //! block_radix_sort = cudax.block.radix_sort_keys(numba.int32, 128, 4)
181
+ //! temp_storage_bytes = block_radix_sort.temp_storage_bytes
182
+ //!
183
+ //! @cuda.jit(link=block_radix_sort.files)
184
+ //! def kernel():
185
+ //! # Allocate shared memory for radix sort
186
+ //! temp_storage = cuda.shared.array(shape=temp_storage_bytes, dtype='uint8')
187
+ //!
188
+ //! # Obtain a segment of consecutive items that are blocked across threads
189
+ //! thread_keys = cuda.local.array(shape=items_per_thread, dtype=numba.int32)
190
+ //! # ...
191
+ //!
192
+ //! # Collectively sort the keys
193
+ //! block_radix_sort(temp_storage, thread_keys)
194
+ //! # ...
195
+ //!
196
+ //! Suppose the set of input ``thread_keys`` across the block of threads is
197
+ //! ``{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }``.
198
+ //! The corresponding output ``thread_keys`` in those threads will be
199
+ //! ``{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }``.
200
+ //!
201
+ //! Re-using dynamically allocated shared memory
202
+ //! --------------------------------------------------
203
+ //!
204
+ //! The ``block/example_block_reduce_dyn_smem.cu`` example illustrates usage of dynamically allocated shared memory with
205
+ //! BlockReduce and how to re-purpose the same memory region.
206
+ //!
207
+ //! This example can be easily adapted to the storage required by BlockRadixSort.
208
+ //! @endrst
209
+ //!
210
+ //! @tparam KeyT
211
+ //! The type of the keys to be sorted
212
+ //!
213
+ //! @tparam BLOCK_DIM_X
214
+ //! The thread block length in threads along the X dimension
215
+ //!
216
+ //! @tparam ITEMS_PER_THREAD
217
+ //! The number of items per thread
218
+ //!
219
+ //! @tparam ValueT
220
+ //! **[optional]** ValueT type (default: cub::NullType, which indicates a keys-only sort)
221
+ //!
222
+ //! @tparam RADIX_BITS
223
+ //! **[optional]** The number of radix bits per digit place (default: 4 bits)
224
+ //!
225
+ //! @tparam MEMOIZE_OUTER_SCAN
226
+ //! **[optional]** Whether or not to buffer outer raking scan partials to incur fewer shared memory
227
+ //! reads at the expense of higher register pressure (default: true, independent of the
228
+ //! target architecture).
229
+ //!
230
+ //! @tparam INNER_SCAN_ALGORITHM
231
+ //! **[optional]** The cub::BlockScanAlgorithm algorithm to use
232
+ //! (default: cub::BLOCK_SCAN_WARP_SCANS)
233
+ //!
234
+ //! @tparam SMEM_CONFIG
235
+ //! **[optional]** Shared memory bank mode (default: `cudaSharedMemBankSizeFourByte`)
236
+ //!
237
+ //! @tparam BLOCK_DIM_Y
238
+ //! **[optional]** The thread block length in threads along the Y dimension (default: 1)
239
+ //!
240
+ //! @tparam BLOCK_DIM_Z
241
+ //! **[optional]** The thread block length in threads along the Z dimension (default: 1)
242
+ //!
243
+ template <typename KeyT,
244
+ int BLOCK_DIM_X,
245
+ int ITEMS_PER_THREAD,
246
+ typename ValueT = NullType,
247
+ int RADIX_BITS = 4,
248
+ bool MEMOIZE_OUTER_SCAN = true,
249
+ BlockScanAlgorithm INNER_SCAN_ALGORITHM = BLOCK_SCAN_WARP_SCANS,
250
+ cudaSharedMemConfig SMEM_CONFIG = cudaSharedMemBankSizeFourByte,
251
+ int BLOCK_DIM_Y = 1,
252
+ int BLOCK_DIM_Z = 1>
253
+ class BlockRadixSort
254
+ {
255
+ private:
256
+ /******************************************************************************
257
+ * Constants and type definitions
258
+ ******************************************************************************/
259
+
260
+ enum
261
+ {
262
+ // The thread block size in threads
263
+ BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
264
+
265
+ // Whether or not there are values to be trucked along with keys
266
+ KEYS_ONLY = ::cuda::std::is_same_v<ValueT, NullType>,
267
+ };
268
+
269
+ // KeyT traits and unsigned bits type
270
+ using traits = detail::radix::traits_t<KeyT>;
271
+ using bit_ordered_type = typename traits::bit_ordered_type;
272
+ using bit_ordered_conversion = typename traits::bit_ordered_conversion_policy;
273
+
274
+ /// Ascending BlockRadixRank utility type
275
+ using AscendingBlockRadixRank =
276
+ BlockRadixRank<BLOCK_DIM_X,
277
+ RADIX_BITS,
278
+ false,
279
+ MEMOIZE_OUTER_SCAN,
280
+ INNER_SCAN_ALGORITHM,
281
+ SMEM_CONFIG,
282
+ BLOCK_DIM_Y,
283
+ BLOCK_DIM_Z>;
284
+
285
+ /// Descending BlockRadixRank utility type
286
+ using DescendingBlockRadixRank =
287
+ BlockRadixRank<BLOCK_DIM_X,
288
+ RADIX_BITS,
289
+ true,
290
+ MEMOIZE_OUTER_SCAN,
291
+ INNER_SCAN_ALGORITHM,
292
+ SMEM_CONFIG,
293
+ BLOCK_DIM_Y,
294
+ BLOCK_DIM_Z>;
295
+
296
+ /// Digit extractor type
297
+ using fundamental_digit_extractor_t = BFEDigitExtractor<KeyT>;
298
+
299
+ /// BlockExchange utility type for keys
300
+ using BlockExchangeKeys = BlockExchange<KeyT, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z>;
301
+
302
+ /// BlockExchange utility type for values
303
+ using BlockExchangeValues = BlockExchange<ValueT, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z>;
304
+
305
+ #ifndef _CCCL_DOXYGEN_INVOKED // Do not document
306
+ /// Shared memory storage layout type
307
+ union _TempStorage
308
+ {
309
+ typename AscendingBlockRadixRank::TempStorage asending_ranking_storage;
310
+ typename DescendingBlockRadixRank::TempStorage descending_ranking_storage;
311
+ typename BlockExchangeKeys::TempStorage exchange_keys;
312
+ typename BlockExchangeValues::TempStorage exchange_values;
313
+ };
314
+ #endif // _CCCL_DOXYGEN_INVOKED
315
+
316
+ /******************************************************************************
317
+ * Thread fields
318
+ ******************************************************************************/
319
+
320
+ /// Shared storage reference
321
+ _TempStorage& temp_storage;
322
+
323
+ /// Linear thread-id
324
+ unsigned int linear_tid;
325
+
326
+ /******************************************************************************
327
+ * Utility methods
328
+ ******************************************************************************/
329
+
330
+ /// Internal storage allocator
331
+ _CCCL_DEVICE _CCCL_FORCEINLINE _TempStorage& PrivateStorage()
332
+ {
333
+ __shared__ _TempStorage private_storage;
334
+ return private_storage;
335
+ }
336
+
337
+ /// Rank keys (specialized for ascending sort)
338
+ template <class DigitExtractorT>
339
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
340
+ RankKeys(bit_ordered_type (&unsigned_keys)[ITEMS_PER_THREAD],
341
+ int (&ranks)[ITEMS_PER_THREAD],
342
+ DigitExtractorT digit_extractor,
343
+ ::cuda::std::false_type /*is_descending*/)
344
+ {
345
+ AscendingBlockRadixRank(temp_storage.asending_ranking_storage).RankKeys(unsigned_keys, ranks, digit_extractor);
346
+ }
347
+
348
+ /// Rank keys (specialized for descending sort)
349
+ template <class DigitExtractorT>
350
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
351
+ RankKeys(bit_ordered_type (&unsigned_keys)[ITEMS_PER_THREAD],
352
+ int (&ranks)[ITEMS_PER_THREAD],
353
+ DigitExtractorT digit_extractor,
354
+ ::cuda::std::true_type /*is_descending*/)
355
+ {
356
+ DescendingBlockRadixRank(temp_storage.descending_ranking_storage).RankKeys(unsigned_keys, ranks, digit_extractor);
357
+ }
358
+
359
+ /// ExchangeValues (specialized for key-value sort, to-blocked arrangement)
360
+ _CCCL_DEVICE _CCCL_FORCEINLINE void ExchangeValues(
361
+ ValueT (&values)[ITEMS_PER_THREAD],
362
+ int (&ranks)[ITEMS_PER_THREAD],
363
+ ::cuda::std::false_type /*is_keys_only*/,
364
+ ::cuda::std::true_type /*is_blocked*/)
365
+ {
366
+ __syncthreads();
367
+
368
+ // Exchange values through shared memory in blocked arrangement
369
+ BlockExchangeValues(temp_storage.exchange_values).ScatterToBlocked(values, ranks);
370
+ }
371
+
372
+ /// ExchangeValues (specialized for key-value sort, to-striped arrangement)
373
+ _CCCL_DEVICE _CCCL_FORCEINLINE void ExchangeValues(
374
+ ValueT (&values)[ITEMS_PER_THREAD],
375
+ int (&ranks)[ITEMS_PER_THREAD],
376
+ ::cuda::std::false_type /*is_keys_only*/,
377
+ ::cuda::std::false_type /*is_blocked*/)
378
+ {
379
+ __syncthreads();
380
+
381
+ // Exchange values through shared memory in blocked arrangement
382
+ BlockExchangeValues(temp_storage.exchange_values).ScatterToStriped(values, ranks);
383
+ }
384
+
385
+ /// ExchangeValues (specialized for keys-only sort)
386
+ template <bool IS_BLOCKED>
387
+ _CCCL_DEVICE _CCCL_FORCEINLINE void ExchangeValues(
388
+ ValueT (& /*values*/)[ITEMS_PER_THREAD],
389
+ int (& /*ranks*/)[ITEMS_PER_THREAD],
390
+ ::cuda::std::true_type /*is_keys_only*/,
391
+ ::cuda::std::bool_constant<IS_BLOCKED> /*is_blocked*/)
392
+ {}
393
+
394
+ /**
395
+ * @brief Sort blocked arrangement
396
+ *
397
+ * @param keys
398
+ * Keys to sort
399
+ *
400
+ * @param values
401
+ * Values to sort
402
+ *
403
+ * @param begin_bit
404
+ * The beginning (least-significant) bit index needed for key comparison
405
+ *
406
+ * @param end_bit
407
+ * The past-the-end (most-significant) bit index needed for key comparison
408
+ *
409
+ * @param is_descending
410
+ * Tag indicating whether this is a descending-order sort
411
+ *
412
+ * @param is_keys_only
413
+ * Tag indicating whether this is a keys-only sort
414
+ */
415
+ template <bool DESCENDING, bool KEYS_ONLY, class DecomposerT = detail::identity_decomposer_t>
416
+ _CCCL_DEVICE _CCCL_FORCEINLINE void SortBlocked(
417
+ KeyT (&keys)[ITEMS_PER_THREAD],
418
+ ValueT (&values)[ITEMS_PER_THREAD],
419
+ int begin_bit,
420
+ int end_bit,
421
+ ::cuda::std::bool_constant<DESCENDING> is_descending,
422
+ ::cuda::std::bool_constant<KEYS_ONLY> is_keys_only,
423
+ DecomposerT decomposer = {})
424
+ {
425
+ bit_ordered_type(&unsigned_keys)[ITEMS_PER_THREAD] = reinterpret_cast<bit_ordered_type(&)[ITEMS_PER_THREAD]>(keys);
426
+
427
+ _CCCL_PRAGMA_UNROLL_FULL()
428
+ for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++)
429
+ {
430
+ unsigned_keys[KEY] = bit_ordered_conversion::to_bit_ordered(decomposer, unsigned_keys[KEY]);
431
+ }
432
+
433
+ // Radix sorting passes
434
+ while (true)
435
+ {
436
+ int pass_bits = ::cuda::std::min(RADIX_BITS, end_bit - begin_bit);
437
+ auto digit_extractor =
438
+ traits::template digit_extractor<fundamental_digit_extractor_t>(begin_bit, pass_bits, decomposer);
439
+
440
+ // Rank the blocked keys
441
+ int ranks[ITEMS_PER_THREAD];
442
+ RankKeys(unsigned_keys, ranks, digit_extractor, is_descending);
443
+ begin_bit += RADIX_BITS;
444
+
445
+ __syncthreads();
446
+
447
+ // Exchange keys through shared memory in blocked arrangement
448
+ BlockExchangeKeys(temp_storage.exchange_keys).ScatterToBlocked(keys, ranks);
449
+
450
+ // Exchange values through shared memory in blocked arrangement
451
+ ExchangeValues(values, ranks, is_keys_only, ::cuda::std::true_type());
452
+
453
+ // Quit if done
454
+ if (begin_bit >= end_bit)
455
+ {
456
+ break;
457
+ }
458
+
459
+ __syncthreads();
460
+ }
461
+
462
+ // Untwiddle bits if necessary
463
+ _CCCL_PRAGMA_UNROLL_FULL()
464
+ for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++)
465
+ {
466
+ unsigned_keys[KEY] = bit_ordered_conversion::from_bit_ordered(decomposer, unsigned_keys[KEY]);
467
+ }
468
+ }
469
+
470
+ public:
471
+ #ifndef _CCCL_DOXYGEN_INVOKED // Do not document
472
+
473
+ /**
474
+ * @brief Sort blocked -> striped arrangement
475
+ *
476
+ * @param keys
477
+ * Keys to sort
478
+ *
479
+ * @param values
480
+ * Values to sort
481
+ *
482
+ * @param begin_bit
483
+ * The beginning (least-significant) bit index needed for key comparison
484
+ *
485
+ * @param end_bit
486
+ * The past-the-end (most-significant) bit index needed for key comparison
487
+ *
488
+ * @param is_descending
489
+ * Tag indicating whether this is a descending-order sort
490
+ *
491
+ * @param is_keys_only
492
+ * Tag indicating whether this is a keys-only sort
493
+ */
494
+ template <bool DESCENDING, bool KEYS_ONLY, class DecomposerT = detail::identity_decomposer_t>
495
+ _CCCL_DEVICE _CCCL_FORCEINLINE void SortBlockedToStriped(
496
+ KeyT (&keys)[ITEMS_PER_THREAD],
497
+ ValueT (&values)[ITEMS_PER_THREAD],
498
+ int begin_bit,
499
+ int end_bit,
500
+ ::cuda::std::bool_constant<DESCENDING> is_descending,
501
+ ::cuda::std::bool_constant<KEYS_ONLY> is_keys_only,
502
+ DecomposerT decomposer = {})
503
+ {
504
+ bit_ordered_type(&unsigned_keys)[ITEMS_PER_THREAD] = reinterpret_cast<bit_ordered_type(&)[ITEMS_PER_THREAD]>(keys);
505
+
506
+ _CCCL_PRAGMA_UNROLL_FULL()
507
+ for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++)
508
+ {
509
+ unsigned_keys[KEY] = bit_ordered_conversion::to_bit_ordered(decomposer, unsigned_keys[KEY]);
510
+ }
511
+
512
+ // Radix sorting passes
513
+ while (true)
514
+ {
515
+ int pass_bits = ::cuda::std::min(RADIX_BITS, end_bit - begin_bit);
516
+ auto digit_extractor =
517
+ traits::template digit_extractor<fundamental_digit_extractor_t>(begin_bit, pass_bits, decomposer);
518
+
519
+ // Rank the blocked keys
520
+ int ranks[ITEMS_PER_THREAD];
521
+ RankKeys(unsigned_keys, ranks, digit_extractor, is_descending);
522
+ begin_bit += RADIX_BITS;
523
+
524
+ __syncthreads();
525
+
526
+ // Check if this is the last pass
527
+ if (begin_bit >= end_bit)
528
+ {
529
+ // Last pass exchanges keys through shared memory in striped arrangement
530
+ BlockExchangeKeys(temp_storage.exchange_keys).ScatterToStriped(keys, ranks);
531
+
532
+ // Last pass exchanges through shared memory in striped arrangement
533
+ ExchangeValues(values, ranks, is_keys_only, ::cuda::std::false_type());
534
+
535
+ // Quit
536
+ break;
537
+ }
538
+
539
+ // Exchange keys through shared memory in blocked arrangement
540
+ BlockExchangeKeys(temp_storage.exchange_keys).ScatterToBlocked(keys, ranks);
541
+
542
+ // Exchange values through shared memory in blocked arrangement
543
+ ExchangeValues(values, ranks, is_keys_only, ::cuda::std::true_type());
544
+
545
+ __syncthreads();
546
+ }
547
+
548
+ // Untwiddle bits if necessary
549
+ _CCCL_PRAGMA_UNROLL_FULL()
550
+ for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++)
551
+ {
552
+ unsigned_keys[KEY] = bit_ordered_conversion::from_bit_ordered(decomposer, unsigned_keys[KEY]);
553
+ }
554
+ }
555
+
556
+ #endif // _CCCL_DOXYGEN_INVOKED
557
+
558
+ /// @smemstorage{BlockRadixSort}
559
+ struct TempStorage : Uninitialized<_TempStorage>
560
+ {};
561
+
562
+ //! @name Collective constructors
563
+ //! @{
564
+
565
+ //! @brief Collective constructor using a private static allocation of shared memory as temporary storage.
566
+ _CCCL_DEVICE _CCCL_FORCEINLINE BlockRadixSort()
567
+ : temp_storage(PrivateStorage())
568
+ , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
569
+ {}
570
+
571
+ /**
572
+ * @brief Collective constructor using the specified memory allocation as temporary storage.
573
+ *
574
+ * @param[in] temp_storage
575
+ * Reference to memory allocation having layout type TempStorage
576
+ */
577
+ _CCCL_DEVICE _CCCL_FORCEINLINE BlockRadixSort(TempStorage& temp_storage)
578
+ : temp_storage(temp_storage.Alias())
579
+ , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
580
+ {}
581
+
582
+ //! @} end member group
583
+ //! @name Sorting (blocked arrangements)
584
+ //! @{
585
+
586
+ //! @rst
587
+ //! Performs an ascending block-wide radix sort over a
588
+ //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys.
589
+ //!
590
+ //! - @granularity
591
+ //! - @smemreuse
592
+ //!
593
+ //! Snippet
594
+ //! +++++++
595
+ //!
596
+ //! The code snippet below illustrates a sort of 512 integer keys that
597
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128 threads
598
+ //! where each thread owns 4 consecutive keys.
599
+ //!
600
+ //! .. code-block:: c++
601
+ //!
602
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_radix_sort.cuh>
603
+ //!
604
+ //! __global__ void ExampleKernel(...)
605
+ //! {
606
+ //! // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys each
607
+ //! using BlockRadixSort = cub::BlockRadixSort<int, 128, 4>;
608
+ //!
609
+ //! // Allocate shared memory for BlockRadixSort
610
+ //! __shared__ typename BlockRadixSort::TempStorage temp_storage;
611
+ //!
612
+ //! // Obtain a segment of consecutive items that are blocked across threads
613
+ //! int thread_keys[4];
614
+ //! ...
615
+ //!
616
+ //! // Collectively sort the keys
617
+ //! BlockRadixSort(temp_storage).Sort(thread_keys);
618
+ //!
619
+ //! Suppose the set of input ``thread_keys`` across the block of threads is
620
+ //! ``{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }``.
621
+ //! The corresponding output ``thread_keys`` in those threads will be
622
+ //! ``{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }``.
623
+ //! @endrst
624
+ //!
625
+ //! @param[in,out] keys
626
+ //! Keys to sort
627
+ //!
628
+ //! @param[in] begin_bit
629
+ //! **[optional]** The beginning (least-significant) bit index needed for key comparison
630
+ //!
631
+ //! @param[in] end_bit
632
+ //! **[optional]** The past-the-end (most-significant) bit index needed for key comparison
633
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
634
+ Sort(KeyT (&keys)[ITEMS_PER_THREAD], int begin_bit = 0, int end_bit = sizeof(KeyT) * 8)
635
+ {
636
+ NullType values[ITEMS_PER_THREAD];
637
+
638
+ SortBlocked(keys, values, begin_bit, end_bit, ::cuda::std::false_type(), detail::bool_constant_v<KEYS_ONLY>);
639
+ }
640
+
641
+ //! @rst
642
+ //! Performs an ascending block-wide radix sort over a
643
+ //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys.
644
+ //!
645
+ //! * @granularity
646
+ //! * @smemreuse
647
+ //!
648
+ //! Snippet
649
+ //! ==========================================================================
650
+ //!
651
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
652
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
653
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
654
+ //! tuple of references to relevant members of the key.
655
+ //!
656
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
657
+ //! :language: c++
658
+ //! :dedent:
659
+ //! :start-after: example-begin custom-type
660
+ //! :end-before: example-end custom-type
661
+ //!
662
+ //! The code snippet below illustrates a sort of 2 keys that
663
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 2 threads
664
+ //! where each thread owns 1 key.
665
+ //!
666
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
667
+ //! :language: c++
668
+ //! :dedent:
669
+ //! :start-after: example-begin keys-bits
670
+ //! :end-before: example-end keys-bits
671
+ //!
672
+ //! @endrst
673
+ //!
674
+ //! @tparam DecomposerT
675
+ //! **[inferred]** Type of a callable object responsible for decomposing a
676
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
677
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
678
+ //! The leftmost element of the tuple is considered the most significant.
679
+ //! The call operator must not modify members of the key.
680
+ //!
681
+ //! @param[in,out] keys
682
+ //! Keys to sort
683
+ //!
684
+ //! @param decomposer
685
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
686
+ //! references to its constituent arithmetic types. The leftmost element of
687
+ //! the tuple is considered the most significant. The call operator must not
688
+ //! modify members of the key.
689
+ //!
690
+ //! @param[in] begin_bit
691
+ //! The least-significant bit index (inclusive) needed for
692
+ //! key comparison
693
+ //!
694
+ //! @param[in] end_bit
695
+ //! The most-significant bit index (exclusive) needed for key
696
+ //! comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`)
697
+ template <class DecomposerT>
698
+ _CCCL_DEVICE _CCCL_FORCEINLINE //
699
+ ::cuda::std::enable_if_t< //
700
+ !::cuda::std::is_convertible_v<DecomposerT, int>>
701
+ Sort(KeyT (&keys)[ITEMS_PER_THREAD], DecomposerT decomposer, int begin_bit, int end_bit)
702
+ {
703
+ NullType values[ITEMS_PER_THREAD];
704
+
705
+ SortBlocked(
706
+ keys, values, begin_bit, end_bit, ::cuda::std::false_type(), detail::bool_constant_v<KEYS_ONLY>, decomposer);
707
+ }
708
+
709
+ //! @rst
710
+ //! Performs an ascending block-wide radix sort over a
711
+ //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys.
712
+ //!
713
+ //! * @granularity
714
+ //! * @smemreuse
715
+ //!
716
+ //! Snippet
717
+ //! ==========================================================================
718
+ //!
719
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
720
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
721
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
722
+ //! tuple of references to relevant members of the key.
723
+ //!
724
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
725
+ //! :language: c++
726
+ //! :dedent:
727
+ //! :start-after: example-begin custom-type
728
+ //! :end-before: example-end custom-type
729
+ //!
730
+ //! The code snippet below illustrates a sort of 6 keys that
731
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 2 threads
732
+ //! where each thread owns 3 consecutive keys.
733
+ //!
734
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
735
+ //! :language: c++
736
+ //! :dedent:
737
+ //! :start-after: example-begin keys
738
+ //! :end-before: example-end keys
739
+ //!
740
+ //! @endrst
741
+ //!
742
+ //! @tparam DecomposerT
743
+ //! **[inferred]** Type of a callable object responsible for decomposing a
744
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
745
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
746
+ //! The leftmost element of the tuple is considered the most significant.
747
+ //! The call operator must not modify members of the key.
748
+ //!
749
+ //! @param[in,out] keys
750
+ //! Keys to sort
751
+ //!
752
+ //! @param decomposer
753
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
754
+ //! references to its constituent arithmetic types. The leftmost element of
755
+ //! the tuple is considered the most significant. The call operator must not
756
+ //! modify members of the key.
757
+ template <class DecomposerT>
758
+ _CCCL_DEVICE _CCCL_FORCEINLINE //
759
+ ::cuda::std::enable_if_t< //
760
+ !::cuda::std::is_convertible_v<DecomposerT, int>>
761
+ Sort(KeyT (&keys)[ITEMS_PER_THREAD], DecomposerT decomposer)
762
+ {
763
+ Sort(keys, decomposer, 0, detail::radix::traits_t<KeyT>::default_end_bit(decomposer));
764
+ }
765
+
766
+ //! @rst
767
+ //! Performs an ascending block-wide radix sort across a :ref:`blocked arrangement <flexible-data-arrangement>`
768
+ //! of keys and values.
769
+ //!
770
+ //! - BlockRadixSort can only accommodate one associated tile of values. To "truck along"
771
+ //! more than one tile of values, simply perform a key-value sort of the keys paired
772
+ //! with a temporary value array that enumerates the key indices. The reordered indices
773
+ //! can then be used as a gather-vector for exchanging other associated tile data through
774
+ //! shared memory.
775
+ //! - @granularity
776
+ //! - @smemreuse
777
+ //!
778
+ //! Snippet
779
+ //! +++++++
780
+ //!
781
+ //! The code snippet below illustrates a sort of 512 integer keys and values that
782
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128 threads
783
+ //! where each thread owns 4 consecutive pairs.
784
+ //!
785
+ //! .. code-block:: c++
786
+ //!
787
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_radix_sort.cuh>
788
+ //!
789
+ //! __global__ void ExampleKernel(...)
790
+ //! {
791
+ //! // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and values each
792
+ //! using BlockRadixSort = cub::BlockRadixSort<int, 128, 4, int>;
793
+ //!
794
+ //! // Allocate shared memory for BlockRadixSort
795
+ //! __shared__ typename BlockRadixSort::TempStorage temp_storage;
796
+ //!
797
+ //! // Obtain a segment of consecutive items that are blocked across threads
798
+ //! int thread_keys[4];
799
+ //! int thread_values[4];
800
+ //! ...
801
+ //!
802
+ //! // Collectively sort the keys and values among block threads
803
+ //! BlockRadixSort(temp_storage).Sort(thread_keys, thread_values);
804
+ //!
805
+ //! @endcode
806
+ //! @par
807
+ //! Suppose the set of input ``thread_keys`` across the block of threads is
808
+ //! ``{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }``. The
809
+ //! corresponding output ``thread_keys`` in those threads will be
810
+ //! ``{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }``.
811
+ //!
812
+ //! @endrst
813
+ //!
814
+ //! @param[in,out] keys
815
+ //! Keys to sort
816
+ //!
817
+ //! @param[in,out] values
818
+ //! Values to sort
819
+ //!
820
+ //! @param[in] begin_bit
821
+ //! **[optional]** The beginning (least-significant) bit index needed for key comparison
822
+ //!
823
+ //! @param[in] end_bit
824
+ //! **[optional]** The past-the-end (most-significant) bit index needed for key comparison
825
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
826
+ Sort(KeyT (&keys)[ITEMS_PER_THREAD],
827
+ ValueT (&values)[ITEMS_PER_THREAD],
828
+ int begin_bit = 0,
829
+ int end_bit = sizeof(KeyT) * 8)
830
+ {
831
+ SortBlocked(keys, values, begin_bit, end_bit, ::cuda::std::false_type(), detail::bool_constant_v<KEYS_ONLY>);
832
+ }
833
+
834
+ //! @rst
835
+ //! Performs an ascending block-wide radix sort over a
836
+ //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys and values.
837
+ //!
838
+ //! * BlockRadixSort can only accommodate one associated tile of values. To "truck along"
839
+ //! more than one tile of values, simply perform a key-value sort of the keys paired
840
+ //! with a temporary value array that enumerates the key indices. The reordered indices
841
+ //! can then be used as a gather-vector for exchanging other associated tile data through
842
+ //! shared memory.
843
+ //! * @granularity
844
+ //! * @smemreuse
845
+ //!
846
+ //! Snippet
847
+ //! ==========================================================================
848
+ //!
849
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
850
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
851
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
852
+ //! tuple of references to relevant members of the key.
853
+ //!
854
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
855
+ //! :language: c++
856
+ //! :dedent:
857
+ //! :start-after: example-begin custom-type
858
+ //! :end-before: example-end custom-type
859
+ //!
860
+ //! The code snippet below illustrates a sort of 2 keys and values that
861
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 2 threads
862
+ //! where each thread owns 1 pair.
863
+ //!
864
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
865
+ //! :language: c++
866
+ //! :dedent:
867
+ //! :start-after: example-begin pairs-bits
868
+ //! :end-before: example-end pairs-bits
869
+ //!
870
+ //! @endrst
871
+ //!
872
+ //! @tparam DecomposerT
873
+ //! **[inferred]** Type of a callable object responsible for decomposing a
874
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
875
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
876
+ //! The leftmost element of the tuple is considered the most significant.
877
+ //! The call operator must not modify members of the key.
878
+ //!
879
+ //! @param[in,out] keys
880
+ //! Keys to sort
881
+ //!
882
+ //! @param[in,out] values
883
+ //! Values to sort
884
+ //!
885
+ //! @param decomposer
886
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
887
+ //! references to its constituent arithmetic types. The leftmost element of
888
+ //! the tuple is considered the most significant. The call operator must not
889
+ //! modify members of the key.
890
+ //!
891
+ //! @param[in] begin_bit
892
+ //! The least-significant bit index (inclusive) needed for
893
+ //! key comparison
894
+ //!
895
+ //! @param[in] end_bit
896
+ //! The most-significant bit index (exclusive) needed for key
897
+ //! comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`)
898
+ template <class DecomposerT>
899
+ _CCCL_DEVICE _CCCL_FORCEINLINE //
900
+ ::cuda::std::enable_if_t< //
901
+ !::cuda::std::is_convertible_v<DecomposerT, int>>
902
+ Sort(KeyT (&keys)[ITEMS_PER_THREAD],
903
+ ValueT (&values)[ITEMS_PER_THREAD],
904
+ DecomposerT decomposer,
905
+ int begin_bit,
906
+ int end_bit)
907
+ {
908
+ SortBlocked(
909
+ keys, values, begin_bit, end_bit, ::cuda::std::false_type(), detail::bool_constant_v<KEYS_ONLY>, decomposer);
910
+ }
911
+
912
+ //! @rst
913
+ //! Performs an ascending block-wide radix sort over a
914
+ //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys and values.
915
+ //!
916
+ //! * BlockRadixSort can only accommodate one associated tile of values. To "truck along"
917
+ //! more than one tile of values, simply perform a key-value sort of the keys paired
918
+ //! with a temporary value array that enumerates the key indices. The reordered indices
919
+ //! can then be used as a gather-vector for exchanging other associated tile data through
920
+ //! shared memory.
921
+ //! * @granularity
922
+ //! * @smemreuse
923
+ //!
924
+ //! Snippet
925
+ //! ==========================================================================
926
+ //!
927
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
928
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
929
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
930
+ //! tuple of references to relevant members of the key.
931
+ //!
932
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
933
+ //! :language: c++
934
+ //! :dedent:
935
+ //! :start-after: example-begin custom-type
936
+ //! :end-before: example-end custom-type
937
+ //!
938
+ //! The code snippet below illustrates a sort of 6 keys and values that
939
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 2 threads
940
+ //! where each thread owns 3 consecutive pairs.
941
+ //!
942
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
943
+ //! :language: c++
944
+ //! :dedent:
945
+ //! :start-after: example-begin pairs
946
+ //! :end-before: example-end pairs
947
+ //!
948
+ //! @endrst
949
+ //!
950
+ //! @tparam DecomposerT
951
+ //! **[inferred]** Type of a callable object responsible for decomposing a
952
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
953
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
954
+ //! The leftmost element of the tuple is considered the most significant.
955
+ //! The call operator must not modify members of the key.
956
+ //!
957
+ //! @param[in,out] keys
958
+ //! Keys to sort
959
+ //!
960
+ //! @param[in,out] values
961
+ //! Values to sort
962
+ //!
963
+ //! @param decomposer
964
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
965
+ //! references to its constituent arithmetic types. The leftmost element of
966
+ //! the tuple is considered the most significant. The call operator must not
967
+ //! modify members of the key.
968
+ template <class DecomposerT>
969
+ _CCCL_DEVICE _CCCL_FORCEINLINE //
970
+ ::cuda::std::enable_if_t< //
971
+ !::cuda::std::is_convertible_v<DecomposerT, int>>
972
+ Sort(KeyT (&keys)[ITEMS_PER_THREAD], ValueT (&values)[ITEMS_PER_THREAD], DecomposerT decomposer)
973
+ {
974
+ Sort(keys, values, decomposer, 0, detail::radix::traits_t<KeyT>::default_end_bit(decomposer));
975
+ }
976
+
977
+ //! @rst
978
+ //! Performs a descending block-wide radix sort over a :ref:`blocked arrangement <flexible-data-arrangement>`
979
+ //! of keys.
980
+ //!
981
+ //! - @granularity
982
+ //! - @smemreuse
983
+ //!
984
+ //! Snippet
985
+ //! +++++++
986
+ //!
987
+ //! The code snippet below illustrates a sort of 512 integer keys that
988
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128 threads
989
+ //! where each thread owns 4 consecutive keys.
990
+ //!
991
+ //! .. code-block:: c++
992
+ //!
993
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_radix_sort.cuh>
994
+ //!
995
+ //! __global__ void ExampleKernel(...)
996
+ //! {
997
+ //! // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys each
998
+ //! using BlockRadixSort = cub::BlockRadixSort<int, 128, 4>;
999
+ //!
1000
+ //! // Allocate shared memory for BlockRadixSort
1001
+ //! __shared__ typename BlockRadixSort::TempStorage temp_storage;
1002
+ //!
1003
+ //! // Obtain a segment of consecutive items that are blocked across threads
1004
+ //! int thread_keys[4];
1005
+ //! ...
1006
+ //!
1007
+ //! // Collectively sort the keys
1008
+ //! BlockRadixSort(temp_storage).SortDescending(thread_keys);
1009
+ //!
1010
+ //! Suppose the set of input ``thread_keys`` across the block of threads is
1011
+ //! ``{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }``.
1012
+ //! The corresponding output ``thread_keys`` in those threads will be
1013
+ //! ``{ [511,510,509,508], [507,506,505,504], [503,502,501,500], ..., [3,2,1,0] }``.
1014
+ //!
1015
+ //! @endrst
1016
+ //!
1017
+ //! @param[in,out] keys
1018
+ //! Keys to sort
1019
+ //!
1020
+ //! @param[in] begin_bit
1021
+ //! **[optional]** The beginning (least-significant) bit index needed for key comparison
1022
+ //!
1023
+ //! @param[in] end_bit
1024
+ //! **[optional]** The past-the-end (most-significant) bit index needed for key comparison
1025
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
1026
+ SortDescending(KeyT (&keys)[ITEMS_PER_THREAD], int begin_bit = 0, int end_bit = sizeof(KeyT) * 8) // default bit range is [0, sizeof(KeyT) * 8)
1027
+ {
1028
+ NullType values[ITEMS_PER_THREAD]; // keys-only overload: a NullType array fills the values argument of SortBlocked
1029
+
1030
+ SortBlocked(keys, values, begin_bit, end_bit, ::cuda::std::true_type(), detail::bool_constant_v<KEYS_ONLY>); // true_type here vs. false_type in the ascending overloads -- presumably the "descending" tag
1031
+ }
1032
+
1033
+ //! @rst
1034
+ //! Performs a descending block-wide radix sort over a
1035
+ //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys.
1036
+ //!
1037
+ //! * @granularity
1038
+ //! * @smemreuse
1039
+ //!
1040
+ //! Snippet
1041
+ //! ==========================================================================
1042
+ //!
1043
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
1044
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
1045
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
1046
+ //! tuple of references to relevant members of the key.
1047
+ //!
1048
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
1049
+ //! :language: c++
1050
+ //! :dedent:
1051
+ //! :start-after: example-begin custom-type
1052
+ //! :end-before: example-end custom-type
1053
+ //!
1054
+ //! The code snippet below illustrates a sort of 2 keys that
1055
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 2 threads
1056
+ //! where each thread owns 1 key.
1057
+ //!
1058
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
1059
+ //! :language: c++
1060
+ //! :dedent:
1061
+ //! :start-after: example-begin keys-descending-bits
1062
+ //! :end-before: example-end keys-descending-bits
1063
+ //!
1064
+ //! @endrst
1065
+ //!
1066
+ //! @tparam DecomposerT
1067
+ //! **[inferred]** Type of a callable object responsible for decomposing a
1068
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
1069
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
1070
+ //! The leftmost element of the tuple is considered the most significant.
1071
+ //! The call operator must not modify members of the key.
1072
+ //!
1073
+ //! @param[in,out] keys
1074
+ //! Keys to sort
1075
+ //!
1076
+ //! @param decomposer
1077
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
1078
+ //! references to its constituent arithmetic types. The leftmost element of
1079
+ //! the tuple is considered the most significant. The call operator must not
1080
+ //! modify members of the key.
1081
+ //!
1082
+ //! @param[in] begin_bit
1083
+ //! The least-significant bit index (inclusive) needed for
1084
+ //! key comparison
1085
+ //!
1086
+ //! @param[in] end_bit
1087
+ //! The most-significant bit index (exclusive) needed for key
1088
+ //! comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`)
1089
+ template <class DecomposerT>
1090
+ _CCCL_DEVICE _CCCL_FORCEINLINE //
1091
+ ::cuda::std::enable_if_t< //
1092
+ !::cuda::std::is_convertible_v<DecomposerT, int>> // overload participates only when DecomposerT does not convert to int; return type is void
1093
+ SortDescending(KeyT (&keys)[ITEMS_PER_THREAD], DecomposerT decomposer, int begin_bit, int end_bit)
1094
+ {
1095
+ NullType values[ITEMS_PER_THREAD]; // keys-only overload: a NullType array fills the values argument of SortBlocked
1096
+
1097
+ SortBlocked(
1098
+ keys, values, begin_bit, end_bit, ::cuda::std::true_type(), detail::bool_constant_v<KEYS_ONLY>, decomposer); // caller-supplied bit range; true_type is presumably the "descending" tag
1099
+ }
1100
+
1101
+ //! @rst
1102
+ //! Performs a descending block-wide radix sort over a
1103
+ //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys.
1104
+ //!
1105
+ //! * @granularity
1106
+ //! * @smemreuse
1107
+ //!
1108
+ //! Snippet
1109
+ //! ==========================================================================
1110
+ //!
1111
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
1112
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
1113
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
1114
+ //! tuple of references to relevant members of the key.
1115
+ //!
1116
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
1117
+ //! :language: c++
1118
+ //! :dedent:
1119
+ //! :start-after: example-begin custom-type
1120
+ //! :end-before: example-end custom-type
1121
+ //!
1122
+ //! The code snippet below illustrates a sort of 6 keys that
1123
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 2 threads
1124
+ //! where each thread owns 3 consecutive keys.
1125
+ //!
1126
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
1127
+ //! :language: c++
1128
+ //! :dedent:
1129
+ //! :start-after: example-begin keys-descending
1130
+ //! :end-before: example-end keys-descending
1131
+ //!
1132
+ //! @endrst
1133
+ //!
1134
+ //! @tparam DecomposerT
1135
+ //! **[inferred]** Type of a callable object responsible for decomposing a
1136
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
1137
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
1138
+ //! The leftmost element of the tuple is considered the most significant.
1139
+ //! The call operator must not modify members of the key.
1140
+ //!
1141
+ //! @param[in,out] keys
1142
+ //! Keys to sort
1143
+ //!
1144
+ //! @param decomposer
1145
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
1146
+ //! references to its constituent arithmetic types. The leftmost element of
1147
+ //! the tuple is considered the most significant. The call operator must not
1148
+ //! modify members of the key.
1149
+ template <class DecomposerT>
1150
+ _CCCL_DEVICE _CCCL_FORCEINLINE //
1151
+ ::cuda::std::enable_if_t< //
1152
+ !::cuda::std::is_convertible_v<DecomposerT, int>> // overload participates only when DecomposerT does not convert to int; return type is void
1153
+ SortDescending(KeyT (&keys)[ITEMS_PER_THREAD], DecomposerT decomposer)
1154
+ {
1155
+ NullType values[ITEMS_PER_THREAD]; // keys-only overload: a NullType array fills the values argument of SortBlocked
1156
+
1157
+ SortBlocked(
1158
+ keys,
1159
+ values,
1160
+ 0, // begin bit: no bit range is taken from the caller in this overload
1161
+ detail::radix::traits_t<KeyT>::default_end_bit(decomposer), // end bit is derived from the decomposer
1162
+ ::cuda::std::true_type(), // presumably the "descending" tag (ascending overloads pass false_type)
1163
+ detail::bool_constant_v<KEYS_ONLY>,
1164
+ decomposer);
1165
+ }
1166
+
1167
+ //! @rst
1168
+ //! Performs a descending block-wide radix sort across a :ref:`blocked arrangement <flexible-data-arrangement>`
1169
+ //! of keys and values.
1170
+ //!
1171
+ //! - BlockRadixSort can only accommodate one associated tile of values. To "truck along"
1172
+ //! more than one tile of values, simply perform a key-value sort of the keys paired
1173
+ //! with a temporary value array that enumerates the key indices. The reordered indices
1174
+ //! can then be used as a gather-vector for exchanging other associated tile data through
1175
+ //! shared memory.
1176
+ //! - @granularity
1177
+ //! - @smemreuse
1178
+ //!
1179
+ //! Snippet
1180
+ //! +++++++
1181
+ //!
1182
+ //! The code snippet below illustrates a sort of 512 integer keys and values that
1183
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128 threads
1184
+ //! where each thread owns 4 consecutive pairs.
1185
+ //!
1186
+ //! .. code-block:: c++
1187
+ //!
1188
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_radix_sort.cuh>
1189
+ //!
1190
+ //! __global__ void ExampleKernel(...)
1191
+ //! {
1192
+ //! // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and values each
1193
+ //! using BlockRadixSort = cub::BlockRadixSort<int, 128, 4, int>;
1194
+ //!
1195
+ //! // Allocate shared memory for BlockRadixSort
1196
+ //! __shared__ typename BlockRadixSort::TempStorage temp_storage;
1197
+ //!
1198
+ //! // Obtain a segment of consecutive items that are blocked across threads
1199
+ //! int thread_keys[4];
1200
+ //! int thread_values[4];
1201
+ //! ...
1202
+ //!
1203
+ //! // Collectively sort the keys and values among block threads
1204
+ //! BlockRadixSort(temp_storage).SortDescending(thread_keys, thread_values);
1205
+ //!
1206
+ //! Suppose the set of input ``thread_keys`` across the block of threads is
1207
+ //! ``{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }``. The
1208
+ //! corresponding output ``thread_keys`` in those threads will be
1209
+ //! ``{ [511,510,509,508], [507,506,505,504], [503,502,501,500], ..., [3,2,1,0] }``.
1210
+ //!
1211
+ //! @endrst
1212
+ //!
1213
+ //! @param[in,out] keys
1214
+ //! Keys to sort
1215
+ //!
1216
+ //! @param[in,out] values
1217
+ //! Values to sort
1218
+ //!
1219
+ //! @param[in] begin_bit
1220
+ //! **[optional]** The beginning (least-significant) bit index needed for key comparison
1221
+ //!
1222
+ //! @param[in] end_bit
1223
+ //! **[optional]** The past-the-end (most-significant) bit index needed for key comparison
1224
+ _CCCL_DEVICE _CCCL_FORCEINLINE void SortDescending(
1225
+ KeyT (&keys)[ITEMS_PER_THREAD],
1226
+ ValueT (&values)[ITEMS_PER_THREAD],
1227
+ int begin_bit = 0,
1228
+ int end_bit = sizeof(KeyT) * 8) // default bit range is [0, sizeof(KeyT) * 8)
1229
+ {
1230
+ SortBlocked(keys, values, begin_bit, end_bit, ::cuda::std::true_type(), detail::bool_constant_v<KEYS_ONLY>); // key-value overload: caller's values are forwarded as-is; true_type is presumably the "descending" tag
1231
+ }
1232
+
1233
+ //! @rst
1234
+ //! Performs a descending block-wide radix sort over a
1235
+ //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys and values.
1236
+ //!
1237
+ //! * BlockRadixSort can only accommodate one associated tile of values. To "truck along"
1238
+ //! more than one tile of values, simply perform a key-value sort of the keys paired
1239
+ //! with a temporary value array that enumerates the key indices. The reordered indices
1240
+ //! can then be used as a gather-vector for exchanging other associated tile data through
1241
+ //! shared memory.
1242
+ //! * @granularity
1243
+ //! * @smemreuse
1244
+ //!
1245
+ //! Snippet
1246
+ //! ==========================================================================
1247
+ //!
1248
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
1249
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
1250
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
1251
+ //! tuple of references to relevant members of the key.
1252
+ //!
1253
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
1254
+ //! :language: c++
1255
+ //! :dedent:
1256
+ //! :start-after: example-begin custom-type
1257
+ //! :end-before: example-end custom-type
1258
+ //!
1259
+ //! The code snippet below illustrates a sort of 2 pairs that
1260
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 2 threads
1261
+ //! where each thread owns 1 pair.
1262
+ //!
1263
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
1264
+ //! :language: c++
1265
+ //! :dedent:
1266
+ //! :start-after: example-begin pairs-descending-bits
1267
+ //! :end-before: example-end pairs-descending-bits
1268
+ //!
1269
+ //! @endrst
1270
+ //!
1271
+ //! @tparam DecomposerT
1272
+ //! **[inferred]** Type of a callable object responsible for decomposing a
1273
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
1274
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
1275
+ //! The leftmost element of the tuple is considered the most significant.
1276
+ //! The call operator must not modify members of the key.
1277
+ //!
1278
+ //! @param[in,out] keys
1279
+ //! Keys to sort
1280
+ //!
1281
+ //! @param[in,out] values
1282
+ //! Values to sort
1283
+ //!
1284
+ //! @param decomposer
1285
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
1286
+ //! references to its constituent arithmetic types. The leftmost element of
1287
+ //! the tuple is considered the most significant. The call operator must not
1288
+ //! modify members of the key.
1289
+ //!
1290
+ //! @param[in] begin_bit
1291
+ //! The least-significant bit index (inclusive) needed for
1292
+ //! key comparison
1293
+ //!
1294
+ //! @param[in] end_bit
1295
+ //! The most-significant bit index (exclusive) needed for key
1296
+ //! comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`)
1297
+ template <class DecomposerT>
1298
+ _CCCL_DEVICE _CCCL_FORCEINLINE //
1299
+ ::cuda::std::enable_if_t< //
1300
+ !::cuda::std::is_convertible_v<DecomposerT, int>> // overload participates only when DecomposerT does not convert to int; return type is void
1301
+ SortDescending(KeyT (&keys)[ITEMS_PER_THREAD],
1302
+ ValueT (&values)[ITEMS_PER_THREAD],
1303
+ DecomposerT decomposer,
1304
+ int begin_bit,
1305
+ int end_bit)
1306
+ {
1307
+ SortBlocked(
1308
+ keys, values, begin_bit, end_bit, ::cuda::std::true_type(), detail::bool_constant_v<KEYS_ONLY>, decomposer); // caller-supplied bit range and values; true_type is presumably the "descending" tag
1309
+ }
1310
+
1311
+ //! @rst
1312
+ //! Performs a descending block-wide radix sort over a
1313
+ //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys and values.
1314
+ //!
1315
+ //! * BlockRadixSort can only accommodate one associated tile of values. To "truck along"
1316
+ //! more than one tile of values, simply perform a key-value sort of the keys paired
1317
+ //! with a temporary value array that enumerates the key indices. The reordered indices
1318
+ //! can then be used as a gather-vector for exchanging other associated tile data through
1319
+ //! shared memory.
1320
+ //! * @granularity
1321
+ //! * @smemreuse
1322
+ //!
1323
+ //! Snippet
1324
+ //! ==========================================================================
1325
+ //!
1326
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
1327
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
1328
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
1329
+ //! tuple of references to relevant members of the key.
1330
+ //!
1331
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
1332
+ //! :language: c++
1333
+ //! :dedent:
1334
+ //! :start-after: example-begin custom-type
1335
+ //! :end-before: example-end custom-type
1336
+ //!
1337
+ //! The code snippet below illustrates a sort of 6 keys and values that
1338
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 2 threads
1339
+ //! where each thread owns 3 consecutive pairs.
1340
+ //!
1341
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
1342
+ //! :language: c++
1343
+ //! :dedent:
1344
+ //! :start-after: example-begin pairs-descending
1345
+ //! :end-before: example-end pairs-descending
1346
+ //!
1347
+ //! @endrst
1348
+ //!
1349
+ //! @tparam DecomposerT
1350
+ //! **[inferred]** Type of a callable object responsible for decomposing a
1351
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
1352
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
1353
+ //! The leftmost element of the tuple is considered the most significant.
1354
+ //! The call operator must not modify members of the key.
1355
+ //!
1356
+ //! @param[in,out] keys
1357
+ //! Keys to sort
1358
+ //!
1359
+ //! @param[in,out] values
1360
+ //! Values to sort
1361
+ //!
1362
+ //! @param decomposer
1363
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
1364
+ //! references to its constituent arithmetic types. The leftmost element of
1365
+ //! the tuple is considered the most significant. The call operator must not
1366
+ //! modify members of the key.
1367
+ template <class DecomposerT>
1368
+ _CCCL_DEVICE _CCCL_FORCEINLINE //
1369
+ ::cuda::std::enable_if_t< //
1370
+ !::cuda::std::is_convertible_v<DecomposerT, int>> // overload participates only when DecomposerT does not convert to int; return type is void
1371
+ SortDescending(KeyT (&keys)[ITEMS_PER_THREAD], ValueT (&values)[ITEMS_PER_THREAD], DecomposerT decomposer)
1372
+ {
1373
+ SortBlocked(
1374
+ keys,
1375
+ values,
1376
+ 0, // begin bit: no bit range is taken from the caller in this overload
1377
+ detail::radix::traits_t<KeyT>::default_end_bit(decomposer), // end bit is derived from the decomposer
1378
+ ::cuda::std::true_type(), // presumably the "descending" tag (ascending overloads pass false_type)
1379
+ detail::bool_constant_v<KEYS_ONLY>,
1380
+ decomposer);
1381
+ }
1382
+
1383
+ //! @} end member group
1384
+ //! @name Sorting (blocked arrangement -> striped arrangement)
1385
+ //! @{
1386
+
1387
+ //! @rst
1388
+ //! Performs an ascending radix sort across a :ref:`blocked arrangement <flexible-data-arrangement>` of keys,
1389
+ //! leaving them in a :ref:`striped arrangement <flexible-data-arrangement>`.
1390
+ //!
1391
+ //! - @granularity
1392
+ //! - @smemreuse
1393
+ //!
1394
+ //! Snippet
1395
+ //! +++++++
1396
+ //!
1397
+ //! The code snippet below illustrates a sort of 512 integer keys that
1398
+ //! are initially partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128
1399
+ //! threads where each thread owns 4 consecutive keys. The final partitioning is striped.
1400
+ //!
1401
+ //! .. code-block:: c++
1402
+ //!
1403
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_radix_sort.cuh>
1404
+ //!
1405
+ //! __global__ void ExampleKernel(...)
1406
+ //! {
1407
+ //! // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys each
1408
+ //! using BlockRadixSort = cub::BlockRadixSort<int, 128, 4>;
1409
+ //!
1410
+ //! // Allocate shared memory for BlockRadixSort
1411
+ //! __shared__ typename BlockRadixSort::TempStorage temp_storage;
1412
+ //!
1413
+ //! // Obtain a segment of consecutive items that are blocked across threads
1414
+ //! int thread_keys[4];
1415
+ //! ...
1416
+ //!
1417
+ //! // Collectively sort the keys
1418
+ //! BlockRadixSort(temp_storage).SortBlockedToStriped(thread_keys);
1419
+ //!
1420
+ //! Suppose the set of input ``thread_keys`` across the block of threads is
1421
+ //! ``{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }``.
1422
+ //! The corresponding output ``thread_keys`` in those threads will be
1423
+ //! ``{ [0,128,256,384], [1,129,257,385], [2,130,258,386], ..., [127,255,383,511] }``.
1424
+ //!
1425
+ //! @endrst
1426
+ //!
1427
+ //! @param[in,out] keys
1428
+ //! Keys to sort
1429
+ //!
1430
+ //! @param[in] begin_bit
1431
+ //! **[optional]** The beginning (least-significant) bit index needed for key comparison
1432
+ //!
1433
+ //! @param[in] end_bit
1434
+ //! **[optional]** The past-the-end (most-significant) bit index needed for key comparison
1435
+ _CCCL_DEVICE _CCCL_FORCEINLINE void
1436
+ SortBlockedToStriped(KeyT (&keys)[ITEMS_PER_THREAD], int begin_bit = 0, int end_bit = sizeof(KeyT) * 8) // default bit range is [0, sizeof(KeyT) * 8)
1437
+ {
1438
+ NullType values[ITEMS_PER_THREAD]; // keys-only overload: a NullType array fills the values argument
1439
+
1440
+ SortBlockedToStriped( // forwards to a six-argument overload of the same name that is not visible in this excerpt
1441
+ keys, values, begin_bit, end_bit, ::cuda::std::false_type(), detail::bool_constant_v<KEYS_ONLY>); // false_type here vs. true_type in SortDescending -- presumably the "ascending" tag
1442
+ }
1443
+
1444
+ //! @rst
1445
+ //! Performs an ascending block-wide radix sort over a
1446
+ //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys, leaving them in a
1447
+ //! :ref:`striped arrangement <flexible-data-arrangement>`.
1448
+ //!
1449
+ //! * @granularity
1450
+ //! * @smemreuse
1451
+ //!
1452
+ //! Snippet
1453
+ //! ==========================================================================
1454
+ //!
1455
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
1456
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
1457
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
1458
+ //! tuple of references to relevant members of the key.
1459
+ //!
1460
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
1461
+ //! :language: c++
1462
+ //! :dedent:
1463
+ //! :start-after: example-begin custom-type
1464
+ //! :end-before: example-end custom-type
1465
+ //!
1466
+ //! The code snippet below illustrates a sort of 4 keys that
1467
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 2 threads
1468
+ //! where each thread owns 2 consecutive keys. The final partitioning is striped.
1469
+ //!
1470
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
1471
+ //! :language: c++
1472
+ //! :dedent:
1473
+ //! :start-after: example-begin keys-striped-bits
1474
+ //! :end-before: example-end keys-striped-bits
1475
+ //!
1476
+ //! @endrst
1477
+ //!
1478
+ //! @tparam DecomposerT
1479
+ //! **[inferred]** Type of a callable object responsible for decomposing a
1480
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
1481
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
1482
+ //! The leftmost element of the tuple is considered the most significant.
1483
+ //! The call operator must not modify members of the key.
1484
+ //!
1485
+ //! @param[in,out] keys
1486
+ //! Keys to sort
1487
+ //!
1488
+ //! @param decomposer
1489
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
1490
+ //! references to its constituent arithmetic types. The leftmost element of
1491
+ //! the tuple is considered the most significant. The call operator must not
1492
+ //! modify members of the key.
1493
+ //!
1494
+ //! @param[in] begin_bit
1495
+ //! The least-significant bit index (inclusive) needed for
1496
+ //! key comparison
1497
+ //!
1498
+ //! @param[in] end_bit
1499
+ //! The most-significant bit index (exclusive) needed for key
1500
+ //! comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`)
1501
+ template <class DecomposerT>
1502
+ _CCCL_DEVICE _CCCL_FORCEINLINE //
1503
+ ::cuda::std::enable_if_t< //
1504
+ !::cuda::std::is_convertible_v<DecomposerT, int>> // overload participates only when DecomposerT does not convert to int; return type is void
1505
+ SortBlockedToStriped(KeyT (&keys)[ITEMS_PER_THREAD], DecomposerT decomposer, int begin_bit, int end_bit)
1506
+ {
1507
+ NullType values[ITEMS_PER_THREAD]; // keys-only overload: a NullType array fills the values argument
1508
+
1509
+ SortBlockedToStriped( // forwards to a seven-argument overload of the same name that is not visible in this excerpt
1510
+ keys, values, begin_bit, end_bit, ::cuda::std::false_type(), detail::bool_constant_v<KEYS_ONLY>, decomposer); // false_type is presumably the "ascending" tag
1511
+ }
1512
+
1513
+ //! @rst
1514
+ //! Performs an ascending block-wide radix sort over a
1515
+ //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys, leaving them in a
1516
+ //! :ref:`striped arrangement <flexible-data-arrangement>`.
1517
+ //!
1518
+ //! * @granularity
1519
+ //! * @smemreuse
1520
+ //!
1521
+ //! Snippet
1522
+ //! ==========================================================================
1523
+ //!
1524
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
1525
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
1526
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
1527
+ //! tuple of references to relevant members of the key.
1528
+ //!
1529
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
1530
+ //! :language: c++
1531
+ //! :dedent:
1532
+ //! :start-after: example-begin custom-type
1533
+ //! :end-before: example-end custom-type
1534
+ //!
1535
+ //! The code snippet below illustrates a sort of 6 keys that
1536
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 2 threads
1537
+ //! where each thread owns 3 consecutive keys. The final partitioning is striped.
1538
+ //!
1539
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
1540
+ //! :language: c++
1541
+ //! :dedent:
1542
+ //! :start-after: example-begin keys-striped
1543
+ //! :end-before: example-end keys-striped
1544
+ //!
1545
+ //! @endrst
1546
+ //!
1547
+ //! @tparam DecomposerT
1548
+ //! **[inferred]** Type of a callable object responsible for decomposing a
1549
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
1550
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
1551
+ //! The leftmost element of the tuple is considered the most significant.
1552
+ //! The call operator must not modify members of the key.
1553
+ //!
1554
+ //! @param[in,out] keys
1555
+ //! Keys to sort
1556
+ //!
1557
+ //! @param decomposer
1558
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
1559
+ //! references to its constituent arithmetic types. The leftmost element of
1560
+ //! the tuple is considered the most significant. The call operator must not
1561
+ //! modify members of the key.
1562
+ template <class DecomposerT>
1563
+ _CCCL_DEVICE _CCCL_FORCEINLINE //
1564
+ ::cuda::std::enable_if_t< //
1565
+ !::cuda::std::is_convertible_v<DecomposerT, int>> // overload participates only when DecomposerT does not convert to int; return type is void
1566
+ SortBlockedToStriped(KeyT (&keys)[ITEMS_PER_THREAD], DecomposerT decomposer)
1567
+ {
1568
+ NullType values[ITEMS_PER_THREAD]; // keys-only overload: a NullType array fills the values argument
1569
+
1570
+ SortBlockedToStriped( // forwards to a seven-argument overload of the same name that is not visible in this excerpt
1571
+ keys,
1572
+ values,
1573
+ 0, // begin bit: no bit range is taken from the caller in this overload
1574
+ detail::radix::traits_t<KeyT>::default_end_bit(decomposer), // end bit is derived from the decomposer
1575
+ ::cuda::std::false_type(), // presumably the "ascending" tag (SortDescending passes true_type)
1576
+ detail::bool_constant_v<KEYS_ONLY>,
1577
+ decomposer);
1578
+ }
1579
+
1580
+ //! @rst
1581
+ //! Performs an ascending radix sort across a :ref:`blocked arrangement <flexible-data-arrangement>` of keys and
1582
+ //! values, leaving them in a :ref:`striped arrangement <flexible-data-arrangement>`.
1583
+ //!
1584
+ //! - BlockRadixSort can only accommodate one associated tile of values. To "truck along"
1585
+ //! more than one tile of values, simply perform a key-value sort of the keys paired
1586
+ //! with a temporary value array that enumerates the key indices. The reordered indices
1587
+ //! can then be used as a gather-vector for exchanging other associated tile data through
1588
+ //! shared memory.
1589
+ //! - @granularity
1590
+ //! - @smemreuse
1591
+ //!
1592
+ //! Snippet
1593
+ //! +++++++
1594
+ //!
1595
+ //! The code snippet below illustrates a sort of 512 integer keys and values that
1596
+ //! are initially partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128
1597
+ //! threads where each thread owns 4 consecutive pairs. The final partitioning is striped.
1598
+ //!
1599
+ //! .. code-block:: c++
1600
+ //!
1601
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_radix_sort.cuh>
1602
+ //!
1603
+ //! __global__ void ExampleKernel(...)
1604
+ //! {
1605
+ //! // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and values each
1606
+ //! using BlockRadixSort = cub::BlockRadixSort<int, 128, 4, int>;
1607
+ //!
1608
+ //! // Allocate shared memory for BlockRadixSort
1609
+ //! __shared__ typename BlockRadixSort::TempStorage temp_storage;
1610
+ //!
1611
+ //! // Obtain a segment of consecutive items that are blocked across threads
1612
+ //! int thread_keys[4];
1613
+ //! int thread_values[4];
1614
+ //! ...
1615
+ //!
1616
+ //! // Collectively sort the keys and values among block threads
1617
+ //! BlockRadixSort(temp_storage).SortBlockedToStriped(thread_keys, thread_values);
1618
+ //!
1619
+ //! Suppose the set of input ``thread_keys`` across the block of threads is
1620
+ //! ``{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }``.
1621
+ //! The corresponding output ``thread_keys`` in those threads will be
1622
+ //! ``{ [0,128,256,384], [1,129,257,385], [2,130,258,386], ..., [127,255,383,511] }``.
1623
+ //!
1624
+ //! @endrst
1625
+ //!
1626
+ //! @param[in,out] keys
1627
+ //! Keys to sort
1628
+ //!
1629
+ //! @param[in,out] values
1630
+ //! Values to sort
1631
+ //!
1632
+ //! @param[in] begin_bit
1633
+ //! **[optional]** The beginning (least-significant) bit index needed for key comparison
1634
+ //!
1635
+ //! @param[in] end_bit
1636
+ //! **[optional]** The past-the-end (most-significant) bit index needed for key comparison
1637
+ _CCCL_DEVICE _CCCL_FORCEINLINE void SortBlockedToStriped(
1638
+ KeyT (&keys)[ITEMS_PER_THREAD],
1639
+ ValueT (&values)[ITEMS_PER_THREAD],
1640
+ int begin_bit = 0,
1641
+ int end_bit = sizeof(KeyT) * 8) // default bit range is [0, sizeof(KeyT) * 8)
1642
+ {
1643
+ SortBlockedToStriped( // forwards to a six-argument overload of the same name that is not visible in this excerpt
1644
+ keys, values, begin_bit, end_bit, ::cuda::std::false_type(), detail::bool_constant_v<KEYS_ONLY>); // caller's values are forwarded as-is; false_type is presumably the "ascending" tag
1645
+ }
1646
+
1647
+ //! @rst
1648
+ //! Performs an ascending block-wide radix sort over a
1649
+ //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys and values, leaving them in a
1650
+ //! :ref:`striped arrangement <flexible-data-arrangement>`.
1651
+ //!
1652
+ //! * @granularity
1653
+ //! * @smemreuse
1654
+ //!
1655
+ //! Snippet
1656
+ //! ==========================================================================
1657
+ //!
1658
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
1659
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
1660
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
1661
+ //! tuple of references to relevant members of the key.
1662
+ //!
1663
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
1664
+ //! :language: c++
1665
+ //! :dedent:
1666
+ //! :start-after: example-begin custom-type
1667
+ //! :end-before: example-end custom-type
1668
+ //!
1669
+ //! The code snippet below illustrates a sort of 4 pairs that
1670
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 2 threads
1671
+ //! where each thread owns 2 consecutive pairs. The final partitioning is striped.
1672
+ //!
1673
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
1674
+ //! :language: c++
1675
+ //! :dedent:
1676
+ //! :start-after: example-begin pairs-striped-bits
1677
+ //! :end-before: example-end pairs-striped-bits
1678
+ //!
1679
+ //! @endrst
1680
+ //!
1681
+ //! @tparam DecomposerT
1682
+ //! **[inferred]** Type of a callable object responsible for decomposing a
1683
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
1684
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
1685
+ //! The leftmost element of the tuple is considered the most significant.
1686
+ //! The call operator must not modify members of the key.
1687
+ //!
1688
+ //! @param[in,out] keys
1689
+ //! Keys to sort
1690
+ //!
1691
+ //! @param[in,out] values
1692
+ //! Values to sort
1693
+ //!
1694
+ //! @param decomposer
1695
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
1696
+ //! references to its constituent arithmetic types. The leftmost element of
1697
+ //! the tuple is considered the most significant. The call operator must not
1698
+ //! modify members of the key.
1699
+ //!
1700
+ //! @param[in] begin_bit
1701
+ //! The least-significant bit index (inclusive) needed for
1702
+ //! key comparison
1703
+ //!
1704
+ //! @param[in] end_bit
1705
+ //! The most-significant bit index (exclusive) needed for key
1706
+ //! comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`)
1707
+ template <class DecomposerT>
1708
+ _CCCL_DEVICE _CCCL_FORCEINLINE //
1709
+ ::cuda::std::enable_if_t< //
1710
+ !::cuda::std::is_convertible_v<DecomposerT, int>> // overload participates only when DecomposerT does not convert to int; return type is void
1711
+ SortBlockedToStriped(KeyT (&keys)[ITEMS_PER_THREAD],
1712
+ ValueT (&values)[ITEMS_PER_THREAD],
1713
+ DecomposerT decomposer,
1714
+ int begin_bit,
1715
+ int end_bit)
1716
+ {
1717
+ SortBlockedToStriped( // forwards to a seven-argument overload of the same name that is not visible in this excerpt
1718
+ keys, values, begin_bit, end_bit, ::cuda::std::false_type(), detail::bool_constant_v<KEYS_ONLY>, decomposer); // caller-supplied bit range and values; false_type is presumably the "ascending" tag
1719
+ }
1720
+
1721
+ //! @rst
1722
+ //! Performs an ascending block-wide radix sort over a
1723
+ //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys and values, leaving them in a
1724
+ //! :ref:`striped arrangement <flexible-data-arrangement>`.
1725
+ //!
1726
+ //! * @granularity
1727
+ //! * @smemreuse
1728
+ //!
1729
+ //! Snippet
1730
+ //! ==========================================================================
1731
+ //!
1732
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
1733
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
1734
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
1735
+ //! tuple of references to relevant members of the key.
1736
+ //!
1737
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
1738
+ //! :language: c++
1739
+ //! :dedent:
1740
+ //! :start-after: example-begin custom-type
1741
+ //! :end-before: example-end custom-type
1742
+ //!
1743
+ //! The code snippet below illustrates a sort of 6 pairs that
1744
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 2 threads
1745
+ //! where each thread owns 3 consecutive pairs. The final partitioning is striped.
1746
+ //!
1747
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
1748
+ //! :language: c++
1749
+ //! :dedent:
1750
+ //! :start-after: example-begin pairs-striped
1751
+ //! :end-before: example-end pairs-striped
1752
+ //!
1753
+ //! @endrst
1754
+ //!
1755
+ //! @tparam DecomposerT
1756
+ //! **[inferred]** Type of a callable object responsible for decomposing a
1757
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
1758
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
1759
+ //! The leftmost element of the tuple is considered the most significant.
1760
+ //! The call operator must not modify members of the key.
1761
+ //!
1762
+ //! @param[in,out] keys
1763
+ //! Keys to sort
1764
+ //!
1765
+ //! @param[in,out] values
1766
+ //! Values to sort
1767
+ //!
1768
+ //! @param decomposer
1769
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
1770
+ //! references to its constituent arithmetic types. The leftmost element of
1771
+ //! the tuple is considered the most significant. The call operator must not
1772
+ //! modify members of the key.
1773
template <class DecomposerT>
_CCCL_DEVICE _CCCL_FORCEINLINE ::cuda::std::enable_if_t<!::cuda::std::is_convertible_v<DecomposerT, int>>
SortBlockedToStriped(KeyT (&keys)[ITEMS_PER_THREAD], ValueT (&values)[ITEMS_PER_THREAD], DecomposerT decomposer)
{
  // No bit range is supplied by the caller: start at bit 0 and ask the key traits
  // for the end bit that corresponds to this decomposer.
  using key_traits = detail::radix::traits_t<KeyT>;
  // false_type tag: ascending variant (the descending overloads pass true_type).
  using order_tag = ::cuda::std::false_type;

  SortBlockedToStriped(
    keys, values, 0, key_traits::default_end_bit(decomposer), order_tag(), detail::bool_constant_v<KEYS_ONLY>, decomposer);
}
1788
+
1789
+ //! @rst
1790
+ //! Performs a descending radix sort across a :ref:`blocked arrangement <flexible-data-arrangement>`
1791
+ //! of keys, leaving them in a :ref:`striped arrangement <flexible-data-arrangement>`.
1792
+ //!
1793
+ //! - @granularity
1794
+ //! - @smemreuse
1795
+ //!
1796
+ //! Snippet
1797
+ //! +++++++
1798
+ //!
1799
+ //! The code snippet below illustrates a sort of 512 integer keys that
1800
+ //! are initially partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128
1801
+ //! threads where each thread owns 4 consecutive keys. The final partitioning is striped.
1802
+ //!
1803
+ //! .. code-block:: c++
1804
+ //!
1805
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_radix_sort.cuh>
1806
+ //!
1807
+ //! __global__ void ExampleKernel(...)
1808
+ //! {
1809
+ //! // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys each
1810
+ //! using BlockRadixSort = cub::BlockRadixSort<int, 128, 4>;
1811
+ //!
1812
+ //! // Allocate shared memory for BlockRadixSort
1813
+ //! __shared__ typename BlockRadixSort::TempStorage temp_storage;
1814
+ //!
1815
+ //! // Obtain a segment of consecutive items that are blocked across threads
1816
+ //! int thread_keys[4];
1817
+ //! ...
1818
+ //!
1819
+ //! // Collectively sort the keys
1820
+ //! BlockRadixSort(temp_storage).SortDescendingBlockedToStriped(thread_keys);
1821
+ //!
1822
+ //! Suppose the set of input ``thread_keys`` across the block of threads is
1823
+ //! ``{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }``.
1824
+ //! The corresponding output ``thread_keys`` in those threads will be
1825
+ //! ``{ [511,383,255,127], [510,382,254,126], [509,381,253,125], ..., [384,256,128,0] }``.
1826
+ //!
1827
+ //! @endrst
1828
+ //!
1829
+ //! @param[in,out] keys
1830
+ //! Keys to sort
1831
+ //!
1832
+ //! @param[in] begin_bit
1833
+ //! **[optional]** The beginning (least-significant) bit index needed for key comparison
1834
+ //!
1835
+ //! @param[in] end_bit
1836
+ //! **[optional]** The past-the-end (most-significant) bit index needed for key comparison
1837
_CCCL_DEVICE _CCCL_FORCEINLINE void SortDescendingBlockedToStriped(
  KeyT (&keys)[ITEMS_PER_THREAD],
  int begin_bit = 0,
  int end_bit   = sizeof(KeyT) * 8)
{
  // Keys-only sort: the forwarded-to overload still takes a values array, so pass it
  // a placeholder array of NullType.
  NullType placeholder_values[ITEMS_PER_THREAD];

  // true_type tag: descending variant (the ascending overloads pass false_type).
  using order_tag = ::cuda::std::true_type;

  SortBlockedToStriped(keys, placeholder_values, begin_bit, end_bit, order_tag(), detail::bool_constant_v<KEYS_ONLY>);
}
1844
+
1845
+ //! @rst
1846
+ //! Performs a descending block-wide radix sort over a
1847
+ //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys, leaving them in a
1848
+ //! :ref:`striped arrangement <flexible-data-arrangement>`.
1849
+ //!
1850
+ //! * @granularity
1851
+ //! * @smemreuse
1852
+ //!
1853
+ //! Snippet
1854
+ //! ==========================================================================
1855
+ //!
1856
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
1857
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
1858
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
1859
+ //! tuple of references to relevant members of the key.
1860
+ //!
1861
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
1862
+ //! :language: c++
1863
+ //! :dedent:
1864
+ //! :start-after: example-begin custom-type
1865
+ //! :end-before: example-end custom-type
1866
+ //!
1867
+ //! The code snippet below illustrates a sort of 4 keys that
1868
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 2 threads
1869
+ //! where each thread owns 2 consecutive keys. The final partitioning is striped.
1870
+ //!
1871
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
1872
+ //! :language: c++
1873
+ //! :dedent:
1874
+ //! :start-after: example-begin keys-striped-descending-bits
1875
+ //! :end-before: example-end keys-striped-descending-bits
1876
+ //!
1877
+ //! @endrst
1878
+ //!
1879
+ //! @tparam DecomposerT
1880
+ //! **[inferred]** Type of a callable object responsible for decomposing a
1881
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
1882
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
1883
+ //! The leftmost element of the tuple is considered the most significant.
1884
+ //! The call operator must not modify members of the key.
1885
+ //!
1886
+ //! @param[in,out] keys
1887
+ //! Keys to sort
1888
+ //!
1889
+ //! @param decomposer
1890
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
1891
+ //! references to its constituent arithmetic types. The leftmost element of
1892
+ //! the tuple is considered the most significant. The call operator must not
1893
+ //! modify members of the key.
1894
+ //!
1895
+ //! @param[in] begin_bit
1896
+ //! The least-significant bit index (inclusive) needed for
1897
+ //! key comparison
1898
+ //!
1899
+ //! @param[in] end_bit
1900
+ //! The most-significant bit index (exclusive) needed for key
1901
+ //! comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`)
1902
template <class DecomposerT>
_CCCL_DEVICE _CCCL_FORCEINLINE ::cuda::std::enable_if_t<!::cuda::std::is_convertible_v<DecomposerT, int>>
SortDescendingBlockedToStriped(
  KeyT (&keys)[ITEMS_PER_THREAD],
  DecomposerT decomposer,
  int begin_bit,
  int end_bit)
{
  // Keys-only sort: the forwarded-to overload still takes a values array, so pass it
  // a placeholder array of NullType.
  NullType placeholder_values[ITEMS_PER_THREAD];

  // true_type tag: descending variant (the ascending overloads pass false_type).
  using order_tag = ::cuda::std::true_type;

  SortBlockedToStriped(
    keys, placeholder_values, begin_bit, end_bit, order_tag(), detail::bool_constant_v<KEYS_ONLY>, decomposer);
}
1913
+
1914
+ //! @rst
1915
+ //! Performs a descending block-wide radix sort over a
1916
+ //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys, leaving them in a
1917
+ //! :ref:`striped arrangement <flexible-data-arrangement>`.
1918
+ //!
1919
+ //! * @granularity
1920
+ //! * @smemreuse
1921
+ //!
1922
+ //! Snippet
1923
+ //! ==========================================================================
1924
+ //!
1925
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
1926
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
1927
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
1928
+ //! tuple of references to relevant members of the key.
1929
+ //!
1930
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
1931
+ //! :language: c++
1932
+ //! :dedent:
1933
+ //! :start-after: example-begin custom-type
1934
+ //! :end-before: example-end custom-type
1935
+ //!
1936
+ //! The code snippet below illustrates a sort of 6 keys that
1937
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 2 threads
1938
+ //! where each thread owns 3 consecutive keys. The final partitioning is striped.
1939
+ //!
1940
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
1941
+ //! :language: c++
1942
+ //! :dedent:
1943
+ //! :start-after: example-begin keys-striped-descending
1944
+ //! :end-before: example-end keys-striped-descending
1945
+ //!
1946
+ //! @endrst
1947
+ //!
1948
+ //! @tparam DecomposerT
1949
+ //! **[inferred]** Type of a callable object responsible for decomposing a
1950
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
1951
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
1952
+ //! The leftmost element of the tuple is considered the most significant.
1953
+ //! The call operator must not modify members of the key.
1954
+ //!
1955
+ //! @param[in,out] keys
1956
+ //! Keys to sort
1957
+ //!
1958
+ //! @param decomposer
1959
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
1960
+ //! references to its constituent arithmetic types. The leftmost element of
1961
+ //! the tuple is considered the most significant. The call operator must not
1962
+ //! modify members of the key.
1963
template <class DecomposerT>
_CCCL_DEVICE _CCCL_FORCEINLINE ::cuda::std::enable_if_t<!::cuda::std::is_convertible_v<DecomposerT, int>>
SortDescendingBlockedToStriped(KeyT (&keys)[ITEMS_PER_THREAD], DecomposerT decomposer)
{
  // Keys-only sort: the forwarded-to overload still takes a values array, so pass it
  // a placeholder array of NullType.
  NullType placeholder_values[ITEMS_PER_THREAD];

  // No bit range is supplied by the caller: start at bit 0 and ask the key traits
  // for the end bit that corresponds to this decomposer.
  using key_traits = detail::radix::traits_t<KeyT>;
  // true_type tag: descending variant (the ascending overloads pass false_type).
  using order_tag = ::cuda::std::true_type;

  SortBlockedToStriped(
    keys,
    placeholder_values,
    0,
    key_traits::default_end_bit(decomposer),
    order_tag(),
    detail::bool_constant_v<KEYS_ONLY>,
    decomposer);
}
1980
+
1981
+ //! @rst
1982
+ //! Performs a descending radix sort across a :ref:`blocked arrangement <flexible-data-arrangement>`
1983
+ //! of keys and values, leaving them in a :ref:`striped arrangement <flexible-data-arrangement>`.
1984
+ //!
1985
+ //! - BlockRadixSort can only accommodate one associated tile of values. To "truck along"
1986
+ //! more than one tile of values, simply perform a key-value sort of the keys paired
1987
+ //! with a temporary value array that enumerates the key indices. The reordered indices
1988
+ //! can then be used as a gather-vector for exchanging other associated tile data through
1989
+ //! shared memory.
1990
+ //! - @granularity
1991
+ //! - @smemreuse
1992
+ //!
1993
+ //! Snippet
1994
+ //! +++++++
1995
+ //!
1996
+ //! The code snippet below illustrates a sort of 512 integer keys and values that
1997
+ //! are initially partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128
1998
+ //! threads where each thread owns 4 consecutive pairs. The final partitioning is striped.
1999
+ //!
2000
+ //! .. code-block:: c++
2001
+ //!
2002
+ //! #include <cub/cub.cuh> // or equivalently <cub/block/block_radix_sort.cuh>
2003
+ //!
2004
+ //! __global__ void ExampleKernel(...)
2005
+ //! {
2006
+ //! // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and values each
2007
+ //! using BlockRadixSort = cub::BlockRadixSort<int, 128, 4, int>;
2008
+ //!
2009
+ //! // Allocate shared memory for BlockRadixSort
2010
+ //! __shared__ typename BlockRadixSort::TempStorage temp_storage;
2011
+ //!
2012
+ //! // Obtain a segment of consecutive items that are blocked across threads
2013
+ //! int thread_keys[4];
2014
+ //! int thread_values[4];
2015
+ //! ...
2016
+ //!
2017
+ //! // Collectively sort the keys and values among block threads
2018
+ //! BlockRadixSort(temp_storage).SortDescendingBlockedToStriped(thread_keys, thread_values);
2019
+ //!
2020
+ //! Suppose the set of input ``thread_keys`` across the block of threads is
2021
+ //! ``{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }``.
2022
+ //! The corresponding output ``thread_keys`` in those threads will be
2023
+ //! ``{ [511,383,255,127], [510,382,254,126], [509,381,253,125], ..., [384,256,128,0] }``.
2024
+ //!
2025
+ //! @endrst
2026
+ //!
2027
+ //! @param[in,out] keys
2028
+ //! Keys to sort
2029
+ //!
2030
+ //! @param[in,out] values
2031
+ //! Values to sort
2032
+ //!
2033
+ //! @param[in] begin_bit
2034
+ //! **[optional]** The beginning (least-significant) bit index needed for key comparison
2035
+ //!
2036
+ //! @param[in] end_bit
2037
+ //! **[optional]** The past-the-end (most-significant) bit index needed for key comparison
2038
_CCCL_DEVICE _CCCL_FORCEINLINE void
SortDescendingBlockedToStriped(KeyT (&keys)[ITEMS_PER_THREAD],
                               ValueT (&values)[ITEMS_PER_THREAD],
                               int begin_bit = 0,
                               int end_bit   = sizeof(KeyT) * 8)
{
  // Thin public entry point: forwards to the tag-dispatched SortBlockedToStriped overload.
  // true_type tag: descending variant (the ascending overloads pass false_type).
  using order_tag = ::cuda::std::true_type;

  SortBlockedToStriped(keys, values, begin_bit, end_bit, order_tag(), detail::bool_constant_v<KEYS_ONLY>);
}
2046
+
2047
+ //! @rst
2048
+ //! Performs a descending block-wide radix sort over a
2049
+ //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys and values, leaving them in a
2050
+ //! :ref:`striped arrangement <flexible-data-arrangement>`.
2051
+ //!
2052
+ //! * @granularity
2053
+ //! * @smemreuse
2054
+ //!
2055
+ //! Snippet
2056
+ //! ==========================================================================
2057
+ //!
2058
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
2059
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
2060
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
2061
+ //! tuple of references to relevant members of the key.
2062
+ //!
2063
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
2064
+ //! :language: c++
2065
+ //! :dedent:
2066
+ //! :start-after: example-begin custom-type
2067
+ //! :end-before: example-end custom-type
2068
+ //!
2069
+ //! The code snippet below illustrates a sort of 4 keys and values that
2070
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 2 threads
2071
+ //! where each thread owns 2 consecutive pairs. The final partitioning is striped.
2072
+ //!
2073
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
2074
+ //! :language: c++
2075
+ //! :dedent:
2076
+ //! :start-after: example-begin pairs-striped-descending-bits
2077
+ //! :end-before: example-end pairs-striped-descending-bits
2078
+ //!
2079
+ //! @endrst
2080
+ //!
2081
+ //! @tparam DecomposerT
2082
+ //! **[inferred]** Type of a callable object responsible for decomposing a
2083
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
2084
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
2085
+ //! The leftmost element of the tuple is considered the most significant.
2086
+ //! The call operator must not modify members of the key.
2087
+ //!
2088
+ //! @param[in,out] keys
2089
+ //! Keys to sort
2090
+ //!
2091
+ //! @param[in,out] values
2092
+ //! Values to sort
2093
+ //!
2094
+ //! @param decomposer
2095
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
2096
+ //! references to its constituent arithmetic types. The leftmost element of
2097
+ //! the tuple is considered the most significant. The call operator must not
2098
+ //! modify members of the key.
2099
+ //!
2100
+ //! @param[in] begin_bit
2101
+ //! The least-significant bit index (inclusive) needed for
2102
+ //! key comparison
2103
+ //!
2104
+ //! @param[in] end_bit
2105
+ //! The most-significant bit index (exclusive) needed for key
2106
+ //! comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`)
2107
template <class DecomposerT>
_CCCL_DEVICE _CCCL_FORCEINLINE ::cuda::std::enable_if_t<!::cuda::std::is_convertible_v<DecomposerT, int>>
SortDescendingBlockedToStriped(KeyT (&keys)[ITEMS_PER_THREAD],
                               ValueT (&values)[ITEMS_PER_THREAD],
                               DecomposerT decomposer,
                               int begin_bit,
                               int end_bit)
{
  // Thin public entry point: forwards to the tag-dispatched SortBlockedToStriped overload.
  // true_type tag: descending variant (the ascending overloads pass false_type).
  using order_tag = ::cuda::std::true_type;

  SortBlockedToStriped(keys, values, begin_bit, end_bit, order_tag(), detail::bool_constant_v<KEYS_ONLY>, decomposer);
}
2121
+
2122
+ //! @rst
2123
+ //! Performs a descending block-wide radix sort over a
2124
+ //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys and values, leaving them in a
2125
+ //! :ref:`striped arrangement <flexible-data-arrangement>`.
2126
+ //!
2127
+ //! * @granularity
2128
+ //! * @smemreuse
2129
+ //!
2130
+ //! Snippet
2131
+ //! ==========================================================================
2132
+ //!
2133
+ //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
2134
+ //! ``custom_t`` objects, we have to tell CUB about relevant members of the
2135
+ //! ``custom_t`` type. We do this by providing a decomposer that returns a
2136
+ //! tuple of references to relevant members of the key.
2137
+ //!
2138
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
2139
+ //! :language: c++
2140
+ //! :dedent:
2141
+ //! :start-after: example-begin custom-type
2142
+ //! :end-before: example-end custom-type
2143
+ //!
2144
+ //! The code snippet below illustrates a sort of 6 keys and values that
2145
+ //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 2 threads
2146
+ //! where each thread owns 3 consecutive pairs. The final partitioning is striped.
2147
+ //!
2148
+ //! .. literalinclude:: ../../../cub/test/catch2_test_block_radix_sort_custom.cu
2149
+ //! :language: c++
2150
+ //! :dedent:
2151
+ //! :start-after: example-begin pairs-striped-descending
2152
+ //! :end-before: example-end pairs-striped-descending
2153
+ //!
2154
+ //! @endrst
2155
+ //!
2156
+ //! @tparam DecomposerT
2157
+ //! **[inferred]** Type of a callable object responsible for decomposing a
2158
+ //! ``KeyT`` into a tuple of references to its constituent arithmetic types:
2159
+ //! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
2160
+ //! The leftmost element of the tuple is considered the most significant.
2161
+ //! The call operator must not modify members of the key.
2162
+ //!
2163
+ //! @param[in,out] keys
2164
+ //! Keys to sort
2165
+ //!
2166
+ //! @param[in,out] values
2167
+ //! Values to sort
2168
+ //!
2169
+ //! @param decomposer
2170
+ //! Callable object responsible for decomposing a ``KeyT`` into a tuple of
2171
+ //! references to its constituent arithmetic types. The leftmost element of
2172
+ //! the tuple is considered the most significant. The call operator must not
2173
+ //! modify members of the key.
2174
template <class DecomposerT>
_CCCL_DEVICE _CCCL_FORCEINLINE ::cuda::std::enable_if_t<!::cuda::std::is_convertible_v<DecomposerT, int>>
SortDescendingBlockedToStriped(
  KeyT (&keys)[ITEMS_PER_THREAD],
  ValueT (&values)[ITEMS_PER_THREAD],
  DecomposerT decomposer)
{
  // No bit range is supplied by the caller: start at bit 0 and ask the key traits
  // for the end bit that corresponds to this decomposer.
  using key_traits = detail::radix::traits_t<KeyT>;
  // true_type tag: descending variant (the ascending overloads pass false_type).
  using order_tag = ::cuda::std::true_type;

  SortBlockedToStriped(
    keys, values, 0, key_traits::default_end_bit(decomposer), order_tag(), detail::bool_constant_v<KEYS_ONLY>, decomposer);
}
2190
+
2191
+ //@} end member group
2192
+ };
2193
+
2194
+ CUB_NAMESPACE_END